Sabtu, 07 Mei 2022

PDF to TEXT

 <?php
// Demo script: read a document (plain text or PDF), stem it with Sastrawi and
// print both the raw text and the stemmed text as HTML.
$time_start = microtime_float();

$doc="ypathfile/Kejuaraan-Nasional-Karate-Piala-Panglima-TNI-III-2014.pdf";
// Offset of ".txt" inside the path (0 when it is absent). Kept for backward
// compatibility; nothing in this script reads it.
$ada=strpos($doc,".txt")+0;

require_once __DIR__ . '/vendor/autoload.php';
$isidoc="";

// Decide by the real file extension. The previous strpos(...) > 0 test also
// matched ".txt" appearing anywhere in the path (e.g. "notes.txt.d/file.pdf").
if (strtolower(pathinfo($doc, PATHINFO_EXTENSION)) === "txt") {
    // Read the file in one call. The old fgets() loop never checked fopen()
    // and stopped early on a falsy final line such as "0".
    $isiBerkas = file_get_contents($doc);
    if ($isiBerkas === false) {
        throw new \RuntimeException("Cannot read text file: $doc");
    }
    $isidoc = $isiBerkas;
} else { // pdf
    $initop = new \Smalot\PdfParser\Parser();
    $ekstrakpdf = $initop->parseFile($doc);
    $isidoc = $ekstrakpdf->getText();
}

$initos = new \Sastrawi\Stemmer\StemmerFactory();
$bikinos = $initos->createStemmer();
$stemming = $bikinos->stem($isidoc);

// The document text is arbitrary content, so escape it before embedding it in
// HTML; otherwise any "<" or "&" in the document would corrupt the page.
echo "Isi Doc=" . htmlspecialchars($isidoc, ENT_QUOTES, 'UTF-8') . "<hr>";
echo "Stemming=" . htmlspecialchars($stemming, ENT_QUOTES, 'UTF-8') . "<hr>";

// Elapsed wall-clock time in seconds for the whole run (computed, not printed).
$time_end = microtime_float();
$waktu = $time_end - $time_start;


/**
 * Current Unix timestamp with microsecond resolution.
 *
 * Uses microtime(true), which returns the float directly, instead of
 * splitting the "usec sec" string form and adding the two parts by hand.
 *
 * @return float Seconds since the Unix epoch, including the fractional part.
 */
function microtime_float(){
    return microtime(true);
}

?>

Bonus Normalisasi Text:

 <?php

/**
 * Replace bytes that do not belong in clean UTF-8 text with "?".
 *
 * The input is first passed through mb_convert_encoding (UTF-8 to UTF-8),
 * then two byte-wise regular expressions (no /u modifier) are applied in
 * sequence. Every match, whatever its length, becomes a single "?".
 *
 * @param string $var Text to clean.
 * @return string Cleaned text.
 */
function getBersih($var){
    $teks = mb_convert_encoding($var, 'UTF-8', 'UTF-8');

    // Pass 1: three-byte sequences starting E0 80..9F or ED A0..BF.
    $polaPertama = '/' . implode('|', array(
        '\xE0[\x80-\x9F][\x80-\xBF]',
        '\xED[\xA0-\xBF][\x80-\xBF]',
    )) . '/S';
    $teks = preg_replace($polaPertama, '?', $teks);

    // Pass 2, one alternative per entry:
    //  - selected control bytes and DEL;
    //  - an ASCII byte followed by one or more continuation bytes;
    //  - lead bytes C0, C1 or F0..FF together with any continuation bytes;
    //  - a C2..DF lead byte followed by zero, or two or more, continuation bytes;
    //  - an E0..EF lead byte followed by anything but exactly two continuation bytes.
    $polaKedua = '/' . implode('|', array(
        '[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]',
        '[\x00-\x7F][\x80-\xBF]+',
        '([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*',
        '[\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})',
        '[\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))|(?![\x80-\xBF]{2})|[\x80-\xBF]{3,})',
    )) . '/S';
    $teks = preg_replace($polaKedua, '?', $teks);

    return $teks;
}
/**
 * Normalise free text into a space-separated list of unique stemmed terms.
 *
 * Steps: clean the bytes with getBersih() and iconv, lowercase, stem with the
 * Sastrawi stemmer, drop stop words (getStopWords()), strip the characters
 * listed by getStopNumber() from each remaining word, then keep the first
 * occurrence of every distinct word in its original order.
 *
 * Fixes over the previous version:
 *  - the missing semicolon after implode() made the whole file unparsable;
 *  - whitespace was collapsed only once, because each of the three
 *    str_replace() calls restarted from the same variable;
 *  - stop words at the very start or end of the text, or repeated back to
 *    back, were not removed because matching required a space on both sides;
 *  - stripping digits left empty tokens behind, which produced double spaces
 *    in the result;
 *  - strtolower() is byte-oriented, so mb_strtolower() is used for UTF-8 text.
 *
 * @param string $konten Raw text.
 * @return string Unique normalised words joined by single spaces.
 */
function getNorm($konten){
    $konten = getBersih($konten);

    // Drop any byte sequence iconv still rejects; keep the text if iconv fails.
    $tersaring = iconv("UTF-8", "UTF-8//IGNORE", $konten);
    if ($tersaring !== false) {
        $konten = $tersaring;
    }
    $konten = mb_strtolower($konten, 'UTF-8');

    require_once __DIR__ . '/vendor/autoload.php';
    $initos = new \Sastrawi\Stemmer\StemmerFactory();
    $bikinos = $initos->createStemmer();
    $stemming = mb_strtolower($bikinos->stem($konten), 'UTF-8');

    // Flip the stop-word list so membership is a single isset() lookup.
    $stopWords = array_flip(getStopWords());
    $stopChars = getStopNumber();

    $daftarKata = preg_split('/\s+/', $stemming, -1, PREG_SPLIT_NO_EMPTY);
    if ($daftarKata === false) {
        $daftarKata = array();
    }

    $unik = array();
    foreach ($daftarKata as $kata) {
        // Stop words are tested before digit stripping, as in the original order.
        if (isset($stopWords[$kata])) {
            continue;
        }
        $kata = str_replace($stopChars, "", $kata);
        if ($kata === "") {
            continue;
        }
        // Array keys give order-preserving de-duplication.
        $unik[$kata] = true;
    }

    return implode(" ", array_keys($unik));
}
 
/**
 * Indonesian stop-word list used when normalising text.
 *
 * The list is returned exactly as written below, in this order; a few words
 * appear more than once.
 *
 * @return array List of lowercase stop words.
 */
function getStopWords(){
    $daftar = [
        'yang', 'untuk', 'pada', 'ke', 'para',
        'namun', 'menurut', 'antara', 'dia', 'dua',
        'ia', 'seperti', 'jika', 'jika', 'sehingga',
        'kembali', 'dan', 'tidak', 'ini', 'karena',
        'kepada', 'oleh', 'saat', 'harus', 'sementara',
        'setelah', 'belum', 'kami', 'sekitar',
        'bagi', 'serta', 'di', 'dari', 'telah',
        'sebagai', 'masih', 'hal', 'ketika', 'adalah',
        'itu', 'dalam', 'bisa', 'bahwa', 'atau',
        'hanya', 'kita', 'dengan', 'akan', 'juga',
        'ada', 'mereka', 'sudah', 'saya', 'terhadap',
        'secara', 'agar', 'lain', 'anda',
        'begitu', 'mengapa', 'kenapa', 'yaitu', 'yakni',
        'daripada', 'itulah', 'lagi', 'maka',
        'tentang', 'demi', 'dimana', 'kemana', 'pula',
        'sambil', 'sebelum', 'sesudah', 'supaya',
        'guna', 'kah', 'pun', 'sampai', 'sedangkan',
        'selagi', 'sementara', 'tetapi', 'apakah',
        'kecuali', 'sebab', 'selain', 'seolah', 'seraya',
        'seterusnya', 'tanpa', 'agak', 'boleh',
        'dapat', 'dsb', 'dst', 'dll', 'dahulu',
        'dulunya', 'anu', 'demikian', 'tapi', 'ingin',
        'juga', 'nggak', 'mari', 'nanti', 'melainkan',
        'oh', 'ok', 'seharusnya', 'sebetulnya',
        'setiap', 'setidaknya', 'sesuatu', 'pasti', 'saja',
        'toh', 'ya', 'walau', 'tolong',
        'tentu', 'amat', 'apalagi', 'bagaimanapun',
    ];

    return $daftar;
}


/**
 * Single characters stripped from text during normalisation: the ten decimal
 * digits followed by "!", "@", "#", "$" and "%".
 *
 * @return array List of 15 one-character strings.
 */
function getStopNumber(){
    // str_split() with its default length yields one string per character.
    return str_split('0123456789!@#$%');
}
 
 
 
/**
 * Return the distinct non-empty values of a list, re-indexed from 0.
 *
 * When a value occurs more than once, its last occurrence is the one kept,
 * and the result is ordered by those kept positions. Only integer keys
 * 0..count-1 of the input are considered. Values pass through array keys
 * during de-duplication, so numeric strings come back as integers.
 *
 * Fixes over the previous version:
 *  - no longer calls error_reporting(0), which silenced errors for the rest
 *    of the request as a side effect;
 *  - the result is initialised, so an empty input yields an empty array
 *    instead of an undefined variable (null);
 *  - missing positions are tested with isset() rather than read blindly.
 *
 * @param array $array List of strings and/or integers.
 * @return array Distinct non-empty values.
 */
function getUnix($array){
    // Double flip: value => last index, then back to last index => value.
    $unique = array_flip(array_flip($array));
    $jumlah = count($array);

    $hasil = array();
    for ($i = 0; $i < $jumlah; $i++) {
        if (isset($unique[$i]) && strlen((string) $unique[$i]) > 0) {
            $hasil[] = $unique[$i];
        }
    }
    return $hasil;
}
?>



Tidak ada komentar:

Posting Komentar

global_priv WARNING Selalu

 Jika muncul pesan kesalahan: Warning in .\libraries\classes\Dbal\DbiMysqli.php#209  mysqli::query(): (HY000/1034): Index for table 'glo...