<?php
// Demo script: read a document (.txt or PDF), stem its text with Sastrawi,
// print the raw and the stemmed text, and measure the elapsed time.
$time_start = microtime_float();
$doc = "ypathfile/Kejuaraan-Nasional-Karate-Piala-Panglima-TNI-III-2014.pdf";
// Not read anywhere in this script; kept unchanged in case an including script relies on it.
$ada = strpos($doc, ".txt") + 0;
require_once __DIR__ . '/vendor/autoload.php';

// Decide by the real file extension. A plain strpos() search would also match
// ".txt" in the middle of a path (e.g. "notes.txt.pdf").
$ekstensi = strtolower(pathinfo($doc, PATHINFO_EXTENSION));

$isidoc = "";
if ($ekstensi === 'txt') {
    // Read the whole file at once. A `while ($line = fgets(...))` loop stops
    // early on a final line that is just "0", and needs manual fclose().
    $isidoc = file_get_contents($doc);
    if ($isidoc === false) {
        throw new \RuntimeException("Cannot read document: $doc");
    }
} else { // anything else is treated as a PDF
    $initop = new \Smalot\PdfParser\Parser();
    $ekstrakpdf = $initop->parseFile($doc);
    $isidoc = $ekstrakpdf->getText();
}

$initos = new \Sastrawi\Stemmer\StemmerFactory();
$bikinos = $initos->createStemmer();
$stemming = $bikinos->stem($isidoc);

// The document text is arbitrary content, so escape it before embedding it in HTML.
echo "Isi Doc=" . htmlspecialchars($isidoc, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<hr>";
echo "Stemming=" . htmlspecialchars($stemming, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<hr>";

$time_end = microtime_float();
// Elapsed wall-clock time in seconds (computed but not printed, as before).
$waktu = $time_end - $time_start;
/**
 * Current Unix timestamp in seconds, including the microsecond fraction.
 *
 * Uses the built-in float mode of microtime() instead of splitting and
 * re-adding its "usec sec" string form.
 *
 * @return float Seconds since the Unix epoch.
 */
function microtime_float(){
    return microtime(true);
}
?>
Bonus: Normalisasi Teks
<?php
/**
 * Replace bytes that do not form clean UTF-8 text with '?'.
 *
 * Works in three passes: re-encode UTF-8 to UTF-8 with mbstring, then apply
 * two byte-level (non-/u) regular expressions.
 *
 * Note that the second expression also replaces every sequence starting with
 * a byte in \xF0-\xFF, which includes well-formed 4-byte characters, and that
 * an ASCII byte directly followed by stray continuation bytes is replaced
 * together with them.
 *
 * @param string $var Text to sanitise.
 * @return string|null Sanitised text (whatever the last preg_replace() returns).
 */
function getBersih($var)
{
    // Pass 1: let mbstring re-encode the input as UTF-8.
    $teks = mb_convert_encoding($var, 'UTF-8', 'UTF-8');

    // Pass 2: overlong 3-byte forms (\xE0 \x80-\x9F ..) and the UTF-16
    // surrogate range (\xED \xA0-\xBF ..).
    $polaTigaByte = '/' . implode('|', array(
        '\xE0[\x80-\x9F][\x80-\xBF]',
        '\xED[\xA0-\xBF][\x80-\xBF]',
    )) . '/S';
    $teks = preg_replace($polaTigaByte, '?', $teks);

    // Pass 3: control bytes and malformed lead/continuation combinations.
    $polaByteRusak = '/' . implode('|', array(
        // control bytes; \x09, \x0A, \x0D and \x1A-\x1F are not in this class
        '[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]',
        // an ASCII byte followed by continuation bytes
        '[\x00-\x7F][\x80-\xBF]+',
        // lead bytes \xC0, \xC1 and \xF0-\xFF with any continuation bytes after them
        '([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*',
        // 2-byte lead followed by zero, or by two or more, continuation bytes
        '[\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})',
        // 3-byte lead followed by the wrong number of continuation bytes
        '[\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))|(?![\x80-\xBF]{2})|[\x80-\xBF]{3,})',
    )) . '/S';
    $teks = preg_replace($polaByteRusak, '?', $teks);

    return $teks;
}
/**
 * Normalise raw text into a space-separated list of unique, stemmed keywords.
 *
 * Steps: sanitise the bytes with getBersih(), lowercase, stem with the
 * Sastrawi stemmer, split into words, drop stop words, strip the characters
 * listed by getStopNumber(), and keep the first occurrence of each word.
 *
 * @param string $konten Raw text.
 * @return string Unique keywords joined by single spaces ('' when nothing is left).
 */
function getNorm($konten){
    $konten = getBersih($konten);

    // Drop any invalid byte sequences that are still present. iconv() returns
    // false on failure; in that case keep the getBersih() result.
    $tanpaByteRusak = iconv("UTF-8", "UTF-8//IGNORE", $konten);
    if ($tanpaByteRusak !== false) {
        $konten = $tanpaByteRusak;
    }

    // Multibyte-aware lowercasing: the text is UTF-8, strtolower() is byte-wise.
    $konten = mb_strtolower($konten, 'UTF-8');

    // Build the stemmer once and reuse it across calls
    // (presumably expensive to create - TODO confirm against Sastrawi).
    static $bikinos = null;
    if ($bikinos === null) {
        require_once __DIR__ . '/vendor/autoload.php';
        $initos = new \Sastrawi\Stemmer\StemmerFactory();
        $bikinos = $initos->createStemmer();
    }
    $stemming = mb_strtolower($bikinos->stem($konten), 'UTF-8');

    // Split on whitespace runs so empty tokens never appear and the result
    // cannot contain double spaces.
    $kata = preg_split('/\s+/', $stemming, -1, PREG_SPLIT_NO_EMPTY);
    if ($kata === false) {
        $kata = [];
    }

    $stopWords = array_flip(getStopWords()); // word => index, for isset() lookups
    $stopChars = getStopNumber();

    $hasil = [];
    foreach ($kata as $token) {
        // Compare whole tokens, so stop words at the very start or end of the
        // text and immediately repeated stop words are removed too.
        if (isset($stopWords[$token])) {
            continue;
        }
        // Strip digits and symbols after the stop-word check, as before.
        $token = str_replace($stopChars, '', $token);
        if ($token !== '') {
            $hasil[] = $token;
        }
    }

    return implode(' ', array_unique($hasil));
}

/**
 * Indonesian stop words that getNorm() removes from the text.
 *
 * @return string[] Lowercase stop words.
 */
function getStopWords(){
    return [
        'yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua',
        'ia', 'seperti', 'jika', 'jika', 'sehingga', 'kembali', 'dan', 'tidak', 'ini', 'karena',
        'kepada', 'oleh', 'saat', 'harus', 'sementara', 'setelah', 'belum', 'kami', 'sekitar', 'bagi',
        'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 'ketika', 'adalah', 'itu',
        'dalam', 'bisa', 'bahwa', 'atau', 'hanya', 'kita', 'dengan', 'akan', 'juga', 'ada',
        'mereka', 'sudah', 'saya', 'terhadap', 'secara', 'agar', 'lain', 'anda', 'begitu', 'mengapa',
        'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 'lagi', 'maka', 'tentang', 'demi', 'dimana',
        'kemana', 'pula', 'sambil', 'sebelum', 'sesudah', 'supaya', 'guna', 'kah', 'pun', 'sampai',
        'sedangkan', 'selagi', 'sementara', 'tetapi', 'apakah', 'kecuali', 'sebab', 'selain', 'seolah', 'seraya',
        'seterusnya', 'tanpa', 'agak', 'boleh', 'dapat', 'dsb', 'dst', 'dll', 'dahulu',
        'dulunya', 'anu', 'demikian', 'tapi', 'ingin', 'juga', 'nggak', 'mari', 'nanti', 'melainkan',
        'oh', 'ok', 'seharusnya', 'sebetulnya', 'setiap', 'setidaknya', 'sesuatu', 'pasti', 'saja', 'toh',
        'ya', 'walau', 'tolong', 'tentu', 'amat', 'apalagi', 'bagaimanapun',
    ];
}

/**
 * Single characters (digits and a few symbols) that getNorm() strips from words.
 *
 * @return string[] One-character strings.
 */
function getStopNumber(){
    return [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        '!', '@', '#', '$', '%',
    ];
}

/**
 * Return the distinct, non-empty values of an array, re-indexed from 0.
 *
 * When a value occurs several times it is placed at the position of its LAST
 * occurrence (['a', 'b', 'a'] gives ['b', 'a']), matching the earlier
 * double-array_flip() implementation. Values are normalised the way array
 * keys are, so numeric strings come back as integers. Values that are neither
 * int nor string are skipped.
 *
 * Unlike the earlier version this does not call error_reporting(0), so it no
 * longer silences errors for the rest of the request, and it returns an empty
 * array instead of an undefined variable when nothing qualifies.
 *
 * @param array $array Input values.
 * @return array List of unique values.
 */
function getUnix($array){
    // value => key of its last occurrence
    $kunciTerakhir = [];
    foreach ($array as $kunci => $nilai) {
        if (is_int($nilai) || is_string($nilai)) {
            $kunciTerakhir[$nilai] = $kunci;
        }
    }

    // Back to key => value, ordered by original position.
    $unique = array_flip($kunciTerakhir);
    ksort($unique);

    $M = [];
    foreach ($unique as $nilai) {
        if (strlen((string) $nilai) > 0) {
            $M[] = $nilai;
        }
    }
    return $M;
}
?>
Tidak ada komentar:
Posting Komentar