<?php
// Demo driver: load one document (plain text or PDF), stem it with the Sastrawi
// stemmer, print the raw and the stemmed text as HTML, and time the whole run.
$time_start = microtime_float();
$doc="ypathfile/Kejuaraan-Nasional-Karate-Piala-Panglima-TNI-III-2014.pdf";
// Offset of ".txt" inside the path (0 when absent). Nothing in the visible script
// reads it; it is kept only so the set of globals stays unchanged.
$ada=strpos($doc,".txt")+0;
require_once __DIR__ . '/vendor/autoload.php';

// Pick the reader from the real file extension. A substring test on ".txt" would
// also match paths such as "notes.txt.pdf" and would miss a file named ".txt".
$ekstensi = strtolower(pathinfo($doc, PATHINFO_EXTENSION));
$isidoc="";
if ($ekstensi === 'txt') {
    // Read the file in one call: an fgets() loop that tests the line for truthiness
    // stops early when the last line is exactly "0".
    $isidoc = file_get_contents($doc);
    if ($isidoc === false) {
        throw new \RuntimeException("Unable to read document: {$doc}");
    }
} else { // pdf
    $initop = new \Smalot\PdfParser\Parser();
    $ekstrakpdf = $initop->parseFile($doc);
    $isidoc = $ekstrakpdf->getText();
}

$initos = new \Sastrawi\Stemmer\StemmerFactory();
$bikinos = $initos->createStemmer();
$stemming = $bikinos->stem($isidoc);

// The document text is external data printed into an HTML page, so escape it.
echo "Isi Doc=" . htmlspecialchars($isidoc, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<hr>";
echo "Stemming=" . htmlspecialchars($stemming, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<hr>";

$time_end = microtime_float();
// Elapsed wall-clock seconds for the run; computed but not printed by this script.
$waktu = $time_end - $time_start;
/**
 * Current Unix timestamp in seconds, with the fractional part included.
 *
 * Uses the built-in float mode of microtime() instead of splitting and re-adding
 * the "usec sec" string form by hand.
 *
 * @return float Seconds since the Unix epoch.
 */
function microtime_float(){
    return microtime(true);
}
?>
Bonus Normalisasi Text:
<?php
/**
 * Replace control bytes and malformed UTF-8 byte sequences in a string with '?'.
 *
 * The string is first passed through mb_convert_encoding() from UTF-8 to UTF-8,
 * then two byte-level (non-/u) regular expressions substitute '?' for:
 *  - E0 followed by 80-9F (overlong three-byte forms) and ED followed by A0-BF
 *    (the UTF-16 surrogate range);
 *  - the listed ASCII control bytes and DEL (tab, LF and CR are not in the class);
 *  - an ASCII byte directly followed by continuation bytes (the ASCII byte is
 *    replaced together with them);
 *  - lead bytes C0, C1 and F0-FF with any trailing continuation bytes, which means
 *    every four-byte sequence (code points above U+FFFF) is replaced as well;
 *  - two- and three-byte lead bytes followed by the wrong number of continuation bytes.
 *
 * NOTE(review): the control class lists \x10 separately although \x0E-\x19 already
 * covers it, and it stops at \x19 rather than \x1F - confirm whether that is intended.
 *
 * @param string $var Raw text.
 * @return string Text with the sequences above replaced by '?'.
 */
function getBersih($var){
    $var = mb_convert_encoding($var, 'UTF-8', 'UTF-8');
    $var = preg_replace('/\xE0[\x80-\x9F][\x80-\xBF]'.
        '|\xED[\xA0-\xBF][\x80-\xBF]/S','?', $var );
    $var = preg_replace('/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]'.
        '|[\x00-\x7F][\x80-\xBF]+'.
        '|([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*'.
        '|[\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})'.
        '|[\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))|(?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/S',
        '?', $var );
    return $var;
}

/**
 * Normalise a text into a space-separated list of distinct stemmed terms.
 *
 * Steps: sanitise the bytes with getBersih(), lower-case, stem with the Sastrawi
 * stemmer, drop every token listed by getStopWords(), strip the characters listed
 * by getStopNumber() from the remaining tokens, discard tokens that end up empty,
 * and keep only the first occurrence of each token.
 *
 * Stop words are matched per token, so they are also removed at the very start or
 * end of the text and when the same stop word appears twice in a row; a plain
 * " word " string replacement misses those cases.
 *
 * @param string $konten Raw document text.
 * @return string Distinct terms in order of first appearance, joined by single spaces.
 */
function getNorm($konten){
    $konten = getBersih($konten);

    // iconv() returns false on failure; keep the sanitised text in that case.
    $bersih = iconv("UTF-8", "UTF-8//IGNORE", $konten);
    if ($bersih !== false) {
        $konten = $bersih;
    }
    // Multibyte-aware lower-casing so non-ASCII characters are never split.
    $konten = mb_strtolower($konten, 'UTF-8');

    require_once __DIR__ . '/vendor/autoload.php';
    $initos = new \Sastrawi\Stemmer\StemmerFactory();
    $bikinos = $initos->createStemmer();
    $stemming = mb_strtolower($bikinos->stem($konten), 'UTF-8');

    // Flip the stop-word list once so each membership test is a hash lookup.
    $stopWords = array_flip(getStopWords());
    $stopChars = getStopNumber();

    $tokens = preg_split('/\s+/', $stemming, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === false) {
        $tokens = [];
    }

    $hasil = [];
    foreach ($tokens as $kata) {
        // Stop words are checked before the digits/symbols are stripped.
        if (isset($stopWords[$kata])) {
            continue;
        }
        $kata = str_replace($stopChars, '', $kata);
        if ($kata === '') {
            continue;
        }
        $hasil[] = $kata;
    }

    return implode(' ', array_unique($hasil));
}
/**
 * List of stop words used by the text normaliser.
 *
 * The words are kept in one space-separated string and split on demand; the
 * result is a zero-indexed list in exactly this order, duplicates included.
 *
 * @return array List of stop-word strings.
 */
function getStopWords()
{
    $daftarKata = 'yang untuk pada ke para namun menurut antara dia dua '
        . 'ia seperti jika jika sehingga kembali dan tidak ini karena '
        . 'kepada oleh saat harus sementara setelah belum kami sekitar '
        . 'bagi serta di dari telah sebagai masih hal ketika adalah '
        . 'itu dalam bisa bahwa atau hanya kita dengan akan juga '
        . 'ada mereka sudah saya terhadap secara agar lain anda '
        . 'begitu mengapa kenapa yaitu yakni daripada itulah lagi maka '
        . 'tentang demi dimana kemana pula sambil sebelum sesudah supaya '
        . 'guna kah pun sampai sedangkan selagi sementara tetapi apakah '
        . 'kecuali sebab selain seolah seraya seterusnya tanpa agak boleh '
        . 'dapat dsb dst dll dahulu dulunya anu demikian tapi ingin '
        . 'juga nggak mari nanti melainkan oh ok seharusnya sebetulnya '
        . 'setiap setidaknya sesuatu pasti saja toh ya walau tolong '
        . 'tentu amat apalagi bagaimanapun';

    return explode(' ', $daftarKata);
}
/**
 * Characters stripped from the text by the normaliser: the ten decimal digits
 * followed by the symbols ! @ # $ %.
 *
 * @return array Zero-indexed list of one-character strings.
 */
function getStopNumber()
{
    // Single-quoted so that "$" is taken literally.
    $karakter = '0123456789!@#$%';

    return str_split($karakter);
}
/**
 * Return the distinct, non-empty values of $array as a freshly indexed list.
 *
 * Duplicates are collapsed with a double array_flip(), so a repeated value keeps
 * the position of its LAST occurrence and the result is ordered by those
 * positions (e.g. ['a', 'b', 'a'] gives ['b', 'a']). Numeric strings come back as
 * integers because they pass through array keys. Values that are neither int nor
 * string cannot be flipped and are left out; empty strings are dropped.
 *
 * This version does not change the global error_reporting level, always returns
 * an array (empty when nothing qualifies), and handles sparse or string-keyed
 * input by working on positions instead of the original keys.
 *
 * @param array $array Input values.
 * @return array Zero-indexed list of unique, non-empty values.
 */
function getUnix($array){
    // Re-index to positions 0..n-1 and keep only values array_flip() can use as keys.
    $flippable = array_filter(array_values($array), static function ($nilai) {
        return is_int($nilai) || is_string($nilai);
    });

    // value => last position, then back to position => value.
    $unique = array_flip(array_flip($flippable));
    ksort($unique);

    $hasil = [];
    foreach ($unique as $nilai) {
        if (strlen((string) $nilai) > 0) {
            $hasil[] = $nilai;
        }
    }
    return $hasil;
}?>
Tidak ada komentar:
Posting Komentar