# Command listing for building and evaluating a Moses translation system
# (ind -> pnk) in ~/moses-pba/mesin-a. The lines starting with ">>>>" are
# section headings of the listing, not commands.
cd ~/moses-pba/mesin-a
>>>>cleaning + tokenisasi
# Filter the parallel corpus corpus/untan.{ind,pnk} into corpus/untan.clean.*
# (the trailing "1 40" are presumably the minimum and maximum sentence
# length in tokens -- confirm against the Moses documentation).
~/mosesdecoder/scripts/training/clean-corpus-n.perl corpus/untan ind pnk corpus/untan.clean 1 40
# Run the clean.plx tokeniser on each side of the corpus separately.
perl ~/myperl/siappakai/clean.plx corpus/untan.clean.pnk corpus/untan.clean1.pnk
perl ~/myperl/siappakai/clean.plx corpus/untan.clean.ind corpus/untan.clean1.ind
# Lowercase both tokenised sides.
~/mosesdecoder/scripts/tokenizer/lowercase.perl < corpus/untan.clean1.pnk > corpus/untan.lowercased.pnk
~/mosesdecoder/scripts/tokenizer/lowercase.perl < corpus/untan.clean1.ind > corpus/untan.lowercased.ind
>>>> training Model bahasa
# Build an order-3 language model with SRILM's ngram-count.
# NOTE(review): the text used here is corpus/untan.clean.pnk, i.e. the corpus
# before clean.plx and lowercase.perl were applied, while the translation
# model below is trained on corpus/untan.lowercased.* -- confirm whether
# corpus/untan.lowercased.pnk was intended.
~/srilm/bin/i686/ngram-count -order 3 -interpolate -unk -text corpus/untan.clean.pnk -lm lm/melayu.lm
>>>> training Model translasi
# Train the translation model on the lowercased corpus with ind as the
# foreign (--f) side and pnk as the --e side, using the language model
# built above (referenced by absolute path).
~/mosesdecoder/scripts/training/train-model.perl -root-dir . --corpus corpus/untan.lowercased --f ind --e pnk --lm 0:3:/home/herry/moses-pba/mesin-a/lm/melayu.lm:0
>>>> RUN DECODER
# Start the decoder with the generated configuration; no input file is
# given, so it reads sentences from standard input.
~/mosesdecoder/moses-cmd/src/moses -f model/moses.ini
>>>> Pengujian otomatis dengan BLEU
# Translate indo.txt into pnk.abcd, then score that output with
# multi-bleu.perl using pnk.txt as the reference.
~/mosesdecoder/moses-cmd/src/moses -f model/moses.ini < indo.txt > pnk.abcd
~/mosesdecoder/scripts/generic/multi-bleu.perl pnk.txt < pnk.abcd
Source code cleaning dan tokenisasi (clean.plx):
#!/usr/bin/perl
# clean.plx
# Tokenisation and cleaning of one text file (one sentence per line).
# by. Herry S
# Usage: perl clean.plx fileinput fileoutput
#
# Every line of fileinput is read; lines that are empty or 1000 or more
# bytes long are dropped, and every remaining line is written to fileoutput
# with its punctuation separated from the neighbouring words by single
# spaces. Progress and the number of written lines are reported on stdout.
#
# NOTE: lines are dropped independently of any other file, so when the two
# sides of a parallel corpus are cleaned separately, a line dropped on one
# side only leaves the two files misaligned.
use warnings;
use strict;

# Lines must be shorter than this (in bytes, line ending excluded) to be kept.
use constant MAX_LINE_LENGTH => 1000;

# clean_line($text)
#
# Returns a tokenised copy of $text (one line without its line ending).
# The string is treated as raw bytes; the curly quotes are therefore
# matched by their UTF-8 byte sequences.
#
# Steps, in order:
#   1. runs of spaces are collapsed and a trailing space is removed;
#   2. "/" and "," are surrounded by spaces;
#   3. a full stop that ends the line is removed, and so is every full
#      stop that is followed by a space;
#   4. a "?" and then a "!" that ends the line is split off with a space
#      (marks in the middle of the line are left attached);
#   5. brackets, straight quotes and curly double quotes are surrounded by
#      spaces;
#   6. runs of spaces are collapsed again and both ends are trimmed.
sub clean_line {
    my ($text) = @_;

    # Normalise spacing first so that the end-of-line checks below look at
    # the real last character of the sentence.
    $text =~ s/ {2,}/ /g;
    $text =~ s/ \z//;

    # Slash and comma become separate tokens; an already present
    # neighbouring space is reused instead of being doubled.
    $text =~ s{ ?([/,]) ?}{ $1 }g;

    # Full stops are removed rather than tokenised.
    $text =~ s/\.\z//;
    $text =~ s/\. / /g;

    # Only a sentence-final question or exclamation mark is split off.
    # The order matters: "?" is handled before "!".
    $text =~ s/\?\z/ ?/;
    $text =~ s/!\z/ !/;

    # ( ) " ' and the UTF-8 encoded curly quotes U+201D / U+201C.
    $text =~ s{(\(|\)|"|\xE2\x80\x9D|\xE2\x80\x9C|')}{ $1 }g;

    # The insertions above can create double spaces and spaces at either
    # end of the line; clean them up.
    $text =~ s/ {2,}/ /g;
    $text =~ s/\A //;
    $text =~ s/ \z//;

    return $text;
}

# main($input, $output)
#
# Reads $input, cleans every kept line with clean_line() and writes the
# result to $output. Dies when an argument is missing or a file cannot be
# opened or written.
sub main {
    my ($input, $output) = @_;
    die "Usage: perl clean.plx fileinput fileoutput\n"
        unless defined $input and defined $output;

    # The whole input is read before the output file is opened, so naming
    # the same file as input and output keeps working.
    open my $in, '<', $input or die "Can't read file $input: $!\n";
    my @lines = <$in>;
    close $in or die "Can't read file $input: $!\n";

    open my $out, '>', $output or die "Can't write on file $output: $!\n";
    print "Proses...\n";

    my $total = 0;
    for my $line (@lines) {
        # Remove the line ending, including the CR of CRLF files.
        $line =~ s/[\r\n]+\z//;

        my $length = length $line;
        next if $length == 0 or $length >= MAX_LINE_LENGTH;

        print {$out} clean_line($line), "\n";
        $total++;
    }

    # Buffered write errors only show up when the handle is closed.
    close $out or die "Can't write on file $output: $!\n";

    print "Total = $total baris\n";
    return;
}

# Run only when executed as a script, so the file can also be loaded by
# other code to call clean_line() directly.
main(@ARGV) unless caller;

1;
Tidak ada komentar:
Posting Komentar