Senin, 16 Maret 2015

Perintah terminal mesin penerjemah statistik


cd ~/moses-pba/mesin-a

>>>> cleaning + tokenisasi
~/mosesdecoder/scripts/training/clean-corpus-n.perl corpus/untan ind pnk corpus/untan.clean 1 40
perl ~/myperl/siappakai/clean.plx corpus/untan.clean.pnk corpus/untan.clean1.pnk
perl ~/myperl/siappakai/clean.plx corpus/untan.clean.ind corpus/untan.clean1.ind
~/mosesdecoder/scripts/tokenizer/lowercase.perl < corpus/untan.clean1.pnk > corpus/untan.lowercased.pnk
~/mosesdecoder/scripts/tokenizer/lowercase.perl < corpus/untan.clean1.ind > corpus/untan.lowercased.ind

>>>> training Model bahasa
~/srilm/bin/i686/ngram-count -order 3 -interpolate -unk -text corpus/untan.clean.pnk -lm lm/melayu.lm

>>>> training Model translasi
~/mosesdecoder/scripts/training/train-model.perl -root-dir . --corpus corpus/untan.lowercased --f ind --e pnk --lm 0:3:/home/herry/moses-pba/mesin-a/lm/melayu.lm:0

>>>> RUN DECODER
~/mosesdecoder/moses-cmd/src/moses -f model/moses.ini

>>>> Pengujian otomatis dengan BLEU
~/mosesdecoder/moses-cmd/src/moses -f model/moses.ini < indo.txt > pnk.abcd
~/mosesdecoder/scripts/generic/multi-bleu.perl pnk.txt < pnk.abcd


Source code cleaning dan tokenisasi (clean.plx)


#!/usr/bin/perl
# clean.plx
# Tokenisation and cleaning of a plain-text corpus file, one sentence per line.
# by. Herry S
# usage : perl clean.plx fileinput fileoutput
#
# Every input line is normalised as follows:
#   * the line terminator (LF or CRLF) is removed;
#   * empty lines and lines of MAX_LENGTH bytes or more are dropped;
#   * runs of spaces are collapsed to a single space;
#   * "/" and "," are surrounded by single spaces;
#   * a sentence-final "." is removed and every ". " becomes " ";
#   * a sentence-final "?" or "!" is separated from the preceding word;
#   * ( ) " ' and the curly double quotes are surrounded by single spaces;
#   * leading and trailing spaces are removed.
#
# NOTE(review): lines are dropped independently per file.  If this script is
# run separately on the two sides of a parallel corpus, a line dropped on one
# side only would shift the alignment -- confirm the input has already been
# filtered (no empty / over-long lines) before relying on the output.
use warnings;
use strict;

# Exclusive upper bound on the length of a kept line.  The files are read
# without an encoding layer, so length() counts bytes, not characters.
use constant MAX_LENGTH => 1000;

# Tokenise and clean a single line (without its line terminator).
# Takes the raw line, returns the cleaned line; the result may be the empty
# string (e.g. for an input of "." or of spaces only).
sub tokenize_line {
    my ($line) = @_;

    # Collapse space runs and drop the trailing space first, so that the
    # end-of-sentence checks below look at the real last character.
    $line =~ s/ {2,}/ /g;
    $line =~ s/ \z//;

    # Pad "/" and "," with spaces.  Already-padded separators are tightened
    # first so that they are not padded twice.
    for my $sep ('/', ',') {
        $line =~ s/ \Q$sep\E /$sep/g;
        $line =~ s/\Q$sep\E/ $sep /g;
    }

    # Periods: drop the sentence-final one, and turn ". " inside the line
    # into a plain space.  (The original computed the second substitution
    # into a scratch variable and then discarded it.)
    $line =~ s/\.\z//;
    $line =~ s/\. / /g;

    # Detach a sentence-final "?" or "!" from the preceding word.  Any double
    # space this creates is removed by the final collapse below.
    $line =~ s/\?\z/ ?/;
    $line =~ s/!\z/ !/;

    # Pad brackets and quotes.  The curly quotes are matched as literal byte
    # sequences (no "use utf8"), hence an alternation and not a character
    # class, which would match their individual bytes.
    $line =~ s/([()"'])/ $1 /g;
    $line =~ s/(“|”)/ $1 /g;

    # Final clean-up: the padding above can leave double spaces and a space
    # at either end of the line.
    $line =~ s/ {2,}/ /g;
    $line =~ s/\A //;
    $line =~ s/ \z//;

    return $line;
}

# Entry point: clean the file named by the first argument into the file named
# by the second one, reporting progress and the number of lines written on
# STDOUT.  Dies if an argument is missing or a file cannot be read/written.
sub main {
    my ($input, $output) = @_;

    die "Usage: perl clean.plx fileinput fileoutput\n"
        unless defined $input and defined $output;

    # Read the whole input before the output is opened (as the original did),
    # so that using the same path for input and output does not truncate the
    # data before it has been read.
    open my $in, '<', $input or die "Can't read file $input: $!\n";
    my @lines = <$in>;
    close $in or die "Can't close file $input: $!\n";

    open my $out, '>', $output or die "Can't write on file $output: $!\n";
    print "Proses...\n";

    my $total = 0;
    for my $line (@lines) {
        # Strip LF as well as CRLF terminators; a stray "\r" would hide the
        # sentence-final punctuation from tokenize_line().
        $line =~ s/[\r\n]+\z//;

        my $len = length $line;
        next if $len == 0 or $len >= MAX_LENGTH;

        print {$out} tokenize_line($line), "\n"
            or die "Can't write on file $output: $!\n";
        $total++;
    }

    # Buffered write errors only surface at close time.
    close $out or die "Can't write on file $output: $!\n";
    print "Total = $total baris\n";

    return;
}

# Run only when executed as a script, not when loaded by another file.
main(@ARGV) unless caller;


Tidak ada komentar:

Posting Komentar