Acme-Lingua-ZH-Remix
view release on metacpan or search on metacpan
lib/Acme/Lingua/ZH/Remix.pm view on Meta::CPAN
$corpus =~ s/^\#.*$//gm;
# Strip all whitespace from the corpus
$corpus =~ s/(\s|ã)*//gs;
# Delete certain punctuation sequences entirely
$corpus =~ s/(ââ|ââ)//gs;
my @xc = split /(?:ï¼(.+?)ï¼|ï¼?ã(.+?)ã|ã(.+?)ã|â(.+?)â)/, $corpus;
my @phrases = uniq sort grep /.(ï¼|ã|ï¼|ï¼)$/,
map {
my @x = split /(ï¼|ã|ï¼|ï¼)/, $_;
my @r = ();
while (@x) {
my $s = shift @x;
my $p = shift @x or next;
$s =~ s/^(ï¼|ã|ï¼|ï¼|\s)+//;
push @r, "$s$p";
}
@r;
} map {
s/^\s+//;
s/\s+$//;
s/^(.+?) //;
$_;
} grep { $_ } @xc;
return @phrases;
}
=head2 feed($corpus_text)
( run in 0.262 second using v1.01-cache-2.11-cpan-0d8aa00de5b )