Acme-Lingua-ZH-Remix

 view release on metacpan or  search on metacpan

lib/Acme/Lingua/ZH/Remix.pm  view on Meta::CPAN

    $corpus =~ s/^\#.*$//gm;

    # Remove all whitespace (it is deleted outright, not collapsed)
    $corpus =~ s/(\s| )*//gs;

    # Remove long-dash sequences (—— and ──) from the corpus
    $corpus =~ s/(——|──)//gs;

    my @xc = split /(?:((.+?))|:?「(.+?)」|〔(.+?)〕|“(.+?)”)/, $corpus;
    my @phrases = uniq sort grep /.(,|。|?|!)$/,
        map {
            my @x = split /(,|。|?|!)/, $_;
            my @r = ();
            while (@x) {
                my $s = shift @x;
                my $p = shift @x or next;

                $s =~ s/^(,|。|?|!|\s)+//;
                push @r, "$s$p";
            }
            @r;
        } map {
            s/^\s+//;
            s/\s+$//;
            s/^(.+?) //;
            $_;
        } grep { $_ } @xc;

    return @phrases;
}

=head2 feed($corpus_text)



( run in 0.226 second using v1.01-cache-2.11-cpan-0d8aa00de5b )