Acme-Pinoko
view release on metacpan or search on metacpan
lib/Acme/Pinoko.pm view on Meta::CPAN
}
}
}
else # Text::KyTea
{
my $results = $self->{parser}->parse($$text_ref);
$results = Data::Recursive::Encode->decode_utf8($results);
for my $result (@{$results})
{
push(@surfaces, $result->{surface});
push(@poses, $result->{tags}[$KYTEA_POSTAG_NUM][0]{feature});
push(@prons, $result->{tags}[$KYTEA_PRONTAG_NUM][0]{feature});
}
}
return (\@surfaces, \@poses, \@prons);
}
sub _to_pinoko
{
my ($self, $surfaces_ref, $poses_ref, $prons_ref) = @_;
my $ret = '';
for my $i (0 .. $#{$prons_ref})
{
my $surf = $surfaces_ref->[$i];
if (
$poses_ref->[$i] eq 'è¨å·'
|| $poses_ref->[$i] eq 'è£å©è¨å·'
|| ( $prons_ref->[$i] eq 'UNK' && $surf =~ /[^\p{InHalfwidthKatakana}]/ )
|| $surf =~ /^[a-zA-Zï½-ï½ï¼¡-Z0-9ï¼-ï¼]+$/
)
{
$ret .= $surf;
}
elsif ($surf =~ /[^\p{InHiragana}]/)
{
if (
$surf eq 'æè¡'
|| $surf eq 'ç¬'
|| $surf eq 'ã·ã¼ã¦ã¼ã'
|| $surf eq 'ã¢ã©ãã³ãã¥'
|| $surf eq 'ã·ã¼ã¦ã¼ãã¢ã©ãã³ãã¥'
|| $surf =~ /^ã¢ããã§ã³ããªã±ã¼*/
)
{
$ret .= $surf;
}
else
{
# e.g. 「ｱめりカ合衆国の州」の場合
# @surfaces の中身は以下の通り
# [0]: ｱめりカ
# [1]: 合衆国
# [2]: の
# [3]: 州
my @surfaces = grep { length } split(/([0-9ï¼-ï¼]*[\p{Han}ã±ã¶]+[0-9ï¼-ï¼]*|[^\p{Han}]+)/, $surf);
my (@kanji_prons, $regexp);
for my $surface (@surfaces)
{
if ($surface =~ /[0-9ï¼-ï¼]*[\p{Han}ã±ã¶]/) { $regexp .= "(.+)"; }
else
{
if ($self->{parser_name} eq 'Text::MeCab')
{
$regexp .= Lingua::JA::Regular::Unicode::katakana2hiragana($surface);
}
else # Text::KyTea
{
if ($surface =~ /(?:ã|ã¥)/)
{
my $pron = Lingua::JA::Regular::Unicode::katakana2hiragana($surface);
my $du = $pron; $du =~ tr/ã/ã¥/;
my $zu = $pron; $zu =~ tr/ã¥/ã/;
$regexp .= "(?:$du|$zu)";
}
else
{
if ($surface =~ /[ã-ãã-ãã¢-ãªã¡-ã©]{1}/)
{
$regexp .= "[" . Lingua::JA::Regular::Unicode::katakana2hiragana($surface) . "|ã¼]";
}
else { $regexp .= Lingua::JA::Regular::Unicode::katakana2hiragana($surface); }
}
}
}
}
if ($regexp =~ /\(\.\+\)/)
{
$regexp =~ tr/\x{005F}\x{3000}\x{3095}/\x{FF3F}\x{FF3F}\x{304B}/; # ã_ããã-> ã__ãã
@kanji_prons = $prons_ref->[$i] =~ /$regexp/;
}
for my $surface (@surfaces)
{
if ($surface =~ /\p{Han}/)
{
my $pron = shift @kanji_prons;
my $pinoko_pron = $self->pinoko($pron);
if ( (! defined $pinoko_pron) || $pron eq $pinoko_pron ) { $ret .= $surface; }
else { $ret .= $pron; }
}
else
{
if ($surface =~ /[^\p{InHalfwidthKatakana}]/)
{
if ($surface =~ /^\p{InKatakana}+$/)
{
my $pron = Lingua::JA::Regular::Unicode::katakana2hiragana($surface);
$ret .= Lingua::JA::Regular::Unicode::hiragana2katakana($self->pinoko($pron));
}
else { $ret .= $surface; }
( run in 1.407 second using v1.01-cache-2.11-cpan-5735350b133 )