Acme-Pinoko

 view release on metacpan or  search on metacpan

lib/Acme/Pinoko.pm  view on Meta::CPAN

            }
        }
    }
    else # Text::KyTea
    {
        my $results = $self->{parser}->parse($$text_ref);

        $results = Data::Recursive::Encode->decode_utf8($results);

        for my $result (@{$results})
        {
            push(@surfaces, $result->{surface});
            push(@poses,    $result->{tags}[$KYTEA_POSTAG_NUM][0]{feature});
            push(@prons,    $result->{tags}[$KYTEA_PRONTAG_NUM][0]{feature});
        }
    }

    return (\@surfaces, \@poses, \@prons);
}

sub _to_pinoko
{
    my ($self, $surfaces_ref, $poses_ref, $prons_ref) = @_;

    my $ret = '';

    for my $i (0 .. $#{$prons_ref})
    {
        my $surf = $surfaces_ref->[$i];

        if (
             $poses_ref->[$i] eq '記号'
         ||  $poses_ref->[$i] eq '補助記号'
         || ( $prons_ref->[$i] eq 'UNK' && $surf =~ /[^\p{InHalfwidthKatakana}]/ )
         || $surf =~ /^[a-zA-Za-zA-Z0-90-9]+$/
        )
        {
            $ret .= $surf;
        }
        elsif ($surf =~ /[^\p{InHiragana}]/)
        {
            if (
                $surf eq '手術'
             || $surf eq '笑'
             || $surf eq 'シーウーノ'
             || $surf eq 'アラマンチュ'
             || $surf eq 'シーウーノアラマンチュ'
             || $surf =~ /^アッチョンブリケー*/
            )
            {
                $ret .= $surf;
            }
            else
            {
                # e.g. 「アめりカ合衆国の州」の場合
                # @surfaces の中身は以下の通り
                # [0]: アめりカ
                # [1]: 合衆国
                # [2]: の
                # [3]: å·ž
                my @surfaces = grep { length } split(/([0-90-9]*[\p{Han}ケヶ]+[0-90-9]*|[^\p{Han}]+)/, $surf);

                my (@kanji_prons, $regexp);

                for my $surface (@surfaces)
                {
                    if ($surface =~ /[0-90-9]*[\p{Han}ケヶ]/) { $regexp .= "(.+)"; }
                    else
                    {
                        if ($self->{parser_name} eq 'Text::MeCab')
                        {
                            $regexp .= Lingua::JA::Regular::Unicode::katakana2hiragana($surface);
                        }
                        else # Text::KyTea
                        {
                            if ($surface =~ /(?:ず|づ)/)
                            {
                                my $pron = Lingua::JA::Regular::Unicode::katakana2hiragana($surface);
                                my $du = $pron; $du =~ tr/ず/づ/;
                                my $zu = $pron; $zu =~ tr/づ/ず/;

                                $regexp .= "(?:$du|$zu)";
                            }
                            else
                            {
                                if ($surface =~ /[あ-おぁ-ぉア-オァ-ォ]{1}/)
                                {
                                    $regexp .= "[" . Lingua::JA::Regular::Unicode::katakana2hiragana($surface) . "|ー]";
                                }
                                else { $regexp .= Lingua::JA::Regular::Unicode::katakana2hiragana($surface); }
                            }
                        }
                    }
                }

                if ($regexp =~ /\(\.\+\)/)
                {
                    $regexp =~ tr/\x{005F}\x{3000}\x{3095}/\x{FF3F}\x{FF3F}\x{304B}/; # 「_ ゕ」-> 「__か」
                    @kanji_prons = $prons_ref->[$i] =~ /$regexp/;
                }

                for my $surface (@surfaces)
                {
                    if ($surface =~ /\p{Han}/)
                    {
                        my $pron        = shift @kanji_prons;
                        my $pinoko_pron = $self->pinoko($pron);

                        if ( (! defined $pinoko_pron) || $pron eq $pinoko_pron ) { $ret .= $surface; }
                        else                                                     { $ret .= $pron;    }
                    }
                    else
                    {
                        if ($surface =~ /[^\p{InHalfwidthKatakana}]/)
                        {
                            if ($surface =~ /^\p{InKatakana}+$/)
                            {
                                my $pron = Lingua::JA::Regular::Unicode::katakana2hiragana($surface);
                                $ret .= Lingua::JA::Regular::Unicode::hiragana2katakana($self->pinoko($pron));
                            }
                            else { $ret .= $surface; }



( run in 1.407 second using v1.01-cache-2.11-cpan-5735350b133 )