Acme-Pinoko

 view release on metacpan or  search on metacpan

lib/Acme/Pinoko.pm  view on Meta::CPAN

package Acme::Pinoko;

use 5.008_008;
use strict;
use warnings;
use utf8;

use Carp   ();
use Encode ();
use Module::Load ();
use Data::Recursive::Encode ();
use Lingua::JA::Regular::Unicode ();
use Lingua::JA::Halfwidth::Katakana;

our $VERSION = '0.02';

# KyTea でデフォルトじゃないモデルを使う場合は変更が必要な場合もある
our $KYTEA_POSTAG_NUM  = 0;
our $KYTEA_PRONTAG_NUM = 1;

my @PARSERS = qw/Text::MeCab Text::KyTea/;

my %HIRAGANA_INVALID_POS;
@HIRAGANA_INVALID_POS{qw/助詞 語尾 副詞 動詞 助動詞 形容詞 形状詞 連体詞 接頭詞 接頭辞 代名詞/} = ();

my %TERMINATOR_CHAR;
@TERMINATOR_CHAR{ split(//, "。。..  \n\t…‥!!") } = ();

sub _options
{
    return {
        parser        => 'Text::MeCab',
        parser_config => undef,
    };
}

sub new
{
    my $class = shift;
    my %args  = (ref $_[0] eq 'HASH' ? %{$_[0]} : @_);

    my $options = $class->_options;

    for my $key (keys %args)
    {
        if ( ! exists $options->{$key} ) { Carp::croak "Unknown option: '$key'"; }
        else                             { $options->{$key} = $args{$key};       }
    }

    Carp::croak "Invalid parser: '$options->{parser}'" if ! grep { $options->{parser} eq $_ } @PARSERS;

    Module::Load::load($options->{parser});

    my $self = bless $options, $class;

    $self->_load_parser;

    return $self;
}

sub say
{
    my ($self, $text) = @_;

    return unless defined $text;
    return $self->_to_pinoko( $self->_parse(\$text) );
}

sub _load_parser
{
    my ($self) = @_;

    $self->{parser_name} = delete $self->{parser};

    if ($self->{parser_name} eq 'Text::MeCab')
    {
        my $mecab;

        if ( ! $self->{paser_config} ) { $mecab = Text::MeCab->new;                         }
        else                           { $mecab = Text::MeCab->new($self->{parser_config}); }

        $self->{parser}  = $mecab;
        $self->{encoder} = Encode::find_encoding(Text::MeCab::ENCODING());
    }
    else # Text::KyTea
    {
        my $kytea;

        if ( ! $self->{parser_config} ) { $kytea = Text::KyTea->new({ tagmax => 1 });        }
        else                            { $kytea = Text::KyTea->new($self->{parser_config}); }

        $self->{parser} = $kytea;
    }

    return;
}

sub _parse
{
    my ($self, $text_ref) = @_;

    my (@surfaces, @poses, @prons);

    if ($self->{parser_name} eq 'Text::MeCab')
    {
        my $encoder = $self->{encoder};

        for my $text ( split(/(\s+)/, $$text_ref) )
        {
            if ($text =~ /\s/)
            {
                push(@surfaces, $text);
                push(@poses, '記号');
                push(@prons, 'UNK');



( run in 2.559 seconds using v1.01-cache-2.11-cpan-ceb78f64989 )