Acme-Pinoko
view release on metacpan or search on metacpan
lib/Acme/Pinoko.pm view on Meta::CPAN
package Acme::Pinoko;
use 5.008_008;
use strict;
use warnings;
use utf8;
use Carp ();
use Encode ();
use Module::Load ();
use Data::Recursive::Encode ();
use Lingua::JA::Regular::Unicode ();
use Lingua::JA::Halfwidth::Katakana;
our $VERSION = '0.02';
# KyTea ã§ããã©ã«ããããªãã¢ãã«ã使ãå ´åã¯å¤æ´ãå¿
è¦ãªå ´åããã
our $KYTEA_POSTAG_NUM = 0;
our $KYTEA_PRONTAG_NUM = 1;
my @PARSERS = qw/Text::MeCab Text::KyTea/;
my %HIRAGANA_INVALID_POS;
@HIRAGANA_INVALID_POS{qw/å©è© èªå°¾ å¯è© åè© å©åè© å½¢å®¹è© å½¢ç¶è© é£ä½è© æ¥é è© æ¥é è¾ ä»£åè©/} = ();
my %TERMINATOR_CHAR;
@TERMINATOR_CHAR{ split(//, "ã。.ï¼ ã\n\tâ¦â¥!ï¼") } = ();
sub _options
{
return {
parser => 'Text::MeCab',
parser_config => undef,
};
}
sub new
{
my $class = shift;
my %args = (ref $_[0] eq 'HASH' ? %{$_[0]} : @_);
my $options = $class->_options;
for my $key (keys %args)
{
if ( ! exists $options->{$key} ) { Carp::croak "Unknown option: '$key'"; }
else { $options->{$key} = $args{$key}; }
}
Carp::croak "Invalid parser: '$options->{parser}'" if ! grep { $options->{parser} eq $_ } @PARSERS;
Module::Load::load($options->{parser});
my $self = bless $options, $class;
$self->_load_parser;
return $self;
}
sub say
{
my ($self, $text) = @_;
return unless defined $text;
return $self->_to_pinoko( $self->_parse(\$text) );
}
sub _load_parser
{
my ($self) = @_;
$self->{parser_name} = delete $self->{parser};
if ($self->{parser_name} eq 'Text::MeCab')
{
my $mecab;
if ( ! $self->{paser_config} ) { $mecab = Text::MeCab->new; }
else { $mecab = Text::MeCab->new($self->{parser_config}); }
$self->{parser} = $mecab;
$self->{encoder} = Encode::find_encoding(Text::MeCab::ENCODING());
}
else # Text::KyTea
{
my $kytea;
if ( ! $self->{parser_config} ) { $kytea = Text::KyTea->new({ tagmax => 1 }); }
else { $kytea = Text::KyTea->new($self->{parser_config}); }
$self->{parser} = $kytea;
}
return;
}
sub _parse
{
my ($self, $text_ref) = @_;
my (@surfaces, @poses, @prons);
if ($self->{parser_name} eq 'Text::MeCab')
{
my $encoder = $self->{encoder};
for my $text ( split(/(\s+)/, $$text_ref) )
{
if ($text =~ /\s/)
{
push(@surfaces, $text);
push(@poses, 'è¨å·');
push(@prons, 'UNK');
( run in 2.559 seconds using v1.01-cache-2.11-cpan-ceb78f64989 )