Acme-Pinoko
view release on metacpan or search on metacpan
lib/Acme/Pinoko.pm view on Meta::CPAN
package Acme::Pinoko;
use 5.008_008;
use strict;
use warnings;
use utf8;
use Carp ();
use Encode ();
use Module::Load ();
use Data::Recursive::Encode ();
use Lingua::JA::Regular::Unicode ();
use Lingua::JA::Halfwidth::Katakana;
our $VERSION = '0.02';
# KyTea ã§ããã©ã«ããããªãã¢ãã«ã使ãå ´åã¯å¤æ´ãå¿
è¦ãªå ´åããã
our $KYTEA_POSTAG_NUM = 0;
our $KYTEA_PRONTAG_NUM = 1;
my @PARSERS = qw/Text::MeCab Text::KyTea/;
my %HIRAGANA_INVALID_POS;
@HIRAGANA_INVALID_POS{qw/å©è© èªå°¾ å¯è© åè© å©åè© å½¢å®¹è© å½¢ç¶è© é£ä½è© æ¥é è© æ¥é è¾ ä»£åè©/} = ();
my %TERMINATOR_CHAR;
@TERMINATOR_CHAR{ split(//, "ã。.ï¼ ã\n\tâ¦â¥!ï¼") } = ();
sub _options
{
return {
parser => 'Text::MeCab',
parser_config => undef,
};
}
sub new
{
my $class = shift;
my %args = (ref $_[0] eq 'HASH' ? %{$_[0]} : @_);
my $options = $class->_options;
for my $key (keys %args)
{
if ( ! exists $options->{$key} ) { Carp::croak "Unknown option: '$key'"; }
else { $options->{$key} = $args{$key}; }
}
Carp::croak "Invalid parser: '$options->{parser}'" if ! grep { $options->{parser} eq $_ } @PARSERS;
Module::Load::load($options->{parser});
my $self = bless $options, $class;
$self->_load_parser;
return $self;
}
sub say
{
my ($self, $text) = @_;
return unless defined $text;
return $self->_to_pinoko( $self->_parse(\$text) );
lib/Acme/Pinoko.pm view on Meta::CPAN
else # Text::KyTea
{
my $kytea;
if ( ! $self->{parser_config} ) { $kytea = Text::KyTea->new({ tagmax => 1 }); }
else { $kytea = Text::KyTea->new($self->{parser_config}); }
$self->{parser} = $kytea;
}
return;
}
sub _parse
{
my ($self, $text_ref) = @_;
my (@surfaces, @poses, @prons);
if ($self->{parser_name} eq 'Text::MeCab')
{
my $encoder = $self->{encoder};
for my $text ( split(/(\s+)/, $$text_ref) )
{
if ($text =~ /\s/)
{
push(@surfaces, $text);
push(@poses, 'è¨å·');
push(@prons, 'UNK');
next;
}
my $encoded_text = $encoder->encode($text);
for (my $node = $self->{parser}->parse($encoded_text); $node; $node = $node->next)
{
next if $node->stat == 2 || $node->stat == 3;
my $surface = $encoder->decode($node->surface);
push(@surfaces, $surface);
my ($pos, $pron) = (split(/,/, $encoder->decode($node->feature), 9))[0,7];
if ( (! defined $pron) || $pron eq '*' )
{
if ($surface =~ /^\p{InKatakana}+$/) { $pron = Lingua::JA::Regular::Unicode::katakana2hiragana($surface); }
else { $pron = 'UNK'; }
}
else { $pron = Lingua::JA::Regular::Unicode::katakana2hiragana($pron); }
push(@poses, $pos);
push(@prons, $pron);
}
}
}
else # Text::KyTea
{
my $results = $self->{parser}->parse($$text_ref);
$results = Data::Recursive::Encode->decode_utf8($results);
for my $result (@{$results})
{
push(@surfaces, $result->{surface});
push(@poses, $result->{tags}[$KYTEA_POSTAG_NUM][0]{feature});
push(@prons, $result->{tags}[$KYTEA_PRONTAG_NUM][0]{feature});
}
}
return (\@surfaces, \@poses, \@prons);
}
sub _to_pinoko
{
my ($self, $surfaces_ref, $poses_ref, $prons_ref) = @_;
my $ret = '';
for my $i (0 .. $#{$prons_ref})
{
my $surf = $surfaces_ref->[$i];
if (
$poses_ref->[$i] eq 'è¨å·'
|| $poses_ref->[$i] eq 'è£å©è¨å·'
|| ( $prons_ref->[$i] eq 'UNK' && $surf =~ /[^\p{InHalfwidthKatakana}]/ )
|| $surf =~ /^[a-zA-Zï½-ï½ï¼¡-Z0-9ï¼-ï¼]+$/
)
{
$ret .= $surf;
}
elsif ($surf =~ /[^\p{InHiragana}]/)
{
if (
$surf eq 'æè¡'
|| $surf eq 'ç¬'
|| $surf eq 'ã·ã¼ã¦ã¼ã'
|| $surf eq 'ã¢ã©ãã³ãã¥'
|| $surf eq 'ã·ã¼ã¦ã¼ãã¢ã©ãã³ãã¥'
|| $surf =~ /^ã¢ããã§ã³ããªã±ã¼*/
)
{
$ret .= $surf;
}
else
{
# e.g. ãï½±ããã«åè¡å½ã®å·ãã®å ´å
# @surfaces ã®ä¸èº«ã¯ä»¥ä¸ã®éã
# [0]: ï½±ããã«
# [1]: åè¡å½
# [2]: ã®
# [3]: å·
my @surfaces = grep { length } split(/([0-9ï¼-ï¼]*[\p{Han}ã±ã¶]+[0-9ï¼-ï¼]*|[^\p{Han}]+)/, $surf);
my (@kanji_prons, $regexp);
for my $surface (@surfaces)
{
if ($surface =~ /[0-9ï¼-ï¼]*[\p{Han}ã±ã¶]/) { $regexp .= "(.+)"; }
else
lib/Acme/Pinoko.pm view on Meta::CPAN
}
else # ã
{
if ($prev_surface eq 'ã' || $prev_surface eq 'ã') { $ret .= 'ã®ã'; }
else { $ret .= 'ã'; }
}
}
else { $ret .= $pron; }
}
}
else { $ret .= $pron; }
}
}
return $self->pinoko($ret);
}
sub pinoko
{
local $_ = $_[1];
return unless defined $_;
s/奥ãã/ãããã/g;
s/æè¡/ã·ã¦ã/g;
s/ãã
ãã
ã¤/ããã¤/g;
s/æé¬±/ã¦ã¼ã/g;
s/æç¾¤/ããã°ã³/g;
s/ã¦ã½ãã/ã¦ã½ãã¥ã/g; # MeCabå°ç¨
s/ããããã/ããã¡ãã¡ãã/g;
s/ã/ã¡ã
/g;
s/ã¥/ãã
/g;
s/ã(?=ã)/ãã
/g;
s/ã(?!ã¼)/ãã
/g;
s/ã£ã¤/ã£ã¡ã
/g;
s/ãã©/ãã/g;
s/ãã®ã/ããã/g;
s/ãã©ã/ããã/g;
s/ãªãã /ãªãã/g;
s/ããã§/ããã/g;
s/ããã[ãã¼]/ããã¡/g;
s/ããã¤ã/ããã¡ã
ã/g;
s/(?<!ã®ã|ãã®)ã(?!ã)/ã¡ã/g; # ã®ãã, ãã®ã, ãã ã§ãªããã° ã -> ã¡ã
s/(?<!ãã)ã(?!ã¡ã|ã¡ã
|ã§ã)/ã/g; # ããã, ãã¡ã, ãã¡ã
, ãã§ã ã§ãªããã° ã -> ã
s/ã(?!ã|ã¡)/ã¡/g; # ãã, ãã¡ ã§ãªããã° ã -> ã¡
s/ãã§ãã¼?/ããã/g;
s/ãããã§ãã¼?/ããããã¼/g;
s/ã(?!ã|ã
|ã)/ã/g;
s/(?<!ãª)ã®ãã®ã/ã®ãã/g;
tr/ã§ããã/ãããã/;
s/ã©(?!ã)/ã/g;
s/ã (?!ã®ã|ã|ã)/ã/g;
$_;
}
1;
__END__
=encoding utf8
=head1 NAME
Acme::Pinoko - Acchonburike!
=for test_synopsis
my (%config);
=head1 SYNOPSIS
use Acme::Pinoko;
use utf8;
my $pinoko = Acme::Pinoko->new(%config);
print $pinoko->say('ããã³ï¼ï¼ã®ã¬ãã£ãªã®ã');
# -> ããã³ï¼ï¼ã®ã¬ã¬ã¤ãªã®ãã
=head1 DESCRIPTION
Acme::Pinoko converts standard Japanese text to Pinoko-ish Japanese text.
Pinoko is a Japanese manga character. She speaks with a lisp and
therefore her spoken Japanese is slightly different from standard Japanese.
=head1 METHODS
=head2 $pinoko = Acme::Pinoko->new(%config)
Creates a new Acme::Pinoko instance.
my $pinoko = Acme::Pinoko->new(
parser => 'Text::MeCab' or 'Text::KyTea', # default is 'Text::MeCab'
parser_config => \%parser_config, # default is undef
);
=head2 $pinoko_ish_text = $pinoko->say($text)
Pinoko says $text.
=head1 AUTHOR
pawa E<lt>pawapawa@cpan.orgE<gt>
=head1 SEE ALSO
L<https://en.wikipedia.org/wiki/Black_Jack_%28manga%29#Characters>
=head1 LICENSE
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
( run in 1.202 second using v1.01-cache-2.11-cpan-fe3c2283af0 )