Acme-Shukugawa-Atom
view release on metacpan or search on metacpan
lib/Acme/Shukugawa/Atom.pm view on Meta::CPAN
# $Id: /mirror/coderepos/lang/perl/Acme-Shukugawa-Atom/trunk/lib/Acme/Shukugawa/Atom.pm 47728 2008-03-14T01:07:28.622095Z daisuke $
package Acme::Shukugawa::Atom;
use strict;
use warnings;
use base qw(Class::Accessor::Fast);
use utf8;
use Encode qw(decode_utf8);
use File::ShareDir;
use Text::MeCab;
use YAML ();
our $VERSION = '0.00004';
__PACKAGE__->mk_accessors($_) for qw(custom_words);
# Special case handling -- this could be optimized further
# put it in a sharefile later
our ($CONFIG, @DEFAULT_WORDS, $RE_EXCEPTION, $RE_SMALL, $RE_SYLLABLE, $RE_NBAR);
BEGIN
{
my $config = YAML::LoadFile(
$CONFIG || File::ShareDir::module_file(__PACKAGE__, 'config.yaml') );
$RE_SMALL = decode_utf8("[ã£ã¥ã§ãã¼]");
$RE_SYLLABLE = decode_utf8("(?:.$RE_SMALL?)");
$RE_NBAR = decode_utf8("^ã³ã¼");
@DEFAULT_WORDS = map {
(decode_utf8($_->[0]), decode_utf8($_->[1]))
} @{ $config->{custom_words} || [] };
}
sub _create_exception_re
{
my $self = shift;
my $custom = $self->custom_words;
return decode_utf8(join("|",
map { $custom->[$_ * 2 + 1] } (0..(scalar(@$custom) - 1)/2) ));
}
sub translate
{
my $self = shift;
my $string = decode_utf8(shift);
if (! ref $self) {
$self = $self->new({ custom_words => \@DEFAULT_WORDS, @_ });
}
# Create local RE_EXCEPTION
local $RE_EXCEPTION = $self->_create_exception_re;
$self->preprocess(\$string);
$self->runthrough(\$string);
$self->postprocess(\$string);
return $string;
}
sub preprocess
{
my ($self, $strref) = @_;
my $custom = $self->custom_words;
for(0..(scalar(@$custom) - 1)/2) {
my $pattern = $custom->[$_ * 2];
my $replace = $custom->[$_ * 2 + 1];
$$strref =~ s/$pattern/$replace/g;
}
}
sub runthrough
{
my ($self, $strref) = @_;
my $mecab = Text::MeCab->new;
# First, make it all katakana, except for where the surface is already
# in hiragana
my $ret = '';
foreach my $text (split(/($RE_EXCEPTION|\s+)/, $$strref)) {
if ($text =~ /$RE_EXCEPTION/) {
$ret .= $text;
next;
}
if ($text !~ /\S/) {
$ret .= $text;
next;
}
foreach (my $node = $mecab->parse($text); $node; $node = $node->next) {
next unless $node->surface;
my $surface = decode_utf8($node->surface);
my $feature = decode_utf8($node->feature);
my ($type, $yomi) = (split(/,/, $feature))[0,8];
# warn "$surface -> $type, $yomi";
if ($surface eq '䏿') {
$ret .= 'ãã¤ã¦ã¼';
next;
}
if ($type eq 'åè©' && $node->next) {
# å©åè©ãè¨ç®ã«å
¥ãã
my $next_feature = decode_utf8($node->next->feature);
my ($next_type, $next_yomi) = (split(/,/, $next_feature))[0,8];
if ($next_type eq 'å©åè©') {
$yomi .= $next_yomi;
$node = $node->next;
}
}
if ($type =~ /å¯è©|å©åè©|形容è©|æ¥ç¶è©|å©è©/ && $surface =~ /^\p{InHiragana}+$/) {
$ret .= $surface;
} elsif ($yomi) {
$ret .= $self->atomize($yomi) || $surface;
} else {
$ret .= $surface;
}
}
}
$$strref = $ret;
}
sub postprocess {}
# ã·ã¼ã¹ã¼ã«ã¼ã«
# 寿å¸âã·ã¼ã¹ã¼
# ã³ããæå¾ã ã£ããã²ã£ããè¿ããªã
sub apply_shisu_rule
{
my ($self, $yomi) = @_;
return $yomi if $yomi =~ s{^($RE_SYLLABLE)($RE_SYLLABLE)$}{
my ($a, $b) = ($1, $2);
$a =~ s/ã¼$//;
$b =~ s/ã¼$//;
"${b}ã¼${a}ã¼";
}e;
return;
}
# ã¯ã¤ãã¼ã«ã¼ã«
# ãã¯ã¤âã¯ã¤ãã¼
sub apply_waiha_rule
{
my ($self, $yomi) = @_;
# warn "WAIHA $yomi";
if ($yomi =~ s/^(${RE_SYLLABLE}[$RE_NBAR]?)([^$RE_NBAR].)$/$2$1/) {
$yomi =~ s/(^.[^ã¼].*[^ã¼])$/$1ã¼/;
return $yomi;
}
return;
}
# ã¯ãªããã«ã¼ã«
# ã³ã£ããâã¯ãªãã
sub apply_kuribitsu_rule
{
my ($self, $yomi) = @_;
# warn "KURIBITSU $yomi";
if ($yomi =~ s/^(${RE_SYLLABLE}.)([^$RE_NBAR]${RE_SYLLABLE}$)/$2$1/) {
return $yomi;
}
return;
}
sub atomize
{
my ($self, $yomi) = @_;
$yomi =~ s/ã¼+/ã¼/g;
# Length
my $word_length = length($yomi);
my $length = $word_length - ($yomi =~ /$RE_SMALL/g);
if ($length == 3 && $yomi =~ s/^(${RE_SYLLABLE})ã/${1}ã/) {
# warn "Special rule!";
$length = 4;
}
my $done = 0;
# warn "$yomi LENGTH: $length";
if ($length == 2) {
my $tmp = $self->apply_shisu_rule($yomi);
( run in 1.456 second using v1.01-cache-2.11-cpan-5b529ec07f3 )