Lingua-TFIDF
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
lib/Lingua/TFIDF/WordSegmenter/JA/MeCab.pm view on Meta::CPAN
package Lingua::TFIDF::WordSegmenter::JA::MeCab;
# ABSTRACT: Word segmenter for Japanese documents
use strict;
use warnings;
use Encode qw//;
use Smart::Args;
use Text::MeCab;
# Encode object for the character encoding named by Text::MeCab::ENCODING.
# segment() uses it to encode text before handing it to MeCab and to decode
# the surfaces MeCab returns.
my $mecab_encoding = Encode::find_encoding(Text::MeCab::ENCODING);
# Constructor.
#
# Named arguments (validated by Smart::Args):
#   mecab - optional Text::MeCab instance to use for parsing. When it is
#           omitted, a fresh Text::MeCab->new is created.
#
# Returns the new segmenter object.
sub new {
    args
        my $class => 'ClassName',
        my $mecab => +{ isa => 'Text::MeCab', optional => 1 };

    # Fall back to a default tagger only when the caller supplied none.
    my $tagger = defined $mecab ? $mecab : Text::MeCab->new;

    my $self = +{ mecab => $tagger };
    return bless $self, $class;
}
# Read-only accessor: returns the value stored under the object's "mecab" key.
sub mecab {
    my ($self) = @_;
    return $self->{mecab};
}
# Splits a document into words with MeCab and returns an iterator over them.
#
# Positional arguments (validated by Smart::Args):
#   $self     - the segmenter instance.
#   $document - the text to segment: a string, or a reference to one.
#
# Returns a code reference. Each call yields the next word, decoded with
# $mecab_encoding. Once every word has been handed out it returns nothing
# (undef in scalar context), so it can drive
#   while (defined(my $word = $iter->())) { ... }
sub segment {
    args_pos
        my $self,
        my $document => 'Ref | Str';

    my $input = $mecab_encoding->encode(ref $document ? $$document : $document);

    # Drain the node list now rather than lazily inside the iterator.
    # Text::MeCab nodes are owned by the tagger, so a later parse() on the
    # same tagger (e.g. a second segment() call before this iterator is
    # exhausted) can invalidate them; finishing the walk here also keeps
    # $input alive for as long as the nodes are read.
    # NOTE(review): node lifetime as described in the Text::MeCab docs -
    # confirm against the installed version.
    my @words;
    for (
        my $node = $self->mecab->parse($input);
        $node and $node->stat != Text::MeCab::MECAB_EOS_NODE;
        $node = $node->next
    ) {
        push @words, $mecab_encoding->decode($node->surface);
    }

    return sub {
        return unless @words;
        return shift @words;
    };
}
1;
__END__
=pod
=encoding UTF-8
=head1 NAME
Lingua::TFIDF::WordSegmenter::JA::MeCab - Word segmenter for Japanese documents
=head1 VERSION
version 0.01
=head1 SYNOPSIS
use utf8;
use Lingua::TFIDF::WordSegmenter::JA::MeCab;
my $segmenter = Lingua::TFIDF::WordSegmenter::JA::MeCab->new;
my $iter = $segmenter->segment('思い出せ、思い出せ、11月5日を...');
while (defined(my $word = $iter->())) { ... }
=head1 DESCRIPTION
This class is a word segmenter for documents written in Japanese.
=head1 METHODS
=head2 new([ mecab => Text::MeCab->new ])
Constructor.
=head2 segment($document | \$document)
Executes word segmentation on the given C<$document> and returns a word iterator.
=head1 SEE ALSO
view all matches for this distributionview release on metacpan - search on metacpan
( run in 0.693 second using v1.00-cache-2.02-grep-82fe00e-cpan-1925d2aa809 )