App-sdif

 view release on metacpan or  search on metacpan

lib/App/cdif/Command/Mecab.pm  view on Meta::CPAN

package App::cdif::Command::Mecab;

use parent "App::cdif::Command";

use v5.14;
use warnings;
use utf8;
use Carp;
use Data::Dumper;

our $debug;

##
## wordlist(TEXT)
##
## Run TEXT through the external "mecab" command and return the pieces
## of its output as a list.  Pieces that do not start with a word
## character are split further by uniqchar(), and every end-of-sentence
## line printed by mecab is returned as "\n".
##
sub wordlist {
    my($obj, $text) = @_;

    ##
    ## mecab ignores trailing spaces.  To keep them, append a marker
    ## that occurs nowhere in the text to every line ending in spaces
    ## or tabs; the marker itself is dropped again from the result.
    ##
    my $marker;
    if ($text =~ /[ \t]+$/m) {
	$marker = "15570"."67583";
	$marker++ while $text =~ /$marker/;
	$text =~ s/[ \t]+\K$/$marker/mg;
    }

    ## End-of-sentence label, bumped until it does not occur in the text.
    my $eos = "EOS" . "000";
    $eos++ while $text =~ /$eos/;

    ## Ask mecab for one node per output line, and for $eos on a line
    ## of its own at every end of sentence.
    my @mecab = (
	'mecab',
	'--node-format', '%M\\n',
	'--eos-format',  "$eos\\n",
    );
    my $result = $obj->command(@mecab)->setstdin($text)->update->data;
    warn $result =~ s/^/MECAB: /mgr if $debug;

    ## Each output line yields two pieces: its leading non-word
    ## characters, and whatever follows them.
    my @words;
    for my $piece ($result =~ /^([^\w\n]*+)(.*)\n/mg) {
	next unless length $piece;
	my @tokens = $piece =~ /\A\w/ ? $piece : uniqchar($piece);
	for my $token (@tokens) {
	    next if defined $marker and $token eq $marker;
	    push @words, $token eq $eos ? "\n" : $token;
	}
    }
    return @words;
}

##
## uniqchar(STRING, ...)
##
## Break every argument into runs made of one grapheme cluster (\X)
## repeated any number of times, and return all runs in order.
## For example "aabbbc" gives ("aa", "bbb", "c").
##
sub uniqchar {
    my @runs;
    foreach my $str (@_) {
	# ${^MATCH} (enabled by /p) holds the whole run just matched.
	push @runs, ${^MATCH} while $str =~ /(\X)\g{-1}*/pg;
    }
    return @runs;
}

1;



( run in 0.499 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )