Treex-JA
view release on metacpan or search on metacpan
lib/Treex/Tool/Parser/JDEPP.pm view on Meta::CPAN
package Treex::Tool::Parser::JDEPP;
$Treex::Tool::Parser::JDEPP::VERSION = '0.13095';
use strict;
use warnings;
use Moose;
use Treex::Core::Common;
use Treex::Core::Config;
use Treex::Tool::ProcessUtils;
use Treex::Core::Resource;
# Model directory setting for the parser. It is required at construction time,
# but BUILD does not currently read it (see the TODO in BUILD): the model that
# is actually used is whatever the jdepp installation itself is configured with.
has model_dir => ( isa => 'Str', is => 'rw', required => 1 );
# Moose construction hook: resolves the jdepp executable from the Treex share
# and starts it as a child process via Treex::Tool::ProcessUtils::bipipe.
# The resulting handles and process id are stored on the object under the
# keys 'reader', 'writer' and 'pid'.
sub BUILD {
    my $self = shift;

    # TODO find architecture independent solution
    my $jdepp_binary = require_file_from_share(
        'installed_tools/parser/jdepp/bin/jdepp',
        ref($self)
    );

    # TODO: fix setting up of the model_dir via Treex (see W2A::JA::ParseJDEPP).
    # For now $self->model_dir is NOT handed to the parser; the model can only
    # be selected through jdepp's own configuration, and in the worst case the
    # default model is used.
    # The command line redirects the parser's stderr to /dev/null.
    my $command = "$jdepp_binary 2>/dev/null";

    # start JDEPP parser
    my ( $from_parser, $to_parser, $child_pid )
        = Treex::Tool::ProcessUtils::bipipe( $command, ':encoding(utf-8)' );

    @{$self}{qw(reader writer pid)} = ( $from_parser, $to_parser, $child_pid );

    return;
}
sub parse_sentence {
my ( $self, $forms_rf, $tags_rf ) = @_;
if ( ref($forms_rf) ne "ARRAY" or ref($tags_rf) ne "ARRAY" ) {
log_fatal('Both arguments must be array references.');
}
if ( $#{$forms_rf} != $#{$tags_rf} or @$forms_rf == 0 ) {
log_warn "FORMS: @$forms_rf\n";
log_warn "TAGS: @$tags_rf\n";
log_fatal('Both arguments must be references to nonempty arrays of equal length.');
}
if ( my @ret = grep { $_ =~ /^\s+$/ } ( @{$forms_rf}, @{$tags_rf} ) ) {
log_debug("@ret");
log_fatal('Elements of argument arrays must not be empty and must not contain white-space characters');
}
my @parents;
my $input = "";
my $writer = $self->{writer};
my $reader = $self->{reader};
foreach my $form ( @$forms_rf ) {
my $tag = shift @$tags_rf;
$tag =~ s{-}{,}g;
$input .= $form . "\t" . $tag . "\n";
}
$input .= "EOS\n";
print $writer $input;
my $line = <$reader>;
#JDEPP uses different token ordering than Treex, because it creates "bunsetsus" out of multiple tokens (parsing is done on these "bunsetsus"
my @bun_heads;
my $current_token = 1;
( run in 3.677 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )