Treex-JA

 view release on metacpan or  search on metacpan

lib/Treex/Tool/Parser/JDEPP.pm  view on Meta::CPAN

package Treex::Tool::Parser::JDEPP;
$Treex::Tool::Parser::JDEPP::VERSION = '0.13095';
use strict;
use warnings;

use Moose;
use Treex::Core::Common;
use Treex::Core::Config;
use Treex::Tool::ProcessUtils;
use Treex::Core::Resource;

# Model directory for the parser: a required, read-write string attribute.
# NOTE: BUILD currently does not pass this value on to the jdepp binary, so the
# parser runs with whatever model its own configuration selects.
has model_dir => (
    is       => 'rw',
    isa      => 'Str',
    required => 1,
);

# Moose construction hook: locates the jdepp executable in the Treex share
# and starts it as a child process connected through a bidirectional pipe.
#
# Side effects: stores the pipe handles and the child's process id directly
# in the object hash under the keys 'reader', 'writer' and 'pid';
# parse_sentence later reads 'reader' and 'writer' from there.
#
# Returns nothing.
sub BUILD {
    my $self = shift;

    # TODO: find an architecture-independent solution for locating the binary
    my $jdepp_binary = require_file_from_share(
        'installed_tools/parser/jdepp/bin/jdepp',
        ref($self),
    );

    # TODO: fix setting up of the model_dir via Treex (see W2A::JA::ParseJDEPP).
    # For now the only way to select a model is to configure JDEPP itself, so
    # $self->model_dir is deliberately not used here; in the worst case the
    # parser falls back to its default model.

    # Build the shell command; the parser's stderr output is discarded.
    my $command = join q{ }, $jdepp_binary, '2>/dev/null';

    # Start the JDEPP parser. bipipe is called in list context and its first
    # three return values are kept as reader, writer and pid, in that order.
    # NOTE(review): the ':encoding(utf-8)' layer is presumably applied to both
    # handles by bipipe -- confirm in Treex::Tool::ProcessUtils.
    @{$self}{qw(reader writer pid)}
        = Treex::Tool::ProcessUtils::bipipe( $command, ':encoding(utf-8)' );

    return;
}

sub parse_sentence {

    my ( $self, $forms_rf, $tags_rf ) = @_;

    if ( ref($forms_rf) ne "ARRAY" or ref($tags_rf) ne "ARRAY" ) {
        log_fatal('Both arguments must be array references.');
    }

    if ( $#{$forms_rf} != $#{$tags_rf} or @$forms_rf == 0 ) {
        log_warn "FORMS: @$forms_rf\n";
        log_warn "TAGS:  @$tags_rf\n";
        log_fatal('Both arguments must be references to nonempty arrays of equal length.');
    }

    if ( my @ret = grep { $_ =~ /^\s+$/ } ( @{$forms_rf}, @{$tags_rf} ) ) {
        log_debug("@ret");
        log_fatal('Elements of argument arrays must not be empty and must not contain white-space characters');
    }

    my @parents;
    my $input = "";
    my $writer = $self->{writer};
    my $reader = $self->{reader};

    foreach my $form ( @$forms_rf ) {
        my $tag = shift @$tags_rf;
        $tag =~ s{-}{,}g;
        $input .= $form . "\t" . $tag . "\n";
    }
    $input .= "EOS\n";  

    print $writer $input;

    my $line = <$reader>;
    
 
    # JDEPP uses a different token ordering than Treex because it groups multiple tokens into "bunsetsus" (parsing is done on these "bunsetsus").
    my @bun_heads;
    my $current_token = 1;



( run in 3.677 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )