Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform/UserNLPWrappers.pm  view on Meta::CPAN

package Alvis::NLPPlatform::UserNLPWrappers;


use Alvis::NLPPlatform::NLPWrappers;

use strict;
use warnings;

use Data::Dumper;

# NOTE(review): importing functions from UNIVERSAL was deprecated in Perl 5.12
# and is a fatal error from Perl 5.22 on ("UNIVERSAL does not export
# anything"). Confirm whether isa() is still called as a plain function
# anywhere in this file before changing this line.
use UNIVERSAL qw(isa);

# Inherit the default wrappers; the subs in this file reach them via SUPER::.
our @ISA = ("Alvis::NLPPlatform::NLPWrappers");


# Reuse the version number declared by the Alvis::NLPPlatform package.
our $VERSION=$Alvis::NLPPlatform::VERSION;


# Customisation hook for the tokenisation step.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the tokenize implementation
# inherited through @ISA.
#
# Returns whatever the inherited tokenize returns.
sub tokenize {
    my ($class, @arg) = @_;

    return $class->SUPER::tokenize(@arg);
}



# Customisation hook wrapping named entity recognition and tagging.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the scan_ne implementation
# inherited through @ISA.
#
# Returns whatever the inherited scan_ne returns.
sub scan_ne
{
    my ($class, @arg) = @_;

    return $class->SUPER::scan_ne(@arg);
}

# Customisation hook for the word segmentation step.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the word_segmentation
# implementation inherited through @ISA.
#
# Returns whatever the inherited word_segmentation returns.
sub word_segmentation
{
    my ($class, @arg) = @_;

    return $class->SUPER::word_segmentation(@arg);
}

# Customisation hook for the sentence segmentation step.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the sentence_segmentation
# implementation inherited through @ISA.
#
# Returns whatever the inherited sentence_segmentation returns.
sub sentence_segmentation
{
    my ($class, @arg) = @_;

    return $class->SUPER::sentence_segmentation(@arg);
}


# Customisation hook for the pos_tag step.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the pos_tag implementation
# inherited through @ISA.
#
# Returns whatever the inherited pos_tag returns.
sub pos_tag
{
    my ($class, @arg) = @_;

    return $class->SUPER::pos_tag(@arg);
}


# Customisation hook for the lemmatization step.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the lemmatization implementation
# inherited through @ISA.
#
# Returns whatever the inherited lemmatization returns.
sub lemmatization
{
    my ($class, @arg) = @_;

    return $class->SUPER::lemmatization(@arg);
}


# Customisation hook for the term_tag step.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the term_tag implementation
# inherited through @ISA.
#
# Returns whatever the inherited term_tag returns.
sub term_tag
{
    my ($class, @arg) = @_;

    # Disabled debugging alternatives, kept for reference:
    #     &PrintOutputTreeTagger(@arg, \*STDOUT);
    #     exit;
    #     &execYaTeA(@arg);
    #     exit;
    return $class->SUPER::term_tag(@arg);
}

sub PrintOutputTreeTagger {
    my ($h_config, $doc_hash, $output_stream) = @_;

    my $line;
    my $insentence;
    my $sentence;

    my $tokens;
    my $analyses;
    my $analysis;
    my $nsentence;
    my $token_start;
    my $token_end;
    my $relation;
    my $left_wall;
    my $right_wall;

    my $relation_id;

    my @arr_tokens;
    my $last_token;
    my $wordidshift=0;

    my $phrase_idx=$Alvis::NLPPlatform::Annotation::phrase_idx;

    print STDERR "  Performing TreeTagger like Output\n";

    my $word;
    my $worddecal;
    my $word_cont;
    my $word_id;
    my $i;
    my $sentences_cont="";

    my @tab_word_punct;
    my @tab_word;
    my $idx_tab_word_punct=1;
    my $idx_tab_word=1;
    my @tab_mapping;

    # print out words+punct and fill in a tab
    push @tab_word_punct," ";
    push @tab_word," ";

    my $decal=1;

    my $searchterm;
    my $sti;
    my $word_np;
    
    my @tab_tmp;
    my $tmp_sp;
    my $spi=0;

lib/Alvis/NLPPlatform/UserNLPWrappers.pm  view on Meta::CPAN

		    }
		}
		$relation_id++;
	    }
	}
    }
    $Alvis::NLPPlatform::Annotation::syntactic_relation_idx = $relation_id;

    print STDERR "done - Found ". ($relation_id - 1) ." syntactic relations\n";
    push @{$doc_hash->{"log_processing1"}->{"comments"}},  "Found Syntactic Relations: " . ($relation_id - 1);

    
}


# Find the entry of $ref_YateaTermOcc2AlvisSemUnits that corresponds to an
# occurrence of a term candidate located inside a character window.
#
# Arguments:
#   $termKey     - key of the term candidate in the hash reference returned
#                  by $phrase_set->getTermCandidates
#   $phrase_set  - object providing getTermCandidates
#   $start_char  - lower bound (inclusive) of the window
#   $end_char    - upper bound (inclusive) of the window
#   $ref_YateaTermOcc2AlvisSemUnits
#                - hash reference indexed by occurrence ID (getID)
#
# Returns the hash value stored for the first occurrence whose start
# character (getStartChar) lies within [$start_char, $end_char], or -1 when
# there is no such occurrence. An unknown $termKey, or a term candidate
# without an occurrence list, is also reported as -1 instead of dying on a
# method call / dereference of an undefined value.
sub getSem_unitFromTermOcc
{
    my ($termKey, $phrase_set, $start_char, $end_char, $ref_YateaTermOcc2AlvisSemUnits) = @_;

    my $term_candidate = $phrase_set->getTermCandidates->{$termKey};

    # Unknown term: nothing can match, use the regular "not found" value.
    return(-1) unless defined $term_candidate;

    my $occurrences = $term_candidate->getOccurrences;
    return(-1) unless defined $occurrences;

    foreach my $occurrence (@{$occurrences}) {
        my $occurrence_start = $occurrence->getStartChar;

        # Keep the first occurrence starting inside the window.
        if (($start_char <= $occurrence_start) && ($occurrence_start <= $end_char)) {
            return($ref_YateaTermOcc2AlvisSemUnits->{$occurrence->getID});
        }
    }

    return(-1);
}

# Placeholder for merging YaTeA results into the document annotations.
# Only the argument unpacking exists so far: neither the creation of the
# terms nor the creation of the phrases is implemented.
#
# Arguments (unpacked but not used here):
#   $doc_hash - presumably the hash holding the document annotations
#               -- TODO confirm against the caller
#   $yatea    - presumably the YaTeA result object -- TODO confirm
#
# NOTE(review): there is no explicit return, so the value handed back is
# that of the list assignment below; callers should not rely on it.
sub mergeYaTeAResults
{
    my ($doc_hash, $yatea) = @_;

    # creation of the terms

    # creation of the phrases

}

# Customisation hook for the syntactic_parsing step.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the syntactic_parsing implementation
# inherited through @ISA.
#
# Returns whatever the inherited syntactic_parsing returns.
sub syntactic_parsing
{
    my ($class, @arg) = @_;

    # Disabled alternative, kept for reference:
    #     &bio_syntactic_parsing(@arg);
    return $class->SUPER::syntactic_parsing(@arg);
}

# File-scoped counter, initialised to 1. parse_constituents hands a reference
# to it to the constituent parser (WORD_ID_NP_REF); as it lives outside the
# sub, the same variable is shared by every call.
my $word_id_np=1;

sub parse_constituents {
    my $constituents=$_[0];
    my $tmpptr=$_[1];
    my $decal_phrase_idx=$_[1];
    my $doc_hash=$_[2];
    my $lexer;
    my @tab_type;
    my @tab_string;
    my $lconst = 0;
    my $nconst = 0;
    my $phrase_id = "";
    my $csti;
    my $phrase_idx_start = $Alvis::NLPPlatform::Annotation::phrase_idx;
    require Alvis::NLPPlatform::ParseConstituents;


    my $parser = Alvis::NLPPlatform::ParseConstituents->new();

#     print STDERR $constituents;

    $parser->YYData->{CONSTITUENT_STRING} = $constituents;
    $parser->YYData->{DOC_HASH} = $doc_hash;
    $parser->YYData->{DECAL_PHRASE_IDX} = $decal_phrase_idx;
    $parser->YYData->{WORD_ID_NP_REF} =  \$word_id_np;
    $parser->YYData->{TAB_TYPE_REF} =  \@tab_type;
    $parser->YYData->{TAB_STRING_REF} =  \@tab_string;

    $parser->YYData->{LCONST_REF} =  \$lconst;
    $parser->YYData->{NCONST_REF} =  \$nconst;
;
    $parser->YYParse(yylex => \&Alvis::NLPPlatform::ParseConstituents::_Lexer, yyerror => \&Alvis::NLPPlatform::ParseConstituents::_Error);



    for($csti=1;$csti<scalar @tab_type;$csti++){
	$phrase_id = "phrase" . $Alvis::NLPPlatform::Annotation::phrase_idx;
	$doc_hash->{$phrase_id}={};
	$doc_hash->{$phrase_id}->{"id"}=$phrase_id;
	$doc_hash->{$phrase_id}->{"datatype"}="phrase";
	$doc_hash->{$phrase_id}->{"type"}=$tab_type[$csti];
	$doc_hash->{$phrase_id}->{'list_refid_components'}={};
	$doc_hash->{$phrase_id}->{'list_refid_components'}->{"datatype"}="list_refid_components";
	if (scalar(@{$tab_string[$csti]}) == 1) {
	    $doc_hash->{$phrase_id}->{'list_refid_components'}->{"refid_word"}=$tab_string[$csti];
	} else {
	    $doc_hash->{$phrase_id}->{'list_refid_components'}->{"refid_phrase"}=$tab_string[$csti];
	}
	$Alvis::NLPPlatform::Annotation::phrase_idx++;
   }
    
    print STDERR "done - Found ". ($Alvis::NLPPlatform::Annotation::phrase_idx - $phrase_idx_start) ." semantic units\n";
    push @{$doc_hash->{"log_processing1"}->{"comments"}},  "Found Terms: " . ($Alvis::NLPPlatform::Annotation::phrase_idx - $phrase_idx_start);

#    $word_count=$word_id_np-$word_count;
#    print STDERR "\nWord count for this sentence: $word_count\n";

lib/Alvis/NLPPlatform/UserNLPWrappers.pm  view on Meta::CPAN

# 	print $line;
# 	    $line_prec = $line;
	};

	if ((defined $line) && ($line =~ /^\+\+\+\+Time/o)) {
	    $linkage_counter = 0;
	    @linkage_output = ();
	    do {
		#We remove the postscript output until we found constituent part
		while((defined ($line = <INFILE>)) && ($line !~ /^\[/o)) {
		    # nothing 
		}
		# we print the output until we find the next postscript part 
		$linkage_output[$linkage_counter] = $line;
# 	    print $line;
		while((defined ($line = <INFILE>)) && ($line ne "diagram\n")) {
# 		    print STDERR "=> $line\n";
		    $linkage_output[$linkage_counter] .= $line;
# 		print $line;
		}
		# we remove the next postscript part 
		while ((defined ($line =<INFILE>)) && ($line ne "%%EndDocument\n")) {
		    # nothing 
		}
		$line = <INFILE>;
		$linkage_output[$linkage_counter] .= "\n";
		$linkage_counter++;
# 	    print "\n";
		# Next Linkage ?
	    } while((defined ($line = <INFILE>)) && ($line =~ /^%!PS-Adobe/o));
	    # we print the constituent
	    print OUTFILE "[Sentence " . $sentence_counter . "]\n";
	    $sentence_counter++;
	    for($linkage_counter = 0; $linkage_counter < scalar(@linkage_output); $linkage_counter++) {
		print OUTFILE "[Linkage " . $linkage_counter ."]\n";
		print OUTFILE $linkage_output[$linkage_counter];
		print OUTFILE "$line\n";
		
	    }
	    # we remove all it remains
	    while((defined ($line = <INFILE>)) && ($line ne "Press RETURN for the next linkage.\n")) {
		#nothing
	    }
	}

    } while ($line = <INFILE>);

    close INFILE;
    close OUTFILE;

    return 0;

}

# Customisation hook for the semantic_feature_tagging step.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the semantic_feature_tagging
# implementation inherited through @ISA.
#
# Returns whatever the inherited semantic_feature_tagging returns.
sub semantic_feature_tagging
{
    my ($class, @arg) = @_;

    return $class->SUPER::semantic_feature_tagging(@arg);
}

# Customisation hook for the semantic_relation_tagging step.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the semantic_relation_tagging
# implementation inherited through @ISA.
#
# Returns whatever the inherited semantic_relation_tagging returns.
sub semantic_relation_tagging
{
    my ($class, @arg) = @_;

    return $class->SUPER::semantic_relation_tagging(@arg);
}


# Customisation hook for the anaphora_resolution step.
#
# Called as a method: the invocant is split off and all remaining
# arguments are handed, unchanged, to the anaphora_resolution
# implementation inherited through @ISA.
#
# Returns whatever the inherited anaphora_resolution returns.
sub anaphora_resolution
{
    my ($class, @arg) = @_;

    return $class->SUPER::anaphora_resolution(@arg);
}



1;

__END__

=head1 NAME

Alvis::NLPPlatform::UserNLPWrappers - User interface for customizing
the NLP wrappers used for linguistically annotating XML documents
in Alvis

=head1 SYNOPSIS

use Alvis::NLPPlatform::UserNLPWrappers;

Alvis::NLPPlatform::UserNLPWrappers->tokenize($h_config,$doc_hash);

=head1 DESCRIPTION

This module is a mere interface allowing the customisation of the
NLP wrappers. Anyone who wants to integrate a new NLP tool has to
override the default wrapper. The aim of this module is to simplify
the development of a specific wrapper, its integration and its use in
the platform.


Before developing a new wrapper, it is necessary to copy and modify
this file in a local directory and add this directory to the PERL5LIB
variable.

=head1 METHODS


=head2 tokenize()

    tokenize($h_config, $doc_hash);

This method carries out the tokenisation process of the input
document. C<$doc_hash> is the hashtable containing all the
annotations of the input document. See documentation in
C<Alvis::NLPPlatform::NLPWrappers>.  It is not recommended to
override this method.

C<$h_config> is the
reference to the hashtable containing the variables defined in the
configuration file.

The method returns the number of tokens.



=head2 scan_ne()

    scan_ne($h_config, $doc_hash);

This method wraps the Named entity recognition and tagging



( run in 1.164 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )