Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN
package Alvis::NLPPlatform::UserNLPWrappers;
use Alvis::NLPPlatform::NLPWrappers;
use strict;
use warnings;
use Data::Dumper;
use UNIVERSAL qw(isa);
our @ISA = ("Alvis::NLPPlatform::NLPWrappers");
our $VERSION=$Alvis::NLPPlatform::VERSION;
# Tokenisation step of the platform.
#
# This wrapper adds no behaviour of its own: it removes the invocant from the
# argument list and forwards every remaining argument to the tokenize()
# implementation of the parent class (see @ISA).  The module's POD documents
# the arguments as ($h_config, $doc_hash).
#
# Returns whatever the parent implementation returns, evaluated in the
# caller's context.
sub tokenize {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::tokenize(@forwarded);
}
# Named-entity recognition and tagging step.
#
# Pure pass-through: the invocant is split off and all other arguments are
# handed, as copies, to the parent class's scan_ne().
#
# Returns the parent's result (the value of the delegated call).
sub scan_ne {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::scan_ne(@forwarded);
}
# Word segmentation step.
#
# Customisation hook that currently only delegates: every argument after the
# invocant is passed on to the parent class's word_segmentation().
#
# Returns the parent's result.
sub word_segmentation {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::word_segmentation(@forwarded);
}
# Sentence segmentation step.
#
# Customisation hook that currently only delegates: every argument after the
# invocant is passed on to the parent class's sentence_segmentation().
#
# Returns the parent's result.
sub sentence_segmentation {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::sentence_segmentation(@forwarded);
}
# Part-of-speech tagging step.
#
# Customisation hook that currently only delegates: every argument after the
# invocant is passed on to the parent class's pos_tag().
#
# Returns the parent's result.
sub pos_tag {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::pos_tag(@forwarded);
}
# Lemmatization step.
#
# Customisation hook that currently only delegates: every argument after the
# invocant is passed on to the parent class's lemmatization().
#
# Returns the parent's result.
sub lemmatization {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::lemmatization(@forwarded);
}
# Term tagging step.
#
# Customisation hook that currently only delegates: every argument after the
# invocant is passed on to the parent class's term_tag().
#
# Returns the parent's result.
#
# Disabled debugging / alternative calls kept from the original source:
#   &PrintOutputTreeTagger(@arg, \*STDOUT);
#   exit;
#   &execYaTeA(@arg);
#   exit;
sub term_tag {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::term_tag(@forwarded);
}
sub PrintOutputTreeTagger {
my ($h_config, $doc_hash, $output_stream) = @_;
my $line;
my $insentence;
my $sentence;
my $tokens;
my $analyses;
my $analysis;
my $nsentence;
my $token_start;
my $token_end;
my $relation;
my $left_wall;
my $right_wall;
my $relation_id;
my @arr_tokens;
my $last_token;
my $wordidshift=0;
my $phrase_idx=$Alvis::NLPPlatform::Annotation::phrase_idx;
print STDERR " Performing TreeTagger like Output\n";
my $word;
my $worddecal;
my $word_cont;
my $word_id;
my $i;
my $sentences_cont="";
my @tab_word_punct;
my @tab_word;
my $idx_tab_word_punct=1;
my $idx_tab_word=1;
my @tab_mapping;
# print out words+punct and fill in a tab
push @tab_word_punct," ";
push @tab_word," ";
my $decal=1;
my $searchterm;
my $sti;
my $word_np;
my @tab_tmp;
my $tmp_sp;
my $spi=0;
lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN
}
}
$relation_id++;
}
}
}
$Alvis::NLPPlatform::Annotation::syntactic_relation_idx = $relation_id;
print STDERR "done - Found ". ($relation_id - 1) ." syntactic relations\n";
push @{$doc_hash->{"log_processing1"}->{"comments"}}, "Found Syntactic Relations: " . ($relation_id - 1);
}
# Look up the value recorded for the first occurrence of a term candidate
# whose start offset falls inside a character window.
#
# Arguments:
#   $termKey    - key into the hash returned by $phrase_set->getTermCandidates
#   $phrase_set - object providing getTermCandidates()
#   $start_char - lower bound (inclusive) of the window
#   $end_char   - upper bound (inclusive) of the window
#   $ref_YateaTermOcc2AlvisSemUnits
#               - hash reference keyed by occurrence ID (getID)
#
# Returns the hash entry for the first occurrence, in the order given by
# getOccurrences, whose getStartChar lies in [$start_char, $end_char].  That
# entry is undef if the occurrence ID is not in the hash.  Returns -1 when no
# occurrence matches, or when $termKey names no term candidate (previously the
# latter case died on a method call on an undefined value).
sub getSem_unitFromTermOcc {
    my ($termKey, $phrase_set, $start_char, $end_char, $ref_YateaTermOcc2AlvisSemUnits) = @_;

    my $term_candidate = $phrase_set->getTermCandidates->{$termKey};

    # Unknown term key: report "not found" with the same sentinel as below.
    return -1 unless defined $term_candidate;

    for my $occurrence (@{ $term_candidate->getOccurrences }) {
        my $occurrence_start = $occurrence->getStartChar;

        # Skip occurrences that start outside the requested window.
        next if $occurrence_start < $start_char || $occurrence_start > $end_char;

        return $ref_YateaTermOcc2AlvisSemUnits->{ $occurrence->getID };
    }

    return -1;
}
# Placeholder for merging YaTeA results into the document annotations.
#
# Arguments: ($doc_hash, $yatea).
#
# The body is not implemented: it only unpacks its two arguments.  The
# comments below mark the two steps that were planned.  Because there is no
# explicit return, the sub yields the value of the argument-unpacking
# statement; callers should not rely on it.
sub mergeYaTeAResults
{
my ($doc_hash, $yatea) = @_;
# creation of the terms
# creation of the phrases
}
# Syntactic parsing step.
#
# Customisation hook that currently only delegates: every argument after the
# invocant is passed on to the parent class's syntactic_parsing().
#
# Returns the parent's result.
#
# Disabled alternative kept from the original source:
#   &bio_syntactic_parsing(@arg);
sub syntactic_parsing {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::syntactic_parsing(@forwarded);
}
# File-scoped lexical, initialised to 1.  parse_constituents() hands a
# reference to it to the constituent parser (YYData key WORD_ID_NP_REF), so
# the parser can read and update it.
# NOTE(review): presumably a running word-id counter that carries over
# between parse_constituents() calls -- confirm against ParseConstituents.
my $word_id_np=1;
sub parse_constituents {
my $constituents=$_[0];
my $tmpptr=$_[1];
my $decal_phrase_idx=$_[1];
my $doc_hash=$_[2];
my $lexer;
my @tab_type;
my @tab_string;
my $lconst = 0;
my $nconst = 0;
my $phrase_id = "";
my $csti;
my $phrase_idx_start = $Alvis::NLPPlatform::Annotation::phrase_idx;
require Alvis::NLPPlatform::ParseConstituents;
my $parser = Alvis::NLPPlatform::ParseConstituents->new();
# print STDERR $constituents;
$parser->YYData->{CONSTITUENT_STRING} = $constituents;
$parser->YYData->{DOC_HASH} = $doc_hash;
$parser->YYData->{DECAL_PHRASE_IDX} = $decal_phrase_idx;
$parser->YYData->{WORD_ID_NP_REF} = \$word_id_np;
$parser->YYData->{TAB_TYPE_REF} = \@tab_type;
$parser->YYData->{TAB_STRING_REF} = \@tab_string;
$parser->YYData->{LCONST_REF} = \$lconst;
$parser->YYData->{NCONST_REF} = \$nconst;
;
$parser->YYParse(yylex => \&Alvis::NLPPlatform::ParseConstituents::_Lexer, yyerror => \&Alvis::NLPPlatform::ParseConstituents::_Error);
for($csti=1;$csti<scalar @tab_type;$csti++){
$phrase_id = "phrase" . $Alvis::NLPPlatform::Annotation::phrase_idx;
$doc_hash->{$phrase_id}={};
$doc_hash->{$phrase_id}->{"id"}=$phrase_id;
$doc_hash->{$phrase_id}->{"datatype"}="phrase";
$doc_hash->{$phrase_id}->{"type"}=$tab_type[$csti];
$doc_hash->{$phrase_id}->{'list_refid_components'}={};
$doc_hash->{$phrase_id}->{'list_refid_components'}->{"datatype"}="list_refid_components";
if (scalar(@{$tab_string[$csti]}) == 1) {
$doc_hash->{$phrase_id}->{'list_refid_components'}->{"refid_word"}=$tab_string[$csti];
} else {
$doc_hash->{$phrase_id}->{'list_refid_components'}->{"refid_phrase"}=$tab_string[$csti];
}
$Alvis::NLPPlatform::Annotation::phrase_idx++;
}
print STDERR "done - Found ". ($Alvis::NLPPlatform::Annotation::phrase_idx - $phrase_idx_start) ." semantic units\n";
push @{$doc_hash->{"log_processing1"}->{"comments"}}, "Found Terms: " . ($Alvis::NLPPlatform::Annotation::phrase_idx - $phrase_idx_start);
# $word_count=$word_id_np-$word_count;
# print STDERR "\nWord count for this sentence: $word_count\n";
lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN
# print $line;
# $line_prec = $line;
};
if ((defined $line) && ($line =~ /^\+\+\+\+Time/o)) {
$linkage_counter = 0;
@linkage_output = ();
do {
#We remove the postscript output until we found constituent part
while((defined ($line = <INFILE>)) && ($line !~ /^\[/o)) {
# nothing
}
# we print the output until we find the next postscript part
$linkage_output[$linkage_counter] = $line;
# print $line;
while((defined ($line = <INFILE>)) && ($line ne "diagram\n")) {
# print STDERR "=> $line\n";
$linkage_output[$linkage_counter] .= $line;
# print $line;
}
# we remove the next postscript part
while ((defined ($line =<INFILE>)) && ($line ne "%%EndDocument\n")) {
# nothing
}
$line = <INFILE>;
$linkage_output[$linkage_counter] .= "\n";
$linkage_counter++;
# print "\n";
# Next Linkage ?
} while((defined ($line = <INFILE>)) && ($line =~ /^%!PS-Adobe/o));
# we print the constituent
print OUTFILE "[Sentence " . $sentence_counter . "]\n";
$sentence_counter++;
for($linkage_counter = 0; $linkage_counter < scalar(@linkage_output); $linkage_counter++) {
print OUTFILE "[Linkage " . $linkage_counter ."]\n";
print OUTFILE $linkage_output[$linkage_counter];
print OUTFILE "$line\n";
}
# we remove all it remains
while((defined ($line = <INFILE>)) && ($line ne "Press RETURN for the next linkage.\n")) {
#nothing
}
}
} while ($line = <INFILE>);
close INFILE;
close OUTFILE;
return 0;
}
# Semantic feature tagging step.
#
# Customisation hook that currently only delegates: every argument after the
# invocant is passed on to the parent class's semantic_feature_tagging().
#
# Returns the parent's result.
sub semantic_feature_tagging {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::semantic_feature_tagging(@forwarded);
}
# Semantic relation tagging step.
#
# Customisation hook that currently only delegates: every argument after the
# invocant is passed on to the parent class's semantic_relation_tagging().
#
# Returns the parent's result.
sub semantic_relation_tagging {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::semantic_relation_tagging(@forwarded);
}
# Anaphora resolution step.
#
# Customisation hook that currently only delegates: every argument after the
# invocant is passed on to the parent class's anaphora_resolution().
#
# Returns the parent's result.
sub anaphora_resolution {
    my ($invocant, @forwarded) = @_;

    return $invocant->SUPER::anaphora_resolution(@forwarded);
}
1;
__END__
=head1 NAME
Alvis::NLPPlatform::UserNLPWrappers - User interface for customizing
the NLP wrappers used to linguistically annotate XML documents
in Alvis
=head1 SYNOPSIS
use Alvis::NLPPlatform::UserNLPWrappers;
Alvis::NLPPlatform::UserNLPWrappers->tokenize($h_config,$doc_hash);
=head1 DESCRIPTION
This module is a mere interface for allowing the customisation of the
NLP Wrappers. Anyone who wants to integrate a new NLP tool has to
overwrite the default wrapper. The aim of this module is to simplify
the development of a specific wrapper, its integration and its use in the
platform.
Before developing a new wrapper, it is necessary to copy and modify
this file in a local directory and add this directory to the PERL5LIB
variable.
=head1 METHODS
=head2 tokenize()
tokenize($h_config, $doc_hash);
This method carries out the tokenisation process of the input
document. C<$doc_hash> is the hashtable containing all the
annotations of the input document. See documentation in
C<Alvis::NLPPlatform::NLPWrappers>. It is not recommended to
overwrite this method.
C<$h_config> is the
reference to the hashtable containing the variables defined in the
configuration file.
The method returns the number of tokens.
=head2 scan_ne()
scan_ne($h_config, $doc_hash);
This method wraps the Named entity recognition and tagging
( run in 1.164 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )