Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN
return($Alvis::NLPPlatform::Annotation::nb_max_tokens);
}
# scan_ne: wrapper for named-entity recognition (the external TagEN
# tagger).  Concatenates the document's tokens into a temporary corpus
# file, runs the tagger configured in NLP_tools/NETAG_{FR,EN}, reads
# back the "type<TAB>start<TAB>end" lines it emits, and matches each
# named entity against the token sequence, recording results in
# $doc_hash and in the package-level entity tables.
# NOTE(review): part of this sub is not visible in this view (code is
# elided at the "view on Meta::CPAN" marker below); comments describe
# only the visible code.
sub scan_ne
{
my ($class, $h_config, $doc_hash) = @_;
my $corpus;
my $token;
my $line;
my $id;
my $tok_ct;
my @tab_tokens; # experimental
my $t; # experimental
my $NE_type;
my $NE_start;
my $NE_end;
my $offset=0;
my $i;
my $en=0;
my $j;
my $start;
my $end;
my $ref_tab;
my $refid_n;
my $en_cont;
my $number_of_tokens;
my $last_en;
my $corpus_filename;
my $result_filename;
print STDERR " Named entites tagging... ";
# Build the corpus string: tokens in document order, XML entities
# decoded so the tagger sees plain text.
$corpus="";
foreach $token(Alvis::NLPPlatform::Annotation::sort(\%Alvis::NLPPlatform::hash_tokens)){
$tok_ct=$Alvis::NLPPlatform::hash_tokens{$token}; # why not $token ? (TH)
Alvis::NLPPlatform::XMLEntities::decode($tok_ct);
# (TH) those replacements are required to workaround a bug in
# tagen (Named entity following a \n is not analyse - because
# n is concatenate with the next word)
$tok_ct=~s/\\n/\\n /go;
$tok_ct=~s/\\r/\\r /go;
$tok_ct=~s/\\t/\\t /go;
$corpus.=$tok_ct;
push @tab_tokens,$tok_ct;
}
# Write the corpus as UTF-8 bytes for the external tagger.
$corpus_filename = $h_config->{'TMPFILE'} . ".corpus_en.txt";
open CORPUS,">$corpus_filename";
# binmode(CORPUS,":utf8");
print CORPUS Encode::encode_utf8($corpus);
close CORPUS;
print STDERR "done\n";
# Select the language-specific tagger command; its stderr is appended
# to the platform log file.
my $command_line;
if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
$command_line = $h_config->{'NLP_tools'}->{'NETAG_FR'} . " $corpus_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
} else {
$command_line = $h_config->{'NLP_tools'}->{'NETAG_EN'} . " $corpus_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
}
# nice idea, though TagEN seems to return 0 anyhow...
#`$command_line` && print STDERR "FAILED TO EXECUTE \"$command_line\": &!\n";
`$command_line`;
# Temporary files are kept only in debug mode.
$Alvis::NLPPlatform::ALVISDEBUG || unlink $corpus_filename;
# Reset the package-level entity offset/type tables before reloading.
@Alvis::NLPPlatform::en_start=();
@Alvis::NLPPlatform::en_end=();
@Alvis::NLPPlatform::en_type=();
$result_filename = $h_config->{'TMPFILE'} . ".corpus_en.tag.txt";
open REN,"<$result_filename" or warn "Can't open the file $result_filename";
binmode REN;
# Each output line is "type<TAB>start_offset<TAB>end_offset".
while($line=<REN>){
($NE_type, $NE_start, $NE_end) = split /\t/, $line;
# $line=~m/(.+)\s+([0-9]+)\s+([0-9]+)/;
# $NE_type = $1;
# $NE_start = $2;
# $NE_end = $3;
push @Alvis::NLPPlatform::en_type,$NE_type;
# When whitespace is preserved in the XML input, the tagger offsets
# are shifted by one; compensate here.
if ((exists($h_config->{'XML_INPUT'}->{"PRESERVEWHITESPACE"})) && ($h_config->{'XML_INPUT'}->{"PRESERVEWHITESPACE"})) {
push @Alvis::NLPPlatform::en_start,($NE_start-1);
push @Alvis::NLPPlatform::en_end,($NE_end-1);
} else {
push @Alvis::NLPPlatform::en_start,$NE_start;
push @Alvis::NLPPlatform::en_end,$NE_end;
}
}
close REN;
$Alvis::NLPPlatform::ALVISDEBUG || unlink $result_filename;
# print STDERR scalar(@Alvis::NLPPlatform::en_type) . " to find\n";
print STDERR " Matching EN with tokens... ";
# scan tokens and match with NE
@Alvis::NLPPlatform::en_tokens_start=();
@Alvis::NLPPlatform::en_tokens_end=();
%Alvis::NLPPlatform::en_tokens_hash=();
$number_of_tokens=scalar @tab_tokens;
# Entity ids continue the platform-wide semantic-unit numbering.
$en=$Alvis::NLPPlatform::last_semantic_unit+1;
$last_en=0;
my $en_str = "";
# Walk tokens and entity offsets in parallel; $last_en avoids
# rescanning entities that were already matched.
for($t=0;$t<$number_of_tokens;$t++){
print STDERR "\r Matching EN with tokens... ".($t+1)."/".$number_of_tokens." ";
for($i=$last_en;$i<scalar @Alvis::NLPPlatform::en_start;$i++){
# print STDERR "\ti = $i :: last_en = $last_en\n";
lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN
# NOTE(review): code is elided above (Meta::CPAN view); $en_str,
# $ref_tab, $start and $end are set in the missing lines.
$ref_tab->{'datatype'}="list_refid_token";
$en_cont="";
$refid_n=1;
my @tab_tokens_en;
$ref_tab->{"refid_token"}=\@tab_tokens_en;
# Collect the token ids spanned by the entity and rebuild its
# surface form from the token table.
for($j=$start;$j<=$end;$j++){
push @tab_tokens_en, "token$j";
$refid_n++;
$en_cont.=$Alvis::NLPPlatform::hash_tokens{"token$j"};
}
$doc_hash->{$en_str}->{"named_entity"}->{"form"}=$en_cont;
$Alvis::NLPPlatform::hash_named_entities{$en_str}=$en_cont;
$en++;
last; # go out the Named Entity hash table scan
}
}
$offset+=length($tab_tokens[$t]);
}
$Alvis::NLPPlatform::last_semantic_unit=$en ;
print STDERR "done - Found ". ($Alvis::NLPPlatform::last_semantic_unit - 1) ." named entities\n";
push @{$doc_hash->{"log_processing1"}->{"comments"}}, "Found Named Entities : " . ($Alvis::NLPPlatform::last_semantic_unit - 1);
}
# word_segmentation: wrapper for the external word segmenter
# (NLP_tools/WORDSEG_{FR,EN}).  Writes the decoded token stream to a
# Latin-1 temporary file, runs the segmenter over it, then reads the
# proposed words back and rebuilds each word from consecutive tokens,
# filling $doc_hash with word elements and the package-level
# word_start/word_end tables.
# NOTE(review): the end of this sub is not visible in this view (cut
# at the "view on Meta::CPAN" marker below); comments describe only
# the visible code.
sub word_segmentation
{
my ($class, $h_config, $doc_hash) = @_;
my $token;
my $id;
my $nb_doc;
my $command_line;
my $proposedword;
my $current_word = "";
my $token_id;
my $word_id;
my $ref_tab;
my $elision;
my $i;
my $is_en;
my $en_id;
my $token_end;
my $token_start;
my $append;
my $refid_n;
my $token_tmp;
my $corpus_filename;
my $result_filename;
my $token_id_str;
my $word_id_str;
####
print STDERR " Word segmentation... ";
my $content;
# open CORPUS,">:utf8",$h_config->{'TMPFILE'} . ".corpus.tmp";
$corpus_filename = $h_config->{'TMPFILE'} . ".corpus_word.tmp";
$result_filename = $h_config->{'TMPFILE'} . ".words.tmp";
open CORPUS,">$corpus_filename";
# binmode(CORPUS);
# binmode(CORPUS, ":utf8");
# Dump tokens in document order: unescape whitespace markers, decode
# XML entities, and transcode to Latin-1 for the external tool.
foreach $token(Alvis::NLPPlatform::Annotation::sort(\%Alvis::NLPPlatform::hash_tokens)){
$content=$Alvis::NLPPlatform::hash_tokens{$token};
$content=~s/\\n/\n/og;
$content=~s/\\t/\t/og;
$content=~s/\\r/\r/og;
#Encode::decode_utf8("Å")
# $content =~ s/\x{65}/oe/g;
Alvis::NLPPlatform::XMLEntities::decode($content);
# Encode::from_to($content, "utf8", "iso-8859-1");
print CORPUS Encode::encode("iso-8859-1", $content, Encode::FB_DEFAULT);
# print CORPUS $content;
}
close CORPUS;
# Language-specific segmenter command; stderr goes to the log file.
if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
$command_line = $h_config->{"NLP_tools"}->{'WORDSEG_FR'} . " < $corpus_filename > $result_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
}else{
$command_line = $h_config->{"NLP_tools"}->{'WORDSEG_EN'} . " < $corpus_filename > $result_filename 2>> ". $Alvis::NLPPlatform::ALVISLOGFILE;
}
`$command_line`;
open(MOTS, $result_filename) or warn "Can't open the file $result_filename";;
# binmode(MOTS,":utf8");
binmode(MOTS);
$token_id=1;
$word_id=1;
$token_id_str = "token$token_id";
# One segmenter output line == one proposed word.
while($proposedword=<MOTS>)
{
# $proposedword = Encode::encode_utf8($proposedword);
$word_id_str = "word$word_id";
# Skip lines that are only whitespace (incl. non-breaking space).
# if ($proposedword !~ /^[\s ]*\n$/o) {
if ($proposedword !~ /^[\s\x{A0}]*\n$/o) {
chomp $proposedword;
# print STDERR $proposedword;
$current_word="";
# Create the word element and its token reference list.
$doc_hash->{$word_id_str}={};
$doc_hash->{$word_id_str}->{'id'}=$word_id_str;
$doc_hash->{$word_id_str}->{'datatype'}='word';
$ref_tab=$doc_hash->{$word_id_str}->{'list_refid_token'}={};
$ref_tab->{'datatype'}="list_refid_token";
my @tab_tokens;
$refid_n=1;
$ref_tab->{"refid_token"}=\@tab_tokens;
$is_en=0;
# Consume tokens until their concatenation covers the proposed word.
while(length($current_word)<length($proposedword)){
# Hard token limit: give up annotating this document entirely.
if($token_id>$Alvis::NLPPlatform::Annotation::nb_max_tokens){
$Alvis::NLPPlatform::dont_annotate=1;
return;
}
if($doc_hash->{$token_id_str}->{'type'} ne "sep"){
# Remember whether this token belongs to a named entity.
if(exists $Alvis::NLPPlatform::en_tokens_hash{$token_id}){
$en_id=$Alvis::NLPPlatform::en_tokens_hash{$token_id};
$is_en=1;
}
$token_tmp=$Alvis::NLPPlatform::hash_tokens{$token_id_str};
################################
$token_tmp=~s/\\n/\n/og;
$token_tmp=~s/\\t/\t/og;
$token_tmp=~s/\\r/\r/og;
Alvis::NLPPlatform::XMLEntities::decode($token_tmp);
################################
$token_tmp=~s/\s+/ /og;
$current_word=$current_word.$token_tmp;
push @tab_tokens, $token_id_str;
# First non-separator token marks the word start.
if($refid_n==1){
$Alvis::NLPPlatform::word_start[$word_id]=$token_id;
}
$Alvis::NLPPlatform::word_end[$word_id]=$token_id;
$refid_n++;
}
$token_id++;
$token_id_str = "token$token_id";
}
#### is the rebuilt word a named entity ? is it fully built
my $append;
if($is_en){
# Extract numeric token ids bounding the named entity.
$Alvis::NLPPlatform::en_tokens_start[$en_id] =~ m/^token([0-9]+)/io;
$token_start=$1;
$Alvis::NLPPlatform::en_tokens_end[$en_id] =~ m/^token([0-9]+)/io;
$token_end=$1;
# Extend the word with any remaining entity tokens (the rest of
# this loop is elided at the Meta::CPAN marker below).
while($token_end>($token_id-1)){
$token_tmp=$Alvis::NLPPlatform::hash_tokens{$token_id_str};
################################
lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN
# NOTE(review): fragment — the beginning of the enclosing sentence
# segmentation sub is not visible in this view; $sentence_id_str,
# $sentence_cont, $start_token, $btw_end, $sentence and $sentence_id
# are declared/set in the elided code above.
# Record the current sentence element in $doc_hash: its id, its
# start/end token references, and its unescaped surface form.
$doc_hash->{$sentence_id_str}->{'id'}=$sentence_id_str;
$doc_hash->{$sentence_id_str}->{'datatype'}='sentence';
$doc_hash->{$sentence_id_str}->{'refid_start_token'}="token$start_token";
$doc_hash->{$sentence_id_str}->{'refid_end_token'}="token$btw_end";
$sentence_cont =~s /\\n/\n/go;
$sentence_cont =~s /\\r/\r/go;
$sentence_cont =~s /\\t/\t/go;
$doc_hash->{$sentence_id_str}->{'form'}=$sentence_cont;
$sentence_cont="";
}
# Mirror every sentence form into the package-level sentence table.
foreach $sentence(keys %$doc_hash){
if($sentence=~/^(sentence[0-9]+)/o){
$Alvis::NLPPlatform::hash_sentences{$1}=$doc_hash->{$1}->{'form'};
}
}
$Alvis::NLPPlatform::number_of_sentences=$sentence_id-1;
print STDERR "done - Found ".$Alvis::NLPPlatform::number_of_sentences." sentences\n";
push @{$doc_hash->{"log_processing1"}->{"comments"}}, "Found Sentences: $Alvis::NLPPlatform::number_of_sentences";
}
# pos_tag: wrapper around the external Part-Of-Speech tagger
# (NLP_tools/POSTAG_{FR,EN}; output format is one token per line,
# "inflected<TAB>tag<TAB>lemma", TreeTagger-style).
#
# The word/punctuation sequence in %hash_words_punct is written to a
# Latin-1 temporary file, the tagger is run on it, and its output is
# read back.  For each line that corresponds to a real word (present
# in %hash_words), a morphosyntactic_features element and a lemma
# element are added to $doc_hash, and the package-level
# %hash_postags / %hash_lemmas tables are updated.
#
# Arguments: $class (unused), $h_config (platform configuration),
# $doc_hash (annotation structure being filled).  No meaningful
# return value.
sub pos_tag
{
my ($class, $h_config, $doc_hash) = @_;
my $word;
my $cont;
my $line;
my $word_id = 0;
my $tag;
my $lemma;
my $inflected;
my $corpus_filename;
my $result_filename;
my $morphosyntactic_features_id;
my $lemma_id;
my $word_id_str;
my $word_punct_id_str;
$corpus_filename = $h_config->{'TMPFILE'} . ".corpus_pos.tmp";
$result_filename = $h_config->{'TMPFILE'} . ".tags.tmp";
print STDERR " Part-Of-Speech tagging..";
# Three-argument open with an error check (the previous two-argument
# unchecked form silently ignored failures).
open CORPUS, '>', $corpus_filename or warn "Can't open the file $corpus_filename";
# TH - 16/07/2007 - replacement of hash_words by hash_words_punct
my $fullcontent = "";
foreach $word (Alvis::NLPPlatform::Annotation::sort(\%Alvis::NLPPlatform::hash_words_punct)){
$cont=$Alvis::NLPPlatform::hash_words_punct{$word};
# The external tagger expects Latin-1 input; unmappable characters
# are replaced with Encode's default substitution character.
$fullcontent .= Encode::encode("iso-8859-1", $cont, Encode::FB_DEFAULT);
$fullcontent .= "\n";
}
print CORPUS $fullcontent;
close CORPUS;
# Language-specific tagger command; stderr goes to the log file.
my $command_line;
if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
$command_line = $h_config->{'NLP_tools'}->{'POSTAG_FR'} . " < $corpus_filename > $result_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
}else{
$command_line = $h_config->{'NLP_tools'}->{'POSTAG_EN'} . " < $corpus_filename > $result_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
}
`$command_line`;
open TAGS, '<', $result_filename or warn "Can't open the file $result_filename";
binmode(TAGS);
# $decal compensates for punctuation entries that exist in
# %hash_words_punct but not in %hash_words: real word ids must only
# count actual words.
my $decal = 0;
my $word_punct_id = 1;
$word_punct_id_str = "word$word_punct_id";
while ($line = <TAGS>) {
# Tagger output is Latin-9; decode back to Perl strings.
$line = Encode::decode("latin9",$line);
chomp $line;
($inflected, $tag, $lemma) = split /\t/, $line;
$word_id = $word_punct_id + $decal;
$word_id_str = "word$word_id";
if ((!exists $Alvis::NLPPlatform::hash_words{$word_id_str}) || ($Alvis::NLPPlatform::hash_words_punct{$word_punct_id_str} ne $Alvis::NLPPlatform::hash_words{$word_id_str})) {
# it is not a word
# punctuation, delay incrementation of index "decal"
$decal--;
} else {
#######################################################
# Correct outputs from treetagger
# No tag at all: fall back to proper noun, lemma = inflected form.
if (!defined $tag) { $tag = "NP"; $lemma = $inflected;}
# Tag not in the configured tag list for the current language:
# force NP, unless the tagger merely echoed the token itself.
if ((($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR") && (!exists($h_config->{"NLP_misc"}->{"POSTAG_LIST"}->{"FR"}->{$tag})))||
(($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE ne "FR") && (!exists($h_config->{"NLP_misc"}->{"POSTAG_LIST"}->{"EN"}->{$tag})))){
if ($inflected ne $tag){ # ???
$tag="NP";
}
}
# in case of named entities, we remove '_' in lemma and inflected form
# and we force the POS tag to be NP
if ((index($lemma,"_")>-1)&&(index($inflected,"_")==-1)){
$lemma=~s/\_/ /og;
$tag="NP";
}
# in case of number, lemma is the same as the inflected form
if ($lemma eq '@card@'){
$lemma=$inflected;
}
#######################################################
# POS tag element for this word.
$morphosyntactic_features_id = "morphosyntactic_features$word_id";
$doc_hash->{$morphosyntactic_features_id}={};
$doc_hash->{$morphosyntactic_features_id}->{'id'}=$morphosyntactic_features_id;
$doc_hash->{$morphosyntactic_features_id}->{'datatype'}="morphosyntactic_features";
$doc_hash->{$morphosyntactic_features_id}->{'refid_word'}=$word_id_str;
$doc_hash->{$morphosyntactic_features_id}->{'syntactic_category'}="$tag";
$Alvis::NLPPlatform::hash_postags{$word_id_str}=$tag;
# Lemma element for this word.
$lemma_id = "lemma$word_id";
$doc_hash->{$lemma_id}={};
$doc_hash->{$lemma_id}->{'id'}=$lemma_id;
$doc_hash->{$lemma_id}->{'datatype'}="lemma";
$doc_hash->{$lemma_id}->{'refid_word'}=$word_id_str;
$doc_hash->{$lemma_id}->{'canonical_form'}="$lemma";
$Alvis::NLPPlatform::hash_lemmas{$word_id_str}=$lemma;
}
$word_punct_id++;
$word_punct_id_str = "word$word_punct_id";
}
close TAGS;
# Keep the temporary files only when debugging.
$Alvis::NLPPlatform::ALVISDEBUG || unlink $corpus_filename;
$Alvis::NLPPlatform::ALVISDEBUG || unlink $result_filename;
print STDERR "done - Found " . $word_id ." tags.\n";
push @{$doc_hash->{"log_processing1"}->{"comments"}}, "Found POS Tags: " . $word_id ;
}
# sub pos_tag # WRAPPER FOR BRILL
# {
# my $word;
# my $cont;
# print STDERR " Part-Of-Speech tagging...";
# open CORPUS,">$TMPFILE.corpus.tmp";
# binmode(CORPUS,":utf8");
# foreach $word(sort Alvis::NLPPlatform::Annotation::sort_keys keys %Alvis::NLPPlatform::hash_words){
# $cont=$Alvis::NLPPlatform::hash_words{$word};
# print CORPUS "$cont ";
# if($cont eq "."){
# print CORPUS "\n";
# }
# }
# close CORPUS;
# }
# lemmatization: intentionally a no-op.  Lemmas are already produced
# during the POS-tagging step (pos_tag stores them in
# %Alvis::NLPPlatform::hash_lemmas and in $doc_hash), so there is
# nothing left to do in this wrapper.
sub lemmatization
{
my ($class, $h_config, $doc_hash) = @_;
# done with the postagging
}
# TODO : Check that term tagging is only performed on english texts
# term_tag: wrapper for the term tagging step.
# NOTE(review): only the variable declarations of this sub are visible
# in this view (the body is elided at the "view on Meta::CPAN" marker
# below), so nothing beyond the declarations is documented here.
sub term_tag
{
my ($class, $h_config, $doc_hash) = @_;
my $cont;
my $word;
my $sentence;
my $i;
my $s;
my $line;
my $tmp;
my %tabh_sent_terms;
my $key;
my $sent;
my $term_regex;
my $term;
my $phrase_idx=1;
my $canonical_form;
my %corpus;
my %lc_corpus;
my $sent_id;
my $command_line;
my %corpus_index;
my %idtrm_select;
my @tab_results;
my $semtag;
my $token_start;
my $token_end;
my $offset_start;
my $offset_end;
my $offset;
my $semantic_unit_id_str;
my $semantic_feature_id_str;
my $sf = 1;
my $token_term;
my $token_term_end;
lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN
# NOTE(review): fragment — the beginning of the enclosing syntactic
# parsing sub is not visible in this view; @tab_mapping, $token_start,
# $token_end, $wordidshift, $relation, $relation_id,
# $syntactic_relation_id, $insentence, $last_token and the SYN_RES
# handle are declared/set in the elided code above.
# NOTE(review): "defined(...) ne ''" below is suspicious — defined()
# yields a boolean, so it can never usefully compare against "";
# the intent was probably $tab_mapping[$token_end+$wordidshift] ne "".
# Confirm against upstream sources before changing.
if((defined($tab_mapping[$token_start+$wordidshift])) && (defined($tab_mapping[$token_end+$wordidshift]) ne "")){
# Create a syntactic_relation element linking head and modifier
# words (token indices remapped to word ids via @tab_mapping).
$syntactic_relation_id = "syntactic_relation$relation_id";
$doc_hash->{$syntactic_relation_id}={};
$doc_hash->{$syntactic_relation_id}->{'id'}=$syntactic_relation_id;
$doc_hash->{$syntactic_relation_id}->{'datatype'}="syntactic_relation";
$doc_hash->{$syntactic_relation_id}->{'syntactic_relation_type'}="$relation";
$doc_hash->{$syntactic_relation_id}->{'refid_head'} = {};
$doc_hash->{$syntactic_relation_id}->{'refid_head'}->{'datatype'}="refid_head";
$doc_hash->{$syntactic_relation_id}->{'refid_head'}->{"refid_word"}="word".$tab_mapping[($token_start+$wordidshift)];
$doc_hash->{$syntactic_relation_id}->{'refid_modifier'} = {};
$doc_hash->{$syntactic_relation_id}->{'refid_modifier'}->{'datatype'}="refid_modifier";
$doc_hash->{$syntactic_relation_id}->{'refid_modifier'}->{"refid_word"}="word".$tab_mapping[($token_end+$wordidshift)];
$relation_id++;
}
}
}
}
# trash everything and continue the loop
$insentence=0;
$wordidshift+=$last_token-1;
}
}
close SYN_RES;
# Keep the parser's result file only when debugging.
$Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'} . ".result.tmp";
$Alvis::NLPPlatform::nb_relations=$relation_id-1;
print STDERR "done - Found $Alvis::NLPPlatform::nb_relations relations.\n";
} else {
print STDERR "No parser for language $Alvis::NLPPlatform::Annotation::ALVISLANGUAGE - continue to the next step\n";
}
push @{$doc_hash->{"log_processing1"}->{"comments"}}, "Found Syntactic Relations : " . $Alvis::NLPPlatform::nb_relations;
}
# semantic_feature_tagging: placeholder wrapper for the semantic
# feature tagging step.  The experimental implementation
# (temp_semantic_feature_tagging, defined elsewhere in this file) is
# deliberately left disabled via the commented-out call below.
sub semantic_feature_tagging
{
my ($class, $h_config, $doc_hash) = @_;
# &temp_semantic_feature_tagging(@arg);
}
# temp_semantic_feature_tagging: experimental semantic tagging step.
# Renders the current annotation structure ($doc_hash) to an XML file
# and runs the external semantic tagger (NLP_tools/SEMTAG_EN) on it.
# French is not handled yet (empty branch below).  The tagger's output
# file is currently discarded (see the commented-out return), so this
# wrapper only produces temporary files and log output as side effects.
sub temp_semantic_feature_tagging
{
my ($class, $h_config, $doc_hash) = @_;
print STDERR " Semantic tagging... ";
my $in_fn = $h_config->{'TMPFILE'} . ".ast.in";
# Name the output file once instead of rebuilding the path twice.
my $out_fn = $h_config->{'TMPFILE'} . ".ast.out";
if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
# French parser command line
}else{
# Three-argument open with an error check (the previous
# two-argument unchecked form silently ignored failures).
open DOC, '>', $in_fn or warn "Can't open the file $in_fn";
binmode(DOC,":utf8");
Alvis::NLPPlatform::Annotation::render_xml($doc_hash, \*DOC, 1);
close DOC;
my $cmdline = $h_config->{'NLP_tools'}->{'SEMTAG_EN'} . " $in_fn > " . $out_fn . " 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
# print STDERR "$cmdline\n";
`$cmdline`;
# Keep the temporary files only when debugging.
$Alvis::NLPPlatform::ALVISDEBUG || unlink $in_fn;
$Alvis::NLPPlatform::ALVISDEBUG || unlink $out_fn;
# $semtagout == doc XML enriched-document
# return $semtagout;
}
print STDERR "done\n";
}
# semantic_relation_tagging: placeholder wrapper; no semantic relation
# tagger is integrated in this default implementation, so the sub
# does nothing.
sub semantic_relation_tagging
{
my ($class, $h_config, $doc_hash) = @_;
}
# anaphora_resolution: placeholder wrapper; no anaphora resolver is
# integrated in this default implementation, so the sub does nothing.
sub anaphora_resolution
{
my ($class, $h_config, $doc_hash) = @_;
}
1;
__END__
=head1 NAME
Alvis::NLPPlatform::NLPWrappers - Perl extension for the wrappers used
for linguistically annotating XML documents in Alvis
=head1 SYNOPSIS
use Alvis::NLPPlatform::NLPWrappers;
Alvis::NLPPlatform::NLPWrappers::tokenize($h_config,$doc_hash);
=head1 DESCRIPTION
This module provides default wrappers for the Natural Language
Processing (NLP) tools. These wrappers are called in the ALVIS NLP
Platform (see C<Alvis::NLPPlatform>).
Default wrappers can be overridden by defining new wrappers in a new
and local UserNLPWrappers module.
=head1 METHODS
( run in 0.361 second using v1.01-cache-2.11-cpan-39bf76dae61 )