Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN
$Alvis::NLPPlatform::ALVISDEBUG || unlink $corpus_filename;
$Alvis::NLPPlatform::ALVISDEBUG || unlink $result_filename;
print STDERR "done - Found " . $word_id ." tags.\n";
push @{$doc_hash->{"log_processing1"}->{"comments"}}, "Found POS Tags: " . $word_id ;
}
# Legacy Brill POS-tagger wrapper, intentionally kept commented out for
# reference (superseded by the active pos_tag implementation above):
# sub pos_tag # WRAPPER FOR BRILL
# {
# my $word;
# my $cont;
# print STDERR " Part-Of-Speech tagging...";
# open CORPUS,">$TMPFILE.corpus.tmp";
# binmode(CORPUS,":utf8");
# foreach $word(sort Alvis::NLPPlatform::Annotation::sort_keys keys %Alvis::NLPPlatform::hash_words){
# $cont=$Alvis::NLPPlatform::hash_words{$word};
# print CORPUS "$cont ";
# if($cont eq "."){
# print CORPUS "\n";
# }
# }
# close CORPUS;
# }
# Lemmatization stage of the NLP pipeline — intentionally a no-op.
# Presumably the lemmas are already produced during the POS-tagging step
# (the original comment below says "done with the postagging"); the sub is
# kept so the platform can invoke every pipeline stage uniformly.
# TODO(review): confirm no caller relies on this sub's return value.
#
# Arguments:
#   $class    - package name (invoked as a class/method call)
#   $h_config - hashref with the platform configuration (unused here)
#   $doc_hash - hashref holding the document annotations (left untouched)
sub lemmatization
{
my ($class, $h_config, $doc_hash) = @_;
# done with the postagging
}
# TODO : Check that term tagging is only performed on english texts
sub term_tag
{
my ($class, $h_config, $doc_hash) = @_;
my $cont;
my $word;
my $sentence;
my $i;
my $s;
my $line;
my $tmp;
my %tabh_sent_terms;
my $key;
my $sent;
my $term_regex;
my $term;
my $phrase_idx=1;
my $canonical_form;
my %corpus;
my %lc_corpus;
my $sent_id;
my $command_line;
my %corpus_index;
my %idtrm_select;
my @tab_results;
my $semtag;
my $token_start;
my $token_end;
my $offset_start;
my $offset_end;
my $offset;
my $semantic_unit_id_str;
my $semantic_feature_id_str;
my $sf = 1;
my $token_term;
my $token_term_end;
my $j;
print STDERR " Term tagging... ";
$sent_id = 1;
foreach $sentence(Alvis::NLPPlatform::Annotation::sort(\%Alvis::NLPPlatform::hash_sentences)){
$tmp = "$Alvis::NLPPlatform::hash_sentences{$sentence}\n";
$tmp=~s/\n/ /go;
$tmp=~s/\r/ /go;
$tmp=~s/\t/ /go;
# $tmp=~s/\n/\\n/go;
# $tmp=~s/\r/\\r/go;
# $tmp=~s/\t/\\t/go;
# print STDERR "$tmp\n";
$corpus{$sent_id} = $tmp;
$lc_corpus{$sent_id} = lc($tmp);
$sent_id++;
}
# Term list loading
if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
if (scalar(@term_list_FR) == 0) {
Alvis::TermTagger::load_TermList($h_config->{'NLP_misc'}->{'TERM_LIST_FR'},\@term_list_FR);
Alvis::TermTagger::get_Regex_TermList(\@term_list_FR, \@regex_term_list_FR);
}
Alvis::TermTagger::corpus_Indexing(\%lc_corpus, \%corpus_index);
Alvis::TermTagger::term_Selection(\%corpus_index, \@term_list_FR, \%idtrm_select);
Alvis::TermTagger::term_tagging_offset_tab(\@term_list_FR, \@regex_term_list_FR, \%idtrm_select, \%corpus, \%tabh_sent_terms);
} else {
if (scalar(@term_list_EN) == 0) {
Alvis::TermTagger::load_TermList($h_config->{'NLP_misc'}->{'TERM_LIST_EN'},\@term_list_EN);
Alvis::TermTagger::get_Regex_TermList(\@term_list_EN, \@regex_term_list_EN);
}
Alvis::TermTagger::corpus_Indexing(\%lc_corpus, \%corpus_index);
Alvis::TermTagger::term_Selection(\%corpus_index, \@term_list_EN, \%idtrm_select);
Alvis::TermTagger::term_tagging_offset_tab(\@term_list_EN, \@regex_term_list_EN, \%idtrm_select, \%corpus, \%tabh_sent_terms);
}
%lc_corpus = ();
%corpus_index = ();
%idtrm_select = ();
%corpus = ();
# TODO : taking into account the case where terms appear at least twice in a sentence
$i=0;
for $key (keys %tabh_sent_terms) {
$sent = $tabh_sent_terms{$key}->[0];
$term = $tabh_sent_terms{$key}->[1];
$term_regex = $term;
$term_regex =~ s/ /\[ \n\]+/go;
# print STDERR "try to find $term in sentence$sent\n";
$canonical_form = $tabh_sent_terms{$key}->[2];
$semtag = $tabh_sent_terms{$key}->[3];
# look for the term in the sentence, compute the reference to the words
$token_term = -1;
$offset = 0;
while (($offset != -1)&&($token_term == -1)) {
if ($Alvis::NLPPlatform::hash_sentences{"sentence$sent"} =~ /$term_regex/igc) { # replace regex by index/subtring ?
$offset = length($`);
} else {
$offset = -1;
}
# print STDERR "Found (offset = $offset)\n";
if ($offset != -1) {
$doc_hash->{"sentence$sent"}->{"refid_start_token"}=~m/token([0-9]+)/i;
$token_start=$1;
$doc_hash->{"sentence$sent"}->{"refid_end_token"}=~m/token([0-9]+)/i;
$token_end=$1;
$offset_start=$doc_hash->{"token$token_start"}->{"from"};
$offset_end=$doc_hash->{"token$token_end"}->{"to"};
$offset+=$offset_start;
# print STDERR "Search token starting at $offset\n";
for($j=$token_start;$j<$token_end;$j++){
# print STDERR "Current offset : " . $doc_hash->{"token$j"}->{"from"} . "\n";
if($doc_hash->{"token$j"}->{"from"}==$offset){
$token_term=$j;
last;
}
}
# print STDERR "Token Term start at $token_term\n";
if ($token_term != -1) {
$cont="";
my @tab_tokens;
for($j=$token_term;length($cont)<length($term);$j++){
$cont.=$Alvis::NLPPlatform::hash_tokens{"token$j"};
push @tab_tokens, "token$j";
$cont =~ s/\\[nrt]/ /go;
}
# print STDERR "$cont\n";
if (length($cont) == length($term)) {
$token_term_end=$j-1;
$Alvis::NLPPlatform::hash_sentences{"sentence$sent"} =~ /^/g;
# Creation of a semantic unit
$s=$Alvis::NLPPlatform::last_semantic_unit;
( run in 1.939 second using v1.01-cache-2.11-cpan-39bf76dae61 )