Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform/NLPWrappers.pm  view on Meta::CPAN


    $Alvis::NLPPlatform::ALVISDEBUG || unlink $corpus_filename;
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $result_filename;

    print STDERR "done - Found " . $word_id ." tags.\n";
    push @{$doc_hash->{"log_processing1"}->{"comments"}},  "Found POS Tags: " . $word_id ;
}

# sub pos_tag # WRAPPER FOR BRILL
# {
#     my $word;
#     my $cont;

#     print STDERR "   Part-Of-Speech tagging...";
#     open CORPUS,">$TMPFILE.corpus.tmp";
#     binmode(CORPUS,":utf8");
#     foreach $word(sort Alvis::NLPPlatform::Annotation::sort_keys keys %Alvis::NLPPlatform::hash_words){
# 	$cont=$Alvis::NLPPlatform::hash_words{$word};
# 	print CORPUS "$cont ";
# 	if($cont eq "."){
# 	    print CORPUS "\n";
# 	}
#     }
#     close CORPUS;
# }


# lemmatization($class, $h_config, $doc_hash)
#
# Lemmatization step of the NLP wrappers.  This sub is a deliberate no-op:
# per the original author's note in the body, lemmatization is done together
# with the POS tagging, so nothing is left to do here.
#
# Parameters (unpacked but never used in this sub):
#   $class     - first argument; presumably the invocant when called as a
#                class method -- confirm against the callers
#   $h_config  - second argument; presumably the same configuration hash
#                reference that term_tag reads via $h_config->{'NLP_misc'}
#   $doc_hash  - third argument; presumably the same document hash reference
#                that term_tag reads and updates
#
# Returns: nothing meaningful.  There is no explicit return, so the sub
# yields the value of its last evaluated statement (the argument-unpacking
# list assignment).  NOTE(review): callers should not rely on that value.
sub lemmatization
{
    # Unpack the arguments in the same order as the other wrappers
    # (compare term_tag); none of them is used afterwards.
    my ($class, $h_config, $doc_hash) = @_;

    # Nothing to do here: lemmatization is done with the POS tagging.
}


# TODO: Check that term tagging is only performed on English texts

sub term_tag
{
    my ($class, $h_config, $doc_hash) = @_;

    my $cont;
    my $word;
    my $sentence;
    my $i;
    my $s;
    my $line;
    my $tmp;
    my %tabh_sent_terms;
    my $key;
    my $sent;
    my $term_regex;
    my $term;
    my $phrase_idx=1;
    my $canonical_form;
    my %corpus;
    my %lc_corpus;
    my $sent_id;
    my $command_line;
    my %corpus_index;
    my %idtrm_select;
    my @tab_results;
    my $semtag;

    my $token_start;
    my $token_end;
    my $offset_start;
    my $offset_end;
    my $offset;

    my $semantic_unit_id_str;
    my $semantic_feature_id_str;
    my $sf = 1;

    my $token_term;
    my $token_term_end;
    my $j;


    print STDERR "  Term tagging...         ";

    $sent_id = 1;
    foreach $sentence(Alvis::NLPPlatform::Annotation::sort(\%Alvis::NLPPlatform::hash_sentences)){
	$tmp = "$Alvis::NLPPlatform::hash_sentences{$sentence}\n";
 	$tmp=~s/\n/ /go;
	$tmp=~s/\r/ /go;
	$tmp=~s/\t/ /go;
# 	$tmp=~s/\n/\\n/go;
# 	$tmp=~s/\r/\\r/go;
# 	$tmp=~s/\t/\\t/go;
# 	print STDERR "$tmp\n";
	$corpus{$sent_id} = $tmp;
	$lc_corpus{$sent_id} = lc($tmp);
	$sent_id++;
    }



    # Term list loading 

    if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
	if (scalar(@term_list_FR) == 0) {
	    Alvis::TermTagger::load_TermList($h_config->{'NLP_misc'}->{'TERM_LIST_FR'},\@term_list_FR);
	      Alvis::TermTagger::get_Regex_TermList(\@term_list_FR, \@regex_term_list_FR);
	  }
	Alvis::TermTagger::corpus_Indexing(\%lc_corpus, \%corpus_index);
	Alvis::TermTagger::term_Selection(\%corpus_index, \@term_list_FR, \%idtrm_select);
	Alvis::TermTagger::term_tagging_offset_tab(\@term_list_FR, \@regex_term_list_FR, \%idtrm_select, \%corpus, \%tabh_sent_terms);
    } else {
	if (scalar(@term_list_EN) == 0) {
	    Alvis::TermTagger::load_TermList($h_config->{'NLP_misc'}->{'TERM_LIST_EN'},\@term_list_EN);
	      Alvis::TermTagger::get_Regex_TermList(\@term_list_EN, \@regex_term_list_EN);
	  }
	Alvis::TermTagger::corpus_Indexing(\%lc_corpus, \%corpus_index);
	Alvis::TermTagger::term_Selection(\%corpus_index, \@term_list_EN, \%idtrm_select);
	Alvis::TermTagger::term_tagging_offset_tab(\@term_list_EN, \@regex_term_list_EN, \%idtrm_select, \%corpus, \%tabh_sent_terms);
      }
    %lc_corpus = ();
    %corpus_index = ();
    %idtrm_select = ();
    %corpus = ();

# TODO : taking into account the case where terms appear at least twice in a sentence

    $i=0;
    for $key (keys %tabh_sent_terms) {
	$sent = $tabh_sent_terms{$key}->[0];
	$term = $tabh_sent_terms{$key}->[1];
	$term_regex = $term;
 	$term_regex =~ s/ /\[ \n\]+/go;
#  	print STDERR "try to find $term in sentence$sent\n";


        $canonical_form = $tabh_sent_terms{$key}->[2];
        $semtag = $tabh_sent_terms{$key}->[3];

	# look for the term in the sentence, compute the reference to the words
	$token_term = -1;
	$offset = 0;
	while (($offset != -1)&&($token_term == -1)) {
	    if ($Alvis::NLPPlatform::hash_sentences{"sentence$sent"} =~ /$term_regex/igc) { # replace regex by index/subtring ?
		$offset = length($`);
	    } else {
		$offset = -1;
	    }
#  		print STDERR "Found (offset = $offset)\n";
	    if ($offset != -1) {
		$doc_hash->{"sentence$sent"}->{"refid_start_token"}=~m/token([0-9]+)/i;
		$token_start=$1;
		$doc_hash->{"sentence$sent"}->{"refid_end_token"}=~m/token([0-9]+)/i;
		$token_end=$1;
		$offset_start=$doc_hash->{"token$token_start"}->{"from"};
		$offset_end=$doc_hash->{"token$token_end"}->{"to"};

		$offset+=$offset_start;

#  		print STDERR "Search token starting at $offset\n";
		for($j=$token_start;$j<$token_end;$j++){
# 		    print STDERR "Current offset : " . $doc_hash->{"token$j"}->{"from"} . "\n";
		    if($doc_hash->{"token$j"}->{"from"}==$offset){
			$token_term=$j;
			last;
		    }
		}
# 		print STDERR "Token Term start at $token_term\n";
		if ($token_term != -1) {
		    $cont="";
		    my @tab_tokens;
		    for($j=$token_term;length($cont)<length($term);$j++){
			$cont.=$Alvis::NLPPlatform::hash_tokens{"token$j"};
			push @tab_tokens, "token$j";
			$cont =~ s/\\[nrt]/ /go;
		    }
# 		    print STDERR "$cont\n";
		    if (length($cont) == length($term)) {
			$token_term_end=$j-1;
			$Alvis::NLPPlatform::hash_sentences{"sentence$sent"} =~ /^/g;
			
			# Creation of a semantic unit
			$s=$Alvis::NLPPlatform::last_semantic_unit;



( run in 1.939 second using v1.01-cache-2.11-cpan-39bf76dae61 )