utf8 results from the CPAN

Alvis-NLPPlatform

    return($Alvis::NLPPlatform::Annotation::nb_max_tokens);
}


sub scan_ne
{
    my ($class, $h_config, $doc_hash) = @_;

    my $corpus;
    my $token;
    my $line;
    my $id;
    my $tok_ct;

    my @tab_tokens; # experimental
    my $t; # experimental


    my $NE_type;
    my $NE_start;
    my $NE_end;

    my $offset=0;
    my $i;
    my $en=0;
    my $j;
    my $start;
    my $end;
    my $ref_tab;
    my $refid_n;

    my $en_cont;
    my $number_of_tokens;
    my $last_en;

    my $corpus_filename;
    my $result_filename;

    print STDERR "  Named entites tagging...     ";
    
    $corpus="";

    foreach $token(Alvis::NLPPlatform::Annotation::sort(\%Alvis::NLPPlatform::hash_tokens)){
	$tok_ct=$Alvis::NLPPlatform::hash_tokens{$token}; # why not $token ? (TH)
	Alvis::NLPPlatform::XMLEntities::decode($tok_ct);

	# (TH) These replacements are required to work around a bug in
	# TagEN: a named entity following a \n is not analysed, because
	# the "n" is concatenated with the next word.

	$tok_ct=~s/\\n/\\n /go;
	$tok_ct=~s/\\r/\\r /go;
	$tok_ct=~s/\\t/\\t /go;
	$corpus.=$tok_ct;
	push @tab_tokens,$tok_ct;
    }

    $corpus_filename = $h_config->{'TMPFILE'} . ".corpus_en.txt";
    
    open CORPUS,">$corpus_filename";
#     binmode(CORPUS,":utf8");


    print CORPUS Encode::encode_utf8($corpus);
    close CORPUS;

    print STDERR "done\n";
    
    my $command_line;
    if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
	$command_line = $h_config->{'NLP_tools'}->{'NETAG_FR'} . " $corpus_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
    } else {
	$command_line = $h_config->{'NLP_tools'}->{'NETAG_EN'} . " $corpus_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
    }
    # nice idea, though TagEN seems to return 0 anyhow...
    #`$command_line` && print STDERR "FAILED TO EXECUTE \"$command_line\": &!\n";
    `$command_line`;
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $corpus_filename;
    @Alvis::NLPPlatform::en_start=();
    @Alvis::NLPPlatform::en_end=();
    @Alvis::NLPPlatform::en_type=();

    $result_filename = $h_config->{'TMPFILE'} . ".corpus_en.tag.txt";

    open REN,"<$result_filename"  or warn "Can't open the file $result_filename";
    binmode REN;
    while($line=<REN>){
	($NE_type, $NE_start, $NE_end) = split /\t/, $line;
# 	$line=~m/(.+)\s+([0-9]+)\s+([0-9]+)/;
# 	$NE_type = $1;
# 	$NE_start = $2;
# 	$NE_end = $3;
	push @Alvis::NLPPlatform::en_type,$NE_type;
	if ((exists($h_config->{'XML_INPUT'}->{"PRESERVEWHITESPACE"})) && ($h_config->{'XML_INPUT'}->{"PRESERVEWHITESPACE"})) {
	    push @Alvis::NLPPlatform::en_start,($NE_start-1);
	    push @Alvis::NLPPlatform::en_end,($NE_end-1);
	} else {
	    push @Alvis::NLPPlatform::en_start,$NE_start;
	    push @Alvis::NLPPlatform::en_end,$NE_end;
	}
    }
    close REN;

    $Alvis::NLPPlatform::ALVISDEBUG || unlink $result_filename;

#    print STDERR scalar(@Alvis::NLPPlatform::en_type) . " to find\n";

    print STDERR "  Matching EN with tokens...   ";

    # scan tokens and match with NE

    @Alvis::NLPPlatform::en_tokens_start=();
    @Alvis::NLPPlatform::en_tokens_end=();
    %Alvis::NLPPlatform::en_tokens_hash=();
    $number_of_tokens=scalar @tab_tokens;

    $en=$Alvis::NLPPlatform::last_semantic_unit+1;
    $last_en=0;

    my $en_str = "";
    for($t=0;$t<$number_of_tokens;$t++){
	print STDERR "\r  Matching EN with tokens...   ".($t+1)."/".$number_of_tokens." ";
	for($i=$last_en;$i<scalar @Alvis::NLPPlatform::en_start;$i++){
# 	    print STDERR "\ti = $i :: last_en = $last_en\n";

lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN

		$ref_tab->{'datatype'}="list_refid_token";
		$en_cont="";
		$refid_n=1;
		my @tab_tokens_en;
		$ref_tab->{"refid_token"}=\@tab_tokens_en;
		for($j=$start;$j<=$end;$j++){
		    push @tab_tokens_en, "token$j";
		    $refid_n++;
		    $en_cont.=$Alvis::NLPPlatform::hash_tokens{"token$j"};
		}
		$doc_hash->{$en_str}->{"named_entity"}->{"form"}=$en_cont;

		$Alvis::NLPPlatform::hash_named_entities{$en_str}=$en_cont;

		$en++;
		last; # go out the Named Entity hash table scan
	    }
	}
	$offset+=length($tab_tokens[$t]);
    }
    $Alvis::NLPPlatform::last_semantic_unit=$en ;
    print STDERR "done - Found ". ($Alvis::NLPPlatform::last_semantic_unit - 1) ." named entities\n";
    push @{$doc_hash->{"log_processing1"}->{"comments"}},  "Found Named Entities : " . ($Alvis::NLPPlatform::last_semantic_unit - 1);
}


sub word_segmentation
{
    my ($class, $h_config, $doc_hash) = @_;
    my $token;
    my $id;
    my $nb_doc;
    my $command_line;

    my $proposedword;
    my $current_word = "";
    my $token_id;
    my $word_id;
    my $ref_tab;
    my $elision;
    my $i;

    my $is_en;
    my $en_id;
    my $token_end;
    my $token_start;
    my $append;
    my $refid_n;

    my $token_tmp;

    my $corpus_filename;
    my $result_filename;

    my $token_id_str;
    my $word_id_str;

####
    print STDERR "  Word segmentation...    ";
    my $content;
#     open CORPUS,">:utf8",$h_config->{'TMPFILE'} . ".corpus.tmp";

    $corpus_filename = $h_config->{'TMPFILE'} . ".corpus_word.tmp";
    $result_filename = $h_config->{'TMPFILE'} . ".words.tmp";

    open CORPUS,">$corpus_filename";
#    binmode(CORPUS);
#     binmode(CORPUS, ":utf8");
    foreach $token(Alvis::NLPPlatform::Annotation::sort(\%Alvis::NLPPlatform::hash_tokens)){
	$content=$Alvis::NLPPlatform::hash_tokens{$token};
	$content=~s/\\n/\n/og;
	$content=~s/\\t/\t/og;
	$content=~s/\\r/\r/og;
	#Encode::decode_utf8("Å“")
#	$content =~ s/\x{65}/oe/g;

	Alvis::NLPPlatform::XMLEntities::decode($content);
#  	Encode::from_to($content, "utf8", "iso-8859-1");
  	print CORPUS Encode::encode("iso-8859-1", $content, Encode::FB_DEFAULT);
#	print CORPUS $content;
    }
    close CORPUS;

    if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
	$command_line = $h_config->{"NLP_tools"}->{'WORDSEG_FR'} . " < $corpus_filename > $result_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
    }else{
	$command_line = $h_config->{"NLP_tools"}->{'WORDSEG_EN'} . " < $corpus_filename > $result_filename 2>> ". $Alvis::NLPPlatform::ALVISLOGFILE;
    }

    `$command_line`;
    
    open(MOTS, $result_filename) or warn "Can't open the file $result_filename";;
#    binmode(MOTS,":utf8");
     binmode(MOTS);
    
    $token_id=1;
    $word_id=1;
    
    $token_id_str = "token$token_id";
    while($proposedword=<MOTS>)
    {
#	$proposedword = Encode::encode_utf8($proposedword);
	$word_id_str = "word$word_id";
#	if ($proposedword !~ /^[\s ]*\n$/o) {
	if ($proposedword !~ /^[\s\x{A0}]*\n$/o) {
	    chomp $proposedword;

#	    print STDERR $proposedword;
	    $current_word="";
	    $doc_hash->{$word_id_str}={};
	    $doc_hash->{$word_id_str}->{'id'}=$word_id_str;
	    $doc_hash->{$word_id_str}->{'datatype'}='word';
	    $ref_tab=$doc_hash->{$word_id_str}->{'list_refid_token'}={};
	    $ref_tab->{'datatype'}="list_refid_token";
	    my @tab_tokens;
	    $refid_n=1;
	    $ref_tab->{"refid_token"}=\@tab_tokens;

	    $is_en=0;
	    while(length($current_word)<length($proposedword)){
		if($token_id>$Alvis::NLPPlatform::Annotation::nb_max_tokens){
		    $Alvis::NLPPlatform::dont_annotate=1;
		    return;
		}
		if($doc_hash->{$token_id_str}->{'type'} ne "sep"){
		    if(exists $Alvis::NLPPlatform::en_tokens_hash{$token_id}){
			$en_id=$Alvis::NLPPlatform::en_tokens_hash{$token_id};
			$is_en=1;
		    }

		    $token_tmp=$Alvis::NLPPlatform::hash_tokens{$token_id_str};
		    ################################
		    $token_tmp=~s/\\n/\n/og;
		    $token_tmp=~s/\\t/\t/og;
		    $token_tmp=~s/\\r/\r/og;
		    Alvis::NLPPlatform::XMLEntities::decode($token_tmp);
		    ################################

		    $token_tmp=~s/\s+/ /og;
		    $current_word=$current_word.$token_tmp;
		    push @tab_tokens, $token_id_str;
		    if($refid_n==1){
			$Alvis::NLPPlatform::word_start[$word_id]=$token_id;
		    }
		    $Alvis::NLPPlatform::word_end[$word_id]=$token_id;
		    $refid_n++;
		}
		$token_id++;
		$token_id_str = "token$token_id";
	    }
	    #### is the rebuilt word a named entity ? is it fully built
	    my $append;
	    if($is_en){
		$Alvis::NLPPlatform::en_tokens_start[$en_id] =~ m/^token([0-9]+)/io;
		$token_start=$1;

		$Alvis::NLPPlatform::en_tokens_end[$en_id] =~ m/^token([0-9]+)/io;
		$token_end=$1;

		while($token_end>($token_id-1)){
		    $token_tmp=$Alvis::NLPPlatform::hash_tokens{$token_id_str};
		    ################################

lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN

	$doc_hash->{$sentence_id_str}->{'id'}=$sentence_id_str;
	$doc_hash->{$sentence_id_str}->{'datatype'}='sentence';
	$doc_hash->{$sentence_id_str}->{'refid_start_token'}="token$start_token";
	$doc_hash->{$sentence_id_str}->{'refid_end_token'}="token$btw_end";
	$sentence_cont =~s /\\n/\n/go;
	$sentence_cont =~s /\\r/\r/go;
	$sentence_cont =~s /\\t/\t/go;
	$doc_hash->{$sentence_id_str}->{'form'}=$sentence_cont;
	$sentence_cont="";
    }

    foreach $sentence(keys %$doc_hash){
	if($sentence=~/^(sentence[0-9]+)/o){
	    $Alvis::NLPPlatform::hash_sentences{$1}=$doc_hash->{$1}->{'form'};
	}
    }
    $Alvis::NLPPlatform::number_of_sentences=$sentence_id-1;
    print STDERR "done - Found ".$Alvis::NLPPlatform::number_of_sentences." sentences\n";
    push @{$doc_hash->{"log_processing1"}->{"comments"}},  "Found Sentences: $Alvis::NLPPlatform::number_of_sentences";
}



# Part-of-speech tagging wrapper (the lemmas are produced in the same pass).
#
# Arguments:
#   $class    - class name (the sub is called as a class method)
#   $h_config - configuration hash; reads TMPFILE, NLP_tools->POSTAG_FR /
#               POSTAG_EN and NLP_misc->POSTAG_LIST->{FR,EN}
#   $doc_hash - document annotation hash, enriched in place
#
# Processing:
#   1. Every entry of %Alvis::NLPPlatform::hash_words_punct is written, one
#      per line and encoded in ISO-8859-1, to "<TMPFILE>.corpus_pos.tmp".
#   2. The configured tagger is run through the shell; its output goes to
#      "<TMPFILE>.tags.tmp" and its STDERR is appended to the Alvis log file.
#   3. Each output line (inflected form, tag, lemma, tab separated) is aligned
#      with the word list.  For entries that are real words (present in
#      %Alvis::NLPPlatform::hash_words), a "morphosyntactic_features<N>" and a
#      "lemma<N>" element are stored in $doc_hash, and
#      %Alvis::NLPPlatform::hash_postags / hash_lemmas are filled.
#   4. Both temporary files are removed unless ALVISDEBUG is set.
#
# A failure to open either temporary file is reported with warn(), in the
# same way as the other wrappers of this module, and the tagging result is
# then simply empty.
sub pos_tag
{
    my ($class, $h_config, $doc_hash) = @_;
    my $word;
    my $cont;
    my $line;
    my $word_id = 0;
    my $tag;
    my $lemma;
    my $inflected;

    my $corpus_filename;
    my $result_filename;

    my $morphosyntactic_features_id;
    my $lemma_id;
    my $word_id_str;
    my $word_punct_id_str;

    $corpus_filename = $h_config->{'TMPFILE'} . ".corpus_pos.tmp";
    $result_filename = $h_config->{'TMPFILE'} . ".tags.tmp";

    print STDERR "  Part-Of-Speech tagging..";

    # TH - 16/07/2007 - replacement of hash_words by hash_words_punct
    # Build the tagger input: one word or punctuation mark per line.
    my $fullcontent = "";
    foreach $word (Alvis::NLPPlatform::Annotation::sort(\%Alvis::NLPPlatform::hash_words_punct)){
        $cont=$Alvis::NLPPlatform::hash_words_punct{$word};
        $fullcontent .= Encode::encode("iso-8859-1", $cont, Encode::FB_DEFAULT);
        $fullcontent .= "\n";
    }

    # $fullcontent already holds encoded bytes, so no I/O layer is needed.
    my $corpus_fh;
    if (open($corpus_fh, '>', $corpus_filename)) {
        print $corpus_fh $fullcontent;
        close($corpus_fh) or warn "Can't close the file $corpus_filename: $!";
    } else {
        warn "Can't open the file $corpus_filename: $!";
    }

    my $command_line;
    if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
        $command_line = $h_config->{'NLP_tools'}->{'POSTAG_FR'} . " < $corpus_filename  > $result_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
    }else{
        $command_line = $h_config->{'NLP_tools'}->{'POSTAG_EN'} . " < $corpus_filename  > $result_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
    }
    `$command_line`;

    my $tags_fh;
    my $tags_opened = open($tags_fh, '<', $result_filename);
    if ($tags_opened) {
        binmode($tags_fh);
    } else {
        warn "Can't open the file $result_filename: $!";
    }
    $word_id=0;

    # $word_punct_id indexes hash_words_punct (words AND punctuation);
    # $decal is the (negative) shift to apply to obtain the index in
    # hash_words (words only).
    my $decal = 0;
    my $word_punct_id = 1;
    $word_punct_id_str = "word$word_punct_id";

    while ($tags_opened && defined($line = <$tags_fh>)) {
        # NOTE(review): the corpus is written as ISO-8859-1 but the result
        # is decoded as latin9 (ISO-8859-15) - confirm this is intended.
        $line = Encode::decode("latin9",$line);
        chomp $line;
        ($inflected, $tag, $lemma) = split /\t/, $line;

        # An empty output line yields no field at all.
        if (!defined $inflected) { $inflected = ""; }

        $word_id = $word_punct_id + $decal;
        $word_id_str = "word$word_id";

        if ((!exists $Alvis::NLPPlatform::hash_words{$word_id_str}) || ($Alvis::NLPPlatform::hash_words_punct{$word_punct_id_str} ne $Alvis::NLPPlatform::hash_words{$word_id_str})) {
            # it is not a word
            # punctuation, delay incrementation of index "decal"
            $decal--;
        } else {
            #######################################################
            # Correct outputs from treetagger
            if (!defined $tag) { $tag = "NP"; $lemma = $inflected;}

            # The tagger returned no lemma column: fall back to the
            # inflected form, as is done above when the tag is missing.
            if (!defined $lemma) { $lemma = $inflected; }

            # A tag that is not in the configured list for the current
            # language is replaced by NP, unless the "tag" is merely a
            # copy of the inflected form.
            if ((($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR") && (!exists($h_config->{"NLP_misc"}->{"POSTAG_LIST"}->{"FR"}->{$tag})))||
                (($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE ne "FR") && (!exists($h_config->{"NLP_misc"}->{"POSTAG_LIST"}->{"EN"}->{$tag})))){
                if ($inflected ne $tag){ # ???
                    $tag="NP";
                }
            }

            # in case of named entities, we remove '_' in lemma and inflected form
            # and we force the POS tag to be NP
            if ((index($lemma,"_")>-1)&&(index($inflected,"_")==-1)){
                $lemma=~s/\_/ /og;
                $tag="NP";
            }

            # in case of number, lemma is the same as the inflected form
            if ($lemma eq '@card@'){
                $lemma=$inflected;
            }
            #######################################################

            # POS tag
            $morphosyntactic_features_id = "morphosyntactic_features$word_id";
            $doc_hash->{$morphosyntactic_features_id}={};
            $doc_hash->{$morphosyntactic_features_id}->{'id'}=$morphosyntactic_features_id;
            $doc_hash->{$morphosyntactic_features_id}->{'datatype'}="morphosyntactic_features";
            $doc_hash->{$morphosyntactic_features_id}->{'refid_word'}=$word_id_str;
            $doc_hash->{$morphosyntactic_features_id}->{'syntactic_category'}="$tag";

            $Alvis::NLPPlatform::hash_postags{$word_id_str}=$tag;

            # lemma
            $lemma_id = "lemma$word_id";
            $doc_hash->{$lemma_id}={};
            $doc_hash->{$lemma_id}->{'id'}=$lemma_id;
            $doc_hash->{$lemma_id}->{'datatype'}="lemma";
            $doc_hash->{$lemma_id}->{'refid_word'}=$word_id_str;
            $doc_hash->{$lemma_id}->{'canonical_form'}="$lemma";

            $Alvis::NLPPlatform::hash_lemmas{$word_id_str}=$lemma;
        }
        $word_punct_id++;
        $word_punct_id_str = "word$word_punct_id";
    }
    if ($tags_opened) {
        close($tags_fh);
    }

    $Alvis::NLPPlatform::ALVISDEBUG || unlink $corpus_filename;
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $result_filename;

    # NOTE(review): $word_id is the word index computed for the LAST output
    # line; when that line is punctuation it looks like it is one more than
    # the number of tags actually stored - confirm before relying on it.
    print STDERR "done - Found " . $word_id ." tags.\n";
    push @{$doc_hash->{"log_processing1"}->{"comments"}},  "Found POS Tags: " . $word_id ;
}

# sub pos_tag # WRAPPER FOR BRILL
# {
#     my $word;
#     my $cont;

#     print STDERR "   Part-Of-Speech tagging...";
#     open CORPUS,">$TMPFILE.corpus.tmp";
#     binmode(CORPUS,":utf8");
#     foreach $word(sort Alvis::NLPPlatform::Annotation::sort_keys keys %Alvis::NLPPlatform::hash_words){
# 	$cont=$Alvis::NLPPlatform::hash_words{$word};
# 	print CORPUS "$cont ";
# 	if($cont eq "."){
# 	    print CORPUS "\n";
# 	}
#     }
#     close CORPUS;
# }


# Lemmatization wrapper.
#
# Arguments: $class, $h_config (configuration hash), $doc_hash (document
# annotation hash).  The arguments are unpacked but not used: this default
# wrapper performs no processing of its own.
sub lemmatization
{
    my ($class, $h_config, $doc_hash) = @_;

    # Nothing to do here: the "lemma<N>" elements are already created by
    # pos_tag, which reads the lemma from the tagger output.
}


# TODO : Check that term tagging is only performed on english texts

sub term_tag
{
    my ($class, $h_config, $doc_hash) = @_;

    my $cont;
    my $word;
    my $sentence;
    my $i;
    my $s;
    my $line;
    my $tmp;
    my %tabh_sent_terms;
    my $key;
    my $sent;
    my $term_regex;
    my $term;
    my $phrase_idx=1;
    my $canonical_form;
    my %corpus;
    my %lc_corpus;
    my $sent_id;
    my $command_line;
    my %corpus_index;
    my %idtrm_select;
    my @tab_results;
    my $semtag;

    my $token_start;
    my $token_end;
    my $offset_start;
    my $offset_end;
    my $offset;

    my $semantic_unit_id_str;
    my $semantic_feature_id_str;
    my $sf = 1;

    my $token_term;
    my $token_term_end;

lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN

			    if((defined($tab_mapping[$token_start+$wordidshift])) && (defined($tab_mapping[$token_end+$wordidshift]) ne "")){
				$syntactic_relation_id = "syntactic_relation$relation_id";
				$doc_hash->{$syntactic_relation_id}={};
				$doc_hash->{$syntactic_relation_id}->{'id'}=$syntactic_relation_id;
				$doc_hash->{$syntactic_relation_id}->{'datatype'}="syntactic_relation";
				$doc_hash->{$syntactic_relation_id}->{'syntactic_relation_type'}="$relation";
				$doc_hash->{$syntactic_relation_id}->{'refid_head'} = {};
				$doc_hash->{$syntactic_relation_id}->{'refid_head'}->{'datatype'}="refid_head";
				$doc_hash->{$syntactic_relation_id}->{'refid_head'}->{"refid_word"}="word".$tab_mapping[($token_start+$wordidshift)];
				$doc_hash->{$syntactic_relation_id}->{'refid_modifier'} = {};
				$doc_hash->{$syntactic_relation_id}->{'refid_modifier'}->{'datatype'}="refid_modifier";
				$doc_hash->{$syntactic_relation_id}->{'refid_modifier'}->{"refid_word"}="word".$tab_mapping[($token_end+$wordidshift)];
				$relation_id++;
			    }
			}
		    }
		}
		# trash everything and continue the loop

		$insentence=0;
		$wordidshift+=$last_token-1;
	    }
	}
	close SYN_RES;

	$Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'} . ".result.tmp";

	$Alvis::NLPPlatform::nb_relations=$relation_id-1;

	print STDERR "done - Found $Alvis::NLPPlatform::nb_relations relations.\n";
    } else {
	print STDERR "No parser for language $Alvis::NLPPlatform::Annotation::ALVISLANGUAGE - continue to the next step\n";
    }
    
    push @{$doc_hash->{"log_processing1"}->{"comments"}},  "Found Syntactic Relations : " . $Alvis::NLPPlatform::nb_relations;
}



# Semantic feature tagging wrapper.
#
# Arguments: $class, $h_config (configuration hash), $doc_hash (document
# annotation hash).  The arguments are unpacked but not used: this default
# wrapper currently performs no processing.
sub semantic_feature_tagging
{

    my ($class, $h_config, $doc_hash) = @_;

# Disabled call to the experimental implementation
# (temp_semantic_feature_tagging), kept for reference:
#    &temp_semantic_feature_tagging(@arg);

}

# Experimental semantic tagging wrapper (its only visible reference is a
# commented-out call in semantic_feature_tagging).
#
# Arguments:
#   $class    - class name (the sub is called as a class method)
#   $h_config - configuration hash; reads TMPFILE and NLP_tools->SEMTAG_EN
#   $doc_hash - document annotation hash, rendered as XML for the tool
#
# For any language other than French, the document is rendered into
# "<TMPFILE>.ast.in", the SEMTAG_EN tool is run on it with its output
# redirected to "<TMPFILE>.ast.out" (STDERR appended to the Alvis log file),
# and both files are removed unless ALVISDEBUG is set.  The tool output is
# not read back.  Nothing is done for French.
#
# If the input file cannot be created, a warning is emitted and the tool is
# not run (there would be nothing for it to read).
sub temp_semantic_feature_tagging
{
    my ($class, $h_config, $doc_hash) = @_;

    print STDERR "  Semantic tagging...     ";

    my $in_fn  = $h_config->{'TMPFILE'} . ".ast.in";
    my $out_fn = $h_config->{'TMPFILE'} . ".ast.out";

    if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
        # French parser command line
    }else{
        my $doc_fh;
        if (open($doc_fh, '>', $in_fn)) {
            binmode($doc_fh, ":utf8");
            Alvis::NLPPlatform::Annotation::render_xml($doc_hash, $doc_fh, 1);
            close($doc_fh) or warn "Can't close the file $in_fn: $!";

            my $cmdline = $h_config->{'NLP_tools'}->{'SEMTAG_EN'} . " $in_fn > " . $out_fn . " 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
#           print STDERR "$cmdline\n";

            `$cmdline`;
            $Alvis::NLPPlatform::ALVISDEBUG || unlink $in_fn;
            $Alvis::NLPPlatform::ALVISDEBUG || unlink $out_fn;
            # $semtagout == doc XML enriched-document
#           return $semtagout;
        } else {
            warn "Can't open the file $in_fn: $!";
        }
    }
    print STDERR "done\n";
}



# Semantic relation tagging wrapper.
#
# Arguments: $class, $h_config (configuration hash), $doc_hash (document
# annotation hash).  Placeholder: the arguments are unpacked and no
# processing is performed.
sub semantic_relation_tagging
{

    my ($class, $h_config, $doc_hash) = @_;

}


# Anaphora resolution wrapper.
#
# Arguments: $class, $h_config (configuration hash), $doc_hash (document
# annotation hash).  Placeholder: the arguments are unpacked and no
# processing is performed.
sub anaphora_resolution
{
    my ($class, $h_config, $doc_hash) = @_;


}


1;

__END__

=head1 NAME

Alvis::NLPPlatform::NLPWrappers - Perl extension for the wrappers used
for linguistically annotating XML documents in Alvis

=head1 SYNOPSIS

use Alvis::NLPPlatform::NLPWrappers;

Alvis::NLPPlatform::NLPWrappers::tokenize($h_config,$doc_hash);

=head1 DESCRIPTION

This module provides default wrappers for the Natural Language
Processing (NLP) tools. These wrappers are called by the ALVIS NLP
Platform (see C<Alvis::NLPPlatform>).

Default wrappers can be overridden by defining new wrappers in a new
and local UserNLPWrappers module.

=head1 METHODS

( run in 0.361 second using v1.01-cache-2.11-cpan-39bf76dae61 )