utf8 results from the CPAN

utf8

Alvis-NLPPlatform

view release on metacpan or search on metacpan

lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN

    my $token_start;
    my $token_end;
    my $relation;
    my $left_wall;
    my $right_wall;

    my $relation_id;

    my @arr_tokens;
    my $last_token;
    my $wordidshift=0;

    my $phrase_idx=$Alvis::NLPPlatform::Annotation::phrase_idx;

    my $word;
    my $word_cont;
    my $word_id;
    my $i;
    my $sentences_cont="";

    my @tab_word_punct;
    my @tab_word;
    my $idx_tab_word_punct=1;
    my $idx_tab_word=1;
    my @tab_mapping;

    # print out words+punct and fill in a tab
    push @tab_word_punct," ";
    push @tab_word," ";
    my $decal=1;

    my $searchterm;
    my $sti;
    my $word_np;
    
    my @tab_tmp;
    my $tmp_sp;
    my $spi=0;

    my $termsfound=0;
    my $stubs=0;

    my $skip=0;

    my @tab_start_term=();
    my @tab_end_term=();

    my $constituents;
    my $nb_constituents;

    my $min;
    my $max;

    my $btw_start;
    my $btw_end;
    my $token;
    my $sentence_cont;

    print STDERR "  Performing term extraction... \n";

    # Append this document's record to the YaTeA corpus file.  The file is
    # opened in append mode; the same path is handed to Lingua::YaTeA::Corpus
    # further down, once the last document has been reached.
    my $yatea_corpus_file = $h_config->{"TMPFILE"} . ".corpus.yatea.tmp";
    # Three-argument open, checked: a silent failure here would drop the
    # document from the corpus without any diagnostic.
    open(CORPUS, ">>", $yatea_corpus_file)
	or die "Cannot open $yatea_corpus_file for appending: $!\n";
    binmode(CORPUS, ":utf8");

    # Header line of the record: <id> TAB "DOCUMENT" TAB <id>
    print CORPUS $Alvis::NLPPlatform::Annotation::document_record_id . "\tDOCUMENT\t" . $Alvis::NLPPlatform::Annotation::document_record_id . "\n" ;

    # The body of the record is written by PrintOutputTreeTagger on the
    # same handle.
    &PrintOutputTreeTagger($h_config, $doc_hash, \*CORPUS);

    # Buffered write errors only surface at close time, so check it too.
    close(CORPUS)
	or die "Cannot close $yatea_corpus_file: $!\n";

#     if ((exists $h_config->{"XML_OUTPUT"}->{"YATEA"}) && ($h_config->{"XML_OUTPUT"}->{"YATEA"} == 1)) {
# 	%$doc_hash = ();
# 	%Alvis::NLPPlatform::hash_tokens = ();
# 	%Alvis::NLPPlatform::hash_words = ();
# 	%Alvis::NLPPlatform::hash_words_punct = ();
# 	%Alvis::NLPPlatform::hash_sentences = ();
# 	%Alvis::NLPPlatform::hash_postags = ();
# 	%Alvis::NLPPlatform::hash_named_entities = ();
# 	%Alvis::NLPPlatform::hash_lemmas = ();
	
# 	$Alvis::NLPPlatform::number_of_words = 0;
# 	$Alvis::NLPPlatform::number_of_sentences = 0;
# 	$Alvis::NLPPlatform::nb_relations = 0;
# 	$Alvis::NLPPlatform::dont_annotate = 0;
	
# 	@Alvis::NLPPlatform::word_start = ();
# 	@Alvis::NLPPlatform::word_end = ();
	
# 	@Alvis::NLPPlatform::en_start = ();
# 	@Alvis::NLPPlatform::en_end = ();
# 	@Alvis::NLPPlatform::en_type = ();
	
# 	@Alvis::NLPPlatform::en_tokens_start = ();
# 	@Alvis::NLPPlatform::en_tokens_end = ();
# 	%Alvis::NLPPlatform::en_tokens_hash = ();

#     }
    
    # Term extraction itself only runs once the last document has been
    # appended to the corpus; until then report success and stop here.
    return(1) if ($Alvis::NLPPlatform::last_doc == 0);

    # YaTeA is loaded at run time, only when it is actually needed.
    require Lingua::YaTeA::Corpus;
    require Lingua::YaTeA;

    # Build the YaTeA instance from the configuration file named in the
    # platform configuration.
    my %config_yatea = Lingua::YaTeA::load_config($h_config->{'NLP_tools'}->{'YATEARC'});
    my $yatea = Lingua::YaTeA->new($config_yatea{"OPTIONS"}, \%config_yatea);

    # Output goes to the configured YATEAOUTPUT location when there is one,
    # otherwise to the ALVISTMP location.
    my $yatea_output_path;
    if (defined $h_config->{'NLP_tools'}->{'YATEAOUTPUT'}) {
	$yatea_output_path = $h_config->{'NLP_tools'}->{'YATEAOUTPUT'};
	print STDERR "\nYaTeA output defined is " . $yatea_output_path . "\n\n";
    } else {
	$yatea_output_path = $h_config->{"ALVISTMP"};
	print STDERR "\nNo YaTeA output defined\n\n";
    }
    $yatea->getOptionSet->addOption("output-path", $yatea_output_path);

    # Corpus object over the file filled in by the term-extraction step.
    my $corpus_path = $h_config->{"TMPFILE"} . ".corpus.yatea.tmp";
    my $corpus = Lingua::YaTeA::Corpus->new($corpus_path,$yatea->getOptionSet,$yatea->getMessageSet);

    

########################################################################

lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN

	push @tab_word,$Alvis::NLPPlatform::hash_words{$word};
    }

    # Align the words+punctuation list with the words-only list:
    # $tab_mapping[$i] receives the index in @tab_word of the entry equal to
    # $tab_word_punct[$i]; positions without a match are left unset.
    my $idx_nopunct=1;
    for($i=0;$i<scalar @tab_word_punct;$i++){
	# all entries of @tab_word have been consumed already
	next if($idx_nopunct>=scalar @tab_word);
	# current entry is not the next expected word (presumably punctuation)
	next if($tab_word_punct[$i] ne $tab_word[$idx_nopunct]);
	$tab_mapping[$i]=$idx_nopunct;
	$idx_nopunct++;
    }
#     for($i=0;$i<scalar @tab_mapping;$i++){
# 	print STDERR "$i : " . $tab_mapping[$i] . "\n";
#     }

    # Replace the spaces inside named entities by underscores in the text
    # sent to the parser.
    my $ne;
    my $ne_cont;
    my $ne_mod;
    foreach $ne(keys %Alvis::NLPPlatform::hash_named_entities){
	$ne_cont=$Alvis::NLPPlatform::hash_named_entities{$ne};
	$ne_mod=$ne_cont;
	# entities without a space need no rewriting
	next unless($ne_cont=~/ /);
	# neither do entities that do not occur in the text
	next unless($sentences_cont=~/\Q$ne_cont\E/);
	$ne_mod=~tr/ /_/;
	$sentences_cont=~s/\Q$ne_cont\E/$ne_mod/g;
    }

    # Drop a dangling "<sentence>" opener left at the very end of the text.
    $sentences_cont=~s/<sentence>\n$//sgo;

    # Close the last sentence unless the text already ends with its closing
    # tag.  The pattern used to read /<\sentence>\n/, where "\s" is the
    # whitespace class rather than an escaped slash: it practically never
    # matched, so a second "</sentence>" was appended even when the text was
    # already closed.  The test is anchored at the end of the string so that
    # only the final sentence is considered.
    if ($sentences_cont !~ /<\/sentence>\n\z/) {
	# to remove after checking the wrapper (above)
	    $sentences_cont.="</sentence>\n";
    }

    $sentences_cont .= "\n\n</sentences>\n";

    # Setting options
    #
    # Every option is passed by wrapping the whole text in a
    # <command string="..."> ... </command> element; each new option wraps
    # the result of the previous one, so the last option listed ends up
    # outermost.
    #
    # Options that are currently disabled: "timeout=60" (it used to sit
    # between "graphics" and "postscript"), and "null", "ask", "walls",
    # "union" (after "postscript").
    foreach my $parser_option ("constituents=2", "graphics", "postscript"){
	$sentences_cont = "<command string=\"" . $parser_option . "\">\n" . $sentences_cont . "</command>";
    }

    # Write the text to the file read by the parser on its standard input.
    my $parser_input_file = $h_config->{"TMPFILE"} . ".corpus.tmp";
    # Three-argument open, checked: without the file the parser would be
    # run on missing or stale input without any diagnostic.
    open(CORPUS, ">", $parser_input_file)
	or die "Cannot open $parser_input_file for writing: $!\n";

    # No encoding layer is set on the handle: the text is encoded to UTF-8
    # explicitly before being printed.
    print CORPUS Encode::encode_utf8($sentences_cont);
#     print CORPUS $sentences_cont;
    # Buffered write errors only surface at close time, so check it too.
    close(CORPUS)
	or die "Cannot close $parser_input_file: $!\n";

    # Shell command lines of the parsing stages.  They are only assigned in
    # the non-French branch below.
    my $command_line;
    my $command_line2;
    my $command_line3;
    if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
	# French parser command line
	# NOTE(review): nothing is assigned in this branch, so the backtick
	# calls below run with undefined command strings when the language is
	# "FR" -- confirm whether French parsing is meant to be unsupported.
    }else{
	# Stage 1: the parser reads TMPFILE.corpus.tmp on standard input and
	# writes TMPFILE.result.tmp.1; its standard error is appended to the
	# platform log file.
	$command_line = $h_config->{'NLP_tools'}->{'SYNTACTIC_ANALYSIS_EN'} . " < " . $h_config->{'TMPFILE'} . ".corpus.tmp > " . $h_config->{'TMPFILE'} . ".result.tmp.1 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
	# Stage 2 (external clean-up tool, .result.tmp.1 -> .result.tmp.2):
	# the string is still built, but its execution is commented out below;
	# clean_bioLG() is called on the same two files instead.
 	$command_line2 = $h_config->{'NLP_tools'}->{'SYNTACTIC_ANALYSIS_EN_LP2LP_CLEAN'} . " < " . $h_config->{'TMPFILE'} . ".result.tmp.1 > " . $h_config->{'TMPFILE'} . ".result.tmp.2 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;

	# Stage 3: the LP2LP tool takes TMPFILE.result.tmp.2 as an argument and
	# writes TMPFILE.result.tmp, the file read back further down.
 	$command_line3 = $h_config->{'NLP_tools'}->{'SYNTACTIC_ANALYSIS_EN_LP2LP'} . " " . $h_config->{'TMPFILE'} . ".result.tmp.2 > " . $h_config->{'TMPFILE'} . ".result.tmp 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
    }
#        print STDERR "$command_line\n";
    # Run stage 1 through the shell; the captured output is discarded
    # because the command redirects it to a file.
    # NOTE(review): the exit status is not checked.
    `$command_line`;

	# In-process replacement for stage 2; receives the stage 1 output path
	# and the stage 2 output path (presumably input and output -- verify
	# against clean_bioLG).
	&clean_bioLG($h_config->{'TMPFILE'} . ".result.tmp.1", $h_config->{'TMPFILE'} . ".result.tmp.2");
#       print STDERR "$command_line2\n";
#     `$command_line2`;

#        print STDERR "$command_line3\n";
    # Run stage 3 through the shell (exit status not checked either).
    `$command_line3`;

#    print STDERR "\n$command_line\n$command_line2\n";
    # The intermediate files are removed unless ALVISDEBUG is true.
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'} . ".result.tmp.1";
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'} . ".result.tmp.2";

    # process syntactic analysis

    # Reset the reading state before going through the parser result:
    # sentence flag and counter, and the identifier given to relations.
    $insentence=0;
    $nsentence=0;
    $relation_id=1;

    # Last constituent line seen and number of constituent lines counted.
    $constituents="";
    $nb_constituents=0;

    # Result file written by the last parsing stage ($command_line3).
    # NOTE(review): the open is not checked.
    open SYN_RES, "<" . $h_config->{'TMPFILE'}. ".result.tmp";

    # Output file named after the document record id with the suffix
    # ".constituents"; no directory is prepended here.
    # NOTE(review): this open is not checked either.
    open CONSTITUENT_OUTPUT,">" . $Alvis::NLPPlatform::Annotation::document_record_id . ".constituents";

    while($line=<SYN_RES>)
    {
	if (index($line, "[Sentence") == 0) {
	    $parsedconstituent = 0;
	}
	if(index($line,"[(")==0){
	    $insentence=1;
            # XXX
	    $nsentence++;
	    $sentence="";
	    $tokens="";
	    $analyses="";
	    $left_wall=0;
	}
	if(index($line,"[S ")==0){
	    $constituents=$line;
#	    print STDERR "**** FOUND CONSTITUENTS SENDING DECAL $phrase_idx ****\n";
	    if ($parsedconstituent == 0) {
		$nb_constituents++;

( run in 1.138 second using v1.01-cache-2.11-cpan-600a1bdf6e4 )