Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform/UserNLPWrappers.pm  view on Meta::CPAN

    my @arr_tokens;
    my $last_token;
    my $wordidshift=0;

    my $phrase_idx=$Alvis::NLPPlatform::Annotation::phrase_idx;

    my $word;
    my $word_cont;
    my $word_id;
    my $i;
    my $sentences_cont="";

    my @tab_word_punct;
    my @tab_word;
    my $idx_tab_word_punct=1;
    my $idx_tab_word=1;
    my @tab_mapping;

    # print out words+punct and fill in a tab
    push @tab_word_punct," ";
    push @tab_word," ";
    my $decal=1;

    my $searchterm;
    my $sti;
    my $word_np;
    
    my @tab_tmp;
    my $tmp_sp;
    my $spi=0;

    my $termsfound=0;
    my $stubs=0;

    my $skip=0;

    my @tab_start_term=();
    my @tab_end_term=();

    my $constituents;
    my $nb_constituents;

    my $min;
    my $max;

    my $btw_start;
    my $btw_end;
    my $token;
    my $sentence_cont;

    print STDERR "  Performing term extraction... \n";
    open CORPUS, ">>" . $h_config->{"TMPFILE"} . ".corpus.yatea.tmp";
    binmode(CORPUS, ":utf8");

    print CORPUS $Alvis::NLPPlatform::Annotation::document_record_id . "\tDOCUMENT\t" . $Alvis::NLPPlatform::Annotation::document_record_id . "\n" ;

    &PrintOutputTreeTagger($h_config, $doc_hash, \*CORPUS);

    close CORPUS;

#     if ((exists $h_config->{"XML_OUTPUT"}->{"YATEA"}) && ($h_config->{"XML_OUTPUT"}->{"YATEA"} == 1)) {
# 	%$doc_hash = ();
# 	%Alvis::NLPPlatform::hash_tokens = ();
# 	%Alvis::NLPPlatform::hash_words = ();
# 	%Alvis::NLPPlatform::hash_words_punct = ();
# 	%Alvis::NLPPlatform::hash_sentences = ();
# 	%Alvis::NLPPlatform::hash_postags = ();
# 	%Alvis::NLPPlatform::hash_named_entities = ();
# 	%Alvis::NLPPlatform::hash_lemmas = ();
	
# 	$Alvis::NLPPlatform::number_of_words = 0;
# 	$Alvis::NLPPlatform::number_of_sentences = 0;
# 	$Alvis::NLPPlatform::nb_relations = 0;
# 	$Alvis::NLPPlatform::dont_annotate = 0;
	
# 	@Alvis::NLPPlatform::word_start = ();
# 	@Alvis::NLPPlatform::word_end = ();
	
# 	@Alvis::NLPPlatform::en_start = ();
# 	@Alvis::NLPPlatform::en_end = ();
# 	@Alvis::NLPPlatform::en_type = ();
	
# 	@Alvis::NLPPlatform::en_tokens_start = ();
# 	@Alvis::NLPPlatform::en_tokens_end = ();
# 	%Alvis::NLPPlatform::en_tokens_hash = ();

#     }
    
    if (    $Alvis::NLPPlatform::last_doc == 0) {
	return(1);
    }

    require Lingua::YaTeA::Corpus;
    require Lingua::YaTeA;
    my %config_yatea = Lingua::YaTeA::load_config($h_config->{'NLP_tools'}->{'YATEARC'});


    my $yatea = Lingua::YaTeA->new($config_yatea{"OPTIONS"}, \%config_yatea);

    if (defined $h_config->{'NLP_tools'}->{'YATEAOUTPUT'}) {
	print STDERR "\nYaTeA output defined is " . $h_config->{'NLP_tools'}->{'YATEAOUTPUT'} . "\n\n";
	$yatea->getOptionSet->addOption("output-path", $h_config->{'NLP_tools'}->{'YATEAOUTPUT'});
    } else {
	print STDERR "\nNo YaTeA output defined\n\n";
	$yatea->getOptionSet->addOption("output-path", $h_config->{"ALVISTMP"});
    }

    my $corpus_path = $h_config->{"TMPFILE"} . ".corpus.yatea.tmp";
    my $corpus = Lingua::YaTeA::Corpus->new($corpus_path,$yatea->getOptionSet,$yatea->getMessageSet);

    

########################################################################

    my $sentence_boundary = $yatea->getOptionSet->getSentenceBoundary;
    my $document_boundary = $yatea->getOptionSet->getDocumentBoundary;

#    $yatea->loadTestifiedTerms(\$process_counter,$corpus,$sentence_boundary,$document_boundary,$yatea->getOptionSet->MatchTypeValue,$yatea->getMessageSet,$yatea->getOptionSet->getDisplayLanguage);

    print STDERR $Lingua::YaTeA::process_counter++ . ") " . ($yatea->getMessageSet->getMessage('LOAD_CORPUS')->getContent($yatea->getOptionSet->getDisplayLanguage)) . "\n";

    $corpus->read($sentence_boundary,$document_boundary,$yatea->getFSSet,$yatea->getTestifiedTermSet,$yatea->getMessageSet,$yatea->getOptionSet->getDisplayLanguage);

    my $phrase_set = Lingua::YaTeA::PhraseSet->new;
    
    print STDERR $Lingua::YaTeA::process_counter++ . ") " . ($yatea->getMessageSet->getMessage('CHUNKING')->getContent($yatea->getOptionSet->getDisplayLanguage)) . "\n";
    $corpus->chunk($phrase_set,$sentence_boundary,$document_boundary,$yatea->getChunkingDataSet,$yatea->getFSSet,$yatea->getTagSet,$yatea->getParsingPatternSet,$yatea->getTestifiedTermSet,$yatea->getOptionSet);

    $phrase_set->sortUnparsed;
    
    print STDERR $Lingua::YaTeA::process_counter++ . ") " . ($yatea->getMessageSet->getMessage('PARSING')->getContent($yatea->getOptionSet->getDisplayLanguage)) . "\n";
    
    $phrase_set->parseProgressively($yatea->getTagSet,$yatea->getOptionSet->getParsingDirection,$yatea->getParsingPatternSet,$yatea->getChunkingDataSet,$corpus->getLexicon,$corpus->getSentenceSet,$yatea->getMessageSet,$yatea->getOptionSet->getDisplay...
    
    $phrase_set->addTermCandidates($yatea->getOptionSet);
    
    print STDERR $Lingua::YaTeA::process_counter++ . ") " . ($yatea->getMessageSet->getMessage('RESULTS')->getContent($yatea->getOptionSet->getDisplayLanguage)) . "\n";


# comments to keep
    print STDERR "\t-" . ($yatea->getMessageSet->getMessage('DISPLAY_RAW')->getContent($yatea->getOptionSet->getDisplayLanguage)) . "\'". $corpus->getOutputFileSet->getFile('debug')->getPath . "'\n";
    $phrase_set->printPhrases(FileHandle->new(">" . $corpus->getOutputFileSet->getFile('debug')->getPath));
    $phrase_set->printUnparsable($corpus->getOutputFileSet->getFile('unparsable'));


    print STDERR "\t-" . ($yatea->getMessageSet->getMessage('DISPLAY_TC_XML')->getContent($yatea->getOptionSet->getDisplayLanguage)) . "\'". $corpus->getOutputFileSet->getFile('candidates')->getPath . "'\n";
# 
    if ((exists $h_config->{"XML_OUTPUT"}->{"YATEA"}) && ($h_config->{"XML_OUTPUT"}->{"YATEA"} == 1)) {
	$phrase_set->printTermCandidatesXML("stdout",$yatea->getTagSet);
	exit;
    } else {
# 	$phrase_set->printTermCandidatesXML($corpus->getOutputFileSet->getFile("candidates"),$yatea->getTagSet);
	&storeTerms($phrase_set,$doc_hash,$yatea->getTagSet);
    }

########################################################################



########################################################################

#     }


########################################################################

    $Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'}. ".corpus.tmp";

#    print STDERR "done - Found $Alvis::NLPPlatform::nb_relations relations, $termsfound full terms, $nb_constituents constituents.\n";
}


sub storeTerms
{
    my ($phrase_set,$doc_hash,$tagset) = @_;

    my $fh = \*STDOUT;
    
    my $term_candidate;
    my $if;
    my $pos;
    my $lf;
    my $occurrence;
    my $island;
    my $position;

    my $sem_unit;
    my $term;
    my $term_id = 1;
    my $syn_relation_id = 1;

    my $phrase_idx = $Alvis::NLPPlatform::Annotation::phrase_idx;
    my $relation_id = $Alvis::NLPPlatform::Annotation::syntactic_relation_idx;

    my %YateaTerms2AlvisSemUnits;
    my %YateaTermOcc2AlvisSemUnits;

    my $syntactic_relation_id;

    my $term_refid_head;
    my $term_refid_modifier;
    my $refid_head;
    my $refid_modifier;

    my $i_sem;


	    $sem_unit=$Alvis::NLPPlatform::last_semantic_unit + 1;

lib/Alvis/NLPPlatform/UserNLPWrappers.pm  view on Meta::CPAN


    $sentences_cont = "<command string=\"constituents=2\">\n" . $sentences_cont;
    $sentences_cont .= "</command>";
    $sentences_cont = "<command string=\"graphics\">\n" . $sentences_cont;
    $sentences_cont .= "</command>";
#    $sentences_cont = "<command string=\"timeout=60\">\n" . $sentences_cont;
#     $sentences_cont .= "</command>";
    $sentences_cont = "<command string=\"postscript\">\n" . $sentences_cont;
    $sentences_cont .= "</command>";
#      $sentences_cont = "<command string=\"null\">\n" . $sentences_cont;
#      $sentences_cont .= "</command>";
#     $sentences_cont = "<command string=\"ask\">\n" . $sentences_cont;
#     $sentences_cont .= "</command>";
#     $sentences_cont = "<command string=\"walls\">\n" . $sentences_cont;
#     $sentences_cont .= "</command>";
#     $sentences_cont = "<command string=\"union\">\n" . $sentences_cont;
#     $sentences_cont .= "</command>";

    open CORPUS, ">" . $h_config->{"TMPFILE"} . ".corpus.tmp";

    print CORPUS Encode::encode_utf8($sentences_cont);
#     print CORPUS $sentences_cont;
    close CORPUS;

    my $command_line;
    my $command_line2;
    my $command_line3;
    if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
	# French parser command line
    }else{
	$command_line = $h_config->{'NLP_tools'}->{'SYNTACTIC_ANALYSIS_EN'} . " < " . $h_config->{'TMPFILE'} . ".corpus.tmp > " . $h_config->{'TMPFILE'} . ".result.tmp.1 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
 	$command_line2 = $h_config->{'NLP_tools'}->{'SYNTACTIC_ANALYSIS_EN_LP2LP_CLEAN'} . " < " . $h_config->{'TMPFILE'} . ".result.tmp.1 > " . $h_config->{'TMPFILE'} . ".result.tmp.2 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;

 	$command_line3 = $h_config->{'NLP_tools'}->{'SYNTACTIC_ANALYSIS_EN_LP2LP'} . " " . $h_config->{'TMPFILE'} . ".result.tmp.2 > " . $h_config->{'TMPFILE'} . ".result.tmp 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
    }
#        print STDERR "$command_line\n";
    `$command_line`;

	&clean_bioLG($h_config->{'TMPFILE'} . ".result.tmp.1", $h_config->{'TMPFILE'} . ".result.tmp.2");
#       print STDERR "$command_line2\n";
#     `$command_line2`;

#        print STDERR "$command_line3\n";
    `$command_line3`;

#    print STDERR "\n$command_line\n$command_line2\n";
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'} . ".result.tmp.1";
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'} . ".result.tmp.2";

    # process syntactic analysis

    $insentence=0;
    $nsentence=0;
    $relation_id=1;

    $constituents="";
    $nb_constituents=0;

    open SYN_RES, "<" . $h_config->{'TMPFILE'}. ".result.tmp";

    open CONSTITUENT_OUTPUT,">" . $Alvis::NLPPlatform::Annotation::document_record_id . ".constituents";

    while($line=<SYN_RES>)
    {
	if (index($line, "[Sentence") == 0) {
	    $parsedconstituent = 0;
	}
	if(index($line,"[(")==0){
	    $insentence=1;
            # XXX
	    $nsentence++;
	    $sentence="";
	    $tokens="";
	    $analyses="";
	    $left_wall=0;
	}
	if(index($line,"[S ")==0){
	    $constituents=$line;
#	    print STDERR "**** FOUND CONSTITUENTS SENDING DECAL $phrase_idx ****\n";
	    if ($parsedconstituent == 0) {
		$nb_constituents++;
		$phrase_idx=parse_constituents($constituents,$phrase_idx,$doc_hash);
	    }
	    $parsedconstituent = 1;
	    $constituents=~s/\[([A-Z]+) /($1 /sgo;
# 	    $constituents=~s/\[([A-Z]+) /<constituent>$1 /sgo;
	    $constituents=~s/[A-Z]+\]/)/sgo;
# 	    $constituents=~s/[A-Z]+\]/<\/constituent>/sgo;
# 	    print CONSTITUENT_OUTPUT $Alvis::NLPPlatform::Annotation::document_record_id . "\t";
	    print CONSTITUENT_OUTPUT "$constituents\n";

#	    print STDERR "**** RECUP $phrase_idx ****\n";
	}
	if($insentence==1){
	    $sentence.=$line;
	}
# 	if(index($line,"diagram")==0){
	if(index($line,"[]")==0){
	    # process the line
	    $sentence=~s/\[Sentence\s+[0-9]+\]//sgo;
	    $sentence=~s/\[Linkage\s+[0-9]+\]//sgo;
	    $sentence=~s/\[\]//sgo;
	    $sentence=~s/\n//sgo;
# 	    $sentence=~s/\[[0-9\s]*\]diagram$//g;
	    if ($sentence=~m/^(.+)\[\[/) {
		$tokens=$1;
	#	print STDERR "\n\n--> $sentence\n\n";
		$analyses = $';
            # '
		# output
		# search left-wall to shift identifiers
		if($tokens =~ /LEFT\-WALL/so){
		    $left_wall=1;
		}else{
		    $left_wall=0;
		}
		
		# search right-wall, simply to ignore it
		if($tokens =~ /RIGHT\-WALL/so){
		    $right_wall=1;
		}else{
		    $right_wall=0;
		}

		# parse tokens
		@arr_tokens=split /\)\(/,$tokens;
		$last_token=(scalar @arr_tokens)-1;
		$arr_tokens[0]=~s/^\[\(//sgo;
		$arr_tokens[$last_token]=~s/\)\]$//sgo;

#	    my $tmpfdsf;
# 	    for($tmpfdsf=0;$tmpfdsf<=$last_token;$tmpfdsf++){
# 		#print STDERR "******\$\$\$\$\$\$****** ($tmpfdsf) $arr_tokens[$tmpfdsf]\n";
# 	    }

		# Parsing
		my $valid_analysis;
		while($analyses=~/(\[[0-9]+\s[0-9]+\s[0-9]+\s[^\]]+\])/sgoc){
		    my $kref=0;
		    $analysis=$1;
		    if($analysis=~m/\[([0-9]+)\s([0-9]+)\s([0-9]+)\s\(([^\]]+)\)\]/sgo){ # m??
			$valid_analysis=1;
		    }else{
			$valid_analysis=0;
		    }
		    $token_start=$1;
		    $token_end=$2;
		    $relation=$4;
		    if(
		       (($left_wall==1)&&(($token_start==0) || ($token_end==0)))

lib/Alvis/NLPPlatform/UserNLPWrappers.pm  view on Meta::CPAN

				    }

				    # is modifier within term?
				    if(
				    ($tab_mapping[$tmp2]>=$tab_start_term[$lft] &&
					$tab_mapping[$tmp2]<=$tab_end_term[$lft])
					){
					$tmp2_within=1;
				    }

				    # rules set here:
				    # relation between two words in a term: W-W relation
				    # relation between two words outside of a term: W-W relation
				    # relation between a word in a term and another word outside this term: W-P relation
				    if(($tmp1_within+$tmp2_within)==1){
					# one of them is in, the other is out
#					print STDERR "\n";
					# find term id
					$kref=$Alvis::NLPPlatform::found_terms_tidx[$lft];
					$kref++; # it's always >0
					last;
				    }
				}
				$doc_hash->{"syntactic_relation$relation_id"}={};
				$doc_hash->{"syntactic_relation$relation_id"}->{'id'}="syntactic_relation$relation_id";
				$doc_hash->{"syntactic_relation$relation_id"}->{'datatype'}="syntactic_relation";
				$doc_hash->{"syntactic_relation$relation_id"}->{'syntactic_relation_type'}="$relation";
				$doc_hash->{"syntactic_relation$relation_id"}->{'refid_head'} = {};
				$doc_hash->{"syntactic_relation$relation_id"}->{'refid_head'}->{'datatype'}="refid_head";
				if(($kref>0)&&($tmp1_within==1)&&($Alvis::NLPPlatform::found_terms_phr[($kref-1)]!=-666)){
				    $doc_hash->{"syntactic_relation$relation_id"}->{'refid_head'}->{"refid_phrase"}="phrase".$Alvis::NLPPlatform::found_terms_phr[($kref-1)];
#				    print STDERR "\n\nSize: ".scalar @Alvis::NLPPlatform::found_terms_phr."\n";
#				    print STDERR "Index: $kref\n\n";
				}else{
				    $doc_hash->{"syntactic_relation$relation_id"}->{'refid_head'}->{"refid_word"}="word".$tab_mapping[($token_start+$wordidshift)];
				}
# 				$doc_hash->{"syntactic_relation$relation_id"}->{'refid_head'}="word".$tab_mapping[($token_start+$wordidshift)];
				$doc_hash->{"syntactic_relation$relation_id"}->{'refid_modifier'} = {};
				$doc_hash->{"syntactic_relation$relation_id"}->{'refid_modifier'}->{'datatype'}="refid_modifier";
				if(($kref>0)&&($tmp2_within==1)&&($Alvis::NLPPlatform::found_terms_phr[($kref-1)]!=-666)){
				    $doc_hash->{"syntactic_relation$relation_id"}->{'refid_modifier'}->{"refid_phrase"}="phrase".$Alvis::NLPPlatform::found_terms_phr[($kref-1)];
#				    print STDERR "\n\nIndex: $kref\n\n";
				}else{
				    $doc_hash->{"syntactic_relation$relation_id"}->{'refid_modifier'}->{"refid_word"}="word".$tab_mapping[($token_end+$wordidshift)];
				}
# 				$doc_hash->{"syntactic_relation$relation_id"}->{'refid_modifier'}="word".$tab_mapping[($token_end+$wordidshift)];
				
				$relation_id++;
			    }
			}
		    }
		}
		
		# trash everything and continue the loop

		$insentence=0;
		$wordidshift+=$last_token-1;
	    }
	}
    }
    close CONSTITUENT_OUTPUT;
    close SYN_RES;

#    print STDERR $h_config->{'TMPFILE'}. ".corpus.tmp" . "\n";
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'}. ".corpus.tmp";
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'} . ".result.tmp";

    $Alvis::NLPPlatform::nb_relations=$relation_id-1;
    $Alvis::NLPPlatform::Annotation::phrase_idx=$phrase_idx;

    print STDERR "done - Found $Alvis::NLPPlatform::nb_relations relations, $termsfound full terms, $nb_constituents constituents.\n";
    push @{$doc_hash->{"log_processing1"}->{"comments"}},  "Found Syntactic Relations : " . $Alvis::NLPPlatform::nb_relations;
}


sub clean_bioLG {

    my ($infile, $outfile) = @_;


    my $line = "";
    my $sentence_counter = 0;
    my $linkage_counter = 0;
    
    my @linkage_output;
    
#     my $line_prec = "";

    open INFILE, $infile or die "No such file $infile\n";
    binmode INFILE;
    open OUTFILE, ">$outfile" or die "No such file $outfile\n";
 
    # puts the text on only one line
    do {
	# First skip the echoed input, up to the next line starting with "++++Time"
	while((defined  ($line = <INFILE>)) && ($line !~ /^\+\+\+\+Time/o)) {
# 	print $line;
# 	    $line_prec = $line;
	};

	if ((defined $line) && ($line =~ /^\+\+\+\+Time/o)) {
	    $linkage_counter = 0;
	    @linkage_output = ();
	    do {
		# Skip the postscript output until the constituent part (a line starting with "[") is found
		while((defined ($line = <INFILE>)) && ($line !~ /^\[/o)) {
		    # nothing 
		}
		# Accumulate the output for this linkage until the next postscript part (the "diagram" line)
		$linkage_output[$linkage_counter] = $line;
# 	    print $line;
		while((defined ($line = <INFILE>)) && ($line ne "diagram\n")) {
# 		    print STDERR "=> $line\n";
		    $linkage_output[$linkage_counter] .= $line;
# 		print $line;
		}
		# Skip the next postscript part, up to the "%%EndDocument" line
		while ((defined ($line =<INFILE>)) && ($line ne "%%EndDocument\n")) {
		    # nothing 
		}
		$line = <INFILE>;



( run in 1.582 second using v1.01-cache-2.11-cpan-39bf76dae61 )