PUT results from the CPAN

Alvis-NLPPlatform

view release on metacpan or search on metacpan

- remove any use of xmlfoot and xmlhead

- managing several documents in the client/server mode

- integrate features defined in XML_INPUT and XML_OUTPUT

- complete the modification of the xml loader (above all managing
  terms like EN)

- modify the split_to_docRecs function in NLPPlatform to take into
  account

 - integrate properly the semantic tagging

 - checking and cleaning the bioLG wrapper

bin/alvis-nlp-standalone view on Meta::CPAN



# Load the whole input document into $doc_xml: read STDIN when no
# command-line argument is given, otherwise read the file named by the
# last command-line argument.
if ($#ARGV == -1) {
    while ($line = <STDIN>) {
        $doc_xml .= $line;
    }
} else {
    my $input_path = $ARGV[$#ARGV];

    # Echo the name of the file about to be read.
    warn $input_path . "\n";

    if ( -f $input_path ) {
        # Three-argument open with a lexical handle: the file name can
        # never be interpreted as an open mode or a pipe (e.g. a leading
        # '>' or a trailing '|'), and the handle cannot clash with a
        # bareword handle used elsewhere.
        open my $input_fh, '<', $input_path
            or die "Cannot open $input_path: $!\n";
        while ($line = <$input_fh>) {
            $doc_xml .= $line;
        }
        close $input_fh;
    } else {
        die "No such file or directory\n";
    }
}

# my @tab_docs_xml = Alvis::NLPPlatform::split_to_docRecs($doc_xml);

# Alvis::NLPPlatform::Annotation::print_documentCollectionHeader(\*STDOUT);

my $i;

etc/alvis-nlpplatform/nlpplatform-test.rc view on Meta::CPAN

	SPOOLDIR	= $HOME/tmp/spool
	OUTDIR		= $HOME/tmp/Outdir
</alvis_connection>

<NLP_connection>
	SERVER			= localhost
	PORT			= 1510
	RETRY_CONNECTION	= 10
</NLP_connection>

<XML_INPUT>
    PRESERVEWHITESPACE = 0
    LINGUISTIC_ANNOTATION_LOADING = 0
</XML_INPUT>


<XML_OUTPUT>
    FORM = 1
    ID = 1
    TOKEN_LEVEL = 1
    SEMANTIC_UNIT_NAMED_ENTITY_LEVEL	= 1
    WORD_LEVEL	= 1
    SENTENCE_LEVEL = 1
    MORPHOSYNTACTIC_FEATURE_LEVEL	= 1
    LEMMA_LEVEL	= 1
    SEMANTIC_UNIT_TERM_LEVEL	= 1
    SEMANTIC_UNIT_LEVEL	= 1
    SYNTACTIC_RELATION_LEVEL	= 1

    NO_STD_XML_OUTPUT = 0

    YATEA = 1
</XML_OUTPUT>

<linguistic_annotation>
	ENABLE_TOKEN	= 1
	ENABLE_NER	= 0
	ENABLE_WORD	= 0
	ENABLE_SENTENCE = 0
	ENABLE_POS	= 0
	ENABLE_LEMMA	= 0
	ENABLE_TERM_TAG	= 0
	ENABLE_SYNTAX	= 0

etc/alvis-nlpplatform/nlpplatform-test.rc view on Meta::CPAN

	POSTAG_FR		= "$NLP_tools_root/TreeTagger/bin/tree-tagger $NLP_tools_root/TreeTagger/lib/french.par -token -lemma -sgml -no-unknown"
	# SYNTACTIC_PATH_EN	= "$NLP_tools_root/link-4.1b"
	SYNTACTIC_PATH_EN       = "$NLP_tools_root/biolgForAlvis/biolg-1.1.7b"
	SYNTACTIC_PATH_FR	= ""
	# SYNTACTIC_ANALYSIS_EN	= "cd $SYNTACTIC_PATH_EN ; $SYNTACTIC_PATH_EN/parse"
	SYNTACTIC_ANALYSIS_EN	= "cd $SYNTACTIC_PATH_EN ; $SYNTACTIC_PATH_EN/parse -xmlin"
	SYNTACTIC_ANALYSIS_FR	= ""
	TERM_TAG_FR		= ""
	TERM_TAG_EN		= ""
        YATEARC                 = "/etc/yatea/yatea-devTH.rc"
        YATEAOUTPUT             = "$ALVISTMP"
</NLP_tools>

<CONVERTERS>
text/plain = text2xhtml  <

text/html = 

www/unknown = 
#text/plain ; ; GuessText

etc/alvis-nlpplatform/nlpplatform.rc view on Meta::CPAN

	SPOOLDIR	= $HOME/tmp/spool
	OUTDIR		= $HOME/tmp/Outdir
</alvis_connection>

<NLP_connection>
	SERVER			= localhost
	PORT			= 1510
	RETRY_CONNECTION	= 10
</NLP_connection>

<XML_INPUT>
    PRESERVEWHITESPACE = 0
    LINGUISTIC_ANNOTATION_LOADING = 0
</XML_INPUT>


<XML_OUTPUT>
    FORM = 1
    ID = 1
    TOKEN_LEVEL = 1
    SEMANTIC_UNIT_NAMED_ENTITY_LEVEL	= 1
    WORD_LEVEL	= 1
    SENTENCE_LEVEL = 1
    MORPHOSYNTACTIC_FEATURE_LEVEL	= 1
    LEMMA_LEVEL	= 1
    SEMANTIC_UNIT_TERM_LEVEL	= 1
    SEMANTIC_UNIT_LEVEL	= 1
    SYNTACTIC_RELATION_LEVEL	= 1

    NO_STD_XML_OUTPUT = 0

    YATEA = 1
</XML_OUTPUT>

<linguistic_annotation>
	ENABLE_TOKEN	= 1
	ENABLE_NER	= 1
	ENABLE_WORD	= 1
	ENABLE_SENTENCE = 1
	ENABLE_POS	= 1
	ENABLE_LEMMA	= 1
	ENABLE_TERM_TAG	= 1
	ENABLE_SYNTAX	= 0

etc/alvis-nlpplatform/nlpplatform.rc view on Meta::CPAN

#         SYNTACTIC_ANALYSIS_EN_LP2LP_CLEAN       = "cd $SYNTACTIC_PATH_EN_SUPPL ; $SYNTACTIC_PATH_EN_SUPPL/clean_BioLG_output_for_lp2lp.pl"
#         SYNTACTIC_ANALYSIS_EN_LP2LP     = "cd $SYNTACTIC_PATH_EN_SUPPL ; $SYNTACTIC_PATH_EN_SUPPL/lp2lp/lp2lp -r $SYNTACTIC_PATH_EN_SUPPL/lp2lp/test/lp2lp.conf"

	# SYNTACTIC_ANALYSIS_EN	= "cd $SYNTACTIC_PATH_EN ; $SYNTACTIC_PATH_EN/parse -xmlin"
	SYNTACTIC_ANALYSIS_FR	= ""
	TERM_TAG_FR		= ""
	TERM_TAG_EN		= ""
        SEMTAG_EN               = "$SEMTAG_EN_DIR/src/AlvisSemTag -c $CANONICAL_DICT -p $PARENT_DICT -o $ONTOLOGY"
        SEMTAG_FR               = ""
        YATEARC                 = "/etc/yatea/yatea.rc"
        YATEAOUTPUT             = "$ALVISTMP"
</NLP_tools>

<CONVERTERS>
text/plain = text2xhtml  <

text/html = 

www/unknown = 
#text/plain ; ; GuessText

examples/InputDocument.xml view on Meta::CPAN

            <section title="token_id_just_before_last_of_list_refid_token()">
              <section>token_id_just_before_last_of_list_refid_token()</section> token_id_just_before_last_of_list_refid_token($list_refid_token, $token_to_search); 
              <section>The method returns 1 if the token $token_to_search is just before the first token of the list $list_refid_token , 0 else.</section></section>
            <section title="unparseable_id()">
              <section>unparseable_id()</section> unparseable_id($id) 
              <section>The method checks if the id have been parsed or not. If not, it prints a warning.</section></section></section>
          <section title="PLATFORM CONFIGURATION">
            <section>PLATFORM CONFIGURATION</section> 
            <section>The configuration file of the NLP Platform is composed of global variables and divided into several sections:</section>  
            <section>Global variables. 
              <section>The two mandatory variables are ALVISTMP and PRESERVEWHITESPACE (in the XML_INPUT section).</section>  
              <section>
                <section>ALVISTMP : it defines the temporary directory used during the annotation process. The files are recorded in (XML files and input/output of the NLP tools) during the annotation step. It must be writable to the user the process...
              <section>
                <section>DEBUG : this variable indicates if the NLP platform is run in a debug mode or not. The value are 1 (debug mode) or 0 (no debug mode). Default value is 0. The main consequence of the debug mode is to keep the temporary file.</...
              <section>Additional variables and environement variables can be used if they are interpolated in the configuration file. For instance, in the default configuration file, we add</section>  
              <section>
                <section>PLATFORM_ROOT : directory where are installed NLP tools and resources.</section></section> 
              <section>
                <section>NLP_tools_root : root directory where are installed the NLP tools</section></section> 
              <section>

examples/InputDocument.xml view on Meta::CPAN

              <section>
                <section>OUTDIR : the directory where are stored the annotated documents if SAVE_IN_OUTDIR (in Section NLP_misc ) is set.</section> 
                <section>It must be writable to the user the process is running as.</section></section></section> 
            <section>Section NLP_connection  
              <section>
                <section>SERVER : The host name where the NLP server is running, for the connections with the NLP clients.</section></section> 
              <section>
                <section>PORT : The listening port of the NLP server, for the connections with the NLP clients.</section></section> 
              <section>
                <section>RETRY_CONNECTION : The number of times that the clients attempts to connect to the server.</section></section></section> 
            <section>XML_INPUT  
              <section>
                <section>PRESERVEWHITESPACE is a boolean indicating if the linguistic annotation will be done by preserving white space or not, i.e. XML blank nodes and white space at the beginning and the end of any line, but also indentation of the...
                <section>Default value is 0 or false (blank nodes and indentation characters are removed).</section></section> 
              <section>
                <section>LINGUISTIC_ANNOTATION_LOADING : The linguistic annotations already existing in the input documents are loaded or not. Default value is 1 or true (linguistic annotations are loaded).</section></section></section> 
            <section>
              <section>XML_OUTPUT (Not available yet)</section>   
              <section>
                <section>FORM</section></section>  
              <section>
                <section>ID</section></section></section> 
            <section>Section linguistic_annotation 
              <section>the section defines the NLP steps that will be used for annotating documents. The values are 0 or 1 .</section>  
              <section>
                <section>ENABLE_TOKEN : toggles the tokenization step.</section></section> 
              <section>
                <section>ENABLE_NER : toggles the named entity recognition step.</section></section>

examples/nlpplatform-test.rc view on Meta::CPAN

	SPOOLDIR	= $HOME/tmp/spool
	OUTDIR		= $HOME/tmp/Outdir
</alvis_connection>

<NLP_connection>
	SERVER			= localhost
	PORT			= 1510
	RETRY_CONNECTION	= 10
</NLP_connection>

<XML_INPUT>
    PRESERVEWHITESPACE = 0
    LINGUISTIC_ANNOTATION_LOADING = 0
</XML_INPUT>


<XML_OUTPUT>
    FORM = 1
    ID = 1
    TOKEN_LEVEL = 1
    SEMANTIC_UNIT_NAMED_ENTITY_LEVEL	= 1
    WORD_LEVEL	= 1
    SENTENCE_LEVEL = 1
    MORPHOSYNTACTIC_FEATURE_LEVEL	= 1
    LEMMA_LEVEL	= 1
    SEMANTIC_UNIT_TERM_LEVEL	= 1
    SEMANTIC_UNIT_LEVEL	= 1
    SYNTACTIC_RELATION_LEVEL	= 1

    NO_STD_XML_OUTPUT = 0

    YATEA = 1
</XML_OUTPUT>

<linguistic_annotation>
	ENABLE_TOKEN	= 1
	ENABLE_NER	= 0
	ENABLE_WORD	= 0
	ENABLE_SENTENCE = 0
	ENABLE_POS	= 0
	ENABLE_LEMMA	= 0
	ENABLE_TERM_TAG	= 0
	ENABLE_SYNTAX	= 0

lib/Alvis/NLPPlatform.pm view on Meta::CPAN

	close(FILETMP_OUT);

	open FILETMP_OUT, "$tmpfile" or die "No such file or directory\n";
	@cur_doc = <FILETMP_OUT>;
	$j = 0;
	while(($j< scalar @cur_doc) && ($cur_doc[$j] !~ s/\@RENDER_TIME_NOT_SET\@/$render_time/)) {
	    $j++;
	}
	close(FILETMP_OUT);

        if (!((exists $config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"}) && ($config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"} == 1))) {
	    if (scalar(@records) > 1) {
		if ($i == 0){
		    pop @cur_doc;
		} else {
		    shift @cur_doc;
		    shift @cur_doc;
		}
	    }
#	    push @doc_collection_out, @cur_doc;
		print @cur_doc;

lib/Alvis/NLPPlatform.pm view on Meta::CPAN

				     "PORT" => "NLP Server port",
				     "RETRY_CONNECTION" => "Number of time for retrying the connection",
	);
	foreach $var (keys %nlp_connection_vars) {
	    if (defined $config->{"NLP_connection"}->{$var}) { 
		print STDERR "\t" . $nlp_connection_vars{$var} . " : " . $config->{"NLP_connection"}->{$var} . "\n";
	    }
	}
    }

    if (defined $config->{"XML_INPUT"}) {
	print STDERR "  Section Configuration of the XML INPUT\n";

	my %xml_input_vars = ("PRESERVEWHITESPACE" => "Preserve XML White space?",
				     "LINGUISTIC_ANNOTATION_LOADING" => "Loading previous linguistic annotation?",
	);
	foreach $var (keys %xml_input_vars) {
	    if (defined $config->{"XML_INPUT"}->{$var}) { 
		print STDERR "\t" . $xml_input_vars{$var} . " : " . $config->{"XML_INPUT"}->{$var} . "\n";
	    }
	}
    }

    if (defined $config->{"XML_OUTPUT"}) {
	print STDERR "  Section Configuration of the XML OUTPUT\n";

	my %xml_output_vars = ("NO_STD_XML_OUTPUT" => "No printing standard XML output?",
	);
	foreach $var (keys %xml_output_vars) {
	    if (defined $config->{"XML_OUTPUT"}->{$var}) { 
		print STDERR "\t" . $xml_output_vars{$var} . " : " . $config->{"XML_OUTPUT"}->{$var} . "\n";
	    }
	}
    }

    &compute_dependencies($config);

    if (defined $config->{"NLP_misc"}) {
	print STDERR "  Section Miscellaneous NLP configuration features\n";

	my %NLP_misc_vars = ("NLP_resources" => "NLP resource directory",

lib/Alvis/NLPPlatform.pm view on Meta::CPAN

	);
	foreach $var (keys %NLP_tools_vars) {
	    if (defined $config->{"NLP_tools"}->{$var}) { 
		print STDERR "\t" . $NLP_tools_vars{$var} . " : " . $config->{"NLP_tools"}->{$var} . "\n";
	    }
	}
    }


    if (defined $config->{"CONVERTERS"}) {
	print STDERR "  Section INPUT CONVERTERS\n";

	my %Converter_vars = ("SupplMagicFile" => "File for Additional Definition of Magic Number",
	);
	foreach $var (keys %Converter_vars) {
	    if (defined $config->{"CONVERTERS"}->{$var}) { 
		print STDERR "\t" . $Converter_vars{$var} . " : " . $config->{"CONVERTERS"}->{$var} . "\n";
	    }
	}
	print STDERR "\tRecognized formats:\n";
	$Converter_vars{"STYLESHEET"} = 1;

lib/Alvis/NLPPlatform.pm view on Meta::CPAN

=head1 PLATFORM CONFIGURATION

The configuration file of the NLP Platform is composed of global
variables and divided into several sections:

=over 

=item * Global variables.

The two mandatory variables are C<ALVISTMP> and C<PRESERVEWHITESPACE>
 (in the XML_INPUT section). 


=over 8

=item * 

C<ALVISTMP> : it defines the temporary directory used during the
 annotation process. Files (XML files and the input/output of the NLP
 tools) are recorded in it during the annotation step.  It must
 be writable by the user the process is running as.

lib/Alvis/NLPPlatform.pm view on Meta::CPAN

C<PORT>: The listening port of the NLP server, for the
connections with the NLP clients.

=item * 

C<RETRY_CONNECTION>: The number of times that
the clients attempt to connect to the server.

=back

=item * C<XML_INPUT>

=over 8

=item *

C<PRESERVEWHITESPACE> is a boolean indicating if the linguistic
 annotation will be done by preserving white space or not, i.e. XML
 blank nodes and white space at the beginning and the end of any line,
 but also indentation of the text in the canonicalDocument

lib/Alvis/NLPPlatform.pm view on Meta::CPAN


C<LINGUISTIC_ANNOTATION_LOADING>: The linguistic annotations already
existing in the input documents are loaded or not. Default value is
C<1> or true (linguistic annotations are loaded).

=back


=item * 

C<XML_OUTPUT> (Not available yet)

=over 8

=item *

C<NO_STD_XML_OUTPUT>: The standard XML output is not printed. Default
value is false.

=item *

FORM

=item *

ID

lib/Alvis/NLPPlatform/Annotation.pm view on Meta::CPAN

    $enter=~s/ encoding *= *\"([^\"]*)\"/ encoding=\"UTF-8\"/;
    if($enter=~/(<\?xml version="[0-9\.]+")(.*?)([ \s\t]*<documentRecord)/sgo){
	$header=$1.$2;
    }else{
	$enter=$header.$enter;
    }
    $acquisitionData=~/<url>([^<]+)<\/url>/g;
    $documenturl=$1;

    my $string_parse;
    if ((!exists $h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"}) || ($h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"} != 0)) {
	warn "  Loading existing linguistic annotations if necessary\n";
	$parser->parse(Source=>{String=>$enter});
	
	
	# Caveat !!! we assume that there are only named entities in the loaded documents
	$Alvis::NLPPlatform::last_semantic_unit = $myreceiver->{"counter_id"};
	
    
	
    }

lib/Alvis/NLPPlatform/Document.pm view on Meta::CPAN

    {
	$doc=$Parser->parse_file($xmlalvisfile);
    };
    if (!$@)
    {
	if ($doc)
	{
	    my $xmlalvisdata = &get_language($doc);


	    open OUTPUT_FILE, ">$outfile";
	    binmode(OUTPUT_FILE, ":utf8");
	    print OUTPUT_FILE "$xmlalvisdata\n";
	    close(OUTPUT_FILE);
	    return($outfile);
	}
	else
	{
	    warn "Parsing the doc failed.\n";
	}
    } else {
	warn "Parsing the doc failed.\n";
	print STDERR $@;
    }

lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN

    $last_char=0;
    $length=0;
    $string="";
    $token_id=1;


    
    print STDERR "  Tokenizing...           ";
    
    $canonical = $Alvis::NLPPlatform::Annotation::canonicalDocument;
    $canonical = Alvis::NLPPlatform::Canonical::CleanUp($canonical, $h_config->{"XML_INPUT"}->{"PRESERVEWHITESPACE"});

    @lines=split /\n/,$canonical;
#     map {$_ .= "\n"} @lines;

    foreach $line(@lines)
    {
	$line .= "\n";
	# convert SGML into characters
	
	# character splitting

lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN


    open REN,"<$result_filename"  or warn "Can't open the file $result_filename";
    binmode REN;
    while($line=<REN>){
	($NE_type, $NE_start, $NE_end) = split /\t/, $line;
# 	$line=~m/(.+)\s+([0-9]+)\s+([0-9]+)/;
# 	$NE_type = $1;
# 	$NE_start = $2;
# 	$NE_end = $3;
	push @Alvis::NLPPlatform::en_type,$NE_type;
	if ((exists($h_config->{'XML_INPUT'}->{"PRESERVEWHITESPACE"})) && ($h_config->{'XML_INPUT'}->{"PRESERVEWHITESPACE"})) {
	    push @Alvis::NLPPlatform::en_start,($NE_start-1);
	    push @Alvis::NLPPlatform::en_end,($NE_end-1);
	} else {
	    push @Alvis::NLPPlatform::en_start,$NE_start;
	    push @Alvis::NLPPlatform::en_end,$NE_end;
	}
    }
    close REN;

    $Alvis::NLPPlatform::ALVISDEBUG || unlink $result_filename;

lib/Alvis/NLPPlatform/ParseConstituents.pm view on Meta::CPAN

    $word_id_np_ref = $parser->YYData->{WORD_ID_NP_REF} or  return('',undef);

    $tab_type_ref = $parser->YYData->{TAB_TYPE_REF};
    $tab_string_ref = $parser->YYData->{TAB_STRING_REF};

    # $lconst = $parser->YYData->{LCONST_REF};
    $nconst = $parser->YYData->{NCONST_REF};

    $word_count=$$word_id_np_ref;

#     $parser->YYData->{INPUT}
#     or $parser->YYData->{INPUT} = "[PP of [NP two transcription factors factors NP] PP]\n"
#     or  return('',undef);
#      $parser->YYData->{INPUT} = $parser->YYData->{CONSTITUENT_STRING};
#  or  return('',undef);

#     chomp $parser->YYData->{INPUT};
#     chop $parser->YYData->{INPUT};
#     print STDERR $parser->YYData->{INPUT};
#     print STDERR ";;\n";

#     print STDERR "==>";
#     print STDERR $parser->YYData->{CONSTITUENT_STRING};
#     print STDERR "\n";

#      print STDERR "$lconst : $$nconst\n";

    $parser->YYData->{CONSTITUENT_STRING}=~s/^[ \t]*#.*//;
    $parser->YYData->{CONSTITUENT_STRING}=~s/^[ \t]*//;

lib/Alvis/NLPPlatform/ParseConstituents.yp view on Meta::CPAN

    $word_id_np_ref = $parser->YYData->{WORD_ID_NP_REF} or  return('',undef);

    $tab_type_ref = $parser->YYData->{TAB_TYPE_REF};
    $tab_string_ref = $parser->YYData->{TAB_STRING_REF};

    # $lconst = $parser->YYData->{LCONST_REF};
    $nconst = $parser->YYData->{NCONST_REF};

    $word_count=$$word_id_np_ref;

#     $parser->YYData->{INPUT}
#     or $parser->YYData->{INPUT} = "[PP of [NP two transcription factors factors NP] PP]\n"
#     or  return('',undef);
#      $parser->YYData->{INPUT} = $parser->YYData->{CONSTITUENT_STRING};
#  or  return('',undef);

#     chomp $parser->YYData->{INPUT};
#     chop $parser->YYData->{INPUT};
#     print STDERR $parser->YYData->{INPUT};
#     print STDERR ";;\n";

#     print STDERR "==>";
#     print STDERR $parser->YYData->{CONSTITUENT_STRING};
#     print STDERR "\n";

#      print STDERR "$lconst : $$nconst\n";

    $parser->YYData->{CONSTITUENT_STRING}=~s/^[ \t]*#.*//;
    $parser->YYData->{CONSTITUENT_STRING}=~s/^[ \t]*//;

lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN

    print STDERR "  Performing term extraction... \n";
    open CORPUS, ">>" . $h_config->{"TMPFILE"} . ".corpus.yatea.tmp";
    binmode(CORPUS, ":utf8");

    print CORPUS $Alvis::NLPPlatform::Annotation::document_record_id . "\tDOCUMENT\t" . $Alvis::NLPPlatform::Annotation::document_record_id . "\n" ;

    &PrintOutputTreeTagger($h_config, $doc_hash, \*CORPUS);

    close CORPUS;

#     if ((exists $h_config->{"XML_OUTPUT"}->{"YATEA"}) && ($h_config->{"XML_OUTPUT"}->{"YATEA"} == 1)) {
# 	%$doc_hash = ();
# 	%Alvis::NLPPlatform::hash_tokens = ();
# 	%Alvis::NLPPlatform::hash_words = ();
# 	%Alvis::NLPPlatform::hash_words_punct = ();
# 	%Alvis::NLPPlatform::hash_sentences = ();
# 	%Alvis::NLPPlatform::hash_postags = ();
# 	%Alvis::NLPPlatform::hash_named_entities = ();
# 	%Alvis::NLPPlatform::hash_lemmas = ();
	
# 	$Alvis::NLPPlatform::number_of_words = 0;

lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN

	return(1);
    }

    require Lingua::YaTeA::Corpus;
    require Lingua::YaTeA;
    my %config_yatea = Lingua::YaTeA::load_config($h_config->{'NLP_tools'}->{'YATEARC'});


    my $yatea = Lingua::YaTeA->new($config_yatea{"OPTIONS"}, \%config_yatea);

    if (defined $h_config->{'NLP_tools'}->{'YATEAOUTPUT'}) {
	print STDERR "\nYaTeA output defined is " . $h_config->{'NLP_tools'}->{'YATEAOUTPUT'} . "\n\n";
	$yatea->getOptionSet->addOption("output-path", $h_config->{'NLP_tools'}->{'YATEAOUTPUT'});
    } else {
	print STDERR "\nNo YaTeA output defined\n\n";
	$yatea->getOptionSet->addOption("output-path", $h_config->{"ALVISTMP"});
    }

    my $corpus_path = $h_config->{"TMPFILE"} . ".corpus.yatea.tmp";
    my $corpus = Lingua::YaTeA::Corpus->new($corpus_path,$yatea->getOptionSet,$yatea->getMessageSet);

lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN



# comments to keep
    print STDERR "\t-" . ($yatea->getMessageSet->getMessage('DISPLAY_RAW')->getContent($yatea->getOptionSet->getDisplayLanguage)) . "\'". $corpus->getOutputFileSet->getFile('debug')->getPath . "'\n";
    $phrase_set->printPhrases(FileHandle->new(">" . $corpus->getOutputFileSet->getFile('debug')->getPath));
    $phrase_set->printUnparsable($corpus->getOutputFileSet->getFile('unparsable'));


    print STDERR "\t-" . ($yatea->getMessageSet->getMessage('DISPLAY_TC_XML')->getContent($yatea->getOptionSet->getDisplayLanguage)) . "\'". $corpus->getOutputFileSet->getFile('candidates')->getPath . "'\n";
# 
    if ((exists $h_config->{"XML_OUTPUT"}->{"YATEA"}) && ($h_config->{"XML_OUTPUT"}->{"YATEA"} == 1)) {
	$phrase_set->printTermCandidatesXML("stdout",$yatea->getTagSet);
	exit;
    } else {
# 	$phrase_set->printTermCandidatesXML($corpus->getOutputFileSet->getFile("candidates"),$yatea->getTagSet);
	&storeTerms($phrase_set,$doc_hash,$yatea->getTagSet);
    }

########################################################################

lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN


    $insentence=0;
    $nsentence=0;
    $relation_id=1;

    $constituents="";
    $nb_constituents=0;

    open SYN_RES, "<" . $h_config->{'TMPFILE'}. ".result.tmp";

    open CONSTITUENT_OUTPUT,">" . $Alvis::NLPPlatform::Annotation::document_record_id . ".constituents";

    while($line=<SYN_RES>)
    {
	if (index($line, "[Sentence") == 0) {
	    $parsedconstituent = 0;
	}
	if(index($line,"[(")==0){
	    $insentence=1;
            # XXX
	    $nsentence++;

lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN

#	    print STDERR "**** FOUND CONSTITUENTS SENDING DECAL $phrase_idx ****\n";
	    if ($parsedconstituent == 0) {
		$nb_constituents++;
		$phrase_idx=parse_constituents($constituents,$phrase_idx,$doc_hash);
	    }
	    $parsedconstituent = 1;
	    $constituents=~s/\[([A-Z]+) /($1 /sgo;
# 	    $constituents=~s/\[([A-Z]+) /<constituent>$1 /sgo;
	    $constituents=~s/[A-Z]+\]/)/sgo;
# 	    $constituents=~s/[A-Z]+\]/<\/constituent>/sgo;
# 	    print CONSTITUENT_OUTPUT $Alvis::NLPPlatform::Annotation::document_record_id . "\t";
	    print CONSTITUENT_OUTPUT "$constituents\n";

#	    print STDERR "**** RECUP $phrase_idx ****\n";
	}
	if($insentence==1){
	    $sentence.=$line;
	}
# 	if(index($line,"diagram")==0){
	if(index($line,"[]")==0){
	    # process the line
	    $sentence=~s/\[Sentence\s+[0-9]+\]//sgo;

lib/Alvis/NLPPlatform/UserNLPWrappers.pm view on Meta::CPAN

		    }
		}
		
		# trash everything and continue the loop

		$insentence=0;
		$wordidshift+=$last_token-1;
	    }
	}
    }
    close CONSTITUENT_OUTPUT;
    close SYN_RES;

#    print STDERR $h_config->{'TMPFILE'}. ".corpus.tmp" . "\n";
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'}. ".corpus.tmp";
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $h_config->{'TMPFILE'} . ".result.tmp";

    $Alvis::NLPPlatform::nb_relations=$relation_id-1;
    $Alvis::NLPPlatform::Annotation::phrase_idx=$phrase_idx;

    print STDERR "done - Found $Alvis::NLPPlatform::nb_relations relations, $termsfound full terms, $nb_constituents constituents.\n";

( run in 0.829 second using v1.01-cache-2.11-cpan-4e96b696675 )