Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform.pm view on Meta::CPAN
###########################################################################
###########################################################################
# standalone($config, $HOSTNAME, $doc)
#
# Annotates every document record found in $doc and prints the resulting
# XML on STDOUT.
#
# Parameters:
#   $config   - hash reference holding the platform configuration; the keys
#               read here are 'ALVISTMP' and 'XML_OUTPUT'->'NO_STD_XML_OUTPUT'.
#   $HOSTNAME - string used to build the names of the temporary files.
#   $doc      - input handed to split_to_docRecs(), which returns a list of
#               array references of the form [ $id, $document ].
#
# Each record is rendered by standalone_main() into a temporary file, the
# file is read back, the first "@RENDER_TIME_NOT_SET@" placeholder is
# replaced by the value returned by standalone_main(), and the lines are
# printed unless XML_OUTPUT/NO_STD_XML_OUTPUT equals 1.
#
# Returns @doc_collection_out, which is never filled in this sub (the push
# that used to fill it is disabled), so callers receive an empty list.
#
# Dies if the temporary file cannot be opened for writing or for reading.
sub standalone {
    my $config = shift;
    my $HOSTNAME = shift;
    my $doc = shift;

    # Kept only so that the return value stays what it has always been.
    my @doc_collection_out;

    my $tmpfile = $config->{'ALVISTMP'} . "/$HOSTNAME.$$.outfile";

    my @records = split_to_docRecs($doc);

    $Alvis::NLPPlatform::last_doc = 0;

    unlink $config->{'ALVISTMP'} . "/$HOSTNAME.$$.corpus.yatea.tmp";

    for my $i (0 .. $#records) {
        # Flag the final record through the package variable last_doc.
        if ($i == $#records) {
            $Alvis::NLPPlatform::last_doc = 1;
        }

        my ($id, $docR) = @{ $records[$i] };
        warn "Process document $id\n";

        # Three-argument open on a lexical handle.  The failure is checked:
        # the same temporary file name is reused for every record, so an
        # unnoticed failure here would make the read below pick up the
        # output of the previous record.
        open(my $out_fh, '>:utf8', $tmpfile)
            or die "Cannot open $tmpfile for writing: $!\n";

        Alvis::NLPPlatform::platform_reset();
        my $render_time = Alvis::NLPPlatform::standalone_main($config, $docR, $out_fh, 1);

        # The result of close is deliberately not treated as fatal, as
        # before: the whole body of standalone_main is not visible from here.
        close($out_fh);

        open(my $in_fh, '<', $tmpfile)
            or die "Cannot open $tmpfile for reading: $!\n";
        my @cur_doc = <$in_fh>;
        close($in_fh);

        # Only the first line holding the placeholder is patched; $line is
        # an alias, so the substitution modifies @cur_doc in place.
        foreach my $line (@cur_doc) {
            last if $line =~ s/\@RENDER_TIME_NOT_SET\@/$render_time/;
        }

        if (!((exists $config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"}) && ($config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"} == 1))) {
            if (scalar(@records) > 1) {
                if ($i == 0) {
                    # First record: drop its last line.
                    pop @cur_doc;
                }
                else {
                    # Following records: drop their first two lines.
                    # NOTE(review): a record that is neither first nor last
                    # keeps its last line; presumably that line is the
                    # collection footer -- confirm against the renderer.
                    shift @cur_doc;
                    shift @cur_doc;
                }
            }
            print @cur_doc;
        }

        # The $time_* variables are package variables set outside this sub.
        $time_total=$time_load+$time_tok+$time_ne+$time_word+$time_sent+$time_pos+$time_lemm+$time_term+$time_synt + $time_semtag + $time_render;
        warn "Total processing time: $time_total\n";
    }

    unlink $tmpfile;

    return @doc_collection_out;
}
sub standalone_main {
my $h_config = $_[0];
my $doc_xml = $_[1];
my $descriptor = $_[2];
my $printCollectionHeaderFooter = $_[3];
my $xmlhead="";#"<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
my $xmlfoot="";#</documentCollection>\n";
my $doc_hash;
$last_semantic_unit=0;
$last_semantic_feature = 0;
$cur_doc_nb=1;
compute_dependencies($h_config);
$NLPTOOLS=$h_config->{'NLP_tools_root'};
$ALVISTMP=$h_config->{'ALVISTMP'};
$HOSTNAME=hostname
$ALVISRSC=$h_config->{'NLP_misc'}->{'NLP_resources'};
if (!exists $h_config->{'TMPFILE'}) {
$h_config->{'TMPFILE'}="$ALVISTMP/$HOSTNAME.$$";
}
$ALVISLOGFILE= "$ALVISTMP/alvis.$HOSTNAME.$$.log";
if (exists $h_config->{'DEBUG'}) {
$ALVISDEBUG = $h_config->{'DEBUG'};
}
print STDERR "\n";
$time_load=0;
$time_tok=0;
$time_ne=0;
$time_word=0;
$time_sent=0;
$time_pos=0;
lib/Alvis/NLPPlatform.pm view on Meta::CPAN
my $conf = new Config::General('-ConfigFile' => $rcfile,
'-InterPolateVars' => 1,
'-InterPolateEnv' => 1
);
my %config = $conf->getall;
mkpath($config{'ALVISTMP'});
return(%config);
}
# print_config($config)
#
# Writes a human-readable listing of the configuration to STDERR.
#
# Parameter:
#   $config - hash reference holding the configuration.  Top-level scalars
#             and the sections alvis_connection, NLP_connection, XML_INPUT,
#             XML_OUTPUT, NLP_misc, NLP_tools and CONVERTERS are listed.
#
# Only variables with a defined value are printed, and a section is skipped
# entirely when it is not defined.  Variables are listed in sorted key
# order so that the output is the same from one run to the next.
#
# compute_dependencies($config) is called after the XML_OUTPUT section and
# before the NLP_misc section, exactly where it was called before.
#
# Returns nothing.
sub print_config
{
    my $config = $_[0];

    # Prints "\t<label> : <value>" for every key of %$labels that has a
    # defined value in %$values.
    my $print_vars = sub {
        my ($values, $labels) = @_;
        foreach my $var (sort keys %$labels) {
            if (defined $values->{$var}) {
                print STDERR "\t" . $labels->{$var} . " : " . $values->{$var} . "\n";
            }
        }
    };

    # Prints the heading and the variables of one section, if it is defined.
    # The section is looked up when the closure is called, not when it is
    # created, so changes made by compute_dependencies are taken into account.
    my $print_section = sub {
        my ($section, $title, $labels) = @_;
        if (defined $config->{$section}) {
            print STDERR " Section $title\n";
            $print_vars->($config->{$section}, $labels);
        }
    };

    print STDERR "General variables\n";
    $print_vars->($config, {
        "ALVISTMP" => "Temporary directory",
        "PLATFORM_ROOT" => "Platform Root Directory",
        "NLP_tools_root" => "Root directory of the NLP tools",
    });

    print STDERR "Printing Section\n";

    $print_section->("alvis_connection", "Definition of the Alvis connection", {
        "HARVESTER_PORT" => "Harvester port",
        "NEXTSTEP" => "Send information to the next step of the pipeline?",
        "NEXTSTEP_HOST" => "Next step host",
        "NEXTSTEP_PORT" => "Next step port",
        "SPOOLDIR" => "Spool directory",
        "OUTDIR" => "Output directory",
    });

    $print_section->("NLP_connection", "Definition of the NLP connection", {
        "SERVER" => "NLP Server host",
        "PORT" => "NLP Server port",
        "RETRY_CONNECTION" => "Number of time for retrying the connection",
    });

    $print_section->("XML_INPUT", "Configuration of the XML INPUT", {
        "PRESERVEWHITESPACE" => "Preserve XML White space?",
        "LINGUISTIC_ANNOTATION_LOADING" => "Loading previous linguistic annotation?",
    });

    $print_section->("XML_OUTPUT", "Configuration of the XML OUTPUT", {
        "NO_STD_XML_OUTPUT" => "No printing standard XML output?",
    });

    compute_dependencies($config);

    $print_section->("NLP_misc", "Miscellaneous NLP configuration features", {
        "NLP_resources" => "NLP resource directory",
        "SAVE_IN_OUTDIR" => "Saving Annotated documents in the output directory?",
        "TERM_LIST_EN" => "File containing the terms for English",
        "TERM_LIST_FR" => "File containing the terms for French",
    });

    $print_section->("NLP_tools", "NLP tool path and command line", {
        "NETAG_EN" => "English Named Entity Recognizer command line",
        "NETAG_FR" => "French Named Entity Recognizer command line",
        "WORDSEG_EN" => "English Word Segmentizer command line",
        "WORDSEG_FR" => "French Word Segmentizer command line",
        "POSTAG_EN" => "English POS Tagger command line",
        "POSTAG_FR" => "French POS Tagger command line",
        "SYNTACTIC_PATH_EN" => "English Parser command line",
        "SYNTACTIC_PATH_FR" => "French Parser command line",
    });

    if (defined $config->{"CONVERTERS"}) {
        my %converter_labels = (
            "SupplMagicFile" => "File for Additional Definition of Magic Number",
        );
        $print_section->("CONVERTERS", "INPUT CONVERTERS", \%converter_labels);

        print STDERR "\tRecognized formats:\n";

        # Every key of the CONVERTERS section is reported as a format,
        # except the documented variables and STYLESHEET.
        my %not_a_format = (%converter_labels, "STYLESHEET" => 1);
        foreach my $format (sort keys %{$config->{"CONVERTERS"}}) {
            if (!exists($not_a_format{$format})) {
                print STDERR "\t\t$format\n";
            }
        }
    }

    return;
}
sub client
{
my ($rcfile) = @_;
my %config = Alvis::NLPPlatform::load_config($rcfile);
$nlp_host = $config{"NLP_connection"}->{"SERVER"};
$nlp_port = $config{"NLP_connection"}->{"PORT"};
$connection_retry=$config{"alvis_connection"}->{"RETRY_CONNECTION"};
my $line;
my $doc_xml_size;
my $doc_xml;
# my $connection_retry;
my $sock=0;
my $time_render;
my $sig_handler = "";
while(1) {
# to not stop the connection (should crash the server)
$sig_handler = $SIG{'INT'};
$SIG{'INT'}='IGNORE'; # to prevent zombification
$connection_retry=$config{"alvis_connection"}->{"RETRY_CONNECTION"};
do {
$sock=new IO::Socket::INET( PeerAddr => $nlp_host,
PeerPort => $nlp_port,
Proto => 'tcp');
warn "Could not create socket: $! \n" unless $sock;
$connection_retry--;
sleep(1);
} while(!defined($sock) && ($connection_retry >0));
if ($connection_retry ==0) {
die "Timeout. Could not create socket: $! \n";
}
lib/Alvis/NLPPlatform.pm view on Meta::CPAN
=head2 init_server()
init_server($r_config);
This method initializes the server. It reads the document id from the
file C<$ALVISTMP/.proc_id> and loads the corresponding documents
i.e. documents which have been annotated but not recorded due to a
server crash.
=head2 token_id_is_in_list_refid_token()
token_id_is_in_list_refid_token($list_refid_token, $token_to_search);
The method returns 1 if the token C<$token_to_search> is in the list
C<$list_refid_token>, 0 otherwise.
=head2 token_id_follows_list_refid_token()
token_id_follows_list_refid_token($list_refid_token, $token_to_search);
The method returns 1 if the token C<$token_to_search> immediately follows
the last token of the list C<$list_refid_token>, 0 otherwise.
=head2 token_id_just_before_last_of_list_refid_token()
token_id_just_before_last_of_list_refid_token($list_refid_token, $token_to_search);
The method returns 1 if the token C<$token_to_search> is just before
the first token of the list C<$list_refid_token>, 0 otherwise.
=head2 unparseable_id()
unparseable_id($id)
The method checks whether the id has been parsed. If not, it prints
a warning.
=head2 platform_reset()
platform_reset()
The method empties or resets the structures and variables attached to
a processed document.
=head1 PLATFORM CONFIGURATION
The configuration file of the NLP Platform is composed of global
variables and divided into several sections:
=over
=item * Global variables.
The two mandatory variables are C<ALVISTMP> and C<PRESERVEWHITESPACE>
(in the XML_INPUT section).
=over 8
=item *
C<ALVISTMP> : it defines the temporary directory used during the
annotation process. Files (XML files and the input/output of the NLP
tools) are written there during the annotation step. It must
be writable by the user the process is running as.
=item *
C<DEBUG> : this variable indicates whether the NLP platform is run in
debug mode or not. The value is 1 (debug mode) or 0 (no debug
mode). The default value is 0. The main consequence of the debug mode is
that the temporary files are kept.
=back
Additional variables and environment variables can be used if they
are interpolated in the configuration file. For instance, in the
default configuration file, we add
=over
=item *
C<PLATFORM_ROOT>: directory where the NLP tools and resources are installed.
=item *
C<NLP_tools_root>: root directory where the NLP tools are installed
=item *
C<AWK>: path for awk
=item *
C<SEMTAG_EN_DIR>: directory where the semantic tagger is installed
=item *
C<ONTOLOGY>: path for the ontology for the semanticTypeTagger (trish2
format -- see documentation of the semanticTypeTagger)
=item *
C<CANONICAL_DICT>: path for the dictionary with the canonical form of
the semantic units (trish2 format -- see documentation of the
semanticTypeTagger)
=item *
lib/Alvis/NLPPlatform.pm view on Meta::CPAN
=back
=item * Section C<alvis_connection>
=over 8
=item *
C<HARVESTER_PORT>: the port of the harvester/crawler (C<combine>) that the platform will read from to get the documents to annotate.
=item *
C<NEXTSTEP>: indicates if there is a next step in the pipeline
(for instance, the indexer IdZebra). The value is C<0> or C<1>.
=item *
C<NEXTSTEP_HOST>: the host name of the component that the platform will send the annotated document to.
=item *
C<NEXTSTEP_PORT>: the port of the component that the platform will send the annotated document to.
=item *
C<SPOOLDIR>: the directory where the documents coming from the harvester are stored.
It must be writable to the user the process is running as.
=item *
C<OUTDIR>: the directory where the annotated documents are stored if C<SAVE_IN_OUTDIR> (in Section C<NLP_misc>) is set.
It must be writable to the user the process is running as.
=back
=item * Section C<NLP_connection>
=over 8
=item *
C<SERVER>: The host name where the NLP server is running, for
the connections with the NLP clients.
=item *
C<PORT>: The listening port of the NLP server, for the
connections with the NLP clients.
=item *
C<RETRY_CONNECTION>: The number of times that
a client attempts to connect to the server.
=back
=item * C<XML_INPUT>
=over 8
=item *
C<PRESERVEWHITESPACE> is a boolean indicating whether the linguistic
annotation is done while preserving white space, i.e. XML
blank nodes and white space at the beginning and the end of any line,
as well as the indentation of the text in the canonicalDocument.
The default value is C<0> or false (blank nodes and
indentation characters are removed).
=item *
C<LINGUISTIC_ANNOTATION_LOADING>: The linguistic annotations already
existing in the input documents are loaded or not. The default value is
C<1> or true (linguistic annotations are loaded).
=back
=item *
C<XML_OUTPUT> (Not available yet)
=over 8
=item *
C<NO_STD_XML_OUTPUT>: The standard XML output is not printed. Default
value is false.
=item
FORM
=item
ID
=back
=item * Section C<linguistic_annotation>
The section defines the NLP steps that will be used for annotating documents. The values are C<0> or C<1>.
=over 8
=item *
C<ENABLE_TOKEN>: toggles the tokenization step.
=item *
C<ENABLE_NER>: toggles the named entity recognition step.
=item *
C<ENABLE_WORD>: toggles the word segmentation step.
=item *
C<ENABLE_SENTENCE>: toggles the sentence segmentation step.
=item *
C<ENABLE_POS>: toggles the Part-of-Speech tagging step.
=item *
C<ENABLE_LEMMA>: toggles the lemmatization step.
=item *
C<ENABLE_TERM_TAG>: toggles the term tagging step.
=item *
C<ENABLE_SYNTAX>: toggles the parsing step.
=back
=item * Section C<NLP_misc>
The section defines miscellaneous variables for NLP annotation steps.
=over 8
( run in 1.100 second using v1.01-cache-2.11-cpan-e1769b4cff6 )