Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform.pm view on Meta::CPAN
%Alvis::NLPPlatform::last_words = ();
@Alvis::NLPPlatform::found_terms = ();
@Alvis::NLPPlatform::found_terms_tidx = ();
@Alvis::NLPPlatform::found_terms_smidx = ();
@Alvis::NLPPlatform::found_terms_phr = ();
@Alvis::NLPPlatform::found_terms_words = ();
$Alvis::NLPPlatform::phrase_idx = 1;
return(0);
}
###########################################################################
###########################################################################
###########################################################################
# standalone($config, $HOSTNAME, $doc)
#
# Annotates every document record found in $doc, one record at a time.
#
# Parameters:
#   $config   - hash reference holding the platform configuration; this
#               sub reads the 'ALVISTMP' entry and the optional
#               XML_OUTPUT/NO_STD_XML_OUTPUT flag, and passes the whole
#               structure on to standalone_main().
#   $HOSTNAME - string used (together with the process id) to build the
#               names of the temporary files.
#   $doc      - the input handed to split_to_docRecs(), which is expected
#               to return a list of [ $id, $record ] pairs.
#
# For each record the rendering produced by standalone_main() is written
# to a temporary file, read back, the @RENDER_TIME_NOT_SET@ placeholder is
# replaced by the value returned by standalone_main(), and (unless
# NO_STD_XML_OUTPUT is set to 1) the result is printed on STDOUT.
#
# Returns @doc_collection_out, which is never filled here: the list is
# always empty and the useful output is what is printed on STDOUT.
#
# Dies if the temporary file cannot be opened for writing or for reading.
sub standalone {
    my ($config, $HOSTNAME, $doc) = @_;

    # Kept for interface compatibility only (see the header comment).
    my @doc_collection_out;

    my $tmpfile = $config->{'ALVISTMP'} . "/$HOSTNAME.$$.outfile";

    my @records = split_to_docRecs($doc);

    $Alvis::NLPPlatform::last_doc = 0;

    unlink $config->{'ALVISTMP'} . "/$HOSTNAME.$$.corpus.yatea.tmp";

    for my $i (0 .. $#records) {
        # Flag the last record of the collection.
        $Alvis::NLPPlatform::last_doc = 1 if $i == $#records;

        my ($id, $docR) = @{ $records[$i] };
        warn "Process document $id\n";

        # Render the annotated record into the temporary file.  A failure
        # to create the file used to go unnoticed; it is now fatal.
        open(my $out, '>', $tmpfile)
            or die "Unable to open $tmpfile for writing: $!\n";
        binmode($out, ":utf8");

        Alvis::NLPPlatform::platform_reset();
        my $render_time = Alvis::NLPPlatform::standalone_main($config, $docR, $out, 1);
        close($out);

        # Read the rendering back, line by line.
        open(my $in, '<', $tmpfile) or die "No such file or directory\n";
        my @cur_doc = <$in>;
        close($in);

        # Only the first occurrence of the placeholder is replaced.
        for my $rendered_line (@cur_doc) {
            last if $rendered_line =~ s/\@RENDER_TIME_NOT_SET\@/$render_time/;
        }

        my $no_std_output =
               (exists $config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"})
            && ($config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"} == 1);

        if (!$no_std_output) {
            # Each rendering is a complete collection: two header lines and
            # one footer line around the record.  When several records are
            # printed one after the other, only the first keeps the header
            # and only the last keeps the footer, so that the concatenation
            # is a single collection.  (The footer of the records in the
            # middle used to be left in place.)
            if ($i > 0) {
                shift @cur_doc;
                shift @cur_doc;
            }
            if ($i < $#records) {
                pop @cur_doc;
            }
            print @cur_doc;
        }

        $time_total = $time_load + $time_tok + $time_ne + $time_word + $time_sent
                    + $time_pos + $time_lemm + $time_term + $time_synt
                    + $time_semtag + $time_render;
        warn "Total processing time: $time_total\n";
    }

    unlink $tmpfile;

    return @doc_collection_out;
}
sub standalone_main {
my $h_config = $_[0];
my $doc_xml = $_[1];
my $descriptor = $_[2];
my $printCollectionHeaderFooter = $_[3];
my $xmlhead="";#"<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
my $xmlfoot="";#</documentCollection>\n";
my $doc_hash;
$last_semantic_unit=0;
$last_semantic_feature = 0;
$cur_doc_nb=1;
compute_dependencies($h_config);
$NLPTOOLS=$h_config->{'NLP_tools_root'};
$ALVISTMP=$h_config->{'ALVISTMP'};
$HOSTNAME=hostname
$ALVISRSC=$h_config->{'NLP_misc'}->{'NLP_resources'};
if (!exists $h_config->{'TMPFILE'}) {
$h_config->{'TMPFILE'}="$ALVISTMP/$HOSTNAME.$$";
}
$ALVISLOGFILE= "$ALVISTMP/alvis.$HOSTNAME.$$.log";
if (exists $h_config->{'DEBUG'}) {
lib/Alvis/NLPPlatform.pm view on Meta::CPAN
}
}
print STDERR "\tRecognized formats:\n";
$Converter_vars{"STYLESHEET"} = 1;
my $format;
foreach $format (keys %{$config->{"CONVERTERS"}}) {
if (!exists($Converter_vars{$format})) {
print STDERR "\t\t$format\n";
}
}
}
}
# client($rcfile)
#
# Endless client loop of the NLP platform.  $rcfile is the path of the
# configuration file given to Alvis::NLPPlatform::load_config().
#
# Each iteration:
#   1. connects to the server and sends "REQUEST";
#   2. reads the "SENDING <id>" and "SIZE <n>" header lines, then the
#      document itself up to the "<DONE>" line, and answers "ACK";
#   3. loads and annotates the document (load_xml / client_main);
#   4. reconnects, sends "GIVEBACK" followed by the id, the rendered
#      document, "<DONE>", the rendering time, and waits for "ACK".
#
# SIGINT is ignored while talking to the server and routed to
# sigint_handler() while the document is being processed.
#
# $nlp_host, $nlp_port, $connection_retry and $id are not declared in this
# sub: they are shared with sigint_handler(), which needs them to notify
# the server.
#
# Dies when the server cannot be reached.  The loop has no exit, so the
# final return is never reached in practice.
sub client
{
    my ($rcfile) = @_;

    my %config = Alvis::NLPPlatform::load_config($rcfile);

    $nlp_host = $config{"NLP_connection"}->{"SERVER"};
    $nlp_port = $config{"NLP_connection"}->{"PORT"};
    $connection_retry = $config{"alvis_connection"}->{"RETRY_CONNECTION"};

    my $line;
    my $doc_xml_size;
    my $doc_xml;
    my $sock;
    my $time_render;
    my $sig_handler = "";

    while (1) {
        # Do not let an interrupt cut the exchange with the server.
        $sig_handler = $SIG{'INT'};
        $SIG{'INT'} = 'IGNORE';

        $connection_retry = $config{"alvis_connection"}->{"RETRY_CONNECTION"};
        $sock = _client_open_socket();

        $sock->autoflush(1);
        binmode($sock, ":utf8");
        print STDERR `date`;
        print STDERR "Established connection to server.\n";

        print STDERR "Requesting document...";
        print $sock "REQUEST\n";
        print STDERR "done.\n";

        print STDERR "Receiving document...\n";

        # Header line 1: "SENDING <id>".  The keyword is matched without
        # regard to case, but the id is captured from the line as received:
        # it must be given back to the server unchanged.
        # NOTE(review): after an out-of-protocol line the socket is closed
        # and the code below still runs with whatever $id held before --
        # confirm whether the whole iteration should be restarted instead.
        while ($line = <$sock>) {
            print STDERR "$line";
            if ($line =~ /SENDING ([^\n]+)\n/i) {
                $id = $1;
                last;
            }
            warn "Out of protocol message\n";
            close $sock;
            last;    # nothing more can be read from a closed socket
        }

        print STDERR "GETTING $id\n";

        # Header line 2: "SIZE <number of bytes>".
        while ($line = <$sock>) {
            print STDERR "$line";
            if ($line =~ /SIZE ([^\n]+)\n/i) {
                $doc_xml_size = $1;
                last;
            }
            warn "Out of protocol message\n";
            close $sock;
            last;    # nothing more can be read from a closed socket
        }

        print STDERR "READING $doc_xml_size bytes\n";

        # Document body, terminated by a "<DONE>" line.
        $doc_xml = "";
        print STDERR length($doc_xml) . "\r";
        while ((defined $sock) && ($line = <$sock>) && ($line ne "<DONE>\n")) {
            print STDERR length($doc_xml) . "\r";
            $doc_xml .= $line;
        }
        if (length($doc_xml) > $doc_xml_size) {
            warn "Received more bytes than expected\n";
        }
        print STDERR length($doc_xml) . "\n";
        print STDERR "\n";
        print STDERR "READING $id done.\n";

        print STDERR "Sending ACK...";
        print $sock "ACK\n";
        print STDERR "done.\n";
        close $sock;

        # From now on an interrupt tells the server that the processing of
        # this document is aborted.
        $SIG{'INT'} = \&sigint_handler;

        print STDERR "Processing $id";

        Alvis::NLPPlatform::starttimer();
        my $doc_hash = Alvis::NLPPlatform::Annotation::load_xml($doc_xml, \%config);
        my $time_load = 0;
        $time_load += Alvis::NLPPlatform::endtimer();

        # Two log records are attached to the document: processing times
        # ("time") and element sizes ("element_size").
        $doc_hash->{"log_processing0"}->{"datatype"} = "log_processing";
        $doc_hash->{"log_processing0"}->{"log_id"} = "time";
        $doc_hash->{"log_processing1"}->{"datatype"} = "log_processing";
        $doc_hash->{"log_processing1"}->{"log_id"} = "element_size";

        $doc_hash->{"log_processing0"}->{"comments"} = [ "XML loading Time : $time_load" ];
        $doc_hash->{"log_processing1"}->{"comments"} = [];

        $doc_hash = Alvis::NLPPlatform::client_main($doc_hash, \%config);

        # Second exchange with the server: again protected from interrupts.
        $sig_handler = $SIG{'INT'};
        $SIG{'INT'} = 'IGNORE';

        $connection_retry = $config{"alvis_connection"}->{"RETRY_CONNECTION"};
        $sock = _client_open_socket();

        binmode $sock, ":utf8";
        print STDERR "Established connection to server.\n";

        print STDERR "Giving back annotated document...\n";
        # Communication with the server
        print $sock "GIVEBACK\n$id\n";

        # The document is rendered straight into the socket.  The rendering
        # time is only known afterwards, so a placeholder is written in the
        # document and the actual value is sent separately below.
        print STDERR "\tRendering XML...  ";
        starttimer();
        $time_render = 0;
        push @{ $doc_hash->{"log_processing0"}->{"comments"} },
            "XML rendering Time : \@RENDER_TIME_NOT_SET\@";
        Alvis::NLPPlatform::Annotation::render_xml($doc_hash, $sock, 1, \%config);
        $time_render += endtimer();

        print STDERR "done\n";

        print $sock "<DONE>\n";
        print STDERR "done.\n";

        # the render time is sent
        print $sock "RENDER TIME\n$time_render\n";

        print STDERR "Awaiting acknowledgement...";
        while ($line = <$sock>) {
            chomp $line;
            last if $line =~ /ACK/i;
        }
        print STDERR "OK.\n";
        close($sock);

        # Back to the handler that was active during the processing.
        $SIG{'INT'} = $sig_handler;

        print STDERR "Closed connection to server.\n";
    }
    return($time_render);
}

# Opens a TCP connection to $nlp_host:$nlp_port on behalf of client().
# A new attempt is made while the connection fails and $connection_retry
# (decremented at every attempt) is still positive; one second is spent
# after every attempt.  Returns the connected socket, or dies when no
# connection could be established.  Success is decided by looking at the
# socket itself, not at the value of the counter.
sub _client_open_socket {
    my $sock;

    do {
        $sock = IO::Socket::INET->new(PeerAddr => $nlp_host,
                                      PeerPort => $nlp_port,
                                      Proto    => 'tcp');
        warn "Could not create socket: $! \n" unless $sock;
        $connection_retry--;
        sleep(1);
    } while (!defined($sock) && ($connection_retry > 0));

    die "Timeout. Could not create socket: $! \n" unless defined $sock;

    return $sock;
}
# sigint_handler()
#
# SIGINT handler: tells the server that the processing of the current
# document is aborted, then terminates the process.
#
# Perl passes the signal name as the only argument; it is not used.
# The server address ($nlp_host, $nlp_port), the remaining number of
# connection attempts ($connection_retry) and the document id ($id) are
# read from variables declared outside this sub.
#
# Dies when the server cannot be reached; otherwise never returns (exit).
sub sigint_handler {
    my $sock;

    warn "Receiving SIGINT -- Aborting NL processing\n";

    # At least one attempt is always made, whatever the counter holds.
    do {
        $sock = IO::Socket::INET->new(PeerAddr => $nlp_host,
                                      PeerPort => $nlp_port,
                                      Proto    => 'tcp');
        warn "Could not create socket: $! \n" unless $sock;
        $connection_retry--;
        sleep(1);
    } while (!defined($sock) && ($connection_retry > 0));

    # Failure is decided by looking at the socket: the counter may reach 0
    # on a successful last attempt, or be negative after a failed one.
    die "Timeout. Could not create socket: $! \n" unless defined $sock;

    $sock->autoflush(1);
    binmode $sock, ":utf8";
    print STDERR "Established connection to server.\n";

    print STDERR "Sending aborting message\n";
    print $sock "ABORTING\n$id\n";
    print STDERR "Aborting message sent\n";

    # Read until a line containing ACK (any case) or the end of the stream.
    print STDERR "Awaiting acknowledgement...";
    while (my $line = <$sock>) {
        chomp $line;
        last if $line =~ /ACK/i;
    }
    print STDERR "OK.\n";
    close($sock);

    exit;
}
sub server
{
my ($rcfile) = @_;
print STDERR "config File : $rcfile \n";
my %config = Alvis::NLPPlatform::load_config($rcfile);
$nlp_host = $config{"NLP_connection"}->{"SERVER"};
$nlp_port = $config{"NLP_connection"}->{"PORT"};
$connection_retry = $config{"alvis_connection"}->{"RETRY_CONNECTION"};
# print STDERR Dumper(\%config);
my $charset = 'UTF-8';
# header and footer
my $xmlhead="<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
my $xmlfoot="</documentCollection>\n";
# connection to the crawler
my $pipe = new Alvis::Pipeline::Read(port => $config{"alvis_connection"}->{"HARVESTER_PORT"}, spooldir => $config{"alvis_connection"}->{"SPOOLDIR"},
loglevel=>10)
or die "can't create read-pipe on port " . $config{"alvis_connection"}->{"HARVESTER_PORT"} . ": $!";
$|=1;
touch($config{"ALVISTMP"} . "/.proc_id");
&init_server(\%config);
unlink($config{"ALVISTMP"} . "/.proc_id");
touch($config{"ALVISTMP"} . "/.proc_id");
mkpath($config{"alvis_connection"}->{"OUTDIR"});
my $n=1;
my $annotated_xml;
$SIG{'CHLD'}='IGNORE'; # to prevent zombification
my $sock=new IO::Socket::INET(LocalPort => $config{"NLP_connection"}->{"PORT"},
Proto => 'tcp',
Listen => 10,
Reuse => 1);
die "Could not create socket: $!\n" unless $sock;
$sock -> autoflush(1); ###############
my $client_sock=0;
my $name;
my @records;
my $id;
my $sub_dir;
my %processing_id;
while(1){
warn "beginning of the loop\n";
# await client connection
if ($client_sock=$sock->accept()) {
warn "Accepting a connection\n";
if (fork() == 0) {
close($sock);
binmode($client_sock, ":utf8");
my ($client_port,$client_iaddr) = sockaddr_in(getpeername($client_sock));
warn "Getting information about remote host\n";
$name=gethostbyaddr($client_iaddr,AF_INET);
&disp_log($name,"Client (".inet_ntoa($client_iaddr).":".$client_port.") has connected.");
$client_sock -> autoflush(1); ###############
##############################
# CLIENT HANDLING CODE
my $line;
$line=<$client_sock>;
chomp $line;
$line=uc $line;
$line=~m/^\s*([A-Z]+)$/g;
## CLIENT IS REQUESTING A DOCUMENT
if($1 eq "REQUEST"){
&disp_log($name,"Client is requesting a document.");
# send document
&disp_log($name,"Sending document to client.");
my $xml = "";
warn "Reading the pipe\n";
if ($xml = $pipe->read(1)) {
$xml .= "\n" if $xml !~ /\n$/;
@records=&split_to_docRecs($xml);
if (scalar(@records))
{
my $rec = shift (@records);
($id,$xml)=@$rec;
if (scalar (@records)) {
# if there is more than one records other are store again in the pipeline
# use of combineExport code
my $pipe_out = new Alvis::Pipeline::Write(host => "localhost",
port => $config{"alvis_connection"}->{"HARVESTER_PORT"},
loglevel => 10)
or die "can't create ALVIS write-pipe for port '" . $config{"alvis_connection"}->{"HARVESTER_PORT"} . "': $!";
foreach my $rec_out (@records) {
$pipe_out->write($xmlhead . $rec_out . $xmlfoot);
}
}
if (defined($id))
{
warn "Received\t$n\t$id\n";
`date`;
if (defined(open(I,">:utf8",$config{"ALVISTMP"} . "/${id}.xml")))
{
print I $xml;
close(I);
}
else
{
die("Unable to open " . $config{"ALVISTMP"} . "/${id}.xml for writing.");
}
my $xml2 = $xml;
&disp_log($name,"Sending Document to client (" . (length($xml2) + 1 ) . " bytes).");
&disp_log($name, "SENDING $id");
&record_id($id,\%config);
print $client_sock "SENDING $id\n";
print $client_sock "SIZE " . (length($xml2) + 1 ) . "\n";
$xml2 = "";
print $client_sock "$xml\n";
print $client_sock "<DONE>\n";
# await acknowledgement
&disp_log($name,"Document sent to client.");
&disp_log($name,"Awaiting ACK from client...");
while($line=<$client_sock>){
chomp $line;
$line=uc $line;
if($line=~/ACK/gi){
close($client_sock);
last;
}
}
&disp_log($name,"Received ACK from client - Request fulfilled.");
close($client_sock);
}
else
{
warn "No id for record #$id of record \"$rec\"\n";
}
}
else
{
my $doc_text;
if (ref($xml))
{
$doc_text=$xml->toString();
}
else
{
$doc_text=$xml;
}
warn "Could not split into documentRecords document $doc_text";
}
} else {
$pipe->close();
warn "No documents in pipeline\n"
if $n == 0;
}
$n++;
close($client_sock);
}
## CLIENT IS ABOUT TO GIVE BACK AN ANNOTATED DOCUMENT
if($1 eq "GIVEBACK"){
&disp_log($name,"Client is giving back a document.");
# receive document
&disp_log($name,"Receiving annotated document from client...");
$id = <$client_sock>;
chomp $id;
&disp_log($name,"Annotated document ID: $id");
# Recording the annotation document (local)
$sub_dir=&sub_dir_from_id($id);
if ($config{"NLP_misc"}->{"SAVE_IN_OUTDIR"}) {
mkpath( $config{"alvis_connection"}->{"OUTDIR"} . "/$sub_dir");
}
my $xml = "";
if (($config{"NLP_misc"}->{"SAVE_IN_OUTDIR"} == 0) || (defined(open(O,">:utf8", $config{"alvis_connection"}->{"OUTDIR"} . "/$sub_dir/${id}.xml"))))
{
while((defined $sock) && ($line=<$client_sock>) && ($line ne "<DONE>\n")) {
# recording the annotation document (local)
# building xml string for sending to the next step
$xml .= $line;
# print STDERR $line;
}
# print STDERR $line;
# get the RENDER TIME
if ((defined $sock) && ($line = <$client_sock>) && ($line eq "RENDER TIME\n")) {
if ((defined $sock) && ($line = <$client_sock>)) {
chomp $line;
$xml =~ s/\@RENDER_TIME_NOT_SET\@/$line/;
# print STDERR $line;
} else {
warn "\n***\nValue of render time is not sent\n***\n\n";
}
} else {
warn "\n***\nRender time is not sent\n***\n\n";
}
if ($config{"NLP_misc"}->{"SAVE_IN_OUTDIR"}) {
print O $xml;
close(O);
}
# sending the annotated document to the newt step
if ($config{"alvis_connection"}->{"NEXTSTEP"}) {
warn "Sending the annotated document to the next step... \n";
my $pipe_out_nextstep = new Alvis::Pipeline::Write(host => $config{"alvis_connection"}->{"NEXTSTEP_HOST"},
port => $config{"alvis_connection"}->{"NEXTSTEP_PORT"},
loglevel => 10)
or die "can't create ALVIS write-pipe for '" . $config{"alvis_connection"}->{"NEXTSTEP_HOST"} . "' port '" . $config{"alvis_connection"}->{"nextstep_port"} . "': $!";
$pipe_out_nextstep->write($xml);
warn "done\n";
} else {
warn "Not sending to a nextstep\n";
}
} else {
if ($config{"NLP_misc"}->{"SAVE_IN_OUTDIR"}) {
$sub_dir=&sub_dir_from_id($id);
die("Unable to open " . $config{"alvis_connection"}->{"OUTDIR"}. " //$sub_dir/${id}.xml for writing.");
}
}
&disp_log($name,"Received annotated document from client.");
warn "deleting $config{ALVISTMP}/${id}.xml\n";
unlink "$config{ALVISTMP}/${id}.xml";
&delete_id($id, \%config);
# send acknowledgement
&disp_log($name,"Sending ACK to client...");
print $client_sock "ACK\n";
&disp_log($name,"Sent ACK to client - Finished giving back.");
close($client_sock);
}
# CLIENT INFORMS SERVER FOR ABORTING NL PROCESSING
if ($1 eq "ABORTING") {
&disp_log($name,"Client is aborting NL processing of a document.");
$line = <$client_sock>;
chomp $line;
# use of combineExport code
( run in 0.642 second using v1.01-cache-2.11-cpan-39bf76dae61 )