Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform.pm  view on Meta::CPAN


    %Alvis::NLPPlatform::last_words = ();
    @Alvis::NLPPlatform::found_terms = ();
    @Alvis::NLPPlatform::found_terms_tidx = ();
    @Alvis::NLPPlatform::found_terms_smidx = ();
    @Alvis::NLPPlatform::found_terms_phr = ();
    @Alvis::NLPPlatform::found_terms_words = ();

    $Alvis::NLPPlatform::phrase_idx = 1;

    return(0);
}


###########################################################################
###########################################################################
###########################################################################

# Process a document collection in standalone (non client/server) mode.
#
# Parameters (positional):
#   $config   - hash reference holding the platform configuration; the keys
#               read here are 'ALVISTMP' and 'XML_OUTPUT'->'NO_STD_XML_OUTPUT'
#   $HOSTNAME - host name, used to build per-process temporary file names
#   $doc      - the input handed to split_to_docRecs()
#
# Each record returned by split_to_docRecs() is rendered by standalone_main()
# into a temporary file, the render time placeholder is filled in, and the
# result is printed on STDOUT unless NO_STD_XML_OUTPUT is set to 1.
#
# Returns @doc_collection_out, which is never filled in the current
# implementation (the output goes to STDOUT), i.e. an empty list.
#
# Dies when the temporary file cannot be opened for writing or reading.
sub standalone {
    my $config = shift;
    my $HOSTNAME = shift;
    my $doc = shift;

    my @cur_doc;
    my $render_time;
    my $id;
    my $docR;

    # Kept for interface compatibility: callers receive this (empty) list.
    my @doc_collection_out;

    my $tmpfile = $config->{'ALVISTMP'} . "/$HOSTNAME.$$.outfile";

    my @records = split_to_docRecs($doc);

    $Alvis::NLPPlatform::last_doc = 0;

    unlink $config->{'ALVISTMP'} . "/$HOSTNAME.$$.corpus.yatea.tmp";

    for my $i (0 .. $#records) {
        if ($i == $#records) {
            $Alvis::NLPPlatform::last_doc = 1;
        }
        ($id, $docR) = @{ $records[$i] };
        warn "Process document $id\n";

        # Render the annotated document into the temporary file.  A failure
        # to create the file must be fatal: otherwise nothing is written and
        # a stale file (or none) is read back below.
        open(my $out_fh, '>', $tmpfile)
            or die "Unable to open $tmpfile for writing: $!\n";
        binmode($out_fh, ":utf8");
        Alvis::NLPPlatform::platform_reset();
        $render_time = Alvis::NLPPlatform::standalone_main($config, $docR, $out_fh, 1);
        close($out_fh)
            or warn "Error while closing $tmpfile: $!\n";

        # Read the rendering back and fill in the render time, which is only
        # known once the rendering is finished.
        open(my $in_fh, '<', $tmpfile)
            or die "Unable to open $tmpfile for reading: $!\n";
        @cur_doc = <$in_fh>;
        close($in_fh);

        # Only the first placeholder is replaced ($line aliases the element).
        for my $line (@cur_doc) {
            last if $line =~ s/\@RENDER_TIME_NOT_SET\@/$render_time/;
        }

        if (!((exists $config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"}) && ($config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"} == 1))) {
            if (scalar(@records) > 1) {
                # Several records are printed one after the other: the first
                # one loses its last line, the following ones their first
                # two lines.
                # NOTE(review): presumably these are the collection footer
                # and the XML declaration + collection header; confirm
                # against the renderer, in particular for records that are
                # neither the first nor the last one.
                if ($i == 0) {
                    pop @cur_doc;
                } else {
                    shift @cur_doc;
                    shift @cur_doc;
                }
            }
            print @cur_doc;
        }
        $time_total=$time_load+$time_tok+$time_ne+$time_word+$time_sent+$time_pos+$time_lemm+$time_term+$time_synt + $time_semtag + $time_render;
        warn "Total processing time: $time_total\n";
    }

    unlink $tmpfile;
    return @doc_collection_out;
}

sub standalone_main {
    my $h_config = $_[0];
    my $doc_xml = $_[1];
    my $descriptor = $_[2];
    my $printCollectionHeaderFooter = $_[3];

    my $xmlhead="";#"<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
    my $xmlfoot="";#</documentCollection>\n";

    my $doc_hash;

    $last_semantic_unit=0;
    $last_semantic_feature = 0;

    $cur_doc_nb=1;
    compute_dependencies($h_config);
    $NLPTOOLS=$h_config->{'NLP_tools_root'};
    $ALVISTMP=$h_config->{'ALVISTMP'};
    $HOSTNAME=hostname
    $ALVISRSC=$h_config->{'NLP_misc'}->{'NLP_resources'};
    if (!exists $h_config->{'TMPFILE'}) {
	$h_config->{'TMPFILE'}="$ALVISTMP/$HOSTNAME.$$";
    }
    $ALVISLOGFILE= "$ALVISTMP/alvis.$HOSTNAME.$$.log";

    if (exists $h_config->{'DEBUG'}) {

lib/Alvis/NLPPlatform.pm  view on Meta::CPAN

	    }
	}
	print STDERR "\tRecognized formats:\n";
	$Converter_vars{"STYLESHEET"} = 1;
	my $format;
	foreach $format (keys %{$config->{"CONVERTERS"}}) {
	    if (!exists($Converter_vars{$format})) {
		print STDERR "\t\t$format\n";
	    }
	}

    }
    
}

# Main loop of an NLP client.
#
# Parameter:
#   $rcfile - path of the configuration file given to load_config()
#
# Forever: connect to the NLP server, request a document ("REQUEST"),
# read the "SENDING <id>" and "SIZE <n>" headers and the document body up to
# the "<DONE>" line, acknowledge it, annotate the document locally with
# client_main(), then reconnect and give the annotated document back
# ("GIVEBACK", followed by the rendered XML, "<DONE>" and the render time).
#
# The package variables $nlp_host, $nlp_port, $connection_retry and $id are
# set here; sigint_handler() reads them.
#
# Dies when the server cannot be reached within the configured number of
# attempts.  The final return statement is never reached (endless loop).
sub client
{

    my ($rcfile) = @_;

    my %config = Alvis::NLPPlatform::load_config($rcfile);

    $nlp_host = $config{"NLP_connection"}->{"SERVER"};
    $nlp_port = $config{"NLP_connection"}->{"PORT"};
    $connection_retry=$config{"alvis_connection"}->{"RETRY_CONNECTION"};

    my $line;
    my $doc_xml_size;
    my $doc_xml;
    my $sock;
    my $time_render;
    my $sig_handler = "";

    # Reads one protocol header line ("<KEYWORD> <value>") from the current
    # socket and returns the value, or undef when the connection was closed
    # or the line is not the expected header.  The keyword is matched case
    # insensitively but the value is returned unmodified, so that document
    # ids keep their original case.
    my $read_header = sub {
	my ($keyword) = @_;
	my $header = <$sock>;
	if (!defined $header) {
	    warn "Connection closed by server while waiting for $keyword\n";
	    return;
	}
	print STDERR "$header";
	if ($header =~ /\Q$keyword\E ([^\n]+)\n/i) {
	    return $1;
	}
	warn "Out of protocol message\n";
	return;
    };

    # Gives up the current request: closes the socket, restores the SIGINT
    # behaviour saved at the top of the loop and waits a little so that a
    # misbehaving server is not hammered with reconnections.
    my $abandon_request = sub {
	close $sock;
	$SIG{'INT'} = $sig_handler || 'DEFAULT';
	sleep(1);
	return;
    };

  DOCUMENT:
    while(1) {

	# to not stop the connection (should crash the server)
	$sig_handler = $SIG{'INT'};
	$SIG{'INT'}='IGNORE'; # to prevent zombification

	$sock = _connect_to_nlp_server($config{"alvis_connection"}->{"RETRY_CONNECTION"});
	$sock -> autoflush(1); ###############
	binmode($sock, ":utf8");
	print STDERR `date`;
	print STDERR "Established connection to server.\n";

	print STDERR "Requesting document...";
	print $sock "REQUEST\n";
	print STDERR "done.\n";

	print STDERR "Receiving document...\n";

	# SENDING $id
	my $announced_id = $read_header->("SENDING");
	if (!defined $announced_id) {
	    # Without an id there is nothing to process: start over instead
	    # of carrying on with a stale or undefined id.
	    $abandon_request->();
	    next DOCUMENT;
	}
	$id = $announced_id;

	print STDERR "GETTING $id\n";

	# SIZE of $doc_xml
	$doc_xml_size = $read_header->("SIZE");
	if (!defined $doc_xml_size) {
	    $abandon_request->();
	    next DOCUMENT;
	}

	print STDERR "READING $doc_xml_size bytes\n";
	$doc_xml = "";
	print STDERR length($doc_xml) . "\r";
	# defined() so that a last line such as "0" does not end the reading
	while (defined($line = <$sock>) && ($line ne "<DONE>\n")) {
	    print STDERR length($doc_xml) . "\r";
	    $doc_xml .= $line;
	}
	if (length($doc_xml) > $doc_xml_size) {
	    warn "Received more bytes than expected\n";
	}
	print STDERR length($doc_xml) . "\n";
	print STDERR "\n";
	print STDERR "READING $id done.\n";
	print STDERR "Sending ACK...";
	print $sock "ACK\n";
	print STDERR "done.\n";

	close $sock;

	# During the processing a SIGINT tells the server that the document
	# is abandoned (see sigint_handler)
	$SIG{'INT'} = \&sigint_handler;

	print STDERR "Processing $id";

	my $doc_hash;

	Alvis::NLPPlatform::starttimer();
	$doc_hash=Alvis::NLPPlatform::Annotation::load_xml($doc_xml, \%config);
	my $time_load = Alvis::NLPPlatform::endtimer();

	# Recording computing data (time and entity size)
	# init
	$doc_hash->{"log_processing0"}->{"datatype"}="log_processing";
	$doc_hash->{"log_processing0"}->{"log_id"} = "time";
	$doc_hash->{"log_processing1"}->{"datatype"}="log_processing";
	$doc_hash->{"log_processing1"}->{"log_id"} = "element_size";

	# Recording statistical data (time and entity size)
	# XML loading time
	my @tmp_c;
	$doc_hash->{"log_processing0"}->{"comments"} = \@tmp_c;

	push @{$doc_hash->{"log_processing0"}->{"comments"}},  "XML loading Time : $time_load";

	my @tmp_d;
	$doc_hash->{"log_processing1"}->{"comments"} = \@tmp_d;


	$doc_hash = Alvis::NLPPlatform::client_main($doc_hash, \%config);

	# to not stop the connection (should crash the server)
	$sig_handler = $SIG{'INT'};
	$SIG{'INT'}='IGNORE'; # to prevent zombification

	$sock = _connect_to_nlp_server($config{"alvis_connection"}->{"RETRY_CONNECTION"});
	binmode $sock, ":utf8";

	print STDERR "Established connection to server.\n";

	print STDERR "Giving back annotated document...\n";
	# Communication with the server
	print $sock "GIVEBACK\n$id\n";

	# Save to XML file

	print STDERR "\tRendering XML...  ";

	starttimer();
	$time_render = 0;
	# The render time is only known after the rendering: a placeholder is
	# rendered and the value is sent separately below.
	push @{$doc_hash->{"log_processing0"}->{"comments"}},  "XML rendering Time : \@RENDER_TIME_NOT_SET\@";
	Alvis::NLPPlatform::Annotation::render_xml($doc_hash, $sock, 1,\%config);
	$time_render+=endtimer();

	print STDERR "done\n";

	print $sock "<DONE>\n";

	print STDERR "done.\n";

	# the render time is sent

	print $sock "RENDER TIME\n$time_render\n";

	print STDERR "Awaiting acknowledgement...";
	while (defined($line = <$sock>)) {
	    last if $line =~ /ACK/i;
	}
	print STDERR "OK.\n";

	close($sock);

	# restore the normal behaviour
	$SIG{'INT'} = $sig_handler;
	print STDERR "Closed connection to server.\n";
    }
    return($time_render);
}

# Opens a TCP connection to the NLP server ($nlp_host:$nlp_port), trying at
# most $retries times (at least once) with a one second pause between two
# attempts.  The remaining number of attempts is kept in the package variable
# $connection_retry.  Returns the connected socket; dies when every attempt
# failed.  Success is decided on the socket itself, not on the retry counter,
# so a connection obtained at the last attempt is not reported as a timeout.
sub _connect_to_nlp_server {
    my ($retries) = @_;
    my $sock;

    $connection_retry = defined $retries ? $retries : 0;
    while (1) {
	$sock = IO::Socket::INET->new( PeerAddr => $nlp_host,
				       PeerPort => $nlp_port,
				       Proto => 'tcp');
	last if $sock;

	warn "Could not create socket: $! \n";
	$connection_retry--;
	last if $connection_retry <= 0;
	sleep(1);
    }
    die "Timeout. Could not create socket: $! \n" unless $sock;

    return $sock;
}


# SIGINT handler installed by client() while a document is being processed.
#
# Parameter:
#   $signal - name of the received signal (unused)
#
# Connects to the NLP server ($nlp_host:$nlp_port), sends "ABORTING" followed
# by the id of the current document (package variable $id), waits for the
# acknowledgement and exits the process.
#
# The connection is attempted at least once and retried, with a one second
# pause, while the package variable $connection_retry is positive.  Dies when
# no connection could be established.
sub sigint_handler {

    my ($signal) = @_;
    my $sock;

    warn "Receiving SIGINT -- Aborting NL processing\n";

    while (1) {
	$sock = IO::Socket::INET->new( PeerAddr => $nlp_host,
				       PeerPort => $nlp_port,
				       Proto => 'tcp');
	last if $sock;

	warn "Could not create socket: $! \n";
	$connection_retry--;
	last if $connection_retry <= 0;
	sleep(1);
    }

    # Decide on the socket itself: the retry counter may reach 0 (or be
    # negative) although the connection has been established.
    die "Timeout. Could not create socket: $! \n" unless $sock;

    $sock -> autoflush(1); ###############
    binmode $sock, ":utf8";


    print STDERR "Established connection to server.\n";

    print STDERR "Sending aborting message\n";

    print $sock "ABORTING\n$id\n";

    print STDERR "Aborting message sent\n";

    print STDERR "Awaiting acknowledgement...";
    while (defined(my $line = <$sock>)) {
	last if $line =~ /ACK/i;
    }
    print STDERR "OK.\n";

    close($sock);
    exit;
}


sub server 
{
    my ($rcfile) = @_;

    print STDERR "config File : $rcfile \n";

    my %config = Alvis::NLPPlatform::load_config($rcfile);

     $nlp_host = $config{"NLP_connection"}->{"SERVER"};
     $nlp_port = $config{"NLP_connection"}->{"PORT"};
     $connection_retry = $config{"alvis_connection"}->{"RETRY_CONNECTION"};
#    print STDERR Dumper(\%config);

    my $charset = 'UTF-8';

    #  header and footer

    my $xmlhead="<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
    my $xmlfoot="</documentCollection>\n";

    # connection to the crawler

    my $pipe = new Alvis::Pipeline::Read(port => $config{"alvis_connection"}->{"HARVESTER_PORT"}, spooldir => $config{"alvis_connection"}->{"SPOOLDIR"},
					 loglevel=>10)
	or die "can't create read-pipe on port " . $config{"alvis_connection"}->{"HARVESTER_PORT"} . ": $!";

    $|=1;

    touch($config{"ALVISTMP"} . "/.proc_id");

    &init_server(\%config);

    unlink($config{"ALVISTMP"} . "/.proc_id");
    touch($config{"ALVISTMP"} . "/.proc_id");
    mkpath($config{"alvis_connection"}->{"OUTDIR"});
    my $n=1;

    my $annotated_xml;

    $SIG{'CHLD'}='IGNORE'; # to prevent zombification

    my $sock=new IO::Socket::INET(LocalPort => $config{"NLP_connection"}->{"PORT"},
				  Proto => 'tcp',
				  Listen => 10,
				  Reuse => 1);

    die "Could not create socket: $!\n" unless $sock;

    $sock -> autoflush(1); ###############

    my $client_sock=0;
    my $name;
    my @records;
    my $id;
    my $sub_dir;
    my %processing_id;

    while(1){
	warn "beginning of the loop\n";
	# await client connection
	if ($client_sock=$sock->accept()) {
	    warn "Accepting a connection\n";
	    if (fork() == 0) {
		close($sock);
		binmode($client_sock, ":utf8");
		my ($client_port,$client_iaddr) = sockaddr_in(getpeername($client_sock));
		warn "Getting information about remote host\n";
		$name=gethostbyaddr($client_iaddr,AF_INET);
		&disp_log($name,"Client (".inet_ntoa($client_iaddr).":".$client_port.") has connected.");
		$client_sock -> autoflush(1); ###############
		
		##############################
		# CLIENT HANDLING CODE
		my $line;
		$line=<$client_sock>;
		chomp $line;
		$line=uc $line;
		$line=~m/^\s*([A-Z]+)$/g;
		
		## CLIENT IS REQUESTING A DOCUMENT
		if($1 eq "REQUEST"){
		    &disp_log($name,"Client is requesting a document.");
		    # send document
		    
		    &disp_log($name,"Sending document to client.");

		    my $xml = "";
		    warn "Reading the pipe\n";
		    if ($xml = $pipe->read(1)) {
			$xml .= "\n" if $xml !~ /\n$/;
			
			@records=&split_to_docRecs($xml);
			if (scalar(@records))
			{
			    my $rec = shift (@records);
			    ($id,$xml)=@$rec;
			    if (scalar (@records)) {
				# if there is more than one records other are store again in the pipeline
				# use of combineExport code
				my $pipe_out = new Alvis::Pipeline::Write(host => "localhost", 
									  port => $config{"alvis_connection"}->{"HARVESTER_PORT"},
									  loglevel => 10)
				    or die "can't create ALVIS write-pipe for port '" . $config{"alvis_connection"}->{"HARVESTER_PORT"} . "': $!";
				foreach my $rec_out (@records) {
				    $pipe_out->write($xmlhead . $rec_out . $xmlfoot);
				}
			    }

			    if (defined($id))
			    {
				warn "Received\t$n\t$id\n";
				
				`date`;
				if (defined(open(I,">:utf8",$config{"ALVISTMP"} . "/${id}.xml")))
				{
				    print I $xml;
				    close(I);		
				}
				else
				{
				    die("Unable to open " .  $config{"ALVISTMP"} . "/${id}.xml for writing.");
				}
				
				my $xml2 = $xml;
				&disp_log($name,"Sending Document to client ("  . (length($xml2) + 1 ) . " bytes).");
				&disp_log($name, "SENDING $id");
				&record_id($id,\%config);
				print $client_sock "SENDING $id\n";
				print $client_sock "SIZE " . (length($xml2) + 1 ) . "\n";
				$xml2 = "";
				print $client_sock "$xml\n";
				print $client_sock "<DONE>\n";
				# await acknowledgement
				&disp_log($name,"Document sent to client.");
				&disp_log($name,"Awaiting ACK from client...");
				while($line=<$client_sock>){
				    chomp $line;
				    $line=uc $line;
				    if($line=~/ACK/gi){
					close($client_sock);
					last;
				    }
				}
				&disp_log($name,"Received ACK from client - Request fulfilled.");
				close($client_sock);
			    }
			    else
			    {
				warn "No id for record #$id of record \"$rec\"\n";
				}
			}
			else
			{
			    my $doc_text;
			    if (ref($xml))
			    {
				$doc_text=$xml->toString();
			    }
			    else
			    {
				$doc_text=$xml;
			    }
			    warn "Could not split into documentRecords document $doc_text";
			}
		    } else {
			$pipe->close();
			warn "No documents in pipeline\n"
			    if $n == 0;
		    }
		    
		    $n++;
		    close($client_sock);
		}   
		
		
		## CLIENT IS ABOUT TO GIVE BACK AN ANNOTATED DOCUMENT
		if($1 eq "GIVEBACK"){
		    &disp_log($name,"Client is giving back a document.");
		    # receive document
		    &disp_log($name,"Receiving annotated document from client...");
		    
		    $id = <$client_sock>;
		    chomp $id;

		    &disp_log($name,"Annotated document ID: $id");
		    
		    # Recording the annotation document (local)
		    $sub_dir=&sub_dir_from_id($id);
		    if ($config{"NLP_misc"}->{"SAVE_IN_OUTDIR"}) {
			mkpath( $config{"alvis_connection"}->{"OUTDIR"} . "/$sub_dir");
		    }
		    my $xml = "";
		    if (($config{"NLP_misc"}->{"SAVE_IN_OUTDIR"} == 0) || (defined(open(O,">:utf8", $config{"alvis_connection"}->{"OUTDIR"} . "/$sub_dir/${id}.xml"))))
		    {
			while((defined $sock) && ($line=<$client_sock>) && ($line ne "<DONE>\n")) {
			    # recording the annotation document (local)
			    # building xml string for sending to the next step
			    $xml .= $line;
# 			    print STDERR $line;
			}
# 			print STDERR $line;
			# get the RENDER TIME
			if ((defined $sock) && ($line = <$client_sock>) && ($line eq "RENDER TIME\n")) {
			    if ((defined $sock) && ($line = <$client_sock>)) {
				chomp $line;
				$xml =~ s/\@RENDER_TIME_NOT_SET\@/$line/;
# 				print STDERR $line;
			    } else {
				warn "\n***\nValue of render time is not sent\n***\n\n";
			    }
		        } else {
			    warn "\n***\nRender time is not sent\n***\n\n";
			}
			if ($config{"NLP_misc"}->{"SAVE_IN_OUTDIR"}) {
			    print O $xml;
			    close(O);
			}
			# sending the annotated document to the newt step
			if ($config{"alvis_connection"}->{"NEXTSTEP"}) {
			    warn "Sending the annotated document to the next step... \n";
			    my $pipe_out_nextstep = new Alvis::Pipeline::Write(host => $config{"alvis_connection"}->{"NEXTSTEP_HOST"}, 
									       port => $config{"alvis_connection"}->{"NEXTSTEP_PORT"}, 
									       loglevel => 10)
				or die "can't create ALVIS write-pipe for '" . $config{"alvis_connection"}->{"NEXTSTEP_HOST"} . "' port '" . $config{"alvis_connection"}->{"nextstep_port"} . "': $!";
			    $pipe_out_nextstep->write($xml);
			    warn "done\n";
			} else {
			    warn "Not sending to a nextstep\n";
			}
		    } else {
			if ($config{"NLP_misc"}->{"SAVE_IN_OUTDIR"}) {
			    $sub_dir=&sub_dir_from_id($id);
			    die("Unable to open " . $config{"alvis_connection"}->{"OUTDIR"}. " //$sub_dir/${id}.xml for writing.");
			}
		    }
		    
		    &disp_log($name,"Received annotated document from client.");
		    
		    warn "deleting $config{ALVISTMP}/${id}.xml\n";
		    unlink "$config{ALVISTMP}/${id}.xml";
		    &delete_id($id, \%config);
		    # send acknowledgement
		    &disp_log($name,"Sending ACK to client...");
		    print $client_sock "ACK\n";
		    &disp_log($name,"Sent ACK to client - Finished giving back.");
		    close($client_sock);
		}
		#  CLIENT INFORMS SERVER FOR ABORTING NL PROCESSING
		if ($1 eq "ABORTING") {
		    &disp_log($name,"Client is aborting NL processing of a document.");
		    $line = <$client_sock>;
		    chomp $line;
		    # use of combineExport code



( run in 0.642 second using v1.01-cache-2.11-cpan-39bf76dae61 )