Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform.pm  view on Meta::CPAN

#     print STDERR $doc;

    @records=&split_to_docRecs($doc);

    $Alvis::NLPPlatform::last_doc = 0;

    unlink $config->{'ALVISTMP'} . "/$HOSTNAME.$$.corpus.yatea.tmp";

    for($i=0;$i <scalar(@records); $i++) {
	if ($i == $#records) {
	    $Alvis::NLPPlatform::last_doc = 1;
	}
	$rec = $records[$i];
	($id,$docR)=@$rec;
	warn "Process document $id\n";

	open FILETMP_OUT, ">$tmpfile";
	binmode(FILETMP_OUT, ":utf8");
#	binmode(FILETMP_OUT);
#      print FILETMP_OUT Encode::decode_utf8($doc);
	Alvis::NLPPlatform::platform_reset();
	$render_time = Alvis::NLPPlatform::standalone_main($config, $docR, \*FILETMP_OUT, 1); #${$tab_docs_xml->[$doc_num]}[1] ; ${$ref_doc}[1]
	close(FILETMP_OUT);

	open FILETMP_OUT, "$tmpfile" or die "No such file or directory\n";
	@cur_doc = <FILETMP_OUT>;
	$j = 0;
	while(($j< scalar @cur_doc) && ($cur_doc[$j] !~ s/\@RENDER_TIME_NOT_SET\@/$render_time/)) {
	    $j++;
	}
	close(FILETMP_OUT);

        if (!((exists $config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"}) && ($config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"} == 1))) {
	    if (scalar(@records) > 1) {
		if ($i == 0){
		    pop @cur_doc;
		} else {
		    shift @cur_doc;
		    shift @cur_doc;
		}
	    }
#	    push @doc_collection_out, @cur_doc;
		print @cur_doc;
	}
	$time_total=$time_load+$time_tok+$time_ne+$time_word+$time_sent+$time_pos+$time_lemm+$time_term+$time_synt + $time_semtag + $time_render;
	warn "Total processing time: $time_total\n";
    }

#     print STDERR "$tmpfile\n";
     unlink $tmpfile;
#    return @cur_doc;
    return @doc_collection_out;
}

sub standalone_main {
    my $h_config = $_[0];
    my $doc_xml = $_[1];
    my $descriptor = $_[2];
    my $printCollectionHeaderFooter = $_[3];

    my $xmlhead="";#"<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
    my $xmlfoot="";#</documentCollection>\n";

    my $doc_hash;

    $last_semantic_unit=0;
    $last_semantic_feature = 0;

    $cur_doc_nb=1;
    compute_dependencies($h_config);
    $NLPTOOLS=$h_config->{'NLP_tools_root'};
    $ALVISTMP=$h_config->{'ALVISTMP'};
    $HOSTNAME=hostname
    $ALVISRSC=$h_config->{'NLP_misc'}->{'NLP_resources'};
    if (!exists $h_config->{'TMPFILE'}) {
	$h_config->{'TMPFILE'}="$ALVISTMP/$HOSTNAME.$$";
    }
    $ALVISLOGFILE= "$ALVISTMP/alvis.$HOSTNAME.$$.log";

    if (exists $h_config->{'DEBUG'}) {
	$ALVISDEBUG = $h_config->{'DEBUG'};
    }
    

    print STDERR "\n";


    $time_load=0;
    $time_tok=0;
    $time_ne=0;
    $time_word=0;
    $time_sent=0;
    $time_pos=0;
    $time_lemm=0;
    $time_term=0;
    $time_render=0;

    # Load document record
    print STDERR "Loading DR... ";
    undef %$doc_hash;
    %$doc_hash=();
    $doc_hash=0;

    %hash_tokens=();

    $dont_annotate=0;
    %hash_words=();
    %hash_words_punct=();
    %hash_sentences=();
    %hash_postags=();
    @word_start=();
    @word_end=();

    %last_words=();
    @found_terms=();
    @found_terms_tidx=();
    @found_terms_smidx=();
    @found_terms_phr=();
    @found_terms_words=();

    $phrase_idx=1;

    @tab_errors=();

    starttimer();


#     $doc_xml =~ s/("<\?xml version=\"1.0\" encoding=\"$charset\"?>\n
    $doc_hash=Alvis::NLPPlatform::Annotation::load_xml($doc_xml, $h_config);
    $time_load+=endtimer();

    # Recording computing data (time and entity size)
    # init
    $doc_hash->{"log_processing0"}->{"datatype"}="log_processing";
    $doc_hash->{"log_processing0"}->{"log_id"} = "time";
    $doc_hash->{"log_processing1"}->{"datatype"}="log_processing";
    $doc_hash->{"log_processing1"}->{"log_id"} = "element_size";
    $doc_hash->{"log_processing2"}->{"datatype"}="log_processing";
    $doc_hash->{"log_processing2"}->{"log_id"} = "host";
    $doc_hash->{"log_processing2"}->{"comments"} = $HOSTNAME;

    # Recording statistical data (time and entity size)
    # XML loading time
    my @tmp_c;
    $doc_hash->{"log_processing0"}->{"comments"} = \@tmp_c;

    push @{$doc_hash->{"log_processing0"}->{"comments"}},  "XML loading Time : $time_load";
    print STDERR "\tXML loading Time : $time_load\n";
    my @tmp_d;
    $doc_hash->{"log_processing1"}->{"comments"} = \@tmp_d;
    

    if($doc_hash!=0)
    {
	print STDERR "done - documentRecord ".$Alvis::NLPPlatform::Annotation::document_record_id;
	print STDERR " (document $cur_doc_nb)\n";


	Alvis::NLPPlatform::linguistic_annotation($h_config, $doc_hash);

	# Save to XML file
	$cur_doc_nb++;
	print STDERR "Rendering XML...  ";

	starttimer();
	$time_render = 0;
	push @{$doc_hash->{"log_processing0"}->{"comments"}},  "XML rendering Time : \@RENDER_TIME_NOT_SET\@";
	Alvis::NLPPlatform::Annotation::render_xml($doc_hash, $descriptor, $printCollectionHeaderFooter, $h_config);
	$time_render+=endtimer();

# TODO : recording the xml rendering time

	# Recording statistical data (time and entity size)
	# XML rendering (unusable)
	print STDERR "done\n";
	print STDERR "\tXML rendering Time : $time_render\n";
	
    }else{
	print STDERR "done parsing - no more documents.\n";
	last;
    }
    print STDERR "\n";

    # log errors
    open LOGERRORS,">>$ALVISLOGFILE";
    if(scalar @tab_errors>0){
	print LOGERRORS "Document $Alvis::NLPPlatform::Annotation::document_record_id (number $cur_doc_nb)\n";
	foreach $log_entry(@tab_errors){

lib/Alvis/NLPPlatform.pm  view on Meta::CPAN


    warn "Receiving SIGINT -- Aborting NL processing\n";

    

    do {
	$sock=new IO::Socket::INET( PeerAddr => $nlp_host,
				    PeerPort => $nlp_port,
				    Proto => 'tcp');

	warn "Could not create socket: $! \n" unless $sock;
	$connection_retry--;
	sleep(1);
    } while(!defined($sock) && ($connection_retry >0));

    if ($connection_retry ==0) {
	die "Timeout. Could not create socket: $! \n";
    }
    $sock -> autoflush(1); ###############
    binmode $sock, ":utf8";


    print STDERR "Established connection to server.\n";

    print STDERR "Sending aborting message\n";

    print $sock "ABORTING\n$id\n";

    print STDERR "Aborting message sent\n";

    print STDERR "Awaiting acknowledgement...";
    my $line;
    while($line=<$sock>){
	chomp $line;
	$line=uc $line;
	if($line=~/ACK/gi){
	    close($sock);
	    last;
	}
    }
    print STDERR "OK.\n";

    close($sock);
    exit;
}


sub server 
{
    my ($rcfile) = @_;

    print STDERR "config File : $rcfile \n";

    my %config = Alvis::NLPPlatform::load_config($rcfile);

     $nlp_host = $config{"NLP_connection"}->{"SERVER"};
     $nlp_port = $config{"NLP_connection"}->{"PORT"};
     $connection_retry = $config{"alvis_connection"}->{"RETRY_CONNECTION"};
#    print STDERR Dumper(\%config);

    my $charset = 'UTF-8';

    #  header and footer

    my $xmlhead="<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
    my $xmlfoot="</documentCollection>\n";

    # connection to the crawler

    my $pipe = new Alvis::Pipeline::Read(port => $config{"alvis_connection"}->{"HARVESTER_PORT"}, spooldir => $config{"alvis_connection"}->{"SPOOLDIR"},
					 loglevel=>10)
	or die "can't create read-pipe on port " . $config{"alvis_connection"}->{"HARVESTER_PORT"} . ": $!";

    $|=1;

    touch($config{"ALVISTMP"} . "/.proc_id");

    &init_server(\%config);

    unlink($config{"ALVISTMP"} . "/.proc_id");
    touch($config{"ALVISTMP"} . "/.proc_id");
    mkpath($config{"alvis_connection"}->{"OUTDIR"});
    my $n=1;

    my $annotated_xml;

    $SIG{'CHLD'}='IGNORE'; # to prevent zombification

    my $sock=new IO::Socket::INET(LocalPort => $config{"NLP_connection"}->{"PORT"},
				  Proto => 'tcp',
				  Listen => 10,
				  Reuse => 1);

    die "Could not create socket: $!\n" unless $sock;

    $sock -> autoflush(1); ###############

    my $client_sock=0;
    my $name;
    my @records;
    my $id;
    my $sub_dir;
    my %processing_id;

    while(1){
	warn "beginning of the loop\n";
	# await client connection
	if ($client_sock=$sock->accept()) {
	    warn "Accepting a connection\n";
	    if (fork() == 0) {
		close($sock);
		binmode($client_sock, ":utf8");
		my ($client_port,$client_iaddr) = sockaddr_in(getpeername($client_sock));
		warn "Getting information about remote host\n";
		$name=gethostbyaddr($client_iaddr,AF_INET);
		&disp_log($name,"Client (".inet_ntoa($client_iaddr).":".$client_port.") has connected.");
		$client_sock -> autoflush(1); ###############
		
		##############################
		# CLIENT HANDLING CODE
		my $line;
		$line=<$client_sock>;
		chomp $line;
		$line=uc $line;
		$line=~m/^\s*([A-Z]+)$/g;

lib/Alvis/NLPPlatform.pm  view on Meta::CPAN

# Append a document id to the list of documents currently being processed.
#
# Arguments:
#   $doc_id   - identifier, written as one line ("$doc_id\n") to the id file
#   $r_config - hash reference; its "ALVISTMP" entry names the directory
#               holding the ".proc_id" file
#
# The ".proc_id" file must already exist: it is opened read/write ("+<"),
# which never creates it.  An exclusive flock is held while the new line is
# appended at the end of the file.
#
# Dies if the file cannot be opened, locked, positioned, written, unlocked
# or closed.
sub record_id {
    my ($doc_id, $r_config) = @_;

    my $file_id = $r_config->{"ALVISTMP"} . "/.proc_id";

    # Mode passed separately so that the file name is never parsed for
    # mode characters.
    my $fh = IO::File->new($file_id, "+<")
        or die "can't read '$file_id': $!";
    flock($fh, LOCK_EX) or die "can't lock '$file_id': $!";

    # Position at the end only once the lock is held, so that the new id
    # lands after anything another process appended in the meantime.
    seek($fh, 0, SEEK_END) or die "can't seek to end of '$file_id': $!";

    $fh->print("$doc_id\n") or die "can't write in '$file_id': $!";

    flock($fh, LOCK_UN) or die "can't unlock '$file_id': $!";
    $fh->close() or die "Truly unbelievable";
}


# Remove a document id from the list of documents currently being processed.
#
# Arguments:
#   $doc_id   - identifier to remove; every line equal to "$doc_id\n" is
#               dropped
#   $r_config - hash reference; its "ALVISTMP" entry names the directory
#               holding the ".proc_id" file
#
# The file is opened once in read/write mode and rewritten in place while a
# single exclusive lock is held.  (Closing and re-opening the file between
# the read and the write would release the lock and truncate the file
# unprotected, so an id appended concurrently by record_id could be lost.)
#
# Dies if the file cannot be opened, locked, repositioned, truncated,
# written, unlocked or closed.
sub delete_id {
    my ($doc_id, $r_config) = @_;

    my $file_id = $r_config->{"ALVISTMP"} . "/.proc_id";
    my $fh = IO::File->new($file_id, "+<")
        or die "can't read '$file_id': $!";
    flock($fh, LOCK_EX) or die "can't lock '$file_id': $!";

    # Collect every line that is not the id being removed.  The explicit
    # defined() keeps a final "0" line without newline from ending the loop.
    my @tab_proc_id;
    while (defined(my $line = $fh->getline())) {
        push @tab_proc_id, $line if $line ne "$doc_id\n";
    }

    # Rewrite the file in place, still under the same lock.
    seek($fh, 0, SEEK_SET) or die "can't seek to start of '$file_id': $!";
    truncate($fh, 0) or die "can't truncate '$file_id': $!";
    foreach my $line (@tab_proc_id) {
        $fh->print($line) or die "can't write in '$file_id': $!";
    }

    flock($fh, LOCK_UN) or die "can't unlock '$file_id': $!";
    $fh->close() or die "Truly unbelievable";
}

# Server start-up recovery: push back into the pipeline every document whose
# id is still listed in the ".proc_id" file.
#
# Arguments:
#   $_[0] - hash reference holding the configuration; the entries read here
#           are {ALVISTMP} (directory of ".proc_id" and of the saved
#           "<doc_id>.xml" files) and {"alvis_connection"}->{"HARVESTER_PORT"}
#           (port of the write-pipe opened on localhost)
#
# For each listed id the file "<ALVISTMP>/<doc_id>.xml" is read in full,
# written to the pipe, and then removed.  A document file that cannot be
# opened is reported with a warning and skipped (nothing is sent for it and
# no removal is attempted).  The ".proc_id" file itself is not modified
# here; an exclusive lock is held on it for the duration of the replay.
#
# Dies if the write-pipe cannot be created, or if ".proc_id" cannot be
# opened, locked, unlocked or closed.
sub init_server {
    my $r_config = $_[0];
    my @tab_proc_id;

    # Collection header/footer are intentionally empty here.
    my $xmlhead=""; #<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
    my $xmlfoot=""; #</documentCollection>\n";

    print STDERR "Starting Server Initialisation ...\n";

    my $harvester_port = $r_config->{"alvis_connection"}->{"HARVESTER_PORT"};
    my $pipe_out = Alvis::Pipeline::Write->new(host => "localhost",
                                               port => $harvester_port,
                                               loglevel => 10)
        or die "can't create ALVIS write-pipe for port '" . $harvester_port . "': $!";

    my $file_id = $r_config->{ALVISTMP} . "/.proc_id";
    my $fh = IO::File->new($file_id, "+<")
        or die "can't read '$file_id': $!";
    flock($fh, LOCK_EX) or die "can't lock '$file_id': $!";

    # One id per line; defined() so that a false-valued line does not end
    # the loop early.
    while (defined(my $line = $fh->getline())) {
        chomp $line;
        push @tab_proc_id, $line;
    }

    warn "Recording " . scalar(@tab_proc_id) ." documents in the pipe...";

    foreach my $doc_id (@tab_proc_id) {
        warn "Recording $doc_id in the pipe...";

        my $doc_file = $r_config->{ALVISTMP} . "/$doc_id.xml";
        my $saved_doc;
        if (!open($saved_doc, '<', $doc_file)) {
            # Do not push an empty record into the pipeline for a document
            # whose saved file is missing or unreadable.
            warn "can't read '$doc_file': $! -- skipping $doc_id";
            next;
        }

        # Read the whole saved document at once.
        my $rec_out = do { local $/; <$saved_doc> };
        $rec_out = "" unless defined $rec_out;
        close($saved_doc);

        $pipe_out->write($xmlhead . $rec_out . $xmlfoot);
        unlink $doc_file;

        warn "$doc_id recorded in the pipe";
    }

    flock($fh, LOCK_UN) or die "can't unlock '$file_id': $!";
    $fh->close() or die "Truly unbelievable";
    print STDERR "Server Initialisation Done\n";
}


# Membership test on a list of token ids.
#
# Arguments:
#   $_[0] - array reference holding token ids
#   $_[1] - token id to look for
#
# Returns 1 as soon as an element string-equal (eq) to the searched id is
# met, and 0 when the list holds no such element.
sub token_id_is_in_list_refid_token
{
    my ($candidate_ids, $wanted_id) = @_;

    for my $candidate (@$candidate_ids) {
        return 1 if $candidate eq $wanted_id;
    }

    return 0;
}

sub token_id_follows_list_refid_token



( run in 0.931 second using v1.01-cache-2.11-cpan-39bf76dae61 )