Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform.pm view on Meta::CPAN
# print STDERR $doc;
@records=&split_to_docRecs($doc);
$Alvis::NLPPlatform::last_doc = 0;
unlink $config->{'ALVISTMP'} . "/$HOSTNAME.$$.corpus.yatea.tmp";
for($i=0;$i <scalar(@records); $i++) {
if ($i == $#records) {
$Alvis::NLPPlatform::last_doc = 1;
}
$rec = $records[$i];
($id,$docR)=@$rec;
warn "Process document $id\n";
open FILETMP_OUT, ">$tmpfile";
binmode(FILETMP_OUT, ":utf8");
# binmode(FILETMP_OUT);
# print FILETMP_OUT Encode::decode_utf8($doc);
Alvis::NLPPlatform::platform_reset();
$render_time = Alvis::NLPPlatform::standalone_main($config, $docR, \*FILETMP_OUT, 1); #${$tab_docs_xml->[$doc_num]}[1] ; ${$ref_doc}[1]
close(FILETMP_OUT);
open FILETMP_OUT, "$tmpfile" or die "No such file or directory\n";
@cur_doc = <FILETMP_OUT>;
$j = 0;
while(($j< scalar @cur_doc) && ($cur_doc[$j] !~ s/\@RENDER_TIME_NOT_SET\@/$render_time/)) {
$j++;
}
close(FILETMP_OUT);
if (!((exists $config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"}) && ($config->{"XML_OUTPUT"}->{"NO_STD_XML_OUTPUT"} == 1))) {
if (scalar(@records) > 1) {
if ($i == 0){
pop @cur_doc;
} else {
shift @cur_doc;
shift @cur_doc;
}
}
# push @doc_collection_out, @cur_doc;
print @cur_doc;
}
$time_total=$time_load+$time_tok+$time_ne+$time_word+$time_sent+$time_pos+$time_lemm+$time_term+$time_synt + $time_semtag + $time_render;
warn "Total processing time: $time_total\n";
}
# print STDERR "$tmpfile\n";
unlink $tmpfile;
# return @cur_doc;
return @doc_collection_out;
}
sub standalone_main {
my $h_config = $_[0];
my $doc_xml = $_[1];
my $descriptor = $_[2];
my $printCollectionHeaderFooter = $_[3];
my $xmlhead="";#"<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
my $xmlfoot="";#</documentCollection>\n";
my $doc_hash;
$last_semantic_unit=0;
$last_semantic_feature = 0;
$cur_doc_nb=1;
compute_dependencies($h_config);
$NLPTOOLS=$h_config->{'NLP_tools_root'};
$ALVISTMP=$h_config->{'ALVISTMP'};
$HOSTNAME=hostname
$ALVISRSC=$h_config->{'NLP_misc'}->{'NLP_resources'};
if (!exists $h_config->{'TMPFILE'}) {
$h_config->{'TMPFILE'}="$ALVISTMP/$HOSTNAME.$$";
}
$ALVISLOGFILE= "$ALVISTMP/alvis.$HOSTNAME.$$.log";
if (exists $h_config->{'DEBUG'}) {
$ALVISDEBUG = $h_config->{'DEBUG'};
}
print STDERR "\n";
$time_load=0;
$time_tok=0;
$time_ne=0;
$time_word=0;
$time_sent=0;
$time_pos=0;
$time_lemm=0;
$time_term=0;
$time_render=0;
# Load document record
print STDERR "Loading DR... ";
undef %$doc_hash;
%$doc_hash=();
$doc_hash=0;
%hash_tokens=();
$dont_annotate=0;
%hash_words=();
%hash_words_punct=();
%hash_sentences=();
%hash_postags=();
@word_start=();
@word_end=();
%last_words=();
@found_terms=();
@found_terms_tidx=();
@found_terms_smidx=();
@found_terms_phr=();
@found_terms_words=();
$phrase_idx=1;
@tab_errors=();
starttimer();
# $doc_xml =~ s/("<\?xml version=\"1.0\" encoding=\"$charset\"?>\n
$doc_hash=Alvis::NLPPlatform::Annotation::load_xml($doc_xml, $h_config);
$time_load+=endtimer();
# Recording computing data (time and entity size)
# init
$doc_hash->{"log_processing0"}->{"datatype"}="log_processing";
$doc_hash->{"log_processing0"}->{"log_id"} = "time";
$doc_hash->{"log_processing1"}->{"datatype"}="log_processing";
$doc_hash->{"log_processing1"}->{"log_id"} = "element_size";
$doc_hash->{"log_processing2"}->{"datatype"}="log_processing";
$doc_hash->{"log_processing2"}->{"log_id"} = "host";
$doc_hash->{"log_processing2"}->{"comments"} = $HOSTNAME;
# Recording statistical data (time and entity size)
# XML loading time
my @tmp_c;
$doc_hash->{"log_processing0"}->{"comments"} = \@tmp_c;
push @{$doc_hash->{"log_processing0"}->{"comments"}}, "XML loading Time : $time_load";
print STDERR "\tXML loading Time : $time_load\n";
my @tmp_d;
$doc_hash->{"log_processing1"}->{"comments"} = \@tmp_d;
if($doc_hash!=0)
{
print STDERR "done - documentRecord ".$Alvis::NLPPlatform::Annotation::document_record_id;
print STDERR " (document $cur_doc_nb)\n";
Alvis::NLPPlatform::linguistic_annotation($h_config, $doc_hash);
# Save to XML file
$cur_doc_nb++;
print STDERR "Rendering XML... ";
starttimer();
$time_render = 0;
push @{$doc_hash->{"log_processing0"}->{"comments"}}, "XML rendering Time : \@RENDER_TIME_NOT_SET\@";
Alvis::NLPPlatform::Annotation::render_xml($doc_hash, $descriptor, $printCollectionHeaderFooter, $h_config);
$time_render+=endtimer();
# TODO : recording the xml rendering time
# Recording statistical data (time and entity size)
# XML rendering (unsuable)
print STDERR "done\n";
print STDERR "\tXML rendering Time : $time_render\n";
}else{
print STDERR "done parsing - no more documents.\n";
last;
}
print STDERR "\n";
# log errors
open LOGERRORS,">>$ALVISLOGFILE";
if(scalar @tab_errors>0){
print LOGERRORS "Document $Alvis::NLPPlatform::Annotation::document_record_id (number $cur_doc_nb)\n";
foreach $log_entry(@tab_errors){
lib/Alvis/NLPPlatform.pm view on Meta::CPAN
warn "Receiving SIGINT -- Aborting NL processing\n";
do {
$sock=new IO::Socket::INET( PeerAddr => $nlp_host,
PeerPort => $nlp_port,
Proto => 'tcp');
warn "Could not create socket: $! \n" unless $sock;
$connection_retry--;
sleep(1);
} while(!defined($sock) && ($connection_retry >0));
if ($connection_retry ==0) {
die "Timeout. Could not create socket: $! \n";
}
$sock -> autoflush(1); ###############
binmode $sock, ":utf8";
print STDERR "Established connection to server.\n";
print STDERR "Sending aborting message\n";
print $sock "ABORTING\n$id\n";
print STDERR "Aborting message sent\n";
print STDERR "Awaiting acknowledgement...";
my $line;
while($line=<$sock>){
chomp $line;
$line=uc $line;
if($line=~/ACK/gi){
close($sock);
last;
}
}
print STDERR "OK.\n";
close($sock);
exit;
}
sub server
{
my ($rcfile) = @_;
print STDERR "config File : $rcfile \n";
my %config = Alvis::NLPPlatform::load_config($rcfile);
$nlp_host = $config{"NLP_connection"}->{"SERVER"};
$nlp_port = $config{"NLP_connection"}->{"PORT"};
$connection_retry = $config{"alvis_connection"}->{"RETRY_CONNECTION"};
# print STDERR Dumper(\%config);
my $charset = 'UTF-8';
# header and footer
my $xmlhead="<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
my $xmlfoot="</documentCollection>\n";
# connection to the crawler
my $pipe = new Alvis::Pipeline::Read(port => $config{"alvis_connection"}->{"HARVESTER_PORT"}, spooldir => $config{"alvis_connection"}->{"SPOOLDIR"},
loglevel=>10)
or die "can't create read-pipe on port " . $config{"alvis_connection"}->{"HARVESTER_PORT"} . ": $!";
$|=1;
touch($config{"ALVISTMP"} . "/.proc_id");
&init_server(\%config);
unlink($config{"ALVISTMP"} . "/.proc_id");
touch($config{"ALVISTMP"} . "/.proc_id");
mkpath($config{"alvis_connection"}->{"OUTDIR"});
my $n=1;
my $annotated_xml;
$SIG{'CHLD'}='IGNORE'; # to prevent zombification
my $sock=new IO::Socket::INET(LocalPort => $config{"NLP_connection"}->{"PORT"},
Proto => 'tcp',
Listen => 10,
Reuse => 1);
die "Could not create socket: $!\n" unless $sock;
$sock -> autoflush(1); ###############
my $client_sock=0;
my $name;
my @records;
my $id;
my $sub_dir;
my %processing_id;
while(1){
warn "beginning of the loop\n";
# await client connection
if ($client_sock=$sock->accept()) {
warn "Accepting a connection\n";
if (fork() == 0) {
close($sock);
binmode($client_sock, ":utf8");
my ($client_port,$client_iaddr) = sockaddr_in(getpeername($client_sock));
warn "Getting information about remote host\n";
$name=gethostbyaddr($client_iaddr,AF_INET);
&disp_log($name,"Client (".inet_ntoa($client_iaddr).":".$client_port.") has connected.");
$client_sock -> autoflush(1); ###############
##############################
# CLIENT HANDLING CODE
my $line;
$line=<$client_sock>;
chomp $line;
$line=uc $line;
$line=~m/^\s*([A-Z]+)$/g;
lib/Alvis/NLPPlatform.pm view on Meta::CPAN
# record_id($doc_id, $r_config)
#
# Appends the line "$doc_id\n" to the file "<ALVISTMP>/.proc_id", where
# <ALVISTMP> is taken from $r_config->{"ALVISTMP"}.  The append is done
# while holding an exclusive flock() on the file.
#
# Parameters:
#   $doc_id   - identifier to record; written as a single line
#   $r_config - hash reference; only its "ALVISTMP" entry is read here
#
# Returns the (true) result of the final close().
#
# Dies when the file cannot be opened (mode "+<" never creates it, so it
# must already exist), locked, positioned, written, unlocked or closed.
sub record_id {
    my ($doc_id, $r_config) = @_;

    my $file_id = $r_config->{"ALVISTMP"} . "/.proc_id";

    # The mode is passed as a separate argument so that the file name is
    # never parsed for mode characters or surrounding whitespace.
    my $fh = IO::File->new($file_id, "+<")
        or die "can't read '$file_id': $!";

    flock($fh, LOCK_EX) or die "can't lock '$file_id': $!";

    # Position at the end only once the lock is held, so the new line goes
    # after anything another locker appended in the meantime.
    seek($fh, 0, SEEK_END) or die "can't seek to end of '$file_id': $!";

    $fh->print("$doc_id\n") or die "can't write in '$file_id': $!";

    # perl flushes the handle before unlocking (see perldoc -f flock), so
    # the new line is on disk before the lock is released.
    flock($fh, LOCK_UN) or die "can't unlock '$file_id': $!";
    $fh->close() or die "Truly unbelievable";
}
# delete_id($doc_id, $r_config)
#
# Removes every line equal to "$doc_id\n" from the file
# "<ALVISTMP>/.proc_id", where <ALVISTMP> is $r_config->{"ALVISTMP"}.
# All other lines are kept, in their original order.
#
# The file is opened once in read/write mode and rewritten in place under
# a single exclusive flock().  Closing the handle and reopening the file
# with ">" would drop the lock before truncating, so a line appended by a
# concurrent locker in that window could be lost.
#
# Parameters:
#   $doc_id   - identifier to remove
#   $r_config - hash reference; only its "ALVISTMP" entry is read here
#
# Returns the (true) result of the final close().
#
# Dies when the file cannot be opened (it must already exist), locked,
# rewound, truncated, written, unlocked or closed.
sub delete_id {
    my ($doc_id, $r_config) = @_;

    my $file_id = $r_config->{"ALVISTMP"} . "/.proc_id";

    my $fh = IO::File->new($file_id, "+<")
        or die "can't read '$file_id': $!";
    flock($fh, LOCK_EX) or die "can't lock '$file_id': $!";

    # Collect the lines to keep.  The explicit defined() matters: a method
    # call in a while condition gets no implicit defined test, so a last
    # line such as "0" without a trailing newline would end the loop early.
    my @tab_proc_id;
    while (defined(my $line = $fh->getline())) {
        push @tab_proc_id, $line if $line ne "$doc_id\n";
    }

    # Rewrite the file in place while the lock is still held.
    seek($fh, 0, SEEK_SET) or die "can't seek to start of '$file_id': $!";
    truncate($fh, 0) or die "can't truncate '$file_id': $!";
    foreach my $kept (@tab_proc_id) {
        $fh->print($kept) or die "can't write in '$file_id': $!";
    }

    # perl flushes the handle before unlocking (see perldoc -f flock).
    flock($fh, LOCK_UN) or die "can't unlock '$file_id': $!";
    $fh->close() or die "Truly unbelievable";
}
# init_server($r_config)
#
# Reads the identifiers listed (one per line) in "<ALVISTMP>/.proc_id"
# and, for each of them, writes the content of "<ALVISTMP>/<id>.xml" to an
# Alvis::Pipeline::Write pipe connected to localhost on the configured
# HARVESTER_PORT, then deletes that XML file.  An exclusive flock() on
# ".proc_id" is held while this happens.
#
# Parameters:
#   $r_config - hash reference; the entries read here are {ALVISTMP} and
#               {"alvis_connection"}->{"HARVESTER_PORT"}
#
# Returns the (true) result of the final print to STDERR.
#
# Dies when the write-pipe cannot be created, or when ".proc_id" cannot be
# opened (it must already exist), locked, unlocked or closed.  An XML file
# that cannot be opened is reported with a warning and skipped, so no
# empty record is pushed into the pipe for it.
sub init_server {
    my $r_config = $_[0];

    # Kept (empty) so that a collection header/footer can be re-enabled:
    my $xmlhead = ""; #<?xml version=\"1.0\" encoding=\"$charset\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
    my $xmlfoot = ""; #</documentCollection>\n";

    print STDERR "Starting Server Initialisation ...\n";

    my $pipe_out = Alvis::Pipeline::Write->new(host => "localhost",
                                               port => $r_config->{"alvis_connection"}->{"HARVESTER_PORT"},
                                               loglevel => 10)
        or die "can't create ALVIS write-pipe for port '" . $r_config->{"alvis_connection"}->{"HARVESTER_PORT"} . "': $!";

    my $file_id = $r_config->{ALVISTMP} . "/.proc_id";
    my $fh = IO::File->new($file_id, "+<")
        or die "can't read '$file_id': $!";
    flock($fh, LOCK_EX) or die "can't lock '$file_id': $!";

    # Explicit defined(): a method call in a while condition gets no
    # implicit defined test, so a false-valued last line would end the loop.
    my @tab_proc_id;
    while (defined(my $line = $fh->getline())) {
        chomp $line;
        push @tab_proc_id, $line;
    }

    warn "Recording " . scalar(@tab_proc_id) ." documents in the pipe...";
    foreach my $doc_id (@tab_proc_id) {
        warn "Recording $doc_id in the pipe...";

        my $xml_path = $r_config->{ALVISTMP} . "/$doc_id.xml";
        my $xml_fh;
        if (!open($xml_fh, '<', $xml_path)) {
            # Nothing to send for this identifier; do not write an empty
            # record into the pipe.
            warn "can't read '$xml_path': $!\n";
            next;
        }
        my $rec_out = join '', <$xml_fh>;
        close $xml_fh;

        $pipe_out->write($xmlhead . $rec_out . $xmlfoot);
        unlink $xml_path;
        warn "$doc_id recorded in the pipe";
    }

    flock($fh, LOCK_UN) or die "can't unlock '$file_id': $!";
    $fh->close() or die "Truly unbelievable";
    print STDERR "Server Initialisation Done\n";
}
# token_id_is_in_list_refid_token($list_refid_token, $token_to_search)
#
# Membership test using string equality.
#
# Parameters:
#   $list_refid_token - array reference of token identifiers
#   $token_to_search  - identifier looked for in that array
#
# Returns 1 as soon as an element equal (eq) to $token_to_search is found,
# 0 when the whole array has been scanned without a match.
sub token_id_is_in_list_refid_token
{
    my ($list_refid_token, $token_to_search) = @_;

    foreach my $candidate (@$list_refid_token) {
        return 1 if $candidate eq $token_to_search;
    }

    return 0;
}
sub token_id_follows_list_refid_token
( run in 0.931 second using v1.01-cache-2.11-cpan-39bf76dae61 )