Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform/Annotation.pm  view on Meta::CPAN

        # Get the document id
	if($line=~/<documentRecord.*?[ \s]+id[ \s]*=[ \s]*"(.+)".*?>/){
	    $document_record_head = $line;
	    $document_record_id=$1;
	}
	# canonicalDocument
	if($line=~/<canonicalDocument[^>]*>/i){$is_in_canonical=1;}
	if($is_in_canonical==1){$canonicalDocument.=$line;}
	if($line=~/<\/canonicalDocument>/i){$is_in_canonical=0;}
	# acquisitionData
	if($line=~/<acquisitionData[^>]*>/i){$is_in_acquisition=1;}
	if($is_in_acquisition==1){$acquisitionData.=$line;}
	if($line=~/<\/acquisitionData>/i){$is_in_acquisition=0;}
	# originalDocument
	if($line=~/<originalDocument[^>\/]*>/i){$is_in_original=1;}
	if($line=~/<originalDocument\/>/i){$is_in_original=0;$originalDocument=$line;}
	if($is_in_original==1){$originalDocument.=$line;}
	if($line=~/<\/originalDocument>/i){$is_in_original=0;}
	# metaData
	if($line=~/<metaData[^>\/]*>/i){$is_in_meta=1;}
	if($line=~/<metaData\/>/i){$is_in_meta=0;$metaData=$line;}
	if($is_in_meta==1){$metaData.=$line;}
	if($line=~/<\/metaData>/i){$is_in_meta=0;}
	# links
	if($line=~/<links[^>\/]*>/i){$is_in_links=1;}
	if($line=~/<links\/>/i){$is_in_links=0;$links=$line;}
	if($is_in_links==1){$links.=$line;}
	if($line=~/<\/links>/i){$is_in_links=0;}
	# analysis
	if($line=~/<analysis[^>\/]*>/i){$is_in_analysis=1;}
	if($line=~/<analysis\/>/i){$is_in_analysis=0; $analysis=$line;}
	if($is_in_analysis==1){$analysis.=$line;}
	if($line=~/<\/analysis>/i){$is_in_analysis=0;}
	# relevance
	if($line=~/<relevance[^>\/]*>/i){$is_in_relevance=1;}
	if($line=~/<relevance\/>/i){$is_in_relevance=0;$relevance=$line;}
	if($is_in_relevance==1){$relevance.=$line;}
	if($line=~/<\/relevance>/i){$is_in_relevance=0;}
	
	# Stop the analysis as soon as the first closing "</documentRecord>" tag is encountered
	if($line=~/<\/documentRecord>/i){
	    if (defined $doc_xml[$i]) {
		$enter .= $doc_xml[$i];
	    }
	    last;
	}
    }

#      print STDERR "$enter\n";

    $enter=~s/ encoding *= *\"([^\"]*)\"/ encoding=\"UTF-8\"/;
    if($enter=~/(<\?xml version="[0-9\.]+")(.*?)([ \s\t]*<documentRecord)/sgo){
	$header=$1.$2;
    }else{
	$enter=$header.$enter;
    }
    $acquisitionData=~/<url>([^<]+)<\/url>/g;
    $documenturl=$1;

    my $string_parse;
    if ((!exists $h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"}) || ($h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"} != 0)) {
	warn "  Loading existing linguistic annotations if necessary\n";
	$parser->parse(Source=>{String=>$enter});
	
	
	# Caveat!!! We assume that there are only named entities in the loaded documents
	$Alvis::NLPPlatform::last_semantic_unit = $myreceiver->{"counter_id"};
	
    
	
    }
    $string_parse =  $myreceiver->{"tab_object"};
    return($string_parse);

}


# print_Annotation($descriptor, $string)
#
# Decodes $string with Encode::decode_utf8 and sends the result to
# $descriptor, which must be a reference:
#   - a GLOB reference, an IO::Socket::INET object, or any other object
#     derived from IO::Handle (IO::File, other IO::Socket classes, ...):
#     the decoded text is printed to it;
#   - a SCALAR reference: the decoded text is appended to the referenced
#     string.
# A reference of any other type cannot receive output: a warning is emitted
# and nothing is written.
# If $descriptor is not a reference at all, an error message is printed on
# STDERR and exit(-1) is called.
#
# Returns 1.
sub print_Annotation
{
    my ($descriptor, $string) = @_;

    my $type = ref($descriptor);

    # Not a reference: nothing can be written anywhere. Kept as a hard exit
    # because existing callers rely on this behaviour.
    unless ($type) {
        print STDERR "Critical error: descriptor is not a reference at all.\n";
        exit(-1);
    }

    # Core module, loaded on demand so the sub does not depend on the file's
    # top-level imports.
    require Scalar::Util;

    # Comparing ref() against a fixed class name misses subclasses and sibling
    # handle classes, so any IO::Handle-derived object is accepted as well.
    my $is_handle = ($type eq "GLOB")
        || ($type eq "IO::Socket::INET")
        || (Scalar::Util::blessed($descriptor) && $descriptor->isa("IO::Handle"));

    if ($is_handle) {
        print {$descriptor} Encode::decode_utf8($string);
    }
    elsif ($type eq "SCALAR") {
        $$descriptor .= Encode::decode_utf8($string);
    }
    else {
        # Previously such descriptors were ignored without any diagnostic.
        warn "print_Annotation: unsupported descriptor type '$type', nothing written\n";
    }

    return(1);
}

1;

__END__


=head1 NAME

Alvis::NLPPlatform::Annotation - Perl extension for managing XML
annotation of documents in the Alvis format



( run in 1.901 second using v1.01-cache-2.11-cpan-39bf76dae61 )