Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/Annotation.pm view on Meta::CPAN
# Get the document id
if($line=~/<documentRecord.*?[ \s]+id[ \s]*=[ \s]*"(.+)".*?>/){
$document_record_head = $line;
$document_record_id=$1;
}
# canonicalDocument
if($line=~/<canonicalDocument[^>]*>/i){$is_in_canonical=1;}
if($is_in_canonical==1){$canonicalDocument.=$line;}
if($line=~/<\/canonicalDocument>/i){$is_in_canonical=0;}
# acquisitionData
if($line=~/<acquisitionData[^>]*>/i){$is_in_acquisition=1;}
if($is_in_acquisition==1){$acquisitionData.=$line;}
if($line=~/<\/acquisitionData>/i){$is_in_acquisition=0;}
# originalDocument
if($line=~/<originalDocument[^>\/]*>/i){$is_in_original=1;}
if($line=~/<originalDocument\/>/i){$is_in_original=0;$originalDocument=$line;}
if($is_in_original==1){$originalDocument.=$line;}
if($line=~/<\/originalDocument>/i){$is_in_original=0;}
# metaData
if($line=~/<metaData[^>\/]*>/i){$is_in_meta=1;}
if($line=~/<metaData\/>/i){$is_in_meta=0;$metaData=$line;}
if($is_in_meta==1){$metaData.=$line;}
if($line=~/<\/metaData>/i){$is_in_meta=0;}
# links
if($line=~/<links[^>\/]*>/i){$is_in_links=1;}
if($line=~/<links\/>/i){$is_in_links=0;$links=$line;}
if($is_in_links==1){$links.=$line;}
if($line=~/<\/links>/i){$is_in_links=0;}
# analysis
if($line=~/<analysis[^>\/]*>/i){$is_in_analysis=1;}
if($line=~/<analysis\/>/i){$is_in_analysis=0; $analysis=$line;}
if($is_in_analysis==1){$analysis.=$line;}
if($line=~/<\/analysis>/i){$is_in_analysis=0;}
# relevance
if($line=~/<relevance[^>\/]*>/i){$is_in_relevance=1;}
if($line=~/<relevance\/>/i){$is_in_relevance=0;$relevance=$line;}
if($is_in_relevance==1){$relevance.=$line;}
if($line=~/<\/relevance>/i){$is_in_relevance=0;}
# Stop the analysis as soon as the first closing "</documentRecord>" tag is encountered
if($line=~/<\/documentRecord>/i){
if (defined $doc_xml[$i]) {
$enter .= $doc_xml[$i];
}
last;
}
}
# print STDERR "$enter\n";
$enter=~s/ encoding *= *\"([^\"]*)\"/ encoding=\"UTF-8\"/;
if($enter=~/(<\?xml version="[0-9\.]+")(.*?)([ \s\t]*<documentRecord)/sgo){
$header=$1.$2;
}else{
$enter=$header.$enter;
}
$acquisitionData=~/<url>([^<]+)<\/url>/g;
$documenturl=$1;
my $string_parse;
if ((!exists $h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"}) || ($h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"} != 0)) {
warn " Loading existing linguistic annotations if necessary\n";
$parser->parse(Source=>{String=>$enter});
# Caveat: we assume that the loaded documents contain only named entities
$Alvis::NLPPlatform::last_semantic_unit = $myreceiver->{"counter_id"};
}
$string_parse = $myreceiver->{"tab_object"};
return($string_parse);
}
# print_Annotation($descriptor, $string)
#
# Sends $string, after passing it through Encode::decode_utf8, to the
# destination designated by $descriptor:
#
#   * a filehandle reference -- a plain GLOB reference or any object whose
#     underlying type is a glob or an IO handle (IO::Socket::INET, IO::File,
#     IO::Handle, *STDOUT{IO}, ...): the text is printed to the handle;
#   * an unblessed SCALAR reference: the text is appended to the referenced
#     scalar.
#
# Returns 1.  When $descriptor is not a reference at all, an error is
# printed on STDERR and the program terminates with exit(-1).  A reference
# of any other kind cannot receive output: a warning is emitted, nothing is
# written, and 1 is still returned so that existing callers keep working.
sub print_Annotation
{
    my ($descriptor, $string) = @_;

    unless (ref($descriptor)) {
        print STDERR "Critical error: descriptor is not a reference at all.\n";
        exit(-1);
    }

    # reftype() reports the underlying type even for blessed references, so
    # handle objects are recognised whatever class they are blessed into
    # (comparing ref() against "GLOB" silently skipped IO::File and friends).
    require Scalar::Util;
    my $underlying_type = Scalar::Util::reftype($descriptor);

    if ($underlying_type eq 'GLOB' || $underlying_type eq 'IO') {
        my $text = Encode::decode_utf8($string);
        print {$descriptor} $text
            or warn "print_Annotation: cannot write to descriptor: $!\n";
    }
    elsif (ref($descriptor) eq 'SCALAR') {
        # In-memory accumulation: append to the caller's scalar.
        $$descriptor .= Encode::decode_utf8($string);
    }
    else {
        # Previously ignored without any diagnostic; make the loss visible.
        warn "print_Annotation: unsupported descriptor type '"
            . ref($descriptor) . "', nothing written\n";
    }

    return(1);
}
1;
__END__
=head1 NAME
Alvis::NLPPlatform::Annotation - Perl extension for managing XML
annotation of documents in the Alvis format
( run in 1.901 second using v1.01-cache-2.11-cpan-39bf76dae61 )