Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/Annotation.pm view on Meta::CPAN
if($is_in_links==1){$links.=$line;}
if($line=~/<\/links>/i){$is_in_links=0;}
# analysis
if($line=~/<analysis[^>\/]*>/i){$is_in_analysis=1;}
if($line=~/<analysis\/>/i){$is_in_analysis=0; $analysis=$line;}
if($is_in_analysis==1){$analysis.=$line;}
if($line=~/<\/analysis>/i){$is_in_analysis=0;}
# relevance
if($line=~/<relevance[^>\/]*>/i){$is_in_relevance=1;}
if($line=~/<relevance\/>/i){$is_in_relevance=0;$relevance=$line;}
if($is_in_relevance==1){$relevance.=$line;}
if($line=~/<\/relevance>/i){$is_in_relevance=0;}
# Stop analysis when the first Go out "</documentRecord>" is encountered
if($line=~/<\/documentRecord>/i){
if (defined $doc_xml[$i]) {
$enter .= $doc_xml[$i];
}
last;
}
}
# print STDERR "$enter\n";
$enter=~s/ encoding *= *\"([^\"]*)\"/ encoding=\"UTF-8\"/;
if($enter=~/(<\?xml version="[0-9\.]+")(.*?)([ \s\t]*<documentRecord)/sgo){
$header=$1.$2;
}else{
$enter=$header.$enter;
}
$acquisitionData=~/<url>([^<]+)<\/url>/g;
$documenturl=$1;
my $string_parse;
if ((!exists $h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"}) || ($h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"} != 0)) {
warn " Loading existing linguistic annotations if necessary\n";
$parser->parse(Source=>{String=>$enter});
# Caveat !!! we assume that there is only named entities in the loaded documents
$Alvis::NLPPlatform::last_semantic_unit = $myreceiver->{"counter_id"};
}
$string_parse = $myreceiver->{"tab_object"};
return($string_parse);
}
# print_Annotation($descriptor, $string)
#
# Decodes $string with Encode::decode_utf8 and sends the resulting text to
# the destination designated by $descriptor:
#
#   * a filehandle reference -- a plain GLOB reference, or any handle object
#     (IO::Socket::INET, IO::File, any other IO::Handle subclass) -- is
#     printed to;
#   * a SCALAR reference gets the text appended to the scalar it points to.
#
# Parameters:
#   $descriptor - reference designating the output destination (see above)
#   $string     - the text to emit, passed through Encode::decode_utf8
#
# Returns 1 in every case where the sub returns at all.
#
# If $descriptor is not a reference, a critical error is printed on STDERR
# and the process is terminated with exit(-1).  A reference of any other
# type cannot receive text: a warning is emitted so that the output is not
# lost silently, and nothing is written.
sub print_Annotation
{
    my ($descriptor, $string) = @_;

    # Guard clause: a plain (non-reference) value can never be a destination.
    unless (ref($descriptor)) {
        print STDERR "Critical error: descriptor is not a reference at all.\n";
        exit(-1);
    }

    # Handles are detected by their underlying type and class hierarchy
    # rather than by an exact class-name comparison, so that handle objects
    # other than exactly IO::Socket::INET are written to as well.
    require Scalar::Util;
    my $base_type = Scalar::Util::reftype($descriptor);
    my $is_handle = $base_type eq 'GLOB'
        || $base_type eq 'IO'
        || (defined(Scalar::Util::blessed($descriptor))
            && $descriptor->isa('IO::Handle'));

    if ($is_handle) {
        print {$descriptor} Encode::decode_utf8($string);
    }
    elsif (ref($descriptor) eq 'SCALAR') {
        ${$descriptor} .= Encode::decode_utf8($string);
    }
    else {
        # Previously this case fell through without any diagnostic.
        warn "print_Annotation: unsupported descriptor type '"
            . ref($descriptor) . "'; nothing written\n";
    }

    return(1);
}
1;
__END__
=head1 NAME
Alvis::NLPPlatform::Annotation - Perl extension for managing XML
annotation of documents in the Alvis format
=head1 SYNOPSIS
use Alvis::NLPPlatform::Annotation;
Alvis::NLPPlatform::Annotation::load_xml($doc_xml);
Alvis::NLPPlatform::Annotation::render_xml($doc_xml, \*STDOUT);
=head1 DESCRIPTION
This module provides two main methods (C<load_xml> and C<render_xml>)
for loading and dumping XML annotated documents conforming to the Alvis
DTD (see http://www.alvis/info ).
Documents are read from the standard input and loaded into a hash
table. Annotated documents are written to a file through the
descriptor given as parameter. Note that the input documents can be
annotated or not, or even partially annotated.
=head1 METHODS
=head2 read_key_id()
read_key_id($element_id);
This method returns the number in the id (C<$element_id>) of the token
or word XML element (10 in the element id 'token10').
=head2 sort_keys()
sort_keys($element_id1, $element_id2);
This method sorts two xml element ids (C<$element_id1> and
C<$element_id2>) after removing the string referring to the type of the
xml element ("token", "word", etc.).
=head2 sort()
sort($ref_hashtable)
( run in 1.143 second using v1.01-cache-2.11-cpan-39bf76dae61 )