Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform/Annotation.pm  view on Meta::CPAN

	if($is_in_links==1){$links.=$line;}
	if($line=~/<\/links>/i){$is_in_links=0;}
	# analysis
	if($line=~/<analysis[^>\/]*>/i){$is_in_analysis=1;}
	if($line=~/<analysis\/>/i){$is_in_analysis=0; $analysis=$line;}
	if($is_in_analysis==1){$analysis.=$line;}
	if($line=~/<\/analysis>/i){$is_in_analysis=0;}
	# relevance
	if($line=~/<relevance[^>\/]*>/i){$is_in_relevance=1;}
	if($line=~/<relevance\/>/i){$is_in_relevance=0;$relevance=$line;}
	if($is_in_relevance==1){$relevance.=$line;}
	if($line=~/<\/relevance>/i){$is_in_relevance=0;}
	
	# Stop the analysis as soon as the first closing "</documentRecord>" tag is encountered
	if($line=~/<\/documentRecord>/i){
	    if (defined $doc_xml[$i]) {
		$enter .= $doc_xml[$i];
	    }
	    last;
	}
    }

#      print STDERR "$enter\n";

    $enter=~s/ encoding *= *\"([^\"]*)\"/ encoding=\"UTF-8\"/;
    if($enter=~/(<\?xml version="[0-9\.]+")(.*?)([ \s\t]*<documentRecord)/sgo){
	$header=$1.$2;
    }else{
	$enter=$header.$enter;
    }
    $acquisitionData=~/<url>([^<]+)<\/url>/g;
    $documenturl=$1;

    my $string_parse;
    if ((!exists $h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"}) || ($h_config->{"XML_INPUT"}->{"LINGUISTIC_ANNOTATION_LOADING"} != 0)) {
	warn "  Loading existing linguistic annotations if necessary\n";
	$parser->parse(Source=>{String=>$enter});
	
	
	# Caveat !!! we assume that there are only named entities in the loaded documents
	$Alvis::NLPPlatform::last_semantic_unit = $myreceiver->{"counter_id"};
	
    
	
    }
    $string_parse =  $myreceiver->{"tab_object"};
    return($string_parse);

}


# print_Annotation($descriptor, $string)
#
# Decodes $string from UTF-8 (Encode::decode_utf8) and sends the result to
# the output designated by $descriptor.
#
# Parameters:
#   $descriptor - a reference designating the output:
#                   * a SCALAR reference: the decoded text is appended to
#                     the referenced scalar;
#                   * a GLOB reference (e.g. \*STDOUT or a lexical file
#                     handle): the decoded text is printed on it;
#                   * any object derived from IO::Handle (IO::Socket::INET,
#                     IO::File, ...): the decoded text is printed on it.
#   $string     - the text to write, decoded with Encode::decode_utf8.
#
# Returns 1 when the text has been handed to a supported descriptor, 0 (after
# a warning) when the reference type is not supported.
#
# If $descriptor is not a reference at all, an error message is printed on
# STDERR and the whole process exits with status -1.
#
# NOTE(review): decoded text is printed on handles as-is; whether the handle
# has a suitable encoding layer is presumably the caller's responsibility --
# confirm.
sub print_Annotation
{
    my ($descriptor, $string) = @_;

    # A plain value cannot designate any output: this is a fatal error.
    unless (ref($descriptor)) {
        print STDERR "Critical error: descriptor is not a reference at all.\n";
        exit(-1);
    }

    # Loaded locally so that the sub does not depend on the file-level imports.
    require Scalar::Util;

    my $type    = ref($descriptor);
    my $decoded = Encode::decode_utf8($string);

    if ($type eq "SCALAR") {
        $$descriptor .= $decoded;
        return(1);
    }

    # ref() returns the class name for blessed handles, so comparing it with
    # a fixed list of names would silently drop the output for every handle
    # class that is not listed. Accept any IO::Handle derivative instead.
    my $is_handle_object = Scalar::Util::blessed($descriptor)
        && $descriptor->isa("IO::Handle");

    if (($type eq "GLOB") || $is_handle_object) {
        print {$descriptor} $decoded;
        return(1);
    }

    # Nothing was written: report it rather than pretending success.
    warn "print_Annotation: unsupported descriptor type '$type', nothing written\n";
    return(0);
}

1;

__END__


=head1 NAME

Alvis::NLPPlatform::Annotation - Perl extension for managing XML
annotation of documents in the Alvis format

=head1 SYNOPSIS

    use Alvis::NLPPlatform::Annotation;

    Alvis::NLPPlatform::Annotation::load_xml($doc_xml);

    Alvis::NLPPlatform::Annotation::render_xml($doc_xml, \*STDOUT);

=head1 DESCRIPTION

This module provides two main methods (C<load_xml> and C<render_xml>)
for loading and dumping XML annotated documents conforming to the Alvis
DTD (see http://www.alvis.info ).

Documents are read from the standard input and loaded into a hash
table. Annotated documents are written to a file through the
descriptor given as parameter. Note that the input documents can be
annotated or not, or even partially annotated.

=head1 METHODS




=head2 read_key_id()

    read_key_id($element_id);

This method returns the number in the id (C<$element_id>) of the token
or word XML element (10 in the element id 'token10').



=head2 sort_keys()

    sort_keys($element_id1, $element_id2);

This method sorts two XML element ids (C<$element_id1> and
C<$element_id2>) after removing the string referring to the type of the
XML element ("token", "word", etc.).



=head2 sort()

    sort($ref_hashtable)



( run in 1.143 second using v1.01-cache-2.11-cpan-39bf76dae61 )