Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform/NLPWrappers.pm  view on Meta::CPAN


#use Encode;
use Encode qw(:fallbacks);;

# This module takes its version number from the Alvis::NLPPlatform package
# variable rather than defining its own.
our $VERSION=$Alvis::NLPPlatform::VERSION;

# File-scoped arrays for a term list and a matching list of regexes.
# NOTE(review): the "EN" suffix presumably means English; neither array is
# used in the part of the file visible here -- confirm where they are
# filled and read.
my @term_list_EN;
my @regex_term_list_EN;

# Same pair of arrays with the "FR" suffix (presumably French -- confirm).
my @term_list_FR;
my @regex_term_list_FR;



sub tokenize
{
###################################################
    my ($class, $h_config, $doc_hash) = @_;
    my $line;
    my @characters;
    my $char;

    my $offset;
    my $current_char;
    my $last_char;
    my $length;
    my $string;
    my $token_id;

    my $section_id = 0;
    
    my $alpha="[A-Za-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}]";

    my $num="[0-9]";
    my $sep="[ \\s\\t\\n\\r]";
    
    my $canonical;
    my @lines;

#     my $shift_offset = 0;
#     my $shift_offset_prec = 0;

    my $dtype;

    my $token_id_str;

###################################################


    $offset=0;
    $last_char=0;
    $length=0;
    $string="";
    $token_id=1;


    
    print STDERR "  Tokenizing...           ";
    
    $canonical = $Alvis::NLPPlatform::Annotation::canonicalDocument;
    $canonical = Alvis::NLPPlatform::Canonical::CleanUp($canonical, $h_config->{"XML_INPUT"}->{"PRESERVEWHITESPACE"});

    @lines=split /\n/,$canonical;
#     map {$_ .= "\n"} @lines;

    foreach $line(@lines)
    {
	$line .= "\n";
	# convert SGML into characters
	
	# character splitting
	@characters=split //, $line;

	foreach $char(@characters){
	    
	    # determine the type of the current character
	    $current_char=4; # default type
	    if($char=~/$alpha/o){$current_char=1;# print STDERR "$char : OK\n";
			     }
	    if($char=~/$num/o){$current_char=2;}
	    if($char=~/$sep/o){$current_char=3;}
	    # comparison with last seen character

	    # if it is the same ...
	    if(($current_char==$last_char) && ($current_char!=4)){
		$string=$string . $char;
		$length++;
	    }else{
		if($length>0){
		    #######################################################
		    if($last_char==1){$dtype="alpha";}
		    if($last_char==2){$dtype="num";}
		    if($last_char==3){$dtype="sep";}
		    if($last_char==4){$dtype="symb";}
		    $token_id_str = "token$token_id";
		    $doc_hash->{$token_id_str}={};
		    $doc_hash->{$token_id_str}->{'datatype'}="token";
		    $doc_hash->{$token_id_str}->{'type'}=$dtype;
		    $doc_hash->{$token_id_str}->{'id'}=$token_id_str;
		    $doc_hash->{$token_id_str}->{'from'}=$offset;
		    $doc_hash->{$token_id_str}->{'to'}=$offset+$length-1;

		    while(($section_id < scalar(@Alvis::NLPPlatform::tab_end_sections_byaddr)) && ($Alvis::NLPPlatform::tab_end_sections_byaddr[$section_id]  <= $offset + $length - 1)) {
			push @Alvis::NLPPlatform::tab_end_sections_bytoken, $token_id_str; #"token$token_id";
			$section_id++;
		    }

		    if($last_char==3){
			$string=~s/\n/\\n/go;
			$string=~s/\r/\\r/go;
			$string=~s/\t/\\t/go;
		    }
		    $doc_hash->{$token_id_str}->{"content"}=$string;	
		    $Alvis::NLPPlatform::hash_tokens{$token_id_str}=$string;
		    $token_id++;
		    $offset+=$length;
		    #######################################################
		}
		$length=1;
		$string=$char;
		$last_char=$current_char;

lib/Alvis/NLPPlatform/NLPWrappers.pm  view on Meta::CPAN

    my $last_en;

    my $corpus_filename;
    my $result_filename;

    print STDERR "  Named entites tagging...     ";
    
    $corpus="";

    foreach $token(Alvis::NLPPlatform::Annotation::sort(\%Alvis::NLPPlatform::hash_tokens)){
	$tok_ct=$Alvis::NLPPlatform::hash_tokens{$token}; # why not $token ? (TH)
	Alvis::NLPPlatform::XMLEntities::decode($tok_ct);

	# (TH) These replacements are required to work around a bug in
	# TagEN: a named entity following a \n is not analysed, because
	# the "n" is concatenated with the next word.

	$tok_ct=~s/\\n/\\n /go;
	$tok_ct=~s/\\r/\\r /go;
	$tok_ct=~s/\\t/\\t /go;
	$corpus.=$tok_ct;
	push @tab_tokens,$tok_ct;
    }

    $corpus_filename = $h_config->{'TMPFILE'} . ".corpus_en.txt";
    
    open CORPUS,">$corpus_filename";
#     binmode(CORPUS,":utf8");


    print CORPUS Encode::encode_utf8($corpus);
    close CORPUS;

    print STDERR "done\n";
    
    my $command_line;
    if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
	$command_line = $h_config->{'NLP_tools'}->{'NETAG_FR'} . " $corpus_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
    } else {
	$command_line = $h_config->{'NLP_tools'}->{'NETAG_EN'} . " $corpus_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
    }
    # nice idea, though TagEN seems to return 0 anyhow...
    #`$command_line` && print STDERR "FAILED TO EXECUTE \"$command_line\": &!\n";
    `$command_line`;
    $Alvis::NLPPlatform::ALVISDEBUG || unlink $corpus_filename;
    @Alvis::NLPPlatform::en_start=();
    @Alvis::NLPPlatform::en_end=();
    @Alvis::NLPPlatform::en_type=();

    $result_filename = $h_config->{'TMPFILE'} . ".corpus_en.tag.txt";

    open REN,"<$result_filename"  or warn "Can't open the file $result_filename";
    binmode REN;
    while($line=<REN>){
	($NE_type, $NE_start, $NE_end) = split /\t/, $line;
# 	$line=~m/(.+)\s+([0-9]+)\s+([0-9]+)/;
# 	$NE_type = $1;
# 	$NE_start = $2;
# 	$NE_end = $3;
	push @Alvis::NLPPlatform::en_type,$NE_type;
	if ((exists($h_config->{'XML_INPUT'}->{"PRESERVEWHITESPACE"})) && ($h_config->{'XML_INPUT'}->{"PRESERVEWHITESPACE"})) {
	    push @Alvis::NLPPlatform::en_start,($NE_start-1);
	    push @Alvis::NLPPlatform::en_end,($NE_end-1);
	} else {
	    push @Alvis::NLPPlatform::en_start,$NE_start;
	    push @Alvis::NLPPlatform::en_end,$NE_end;
	}
    }
    close REN;

    $Alvis::NLPPlatform::ALVISDEBUG || unlink $result_filename;

#    print STDERR scalar(@Alvis::NLPPlatform::en_type) . " to find\n";

    print STDERR "  Matching EN with tokens...   ";

    # scan tokens and match with NE

    @Alvis::NLPPlatform::en_tokens_start=();
    @Alvis::NLPPlatform::en_tokens_end=();
    %Alvis::NLPPlatform::en_tokens_hash=();
    $number_of_tokens=scalar @tab_tokens;

    $en=$Alvis::NLPPlatform::last_semantic_unit+1;
    $last_en=0;

    my $en_str = "";
    for($t=0;$t<$number_of_tokens;$t++){
	print STDERR "\r  Matching EN with tokens...   ".($t+1)."/".$number_of_tokens." ";
	for($i=$last_en;$i<scalar @Alvis::NLPPlatform::en_start;$i++){
# 	    print STDERR "\ti = $i :: last_en = $last_en\n";
	    if($Alvis::NLPPlatform::en_start[$i]==$offset){
# 		print STDERR "Found\n";
		$last_en=$i;
		$Alvis::NLPPlatform::en_tokens_start[$en]="token".($t+1);
		$Alvis::NLPPlatform::en_tokens_hash{($t+1)}=$en;
		$start=$t+1;
		while($Alvis::NLPPlatform::en_end[$i]>$offset-1){
		    $Alvis::NLPPlatform::en_tokens_end[$en]="token".($t+1);
		    $end=$t+1;
		    $offset+=length($tab_tokens[$t]);
		    $t++;
		}
		$en_str = "semantic_unit$en";
		
		$doc_hash->{$en_str}={};
		$doc_hash->{$en_str}->{"datatype"}="semantic_unit";
		$doc_hash->{$en_str}->{"named_entity"}={};
		$doc_hash->{$en_str}->{"named_entity"}->{"datatype"}="named_entity";
		$doc_hash->{$en_str}->{"named_entity"}->{"named_entity_type"}=$Alvis::NLPPlatform::en_type[$i];
		$doc_hash->{$en_str}->{"named_entity"}->{"id"}="named_entity$en";

		$ref_tab=$doc_hash->{$en_str}->{"named_entity"}->{"list_refid_token"}={};
		$ref_tab->{'datatype'}="list_refid_token";
		$en_cont="";
		$refid_n=1;
		my @tab_tokens_en;
		$ref_tab->{"refid_token"}=\@tab_tokens_en;
		for($j=$start;$j<=$end;$j++){
		    push @tab_tokens_en, "token$j";
		    $refid_n++;



( run in 1.286 second using v1.01-cache-2.11-cpan-39bf76dae61 )