Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN
#use Encode;
use Encode qw(:fallbacks);;
# Reuse $Alvis::NLPPlatform::VERSION as this package's $VERSION.
our $VERSION=$Alvis::NLPPlatform::VERSION;
# File-scoped lexical arrays, declared empty here. They are not referenced
# in the part of the file visible in this excerpt.
# NOTE(review): presumably caches of term lists and their regex forms for
# English (EN) and French (FR) -- confirm against the code that fills them.
my @term_list_EN;
my @regex_term_list_EN;
my @term_list_FR;
my @regex_term_list_FR;
sub tokenize
{
###################################################
my ($class, $h_config, $doc_hash) = @_;
my $line;
my @characters;
my $char;
my $offset;
my $current_char;
my $last_char;
my $length;
my $string;
my $token_id;
my $section_id = 0;
my $alpha="[A-Za-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}]";
my $num="[0-9]";
my $sep="[ \\s\\t\\n\\r]";
my $canonical;
my @lines;
# my $shift_offset = 0;
# my $shift_offset_prec = 0;
my $dtype;
my $token_id_str;
###################################################
$offset=0;
$last_char=0;
$length=0;
$string="";
$token_id=1;
print STDERR " Tokenizing... ";
$canonical = $Alvis::NLPPlatform::Annotation::canonicalDocument;
$canonical = Alvis::NLPPlatform::Canonical::CleanUp($canonical, $h_config->{"XML_INPUT"}->{"PRESERVEWHITESPACE"});
@lines=split /\n/,$canonical;
# map {$_ .= "\n"} @lines;
foreach $line(@lines)
{
$line .= "\n";
# convert SGML into characters
# character splitting
@characters=split //, $line;
foreach $char(@characters){
# determine the type of the current character
$current_char=4; # default type
if($char=~/$alpha/o){$current_char=1;# print STDERR "$char : OK\n";
}
if($char=~/$num/o){$current_char=2;}
if($char=~/$sep/o){$current_char=3;}
# comparison with last seen character
# if it is the same ...
if(($current_char==$last_char) && ($current_char!=4)){
$string=$string . $char;
$length++;
}else{
if($length>0){
#######################################################
if($last_char==1){$dtype="alpha";}
if($last_char==2){$dtype="num";}
if($last_char==3){$dtype="sep";}
if($last_char==4){$dtype="symb";}
$token_id_str = "token$token_id";
$doc_hash->{$token_id_str}={};
$doc_hash->{$token_id_str}->{'datatype'}="token";
$doc_hash->{$token_id_str}->{'type'}=$dtype;
$doc_hash->{$token_id_str}->{'id'}=$token_id_str;
$doc_hash->{$token_id_str}->{'from'}=$offset;
$doc_hash->{$token_id_str}->{'to'}=$offset+$length-1;
while(($section_id < scalar(@Alvis::NLPPlatform::tab_end_sections_byaddr)) && ($Alvis::NLPPlatform::tab_end_sections_byaddr[$section_id] <= $offset + $length - 1)) {
push @Alvis::NLPPlatform::tab_end_sections_bytoken, $token_id_str; #"token$token_id";
$section_id++;
}
if($last_char==3){
$string=~s/\n/\\n/go;
$string=~s/\r/\\r/go;
$string=~s/\t/\\t/go;
}
$doc_hash->{$token_id_str}->{"content"}=$string;
$Alvis::NLPPlatform::hash_tokens{$token_id_str}=$string;
$token_id++;
$offset+=$length;
#######################################################
}
$length=1;
$string=$char;
$last_char=$current_char;
lib/Alvis/NLPPlatform/NLPWrappers.pm view on Meta::CPAN
my $last_en;
my $corpus_filename;
my $result_filename;
print STDERR " Named entites tagging... ";
$corpus="";
foreach $token(Alvis::NLPPlatform::Annotation::sort(\%Alvis::NLPPlatform::hash_tokens)){
$tok_ct=$Alvis::NLPPlatform::hash_tokens{$token}; # why not $token ? (TH)
Alvis::NLPPlatform::XMLEntities::decode($tok_ct);
# (TH) These replacements are required to work around a bug in
# TagEN: a named entity following a \n is not analysed, because
# the "n" is concatenated with the next word.
$tok_ct=~s/\\n/\\n /go;
$tok_ct=~s/\\r/\\r /go;
$tok_ct=~s/\\t/\\t /go;
$corpus.=$tok_ct;
push @tab_tokens,$tok_ct;
}
$corpus_filename = $h_config->{'TMPFILE'} . ".corpus_en.txt";
open CORPUS,">$corpus_filename";
# binmode(CORPUS,":utf8");
print CORPUS Encode::encode_utf8($corpus);
close CORPUS;
print STDERR "done\n";
my $command_line;
if($Alvis::NLPPlatform::Annotation::ALVISLANGUAGE eq "FR"){
$command_line = $h_config->{'NLP_tools'}->{'NETAG_FR'} . " $corpus_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
} else {
$command_line = $h_config->{'NLP_tools'}->{'NETAG_EN'} . " $corpus_filename 2>> " . $Alvis::NLPPlatform::ALVISLOGFILE;
}
# nice idea, though TagEN seems to return 0 anyhow...
#`$command_line` && print STDERR "FAILED TO EXECUTE \"$command_line\": &!\n";
`$command_line`;
$Alvis::NLPPlatform::ALVISDEBUG || unlink $corpus_filename;
@Alvis::NLPPlatform::en_start=();
@Alvis::NLPPlatform::en_end=();
@Alvis::NLPPlatform::en_type=();
$result_filename = $h_config->{'TMPFILE'} . ".corpus_en.tag.txt";
open REN,"<$result_filename" or warn "Can't open the file $result_filename";
binmode REN;
while($line=<REN>){
($NE_type, $NE_start, $NE_end) = split /\t/, $line;
# $line=~m/(.+)\s+([0-9]+)\s+([0-9]+)/;
# $NE_type = $1;
# $NE_start = $2;
# $NE_end = $3;
push @Alvis::NLPPlatform::en_type,$NE_type;
if ((exists($h_config->{'XML_INPUT'}->{"PRESERVEWHITESPACE"})) && ($h_config->{'XML_INPUT'}->{"PRESERVEWHITESPACE"})) {
push @Alvis::NLPPlatform::en_start,($NE_start-1);
push @Alvis::NLPPlatform::en_end,($NE_end-1);
} else {
push @Alvis::NLPPlatform::en_start,$NE_start;
push @Alvis::NLPPlatform::en_end,$NE_end;
}
}
close REN;
$Alvis::NLPPlatform::ALVISDEBUG || unlink $result_filename;
# print STDERR scalar(@Alvis::NLPPlatform::en_type) . " to find\n";
print STDERR " Matching EN with tokens... ";
# scan tokens and match with NE
@Alvis::NLPPlatform::en_tokens_start=();
@Alvis::NLPPlatform::en_tokens_end=();
%Alvis::NLPPlatform::en_tokens_hash=();
$number_of_tokens=scalar @tab_tokens;
$en=$Alvis::NLPPlatform::last_semantic_unit+1;
$last_en=0;
my $en_str = "";
for($t=0;$t<$number_of_tokens;$t++){
print STDERR "\r Matching EN with tokens... ".($t+1)."/".$number_of_tokens." ";
for($i=$last_en;$i<scalar @Alvis::NLPPlatform::en_start;$i++){
# print STDERR "\ti = $i :: last_en = $last_en\n";
if($Alvis::NLPPlatform::en_start[$i]==$offset){
# print STDERR "Found\n";
$last_en=$i;
$Alvis::NLPPlatform::en_tokens_start[$en]="token".($t+1);
$Alvis::NLPPlatform::en_tokens_hash{($t+1)}=$en;
$start=$t+1;
while($Alvis::NLPPlatform::en_end[$i]>$offset-1){
$Alvis::NLPPlatform::en_tokens_end[$en]="token".($t+1);
$end=$t+1;
$offset+=length($tab_tokens[$t]);
$t++;
}
$en_str = "semantic_unit$en";
$doc_hash->{$en_str}={};
$doc_hash->{$en_str}->{"datatype"}="semantic_unit";
$doc_hash->{$en_str}->{"named_entity"}={};
$doc_hash->{$en_str}->{"named_entity"}->{"datatype"}="named_entity";
$doc_hash->{$en_str}->{"named_entity"}->{"named_entity_type"}=$Alvis::NLPPlatform::en_type[$i];
$doc_hash->{$en_str}->{"named_entity"}->{"id"}="named_entity$en";
$ref_tab=$doc_hash->{$en_str}->{"named_entity"}->{"list_refid_token"}={};
$ref_tab->{'datatype'}="list_refid_token";
$en_cont="";
$refid_n=1;
my @tab_tokens_en;
$ref_tab->{"refid_token"}=\@tab_tokens_en;
for($j=$start;$j<=$end;$j++){
push @tab_tokens_en, "token$j";
$refid_n++;
( run in 1.286 second using v1.01-cache-2.11-cpan-39bf76dae61 )