Alvis-TermTagger
view release on metacpan or search on metacpan
lib/Alvis/TermTagger.pm view on Meta::CPAN
my @term_list;
my %term_listIdx;
my @regex_term_list;
my @regex_lemmawordterm_list;
my %corpus;
my %lc_corpus;
my %lemmatised_corpus;
my %lc_lemmatised_corpus;
my %corpus_index;
my %lemmatised_corpus_index;
my %idtrm_select;
my %idlemtrm_select;
if (!defined $caseSensitive) {
$caseSensitive = -1;
}
&load_TermList($term_list_filename,\@term_list, \%term_listIdx);
&get_Regex_TermList(\@term_list, \@regex_term_list, \@regex_lemmawordterm_list);
&load_Corpus($corpus_filename, \%corpus, \%lc_corpus);
if (defined $lemmatised_corpus_filename) {
&load_Corpus($lemmatised_corpus_filename, \%lemmatised_corpus, \%lc_lemmatised_corpus);
}
&corpus_Indexing(\%lc_corpus, \%corpus, \%corpus_index, $caseSensitive);
if (defined $lemmatised_corpus_filename) {
&corpus_Indexing(\%lc_lemmatised_corpus, \%lemmatised_corpus, \%lemmatised_corpus_index, $caseSensitive);
}
&term_Selection(\%corpus_index, \@term_list, \%idtrm_select, $caseSensitive);
if (defined $lemmatised_corpus_filename) {
&term_Selection(\%lemmatised_corpus_index, \@term_list, \%idlemtrm_select, $caseSensitive);
}
&term_tagging_offset_brat(\@term_list, \@regex_term_list, \%idtrm_select, \%corpus, $output_filename, $caseSensitive);
if (defined $lemmatised_corpus_filename) {
&term_tagging_offset_brat(\@term_list, \@regex_lemmawordterm_list, \%idlemtrm_select, \%lemmatised_corpus, $output_filename, $caseSensitive);
}
return(0);
}
# load_TermList($termlist_name, $ref_termlist, $ref_termlistIdx)
#
# Reads the terminological resource file $termlist_name through a UTF-8
# input layer and fills the two structures supplied by the caller:
#   $ref_termlist    - array ref; for every distinct term an array ref
#                      holding the fields of its line is pushed, field 0
#                      being the whitespace-normalised term.
#   $ref_termlistIdx - hash ref mapping each normalised term to its index
#                      in @$ref_termlist.
#
# Lines that are blank or that start with '#' or '//' are ignored.  Fields
# are separated by '|' or ':' (with one optional space on either side).
# When a term occurs more than once, only its first record is kept; the
# third field of each later record is appended to the third field of the
# first one, separated by ';'.
#
# Progress messages are written to STDERR.  Dies if the file cannot be
# opened.
sub load_TermList {
    my ($termlist_name, $ref_termlist, $ref_termlistIdx) = @_;

    warn "Loading the terminological resource\n";

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a pipe, and the handle cannot collide with
    # another package-level DESC_TERMLIST.  $! reports the real reason of
    # a failure (a missing file still reads "No such file ...").
    open my $desc_termlist, '<', $termlist_name
        or die "$0: $termlist_name: $!\n";
    binmode($desc_termlist, ":utf8");

    while (my $line = <$desc_termlist>) {
        chomp $line;

        # The input layer has already decoded the line; decoding it a
        # second time with utf8::decode could corrupt text whose
        # characters happen to look like UTF-8 octets.

        # Comment lines ('#' or '//') and blank lines are thrown away.
        next if $line =~ /^\s*\#/;
        next if $line =~ /^\s*\/\//;
        next if $line =~ /^\s*$/;

        # The term is split from the other information.
        my @tab = split / ?[\|:] ?/, $line;

        # A line made only of separators yields no field at all.
        next if !defined $tab[0] || $tab[0] =~ /^\s*$/;

        # Collapse runs of spaces, then trim one space at each end.
        $tab[0] =~ s/ +/ /g;
        $tab[0] =~ s/ $//;
        $tab[0] =~ s/^ //;

        if (!exists $ref_termlistIdx->{$tab[0]}) {
            push @$ref_termlist, \@tab;
            $ref_termlistIdx->{$tab[0]} = $#{$ref_termlist};
        }
        elsif (defined $tab[2]) {
            # Duplicate term: merge its third field into the record that
            # is already stored.  Nothing is appended when the duplicate
            # has no third field, and no leading ';' is produced when the
            # stored record had none.
            my $known = $ref_termlist->[ $ref_termlistIdx->{$tab[0]} ];
            $known->[2] = defined $known->[2]
                        ? $known->[2] . ";" . $tab[2]
                        : $tab[2];
        }
    }
    close $desc_termlist;

    print STDERR "\n\tTerm list size : " . scalar(@$ref_termlist) . "\n\n";
}
sub get_Regex_TermList {
my ($ref_termlist, $ref_regex_termlist, $ref_regex_lemmaWordtermlist) = @_;
my $term_counter;
warn "Generating the regular expression from the terms\n";
for($term_counter = 0;$term_counter < scalar @$ref_termlist;$term_counter++) {
$ref_regex_termlist->[$term_counter] = $ref_termlist->[$term_counter]->[0];
if (defined $ref_regex_lemmaWordtermlist) {
if (defined $ref_termlist->[$term_counter]->[3]) {
$ref_regex_lemmaWordtermlist->[$term_counter] = $ref_termlist->[$term_counter]->[3];
# warn "==> " . $ref_termlist->[$term_counter]->[3] . "\n";
} else {
$ref_regex_lemmaWordtermlist->[$term_counter] = $ref_termlist->[$term_counter]->[0];
}
}
# warn $ref_regex_lemmaWordtermlist->[$term_counter] . "\n";
$ref_regex_termlist->[$term_counter] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}\\])/\\$1/og;
$ref_regex_termlist->[$term_counter] =~ s/ /[\- \n]/og;
$ref_regex_termlist->[$term_counter] =~ s/A/[\x{00C0}-\x{00C5}\x{00E0}-\x{00E5}A]/og;
$ref_regex_termlist->[$term_counter] =~ s/AE/(\x{00C6}|AE)/og;
$ref_regex_termlist->[$term_counter] =~ s/C/[\x{00C7}|C]/og;
$ref_regex_termlist->[$term_counter] =~ s/E/[\x{00C8}-\x{00CB}E]/og;
$ref_regex_termlist->[$term_counter] =~ s/I/[\x{00CC}-\x{00CF}I]/og;
$ref_regex_termlist->[$term_counter] =~ s/D/[\x{00D0}D]/og;
$ref_regex_termlist->[$term_counter] =~ s/N/[\x{00D1}N]/og;
$ref_regex_termlist->[$term_counter] =~ s/O/[\x{00D2}-\x{00D8}O]/og;
$ref_regex_termlist->[$term_counter] =~ s/U/[\x{00D9}-\x{00DC}U]/og;
$ref_regex_termlist->[$term_counter] =~ s/Y/[\x{00DD}Y]/og;
if (defined $ref_regex_lemmaWordtermlist) {
$ref_regex_lemmaWordtermlist->[$term_counter] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}\\])/\\$1/og;
$ref_regex_lemmaWordtermlist->[$term_counter] =~ s/ /[\- \n]/og;
$ref_regex_lemmaWordtermlist->[$term_counter] =~ s/A/[\x{00C0}-\x{00C5}A]/og;
$ref_regex_lemmaWordtermlist->[$term_counter] =~ s/AE/(\x{00C6}|AE)/og;
$ref_regex_lemmaWordtermlist->[$term_counter] =~ s/C/[\x{00C7}C]/og;
( run in 0.754 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )