Alvis-TermTagger
view release on metacpan or search on metacpan
lib/Alvis/TermTagger.pm view on Meta::CPAN
# Author : Thierry Hamon
# Email : thierry.hamon@limsi.fr
# URL : https://perso.limsi.fr/hamon/
#
########################################################################
use strict;
use warnings;
use utf8;
# TODO : write functions for term tagging and term selection, with and
# without offsets in the corpus
sub termtagging {
my ($corpus_filename, $term_list_filename, $output_filename, $lemmatised_corpus_filename, $caseSensitive) = @_;
my @term_list;
my %term_listIdx;
lib/Alvis/TermTagger.pm view on Meta::CPAN
my $line;
my $line1;
my $term; # not use yet
my $suppl_info; # not use yet
my @tab;
warn "Loading the terminological resource\n";
open DESC_TERMLIST, $termlist_name or die "$0: $termlist_name: No such file\n";
binmode(DESC_TERMLIST, ":utf8");
while($line1 = <DESC_TERMLIST>) {
chomp $line1;
utf8::decode($line1);
$line=$line1;
# Blank and comment lines are thrown away
if (($line !~ /^\s*\#/o)&&($line !~ /^\s*\/\//o)&&($line !~ /^\s*$/o)) {
# Term is split from the other information
my @tab = split / ?[\|:] ?/, $line;
if ($tab[0] !~ /^\s*$/) {
# TODO better
$tab[0] =~ s/ +/ /go;
$tab[0] =~ s/ $//go;
lib/Alvis/TermTagger.pm view on Meta::CPAN
my ($corpus_filename, $ref_tabh_Corpus, $ref_tabh_Corpus_lc) = @_;
my $line;
my $sent_id = 1;
my $offset = 0;
my $lineLen = 0;
warn "Loading the corpus\n";
open CORPUS, $corpus_filename or die "File $corpus_filename not found\n";
binmode(CORPUS, ":utf8");
while($line=<CORPUS>){
$lineLen = length($line);
chomp $line;
$ref_tabh_Corpus->{$sent_id}->{'line'} = $line;
$ref_tabh_Corpus->{$sent_id}->{'offset'} = $offset;
$ref_tabh_Corpus_lc->{$sent_id}->{'line'} = lc $line;
$ref_tabh_Corpus_lc->{$sent_id}->{'offset'} = $offset;
# warn "=> " . $ref_tabh_Corpus_lc->{$sent_id} . "\n";
$sent_id++;
lib/Alvis/TermTagger.pm view on Meta::CPAN
if (!defined $termField) {
$termField = 0;
}
# XXX - ABREVIATION - XXX => regex
warn "Term tagging\n";
open TAGGEDCORPUS, ">>$offset_tagged_corpus_name" or die "$0: $offset_tagged_corpus_name: No such file\n";
binmode(TAGGEDCORPUS, ":utf8");
foreach $counter (keys %$ref_tabh_idtrm_select) {
$term_regex = $ref_regex_termlist->[$counter];
$termField2 = 0;
if (defined $ref_termlist->[$counter]->[$termField]) {
$termField2 = $termField;
}
foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
$line = $ref_tabh_corpus->{$sent_id}->{'line'};
print STDERR ".";
lib/Alvis/TermTagger.pm view on Meta::CPAN
my $termId = 1;
my $offset;
my $currOffset;
$i = 0;
warn "Term tagging ($offset_tagged_corpus_name)\n";
open TAGGEDCORPUS, ">$offset_tagged_corpus_name" or die "$0: $offset_tagged_corpus_name: No such file\n";
binmode(TAGGEDCORPUS, ":utf8");
if (!defined $termField) {
$termField = 0;
}
# XXX - ABREVIATION - XXX => regex
# warn "====> $caseSensitive\n";
foreach $counter (keys %$ref_tabh_idtrm_select) {
( run in 0.736 second using v1.01-cache-2.11-cpan-d8267643d1d )