Alvis-TermTagger

 view release on metacpan or  search on metacpan

lib/Alvis/TermTagger.pm  view on Meta::CPAN

# Author : Thierry Hamon
# Email : thierry.hamon@limsi.fr
# URL : https://perso.limsi.fr/hamon/
#
########################################################################


use strict;
use warnings;

use utf8;

# TODO : write functions for term tagging, term selection with and
# without offset in the corpus

sub termtagging {

    my ($corpus_filename, $term_list_filename, $output_filename, $lemmatised_corpus_filename, $caseSensitive) = @_;

    my @term_list;
    my %term_listIdx;

lib/Alvis/TermTagger.pm  view on Meta::CPAN

    my $line;
    my $line1;
    my $term;        # not use yet 
    my $suppl_info;  # not use yet 
    my @tab;

    warn "Loading the terminological resource\n";

    open DESC_TERMLIST, $termlist_name or die "$0: $termlist_name: No such file\n";

    binmode(DESC_TERMLIST, ":utf8");

    while($line1 = <DESC_TERMLIST>) {
	chomp $line1;
	utf8::decode($line1);
	$line=$line1;

	# Blank and comment lines are thrown away
	if (($line !~ /^\s*\#/o)&&($line !~ /^\s*\/\//o)&&($line !~ /^\s*$/o)) {
	    # Term is split from the other information
	    my @tab = split / ?[\|:] ?/, $line;
	     if ($tab[0] !~ /^\s*$/) {
		 # TODO better
		 $tab[0] =~ s/ +/ /go;
		 $tab[0] =~ s/ $//go;

lib/Alvis/TermTagger.pm  view on Meta::CPAN

    my ($corpus_filename, $ref_tabh_Corpus, $ref_tabh_Corpus_lc) = @_;
    my $line;
    my $sent_id = 1;
    my $offset = 0;
    my $lineLen = 0;

    warn "Loading the corpus\n";

    open CORPUS, $corpus_filename or die "File $corpus_filename not found\n";
 
    binmode(CORPUS, ":utf8");
    
    while($line=<CORPUS>){
	$lineLen = length($line);
	chomp $line;
	$ref_tabh_Corpus->{$sent_id}->{'line'} = $line;
	$ref_tabh_Corpus->{$sent_id}->{'offset'} = $offset;
	$ref_tabh_Corpus_lc->{$sent_id}->{'line'} = lc $line;	
	$ref_tabh_Corpus_lc->{$sent_id}->{'offset'} = $offset;	
	# warn "=> " . $ref_tabh_Corpus_lc->{$sent_id} . "\n";
	$sent_id++;

lib/Alvis/TermTagger.pm  view on Meta::CPAN


    if (!defined $termField) {
	$termField = 0;
    }
    # XXX - ABREVIATION - XXX => regex

    warn "Term tagging\n";

    open TAGGEDCORPUS, ">>$offset_tagged_corpus_name" or die "$0: $offset_tagged_corpus_name: No such file\n";

    binmode(TAGGEDCORPUS, ":utf8");

    foreach $counter (keys %$ref_tabh_idtrm_select) {
	$term_regex = $ref_regex_termlist->[$counter];
	$termField2 = 0;
	if (defined $ref_termlist->[$counter]->[$termField]) {
	    $termField2 = $termField;
	}
	foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
	    $line = $ref_tabh_corpus->{$sent_id}->{'line'};
	    print STDERR ".";

lib/Alvis/TermTagger.pm  view on Meta::CPAN

    my $termId = 1;
    my $offset;
    my $currOffset;

    $i = 0;

    warn "Term tagging ($offset_tagged_corpus_name)\n";

    open TAGGEDCORPUS, ">$offset_tagged_corpus_name" or die "$0: $offset_tagged_corpus_name: No such file\n";

    binmode(TAGGEDCORPUS, ":utf8");


    if (!defined $termField) {
	$termField = 0;
    }

    # XXX - ABREVIATION - XXX => regex
    # warn "====> $caseSensitive\n";
    
    foreach $counter (keys %$ref_tabh_idtrm_select) {



( run in 0.736 second using v1.01-cache-2.11-cpan-d8267643d1d )