Alvis-TermTagger

 view release on metacpan or  search on metacpan

lib/Alvis/TermTagger.pm  view on Meta::CPAN

    my @term_list;
    my %term_listIdx;
    my @regex_term_list;
    my @regex_lemmawordterm_list;
    my %corpus;
    my %lc_corpus;
    my %lemmatised_corpus;
    my %lc_lemmatised_corpus;
    my %corpus_index;
    my %lemmatised_corpus_index;
    my %idtrm_select;
    my %idlemtrm_select;

    if (!defined $caseSensitive) {
	$caseSensitive = -1;
    }

    &load_TermList($term_list_filename,\@term_list, \%term_listIdx);
    &get_Regex_TermList(\@term_list, \@regex_term_list, \@regex_lemmawordterm_list);

    &load_Corpus($corpus_filename, \%corpus, \%lc_corpus);
    if (defined $lemmatised_corpus_filename) {
	&load_Corpus($lemmatised_corpus_filename, \%lemmatised_corpus, \%lc_lemmatised_corpus);
    }
    &corpus_Indexing(\%lc_corpus, \%corpus, \%corpus_index, $caseSensitive);
    if (defined $lemmatised_corpus_filename) {
	&corpus_Indexing(\%lc_lemmatised_corpus, \%lemmatised_corpus, \%lemmatised_corpus_index, $caseSensitive);
    }
    &term_Selection(\%corpus_index, \@term_list, \%idtrm_select, $caseSensitive);
    if (defined $lemmatised_corpus_filename) {
	&term_Selection(\%lemmatised_corpus_index, \@term_list, \%idlemtrm_select, $caseSensitive);
    }
    &term_tagging_offset_brat(\@term_list, \@regex_term_list, \%idtrm_select, \%corpus, $output_filename, $caseSensitive);
    if (defined $lemmatised_corpus_filename) {
	&term_tagging_offset_brat(\@term_list, \@regex_lemmawordterm_list, \%idlemtrm_select, \%lemmatised_corpus, $output_filename, $caseSensitive);
    }
    return(0);
}


sub load_TermList {
    my ($termlist_name, $ref_termlist, $ref_termlistIdx) = @_;

    my $line;
    my $line1;
    my $term;        # not use yet 
    my $suppl_info;  # not use yet 
    my @tab;

    warn "Loading the terminological resource\n";

    open DESC_TERMLIST, $termlist_name or die "$0: $termlist_name: No such file\n";

    binmode(DESC_TERMLIST, ":utf8");

    while($line1 = <DESC_TERMLIST>) {
	chomp $line1;
	utf8::decode($line1);
	$line=$line1;

	# Blank and comment lines are throw away
	if (($line !~ /^\s*\#/o)&&($line !~ /^\s*\/\//o)&&($line !~ /^\s*$/o)) {
	    # Term is split from the other information
	    my @tab = split / ?[\|:] ?/, $line;
	     if ($tab[0] !~ /^\s*$/) {
		 # TODO better
		 $tab[0] =~ s/ +/ /go;
		 $tab[0] =~ s/ $//go;
		 $tab[0] =~ s/^ //go;
#		 $tab[0] =~ s/\\:/:/go;
		 # warn "term: " . $tab[0] . "\n";;
		 if (!exists $ref_termlistIdx->{$tab[0]}) {
		     push @$ref_termlist, \@tab;
		     $ref_termlistIdx->{$tab[0]} = scalar(@$ref_termlist) -1;
		 } else {
		     $ref_termlist->[$ref_termlistIdx->{$tab[0]}]->[2] .= ";" . $tab[2];
		 }
	     }
 	 }
    }
    close DESC_TERMLIST;
    print STDERR "\n\tTerm list size : " . scalar(@$ref_termlist) . "\n\n";
}

sub get_Regex_TermList {

    my ($ref_termlist, $ref_regex_termlist, $ref_regex_lemmaWordtermlist) = @_;
    my $term_counter;

    warn "Generating the regular expression from the terms\n";

    for($term_counter  = 0;$term_counter < scalar @$ref_termlist;$term_counter++) {
	$ref_regex_termlist->[$term_counter] = $ref_termlist->[$term_counter]->[0];
	if (defined $ref_regex_lemmaWordtermlist) {
	    if (defined $ref_termlist->[$term_counter]->[3]) {
		$ref_regex_lemmaWordtermlist->[$term_counter] = $ref_termlist->[$term_counter]->[3];
		# warn "==> " . $ref_termlist->[$term_counter]->[3] . "\n";
	    } else {
		$ref_regex_lemmaWordtermlist->[$term_counter] = $ref_termlist->[$term_counter]->[0];
	    }
	}
#	warn $ref_regex_lemmaWordtermlist->[$term_counter] . "\n";
 	$ref_regex_termlist->[$term_counter] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}\\])/\\$1/og;
	$ref_regex_termlist->[$term_counter] =~ s/ /[\- \n]/og;
	$ref_regex_termlist->[$term_counter] =~ s/A/[\x{00C0}-\x{00C5}\x{00E0}-\x{00E5}A]/og;
	$ref_regex_termlist->[$term_counter] =~ s/AE/(\x{00C6}|AE)/og;
	$ref_regex_termlist->[$term_counter] =~ s/C/[\x{00C7}|C]/og;
	$ref_regex_termlist->[$term_counter] =~ s/E/[\x{00C8}-\x{00CB}E]/og;
	$ref_regex_termlist->[$term_counter] =~ s/I/[\x{00CC}-\x{00CF}I]/og;
	$ref_regex_termlist->[$term_counter] =~ s/D/[\x{00D0}D]/og;
	$ref_regex_termlist->[$term_counter] =~ s/N/[\x{00D1}N]/og;
	$ref_regex_termlist->[$term_counter] =~ s/O/[\x{00D2}-\x{00D8}O]/og;
	$ref_regex_termlist->[$term_counter] =~ s/U/[\x{00D9}-\x{00DC}U]/og;
	$ref_regex_termlist->[$term_counter] =~ s/Y/[\x{00DD}Y]/og;

	if (defined $ref_regex_lemmaWordtermlist) {
	    $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}\\])/\\$1/og;
	    $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/ /[\- \n]/og;
	    $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/A/[\x{00C0}-\x{00C5}A]/og;
	    $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/AE/(\x{00C6}|AE)/og;
	    $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/C/[\x{00C7}C]/og;



( run in 0.754 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )