Alvis-TermTagger
view release on metacpan or search on metacpan
lib/Alvis/TermTagger.pm view on Meta::CPAN
package Alvis::TermTagger;
our $VERSION = '0.82';
#######################################################################
#
# Last Update: 16/09/2015 (dd/mm/yyyy date format)
#
# Copyright (C) 2006 Thierry Hamon
#
# Written by thierry.hamon@limsi.fr
#
# Author : Thierry Hamon
# Email : thierry.hamon@limsi.fr
# URL : https://perso.limsi.fr/hamon/
#
########################################################################
use strict;
use warnings;
use utf8;
# TODO: write functions for term tagging, term selection with and
# without offset in the corpus
# Run the whole term tagging chain on a corpus.
#
# Steps, in order: load the term list, build the term regular expressions,
# load the corpus, index it, select the candidate terms and finally call
# term_tagging_offset(), which opens $output_filename in append mode.
# When $lemmatised_corpus_filename is defined, every corpus step is run a
# second time on the lemmatised corpus, using the lemma regular expressions.
#
# Parameters:
#   $corpus_filename            - corpus file
#   $term_list_filename         - terminological resource file
#   $output_filename            - file receiving the tagging results
#   $lemmatised_corpus_filename - optional lemmatised version of the corpus
#   $caseSensitive              - optional; set to -1 when undefined, then
#                                 handed unchanged to the indexing,
#                                 selection and tagging steps
#
# Returns 0.
sub termtagging {
    my ($corpus_filename, $term_list_filename, $output_filename, $lemmatised_corpus_filename, $caseSensitive) = @_;

    $caseSensitive = -1 unless defined $caseSensitive;
    my $with_lemmas = defined $lemmatised_corpus_filename;

    # Terminological resource and the regular expressions derived from it
    my (@terms, %term_position);
    my (@surface_regexes, @lemma_regexes);

    # Data built from the plain corpus
    my (%surface_corpus, %surface_corpus_lc, %surface_index, %surface_selection);

    # Same data for the lemmatised corpus (left empty when there is none)
    my (%lemma_corpus, %lemma_corpus_lc, %lemma_index, %lemma_selection);

    &load_TermList($term_list_filename, \@terms, \%term_position);
    &get_Regex_TermList(\@terms, \@surface_regexes, \@lemma_regexes);

    &load_Corpus($corpus_filename, \%surface_corpus, \%surface_corpus_lc);
    &load_Corpus($lemmatised_corpus_filename, \%lemma_corpus, \%lemma_corpus_lc)
        if $with_lemmas;

    &corpus_Indexing(\%surface_corpus_lc, \%surface_corpus, \%surface_index, $caseSensitive);
    &corpus_Indexing(\%lemma_corpus_lc, \%lemma_corpus, \%lemma_index, $caseSensitive)
        if $with_lemmas;

    &term_Selection(\%surface_index, \@terms, \%surface_selection, $caseSensitive);
    &term_Selection(\%lemma_index, \@terms, \%lemma_selection, $caseSensitive)
        if $with_lemmas;

    &term_tagging_offset(\@terms, \@surface_regexes, \%surface_selection, \%surface_corpus, $output_filename, $caseSensitive);
    &term_tagging_offset(\@terms, \@lemma_regexes, \%lemma_selection, \%lemma_corpus, $output_filename, $caseSensitive)
        if $with_lemmas;

    return 0;
}
# Run the whole term tagging chain on a corpus, delegating the output to
# term_tagging_offset_brat().
#
# The steps are those of termtagging(): term list loading, regular
# expression generation, corpus loading, corpus indexing, term selection,
# then tagging.  When $lemmatised_corpus_filename is defined, the corpus
# steps are repeated on the lemmatised corpus with the lemma regular
# expressions.
#
# NOTE(review): term_tagging_offset_brat() opens the output file with '>',
# so the lemmatised pass presumably replaces what the first pass wrote to
# $output_filename - confirm that this is intended.
#
# Parameters:
#   $corpus_filename            - corpus file
#   $term_list_filename         - terminological resource file
#   $output_filename            - file receiving the tagging results
#   $lemmatised_corpus_filename - optional lemmatised version of the corpus
#   $caseSensitive              - optional; set to -1 when undefined, then
#                                 handed unchanged to the indexing,
#                                 selection and tagging steps
#
# Returns 0.
sub termtagging_brat {
    my ($corpus_filename, $term_list_filename, $output_filename, $lemmatised_corpus_filename, $caseSensitive) = @_;

    $caseSensitive = -1 unless defined $caseSensitive;
    my $with_lemmas = defined $lemmatised_corpus_filename;

    # Terminological resource and the regular expressions derived from it
    my (@terms, %term_position);
    my (@surface_regexes, @lemma_regexes);

    # Data built from the plain corpus
    my (%surface_corpus, %surface_corpus_lc, %surface_index, %surface_selection);

    # Same data for the lemmatised corpus (left empty when there is none)
    my (%lemma_corpus, %lemma_corpus_lc, %lemma_index, %lemma_selection);

    &load_TermList($term_list_filename, \@terms, \%term_position);
    &get_Regex_TermList(\@terms, \@surface_regexes, \@lemma_regexes);

    &load_Corpus($corpus_filename, \%surface_corpus, \%surface_corpus_lc);
    &load_Corpus($lemmatised_corpus_filename, \%lemma_corpus, \%lemma_corpus_lc)
        if $with_lemmas;

    &corpus_Indexing(\%surface_corpus_lc, \%surface_corpus, \%surface_index, $caseSensitive);
    &corpus_Indexing(\%lemma_corpus_lc, \%lemma_corpus, \%lemma_index, $caseSensitive)
        if $with_lemmas;

    &term_Selection(\%surface_index, \@terms, \%surface_selection, $caseSensitive);
    &term_Selection(\%lemma_index, \@terms, \%lemma_selection, $caseSensitive)
        if $with_lemmas;

    &term_tagging_offset_brat(\@terms, \@surface_regexes, \%surface_selection, \%surface_corpus, $output_filename, $caseSensitive);
    &term_tagging_offset_brat(\@terms, \@lemma_regexes, \%lemma_selection, \%lemma_corpus, $output_filename, $caseSensitive)
        if $with_lemmas;

    return 0;
}
# Load a terminological resource into memory.
#
# Every useful line of the file holds a term followed by optional fields,
# separated by '|' or ':' (with at most one space on each side of the
# separator).  Blank lines and lines starting with '#' or '//' are ignored,
# as are lines whose term field is empty.  Runs of spaces inside the term
# are squeezed to one space and one leading/trailing space is removed.
#
# Parameters:
#   $termlist_name   - name of the term list file (read through the :utf8
#                      layer)
#   $ref_termlist    - array ref; receives one array ref of fields per
#                      distinct term, the term itself being field 0
#   $ref_termlistIdx - hash ref; maps each term to its index in
#                      @$ref_termlist
#
# When a term occurs several times only its first entry is kept; field 2 of
# the later occurrences, when present, is appended to field 2 of that entry
# with ';' as separator.
#
# Dies when the file cannot be opened.  The size of the list is reported on
# STDERR.
sub load_TermList {
    my ($termlist_name, $ref_termlist, $ref_termlistIdx) = @_;

    warn "Loading the terminological resource\n";

    # Three-argument open on a lexical handle: the file name is never
    # interpreted as an open mode or a pipe, and the real cause of a
    # failure ($!) is reported.
    open my $termlist_fh, '<', $termlist_name
        or die "$0: $termlist_name: $!\n";
    binmode($termlist_fh, ":utf8");

    while (my $line = <$termlist_fh>) {
        chomp $line;
        # NOTE(review): the handle already decodes UTF-8, so this second
        # decoding step looks redundant; kept as is - confirm before removing.
        utf8::decode($line);

        # Blank and comment lines are thrown away
        next if $line =~ /^\s*\#/;
        next if $line =~ /^\s*\/\//;
        next if $line =~ /^\s*$/;

        # The term is split from the other information
        my @tab = split / ?[\|:] ?/, $line;

        # A line made of separators only yields no field at all
        next if !defined $tab[0] || $tab[0] =~ /^\s*$/;

        $tab[0] =~ s/ +/ /g;
        $tab[0] =~ s/ $//g;
        $tab[0] =~ s/^ //g;

        if (!exists $ref_termlistIdx->{$tab[0]}) {
            push @$ref_termlist, \@tab;
            $ref_termlistIdx->{$tab[0]} = $#{$ref_termlist};
        }
        elsif (defined $tab[2]) {
            # Merge field 2 of a repeated term into the entry already stored;
            # nothing is appended when either side has no field 2, so no
            # empty ';'-separated element is created.
            my $ref_known = $ref_termlist->[ $ref_termlistIdx->{$tab[0]} ];
            $ref_known->[2] = defined $ref_known->[2]
                ? $ref_known->[2] . ";" . $tab[2]
                : $tab[2];
        }
    }
    close $termlist_fh;

    print STDERR "\n\tTerm list size : " . scalar(@$ref_termlist) . "\n\n";
}
# Build, for every term of the list, the regular expression used to find
# it in the corpus.
#
# Parameters:
#   $ref_termlist                - array ref of term entries (array refs);
#                                  field 0 is the term, field 3, when
#                                  defined, is used as the source of the
#                                  lemma regular expression
#   $ref_regex_termlist          - array ref filled with one regular
#                                  expression (string) per term, built from
#                                  field 0
#   $ref_regex_lemmaWordtermlist - optional array ref filled with one
#                                  regular expression per term, built from
#                                  field 3 or, failing that, from field 0
#
# Both output arrays are indexed like @$ref_termlist.  The sizes of the
# generated lists are reported on STDERR.
sub get_Regex_TermList {
    my ($ref_termlist, $ref_regex_termlist, $ref_regex_lemmaWordtermlist) = @_;

    warn "Generating the regular expression from the terms\n";

    for my $term_counter (0 .. $#{$ref_termlist}) {
        my $ref_term = $ref_termlist->[$term_counter];

        # The surface form also accepts lower-case accented 'a'; the lemma
        # form does not (difference kept from the original implementation).
        $ref_regex_termlist->[$term_counter] = _term_to_regex($ref_term->[0], 1);

        next if !defined $ref_regex_lemmaWordtermlist;

        my $lemma_form = defined $ref_term->[3] ? $ref_term->[3] : $ref_term->[0];
        $ref_regex_lemmaWordtermlist->[$term_counter] = _term_to_regex($lemma_form, 0);
    }

    print STDERR "\n\tTerm/regex list size : " . scalar(@$ref_regex_termlist);
    if (defined $ref_regex_lemmaWordtermlist) {
        print STDERR " / " . scalar(@$ref_regex_lemmaWordtermlist);
    }
    print STDERR "\n\n";
}

# Turn one term into a regular expression string: metacharacters are
# escaped, a space also matches a hyphen or a newline, and the upper-case
# letters A, C, E, I, D, N, O, U, Y also match their accented variants.
# $with_lowercase_a adds U+00E0-U+00E5 to the variants of 'A'.
sub _term_to_regex {
    my ($term, $with_lowercase_a) = @_;
    my $regex = $term;

    # '^', '$' and '|' are escaped too, so that they match literally instead
    # of acting as anchors or alternation inside the tagging patterns.
    $regex =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}\\\^\$\|])/\\$1/g;
    $regex =~ s/ /[\- \n]/g;

    # The ligature must be handled before 'A' is rewritten, otherwise the
    # sequence "AE" no longer exists when it is looked for.  The group is
    # non-capturing so that it does not shift the captures of the callers.
    $regex =~ s/AE/(?:\x{00C6}|AE)/g;

    my $a_class = $with_lowercase_a
        ? "[\x{00C0}-\x{00C5}\x{00E0}-\x{00E5}A]"
        : "[\x{00C0}-\x{00C5}A]";
    $regex =~ s/A/$a_class/g;

    # No '|' inside the classes: within brackets it would be a literal
    # character accepted in place of the letter.
    my @accent_classes = (
        [ 'C', "[\x{00C7}C]" ],
        [ 'E', "[\x{00C8}-\x{00CB}E]" ],
        [ 'I', "[\x{00CC}-\x{00CF}I]" ],
        [ 'D', "[\x{00D0}D]" ],
        [ 'N', "[\x{00D1}N]" ],
        [ 'O', "[\x{00D2}-\x{00D8}O]" ],
        [ 'U', "[\x{00D9}-\x{00DC}U]" ],
        [ 'Y', "[\x{00DD}Y]" ],
    );
    for my $ref_pair (@accent_classes) {
        my ($letter, $class) = @$ref_pair;
        $regex =~ s/$letter/$class/g;
    }

    return $regex;
}
# Load a corpus, one line per entry.
#
# Parameters:
#   $corpus_filename    - name of the corpus file (read through the :utf8
#                         layer)
#   $ref_tabh_Corpus    - hash ref filled with the lines as read
#   $ref_tabh_Corpus_lc - hash ref filled with the lower-cased lines
#
# Both hashes are keyed by the line number, starting at 1.  Each value is a
# hash with two keys: 'line', the text without its end-of-line, and
# 'offset', the position of the first character of the line in the file,
# computed by adding up the lengths of the previous lines, end-of-line
# included.
#
# Dies when the file cannot be opened.  The number of lines loaded is
# reported on STDERR.
sub load_Corpus {
    my ($corpus_filename, $ref_tabh_Corpus, $ref_tabh_Corpus_lc) = @_;
    my $sent_id = 1;
    my $offset  = 0;

    warn "Loading the corpus\n";

    # Three-argument open on a lexical handle: the file name is taken
    # literally (no whitespace trimming, no mode or pipe interpretation)
    # and the handle cannot clash with another global CORPUS handle.
    open my $corpus_fh, '<', $corpus_filename
        or die "File $corpus_filename not found ($!)\n";
    binmode($corpus_fh, ":utf8");

    while (my $line = <$corpus_fh>) {
        # Measured before chomp: the end-of-line counts in the offsets
        my $lineLen = length($line);
        chomp $line;

        $ref_tabh_Corpus->{$sent_id}->{'line'}      = $line;
        $ref_tabh_Corpus->{$sent_id}->{'offset'}    = $offset;
        $ref_tabh_Corpus_lc->{$sent_id}->{'line'}   = lc $line;
        $ref_tabh_Corpus_lc->{$sent_id}->{'offset'} = $offset;

        $sent_id++;
        $offset += $lineLen;
    }
    close $corpus_fh;

    print STDERR "\n\tCorpus size : " . scalar(keys %$ref_tabh_Corpus) . "\n\n";
}
sub corpus_Indexing {
my ($ref_corpus_lc, $ref_corpus, $ref_corpus_index, $caseSensitive) = @_;
my $word;
my @tab_words;
my @tab_words_lc;
my $sent_id;
my $i;
warn "Indexing the corpus\n";
foreach $sent_id (keys %$ref_corpus_lc) { # \-\.,\n;\/
@tab_words = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]/, $ref_corpus->{$sent_id}->{'line'};
@tab_words_lc = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]/, $ref_corpus_lc->{$sent_id}->{'line'};
for($i=0;$i < scalar(@tab_words_lc);$i++) {
# foreach $word_lc (@tab_words_lc) {
if ((defined $caseSensitive) && (($caseSensitive == 0) || (length($tab_words_lc[$i]) <= $caseSensitive))) {
$word = $tab_words[$i];
} else {
$word = $tab_words_lc[$i];
}
if ($word ne "") {
$word =~ s/[\x{00C0}-\x{00C5}\x{00E0}-\x{00E5}]/A/og;
$word =~ s/\x{00C6}/AE/og;
$word =~ s/[\x{00C7}]/C/og;
$word =~ s/[\x{00C8}-\x{00CB}]/E/og;
$word =~ s/[\x{00CC}-\x{00CF}]/I/og;
$word =~ s/[\x{00D0}]/D/og;
$word =~ s/[\x{00D1}]/N/og;
$word =~ s/[\x{00D2}-\x{00D8}]/O/og;
$word =~ s/[\x{00D9}-\x{00DC}]/U/og;
$word =~ s/[\x{00DD}]/Y/og;
if (!exists $ref_corpus_index->{$word}) {
my @tabtmp;
$ref_corpus_index->{$word} = \@tabtmp;
}
push @{$ref_corpus_index->{$word}}, $sent_id;
}
}
}
# print STDERR join(" : ", keys(%$ref_corpus_index)) . "\n";
lib/Alvis/TermTagger.pm view on Meta::CPAN
while(($i < scalar(@tab_termlex)) && ($i < scalar(@tab_termlexCap)) &&
((($word eq "") || (exists $ref_corpus_index->{$word})) ||
((($caseSensitive == 0) || (length($termCap) > $caseSensitive)) &&
(exists $ref_corpus_index->{$tab_termlexCap[$i]})))
) {
# ((($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField]) > $caseSensitive)) &&
if ($word ne "") {
# warn "---> $term\n";
push @recordedWords, $word;
# } else {
# warn "--------------------------> $term\n";
}
$i++;
$word = $tab_termlex[$i];
# warn "i: $i\n";
}
if ($i == scalar(@tab_termlex)) {
foreach $word (@recordedWords) {
# print STDERR "$word : ";
if (!exists $ref_tabh_idtrm_select->{$counter}) {
my %tabhtmp2;
$ref_tabh_idtrm_select->{$counter} = \%tabhtmp2;
}
foreach $sent_id (@{$ref_corpus_index->{$word}}) {
${$ref_tabh_idtrm_select->{$counter}}{$sent_id} = 1;
}
}
}
# }
}
# print STDERR "\n";
# print STDERR join(" : ", keys(%$ref_tabh_idtrm_select)) . "\n";
warn "Size of the selected list: " . scalar (keys %$ref_tabh_idtrm_select) . "\n";
# foreach $counter (keys %$ref_tabh_idtrm_select) {
# warn $ref_termlist->[$counter]->[0] . "\n";
# }
warn "\nEnd of selecting the terms potentialy appearing in the corpus\n";
}
# Look for the selected terms in the sentences where they may occur and
# append one line per match to the output file.
#
# Parameters:
#   $ref_termlist              - array ref of term entries (array refs)
#   $ref_regex_termlist        - array ref of regular expressions, indexed
#                                like @$ref_termlist
#   $ref_tabh_idtrm_select     - hash ref: term index => hash ref whose keys
#                                are the ids of the sentences to check
#   $ref_tabh_corpus           - hash ref: sentence id => { line => text }
#   $offset_tagged_corpus_name - output file, opened in append mode
#   $caseSensitive             - case handling, see below
#   $termField                 - optional; field of the term entry whose
#                                length is compared to $caseSensitive
#                                (default 0, also used when that field is
#                                undefined)
#
# A term is searched in four contexts, in this order: between two separator
# characters, at the start of the line, at the end of the line, and as the
# whole line.  printMatchingTerm() is called once per matching context, so
# a sentence may be reported several times for the same term.
#
# Case handling: the search ignores case when $caseSensitive is undefined
# or negative, or when the term is longer than $caseSensitive; otherwise
# (term length <= a positive $caseSensitive) it is case-sensitive in all
# four contexts.
# NOTE(review): with $caseSensitive == 0 any non-empty term is "longer than
# $caseSensitive", so the search ends up case-insensitive - confirm that
# this is the intended meaning of 0.
#
# Dies when the output file cannot be opened.  Progress is reported on
# STDERR.
sub term_tagging_offset {
    my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $offset_tagged_corpus_name, $caseSensitive, $termField) = @_;

    $termField = 0 if !defined $termField;

    # XXX - ABREVIATION - XXX => regex

    warn "Term tagging\n";

    # Three-argument open on a lexical handle; the real cause of a failure
    # is reported instead of a fixed "No such file".
    open my $tagged_fh, '>>', $offset_tagged_corpus_name
        or die "$0: $offset_tagged_corpus_name: $!\n";
    binmode($tagged_fh, ":utf8");

    # Characters allowed right before and right after a term
    my $separator = qr/[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+]/;

    foreach my $counter (keys %$ref_tabh_idtrm_select) {
        my $term_regex = $ref_regex_termlist->[$counter];

        my $termField2  = defined $ref_termlist->[$counter]->[$termField] ? $termField : 0;
        my $term_length = length($ref_termlist->[$counter]->[$termField2]);

        my $case_sensitive_ok = (defined $caseSensitive)
            && (($caseSensitive == 0) || ($term_length <= $caseSensitive));
        my $case_insensitive_ok = (!defined $caseSensitive)
            || ($caseSensitive < 0)
            || ($term_length > $caseSensitive);

        # A case-sensitive match is always a case-insensitive match as well,
        # so the insensitive search alone decides whenever it is allowed.
        # The patterns are compiled once per term, not once per sentence.
        my @contexts;
        if ($case_insensitive_ok || $case_sensitive_ok) {
            my $term_re = $case_insensitive_ok ? qr/$term_regex/i : qr/$term_regex/;
            @contexts = (
                qr/$separator$term_re$separator/,
                qr/^$term_re$separator/,
                qr/$separator$term_re$/,
                qr/^$term_re$/,
            );
        }

        foreach my $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}) {
            my $line = $ref_tabh_corpus->{$sent_id}->{'line'};
            print STDERR ".";
            foreach my $context (@contexts) {
                if ($line =~ $context) {
                    printMatchingTerm($tagged_fh, $ref_termlist->[$counter], $sent_id);
                }
            }
        }
        print STDERR "\n";
    }

    # Buffered write errors only show up when the handle is closed
    close $tagged_fh
        or warn "$0: $offset_tagged_corpus_name: $!\n";

    #########################################################################################################
    warn "\nEnd of term tagging\n";
}
# Write one tagging result line: the sentence id followed by all the fields
# of the term entry, separated by tabulations and ended by a newline.
#
# Parameters:
#   $descriptor        - output file handle
#   $ref_matching_term - array ref holding the fields of the matching term
#   $sent_id           - id of the sentence where the term was found
#
# Undefined fields are written as empty strings.
#
# No prototype is declared: the sub takes three arguments, and an empty
# prototype would turn every call compiled after this definition into a
# compile-time error.
sub printMatchingTerm {
    my ($descriptor, $ref_matching_term, $sent_id) = @_;

    my @fields = map { defined $_ ? $_ : '' } @$ref_matching_term;

    print {$descriptor} "$sent_id\t" . join("\t", @fields) . "\n";
}
sub term_tagging_offset_tab {
my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $ref_tab_results, $caseSensitive, $termField) = @_;
my $counter;
my $term_regex;
my $sent_id;
lib/Alvis/TermTagger.pm view on Meta::CPAN
$termField2 = $termField;
}
foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
$line = $ref_tabh_corpus->{$sent_id}->{'line'};
# warn "$line\n$term_regex\n";
if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+](?<term>$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
(((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+](?<term>$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
}
if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
($line =~ /^(?<term>$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
(((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
($line =~ /^(?<term>$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
}
if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+](?<term>$term_regex)$/s)) ||
(((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+](?<term>$term_regex)$/is))) {
printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
}
if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
($line =~ /^(?<term>$term_regex)$/s)) ||
(((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
($line =~ /^(?<term>$term_regex)$/is))) {
printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
}
}
$i++;
}
print STDERR "\n";
#########################################################################################################
warn "\nEnd of term tagging\n";
}
sub term_tagging_offset_brat {
my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $offset_tagged_corpus_name, $caseSensitive, $termField) = @_;
my $counter;
my $term_regex;
my $sent_id;
my $line;
my $i;
my $size_termselect = scalar(keys %$ref_tabh_idtrm_select);
my $termField2;
my $termId = 1;
my $offset;
my $currOffset;
$i = 0;
warn "Term tagging ($offset_tagged_corpus_name)\n";
open TAGGEDCORPUS, ">$offset_tagged_corpus_name" or die "$0: $offset_tagged_corpus_name: No such file\n";
binmode(TAGGEDCORPUS, ":utf8");
if (!defined $termField) {
$termField = 0;
}
# XXX - ABREVIATION - XXX => regex
# warn "====> $caseSensitive\n";
foreach $counter (keys %$ref_tabh_idtrm_select) {
# printf STDERR "Term tagging... %0.1f%%\r", ($i/$size_termselect)*100 ;
$term_regex = $ref_regex_termlist->[$counter];
# warn "counter: $counter ($term_regex)\n";
$termField2 = 0;
if (defined $ref_termlist->[$counter]->[$termField]) {
$termField2 = $termField;
}
foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
$line = $ref_tabh_corpus->{$sent_id}->{'line'};
$offset = $ref_tabh_corpus->{$sent_id}->{'offset'};
# warn "$line\n$term_regex\n";
# warn "$line\n$offset\n";
if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
($line =~ /(?<before>[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+])(?<term>$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
(((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
($line =~ /(?<before>[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+])(?<term>$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
$currOffset = $offset+length($`)+length($+{before});
print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset + length($+{term}),$ref_termlist->[$counter]->[2]);
}
if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
($line =~ /^(?<term>$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
(((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
($line =~ /^(?<term>$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
$currOffset = $offset+length($`);
print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset + length($+{term}),$ref_termlist->[$counter]->[2]);
}
if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
($line =~ /(?<before>[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+])(?<term>$term_regex)$/s)) ||
(((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
($line =~ /(?<before>[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+])(?<term>$term_regex)$/is))) {
$currOffset = $offset+length($`)+length($+{before});
print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset+length($+{term}),$ref_termlist->[$counter]->[2]);
}
if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
($line =~ /^(?<term>$term_regex)$/s)) ||
(((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
($line =~ /^(?<term>$term_regex)$/is))) {
$currOffset = $offset+length($`);
print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset+length($+{term}),$ref_termlist->[$counter]->[2]);
}
}
$i++;
}
print STDERR "\n";
close TAGGEDCORPUS;
( run in 2.008 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )