Alvis-QueryFilter
view release on metacpan or search on metacpan
lib/Alvis/QueryFilter.pm view on Meta::CPAN
$self->{term_dict}=$self->_read_term_dict($term_dict_f);
}
else
{
undef $self->{term_dict};
}
if (defined($NE_dict_f))
{
$self->{NE_dict}=$self->_read_NE_dict($NE_dict_f);
}
else
{
undef $self->{NE_dict};
}
if (defined($typing_rules_f))
{
$self->{typing_rules}=$self->_read_typing_rules($typing_rules_f);
}
else
{
undef $self->{typing_rules};
}
if (defined($onto_nodes_f))
{
$self->{onto_nodes}=$self->_read_onto_nodes($onto_nodes_f);
}
else
{
undef $self->{onto_nodes};
}
if (defined($onto_mapping_f))
{
$self->{onto_paths}=$self->_read_onto_mapping($onto_mapping_f);
}
else
{
undef $self->{onto_paths};
}
# print STDERR " Term dict. check: 'Northern blot' -> " .
# $self->{term_dict}->{&canonise_def('Northern blot')} . "\n";
return 1;
}
# Normalise the whitespace of a string: every run of whitespace is collapsed
# to a single space, and a leading or trailing space is removed.
#
# Argument: the string to clean.
# Returns:  the cleaned copy.  An undefined argument is returned as undef,
#           without emitting "uninitialized" warnings.
#
# The work is done on a lexical copy.  Assigning to the global $_ here would
# overwrite the caller's current element whenever this sub is called from
# inside a for/map/grep that iterates over $_.
# No prototype is declared: the sub takes one argument, and the
# &cleanspaces(...) call form used in this file ignores prototypes anyway.
sub cleanspaces {
    my $text = shift;
    return $text unless defined $text;

    $text =~ s/\s+/ /g;    # collapse each whitespace run into one space
    $text =~ s/^ //;       # after collapsing, at most one leading space remains
    $text =~ s/ $//;       # ... and at most one trailing space
    return $text;
}
# Read a lemma dictionary from the file $f, opened with the :utf8 layer.
#
# Each line is split on tabs into three fields: word form, lemma and part of
# speech.  Form and lemma are whitespace-normalised with cleanspaces(); the
# form is also lower-cased to build the lookup key.  The part-of-speech field
# is stored as read.
#
# Returns a reference to a hash of the shape
#     { lc(form) => { lemma => ..., POS => ... }, ... }
# or undef if the file cannot be opened.  When the same lower-cased form
# occurs on several lines, the last line wins.
sub _read_lemma_dict
{
    my $self=shift;
    my $f=shift;

    # A lexical handle is used rather than a bareword (package-global) one,
    # so this reader cannot interfere with any other handle of the same name
    # and the file is released even if the sub is left early.
    open(my $fh,"<:utf8",$f) or return undef;

    my %dict=();
    while (my $line=<$fh>)
    {
        chomp $line;
        # Limit -1 keeps trailing empty fields, e.g. an empty POS column.
        my ($form,$lemma,$pos)=split(/\t/,$line,-1);
        my $key=lc(&cleanspaces($form));
        $dict{$key}{lemma}=&cleanspaces($lemma);
        $dict{$key}{POS}=$pos;
    }
    close($fh);

    return \%dict;
}
# Default method to standardise terms and named entities: all whitespace and
# all '-' characters are removed and the result is lower-cased, so that e.g.
# 'Northern blot' becomes 'northernblot'.
#
# Argument: the string to canonise.
# Returns:  the canonical form; an undefined argument yields the empty string.
#
# The string is processed in a lexical copy so that the caller's $_ is left
# alone (assigning to the global $_ would overwrite the current element of an
# enclosing for/map/grep), and the result is returned explicitly.
sub canonise_def {
    my $text = shift;
    $text = '' unless defined $text;

    $text =~ s/\s+//g;    # ignore spaces
    $text =~ s/\-//g;     # ignore hyphens
    return lc($text);
}
# Read a term dictionary from the file $f, opened with the :utf8 layer.
#
# Each line is split on tabs into a term form and its canonical form; both
# are whitespace-normalised with cleanspaces().  The lookup key is the form
# passed through the code reference stored in $self->{tcanon}.
#
# Side effects:
#   * $self->{termMaxLen} is raised to the largest number of space-separated
#     words found in any form of this file, if that exceeds its current value.
#   * When the file-level $verbose flag is true, a message is printed to
#     STDERR for every key that is redefined with a different canonical form.
#
# Returns a reference to the hash { key => canonical form } (the last line
# for a given key wins), or undef if the file cannot be opened.
sub _read_term_dict
{
    my $self=shift;
    my $f=shift;

    # A lexical handle is used rather than a bareword (package-global) one,
    # so this reader cannot interfere with any other handle of the same name.
    open(my $fh,"<:utf8",$f) or return undef;

    my %dict=();
    my $term_max_len=0;
    while (my $line=<$fh>)
    {
        chomp $line;
        my ($form,$can)=split(/\t/,$line,-1);
        $form=&cleanspaces($form);
        $can=&cleanspaces($can);
        my $cf=$self->{tcanon}->($form);
        if ($verbose && defined($dict{$cf}) && $dict{$cf} ne $can)
        {
            print STDERR "Term of form '$form' has canonical form '$can'\n"
                . " but maps to the another canonical form '$dict{$cf}'\n";
        }
        $dict{$cf}=$can;

        # cleanspaces() leaves single spaces only, so this counts the words.
        my @words=split(/ /,$form);
        if (scalar(@words)>$term_max_len)
        {
            $term_max_len=scalar(@words);
        }
    }
    close($fh);

    # Treat a not-yet-initialised maximum as 0 instead of comparing undef.
    my $known_max=defined($self->{termMaxLen}) ? $self->{termMaxLen} : 0;
    if ($known_max<$term_max_len)
    {
        $self->{termMaxLen}=$term_max_len;
    }

    return \%dict;
}
# Read a named-entity (NE) dictionary from the file $f, opened with the
# :utf8 layer.
#
# Each line is split on tabs into an NE form and its canonical form; both
# are whitespace-normalised with cleanspaces().  The lookup key is the form
# passed through the code reference stored in $self->{ncanon}.
#
# Side effects:
#   * $self->{termMaxLen} is raised to the largest number of
#     whitespace-separated words found in any form of this file, if that
#     exceeds its current value.
#   * When the file-level $verbose flag is true, a message is printed to
#     STDERR for every key that is redefined with a different canonical form.
#
# Returns a reference to the hash { key => canonical form } (the last line
# for a given key wins), or undef if the file cannot be opened.
sub _read_NE_dict
{
    my $self=shift;
    my $f=shift;

    # A lexical handle is used rather than a bareword (package-global) one,
    # so this reader cannot interfere with any other handle of the same name.
    open(my $fh,"<:utf8",$f) or return undef;

    my %dict=();
    my $term_max_len=0;
    while (my $line=<$fh>)
    {
        chomp $line;
        my ($form,$can)=split(/\t/,$line,-1);
        $form=&cleanspaces($form);
        $can=&cleanspaces($can);
        my $cf=$self->{ncanon}->($form);
        if ($verbose && defined($dict{$cf}) && $dict{$cf} ne $can)
        {
            print STDERR "NE of form '$form' has canonical form '$can'\n"
                . " but maps to the another canonical form '$dict{$cf}'\n";
        }
        $dict{$cf}=$can;

        # Number of words in this form.
        my @words=split(/\s+/,$form);
        if (scalar(@words)>$term_max_len)
        {
            $term_max_len=scalar(@words);
        }
    }
    close($fh);

    # Treat a not-yet-initialised maximum as 0 instead of comparing undef.
    my $known_max=defined($self->{termMaxLen}) ? $self->{termMaxLen} : 0;
    if ($known_max<$term_max_len)
    {
        $self->{termMaxLen}=$term_max_len;
    }

    return \%dict;
}
# Read the typing rules from the file $f, opened with the :utf8 layer.
#
# Each line is split on tabs into a form and a type; both are
# whitespace-normalised with cleanspaces().
#
# Returns a reference to the hash { form => type } (the last line for a
# given form wins), or undef if the file cannot be opened.
sub _read_typing_rules
{
    my $self=shift;
    my $f=shift;

    # A lexical handle is used rather than a bareword (package-global) one,
    # so this reader cannot interfere with any other handle of the same name.
    open(my $fh,"<:utf8",$f) or return undef;

    my %dict=();
    while (my $line=<$fh>)
    {
        chomp $line;
        my ($form,$type)=split(/\t/,$line,-1);
        $form=&cleanspaces($form);
        $dict{$form}=&cleanspaces($type);
    }
    close($fh);

    return \%dict;
}
# Read the form-to-ontology-node table from the file $f, opened with the
# :utf8 layer.
#
# Each line is split on tabs into a form and an ontology node; both are
# whitespace-normalised with cleanspaces().
#
# Returns a reference to the hash { form => ontology node } (the last line
# for a given form wins), or undef if the file cannot be opened.
sub _read_onto_nodes
{
    my $self=shift;
    my $f=shift;

    # A lexical handle is used rather than a bareword (package-global) one,
    # so this reader cannot interfere with any other handle of the same name.
    open(my $fh,"<:utf8",$f) or return undef;

    my %dict=();
    while (my $line=<$fh>)
    {
        chomp $line;
        my ($form,$onto_node)=split(/\t/,$line,-1);
        $form=&cleanspaces($form);
        $dict{$form}=&cleanspaces($onto_node);
    }
    close($fh);

    return \%dict;
}
# Read the ontology-node-to-path mapping from the file $f, opened with the
# :utf8 layer.
#
# Each line is split on tabs into a node and a path; both are
# whitespace-normalised with cleanspaces().
#
# Returns a reference to the hash { node => path } (the last line for a
# given node wins), or undef if the file cannot be opened.
sub _read_onto_mapping
{
    my $self=shift;
    my $f=shift;

    # A lexical handle is used rather than a bareword (package-global) one,
    # so this reader cannot interfere with any other handle of the same name.
    open(my $fh,"<:utf8",$f) or return undef;

    my %dict=();
    while (my $line=<$fh>)
    {
        chomp $line;
        my ($node,$path)=split(/\t/,$line,-1);
        $node=&cleanspaces($node);
        $dict{$node}=&cleanspaces($path);
    }
    close($fh);

    return \%dict;
}
# Testing/debugging entry point.
#
# Expands the query (a list of word forms) with _expand_qword_list(), records
# the original query in $self->{queryForm}, resets $self->{finalForm} to the
# empty string, and returns whatever _data_struct2XML() produces for the
# expanded structure.  The expansion result is handed on as is; it is not
# checked for undef here.
sub transform # just for testing and debugging
{
    my ($self,$query)=@_;    # $query: list of word forms

    # The expansion runs before the object's query state is updated.
    my $expanded=$self->_expand_qword_list($query);

    @{$self}{qw(queryForm finalForm)}=($query,"");

    my $xml=$self->_data_struct2XML($expanded);
    return $xml;
}
#
# Given a list of word forms, expand
#
sub _expand_qword_list
{
my $self=shift;
my $query=shift; # list of word forms
# print STDERR "Q: " . Dumper($query) . "\n";
my $lemmatized_by_tagger=$self->_apply_treetagger($query);
if (!defined($lemmatized_by_tagger))
{
$self->_set_err_state($ERR_TREETAGGER);
return undef;
}
# print STDERR "LEM: " . Dumper($lemmatized_by_tagger) . "\n";
my $lemmatized=
$self->_apply_lemma_dict($lemmatized_by_tagger); # if one exists
if (!defined($lemmatized))
{
$self->_set_err_state($ERR_LEMMA_DICT);
return undef;
}
( run in 1.207 second using v1.01-cache-2.11-cpan-d8267643d1d )