Alvis-QueryFilter
view release on metacpan or search on metacpan
lib/Alvis/QueryFilter.pm view on Meta::CPAN
return \%dict;
}
sub _read_onto_nodes
{
my $self=shift;
my $f=shift;
my %dict=();
if (!defined(open(F,"<:utf8",$f)))
{
return undef;
}
while (my $l=<F>)
{
chomp $l;
my ($form,$onto_node)=split(/\t/,$l,-1);
$form = &cleanspaces($form);
$onto_node = &cleanspaces($onto_node);
$dict{$form}=$onto_node;
}
close(F);
return \%dict;
}
sub _read_onto_mapping
{
my $self=shift;
my $f=shift;
my %dict=();
if (!defined(open(F,"<:utf8",$f)))
{
return undef;
}
while (my $l=<F>)
{
chomp $l;
my ($node,$path)=split(/\t/,$l,-1);
$node = &cleanspaces($node);
$path = &cleanspaces($path);
$dict{$node}=$path;
}
close(F);
return \%dict;
}
sub transform # just for testing and debugging
{
my $self=shift;
my $query=shift; # list of word forms
my $expanded_query_struct=$self->_expand_qword_list($query);
$self->{queryForm} = $query;
$self->{finalForm} = "";
my $query_XML=$self->_data_struct2XML($expanded_query_struct);
return $query_XML;
}
#
# Given a list of word forms, expand
#
sub _expand_qword_list
{
my $self=shift;
my $query=shift; # list of word forms
# print STDERR "Q: " . Dumper($query) . "\n";
my $lemmatized_by_tagger=$self->_apply_treetagger($query);
if (!defined($lemmatized_by_tagger))
{
$self->_set_err_state($ERR_TREETAGGER);
return undef;
}
# print STDERR "LEM: " . Dumper($lemmatized_by_tagger) . "\n";
my $lemmatized=
$self->_apply_lemma_dict($lemmatized_by_tagger); # if one exists
if (!defined($lemmatized))
{
$self->_set_err_state($ERR_LEMMA_DICT);
return undef;
}
# print STDERR "LEMTAG: " . Dumper($lemmatized) . "\n";
my $term_NE_expanded=$self->_apply_terms_and_NEs($lemmatized);
if (!defined($term_NE_expanded))
{
$self->_set_err_state($ERR_APPLYING_TERM_NE);
return undef;
}
# print STDERR "TERM: " . Dumper($term_NE_expanded) . "\n";
my $typing_expanded=$self->_apply_typing_rules($term_NE_expanded);
if (!defined($typing_expanded))
{
$self->_set_err_state($ERR_APPLYING_TYPING);
return undef;
}
my $onto_expanded=$self->_apply_onto($typing_expanded);
if (!defined($onto_expanded))
{
$self->_set_err_state($ERR_APPLYING_ONTO);
return undef;
}
# print STDERR "FINAL: " . Dumper($onto_expanded) . "\n";
return $onto_expanded;
}
# extract query from SRU
sub UI2Query
{
my $self=shift;
my $SRU=shift;
if ( /&query=([^\&]*)/ ) {
return $1;
}
return "";
}
#
# UI ---> Zebra middle man
#
sub UI2Zebra
{
my $self=shift;
my $SRU=shift;
my @expanded_SRU=();
# extract the query
my $query;
my @p=split(/\&/,$SRU,-1);
for my $p (@p)
{
if ($p=~/^query=(.*)$/)
{
$query=$1;
}
else
{
push(@expanded_SRU,$p); # so we can reconstruct
}
}
if (!defined($query))
{
$self->_set_err_state($ERR_NO_QUERY,"SRU:\"$SRU\"");
return undef;
}
$self->{queryForm} = $query;
$self->{queryForm} =~ s/\&/\&/g;
$self->{queryForm} =~ s/</\</g;
$self->{queryForm} =~ s/>/\>/g;
$self->{finalForm} = "";
# decode percentage notation
my $query_copy=$query;
$query_copy=uri_unescape($query_copy);
# parse the CQL
my $parse_tree;
eval
{
$parse_tree=$self->{CQLParser}->parse($query_copy);
};
if ($@)
{
chomp($query);
$@=~s/(.*) at .* line [0-9]+\n/$1/o;
$self->_set_err_state($ERR_CQL_PARSE,"Query:\"$query\".");
return undef;
}
# Get a list of all possible text query word sequences (so this is
# implicitly an OR of them)
#
my $t_qwords=[[]]; # help variable used in the recursion
my $seq_list=
$self->_get_text_qwords($parse_tree,$t_qwords);
if (!defined($seq_list))
{
$self->_set_err_state($ERR_CREATING_SEQ_LIST);
return undef;
}
# Get the categorising tail anded to the end
#
my $cats=&get_categories($parse_tree);
if (!defined($cats))
{
$self->_set_err_state($ERR_CREATING_CAT_LIST);
return undef;
}
# Important! Used in the following Zebra2UI, 'cause the
# SRU response has nothing about the query.
# So...if used out of sync/with more than one client...kaboom!
# Not my problem.
$self->{currSeqList}=$seq_list;
# print STDERR "Term elements: " . Dumper($seq_list) . "\n";
#
# Ok, create the 'tail' i.e. what we AND to the original query
# as an OR of possible expansions
#
my $CQL_tail=$self->_data_struct2CQLtail($seq_list);
if (!defined($CQL_tail))
{
$self->_set_err_state($ERR_CREATING_CQL_TAIL);
return undef;
}
# print STDERR "QQ##$query##$cats##$CQL_tail\n";
#$query='%28' . $query . '%29%20and%20' $CQL_tail;
$query=$CQL_tail;
if ( $cats ) {
$query .= '%20and%20' . $cats;
}
push(@expanded_SRU,"query=$query");
$self->{finalForm} = $query;
$self->{finalForm} =~ s/\&/\&/g;
$self->{finalForm} =~ s/</\</g;
$self->{finalForm} =~ s/>/\>/g;
return join('&',@expanded_SRU);
}
#
# Zebra ---> UI middle man
#
sub Zebra2UI
{
my $self=shift;
my $SRU_response=shift;
# We need to know what the query was! It's not in the response.
# Of course this is bloody dangerous if we get out of sync or
# have more than 1 client.
#
if (!defined($self->{currSeqList}))
{
$self->_set_err_state($ERR_NO_SEQ_LIST);
return undef;
}
#
# Just convert it to our XML format to put into extraResponseData
# I chose to just catenate <query> elements as an implicit OR..
#
my $query_XML=$self->_data_struct2XML($self->{currSeqList});
if (!defined($query_XML))
{
$self->_set_err_state($ERR_STRUCT2XML);
return undef;
}
${$SRU_response} =~ s/<\/zs:searchRetrieveResponse>/<zs:extraResponseData>$query_XML<\/zs:extraResponseData><\/zs:searchRetrieveResponse>/;
return 1;
}
#
# Recursive CQL parse tree traversal, results in picking out the relevant
# text query words in order. Too tired to explain.
#
sub _get_text_qwords
{
my $self=shift;
my $CQL_parse_node=shift;
my $text_qwords=shift;
my ($text_qwords_l,$text_qwords_r);
# print STDERR "ENTRY:",Dumper($text_qwords);
if ($CQL_parse_node->isa("CQL::AndNode"))
{
# warn "AND";
$text_qwords_l=$self->_get_text_qwords($CQL_parse_node->left(),
$text_qwords);
$text_qwords_r=$self->_get_text_qwords($CQL_parse_node->right(),
$text_qwords_l);
return $text_qwords_r;
lib/Alvis/QueryFilter.pm view on Meta::CPAN
# the ANDed category part at the end
#
sub get_categories
{
my $CQL_parse_node=shift;
my ($text_catq_l,$text_catq_r);
# print STDERR "ENTRY:",Dumper($text_catq);
if ($CQL_parse_node->isa("CQL::AndNode"))
{
# warn "AND";
$text_catq_l=&get_categories($CQL_parse_node->left());
$text_catq_r=&get_categories($CQL_parse_node->right());
if ( $text_catq_l && $text_catq_r ) {
return $text_catq_l . ' and ' .$text_catq_r;
}
return $text_catq_l . $text_catq_r;
}
elsif ($CQL_parse_node->isa("CQL::OrNode"))
{
return "";
}
elsif ($CQL_parse_node->isa("CQL::NotNode"))
{
return "";
}
elsif ($CQL_parse_node->isa("CQL::TermNode"))
{
# warn "TERM";
my $qualifier=$CQL_parse_node->getQualifier();
# Our partial hack solution: if it contains a space, leave as is.
# Wray's hack on hack - keep space stuff, and deal with it differently
if ($qualifier ne 'text' && $qualifier ne 'srw.ServerChoice') {
return $CQL_parse_node->toCQL();
} else {
return "";
}
}
return "";
}
#
# Converts our expansion data structure to a CQL "tail"
#
sub _data_struct2CQLtail
{
my $self=shift;
my $seq_list=shift;
my $query;
my @seq_items=();
for my $seq (@$seq_list)
{
my $ds=$self->_expand_qword_list($seq);
my @items=();
for (my $i=0;$i<scalar(@$ds);$i++)
{
my ($token,$POS,$lemma,$max_type,$match_can_form,$pathtype)
= @{$ds->[$i]};
if ( $POS eq 'INDEX' && $token =~ /^([a-z0-9\-\_\.]+)=(.*)/ ) {
push(@items,"$1%3D%22$2%22");
} elsif (defined($max_type))
{
if ($max_type eq 'term_dict')
{
my $surface_form=$token;
my $can_form=$match_can_form;
my $onto=$pathtype;
my $j;
for ($j=$i+1;$j<scalar(@$ds);$j++)
{
my ($token,$POS,$lemma,$max_type,$match_can_form,
$onto_path)
=@{$ds->[$j]};
if (!defined($max_type) || $max_type ne 'term_dict'
|| !defined($match_can_form) || $can_form ne $match_can_form )
{
last;
}
$surface_form.=" $token";
$onto=$onto_path;
}
if ( defined($onto) && $onto ne "" )
{
push(@items,"term%3D%22$onto$can_form%22"); # unclear
}
else
{
push(@items,"term%3D%22$can_form%22");
}
$i=$j-1;
}
elsif ($max_type eq 'NE_dict')
{
my $surface_form=$token;
my $can_form=$match_can_form;
my $type=$pathtype;
my $j;
for ($j=$i+1;$j<scalar(@$ds);$j++)
{
my ($token,$POS,$lemma,$max_type,$match_can_form,$NE_type)
=@{$ds->[$j]};
if (!defined($max_type) || $max_type ne 'NE_dict'
|| !defined($match_can_form) || $can_form ne $match_can_form )
{
last;
}
$surface_form.=" $token";
$type = $NE_type;
lib/Alvis/QueryFilter.pm view on Meta::CPAN
$i=$j-1;
}
}
elsif (defined($lemma) && $POS =~ /$self->{lemmaSearch}/o
&& $self->{keepLemmas})
{
push(@items,"lemma%3D%22$lemma%22");
}
else {
push(@items,$self->_make_term($token));
}
}
push(@seq_items,"%28" . join('%20and%20',@items) . "%29");
}
if ( scalar(@seq_items) <= 1 ) {
$query = $seq_items[0];
} else {
$query = "%28" . join('%20or%20',@seq_items) . "%29";
}
return $query;
}
sub _make_term
{
my $self=shift;
my $term=shift;
if ( $term !~/^\"/ || $term !~/\"$/ ) {
$term="\"$term\"";
}
if ( $self->{textFields} =~/ / ) {
my $result = "";
foreach my $f ( split(/ /,$self->{textFields}) ) {
$result .= " or $f%3D$term";
}
$result =~ s/^ or //;
return "($result)";
}
return $self->{textFields} . "%3D$term";
}
#
# Converts our expansion data structure to XML that fits extraResponseData
#
sub _data_struct2XML
{
my $self=shift;
my $seq_list=shift;
my $XML = "<filter>\n <input>" . $self->{queryForm} . "</input>\n";
# Why was this here in the first place?
# $XML.="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
my @seq_items=();
for my $seq (@$seq_list)
{
my $ds=$self->_expand_qword_list($seq);
$XML.="<query xmlns=\"http://alvis.info/query/\"\n";
$XML.=" form=\"" . join(' ',@$seq) . "\" >\n";
for (my $i=0;$i<scalar(@$ds);$i++)
{
my ($token,$POS,$lemma,$max_type,$match_can_form,$pathtype)
= @{$ds->[$i]};
if (defined($max_type))
{
if ($max_type eq 'term_dict')
{
my $surface_form=$token;
my $can_form=$match_can_form;
my $onto=$pathtype;
my $j;
for ($j=$i+1;$j<scalar(@$ds);$j++)
{
my ($token,$POS,$lemma,$max_type,$match_can_form,
$onto_path)
=@{$ds->[$j]};
if (!defined($max_type) || $max_type ne 'term_dict'
|| $match_can_form ne $can_form)
{
last;
}
$surface_form.=" $token";
$onto=$onto_path;
}
$XML.="<term>\n";
$XML.=" <form>$surface_form</form>\n";
$XML.=" <canonical_form>$can_form</canonical_form>\n";
if ( defined($onto) && $onto ne "" )
{
$onto =~ s/\/$//;
$XML .= " <ontology_path>$onto</ontology_path>\n";
}
$XML.="</term>\n";
$i=$j-1;
}
elsif ($max_type eq 'NE_dict')
{
my $surface_form=$token;
my $can_form=$match_can_form;
my $type=$pathtype;
my $j;
for ($j=$i+1;$j<scalar(@$ds);$j++)
{
my ($token,$POS,$lemma,$max_type,$match_can_form,$NE_type)
=@{$ds->[$j]};
if ($max_type ne 'NE_dict' || $match_can_form ne $can_form)
{
last;
}
$surface_form.=" $token";
$type=$NE_type;
}
$XML.="<named_entity>\n";
$XML.=" <form>$surface_form</form>\n";
lib/Alvis/QueryFilter.pm view on Meta::CPAN
SRU server. Queries are first lemmatised by the Treetagger,
and then translated according to rules in a
set of dictionaries, and then fed to an SRU server. The
results then have the query translation data added into the
<extraResponseData> field.
Query translation uses a specific scheme for creating field
names to use, and these fields are supported by the underlying SRU server.
Words in double quotes are left as is.
The remaining words are lemmatised by the Treetagger and
contiguous sequences match the term and named entity rules.
Terms recognised in the input query will generate a
I<term="words"> entry in the transformed query.
If an ontology node exists for them,
the corresponding ontology path will be prepended giving
I<term="onto-path/words"> entry.
Named entities recognised in the
input query, where ontologies are applied, will generate
a
I<entity="words"> or
I<entity="onto-path/words"> entry.
When typing is used for named entities, a
I<entity-type="words"> entry is made.
Words that are not used in either terms or named entities,
that are lemmatised create a
I<lemma="word"> entry.
=head1 METHODS
=head2 new()
Create object.
my $QF=Alvis::QueryFilter->new();
=head2 read_dicts()
Sets the filenames for the linguistic resources, and loads them up.
Must be called once at the start.
if (!$QF->read_dicts($lemma_dict_f, $term_dict_f, $NE_dict_f,
$typing_rules_f, $onto_nodes_f, $onto_mapping_f)) {
die("Reading the dictionaries failed.");
}
Dictionary rules apply to the lemmatised forms after the Treetagger has been used.
$lemma_dict_f : Lists (text-occurence,lemma,part-of-speech) for lemmatising to be done on words left as unknown by the Treetagger. The part of speech is just annotation, so not used.
$term_dict_f : Lists (text-occurence,canonical-form) for terms.
$NE_dict_f : Lists (text-occurence,canonical-form) for named entities.
$typing_rules_f : Lists (canonical-form,type) for named entities. Types are short text items (e.g., 'species', 'company', 'person') used to categorise named entities when no ontology is in use.
$onto_nodes_f : Lists (canonical-form,ontology-node) for terms and named entities that are located in the ontology. If named entities occur here, $typing_rules_f should be empty.
$onto_mapping_f : Lists (ontology-node,ontology-path) giving fully expanded path for each node.
Entries in "NEs" and "terms" are applied as rules to query words, with longest match applying first. Once all these are done, the typing or ontology forms are applied.
=head2 set_canon()
Sets the functions used to convert terms and names to a canonical
form that will be used when matching against dictionaries. Call before
reading dictionaries. This can be used to handle comment elements of term
matching such as (possibly dangerously) ignoring dashes.
sub termcanonise { $_ = lc(shift()); s/[\s\-]//g; return $_; }
sub namecanonise { $_ = shift(); s/[\s\-\.]//g; return $_; }
$QF->set_canon(\&termcanonise,\&namecanonise);
=head2 set_lemma()
Sets the match field to identify whether a lemma located by Treetagger
should be searched in
I<lemma> indexes or
I<text> indexes.
$QF->set_lemma("^[NVJ]");
=head2 set_text_fields()
Sets the text fields expected of CQL output. Call before
reading dictionaries.
$QF->set_text_fields("text anchortext dc.title");
Fields are extracted by splitting on spaces.
The query filter assumes unfielded query terms are with the CQL field
"text", and any other fields should only occur conjoined to the
end of the query (i.e., not inside any other Boolean constructs).
On output, and with the above call to &set_text_fields(),
every CQL terminal node of form text="words" will
be translated into the disjunct:
( text="words" OR anchortext="words" OR dc.title="words" )
=head2 UI2Zebra()
Convert SRU request/input received from your HTTP server, for instance, and
do the query translation to generate a new SRU request ready
to send to the real SRU server. Details of the query mapping are
stored with the object for later use by Zebra2UI().
my $ToZebra=$QF->UI2Zebra($SRU);
my $ua = LWP::UserAgent->new;
my $response = $ua->get("http://localhost:10000/$ToZebra");
=head2 Zebra2UI()
Filter the XML-wrapped as a HTTP response, received from
the real SRU server to add the query translation data into the
<extraResponseData> field as a <filter> entry.
The argument is a reference to the response text.
( run in 0.977 second using v1.01-cache-2.11-cpan-5b529ec07f3 )