Alvis-QueryFilter

 view release on metacpan or  search on metacpan

bin/run_QF.pl  view on Meta::CPAN

=head1 DATA

All resources have one entry per line, and each entry has fields that are tab delimited.  Spacing within a field should be standardised to single spaces.  The "types" file should be nonexistent if named entities are also listed as having ontology no...

<AlvisDir>/resources/lemmas :   Lists (text-occurrence,lemma-form) for lemmatising words.

<AlvisDir>/resources/NEs :   Lists (text-occurrence,canonical-form) for matching named entities.

<AlvisDir>/resources/onto_nodes :    Lists (canonical-form,ontology-node) for matching lemmas, terms and named entities that are located in the ontology.

<AlvisDir>/resources/onto_paths :  Lists (ontology-node,ontology-path) giving fully expanded path for each node.

<AlvisDir>/resources/terms :   Lists (text-occurrence,canonical-form) for matching terms.

<AlvisDir>/resources/types :   Lists (canonical-form,type) for named entities.  Types are short text items (e.g., 'species', 'company', 'person') used to categorise named entities when no ontology is in use.

Entries in "NEs" and "terms" are applied as rules to query words, with longest match applying first.  Once all these are done, the typing or ontology forms are applied.

Resources are best manipulated and imported/exported as a
single XML file using the routines of
B<zebractl>(1).

lib/Alvis/QueryFilter.pm  view on Meta::CPAN

    close(F);

    return \%dict;
}

sub transform   # testing/debugging aid only
{
    # $query_words is a reference to a list of word forms.
    my ($self, $query_words) = @_;

    # Expand the raw word forms first; the bookkeeping fields below are
    # deliberately filled in only after the expansion has run.
    my $expanded = $self->_expand_qword_list($query_words);

    # Record the incoming query and reset the final form before
    # rendering, since the XML renderer is invoked on the same object.
    @{$self}{qw(queryForm finalForm)} = ($query_words, "");

    # NOTE(review): the XML builder visible elsewhere in this file calls
    # _expand_qword_list on each element of its argument -- confirm that
    # handing it an already-expanded structure is what is intended here.
    my $xml = $self->_data_struct2XML($expanded);

    return $xml;
}

#
# Given a list of word forms, expand
#
sub _expand_qword_list
{
    my $self=shift;
    my $query=shift; # list of word forms

    # print STDERR "Q: " . Dumper($query) . "\n";

    my $lemmatized_by_tagger=$self->_apply_treetagger($query);
    if (!defined($lemmatized_by_tagger))
    {
	$self->_set_err_state($ERR_TREETAGGER);

lib/Alvis/QueryFilter.pm  view on Meta::CPAN

    my $lemmatized=
	$self->_apply_lemma_dict($lemmatized_by_tagger); # if one exists
    if (!defined($lemmatized))
    {
	$self->_set_err_state($ERR_LEMMA_DICT);
	return undef;
    }
    
    # print STDERR "LEMTAG: " . Dumper($lemmatized) . "\n";
    
    my $term_NE_expanded=$self->_apply_terms_and_NEs($lemmatized);
    if (!defined($term_NE_expanded))
    {
	$self->_set_err_state($ERR_APPLYING_TERM_NE);
	return undef;
    }
    # print STDERR "TERM: " . Dumper($term_NE_expanded) . "\n";
    
    
    my $typing_expanded=$self->_apply_typing_rules($term_NE_expanded); 
    if (!defined($typing_expanded))
    {
	$self->_set_err_state($ERR_APPLYING_TYPING);
	return undef;
    }

    my $onto_expanded=$self->_apply_onto($typing_expanded); 
    if (!defined($onto_expanded))
    {
	$self->_set_err_state($ERR_APPLYING_ONTO);
	return undef;
    }
    # print STDERR "FINAL: " . Dumper($onto_expanded) . "\n";

    return $onto_expanded;
}

#  extract query from SRU
sub UI2Query
{
    my $self=shift;
    my $SRU=shift;
    if ( /&query=([^\&]*)/ ) {
      return $1;
    }

lib/Alvis/QueryFilter.pm  view on Meta::CPAN

}

#
#  UI ---> Zebra  middle man
#
sub UI2Zebra
{
    my $self=shift;
    my $SRU=shift;

    my @expanded_SRU=();

    # extract the query
    my $query;
    my @p=split(/\&/,$SRU,-1);
    for my $p (@p)
    {
	if ($p=~/^query=(.*)$/)
	{
	    $query=$1;
	}
	else
	{
	    push(@expanded_SRU,$p); # so we can reconstruct
	}
    }
    if (!defined($query))
    {
	$self->_set_err_state($ERR_NO_QUERY,"SRU:\"$SRU\"");
	return undef;
    }
    $self->{queryForm} = $query;
    $self->{queryForm} =~ s/\&/\&amp;/g;
    $self->{queryForm} =~ s/</\&lt;/g;

lib/Alvis/QueryFilter.pm  view on Meta::CPAN

    }

    # print STDERR "QQ##$query##$cats##$CQL_tail\n";

    #$query='%28' . $query . '%29%20and%20' $CQL_tail;
    $query=$CQL_tail;
    if ( $cats ) {
      $query .= '%20and%20' . $cats;
    } 

    push(@expanded_SRU,"query=$query");

    $self->{finalForm} = $query;
    $self->{finalForm} =~ s/\&/\&amp;/g;
    $self->{finalForm} =~ s/</\&lt;/g;
    $self->{finalForm} =~ s/>/\&gt;/g;

    return join('&',@expanded_SRU);
}

#
#  Zebra ---> UI  middle man
#
sub Zebra2UI
{
    my $self=shift;
    my $SRU_response=shift; 
    

lib/Alvis/QueryFilter.pm  view on Meta::CPAN

#
sub _data_struct2CQLtail
{
    my $self=shift;
    my $seq_list=shift;

    my $query;
    my @seq_items=();
    for my $seq (@$seq_list)
    {
	my $ds=$self->_expand_qword_list($seq);

	my @items=();

	for (my $i=0;$i<scalar(@$ds);$i++)
	{
	    my ($token,$POS,$lemma,$max_type,$match_can_form,$pathtype)
	      = @{$ds->[$i]};

	    if ( $POS eq 'INDEX' && $token =~ /^([a-z0-9\-\_\.]+)=(.*)/ ) {
		push(@items,"$1%3D%22$2%22");

lib/Alvis/QueryFilter.pm  view on Meta::CPAN

    my $seq_list=shift;

    my $XML = "<filter>\n <input>" . $self->{queryForm} . "</input>\n";

#    Why was this here in the first place?
#    $XML.="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";

    my @seq_items=();
    for my $seq (@$seq_list)
    {
	my $ds=$self->_expand_qword_list($seq);

	$XML.="<query xmlns=\"http://alvis.info/query/\"\n";
	$XML.="     form=\"" . join(' ',@$seq) . "\" >\n";

	for (my $i=0;$i<scalar(@$ds);$i++)
	{
	    my ($token,$POS,$lemma,$max_type,$match_can_form,$pathtype)
	      = @{$ds->[$i]};

	    if (defined($max_type))

lib/Alvis/QueryFilter.pm  view on Meta::CPAN

$lemma_dict_f :   Lists (text-occurrence,lemma,part-of-speech) for lemmatising to be done on words left as unknown by the Treetagger.  The part of speech is just annotation, so not used.

$term_dict_f :    Lists (text-occurrence,canonical-form) for terms.

$NE_dict_f :   Lists (text-occurrence,canonical-form) for named entities.

$typing_rules_f :    Lists (canonical-form,type) for named entities.  Types are short text items (e.g., 'species', 'company', 'person') used to categorise named entities when no ontology is in use.

$onto_nodes_f :    Lists (canonical-form,ontology-node) for terms and named entities that are located in the ontology.  If named entities occur here, $typing_rules_f should be empty.

$onto_mapping_f :    Lists (ontology-node,ontology-path) giving fully expanded path for each node.

Entries in "NEs" and "terms" are applied as rules to query words, with longest match applying first.  Once all these are done, the typing or ontology forms are applied.


=head2 set_canon()

Sets the functions used to convert terms and names to a canonical
form that will be used when matching against dictionaries. Call before
reading dictionaries.  This can be used to handle common elements of term
matching such as (possibly dangerously) ignoring dashes.



( run in 0.716 second using v1.01-cache-2.11-cpan-5b529ec07f3 )