CWB-Web
view release on metacpan or search on metacpan
lib/CWB/Web/Search.pm view on Meta::CPAN
my $word_att = shift;
my $start_tag = shift;
my $end_tag = shift;
my $maximal_length = shift;
$maximal_length = 160 unless defined $maximal_length;
$maximal_length = int ($maximal_length / 2) + 1; # summary is symmetric to center of match
my $match = $self->[$n];
my $length = int(sqrt($match->{'variance'})*2 + 15); # default summary extends 2 * s.d. + 15 tokens to each side from center
$length = $maximal_length if $length > $maximal_length;
$length = 5 if $length < 5; # minimum size is 11 words
my @words = (); # summary as word list
# summary must be contained in context, i.e. use this matches 'first' and 'last' as hard boundaries
my $first = $match->{'center'} - $length;
my $pre_ell = "... "; # ellipsis before summary if not start of document
if ($first < $match->{'first'}) {
$first = $match->{'first'};
$pre_ell = "";
}
my $last = $match->{'center'} + $length;
my $post_ell = " ..."; # ellipsis after summary if not end of document
if ($last > $match->{'last'}) {
$last = $match->{'last'};
$post_ell = "";
}
foreach my $cpos ($first .. $last) {
my $token = encode_entities($word_att->cpos2str($cpos));
$token = $start_tag.$token.$end_tag
if (defined $match->{'keyword'}->{$cpos});
push @words, $token;
}
return $pre_ell."@words".$post_ell;
}
# (re-)compute quality values of matches
# Usage: $ml->compute_quality($no_of_keywords [, @preferred ]);
sub compute_quality {
my $self = shift;
my $no_of_keywords = shift;
my @preferred = @_; # list of s-attribute handles for preferred regions -> higher quality score
# each keyword present in the match scores 1 point; proximity of the
# keywords scores up to $proximity_points points (scaled by standard deviation)
my $proximity_points = # we'll probably want to make that configurable
($no_of_keywords > 2) ? 2 : 1;
my $preferred_points = (@preferred) ? # ... and that as well
(($no_of_keywords > 3) ? 2 : 1) : 0;
my $total_points = ($no_of_keywords - 1) + $proximity_points + $preferred_points;
foreach my $match (@$self) {
my $variance = $match->{'variance'};
$variance = 4 if $variance < 4; # scale proximity score to 0 .. $proximity_points range
my $sd = sqrt($variance/4);
my $cnt_preferred = 0; # a match gets higher score, if one or more of its keywords fall into preferred environments
foreach my $keyword (keys %{$match->{'keyword'}}) {
$cnt_preferred++
if grep { defined $_->cpos2struc($keyword) } @preferred;
}
my $quality =
(($match->{'n'} - 1) + ($proximity_points / $sd) + ($preferred_points * $cnt_preferred / $no_of_keywords))
/ $total_points;
$match->{'quality'} = int ($quality*100 + 0.5); # rounded percentage
}
}
# sort matchlist by one the fields
# 'first' ... sort by first position in match
# 'quality' ... sort by quality values (make sure you've called compute_quality() beforehand!)
sub sort {
my $self = shift;
my $method = shift;
my %sort = (
'first' => sub {return sort {$a->{'first'} <=> $b->{'first'}} @_},
'quality' => sub {return sort {$b->{'quality'} <=> $a->{'quality'}} @_},
);
croak "Usage: \$ml->sort('".join("' | '", keys %sort)."');"
unless defined $sort{$method};
@$self = &{$sort{$method}}(@$self);
}
# remove overlapping matches ("cull" the one with highest quality rating)
# NB don't forget to $ml->compute_quality; $ml->sort('first') before!
sub cull {
my $self = shift;
I_LOOP:
for (my $i = 0; $i < @$self; $i++) {
my $i_first = $self->[$i]->{'first'};
my $i_last = $self->[$i]->{'last'};
J_LOOP:
for (my $j = $i+1; $j < @$self; $j++) {
last unless $j < @$self; # check again for 'redo'
my $j_first = $self->[$j]->{'first'};
my $j_last = $self->[$j]->{'last'};
# compare i-th and j-th element of match list
last J_LOOP # assume ml is ordered by 'first' -> no more overlaps possible after this point
if $j_first > $i_last;
if ($i_last >= $j_first) {
if ($self->[$i]->{'quality'} >= $self->[$j]->{'quality'}) {
# i-th match is better -> keep it, delete j-th match
splice @$self, $j, 1;
redo J_LOOP; # must re-consider j-th match
}
else {
# j-th match is better -> delete i-th match and restart outer loop
splice @$self, $i, 1;
redo I_LOOP;
}
}
}
}
# meself had better be culled nicely now ...
}
# **
# cleanup: to be done
# find overlapping matches & select the one with higher quality
# **
# expand to desired context setting
# template for the context_expansion_func() function:
lib/CWB/Web/Search.pm view on Meta::CPAN
$m->{'data'}->{'date'};
if ($want_context) {
$m = $search->match($i, 'context');
$m->{'context'}; # match with context (HTML encoded)
}
}
undef $search;
=head1 DESCRIPTION
The I<CWB::Web::Search> module executes simple queries similar to
commercial Web search engines on CWB-encoded corpora. The I<query()> method
returns I<keywords> found in the corpus with the requested amount of
context in HTML format. Additionally, data stored in structural
attributes can be returned. Typically, a CGI script will create a
I<CWB::Web::Search> object for a single query.
=head1 ERRORS
If the I<CWB::Web::Search> module encounters an error condition, an error
message is printed on C<STDERR> and the program is terminated. A user-defined
error handler can be installed with the I<on_error()> method. In this case,
the error callback function is passed the error message generated by the module
as a list of strings.
=head1 CORPUS REGISTRY
If you need to use a registry other than the default corpus registry,
you should change the setting directly in the L<CWB::CL|CWB::CL> module.
use CWB::CL;
$CWB::CL::Registry = "/path/to/my/registry";
This will affect all new I<CWB::Web::Search> objects.
=head1 RESULT STRUCTURE
The search module's I<match()> method return a I<result struct> for
the n-th match of the last query executed. A CGI script will usually
iterate through all matches with a loop similar to this:
$nr_matches = $search->query(...);
for ($n = 0; $n < $nr_maches; $n++) {
$m = $search->match($n);
# code for processing match data in result struct $m
}
A I<result struct> $m has the following fields:
=over 4
=item $m->{'cpos'}
I<Corpus position> of the I<centre> of this match (the I<centre> is
computed from the positions of all search I<keywords> in a match).
=item $m->{'quality'}
An estimate of the I<relevance> of this match. This ranking is given as a
percentage with 100% corresponding to a "perfect match". The matches found
by the I<query()> method are sorted according to their 'quality' value.
=item $m->{'summary'}
A text segment from the corpus containing most of the <keywords> found
in this match (up to a reasonable maxium length). It is returned in
HTML format with the I<keywords> highlighted.
=item $m->{'context'}
The text segment from the corpus containing all <keywords> found in
this match, expanded according to the I<context()> setting. It is
returned in HTML format with the I<keywords> highlighted.
B<NB> The I<context> field is only included if the C<'context'> switch
was passed to the I<match()> method:
$m = $search->match($n, 'context');
See the remarks on I<virtual context> in the description of the
I<cull()> method below.
=item $m->{'data'}
The values of the structural attributes requested by the I<data()>
method are returned in the subfields of the 'data' field. A typical
CGI application will use the 'data' field to retrieve document URLs,
e.g.
$match_url = $m->{'data'}->{'url'};
where the search corpus contains regions like the following
<url http://www.ims.uni-stuttgart.de/> ... </url>
The values stored in the 'data' field are not HTML encoded.
=back
=head1 METHODS
=over 4
=item $search = new CWB::Web::Search $corpus;
Create I<CWB::Web::Search> object for WWW search queries on the
CWB corpus C<$corpus>.
=item @results = $search->query($key1, $key2, ... );
Searches corpus for the specified I<keywords> and returns a list
of matches sorted by (decreasing) relevance.
See L<"RESULT STRUCTURE"> for the format of the C<@results> list.
=back
=head1 COPYRIGHT
( run in 1.801 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )