CWB-Web

 view release on metacpan or  search on metacpan

lib/CWB/Web/Search.pm  view on Meta::CPAN

  my $word_att = shift;
  my $start_tag = shift;
  my $end_tag = shift;
  my $maximal_length = shift;
  $maximal_length = 160 unless defined $maximal_length;
  $maximal_length = int ($maximal_length / 2) + 1; # summary is symmetric to center of match
  my $match = $self->[$n];
  my $length = int(sqrt($match->{'variance'})*2 + 15); # default summary extends 2 * s.d. + 15 tokens to each side from center
  $length = $maximal_length if $length > $maximal_length;
  $length = 5 if $length < 5;   # minimum size is 11 words
  my @words = ();               # summary as word list
  # summary must be contained in context, i.e. use this matches 'first' and 'last' as hard boundaries
  my $first = $match->{'center'} - $length;
  my $pre_ell = "... ";         # ellipsis before summary if not start of document
  if ($first < $match->{'first'}) {
    $first = $match->{'first'};
    $pre_ell = "";
  }
  my $last = $match->{'center'} + $length;
  my $post_ell = " ...";        # ellipsis after summary if not end of document
  if ($last > $match->{'last'}) {
    $last = $match->{'last'};
    $post_ell = "";
  }

  foreach my $cpos ($first .. $last) {
    my $token = encode_entities($word_att->cpos2str($cpos));
    $token = $start_tag.$token.$end_tag
      if (defined $match->{'keyword'}->{$cpos});
    push @words, $token;
  }
  return $pre_ell."@words".$post_ell;
}

# (re-)compute quality values of matches
# Usage: $ml->compute_quality($no_of_keywords [, @preferred ]);
sub compute_quality {
  my $self = shift;
  my $no_of_keywords = shift;
  my @preferred = @_;           # list of s-attribute handles for preferred regions -> higher quality score
  # each keyword present in the match scores 1 point; proximity of the
  # keywords scores up to $proximity_points points (scaled by standard deviation)
  my $proximity_points =                       # we'll probably want to make that configurable
    ($no_of_keywords > 2) ? 2 : 1;
  my $preferred_points = (@preferred) ?        # ... and that as well
    (($no_of_keywords > 3) ? 2 : 1)   : 0; 
  my $total_points = ($no_of_keywords - 1) + $proximity_points + $preferred_points;
  
  foreach my $match (@$self) {
    my $variance = $match->{'variance'};
    $variance = 4 if $variance < 4; # scale proximity score to 0 .. $proximity_points range
    my $sd = sqrt($variance/4);
    my $cnt_preferred = 0; # a match gets higher score, if one or more of its keywords fall into preferred environments
    foreach my $keyword (keys %{$match->{'keyword'}}) {
      $cnt_preferred++ 
        if grep { defined $_->cpos2struc($keyword) } @preferred;
    }
    my $quality = 
      (($match->{'n'} - 1) + ($proximity_points / $sd) + ($preferred_points * $cnt_preferred / $no_of_keywords)) 
       / $total_points;
    $match->{'quality'} = int ($quality*100 + 0.5); # rounded percentage
  }
}

# sort matchlist by one the fields
#   'first'  ...  sort by first position in match
#   'quality' ... sort by quality values (make sure you've called compute_quality() beforehand!)
sub sort {
  my $self = shift;
  my $method = shift;
  my %sort = (
              'first' => sub {return sort {$a->{'first'} <=> $b->{'first'}} @_},
              'quality' => sub {return sort {$b->{'quality'} <=> $a->{'quality'}} @_},
              );
  croak "Usage:  \$ml->sort('".join("' | '", keys %sort)."');"
    unless defined $sort{$method};
  @$self = &{$sort{$method}}(@$self);
}

# remove overlapping matches ("cull" the one with highest quality rating)
# NB don't forget to $ml->compute_quality; $ml->sort('first') before!
sub cull {
  my $self = shift;
 I_LOOP:
  for (my $i = 0; $i < @$self; $i++) {
    my $i_first = $self->[$i]->{'first'};
    my $i_last = $self->[$i]->{'last'};
  J_LOOP:
    for (my $j = $i+1; $j < @$self; $j++) {
      last unless $j < @$self;  # check again for 'redo'
      my $j_first = $self->[$j]->{'first'};
      my $j_last = $self->[$j]->{'last'};
      # compare i-th and j-th element of match list
      last J_LOOP # assume ml is ordered by 'first' -> no more overlaps possible after this point
        if $j_first > $i_last; 
      if ($i_last >= $j_first) {
        if ($self->[$i]->{'quality'} >= $self->[$j]->{'quality'}) {
          # i-th match is better -> keep it, delete j-th match
          splice @$self, $j, 1;
          redo J_LOOP;          # must re-consider j-th match
        }
        else {
          # j-th match is better -> delete i-th match and restart outer loop
          splice @$self, $i, 1;
          redo I_LOOP;
        }
      }
    }
  }
  # meself had better be culled nicely now ...
}


# **
# cleanup: to be done
# find overlapping matches & select the one with higher quality
# **


# expand to desired context setting
# template for the context_expansion_func() function:

lib/CWB/Web/Search.pm  view on Meta::CPAN

    $m->{'data'}->{'date'};
    if ($want_context) {
      $m = $search->match($i, 'context');
      $m->{'context'};          # match with context (HTML encoded)
    }
  }

  undef $search;

=head1 DESCRIPTION

The I<CWB::Web::Search> module executes simple queries similar to
commercial Web search engines on CWB-encoded corpora. The I<query()> method
returns I<keywords> found in the corpus with the requested amount of
context in HTML format. Additionally, data stored in structural
attributes can be returned. Typically, a CGI script will create a
I<CWB::Web::Search> object for a single query.

=head1 ERRORS

If the I<CWB::Web::Search> module encounters an error condition, an error
message is printed on C<STDERR> and the program is terminated. A user-defined
error handler can be installed with the I<on_error()> method. In this case,
the error callback function is passed the error message generated by the module
as a list of strings.

=head1 CORPUS REGISTRY

If you need to use a registry other than the default corpus registry,
you should change the setting directly in the L<CWB::CL|CWB::CL> module.

  use CWB::CL;
  $CWB::CL::Registry = "/path/to/my/registry";

This will affect all new I<CWB::Web::Search> objects.

=head1 RESULT STRUCTURE

The search module's I<match()> method return a I<result struct> for
the n-th match of the last query executed. A CGI script will usually
iterate through all matches with a loop similar to this:

    $nr_matches = $search->query(...);
    for ($n = 0; $n < $nr_maches; $n++) {
      $m = $search->match($n);
      # code for processing match data in result struct $m 
    }

A I<result struct> $m has the following fields:

=over 4

=item $m->{'cpos'}

I<Corpus  position> of the I<centre>  of  this match (the I<centre> is
computed from the positions of all search I<keywords> in a match).

=item $m->{'quality'}

An estimate of the I<relevance> of this match. This ranking is given as a
percentage with 100% corresponding to a "perfect match". The matches found
by the I<query()> method are sorted according to their 'quality' value. 

=item $m->{'summary'}

A text segment from the corpus containing most of the <keywords> found
in this match (up to a reasonable maxium length). It is returned in
HTML format with the I<keywords> highlighted.

=item $m->{'context'}

The text segment from the corpus containing all <keywords> found in
this match, expanded according to the I<context()> setting. It is
returned in HTML format with the I<keywords> highlighted.

B<NB> The I<context> field is only included if the C<'context'> switch
was passed to the I<match()> method:

    $m = $search->match($n, 'context');

See the remarks on I<virtual context> in the description of the
I<cull()> method below.

=item $m->{'data'}

The values of the structural attributes requested by the I<data()> 
method are returned in the subfields of the 'data' field. A typical
CGI application will use the 'data' field to retrieve document URLs,
e.g.

    $match_url = $m->{'data'}->{'url'};

where the search corpus contains regions like the following

    <url http://www.ims.uni-stuttgart.de/> ... </url>

The values stored in the 'data' field are not HTML encoded.

=back


=head1 METHODS

=over 4

=item $search = new CWB::Web::Search $corpus;

Create I<CWB::Web::Search> object for WWW search queries on the
CWB corpus C<$corpus>.

=item @results = $search->query($key1, $key2, ... );

Searches corpus for the specified I<keywords> and returns a list
of matches sorted by (decreasing) relevance. 
 
See L<"RESULT STRUCTURE"> for the format of the C<@results> list. 

=back


=head1 COPYRIGHT



( run in 1.801 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )