Alt-CWB-ambs

 view release on metacpan or  search on metacpan

lib/CWB/CEQL.pm  view on Meta::CPAN

    }
  }
  elsif ($item =~ /^<.*>$/) {
    return $self->Call("xml_tag", $item);
  }
  elsif ($item =~ /^[*+]+$/) {
    return "[]?" if $item eq "*";  # special cases make CQP query more natural
    return "[]" if $item eq "+";
    my $n_plus = $item =~ tr/+/+/; # count number of + and * characters
    my $n_ast = $item =~ tr/*/*/;
    my $min_count = $n_plus;
    my $max_count = $n_plus + $n_ast;
    return "[]{$min_count,$max_count}";
  }
  else {
    return $self->Call("token_expression", $item);
  }
}

=item C<xml_tag>

A start or end tag matching the boundary of an s-attribute region.  The
C<xml_tag> rule only performs validation, in particularly ensuring that the
region name is listed as an allowed s-attribute in the parameter
C<s_attributes>, then passes the tag through to the CQP query.

=cut

sub xml_tag {
  my ($self, $tag) = @_;
  $tag =~ /^<\/?([^\/<>]+)>$/
    or die "syntax error in XML tag '' $tag ''\n";
  my $name = $1;
  my $is_valid_tag = $self->GetParam("s_attributes");
  if (ref($is_valid_tag) eq "HASH") {
    unless ($is_valid_tag->{$name}) {
      my @valid_tags = map {"<$_>"} sort keys %$is_valid_tag;
      die "invalid XML tag '' $tag '' (allowed tags: ''@valid_tags'')\n";
    }
  }
  else {
    die "XML tags are not allowed in this corpus\n";
  }
  return $tag;
}

=back


=head2 Proximity Query

=over 4

=item C<proximity_query>

A proximity query searches for combinations of words within a certain distance
of each other, specified either as a number of tokens (I<numeric distance>) or
as co-occurrence within an s-attribute region (I<structural distance>).  The
C<proximity_query> rule splits its input into a sequence of token patterns,
distance operators and parentheses used for grouping.  Shorthand notation for
word sequences is expanded (e.g. C<as long as> into C<<< as >>1>> long >>2>>
as >>>), and then the C<proximity_expression> rule is applied to each item in
turn.  A shift-reduce algorithm in C<proximity_expression> reduces the
resulting list into a single CQP query (using the undocumented "MU" notation).

=cut

sub proximity_query {
  my ($self, $input) = @_;
  $input =~ s/(?<!\\)([()])/\t$1\t/g; # separate parentheses and distance operators with TABs
  $input =~ s/(?<!\\)((<<|>>)[^<>\\ ]*(<<|>>))/\t$1\t/g;
  $input =~ s/^\s+//; $input =~ s/\s+$//; # strip leading/trailing whitespace
  my @items = split /\s*\t\s*/, $input; # split on TABs into proximity operators, parentheses, token expressions (removes extra whitespace)
  # pre-process shorthand notation for word sequences (such as "as long as")
  @items = map {
    if (/\s/) {
      my @shorthand = split " ";
      my @expanded =  ("(", $shorthand[0]);
      foreach my $i (1 .. $#shorthand) {
        push @expanded, ">>$i,$i>>", $shorthand[$i];
      }
      push @expanded, ")";
      @expanded;
    }
    else {
      $_; # single token expressions, distance operators and parentheses are passed through
    }
  } @items;
  # now apply proximity_expression rule to each item, which should eventually return a single term
  my @query = $self->Apply("proximity_expression", @items);
  die "incomplete proximity query: expected another term after distance operator\n"
    if @query == 2 and $query[1]->type eq "Op";
  confess "shift-reduce parsing with **proximity_expression** failed to return a single term"
    unless @query == 1 and $query[0]->type eq "Term"; # better safe than sorry ...
  return "MU$query[0]";
}

=item C<proximity_expression>

A proximity expression is either a token expression (delegated to
C<token_expression>), a distance operator (delegated to C<distance_operator>)
or a parenthesis for grouping subexpressions (handled directly).  At each
step, the current result list is examined to check whether the respective type
of proximity expression is valid here.  When 3 elements have been collected in
the result list (term, operator, term), they are reduced to a single term.
This ensures that the B<Apply> method in C<proximity_query> returns only a
single string containing the (almost) complete CQP query.

=cut

sub proximity_expression {
  my ($self, $item) = @_;
  my $result_list = $self->currentGroup;
  my $n_results = @$result_list; # current position in result list
  my $new_term = undef;
  # handle different types of proximity expressions
  if ($item eq "(") {
    die "cannot start subexpression at this point, expected distance operator\n"
      unless $n_results == 0 or $n_results == 2;
    $self->BeginGroup("(...)"); # named group makes error messages more meaningful
    return "";
  }
  elsif ($item eq ")") {
    my @subexp = $self->EndGroup("(...)");
    die "empty subexpression not allowed in proximity query\n"
      if @subexp == 0;
    die "incomplete subexpression in proximity query: expected another term after distance operator\n"
      if @subexp == 2 and $subexp[1]->type eq "Op";
    confess "shift-reduce parsing of subexpression in **proximity_expression** failed to return a single term"
      unless @subexp == 1 and $subexp[0]->type eq "Term"; # better safe than sorry ...
    $new_term = $subexp[0];
    $result_list = $self->currentGroup; # EndGroup() has moved back to the parent result list, so update local variables
    $n_results = @$result_list;
  }
  elsif ($item =~ /^(<<|>>).*(<<|>>)$/) {
    die "distance operator not allowed at this point, expected token expression or parenthesis\n"
      unless $n_results == 1;
    $new_term = $self->Call("distance_expression", $item);
  }
  elsif ($item =~ /^[*+]+$/) {
    die "optional/skipped tokens ''$item'' not allowed in proximity query\n";
  }
  else {



( run in 0.582 second using v1.01-cache-2.11-cpan-97f6503c9c8 )