Alt-CWB-ambs
view release on metacpan or search on metacpan
lib/CWB/CEQL.pm view on Meta::CPAN
}
}
elsif ($item =~ /^<.*>$/) {
return $self->Call("xml_tag", $item);
}
elsif ($item =~ /^[*+]+$/) {
return "[]?" if $item eq "*"; # special cases make CQP query more natural
return "[]" if $item eq "+";
my $n_plus = $item =~ tr/+/+/; # count number of + and * characters
my $n_ast = $item =~ tr/*/*/;
my $min_count = $n_plus;
my $max_count = $n_plus + $n_ast;
return "[]{$min_count,$max_count}";
}
else {
return $self->Call("token_expression", $item);
}
}
=item C<xml_tag>
A start or end tag matching the boundary of an s-attribute region. The
C<xml_tag> rule only performs validation, in particularly ensuring that the
region name is listed as an allowed s-attribute in the parameter
C<s_attributes>, then passes the tag through to the CQP query.
=cut
sub xml_tag {
my ($self, $tag) = @_;
$tag =~ /^<\/?([^\/<>]+)>$/
or die "syntax error in XML tag '' $tag ''\n";
my $name = $1;
my $is_valid_tag = $self->GetParam("s_attributes");
if (ref($is_valid_tag) eq "HASH") {
unless ($is_valid_tag->{$name}) {
my @valid_tags = map {"<$_>"} sort keys %$is_valid_tag;
die "invalid XML tag '' $tag '' (allowed tags: ''@valid_tags'')\n";
}
}
else {
die "XML tags are not allowed in this corpus\n";
}
return $tag;
}
=back
=head2 Proximity Query
=over 4
=item C<proximity_query>
A proximity query searches for combinations of words within a certain distance
of each other, specified either as a number of tokens (I<numeric distance>) or
as co-occurrence within an s-attribute region (I<structural distance>). The
C<proximity_query> rule splits its input into a sequence of token patterns,
distance operators and parentheses used for grouping. Shorthand notation for
word sequences is expanded (e.g. C<as long as> into C<<< as >>1>> long >>2>>
as >>>), and then the C<proximity_expression> rule is applied to each item in
turn. A shift-reduce algorithm in C<proximity_expression> reduces the
resulting list into a single CQP query (using the undocumented "MU" notation).
=cut
sub proximity_query {
my ($self, $input) = @_;
$input =~ s/(?<!\\)([()])/\t$1\t/g; # separate parentheses and distance operators with TABs
$input =~ s/(?<!\\)((<<|>>)[^<>\\ ]*(<<|>>))/\t$1\t/g;
$input =~ s/^\s+//; $input =~ s/\s+$//; # strip leading/trailing whitespace
my @items = split /\s*\t\s*/, $input; # split on TABs into proximity operators, parentheses, token expressions (removes extra whitespace)
# pre-process shorthand notation for word sequences (such as "as long as")
@items = map {
if (/\s/) {
my @shorthand = split " ";
my @expanded = ("(", $shorthand[0]);
foreach my $i (1 .. $#shorthand) {
push @expanded, ">>$i,$i>>", $shorthand[$i];
}
push @expanded, ")";
@expanded;
}
else {
$_; # single token expressions, distance operators and parentheses are passed through
}
} @items;
# now apply proximity_expression rule to each item, which should eventually return a single term
my @query = $self->Apply("proximity_expression", @items);
die "incomplete proximity query: expected another term after distance operator\n"
if @query == 2 and $query[1]->type eq "Op";
confess "shift-reduce parsing with **proximity_expression** failed to return a single term"
unless @query == 1 and $query[0]->type eq "Term"; # better safe than sorry ...
return "MU$query[0]";
}
=item C<proximity_expression>
A proximity expression is either a token expression (delegated to
C<token_expression>), a distance operator (delegated to C<distance_operator>)
or a parenthesis for grouping subexpressions (handled directly). At each
step, the current result list is examined to check whether the respective type
of proximity expression is valid here. When 3 elements have been collected in
the result list (term, operator, term), they are reduced to a single term.
This ensures that the B<Apply> method in C<proximity_query> returns only a
single string containing the (almost) complete CQP query.
=cut
sub proximity_expression {
my ($self, $item) = @_;
my $result_list = $self->currentGroup;
my $n_results = @$result_list; # current position in result list
my $new_term = undef;
# handle different types of proximity expressions
if ($item eq "(") {
die "cannot start subexpression at this point, expected distance operator\n"
unless $n_results == 0 or $n_results == 2;
$self->BeginGroup("(...)"); # named group makes error messages more meaningful
return "";
}
elsif ($item eq ")") {
my @subexp = $self->EndGroup("(...)");
die "empty subexpression not allowed in proximity query\n"
if @subexp == 0;
die "incomplete subexpression in proximity query: expected another term after distance operator\n"
if @subexp == 2 and $subexp[1]->type eq "Op";
confess "shift-reduce parsing of subexpression in **proximity_expression** failed to return a single term"
unless @subexp == 1 and $subexp[0]->type eq "Term"; # better safe than sorry ...
$new_term = $subexp[0];
$result_list = $self->currentGroup; # EndGroup() has moved back to the parent result list, so update local variables
$n_results = @$result_list;
}
elsif ($item =~ /^(<<|>>).*(<<|>>)$/) {
die "distance operator not allowed at this point, expected token expression or parenthesis\n"
unless $n_results == 1;
$new_term = $self->Call("distance_expression", $item);
}
elsif ($item =~ /^[*+]+$/) {
die "optional/skipped tokens ''$item'' not allowed in proximity query\n";
}
else {
( run in 0.582 second using v1.01-cache-2.11-cpan-97f6503c9c8 )