Alt-CWB-ambs
view release on metacpan or search on metacpan
lib/CWB/CEQL.pm view on Meta::CPAN
to the right). Structural distances are always two-sided and specifies an
s-attribute region, in which both items must co-occur (e.g. C<<< <<s>> >>>).
=cut
sub distance_expression {
my ($self, $op) = @_;
$op =~ /^(<<|>>)(.+)(<<|>>)$/
or die "syntax error in distance operator '' $op ''\n";
my $type = "$1$3";
my $distance = $2;
die "invalid distance type ''>>..<<'' in distance operator '' $op ''\n"
if $type eq ">><<";
if ($distance =~ /^(?:([1-9][0-9]*),)?([1-9][0-9]*)$/) {
# numeric distance
my ($min, $max) = ($1, $2);
die "maximum distance must be greater than or equal to minimum distance in '' $op ''\n"
if $min and not $max >= $min;
die "distance range ''$distance'' not allowed for two-sided distance '' $op ''\n"
if $min and $type eq "<<>>";
$min = 1 unless $min;
if ($type eq "<<>>") { return new CWB::CEQL::String "-$max $max", "Op" }
elsif ($type eq "<<<<") { return new CWB::CEQL::String "-$max -$min", "Op" }
elsif ($type eq ">>>>") { return new CWB::CEQL::String "$min $max", "Op" }
else { confess "This can't happen." }
}
else {
# structural distance
my $is_valid_region = $self->GetParam("s_attributes") || {};
if ($is_valid_region->{$distance}) {
die "structural distance must be two-sided (''<<..>>'')\n"
unless $type eq "<<>>";
return new CWB::CEQL::String $distance, "Op";
}
else {
my @valid_ops = map {"<<$_>>"} sort keys %$is_valid_region;
die "'' $op '' is neither a numeric distance nor a valid structural distance (supported structures: ''@valid_ops'')\n";
}
}
}
=back
=head2 Token Expression
=over 4
=item C<token_expression>
Evaluate complete token expression with word form (or lemma) constraint and or
part-of-speech (or simple POS) constraint. The two parts of the token
expression are passed on to C<word_or_lemma_constraint> and C<pos_constraint>,
respectively. This rule returns a CQP token expression enclosed in square
brackets.
=cut
sub token_expression {
my ($self, $input) = @_;
my @parts = split /(?<!\\)_/, $input; # split input on unescaped underscores
die "only a single ''_'' separator allowed between word form and POS constraint (use ''\\_'' to match literal underscore)\n"
if @parts > 2;
my ($word, $pos) = @parts;
$word = "" unless defined $word;
$pos = "" unless defined $pos;
my ($cqp_word, $cqp_pos) = (undef, undef);
if ($word ne "" and # optimise *_ITJ to _ITJ (to avoid word form constraint matching all words)
not ($word =~ /^[+*]$/ and $pos ne "")) {
$cqp_word = $self->Call("word_or_lemma_constraint", $word);
}
if ($pos ne "") {
$cqp_pos = $self->Call("pos_constraint", $pos);
}
if (defined $cqp_word and defined $cqp_pos) {
return "[$cqp_word \& $cqp_pos]";
}
elsif (defined $cqp_word) {
return "[$cqp_word]";
}
elsif (defined $cqp_pos) {
return "[$cqp_pos]";
}
else {
die "neither word form nor part-of-speech constraint in token expression '' $input ''\n";
}
}
=back
=head2 Word Form / Lemma
=over 4
=item C<word_or_lemma_constraint>
Evaluate complete word form or lemma constraint, including case/diacritics
flags, and return suitable CQP code to be included in a token expression
=cut
sub word_or_lemma_constraint {
my ($self, $input) = @_;
my $ignore_case = ($self->GetParam("default_ignore_case")) ? 1 : 0;
my $ignore_diac = ($self->GetParam("default_ignore_diac")) ? 1 : 0;
if ($input =~ s/(?<!\\):([A-Za-z]+)$//) {
my $flags = $1;
foreach my $flag (split //, $flags) {
if ($flag eq "c") { $ignore_case = 1 }
elsif ($flag eq "C") { $ignore_case = 0 }
elsif ($flag eq "d") { $ignore_diac = 1 }
elsif ($flag eq "D") { $ignore_diac = 0 }
else { die "invalid flag ''$flag'' in modifier '':$flags''\n" }
}
}
my $cqp_code = $self->Call("word_or_lemma", $input);
if ($ignore_case or $ignore_diac) {
$cqp_code .= '%';
$cqp_code .= "c" if $ignore_case;
$cqp_code .= "d" if $ignore_diac;
lib/CWB/CEQL.pm view on Meta::CPAN
C<pos_attribute>).
=cut
sub pos_tag {
my ($self, $tag) = @_;
my $attr = $self->GetParam("pos_attribute")
or die "no attribute defined for part-of-speech tags (internal error)\n";
my $regexp = $self->Call("wildcard_pattern", $tag);
return "$attr=$regexp";
}
=item C<simple_pos>
Translate simple part-of-speech tag into CQP constraint. The specified tag is
looked up in the hash provided by the C<simple_pos> parameter, and replaced by
the regular expression listed there. If the tag cannot be found, or if no simple
tags have been defined, a helpful error message is generated.
=cut
sub simple_pos {
my ($self, $tag) = @_;
my $attr = $self->GetParam("simple_pos_attribute") || $self->GetParam("pos_attribute")
or die "no attribute defined for part-of-speech tags (internal error)\n";
my $lookup = $self->GetParam("simple_pos");
die "no simple part-of-speech tags are available for this corpus\n"
unless ref($lookup) eq "HASH";
my $regexp = $lookup->{$tag};
if (not defined $regexp) {
my @valid_tags = sort keys %$lookup;
die "'' $tag '' is not a valid simple part-of-speech tag (available tags: '' @valid_tags '')\n";
}
return "$attr=\"$regexp\"";
}
=back
=head2 Wildcard Patterns
=over 4
=item C<wildcard_pattern>
Translate string containing wildcards into regular expression, which is
enclosed in double quotes so it can directly be interpolated into a CQP query.
Internally, the input string is split into wildcards and literal substrings,
which are then processed one item at a time with the C<wildcard_item>
rule.
=cut
sub wildcard_pattern {
my ($self, $input) = @_;
die "literal backslash ''\\\\'' is not allowed in wildcard pattern '' $input '')\n"
if $input =~ /\\\\/; # / (temporary workaround: TextMate is confused by the regexp)
die "wildcard pattern must not end in a backslash ('' $input '')\n"
if $input =~ /\\$/;
## add whitespace around (unescaped) wildcard metacharacters
$input =~ s/(?<!\\)([?*+\[,\]])/ $1 /g;
$input =~ s/(\\[aAlLuUdDwW])/ $1 /g;
## trim whitespace, then split wildcard pattern on whitespace into items
$input =~ s/^\s+//;
$input =~ s/\s+$//;
my @items = split " ", $input;
die "empty wildcard pattern '' $_[1] '' is not allowed\n"
unless @items > 0;
my @regexp_comps = $self->Apply("wildcard_item", @items);
return '"'.join("", @regexp_comps).'"';
}
=item C<wildcard_item>
Process an item of a wildcard pattern, which is either some metacharacter
(handled directly) or a literal substring (delegated to the C<literal_string>
rule). Proper nesting of alternatives is ensured using the shift-reduce
parsing mechanism (with B<BeginGroup> and B<EndGroup> calls).
=cut
## internal lookup table for wildcard substitutions
our %_wildcard_table = (
"?" => ".",
"*" => ".*",
"+" => ".+",
"\\a" => "[A-Za-z]",
"\\A" => "[A-Za-z]+",
"\\l" => "[a-z]",
"\\L" => "[a-z]+",
"\\u" => "[A-Z]",
"\\U" => "[A-Z]+",
"\\d" => "[0-9]",
"\\D" => "[0-9]+",
"\\w" => "[A-Za-z0-9'-]",
"\\W" => "[A-Za-z0-9'-]+",
);
sub wildcard_item {
my ($self, $item) = @_;
if (exists $_wildcard_table{$item}) {
return $_wildcard_table{$item};
}
elsif ($item eq "[") {
$self->BeginGroup("[...]"); # group names make error messages more meaningful
return "";
}
elsif ($item eq ",") {
die "alternatives separator ('','') may only be used within brackets ''[ .. ]''\n"
unless $self->NestingLevel > 0;
return "|";
}
elsif ($item eq "]") {
my @parts = $self->EndGroup("[...]");
my ($has_empty_alternative, @filtered_parts) = $self->_remove_empty_alternatives(@parts);
die "empty list of alternatives not allowed in wildcard pattern\n"
unless @filtered_parts > 0;
my $group = "(".join("", @filtered_parts).")";
return(($has_empty_alternative) ? "$group?" : $group);
}
else {
return $self->Call("literal_string", $item);
}
}
=item C<literal_string>
Translate literal string into regular expression, escaping all metacharacters
with backslashes (backslashes in the input string are removed first).
Note that escaping of C<^> and C<"> isn't fully reliable because CQP might
interpret the resulting escape sequences as latex-style accents if they are
followed by certain letters. Future versions of CQP should provide a safer
escaping mechanism and/or allow interpretation of latex-style accents to be
turned off.
=cut
sub literal_string {
my ($self, $input) = @_;
$input =~ s/\\//g; # remove backslashes (used to escape CEQL metacharacters)
$input =~ s/([.?*+|(){}\[\]"\^\$])/\\$1/g; # " needs to be escaped because CQP regexp will be enclosed in double quotes
return $input;
}
=back
=head2 Internal Subroutines
=over 4
=item (I<$has_empty_alt>, I<@tokens>) = I<$self>->B<_remove_empty_alternatives>(I<@tokens>);
This internal method identifies and removes empty alternatives from a
tokenised group of alternatives (I<@tokens>), with alternatives separated by
C<|> tokens. In particular, leading an trailing separator tokens are removed,
and multiple consecutive separators are collapsed to a single C<|>. The first
return value (I<$has_empty_alt>) indicates whether one or more empty
alternatives were found; it is followed by the sanitised list of tokens.
=cut
sub _remove_empty_alternatives {
my ($self, @tokens) = @_;
my $after_separator = 1; # when this is TRUE, a "|" token introduces an empty alternative
my $has_empty_alternative = 0;
my @filtered_tokens = ();
while (@tokens) {
my $t = shift @tokens;
my $keep = 1;
if ($t eq "|") {
# a trailing "|" token also introduces an empty alternative (checked here)
if ($after_separator or @tokens == 0) {
$has_empty_alternative = 1;
$keep = 0;
}
$after_separator = 1;
}
else {
$after_separator = 0;
}
push @filtered_tokens, $t
if $keep;
}
return $has_empty_alternative, @filtered_tokens;
}
=back
=head1 COPYRIGHT
Copyright (C) 1999-2010 Stefan Evert [http::/purl.org/stefan.evert]
This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.
=cut
1;
( run in 1.077 second using v1.01-cache-2.11-cpan-524268b4103 )