Alt-CWB-ambs

 view release on metacpan or  search on metacpan

lib/CWB/CEQL.pm  view on Meta::CPAN

to the right).  Structural distances are always two-sided and specifies an
s-attribute region, in which both items must co-occur (e.g. C<<< <<s>> >>>).

=cut

sub distance_expression {
  my ($self, $op) = @_;
  $op =~ /^(<<|>>)(.+)(<<|>>)$/
    or die "syntax error in distance operator '' $op ''\n";
  my $type = "$1$3";
  my $distance = $2;
  die "invalid distance type ''>>..<<'' in distance operator '' $op ''\n"
    if $type eq ">><<";
  if ($distance =~ /^(?:([1-9][0-9]*),)?([1-9][0-9]*)$/) {
    # numeric distance
    my ($min, $max) = ($1, $2);
    die "maximum distance must be greater than or equal to minimum distance in '' $op ''\n"
      if $min and not $max >= $min;
    die "distance range ''$distance'' not allowed for two-sided distance '' $op ''\n"
      if $min and $type eq "<<>>";
    $min = 1 unless $min;
    if ($type eq "<<>>")    { return new CWB::CEQL::String "-$max $max", "Op" }
    elsif ($type eq "<<<<") { return new CWB::CEQL::String "-$max -$min", "Op" }
    elsif ($type eq ">>>>") { return new CWB::CEQL::String "$min $max", "Op" }
    else { confess "This can't happen." }
  }
  else {
    # structural distance
    my $is_valid_region = $self->GetParam("s_attributes") || {};
    if ($is_valid_region->{$distance}) {
      die "structural distance must be two-sided (''<<..>>'')\n"
        unless $type eq "<<>>";
      return new CWB::CEQL::String $distance, "Op";
    }
    else {
      my @valid_ops = map {"<<$_>>"} sort keys %$is_valid_region;
      die "'' $op '' is neither a numeric distance nor a valid structural distance (supported structures: ''@valid_ops'')\n";
    }
  }
}

=back


=head2 Token Expression

=over 4

=item C<token_expression>

Evaluate complete token expression with word form (or lemma) constraint and or
part-of-speech (or simple POS) constraint.  The two parts of the token
expression are passed on to C<word_or_lemma_constraint> and C<pos_constraint>,
respectively.  This rule returns a CQP token expression enclosed in square
brackets.

=cut

sub token_expression {
  my ($self, $input) = @_;
  my @parts = split /(?<!\\)_/, $input; # split input on unescaped underscores
  die "only a single ''_'' separator allowed between word form and POS constraint (use ''\\_'' to match literal underscore)\n"
    if @parts > 2;
  my ($word, $pos) = @parts;
  $word = "" unless defined $word;
  $pos = "" unless defined $pos;
  my ($cqp_word, $cqp_pos) = (undef, undef);
  if ($word ne "" and           # optimise *_ITJ to _ITJ (to avoid word form constraint matching all words)
      not ($word =~ /^[+*]$/ and $pos ne "")) {
    $cqp_word = $self->Call("word_or_lemma_constraint", $word);
  }
  if ($pos ne "") {
    $cqp_pos = $self->Call("pos_constraint", $pos);
  }

  if (defined $cqp_word and defined $cqp_pos) {
    return "[$cqp_word \& $cqp_pos]";
  }
  elsif (defined $cqp_word) {
    return "[$cqp_word]";
  }
  elsif (defined $cqp_pos) {
    return "[$cqp_pos]";
  }
  else {
    die "neither word form nor part-of-speech constraint in token expression '' $input ''\n";
  }
}

=back

=head2 Word Form / Lemma

=over 4

=item C<word_or_lemma_constraint>

Evaluate complete word form or lemma constraint, including case/diacritics
flags, and return suitable CQP code to be included in a token expression

=cut

sub word_or_lemma_constraint {
  my ($self, $input) = @_;
  my $ignore_case = ($self->GetParam("default_ignore_case")) ? 1 : 0;
  my $ignore_diac = ($self->GetParam("default_ignore_diac")) ? 1 : 0;
  if ($input =~ s/(?<!\\):([A-Za-z]+)$//) {
    my $flags = $1;
    foreach my $flag (split //, $flags) {
      if ($flag eq "c")    { $ignore_case = 1 }
      elsif ($flag eq "C") { $ignore_case = 0 }
      elsif ($flag eq "d") { $ignore_diac = 1 }
      elsif ($flag eq "D") { $ignore_diac = 0 }
      else { die "invalid flag ''$flag'' in modifier '':$flags''\n" }
    }
  }
  my $cqp_code = $self->Call("word_or_lemma", $input);
  if ($ignore_case or $ignore_diac) {
    $cqp_code .= '%';
    $cqp_code .= "c" if $ignore_case;
    $cqp_code .= "d" if $ignore_diac;

lib/CWB/CEQL.pm  view on Meta::CPAN

C<pos_attribute>).

=cut

sub pos_tag {
  my ($self, $tag) = @_;
  my $attr = $self->GetParam("pos_attribute")
    or die "no attribute defined for part-of-speech tags (internal error)\n";
  my $regexp = $self->Call("wildcard_pattern", $tag);
  return "$attr=$regexp";
}

=item C<simple_pos>

Translate simple part-of-speech tag into CQP constraint.  The specified tag is
looked up in the hash provided by the C<simple_pos> parameter, and replaced by
the regular expression listed there.  If the tag cannot be found, or if no simple
tags have been defined, a helpful error message is generated.

=cut

sub simple_pos {
  my ($self, $tag) = @_;
  my $attr = $self->GetParam("simple_pos_attribute") || $self->GetParam("pos_attribute")
    or die "no attribute defined for part-of-speech tags (internal error)\n";
  my $lookup = $self->GetParam("simple_pos");
  die "no simple part-of-speech tags are available for this corpus\n"
    unless ref($lookup) eq "HASH";
  my $regexp = $lookup->{$tag};
  if (not defined $regexp) {
    my @valid_tags = sort keys %$lookup;
    die "'' $tag '' is not a valid simple part-of-speech tag (available tags: '' @valid_tags '')\n";
  }
  return "$attr=\"$regexp\"";
}

=back


=head2 Wildcard Patterns

=over 4

=item C<wildcard_pattern>

Translate string containing wildcards into regular expression, which is
enclosed in double quotes so it can directly be interpolated into a CQP query.

Internally, the input string is split into wildcards and literal substrings,
which are then processed one item at a time with the C<wildcard_item>
rule.

=cut

sub wildcard_pattern {
  my ($self, $input) = @_;
  die "literal backslash ''\\\\'' is not allowed in wildcard pattern '' $input '')\n"
    if $input =~ /\\\\/; # / (temporary workaround: TextMate is confused by the regexp)
  die "wildcard pattern must not end in a backslash ('' $input '')\n"
    if $input =~ /\\$/;
  ## add whitespace around (unescaped) wildcard metacharacters
  $input =~ s/(?<!\\)([?*+\[,\]])/ $1 /g;
  $input =~ s/(\\[aAlLuUdDwW])/ $1 /g;
  ## trim whitespace, then split wildcard pattern on whitespace into items
  $input =~ s/^\s+//;
  $input =~ s/\s+$//;
  my @items = split " ", $input;
  die "empty wildcard pattern '' $_[1] '' is not allowed\n"
    unless @items > 0;
  my @regexp_comps = $self->Apply("wildcard_item", @items);
  return '"'.join("", @regexp_comps).'"';
}

=item C<wildcard_item>

Process an item of a wildcard pattern, which is either some metacharacter
(handled directly) or a literal substring (delegated to the C<literal_string>
rule).  Proper nesting of alternatives is ensured using the shift-reduce
parsing mechanism (with B<BeginGroup> and B<EndGroup> calls).

=cut

## internal lookup table for wildcard substitutions
our %_wildcard_table = (
                        "?" => ".",
                        "*" => ".*",
                        "+" => ".+",
                        "\\a" => "[A-Za-z]",
                        "\\A" => "[A-Za-z]+",
                        "\\l" => "[a-z]",
                        "\\L" => "[a-z]+",
                        "\\u" => "[A-Z]",
                        "\\U" => "[A-Z]+",
                        "\\d" => "[0-9]",
                        "\\D" => "[0-9]+",
                        "\\w" => "[A-Za-z0-9'-]",
                        "\\W" => "[A-Za-z0-9'-]+",
                      );

sub wildcard_item {
  my ($self, $item) = @_;
  if (exists $_wildcard_table{$item}) {
    return $_wildcard_table{$item};
  }
  elsif ($item eq "[") {
    $self->BeginGroup("[...]"); # group names make error messages more meaningful
    return "";
  }
  elsif ($item eq ",") {
    die "alternatives separator ('','') may only be used within brackets ''[ .. ]''\n"
      unless $self->NestingLevel > 0;
    return "|";
  }
  elsif ($item eq "]") {
    my @parts = $self->EndGroup("[...]");
    my ($has_empty_alternative, @filtered_parts) = $self->_remove_empty_alternatives(@parts);
    die "empty list of alternatives not allowed in wildcard pattern\n"
      unless @filtered_parts > 0;
    my $group = "(".join("", @filtered_parts).")";
    return(($has_empty_alternative) ? "$group?" : $group);
  }
  else {
    return $self->Call("literal_string", $item);
  }
}

=item C<literal_string>

Translate literal string into regular expression, escaping all metacharacters
with backslashes (backslashes in the input string are removed first).

Note that escaping of C<^> and C<"> isn't fully reliable because CQP might
interpret the resulting escape sequences as latex-style accents if they are
followed by certain letters.  Future versions of CQP should provide a safer
escaping mechanism and/or allow interpretation of latex-style accents to be
turned off.

=cut

sub literal_string {
  my ($self, $input) = @_;
  $input =~ s/\\//g; # remove backslashes (used to escape CEQL metacharacters)
  $input =~ s/([.?*+|(){}\[\]"\^\$])/\\$1/g; # " needs to be escaped because CQP regexp will be enclosed in double quotes
  return $input;
}

=back


=head2 Internal Subroutines

=over 4

=item (I<$has_empty_alt>, I<@tokens>) = I<$self>->B<_remove_empty_alternatives>(I<@tokens>);

This internal method identifies and removes empty alternatives from a
tokenised group of alternatives (I<@tokens>), with alternatives separated by
C<|> tokens.  In particular, leading an trailing separator tokens are removed,
and multiple consecutive separators are collapsed to a single C<|>.  The first
return value (I<$has_empty_alt>) indicates whether one or more empty
alternatives were found; it is followed by the sanitised list of tokens.

=cut

sub _remove_empty_alternatives {
  my ($self, @tokens) = @_;
  my $after_separator = 1;    # when this is TRUE, a "|" token introduces an empty alternative
  my $has_empty_alternative = 0;
  my @filtered_tokens = ();
    while (@tokens) {
      my $t = shift @tokens;
      my $keep = 1;
      if ($t eq "|") {
        # a trailing "|" token also introduces an empty alternative (checked here)
        if ($after_separator or @tokens == 0) {
          $has_empty_alternative = 1;
          $keep = 0;
        }
        $after_separator = 1;
      }
      else {
        $after_separator = 0;
      }
      push @filtered_tokens, $t
        if $keep;
    }
  return $has_empty_alternative, @filtered_tokens;
}

=back


=head1 COPYRIGHT

Copyright (C) 1999-2010 Stefan Evert [http::/purl.org/stefan.evert]

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut

1;



( run in 1.077 second using v1.01-cache-2.11-cpan-524268b4103 )