Alt-CWB-ambs

 view release on metacpan or  search on metacpan

t/44_ceql_bncweb.t  view on Meta::CPAN


## test whether syntax errors are recognised by the CEQL parser
our @ErrorTests = split /\n/, $ErrorTests;
while (@ErrorTests) {
  my $query = shift @ErrorTests;
  my $regexp = shift @ErrorTests;
  if (@ErrorTests) {
    my $blank = shift @ErrorTests;
    die "Shucks! Syntax error in list of error tests (expected blank line)."
      unless $blank =~ /^\s*$/;
  }
  my $msg = "find syntax error in ``$query''";
  my $result = $CEQL->Parse($query);
  if (defined $result) {
    fail($msg);
  }
  else {
    like($CEQL->HtmlErrorMessage, $regexp, $msg);
  }
}


########## BEGIN 'BNCweb::CEQL' grammar

package BNCweb::CEQL;
use base 'CWB::CEQL';

use Encode;
use HTML::Entities; # real BNCweb implementation uses this module to decode/encode HTML entities

# constructor: set up attribute names and define simplified POS tags
sub new {
  my $class = shift;
  my $self = new CWB::CEQL;
  $self->SetParam("lemma_attribute", "hw"); # corresponds to lemma in standard CEQL grammar
  my $table = { # define lookup table for simple POS tags (refer to class attribute)
               "A" => "ADJ",
               "ADJ" => "ADJ",
               "N" => "SUBST",
               "SUBST" => "SUBST",
               "V" => "VERB",
               "VERB" => "VERB",
               "ADV" => "ADV",
               "ART" => "ART",
               "CONJ" => "CONJ",
               "INT" => "INTERJ",
               "INTERJ" => "INTERJ",
               "PREP" => "PREP",
               "PRON" => "PRON",
               '$' => "STOP",
               "STOP" => "STOP",
               "UNC" => "UNC",
              };
  $self->SetParam("simple_pos", $table);
  $self->SetParam("simple_pos_attribute", "class");
  my %xml_tags = map { $_ => 1 } # list of s-attribute regions in the BNC version used by BNCweb
    (qw(text u div head quote sp speaker stage lg l list label item note bibl corr hi trunc p s mw), # from CWB registry file
     # nested attributes are accepted, but should perhaps better be inserted automagically
     qw(div1 div2 div3 quote1 list1 list2 item1 item2 hi1 p1 p2));
  $self->SetParam("s_attributes", \%xml_tags);
  return bless($self, $class);
}

# BNCweb::CEQL expects its input to be in the canonical BNCweb encoding, i.e. Latin-1 + HTML entities;
# the "default" rule first converts the input to a Perl Unicode string, and then re-encodes the resulting CQP query in Latin-1
sub default {
  my ($self, $input) = @_;
  my $unicode = decode("iso-8859-1", $input);
  ##-- # the real implementation uses the HTML::Entities module to decode HTML entities
  ##-- decode_entities($unicode);
  # here, dummy rules covering all entities in the test suite help us to avoid a dependency on the non-standard HTML::Entities module
  $unicode =~ s/\é/\x{E9}/g;
  $unicode =~ s/\à/\x{E0}/g;
  $unicode =~ s/\£/\x{A3}/g;
  $unicode =~ s/\α/\x{03B1}/g;
  $unicode =~ s/\♥/\x{2665}/g;
  $unicode =~ s/\δ/\x{03B4}/g;
  # end of dummy rules
  my $cqp_unicode = $self->Call("ceql_query", $unicode);
  return encode("iso-8859-1", $cqp_unicode, Encode::FB_CROAK);
}

# override literal_string rule to insert HTML entities (for non-Latin-1 characters and special treatment of ")
sub literal_string {
  my ($self, $input) = @_;
  $input =~ s/\\//g; # remove backslashes (used to escape CEQL metacharacters)
  ##-- # the real implementation uses the HTML::Entities module to insert HTML entities
  ##-- encode_entities($input, '<>&');            # unsafe characters <, >, & are HTML entities in the canonical BNCweb encoding
  ##-- encode_entities($input, '^\x{00}-\x{FF}'); # encode non-Latin-1 characters as HTML entities (but keep $input in Unicode for now)
  # here, dummy rules covering all entities in the test suite help us to avoid a dependency on the non-standard HTML::Entities module
  $input =~ s/&/&amp;/g;
  $input =~ s/</&lt;/g;
  $input =~ s/>/&gt;/g;
  $input =~ s/\x{03B1}/\&alpha;/g;
  $input =~ s/\x{2665}/\&hearts;/g;
  $input =~ s/\x{03B4}/\&delta;/g;
  # end of dummy rules
  $input =~ s/([.?*+|(){}\[\]\^\$])/\\$1/g;  # escape CQP regexp metacharacters (" is treated separately below)
  $input =~ s/"/&(lsquo|rsquo);/g;           # special handling of " to match both left and right quotes
  return $input;
}

# override lemma_pattern rule to provide support for {book/V} notation
sub lemma_pattern {
  my ($self, $lemma) = @_;
  my $simple_pos = $self->GetParam("simple_pos");
  die "simplified POST tags are not available (internal error)\n"
    unless ref($simple_pos) eq "HASH";
  # split lemma into headword pattern and optional simple POS constraint
  my ($hw, $tag, $extra) = split /(?<!\\)\//, $lemma;
  die "only a single ''/'' separator is allowed between headword and simplified POS in lemma constraint\n"
    if defined $extra;
  die "missing headword in lemma constraint (did you mean ''_{$tag}''?)\n"
    if $hw eq "";
  # translate wildcard pattern for headword and look up simple POS if specified
  my $regexp = $self->Call("wildcard_pattern", $hw);
  if (defined $tag) {
    # simple POS specified => look up in $simple_pos an combine with $regexp
    my $tag_regexp = $simple_pos->{$tag};
    if (not defined $tag_regexp) {
      my @valid_tags = sort keys %$simple_pos;



( run in 0.772 second using v1.01-cache-2.11-cpan-ceb78f64989 )