Alt-CWB-ambs
view release on metacpan or search on metacpan
t/44_ceql_bncweb.t view on Meta::CPAN
## test whether syntax errors are recognised by the CEQL parser
our @ErrorTests = split /\n/, $ErrorTests;
while (@ErrorTests) {
my $query = shift @ErrorTests;
my $regexp = shift @ErrorTests;
if (@ErrorTests) {
my $blank = shift @ErrorTests;
die "Shucks! Syntax error in list of error tests (expected blank line)."
unless $blank =~ /^\s*$/;
}
my $msg = "find syntax error in ``$query''";
my $result = $CEQL->Parse($query);
if (defined $result) {
fail($msg);
}
else {
like($CEQL->HtmlErrorMessage, $regexp, $msg);
}
}
########## BEGIN 'BNCweb::CEQL' grammar
package BNCweb::CEQL;
use base 'CWB::CEQL';
use Encode;
use HTML::Entities; # real BNCweb implementation uses this module to decode/encode HTML entities
# constructor: set up attribute names and define simplified POS tags
sub new {
my $class = shift;
my $self = new CWB::CEQL;
$self->SetParam("lemma_attribute", "hw"); # corresponds to lemma in standard CEQL grammar
my $table = { # define lookup table for simple POS tags (refer to class attribute)
"A" => "ADJ",
"ADJ" => "ADJ",
"N" => "SUBST",
"SUBST" => "SUBST",
"V" => "VERB",
"VERB" => "VERB",
"ADV" => "ADV",
"ART" => "ART",
"CONJ" => "CONJ",
"INT" => "INTERJ",
"INTERJ" => "INTERJ",
"PREP" => "PREP",
"PRON" => "PRON",
'$' => "STOP",
"STOP" => "STOP",
"UNC" => "UNC",
};
$self->SetParam("simple_pos", $table);
$self->SetParam("simple_pos_attribute", "class");
my %xml_tags = map { $_ => 1 } # list of s-attribute regions in the BNC version used by BNCweb
(qw(text u div head quote sp speaker stage lg l list label item note bibl corr hi trunc p s mw), # from CWB registry file
# nested attributes are accepted, but should perhaps better be inserted automagically
qw(div1 div2 div3 quote1 list1 list2 item1 item2 hi1 p1 p2));
$self->SetParam("s_attributes", \%xml_tags);
return bless($self, $class);
}
# BNCweb::CEQL expects its input to be in the canonical BNCweb encoding, i.e. Latin-1 + HTML entities;
# the "default" rule first converts the input to a Perl Unicode string, and then re-encodes the resulting CQP query in Latin-1
sub default {
my ($self, $input) = @_;
my $unicode = decode("iso-8859-1", $input);
##-- # the real implementation uses the HTML::Entities module to decode HTML entities
##-- decode_entities($unicode);
# here, dummy rules covering all entities in the test suite help us to avoid a dependency on the non-standard HTML::Entities module
$unicode =~ s/\é/\x{E9}/g;
$unicode =~ s/\à/\x{E0}/g;
$unicode =~ s/\£/\x{A3}/g;
$unicode =~ s/\α/\x{03B1}/g;
$unicode =~ s/\♥/\x{2665}/g;
$unicode =~ s/\δ/\x{03B4}/g;
# end of dummy rules
my $cqp_unicode = $self->Call("ceql_query", $unicode);
return encode("iso-8859-1", $cqp_unicode, Encode::FB_CROAK);
}
# override literal_string rule to insert HTML entities (for non-Latin-1 characters and special treatment of ")
sub literal_string {
my ($self, $input) = @_;
$input =~ s/\\//g; # remove backslashes (used to escape CEQL metacharacters)
##-- # the real implementation uses the HTML::Entities module to insert HTML entities
##-- encode_entities($input, '<>&'); # unsafe characters <, >, & are HTML entities in the canonical BNCweb encoding
##-- encode_entities($input, '^\x{00}-\x{FF}'); # encode non-Latin-1 characters as HTML entities (but keep $input in Unicode for now)
# here, dummy rules covering all entities in the test suite help us to avoid a dependency on the non-standard HTML::Entities module
$input =~ s/&/&/g;
$input =~ s/</</g;
$input =~ s/>/>/g;
$input =~ s/\x{03B1}/\α/g;
$input =~ s/\x{2665}/\♥/g;
$input =~ s/\x{03B4}/\δ/g;
# end of dummy rules
$input =~ s/([.?*+|(){}\[\]\^\$])/\\$1/g; # escape CQP regexp metacharacters (" is treated separately below)
$input =~ s/"/&(lsquo|rsquo);/g; # special handling of " to match both left and right quotes
return $input;
}
# override lemma_pattern rule to provide support for {book/V} notation
sub lemma_pattern {
my ($self, $lemma) = @_;
my $simple_pos = $self->GetParam("simple_pos");
die "simplified POST tags are not available (internal error)\n"
unless ref($simple_pos) eq "HASH";
# split lemma into headword pattern and optional simple POS constraint
my ($hw, $tag, $extra) = split /(?<!\\)\//, $lemma;
die "only a single ''/'' separator is allowed between headword and simplified POS in lemma constraint\n"
if defined $extra;
die "missing headword in lemma constraint (did you mean ''_{$tag}''?)\n"
if $hw eq "";
# translate wildcard pattern for headword and look up simple POS if specified
my $regexp = $self->Call("wildcard_pattern", $hw);
if (defined $tag) {
# simple POS specified => look up in $simple_pos an combine with $regexp
my $tag_regexp = $simple_pos->{$tag};
if (not defined $tag_regexp) {
my @valid_tags = sort keys %$simple_pos;
( run in 0.772 second using v1.01-cache-2.11-cpan-ceb78f64989 )