CWB
view release on metacpan or search on metacpan
t/44_ceql_bncweb.t view on Meta::CPAN
# nested attributes are accepted, but should perhaps better be inserted automagically
qw(div1 div2 div3 quote1 list1 list2 item1 item2 hi1 p1 p2));
$self->SetParam("s_attributes", \%xml_tags);
return bless($self, $class);
}
# BNCweb::CEQL expects its input to be in the canonical BNCweb encoding, i.e. Latin-1 + HTML entities;
# the "default" rule first converts the input to a Perl Unicode string, and then re-encodes the resulting CQP query in Latin-1
# Entry rule of the grammar: decodes the Latin-1 input into a Perl Unicode
# string, expands the named HTML entities used by the test suite, hands the
# result to the "ceql_query" rule and returns the generated CQP query
# re-encoded as Latin-1.
#
#   $input  - CEQL query as Latin-1 bytes, possibly containing HTML entities
#   returns - value of the "ceql_query" rule, encoded as Latin-1 bytes
#   dies    - if the generated query contains a character that cannot be
#             represented in Latin-1 (Encode::FB_CROAK)
sub default {
  my ($self, $input) = @_;
  my $unicode = decode("iso-8859-1", $input);

  ##--   # the real implementation uses the HTML::Entities module to decode HTML entities
  ##--   decode_entities($unicode);
  # here, dummy rules covering all entities in the test suite help us to avoid
  # a dependency on the non-standard HTML::Entities module
  my %char_for = (
    eacute => "\x{E9}",
    agrave => "\x{E0}",
    pound  => "\x{A3}",
    alpha  => "\x{03B1}",
    hearts => "\x{2665}",
    delta  => "\x{03B4}",
  );
  # match the full "&name;" form; any entity not listed above is left untouched
  my $entity_names = join "|", map { quotemeta } sort keys %char_for;
  $unicode =~ s/&($entity_names);/$char_for{$1}/g;
  # end of dummy rules

  my $cqp_unicode = $self->Call("ceql_query", $unicode);
  # FB_CROAK: fail loudly instead of silently substituting unmappable characters
  return encode("iso-8859-1", $cqp_unicode, Encode::FB_CROAK);
}
# override literal_string rule to insert HTML entities (for non-Latin-1 characters and special treatment of ")
sub literal_string {
my ($self, $input) = @_;
$input =~ s/\\//g; # remove backslashes (used to escape CEQL metacharacters)
##-- # the real implementation uses the HTML::Entities module to insert HTML entities
##-- encode_entities($input, '<>&'); # unsafe characters <, >, & are HTML entities in the canonical BNCweb encoding
##-- encode_entities($input, '^\x{00}-\x{FF}'); # encode non-Latin-1 characters as HTML entities (but keep $input in Unicode for now)
# here, dummy rules covering all entities in the test suite help us to avoid a dependency on the non-standard HTML::Entities module
( run in 0.418 second using v1.01-cache-2.11-cpan-88abd93f124 )