CWB

 view release on metacpan or  search on metacpan

t/44_ceql_bncweb.t  view on Meta::CPAN

     # nested attributes are accepted, but should perhaps better be inserted automagically
     qw(div1 div2 div3 quote1 list1 list2 item1 item2 hi1 p1 p2));
  $self->SetParam("s_attributes", \%xml_tags);
  return bless($self, $class);
}

# BNCweb::CEQL expects its input to be in the canonical BNCweb encoding, i.e. Latin-1 + HTML entities;
# the "default" rule first converts the input to a Perl Unicode string, and then re-encodes the resulting CQP query in Latin-1
#
#   $input  - CEQL query as a Latin-1 byte string, possibly containing HTML entities
#   returns - the result of the "ceql_query" rule, encoded as a Latin-1 byte string
#   dies    - encode() is called with Encode::FB_CROAK, so the rule croaks if the
#             result contains a character that cannot be represented in Latin-1
sub default {
  my ($self, $input) = @_;
  my $unicode = decode("iso-8859-1", $input);
  ##-- # the real implementation uses the HTML::Entities module to decode HTML entities
  ##-- decode_entities($unicode);
  # here, dummy rules covering all entities in the test suite help us to avoid a dependency on the non-standard HTML::Entities module
  my %char_for = (
    eacute => "\x{E9}",
    agrave => "\x{E0}",
    pound  => "\x{A3}",
    alpha  => "\x{03B1}",
    hearts => "\x{2665}",
    delta  => "\x{03B4}",
  );
  # match the entity *names* (e.g. "&eacute;"), not the characters they stand for;
  # entities that are not listed above are deliberately left untouched
  $unicode =~ s/&(eacute|agrave|pound|alpha|hearts|delta);/$char_for{$1}/g;
  # end of dummy rules
  my $cqp_unicode = $self->Call("ceql_query", $unicode);
  return encode("iso-8859-1", $cqp_unicode, Encode::FB_CROAK);
}

# override literal_string rule to insert HTML entities (for non-Latin-1 characters and special treatment of ")
sub literal_string {
  my ($self, $input) = @_;
  $input =~ s/\\//g; # remove backslashes (used to escape CEQL metacharacters)
  ##-- # the real implementation uses the HTML::Entities module to insert HTML entities
  ##-- encode_entities($input, '<>&');            # unsafe characters <, >, & are HTML entities in the canonical BNCweb encoding
  ##-- encode_entities($input, '^\x{00}-\x{FF}'); # encode non-Latin-1 characters as HTML entities (but keep $input in Unicode for now)
  # here, dummy rules covering all entities in the test suite help us to avoid a dependency on the non-standard HTML::Entities module



( run in 0.394 second using v1.01-cache-2.11-cpan-88abd93f124 )