BOM results from the CPAN

HTML-FormatExternal


{
  # Try format_file() with a "base" option, using the Elinks formatter on
  # devel/base.html.  The returned text is held in $str but not printed;
  # the script exits as soon as the call has completed.

  require HTML::FormatText::Elinks;
  my $class = 'HTML::FormatText::Elinks';
  my @args = ('devel/base.html', base => 'http://localhost');
  my $str = $class->format_file(@args);
  exit 0;
}
{
  # BOM on input
  # lynx recognises automatically

  my $html = "<html><body><p>Hello world</p></body></html>\n";
  require Encode;

   $html = Encode::encode('utf-32',$html); # with BOM
  # $html = "\xFF\xFE\x00\x00" . Encode::encode('utf-32le',$html); # with BOM
  $html = ("\x20\x00\x00\x00" x 8) . $html;  # BE spaces

  print "HTML input string:\n";
  IPC::Run::run(['hd'],'<',\$html, '>','/tmp/hd.txt');
  IPC::Run::run(['cat'],'<','/tmp/hd.txt');

  require HTML::FormatText::Lynx;
  my $text = HTML::FormatText::Lynx->format_string ($html,
                                                    input_charset=>'UTF-32',
                                                    # output_charset=>'UTF-8',

lib/HTML/FormatExternal.pm view on Meta::CPAN

  my ($options, $str, $input_wide) = @_;
  my $base = delete $options->{'base'};
  ### _base_prefix: $base

  $base = "$base";           # stringize possible URI object
  $base = _entitize($base);  # probably shouldn't be any non-ascii in a url
  $base = "<base href=\"$base\">\n";

  my $pos = 0;
  unless ($input_wide) {
    # encode $base in the input_charset, and possibly after a BOM.
    #
    # Lynx recognises a BOM, if it doesn't have other -assume_charset.  It
    # recognises it only at the start of the file, so must insert <base>
    # after it here to preserve that feature of Lynx.
    #
    # If input_charset is utf-32 or utf-16 then it seems reasonable to step
    # over any BOM.  But Lynx for some reason doesn't like a BOM together
    # with utf-32 or utf-16 specified.  Dunno if that's a bug or a feature
    # on its part.

    my $input_charset = $options->{'input_charset'};
    if (! defined $input_charset || lc($input_charset) eq 'utf-32') {
      if ($str =~ /^\000\000\376\377/) {
        $input_charset = 'utf-32be';
        $pos = 4;
      } elsif ($str =~ /^\377\376\000\000/) {
        $input_charset = 'utf-32le';

lib/HTML/FormatExternal.pm view on Meta::CPAN

# Return a copy of $str in which every character outside the printable
# ASCII range 0x20 to 0x7E is replaced by a decimal numeric character
# reference "&#NNN;".  Control characters such as newline and tab are
# outside that range and so are converted too.  Printable ASCII, including
# characters like & < > and ", is left unchanged.
#
sub _entitize {
  my ($str) = @_;
  $str =~ s{([^\x20-\x7E])}{ sprintf('&#%d;', ord($1)) }ge;
  ### $str
  return $str;
}

1;
__END__

=for stopwords HTML-FormatExternal formatter formatters charset charsets TreeBuilder ie latin-1 config Elinks absolutized tty Ryde filename recognise BOM UTF entitized unrepresentable untaint superset onwards overstriking

=head1 NAME

HTML::FormatExternal - HTML to text formatting using external programs

=head1 DESCRIPTION

This is a collection of formatter modules which turn HTML into plain text by
dumping it through the respective external programs.

lib/HTML/FormatExternal.pm view on Meta::CPAN

    $modulever = HTML::FormatText::Netrik->VERSION;
    $modulever = $formatter->VERSION

=back

=head1 CHARSETS

File or byte string input is by default interpreted by the programs in their
usual ways.  This should mean HTML Latin-1 but user configurations might
override that and some programs recognise a C<< <meta> >> charset
declaration or a Unicode BOM.  The C<input_charset> option below can force
the input charset.

A Perl wide-character input string is encoded and passed to the program in
whatever way that program best understands.  Usually this is UTF-8 but in
some cases it is entitized instead.  The C<input_charset> option can force
the input charset to use if for some reason UTF-8 is not best.

The output string is either bytes or wide chars.  By default output is the
same as input, so wide char string input gives wide output and byte input
string or file input gives byte output.  The C<output_wide> option can force

( run in 0.379 second using v1.01-cache-2.11-cpan-e9daa2b36ef )