BOM results from the CPAN

HTML-HTML5-Parser

             char_buffer_pos => 0,
             character_queue => [],
             filehandle => $_[2],
             charset => $_[1],
             byte_buffer => '',
             onerror => $_[3] || sub {},
             #onerror_set
            };
  if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
      $obj->{charset} eq $XML_AUTO_CHARSET) {
    my $b = ''; # UTF-8 w/o BOM
    $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
    $obj->{input_encoding} = 'UTF-8';
    if (read $obj->{filehandle}, $b, 256) {
      no warnings "substr";
      no warnings "uninitialized";
      if (substr ($b, 0, 1) eq "<") {
        if (substr ($b, 1, 1) eq "?") { # ASCII8
          if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
              encoding\s*=\s*["']([^"']*)/x) {
            $obj->{input_encoding} = $1;

lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm view on Meta::CPAN

          }
          if (defined $csdef->{no_bom_variant32endian2143}) {
            $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
          }
        }
        # \x4C\x6F\xA7\x94 EBCDIC
      } # buffer
      $obj->{byte_buffer} .= $b;
    } # read
  } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
    ## BOM is optional.
    my $b = '';
    if (read $obj->{filehandle}, $b, 3) {
      if ($b eq "\xEF\xBB\xBF") {
        $obj->{has_bom} = 1;
      } else {
        $obj->{byte_buffer} .= $b;
      }
    }
    $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
  } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
    ## BOM is mandated.
    my $b = '';
    if (read $obj->{filehandle}, $b, 2) {
      if ($b eq "\xFE\xFF") {
        $obj->{has_bom} = 1; # UTF-16BE w/o BOM
        $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
      } elsif ($b eq "\xFF\xFE") {
        $obj->{has_bom} = 1; # UTF-16LE w/o BOM
        $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
      } else {
        $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
        $obj->{has_bom} = 0;
        $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
        $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
      }
    } else {
      $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
      $obj->{has_bom} = 0; # UTF-16BE w/o BOM
      $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
    }
  }

  if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1997-swapped';
    $obj->{state_2442} = 'gl-jis-1997';
    $obj->{state} = 'state_2842';
    require Encode::GLJIS1997Swapped;
    require Encode::GLJIS1997;

lib/HTML/HTML5/Parser/Charset/Info.pm view on Meta::CPAN

                 ## there are UTF-8 variants in fact, such as
                 ## Unicode's UTF-8, ISO/IEC 10646's UTF-8, UTF-8n, and
                 ## so on.
  },
  perl_names => {
    'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
        ERROR_REPORTING_ENCODING_IMPL,
        ## NOTE: It does not support non-Unicode UCS characters (conforming).
        ## It does detect illegal sequences (conforming).
        ## It does not support surrogate pairs (conforming).
        ## It does not support BOMs (non-conforming).
  },
  ## TODO: |error_level|
  bom_pattern => qr/\xEF\xBB\xBF/,
});

$Charset->{'utf-8n'}
= $IANACharset->{'utf-8n'}
= $HTMLCharset->{'utf-8'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |

lib/HTML/HTML5/Parser/TagSoupParser.pm view on Meta::CPAN


    $wrapped_char_stream = $get_wrapper->($char_stream);
    $wrapped_char_stream->onerror ($char_onerror);

    $return = $self->parse_char_stream ($wrapped_char_stream, @args);
  };
  $self->_data($return, charset => $charset_name);
  return $return;
} # parse_byte_stream

## NOTE: The HTML5 spec says that the encoding layer MUST NOT strip the
## BOM and that the HTML layer MUST ignore it.  However, we do strip the
## BOM in the encoding layer, and the HTML layer does not ignore any
## U+FEFF, because the core part of our HTML parser expects a string of
## characters, not a string of bytes or code units or anything else which
## might contain a BOM.  Therefore, any parser interface that accepts a
## string of bytes, such as |parse_byte_string| in this module, must
## ensure that it does strip the BOM and never strips any ZWNBSP.

sub parse_char_string ($$$;$$) {
  ## Positional arguments: ($self, $s, $doc, $onerror, $get_wrapper).
  ## |$s| may be either the string itself or a reference to it; every
  ## argument after |$s| is forwarded to |parse_char_stream| untouched.
  my $self = shift;

  ## When a plain string was given, reference the caller's own scalar
  ## (the |@_| alias) instead of making a copy of it.
  my $string_ref;
  if (ref $_[0]) {
    $string_ref = $_[0];
  } else {
    $string_ref = \($_[0]);
  }

  ## Loaded lazily, only when a character string is actually parsed.
  require HTML::HTML5::Parser::Charset::DecodeHandle;
  my $handle
      = HTML::HTML5::Parser::Charset::DecodeHandle::CharString->new ($string_ref);

  ## Drop the string argument; what remains in |@_| is passed through.
  shift;
  return $self->parse_char_stream ($handle, @_);
} # parse_char_string
*parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.

( run in 0.360 second using v1.01-cache-2.11-cpan-e9daa2b36ef )