HTML-HTML5-Parser
view release on metacpan or search on metacpan
lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm view on Meta::CPAN
char_buffer_pos => 0,
character_queue => [],
filehandle => $_[2],
charset => $_[1],
byte_buffer => '',
onerror => $_[3] || sub {},
#onerror_set
};
if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
$obj->{charset} eq $XML_AUTO_CHARSET) {
my $b = ''; # UTF-8 w/o BOM
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
$obj->{input_encoding} = 'UTF-8';
if (read $obj->{filehandle}, $b, 256) {
no warnings "substr";
no warnings "uninitialized";
if (substr ($b, 0, 1) eq "<") {
if (substr ($b, 1, 1) eq "?") { # ASCII8
if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
encoding\s*=\s*["']([^"']*)/x) {
$obj->{input_encoding} = $1;
lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm view on Meta::CPAN
}
if (defined $csdef->{no_bom_variant32endian2143}) {
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
}
}
# \x4C\x6F\xA7\x94 EBCDIC
} # buffer
$obj->{byte_buffer} .= $b;
} # read
} elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
## BOM is optional.
my $b = '';
if (read $obj->{filehandle}, $b, 3) {
if ($b eq "\xEF\xBB\xBF") {
$obj->{has_bom} = 1;
} else {
$obj->{byte_buffer} .= $b;
}
}
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
} elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
## BOM is mandated.
my $b = '';
if (read $obj->{filehandle}, $b, 2) {
if ($b eq "\xFE\xFF") {
$obj->{has_bom} = 1; # UTF-16BE w/o BOM
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
} elsif ($b eq "\xFF\xFE") {
$obj->{has_bom} = 1; # UTF-16LE w/o BOM
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
} else {
$obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
$obj->{has_bom} = 0;
$obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
}
} else {
$obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
$obj->{has_bom} = 0; # UTF-16BE w/o BOM
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
}
}
if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
$obj->{state_2440} = 'gl-jis-1997-swapped';
$obj->{state_2442} = 'gl-jis-1997';
$obj->{state} = 'state_2842';
require Encode::GLJIS1997Swapped;
require Encode::GLJIS1997;
lib/HTML/HTML5/Parser/Charset/Info.pm view on Meta::CPAN
## there are UTF-8 variant in fact, such as
## Unicode's UTF-8, ISO/IEC 10646's UTF-8, UTF-8n, and as
## such.
},
perl_names => {
'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
ERROR_REPORTING_ENCODING_IMPL,
## NOTE: It does not support non-Unicode UCS characters (conforming).
## It does detect illegal sequences (conforming).
## It does not support surrogate pairs (conforming).
## It does not support BOMs (non-conforming).
},
## TODO: |error_level|
bom_pattern => qr/\xEF\xBB\xBF/,
});
$Charset->{'utf-8n'}
= $IANACharset->{'utf-8n'}
= $HTMLCharset->{'utf-8'}
= __PACKAGE__->new ({
category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
lib/HTML/HTML5/Parser/TagSoupParser.pm view on Meta::CPAN
$wrapped_char_stream = $get_wrapper->($char_stream);
$wrapped_char_stream->onerror ($char_onerror);
$return = $self->parse_char_stream ($wrapped_char_stream, @args);
};
$self->_data($return, charset => $charset_name);
return $return;
} # parse_byte_stream
## NOTE: The HTML5 spec says that the encoding layer MUST NOT strip the BOM
## and that the HTML layer MUST ignore it. However, we do strip the BOM in
## the encoding layer, and the HTML layer does not ignore any U+FEFF,
## because the core part of our HTML parser expects a string of characters,
## not a string of bytes or code units or anything which might contain a BOM.
## Therefore, any parser interface that accepts a string of bytes,
## such as |parse_byte_string| in this module, must ensure that it does
## strip the BOM and never strips any ZWNBSP.
## Parse a string of characters (not bytes) as HTML.
##
## Arguments: ($self, $s, $doc, $onerror, $get_wrapper)
##   $s  - the character string, or a reference to one.  A plain string
##         is referenced in place (through the |@_| alias), so it is
##         not copied here.
##   All arguments after $s are forwarded untouched to
##   |parse_char_stream|.
##
## Returns whatever |parse_char_stream| returns, in the caller's context.
sub parse_char_string ($$$;$$) {
  my $self = shift;

  ## Take a reference to the caller's own scalar unless one was supplied.
  my $text_ref;
  if (ref $_[0]) {
    $text_ref = $_[0];
  } else {
    $text_ref = \$_[0];
  }
  shift;  # drop the string; |@_| now holds only the pass-through arguments

  ## Loaded lazily: the handle class is only needed on this code path.
  require HTML::HTML5::Parser::Charset::DecodeHandle;
  my $char_handle
      = HTML::HTML5::Parser::Charset::DecodeHandle::CharString->new ($text_ref);

  return $self->parse_char_stream ($char_handle, @_);
} # parse_char_string

## NOTE: |parse_string| is kept as an alias for backward compatibility.
*parse_string = \&parse_char_string;
( run in 0.360 second using v1.01-cache-2.11-cpan-e9daa2b36ef )