HTML-HTML5-Parser
view release on metacpan or search on metacpan
lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm view on Meta::CPAN
package HTML::HTML5::Parser::Charset::DecodeHandle;
## skip Test::Tabs
use strict;
use warnings;
our $VERSION = '0.992';
## NOTE: |Message::Charset::Info| uses this module without calling
## the constructor.
use HTML::HTML5::Parser::Charset::Info;
my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;
## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
sub create_decode_handle ($$$;$) {
my $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$_[1]};
my $obj = {
category => 0,
char_buffer => \(my $s = ''),
char_buffer_pos => 0,
character_queue => [],
filehandle => $_[2],
charset => $_[1],
byte_buffer => '',
onerror => $_[3] || sub {},
#onerror_set
};
if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
$obj->{charset} eq $XML_AUTO_CHARSET) {
my $b = ''; # UTF-8 w/o BOM
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
$obj->{input_encoding} = 'UTF-8';
if (read $obj->{filehandle}, $b, 256) {
no warnings "substr";
no warnings "uninitialized";
if (substr ($b, 0, 1) eq "<") {
if (substr ($b, 1, 1) eq "?") { # ASCII8
if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
encoding\s*=\s*["']([^"']*)/x) {
$obj->{input_encoding} = $1;
my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri};
if (not $csdef->{ascii8} or $csdef->{bom_required}) {
$obj->{onerror}->(undef, 'charset-name-mismatch-error',
charset_uri => $uri,
charset_name => $obj->{input_encoding});
}
} else {
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
$obj->{input_encoding} = 'UTF-8';
}
if (defined $csdef->{no_bom_variant}) {
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant}};
}
} elsif (substr ($b, 1, 1) eq "\x00") {
if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
my $c = $b; $c =~ tr/\x00//d;
if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
encoding\s*=\s*["']([^"']*)/x) {
$obj->{input_encoding} = $1;
my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri};
if (not $csdef->{ascii16} or $csdef->{ascii16be} or
$csdef->{bom_required}) {
$obj->{onerror}->(undef, 'charset-name-mismatch-error',
charset_uri => $uri,
charset_name => $obj->{input_encoding});
}
} else {
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
$obj->{input_encoding} = 'UTF-8';
}
if (defined $csdef->{no_bom_variant16le}) {
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
}
} elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
my $c = $b; $c =~ tr/\x00//d;
if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
encoding\s*=\s*["']([^"']*)/x) {
$obj->{input_encoding} = $1;
my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri};
if (not $csdef->{ascii32} or
$csdef->{ascii32endian1234} or
$csdef->{ascii32endian2143} or
$csdef->{ascii32endian3412} or
$csdef->{bom_required}) {
$obj->{onerror}->(undef, 'charset-name-mismatch-error',
charset_uri => $uri,
charset_name => $obj->{input_encoding});
lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm view on Meta::CPAN
} elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
$obj->{has_bom} = 1;
substr ($b, 0, 4) = '';
my $c = $b; $c =~ tr/\x00//d;
if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
encoding\s*=\s*["']([^"']*)/x) {
$obj->{input_encoding} = $1;
my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri};
if (not $csdef->{ascii32} or
$csdef->{ascii32endian2143} or
$csdef->{ascii32endian3412} or
$csdef->{ascii32endian4321} or
$csdef->{bom_required}) {
$obj->{onerror}->(undef, 'charset-name-mismatch-error',
charset_uri => $uri,
charset_name => $obj->{input_encoding});
}
} else {
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
$obj->{input_encoding} = 'UTF-8';
$obj->{has_bom} = 0;
$obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
}
if (defined $csdef->{no_bom_variant32endian1234}) {
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
}
} elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
$obj->{has_bom} = 1;
substr ($b, 0, 4) = '';
my $c = $b; $c =~ tr/\x00//d;
if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
encoding\s*=\s*["']([^"']*)/x) {
$obj->{input_encoding} = $1;
my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri};
if (not $csdef->{ascii32} or
$csdef->{ascii32endian1234} or
$csdef->{ascii32endian3412} or
$csdef->{ascii32endian4321} or
$csdef->{bom_required}) {
$obj->{onerror}->(undef, 'charset-name-mismatch-error',
charset_uri => $uri,
charset_name => $obj->{input_encoding});
}
} else {
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
$obj->{input_encoding} = 'UTF-8';
$obj->{has_bom} = 0;
$obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
}
if (defined $csdef->{no_bom_variant32endian2143}) {
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
}
}
# \x4C\x6F\xA7\x94 EBCDIC
} # buffer
$obj->{byte_buffer} .= $b;
} # read
} elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
## BOM is optional.
my $b = '';
if (read $obj->{filehandle}, $b, 3) {
if ($b eq "\xEF\xBB\xBF") {
$obj->{has_bom} = 1;
} else {
$obj->{byte_buffer} .= $b;
}
}
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
} elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
## BOM is mandated.
my $b = '';
if (read $obj->{filehandle}, $b, 2) {
if ($b eq "\xFE\xFF") {
$obj->{has_bom} = 1; # UTF-16BE w/o BOM
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
} elsif ($b eq "\xFF\xFE") {
$obj->{has_bom} = 1; # UTF-16LE w/o BOM
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
} else {
$obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
$obj->{has_bom} = 0;
$obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
}
} else {
$obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
$obj->{has_bom} = 0; # UTF-16BE w/o BOM
$csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
}
}
if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
$obj->{state_2440} = 'gl-jis-1997-swapped';
$obj->{state_2442} = 'gl-jis-1997';
$obj->{state} = 'state_2842';
require Encode::GLJIS1997Swapped;
require Encode::GLJIS1997;
if (Encode::find_encoding ($obj->{state_2440}) and
Encode::find_encoding ($obj->{state_2442})) {
return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP';
}
} elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
$obj->{state_2440} = 'gl-jis-1978';
$obj->{state_2442} = 'gl-jis-1983';
$obj->{state} = 'state_2842';
require Encode::GLJIS1978;
require Encode::GLJIS1983;
if (Encode::find_encoding ($obj->{state_2440}) and
Encode::find_encoding ($obj->{state_2442})) {
return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP';
}
} elsif (defined $csdef->{perl_name}->[0]) {
if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
$csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
$obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
require Encode::EUCJP1997;
if (Encode::find_encoding ($obj->{perl_encoding_name})) {
$obj->{category} |= HTML::HTML5::Parser::Charset::Info::CHARSET_CATEGORY_EUCJP;
return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode';
}
} elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
$csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
$obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
require Encode::ShiftJIS1997;
if (Encode::find_encoding ($obj->{perl_encoding_name})) {
return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode';
}
} elsif ($csdef->{is_block_safe}) {
$obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
require Encode;
if (Encode::find_encoding ($obj->{perl_encoding_name})) {
return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode';
}
}
}
$obj->{onerror}->(undef, 'charset-not-supported-error',
charset_uri => $obj->{charset});
return undef;
} # create_decode_handle
sub name_to_uri ($$$) {
my $domain = $_[1];
my $name = lc $_[2];
if ($domain eq 'ietf') {
return $IANA_CHARSET . $name;
( run in 0.761 second using v1.01-cache-2.11-cpan-99c4e6809bf )