HTML-HTML5-Parser

 view release on metacpan or  search on metacpan

lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm  view on Meta::CPAN

package HTML::HTML5::Parser::Charset::DecodeHandle;
## skip Test::Tabs
use strict;
use warnings;

our $VERSION = '0.992';

## NOTE: |Message::Charset::Info| uses this module without calling
## the constructor.
use HTML::HTML5::Parser::Charset::Info;

my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;

## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
sub create_decode_handle ($$$;$) {
  my $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$_[1]};
  my $obj = {
             category => 0,
             char_buffer => \(my $s = ''),
             char_buffer_pos => 0,
             character_queue => [],
             filehandle => $_[2],
             charset => $_[1],
             byte_buffer => '',
             onerror => $_[3] || sub {},
             #onerror_set
            };
  if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
      $obj->{charset} eq $XML_AUTO_CHARSET) {
    my $b = ''; # UTF-8 w/o BOM
    $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
    $obj->{input_encoding} = 'UTF-8';
    if (read $obj->{filehandle}, $b, 256) {
      no warnings "substr";
      no warnings "uninitialized";
      if (substr ($b, 0, 1) eq "<") {
        if (substr ($b, 1, 1) eq "?") { # ASCII8
          if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
              encoding\s*=\s*["']([^"']*)/x) {
            $obj->{input_encoding} = $1;
            my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
            $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri};
            if (not $csdef->{ascii8} or $csdef->{bom_required}) {
              $obj->{onerror}->(undef, 'charset-name-mismatch-error',
                                charset_uri => $uri,
                                charset_name => $obj->{input_encoding});
            }
          } else {
            $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
            $obj->{input_encoding} = 'UTF-8';
          }
          if (defined $csdef->{no_bom_variant}) {
            $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant}};
          }
        } elsif (substr ($b, 1, 1) eq "\x00") {
          if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
            my $c = $b; $c =~ tr/\x00//d;
            if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
                encoding\s*=\s*["']([^"']*)/x) {
              $obj->{input_encoding} = $1;
              my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
              $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri};
              if (not $csdef->{ascii16} or $csdef->{ascii16be} or
                  $csdef->{bom_required}) {
                $obj->{onerror}->(undef, 'charset-name-mismatch-error',
                                  charset_uri => $uri,
                                  charset_name => $obj->{input_encoding});
              }
            } else {
              $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
              $obj->{input_encoding} = 'UTF-8';
            }
            if (defined $csdef->{no_bom_variant16le}) {
              $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
            }
          } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
            my $c = $b; $c =~ tr/\x00//d;
            if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
                encoding\s*=\s*["']([^"']*)/x) {
              $obj->{input_encoding} = $1;
              my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
              $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri};
              if (not $csdef->{ascii32} or 
                  $csdef->{ascii32endian1234} or
                  $csdef->{ascii32endian2143} or
                  $csdef->{ascii32endian3412} or
                  $csdef->{bom_required}) {
                $obj->{onerror}->(undef, 'charset-name-mismatch-error',
                                  charset_uri => $uri,
                                  charset_name => $obj->{input_encoding});

lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm  view on Meta::CPAN

        } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
          $obj->{has_bom} = 1;
          substr ($b, 0, 4) = '';
          my $c = $b; $c =~ tr/\x00//d;
          if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
              encoding\s*=\s*["']([^"']*)/x) {
            $obj->{input_encoding} = $1;
            my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
            $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri};
            if (not $csdef->{ascii32} or
                $csdef->{ascii32endian2143} or
                $csdef->{ascii32endian3412} or
                $csdef->{ascii32endian4321} or
                $csdef->{bom_required}) {
              $obj->{onerror}->(undef, 'charset-name-mismatch-error',
                                charset_uri => $uri,
                                charset_name => $obj->{input_encoding});
            }
          } else {
            $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
            $obj->{input_encoding} = 'UTF-8';
            $obj->{has_bom} = 0;
            $obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
         }
          if (defined $csdef->{no_bom_variant32endian1234}) {
            $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
          }
        } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
          $obj->{has_bom} = 1;
          substr ($b, 0, 4) = '';
          my $c = $b; $c =~ tr/\x00//d;
          if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
              encoding\s*=\s*["']([^"']*)/x) {
            $obj->{input_encoding} = $1;
            my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
            $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri};
            if (not $csdef->{ascii32} or
                $csdef->{ascii32endian1234} or
                $csdef->{ascii32endian3412} or
                $csdef->{ascii32endian4321} or
                $csdef->{bom_required}) {
              $obj->{onerror}->(undef, 'charset-name-mismatch-error',
                                charset_uri => $uri,
                                charset_name => $obj->{input_encoding});
            }
          } else {
            $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
            $obj->{input_encoding} = 'UTF-8';
            $obj->{has_bom} = 0;
            $obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
          }
          if (defined $csdef->{no_bom_variant32endian2143}) {
            $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
          }
        }
        # \x4C\x6F\xA7\x94 EBCDIC
      } # buffer
      $obj->{byte_buffer} .= $b;
    } # read
  } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
    ## BOM is optional.
    my $b = '';
    if (read $obj->{filehandle}, $b, 3) {
      if ($b eq "\xEF\xBB\xBF") {
        $obj->{has_bom} = 1;
      } else {
        $obj->{byte_buffer} .= $b;
      }
    }
    $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
  } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
    ## BOM is mandated.
    my $b = '';
    if (read $obj->{filehandle}, $b, 2) {
      if ($b eq "\xFE\xFF") {
        $obj->{has_bom} = 1; # UTF-16BE w/o BOM
        $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
      } elsif ($b eq "\xFF\xFE") {
        $obj->{has_bom} = 1; # UTF-16LE w/o BOM
        $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
      } else {
        $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
        $obj->{has_bom} = 0;
        $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
        $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
      }
    } else {
      $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
      $obj->{has_bom} = 0; # UTF-16BE w/o BOM
      $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
    }
  }

  if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1997-swapped';
    $obj->{state_2442} = 'gl-jis-1997';
    $obj->{state} = 'state_2842';
    require Encode::GLJIS1997Swapped;
    require Encode::GLJIS1997;
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP';
    }
  } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1978';
    $obj->{state_2442} = 'gl-jis-1983';
    $obj->{state} = 'state_2842';
    require Encode::GLJIS1978;
    require Encode::GLJIS1983;
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP';
    }
  } elsif (defined $csdef->{perl_name}->[0]) {
    if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
        $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
      $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
      require Encode::EUCJP1997;
      if (Encode::find_encoding ($obj->{perl_encoding_name})) {
        $obj->{category} |= HTML::HTML5::Parser::Charset::Info::CHARSET_CATEGORY_EUCJP;
        return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode';
      }
    } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
             $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
      $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
      require Encode::ShiftJIS1997;
      if (Encode::find_encoding ($obj->{perl_encoding_name})) {
        return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode';
      }
    } elsif ($csdef->{is_block_safe}) {
      $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
      require Encode;
      if (Encode::find_encoding ($obj->{perl_encoding_name})) {
        return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode';
      }
    }
  }
  
  $obj->{onerror}->(undef, 'charset-not-supported-error',
                    charset_uri => $obj->{charset});
  return undef;
} # create_decode_handle

sub name_to_uri ($$$) {
  my $domain = $_[1];
  my $name = lc $_[2];

  if ($domain eq 'ietf') {
    return $IANA_CHARSET . $name;



( run in 0.761 second using v1.01-cache-2.11-cpan-99c4e6809bf )