HTML-HTML5-Parser

 view release on metacpan or  search on metacpan

lib/HTML/HTML5/Parser/Charset/Info.pm  view on Meta::CPAN

    'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'csiso2022jp2' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

## TODO: ...

## GB_2312-80.  A single info object is shared by every lookup key in the
## chain below.  |csiso58gb231280| is indexed as well, because |iana_names|
## declares it as a registered name of this charset.
$IANACharset->{'gb_2312-80'}
= $IANACharset->{'iso-ir-58'}
= $IANACharset->{chinese}
= $IANACharset->{csiso58gb231280}
= $HTMLCharset->{gb231280}
= $HTMLCharset->{isoir58}
= __PACKAGE__->new ({
  ## NOTE: What is represented by this charset is unclear...  I don't 
  ## understand what RFC 1945 describes...
  category => 0, ## No |CHARSET_CATEGORY_*| flag is set.
  iana_names => {
    ## Exactly one name carries |PRIMARY_CHARSET_NAME|; the others are
    ## registered aliases.
    'gb_2312-80' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-58' => REGISTERED_CHARSET_NAME,
    'chinese' => REGISTERED_CHARSET_NAME,
    'csiso58gb231280' => REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    ## TODO: GB2312->GBK Parse Error wrapper
    'cp936' => FALLBACK_ENCODING_IMPL,
  },
  ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
  ## consistent.
});

## TODO: ...

{
  ## UTF-8.  The info object is built once and then registered under each
  ## lookup key, so every key refers to the very same object.
  my $utf8_info = __PACKAGE__->new ({
    bom_pattern => qr/\xEF\xBB\xBF/,
    category => CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT |
        CHARSET_CATEGORY_BLOCK_SAFE,
    ## TODO: |error_level|
    iana_names => {
      ## NOTE: We treat |x-utf-8| as an alias of |utf-8|, since unlike
      ## other charsets like |x-sjis| or |x-euc-jp|, there is no major
      ## variant for the UTF-8 encoding.
      ## TODO: We might have to reconsider this policy, since there are
      ## in fact UTF-8 variants, such as Unicode's UTF-8, ISO/IEC 10646's
      ## UTF-8, UTF-8n, and so on.
      'x-utf-8' => UNREGISTERED_CHARSET_NAME,
      ## NOTE: IANA name "utf-8" references RFC 3629.  According to the
      ## RFC, the definitive definition is the one specified in the
      ## Unicode Standard.
      'utf-8' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
    },
    perl_names => {
      ## NOTE: About the |utf-8-strict| implementation:
      ## It does not support non-Unicode UCS characters (conforming).
      ## It does detect illegal sequences (conforming).
      ## It does not support surrogate pairs (conforming).
      ## It does not support BOMs (non-conforming).
      'utf-8-strict' => ERROR_REPORTING_ENCODING_IMPL |
          SEMICONFORMING_ENCODING_IMPL | PRIMARY_CHARSET_NAME,
    },
  });

  $Charset->{'utf-8'} = $utf8_info;
  $IANACharset->{$_} = $utf8_info for ('utf-8', 'x-utf-8');
  $HTMLCharset->{$_} = $utf8_info for ('utf8', 'xutf8');
}

## UTF-8N.  A single info object is shared by the three lookup keys below.
## The |$HTMLCharset| key is |utf8n|: the other |$HTMLCharset| keys in this
## file are written without punctuation (|utf8|, |xutf8|, |windows936|,
## |utf16be|), and the key has to name this charset, not plain UTF-8.
$Charset->{'utf-8n'}
= $IANACharset->{'utf-8n'}
= $HTMLCharset->{'utf8n'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
      CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'utf-8n' => UNREGISTERED_CHARSET_NAME,
        ## NOTE: Is there any normative definition for the charset?
        ## What variant of UTF-8 should we use for the charset?
  },
  perl_names => {
    ## Same Perl encoding name as the |utf-8| entry uses.
    'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
  },
  ## TODO: |error_level|
});

## TODO: ...

{
  ## GBK.  The info object is built once and then registered under each
  ## lookup key, so every key refers to the very same object.
  my $gbk_info = __PACKAGE__->new ({
    iana_status => STATUS_OBSOLETE | STATUS_COMMON,
    category => CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_BLOCK_SAFE,
    ## TODO: |error_level|
    iana_names => {
      ## |gbk| is the primary name; the rest are registered aliases.
      'windows-936' => REGISTERED_CHARSET_NAME,
      'ms936' => REGISTERED_CHARSET_NAME,
      'cp936' => REGISTERED_CHARSET_NAME,
      'gbk' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
    },
  });

  $Charset->{'gbk'} = $gbk_info;
  $IANACharset->{$_} = $gbk_info
      for ('gbk', 'cp936', 'ms936', 'windows-936');
  $HTMLCharset->{'windows936'} = $gbk_info;
}

{
  ## GB18030.  Known under a single name, which is registered in both
  ## |$Charset| and |$IANACharset| with the same info object.
  my $gb18030_info = __PACKAGE__->new ({
    mime_text_suitable => 1,
    iana_status => STATUS_COMMON,
    iana_names => {
      'gb18030' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
    },
    category => CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_BLOCK_SAFE,
  });

  $IANACharset->{'gb18030'} = $gb18030_info;
  $Charset->{'gb18030'} = $gb18030_info;
}

## TODO: ...

$Charset->{'utf-16be'}
= $IANACharset->{'utf-16be'}
= $HTMLCharset->{'utf16be'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,



( run in 0.626 second using v1.01-cache-2.11-cpan-5a3173703d6 )