HTML-HTML5-Parser
view release on metacpan or search on metacpan
lib/HTML/HTML5/Parser/Charset/Info.pm view on Meta::CPAN
'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
'csiso2022jp2' => REGISTERED_CHARSET_NAME,
},
## TODO: |error_level|
});
## TODO: ...
## GB 2312-80 (also labelled |iso-ir-58| and |chinese|).
##
## A single info object is created and then stored under every lookup
## key listed at the bottom of this block.  The IANA name
## |csiso58gb231280| appears in |iana_names| but is not registered as a
## lookup key here.
{
  my $gb2312_80 = __PACKAGE__->new ({
    ## NOTE: It is unclear what this charset actually represents; the
    ## description in RFC 1945 is hard to interpret.
    category => 0,
    iana_names => {
      'gb_2312-80' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
      'iso-ir-58' => REGISTERED_CHARSET_NAME,
      'chinese' => REGISTERED_CHARSET_NAME,
      'csiso58gb231280' => REGISTERED_CHARSET_NAME,
    },
    perl_names => {
      ## TODO: GB2312->GBK Parse Error wrapper
      'cp936' => FALLBACK_ENCODING_IMPL,
    },
    ## NOTE: |gb2312| is handled as |gbk|, so the properties of the two
    ## entries should be kept consistent.
  });

  ## Every key below refers to the very same object.
  $IANACharset->{$_} = $gb2312_80 for 'gb_2312-80', 'iso-ir-58', 'chinese';
  $HTMLCharset->{$_} = $gb2312_80 for 'gb231280', 'isoir58';
}
## TODO: ...
## UTF-8, together with its unregistered alias |x-utf-8|.
##
## A single info object is created and then stored under every lookup
## key listed at the bottom of this block.
{
  my $utf8 = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
        CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      ## NOTE: The IANA name "utf-8" references RFC 3629.  According to
      ## that RFC, the definitive definition is the one specified in
      ## the Unicode Standard.
      'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
      ## NOTE: |x-utf-8| is treated as an alias of |utf-8| because,
      ## unlike charsets such as |x-sjis| or |x-euc-jp|, there is no
      ## major variant of the UTF-8 encoding.
      ## TODO: This policy may need to be reconsidered, since UTF-8
      ## variants do in fact exist: Unicode's UTF-8, ISO/IEC 10646's
      ## UTF-8, UTF-8n, and so on.
      'x-utf-8' => UNREGISTERED_CHARSET_NAME,
    },
    perl_names => {
      ## NOTE: It does not support non-Unicode UCS characters (conforming).
      ##       It does detect illegal sequences (conforming).
      ##       It does not support surrogate pairs (conforming).
      ##       It does not support BOMs (non-conforming).
      'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
          ERROR_REPORTING_ENCODING_IMPL,
    },
    ## TODO: |error_level|
    bom_pattern => qr/\xEF\xBB\xBF/,
  });

  ## Every key below refers to the very same object.
  $Charset->{'utf-8'} = $utf8;
  $IANACharset->{$_} = $utf8 for 'utf-8', 'x-utf-8';
  $HTMLCharset->{$_} = $utf8 for 'utf8', 'xutf8';
}
## UTF-8N (unregistered name).
##
## One info object is created and stored under all of the lookup keys
## on the left-hand side of the chained assignment.
##
## The |$HTMLCharset| key is |utf8n|: the other |$HTMLCharset| keys in
## this file are the charset name with punctuation removed (|utf8|,
## |xutf8|, |isoir58|, |windows936|, ...), and this entry describes
## |utf-8n|, not |utf-8|.
$Charset->{'utf-8n'}
= $IANACharset->{'utf-8n'}
= $HTMLCharset->{'utf8n'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
      CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'utf-8n' => UNREGISTERED_CHARSET_NAME,
    ## NOTE: Is there any normative definition for this charset?
    ## Which variant of UTF-8 should be used for it?
  },
  perl_names => {
    'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
  },
  ## TODO: |error_level|
});
## TODO: ...
## GBK, with the IANA aliases |cp936|, |ms936| and |windows-936|.
##
## A single info object is created and then stored under every lookup
## key listed at the bottom of this block.  Only |windows936| is
## registered in |$HTMLCharset| here.
{
  my $gbk = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
      'cp936' => REGISTERED_CHARSET_NAME,
      'ms936' => REGISTERED_CHARSET_NAME,
      'windows-936' => REGISTERED_CHARSET_NAME,
    },
    ## TODO: |error_level|
    iana_status => STATUS_COMMON | STATUS_OBSOLETE,
  });

  ## Every key below refers to the very same object.
  $Charset->{'gbk'} = $gbk;
  $IANACharset->{$_} = $gbk for 'gbk', 'cp936', 'ms936', 'windows-936';
  $HTMLCharset->{'windows936'} = $gbk;
}
## GB 18030.
##
## A single info object is created and stored in both |$Charset| and
## |$IANACharset| under the key |gb18030|; no |$HTMLCharset| key is
## registered in this block.
{
  my $gb18030 = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    },
    iana_status => STATUS_COMMON,
    mime_text_suitable => 1,
  });

  ## Both tables refer to the very same object.
  $Charset->{'gb18030'} = $gb18030;
  $IANACharset->{'gb18030'} = $gb18030;
}
## TODO: ...
$Charset->{'utf-16be'}
= $IANACharset->{'utf-16be'}
= $HTMLCharset->{'utf16be'}
= __PACKAGE__->new ({
category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
( run in 0.626 second using v1.01-cache-2.11-cpan-5a3173703d6 )