HTML5-DOM
view release on metacpan or search on metacpan
my $encoding_id = HTML5::DOM::Encoding::detectByPrescanStream($text, $max_length = 0);
Detects the encoding by parsing C<E<lt>metaE<gt>> tags in the HTML.
Returns the encoding id on success, or HTML5::DOM::Encoding->NOT_DETERMINED on failure.
See for more info: L<https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding>
my $encoding_id = HTML5::DOM::Encoding::detectByPrescanStream('
<meta http-equiv="content-type" content="text/html; charset=windows-1251">
');
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
print $encoding; # WINDOWS-1251
=head3 detectByCharset
my $encoding_id = HTML5::DOM::Encoding::detectByCharset($text, $max_length = 0);
Extracts the character encoding from a string: finds "charset=" and reads the encoding name that follows it. Returns the id of the encoding found.
For example, "text/html; charset=windows-1251" returns HTML5::DOM::Encoding->WINDOWS_1251.
Returns HTML5::DOM::Encoding->NOT_DETERMINED on failure.
See for more info: L<https://html.spec.whatwg.org/multipage/infrastructure.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element>
my $encoding_id = HTML5::DOM::Encoding::detectByCharset("text/html; charset=windows-1251");
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
print $encoding; # WINDOWS-1251
=head3 detectBomAndCut
my ($encoding_id, $new_text) = HTML5::DOM::Encoding::detectBomAndCut($text, $max_length = 0);
Returns a list with the encoding id and the text with the BOM removed.
lib/HTML5/DOM.pod view on Meta::CPAN
my $encoding_id = HTML5::DOM::Encoding::detectByPrescanStream($text, $max_length = 0);
Detects the encoding by parsing C<E<lt>metaE<gt>> tags in the HTML.
Returns the encoding id on success, or HTML5::DOM::Encoding->NOT_DETERMINED on failure.
See for more info: L<https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding>
my $encoding_id = HTML5::DOM::Encoding::detectByPrescanStream('
<meta http-equiv="content-type" content="text/html; charset=windows-1251">
');
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
print $encoding; # WINDOWS-1251
=head3 detectByCharset
my $encoding_id = HTML5::DOM::Encoding::detectByCharset($text, $max_length = 0);
Extracts the character encoding from a string: finds "charset=" and reads the encoding name that follows it. Returns the id of the encoding found.
For example, "text/html; charset=windows-1251" returns HTML5::DOM::Encoding->WINDOWS_1251.
Returns HTML5::DOM::Encoding->NOT_DETERMINED on failure.
See for more info: L<https://html.spec.whatwg.org/multipage/infrastructure.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element>
my $encoding_id = HTML5::DOM::Encoding::detectByCharset("text/html; charset=windows-1251");
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
print $encoding; # WINDOWS-1251
=head3 detectBomAndCut
my ($encoding_id, $new_text) = HTML5::DOM::Encoding::detectBomAndCut($text, $max_length = 0);
Returns a list with the encoding id and the text with the BOM removed.
# detectBomAndCut: the input begins with the bytes EF BB BF (the UTF-8 byte
# order mark) followed by "test214". The call is expected to report UTF_8 and
# to hand back the text with those three leading bytes removed.
my ($encoding_id, $new_text) = HTML5::DOM::Encoding::detectBomAndCut("\xEF\xBB\xBFtest214");
ok($encoding_id == HTML5::DOM::Encoding->UTF_8, 'detectBomAndCut id');
ok($new_text eq 'test214', 'detectBomAndCut text');
# detectByCharset: the input is a bare Content-Type style value containing
# "charset=windows-1251"; the expected result is the WINDOWS_1251 id.
# $encoding_id is reused from the declaration above.
$encoding_id = HTML5::DOM::Encoding::detectByCharset("text/html; charset=windows-1251");
ok($encoding_id == HTML5::DOM::Encoding->WINDOWS_1251, 'detectByCharset');
# detectByPrescanStream: the input is a complete <meta http-equiv> tag whose
# content attribute carries the same charset; the expected result is again
# the WINDOWS_1251 id.
$encoding_id = HTML5::DOM::Encoding::detectByPrescanStream('<meta http-equiv="content-type" content="text/html; charset=windows-1251">');
ok($encoding_id == HTML5::DOM::Encoding->WINDOWS_1251, 'detectByPrescanStream');
my $utf16 = "\x21\x04\x4a\x04\x35\x04\x48\x04\x4c\x04\x20\x00\x35\x04\x49\x04\x51\x04\x20\x00\x4d\x04\x42\x04\x38\x04\x45\x04\x20\x00\x3c\x04\x4f\x04\x33\x04\x3a\x04\x38\x04\x45\x04\x20\x00\x44\x04\x40\x04\x30\x04\x3d\x04\x46\x04\x43\x04\x37\x04\x41\...
my $cp1251 = "\xe5\xed\xe8\x20\xee\xe3\xee\x20\xf1\xf2\xe2\x20\xed\xe8\xff\x20\xee\xe2\xe0\x20\xf2\xe5\xeb\x20\xf0\xe5\xe4\x20\xee\xf1\xf2" x 100;
# detectUnicode
$encoding_id = HTML5::DOM::Encoding::detectUnicode($utf16);
ok($encoding_id == HTML5::DOM::Encoding->UTF_16LE, 'detectUnicode');
# detect
third_party/modest/source/myencoding/detect.c view on Meta::CPAN
while(*length < data_size) {
*length = myencoding_prescan_stream_to_determine_encoding_get_attr(udata, *length, data_size, &attr, &it_last);
/* 9 */
if(attr.key_length == strlen("http-equiv") &&
mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"http-equiv", &udata[ attr.key_begin ]))
{
if((is_exists & 1) == 0) {
is_exists |= 1;
if(attr.value_length == strlen("content-type") &&
mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content-type", &udata[ attr.value_begin ]))
{
got_pragma = true;
}
}
}
else if(attr.key_length == strlen("content") &&
mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content", &udata[ attr.key_begin ]))
{
if((is_exists & 2) == 0) {
is_exists |= 2;
( run in 1.365 second using v1.01-cache-2.11-cpan-d7f47b0818f )