HTML5-DOM

 view release on metacpan or  search on metacpan

README.pod  view on Meta::CPAN


 my $encoding_id = HTML5::DOM::Encoding::detectByPrescanStream($text, $max_length = 0);

Detect encoding by parsing C<E<lt>metaE<gt>> tags in html.

Returns the encoding id on success, or HTML5::DOM::Encoding->NOT_DETERMINED on failure.

See for more info: L<https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding>

 my $encoding_id = HTML5::DOM::Encoding::detectByPrescanStream('
    <meta http-equiv="content-type" content="text/html; charset=windows-1251">
 ');
 my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
 print $encoding; # WINDOWS-1251

=head3 detectByCharset

 my $encoding_id = HTML5::DOM::Encoding::detectByCharset($text, $max_length = 0);

Extracts the character encoding from a string: finds "charset=" and returns the id of the encoding named after it.

For example, "text/html; charset=windows-1251" returns HTML5::DOM::Encoding->WINDOWS_1251.

Returns HTML5::DOM::Encoding->NOT_DETERMINED on failure.

See for more info: L<https://html.spec.whatwg.org/multipage/infrastructure.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element>

 my $encoding_id = HTML5::DOM::Encoding::detectByCharset("text/html; charset=windows-1251");
 my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
 print $encoding; # WINDOWS-1251

=head3 detectBomAndCut

 my ($encoding_id, $new_text) = HTML5::DOM::Encoding::detectBomAndCut($text, $max_length = 0);

Returns a list containing the encoding id and the text with the BOM removed.

lib/HTML5/DOM.pod  view on Meta::CPAN


 my $encoding_id = HTML5::DOM::Encoding::detectByPrescanStream($text, $max_length = 0);

Detect encoding by parsing C<E<lt>metaE<gt>> tags in html.

Returns the encoding id on success, or HTML5::DOM::Encoding->NOT_DETERMINED on failure.

See for more info: L<https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding>

 my $encoding_id = HTML5::DOM::Encoding::detectByPrescanStream('
    <meta http-equiv="content-type" content="text/html; charset=windows-1251">
 ');
 my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
 print $encoding; # WINDOWS-1251

=head3 detectByCharset

 my $encoding_id = HTML5::DOM::Encoding::detectByCharset($text, $max_length = 0);

Extracts the character encoding from a string: finds "charset=" and returns the id of the encoding named after it.

For example, "text/html; charset=windows-1251" returns HTML5::DOM::Encoding->WINDOWS_1251.

Returns HTML5::DOM::Encoding->NOT_DETERMINED on failure.

See for more info: L<https://html.spec.whatwg.org/multipage/infrastructure.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element>

 my $encoding_id = HTML5::DOM::Encoding::detectByCharset("text/html; charset=windows-1251");
 my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
 print $encoding; # WINDOWS-1251

=head3 detectBomAndCut

 my ($encoding_id, $new_text) = HTML5::DOM::Encoding::detectBomAndCut($text, $max_length = 0);

Returns a list containing the encoding id and the text with the BOM removed.

t/0-api.t  view on Meta::CPAN

# detectBomAndCut: the input begins with the bytes EF BB BF (the UTF-8 BOM).
# The call must report UTF-8 and hand back the text with those bytes removed.
my ($encoding_id, $new_text) = HTML5::DOM::Encoding::detectBomAndCut("\xEF\xBB\xBFtest214");
ok($encoding_id == HTML5::DOM::Encoding->UTF_8, 'detectBomAndCut id');
ok($new_text eq 'test214', 'detectBomAndCut text');

# detectByCharset: the input is a bare Content-Type style string containing
# "charset=..."; no surrounding HTML markup is supplied.
$encoding_id = HTML5::DOM::Encoding::detectByCharset("text/html; charset=windows-1251");
ok($encoding_id == HTML5::DOM::Encoding->WINDOWS_1251, 'detectByCharset');

# detectByPrescanStream: the input is HTML markup; the charset is expected to
# be picked up from the <meta http-equiv="content-type"> tag.
$encoding_id = HTML5::DOM::Encoding::detectByPrescanStream('<meta http-equiv="content-type" content="text/html; charset=windows-1251">');
ok($encoding_id == HTML5::DOM::Encoding->WINDOWS_1251, 'detectByPrescanStream');

my $utf16 = "\x21\x04\x4a\x04\x35\x04\x48\x04\x4c\x04\x20\x00\x35\x04\x49\x04\x51\x04\x20\x00\x4d\x04\x42\x04\x38\x04\x45\x04\x20\x00\x3c\x04\x4f\x04\x33\x04\x3a\x04\x38\x04\x45\x04\x20\x00\x44\x04\x40\x04\x30\x04\x3d\x04\x46\x04\x43\x04\x37\x04\x41\...
my $cp1251 = "\xe5\xed\xe8\x20\xee\xe3\xee\x20\xf1\xf2\xe2\x20\xed\xe8\xff\x20\xee\xe2\xe0\x20\xf2\xe5\xeb\x20\xf0\xe5\xe4\x20\xee\xf1\xf2" x 100;

# detectUnicode: $utf16 holds 16-bit code units stored low byte first
# (e.g. "\x20\x00" for a space), so UTF-16LE is the expected result.
$encoding_id = HTML5::DOM::Encoding::detectUnicode($utf16);
ok($encoding_id == HTML5::DOM::Encoding->UTF_16LE, 'detectUnicode');

# detect

third_party/modest/source/myencoding/detect.c  view on Meta::CPAN

    while(*length < data_size) {
        *length = myencoding_prescan_stream_to_determine_encoding_get_attr(udata, *length, data_size, &attr, &it_last);
        
        /* 9 */
        if(attr.key_length == strlen("http-equiv") &&
           mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"http-equiv", &udata[ attr.key_begin ]))
        {
            if((is_exists & 1) == 0) {
                is_exists |= 1;
                
                if(attr.value_length == strlen("content-type") &&
                   mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content-type", &udata[ attr.value_begin ]))
                {
                    got_pragma = true;
                }
            }
        }
        else if(attr.key_length == strlen("content") &&
                mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content", &udata[ attr.key_begin ]))
        {
            if((is_exists & 2) == 0) {
                is_exists |= 2;



( run in 1.365 second using v1.01-cache-2.11-cpan-d7f47b0818f )