XHTML results from the CPAN

XHTML
HTML-Parser
view release on metacpan or search on metacpan
# The parser currently held in $p must not report an 'Isindex' header.
ok(!$p->header('Isindex'));

# Feed the document to a fresh parser one character at a time and check that
# the resulting headers match what the previous parser in $p produced.
my $expected = $p->as_string;
$p = HTML::HeadParser->new(H->new);
for my $char (split //, $HTML) {

    # Pass a named copy rather than $1: arguments are aliased through @_, so
    # a capture variable could change under the callee. Stop feeding as soon
    # as parse() returns a false value.
    $p->parse($char) or last;
}
is($p->as_string, $expected, 'char-at-a-time parse matches earlier parse');

# Round-trip through the filesystem: write the document to a scratch file
# named after the current process id, then read it back with parse_file().
my $file = "hptest$$.html";
-e $file and die "$file already exists";

{
    open(my $out, '>', $file) or die "Can't create $file: $!";
    binmode($out);

    # $HTML goes first, followed by a long run of body markup and a run of
    # repeated <title> lines.
    for my $chunk (
        $HTML,
        "<p>This is more content...</p>\n" x 2000,
        "<title>Buuuh!</title>\n" x 200,
    ) {
        print {$out} $chunk;
    }

    close $out or die "Can't close $file: $!";
}

$p = HTML::HeadParser->new(H->new);
$p->parse_file($file);
unlink $file or warn "Can't unlink $file: $!";

# NOTE(review): this literal looks like UTF-8 text that was re-read as a
# single-byte encoding. It has to agree with the <title> inside $HTML, which
# is defined outside this section -- confirm both before changing either.
is($p->header("Title"), "Ã… vÃ¦re eller Ã¥ ikke vÃ¦re");


# Regression test: data without tags and without a trailing EOL used to send
# us into an infinite loop. This was actually a HTML::Parser bug.
{
    open(my $fh, '>', $file) or die "Can't create $file: $!";
    print {$fh} "Foo";

    # Buffered write errors only surface at close, so check it here just as
    # the fixture-writing code earlier in this file does.
    close $fh or die "Can't close $file: $!";
}

$p = HTML::HeadParser->new(H->new);
$p->parse_file($file);
unlink($file) or warn "Can't unlink $file: $!";

ok(!$p->as_string, 'tagless input without EOL yields no headers');

# Byte-order-mark and UTF-8 handling. Each sub-test installs a fresh parser
# in $p, feeds it one document, calls eof, and then inspects the headers.
{
    # Test that the Unicode BOM does not confuse us: a leading U+FEFF
    # character followed by a newline precedes the <title>.
    $p = HTML::HeadParser->new(H->new);
    ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>"),
        'parse() returns true for input starting with U+FEFF');
    $p->eof;

    is($p->header("title"), "Hi <foo>", 'title found after U+FEFF');

    # Same idea with utf8_mode on and the BOM given as its three UTF-8 bytes.
    # Example from http://rt.cpan.org/Ticket/Display.html?id=27522
    $p = HTML::HeadParser->new(H->new);
    $p->utf8_mode(1);
    $p->parse(<<"EOT");
\xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
 <head>
 <title>
Parkinson's disease</title>
 <meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications">
 </meta>
 \t
\t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" />
\t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" />
\t
\t </head>
 <body>
EOT
    $p->eof;

    is($p->header("title"), "Parkinson's disease",
        'title found after UTF-8 encoded BOM in utf8_mode');
    is($p->header("link")->[0],
        '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"',
        'first link header reflects the first <link> element');

    # utf8_mode again, this time with CR line endings, UTF-8 encoded bytes in
    # the title and a numeric character reference (&#8211;) that is expected
    # to come back as the bytes \xE2\x80\x93.
    # Example from http://www.mjw.com.pl/
    $p = HTML::HeadParser->new(H->new);
    $p->utf8_mode(1);
    $p->parse(<<"EOT");
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r
\r
<head profile="http://gmpg.org/xfn/11">\r
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r
\r
<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa &#8211; MJW</title>\r
<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r

EOT
    $p->eof;
    is($p->header("title"),
        "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW",
        'UTF-8 title with numeric entity comes back as UTF-8 bytes'
    );
}
( run in 0.452 second using v1.01-cache-2.11-cpan-119454b85a5 )