HTML-Parser
view release on metacpan or search on metacpan
t/headparser.t view on Meta::CPAN
# The parser set up before this point must report a false Isindex header.
ok(!$p->header('Isindex'), 'Isindex header is false');
# Try feeding one char at a time
# Remember what the current parser produced, then push the same document
# through a fresh parser a single character per parse() call; both runs
# must end up with the same as_string output.
my $expected = $p->as_string;
$p = HTML::HeadParser->new(H->new);
for my $char (split //, $HTML) {
    # Stop feeding as soon as parse() returns false.
    $p->parse($char) or last;
}
is($p->as_string, $expected, 'char-at-a-time parse matches previous result');
# Try reading it from a file
# The scratch file name embeds the current PID; refuse to overwrite a file
# that already exists under that name.
my $file = "hptest$$.html";
die "$file already exists" if -e $file;
{
    open(my $fh, '>', $file) or die "Can't create $file: $!";
    binmode($fh);

    # Original document first, then a large amount of trailing content,
    # including extra <title> elements.
    print {$fh} $HTML;
    print {$fh} "<p>This is more content...</p>\n" x 2000;
    print {$fh} "<title>Buuuh!</title>\n" x 200;
    close $fh or die "Can't close $file: $!";
}
$p = HTML::HeadParser->new(H->new);

# parse_file() hands back undef (with $! set) when the file cannot be
# opened; report that instead of silently carrying on.  Only warn, so the
# scratch file below is still removed.
$p->parse_file($file) or warn "Can't parse $file: $!";
unlink($file) or warn "Can't unlink $file: $!";
# Compare the Title header of the parser that just read the scratch file
# against the expected title text.
# NOTE(review): the expected literal holds raw non-ASCII characters and, in
# this copy, breaks across two lines right after its first character, which
# looks like a mis-decoded "\xC5" (A with ring) -- confirm against the file's
# real encoding before editing; explicit \x escapes would remove the ambiguity.
is($p->header("Title"), "Ã
være eller å ikke være");
# We got into an infinite loop on data without tags and no EOL.
# This was actually a HTML::Parser bug.
{
    open(my $fh, '>', $file) or die "Can't create $file: $!";
    print {$fh} "Foo";

    # Buffered write errors only surface at close, so check it here too.
    close $fh or die "Can't close $file: $!";
}
$p = HTML::HeadParser->new(H->new);

# parse_file() returns undef (with $! set) if the file cannot be opened.
$p->parse_file($file) or warn "Can't parse $file: $!";
unlink($file) or warn "Can't unlink $file: $!";
ok(!$p->as_string, 'as_string is false for data without tags or EOL');
SKIP: {
    # Check that a leading Unicode BOM does not confuse the parser.
    # (No skip() call is made inside this block; the label is kept so the
    # block's shape stays as it was.)
    $p = HTML::HeadParser->new(H->new);
    ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>"),
        'parse returns true for input starting with U+FEFF');
    $p->eof;
    is($p->header("title"), "Hi <foo>", 'title found after leading U+FEFF');

    # utf8_mode: here the document starts with the bytes EF BB BF.
    $p = HTML::HeadParser->new(H->new);
    $p->utf8_mode(1);
    $p->parse(<<"EOT");    # example from http://rt.cpan.org/Ticket/Display.html?id=27522
\xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<title>
Parkinson's disease</title>
<meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications">
</meta>
\t
\t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" />
\t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" />
\t
\t </head>
<body>
EOT
    $p->eof;
    is($p->header("title"), "Parkinson's disease",
        'title found after byte-encoded BOM in utf8_mode');
    is($p->header("link")->[0],
        '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"',
        'first link header built from the first <link> element');

    # utf8_mode with non-ASCII title text given as raw UTF-8 bytes.  Every
    # non-ASCII byte, including the en dash (E2 80 93), is written as a \x
    # escape so the input does not depend on how this source file is encoded
    # or on whether "use utf8" is in effect.
    $p = HTML::HeadParser->new(H->new);
    $p->utf8_mode(1);
    $p->parse(<<"EOT");    # example from http://www.mjw.com.pl/
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r
\r
<head profile="http://gmpg.org/xfn/11">\r
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r
\r
<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW</title>\r
<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r
EOT
    $p->eof;
    is($p->header("title"),
        "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW",
        'title comes back as the same UTF-8 bytes in utf8_mode'
    );
}
( run in 0.452 second using v1.01-cache-2.11-cpan-119454b85a5 )