HTML-Parser

 view release on metacpan or  search on metacpan

lib/HTML/HeadParser.pm  view on Meta::CPAN

    } elsif ($tag =~ /^(?:title|noscript|object|command)$/) {
	# Just remember tag.  Initialize header when we see the end tag.
	$self->{'tag'} = $tag;
    } elsif ($tag eq 'link') {
	return unless exists $attr->{href};
	# <link href="http:..." rel="xxx" rev="xxx" title="xxx">
	my $href = delete($attr->{href});
	$href =~ s/^\s+//; $href =~ s/\s+$//; # HTML5
	my $h_val = "<$href>";
	for (sort keys %{$attr}) {
	    next if $_ eq "/";  # XHTML junk
	    $h_val .= qq(; $_="$attr->{$_}");
	}
	$self->{'header'}->push_header(Link => $h_val);
    } elsif ($tag eq 'head' || $tag eq 'html') {
	# ignore
    } else {
	 # stop parsing
	$self->eof;
    }
}

t/declaration.t  view on Meta::CPAN

<"<!-- foo -->">]
[<Entity>
<foo>
<"<!-- foo -->">]

<!row --> foo
EOT

# Test case: a DOCTYPE declaration whose public and system identifiers are
# followed directly by an empty "[]" internal subset.  The expected result
# lists each piece of the declaration as its own <...> token, with the
# brackets kept together as a single "[]" token.
# NOTE(review): $res is presumably filled in by a declaration handler that
# is installed on $p earlier in this file -- confirm against the full test.
$res = "";
$p->parse(<<EOT)->eof;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[]>
EOT
is($res, <<EOT);
[<DOCTYPE>
<html>
<PUBLIC>
<"-//W3C//DTD XHTML 1.0 Strict//EN">
<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<[]>]
EOT

t/headparser.t  view on Meta::CPAN

    # The input starts with a U+FEFF character (byte order mark) and a
    # newline before the <title> element.  parse() must return a true value
    # for this chunk, and the resulting title header must be exactly
    # "Hi <foo>", i.e. the "<foo>" inside the title is kept as literal text.
    $p = HTML::HeadParser->new(H->new);
    ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>"));
    $p->eof;

    is($p->header("title"), "Hi <foo>");

    # Fresh parser with utf8_mode switched on; the document that follows
    # begins with the three bytes \xEF\xBB\xBF (the UTF-8 encoded byte order
    # mark) in front of the DOCTYPE.
    $p = HTML::HeadParser->new(H->new);
    $p->utf8_mode(1);
    $p->parse(
        <<"EOT"); # example from http://rt.cpan.org/Ticket/Display.html?id=27522
\xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
 <head>
 <title>
Parkinson's disease</title>
 <meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications">
 </meta>
 \t
\t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" />
\t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" />
\t

t/headparser.t  view on Meta::CPAN

EOT
    # Signal end of input before inspecting the collected headers.
    $p->eof;

    # The <title> text in the document starts with a newline; the header is
    # expected without it.
    is($p->header("title"), "Parkinson's disease");
    # First Link header: the href wrapped in <...>, followed by the remaining
    # attributes of the first <link> element; the trailing XHTML "/" does not
    # appear in the value.
    is($p->header("link")->[0],
        '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"');

    # Another captured page, again parsed with utf8_mode switched on.  The
    # document lines below carry an explicit "\r" before the newline, and the
    # <title> contains non-ASCII characters written as \x byte escapes plus a
    # numeric character reference (&#8211;).
    $p = HTML::HeadParser->new(H->new);
    $p->utf8_mode(1);
    $p->parse(<<"EOT");    # example from http://www.mjw.com.pl/
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r
\r
<head profile="http://gmpg.org/xfn/11">\r
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r
\r
<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa &#8211; MJW</title>\r
<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r

EOT
    $p->eof;



( run in 1.561 seconds using v1.01-cache-2.11-cpan-49f99fa48dc )