HTML-Parser-Simple
view release on metacpan or search on metacpan
lib/HTML/Parser/Simple.pm view on Meta::CPAN
acronym => 1,
applet => 1,
b => 1,
basefont => 1,
bdo => 1,
big => 1,
br => 1,
button => 1,
cite => 1,
code => 1,
del => 1,
dfn => 1,
em => 1,
font => 1,
i => 1,
iframe => 1,
img => 1,
input => 1,
ins => 1,
kbd => 1,
label => 1,
map => 1,
object => 1,
'q' => 1,
's' => 1,
samp => 1,
script => 1,
select => 1,
small => 1,
span => 1,
strike => 1,
strong => 1,
sub => 1,
sup => 1,
textarea => 1,
tt => 1,
u => 1,
var => 1,
});
$self -> self_close
({
colgroup => 1,
dd => 1,
dt => 1,
li => 1,
options => 1,
p => 1,
td => 1,
tfoot => 1,
th => 1,
thead => 1,
'tr' => 1,
});
$self -> current_node($self -> create_new_node('root', '', Tree::Simple -> ROOT) );
$self -> root($self -> current_node);
if ($self -> xhtml)
{
# Compared to the non-XHTML re, this has an extra ':' in the first [].
$self -> tagged_attribute
(
q#^(<(\w+)((?:\s+[-:\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
);
}
else
{
$self -> tagged_attribute
(
q#^(<(\w+)((?:\s+[-\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
);
}
} # End of BUILD.
# -----------------------------------------------
# Create a new node to store the new tag.
# Each node has metadata:
# o attributes: The tag's attributes, as a string with N spaces as a prefix.
# o content: The content before the tag was parsed.
# o depth: The nesting depth of the tag.
# o name: The HTML tag.
# o node_type: This holds 'global' before '<head>' and between '</head>'
# and '<body>', and after '</body>'. It holds 'head' from
# '<head>' to '</head>', and holds 'body' from '<body>' to
# '</body>'. It's just there in case you need it.
sub create_new_node
{
    # Build a Tree::Simple node for a tag.
    #
    # Parameters:
    # o $name:       The tag name stored under the 'name' key.
    # o $attributes: The attribute string stored under the 'attributes' key.
    # o $parent:     Passed straight through to Tree::Simple -> new() as the parent.
    #
    # Returns: Whatever Tree::Simple -> new() returns for the metadata hashref.
    my($self, $name, $attributes, $parent) = @_;

    # The node starts with no content; depth and node_type are
    # snapshots of the parser's state at the time of creation.
    my(%node_data) =
    (
        attributes => $attributes,
        content    => [],
        depth      => $self -> depth,
        name       => $name,
        node_type  => $self -> node_type,
    );

    return Tree::Simple -> new(\%node_data, $parent);

} # End of create_new_node.
# -----------------------------------------------
sub handle_comment
{
    # A comment is stored the same way as any other content:
    # the text is handed, unchanged, to handle_content().
    my($self, $comment) = @_;

    return $self -> handle_content($comment);

} # End of handle_comment.
# -----------------------------------------------
sub handle_content
{
my($self, $s) = @_;
my($count) = $self -> current_node -> getChildCount;
my($metadata) = $self -> current_node -> getNodeValue;
lib/HTML/Parser/Simple.pm view on Meta::CPAN
die "Can't read($input_file_name): $!\n" if (! defined $html);
$self -> log('Parsing');
$self -> parse($html);
$self -> log('Traversing');
$self -> traverse($self -> root);
$self -> log("Writing $output_file_name");
open($fh, "> $output_file_name") || die "Can't open(> $output_file_name): $!\n";
print $fh $self -> result;
close $fh;
# Return the invocant to allow method chaining.
return $self;
} # End of parse_file.
# -----------------------------------------------
sub parse_start_tag
{
    # Process an opening tag: implicitly close any tags the new tag
    # terminates, record the tag on the stack of open tags, and report it
    # via handle_start_tag().
    #
    # Parameters:
    # o $tag_name:   The tag's name. It is lower-cased before use.
    # o $attributes: The tag's attribute string, passed through untouched.
    # o $unary:      True if the caller already knows the tag has no end tag.
    #                Tags listed in empty() are always treated as unary.
    # o $stack:      Arrayref of the names of the currently open tags,
    #                innermost last. Modified in place.
    #
    # Returns: Whatever handle_start_tag() returns.
    my($self, $tag_name, $attributes, $unary, $stack) = @_;

    $tag_name = lc $tag_name;

    # A block-level tag closes every inline tag still open at the top
    # of the stack. This relies on parse_end_tag() shrinking the stack.
    if (${$self -> block}{$tag_name})
    {
        while (@$stack && ${$self -> inline}{$$stack[-1]})
        {
            $self -> parse_end_tag($$stack[-1], $stack);
        }
    }

    # A self-closing tag (e.g. <li>) closes an open tag of the same name.
    # The stack must be checked for emptiness first, otherwise the 'eq'
    # compares undef and emits an 'uninitialized value' warning.
    if (${$self -> self_close}{$tag_name} && @$stack && ($$stack[-1] eq $tag_name) )
    {
        $self -> parse_end_tag($tag_name, $stack);
    }

    $unary = ${$self -> empty}{$tag_name} || $unary;

    # Only tags which will have a matching end tag are tracked as open.
    push @$stack, $tag_name if (! $unary);

    return $self -> handle_start_tag($tag_name, $attributes, $unary);

} # End of parse_start_tag.
# -----------------------------------------------
sub _set_tagged_attribute
{
    # Store, via tagged_attribute(), the pattern (kept as a string) used to
    # recognise a start tag and its attributes.
    #
    # Parameters:
    # o $new: If true, the XHTML pattern is stored, else the plain HTML one.
    # o $old: Not used.
    my($self, $new, $old) = @_;

    # Compared to the non-XHTML re, the XHTML one has an extra ':' in the first [],
    # i.e. it accepts attribute names containing ':'.
    my($pattern) = $new
        ? q#^(<(\w+)((?:\s+[-:\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
        : q#^(<(\w+)((?:\s+[-\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#;

    $self -> tagged_attribute($pattern);

} # End of _set_tagged_attribute.
# -----------------------------------------------
sub traverse
{
    # Recursively serialise the subtree rooted at $node, appending the
    # output to result().
    #
    # For each node the output is: the start tag, then the content items
    # interleaved with the child nodes (content item N precedes child N),
    # then the content after the last child, then the end tag.
    #
    # Parameters:
    # o $node: A node providing getAllChildren() and getNodeValue().
    #
    # Returns: The invocant, to allow method chaining.
    my($self, $node) = @_;
    my(@kids)        = $node -> getAllChildren;
    my($meta)        = $node -> getNodeValue;
    my($content)     = $$meta{'content'};
    my($name)        = $$meta{'name'};

    # Fetch the content item at a given position, or '' if there is none.
    my($text_at) = sub
    {
        my($i) = @_;

        return ($i <= $#$content && defined($$content[$i]) ) ? $$content[$i] : '';
    };

    # Special check to avoid printing '<root>' when we still need to output
    # the content of the root, e.g. the DOCTYPE.
    if ($name ne 'root')
    {
        $self -> result($self -> result . "<$name$$meta{'attributes'}>");
    }

    for my $i (0 .. $#kids)
    {
        $self -> result($self -> result . $text_at -> ($i) );
        $self -> traverse($kids[$i]);
    }

    # Output the content after the last child node has been closed,
    # but before the current node is closed.
    $self -> result($self -> result . $text_at -> (scalar @kids) );

    # Tags of type empty, and the artificial root, get no end tag.
    if (! ${$self -> empty}{$name} && ($name ne 'root') )
    {
        $self -> result($self -> result . "</$name>");
    }

    # Return the invocant to allow method chaining.
    return $self;

} # End of traverse.
# -----------------------------------------------
1;
=head1 NAME
HTML::Parser::Simple - Parse nice HTML files without needing a compiler
lib/HTML/Parser/Simple.pm view on Meta::CPAN
=head1 Description
C<HTML::Parser::Simple> is a pure Perl module.
It parses HTML V 4 files, and generates a tree of nodes, with 1 node per HTML tag.
The data associated with each node is documented in the L</FAQ>.
See also L<HTML::Parser::Simple::Attributes> and L<HTML::Parser::Simple::Reporter>.
=head1 Distributions
This module is available as a Unix-style distro (*.tgz).
See L<http://savage.net.au/Perl-modules.html> for details.
See L<http://savage.net.au/Perl-modules/html/installing-a-module.html> for
help on unpacking and installing.
=head1 Constructor and initialization
new(...) returns an object of type C<HTML::Parser::Simple>.
This is the class constructor.
Usage: C<< HTML::Parser::Simple -> new >>.
This method takes a hash of options.
Call C<< new() >> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>.
Available options (each one of which is also a method):
=over 4
=item o input_file => $a_file_name
This takes the file name, including the path, of the input file.
Default: '' (the empty string).
=item o output_file => $a_file_name
This takes the file name, including the path, of the output file.
Default: '' (the empty string).
=item o verbose => $Boolean
This takes either a 0 or a 1.
Write more or less progress messages.
Default: 0.
=item o xhtml => $Boolean
This takes either a 0 or a 1.
0 means do not accept an XML declaration, such as <?xml version="1.0" encoding="UTF-8"?>
at the start of the input file, and some other XHTML features, explained next.
1 means accept XHTML input.
Default: 0.
The only XHTML changes to this code, so far, are:
=over 4
=item o Accept the XML declaration
E.g.: <?xml version="1.0" standalone='yes'?>.
=item o Accept attribute names containing the ':' char
E.g.: <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">.
=back
=back
=head1 Methods
=head2 block()
Returns a hashref where the keys are the names of block-level HTML tags.
The corresponding values in the hashref are just 1.
Typical keys: address, form, p, table, tr.
Note: Some keys, e.g. tr, are also returned by L</self_close()>.
=head2 current_node()
Returns the L<Tree::Simple> object which the parser calls the current node.
=head2 depth()
Returns the nesting depth of the current tag.
The method is just here in case you need it.
=head2 empty()
Returns a hashref where the keys are the names of HTML tags of type empty.
The corresponding values in the hashref are just 1.
Typical keys: area, base, input, wbr.
=head2 inline()
Returns a hashref where the keys are the names of HTML tags of type inline.
The corresponding values in the hashref are just 1.
Typical keys: a, em, img, textarea.
=head2 input_file($in_file_name)
Gets or sets the input file name used by L</parse_file($input_file_name, $output_file_name)>.
Note: The parameters passed in to L</parse_file($input_file_name, $output_file_name)> take
precedence over the I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over
the internal values set with C<< input_file($in_file_name) >> and
( run in 1.968 second using v1.01-cache-2.11-cpan-119454b85a5 )