HTML-Parser-Simple

 view release on metacpan or  search on metacpan

lib/HTML/Parser/Simple.pm  view on Meta::CPAN

	 acronym => 1,
	 applet => 1,
	 b => 1,
	 basefont => 1,
	 bdo => 1,
	 big => 1,
	 br => 1,
	 button => 1,
	 cite => 1,
	 code => 1,
	 del => 1,
	 dfn => 1,
	 em => 1,
	 font => 1,
	 i => 1,
	 iframe => 1,
	 img => 1,
	 input => 1,
	 ins => 1,
	 kbd => 1,
	 label => 1,
	 map => 1,
	 object => 1,
	 'q' => 1,
	 's' => 1,
	 samp => 1,
	 script => 1,
	 select => 1,
	 small => 1,
	 span => 1,
	 strike => 1,
	 strong => 1,
	 sub => 1,
	 sup => 1,
	 textarea => 1,
	 tt => 1,
	 u => 1,
	 var => 1,
	});

	$self -> self_close
	({
	 colgroup => 1,
	 dd => 1,
	 dt => 1,
	 li => 1,
	 options => 1,
	 p => 1,
	 td => 1,
	 tfoot => 1,
	 th => 1,
	 thead => 1,
	 'tr' => 1,
	});

	$self -> current_node($self -> create_new_node('root', '', Tree::Simple -> ROOT) );
	$self -> root($self -> current_node);

	if ($self -> xhtml)
	{
		# Compared to the non-XHTML re, this has an extra  ':' in the first [].

		$self -> tagged_attribute
		(
			q#^(<(\w+)((?:\s+[-:\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
		);
	}
	else
	{
		$self -> tagged_attribute
		(
			q#^(<(\w+)((?:\s+[-\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
		);
	}

}	# End of BUILD.

# -----------------------------------------------
# Create a new node to store the new tag.
# Each node has metadata:
# o attributes: The tag's attributes, as a string with N spaces as a prefix.
# o content:    The content before the tag was parsed, held in an arrayref
#               (which is empty when the node is created).
# o depth:      The value of depth() at the time the node was created.
# o name:       The HTML tag.
# o node_type:  This holds 'global' before '<head>' and between '</head>'
#               and '<body>', and after '</body>'. It holds 'head' from
#               '<head>' to '</head>', and holds 'body' from '<body>' to
#               '</body>'. It's just there in case you need it.

sub create_new_node
{
	my($self, $name, $attributes, $parent) = @_;

	# Build the per-node metadata. The depth and node_type values are
	# read from the parser at the moment the node is created. The content
	# list starts out empty.

	my(%node_value) =
	(
		attributes => $attributes,
		content    => [],
		depth      => $self -> depth,
		name       => $name,
		node_type  => $self -> node_type,
	);

	# Wrap the metadata in a Tree::Simple node, passing $parent through
	# to the Tree::Simple constructor unchanged.

	return Tree::Simple -> new(\%node_value, $parent);

} # End of create_new_node.

# -----------------------------------------------

sub handle_comment
{
	my($self, $comment) = @_;

	# A comment gets no special treatment: it is handed to
	# handle_content() exactly as given.

	return $self -> handle_content($comment);

} # End of handle_comment.

# -----------------------------------------------

sub handle_content
{
	my($self, $s)                 = @_;
	my($count)                    = $self -> current_node -> getChildCount;
	my($metadata)                 = $self -> current_node -> getNodeValue;

lib/HTML/Parser/Simple.pm  view on Meta::CPAN

	die "Can't read($input_file_name): $!\n" if (! defined $html);

	$self -> log('Parsing');

	$self -> parse($html);

	$self -> log('Traversing');

	$self -> traverse($self -> root);

	$self -> log("Writing $output_file_name");

	open($fh, "> $output_file_name") || die "Can't open(> $output_file_name): $!\n";
	print $fh $self -> result;
	close $fh;

	# Return the invocant to allow method chaining.

	return $self;

} # End of parse_file.

# -----------------------------------------------

# Process one opening tag.
#
# Parameters:
# o $tag_name:   The tag's name. It is lower-cased before use.
# o $attributes: The tag's attribute string, passed through to handle_start_tag().
# o $unary:      True if the tag was written as self-terminating. It is also
#                forced to true when the tag is listed in empty().
# o $stack:      An arrayref of the names of the currently open tags.
#                It is modified in place.
#
# Returns whatever handle_start_tag() returns.

sub parse_start_tag
{
	my($self, $tag_name, $attributes, $unary, $stack) = @_;
	$tag_name = lc $tag_name;

	# A block-level tag implicitly ends any inline tags still open on
	# top of the stack.
	# NOTE(review): this loop only terminates if parse_end_tag() removes
	# the named tag from $stack - confirm against parse_end_tag().

	if (${$self -> block}{$tag_name})
	{
		while (@$stack && ${$self -> inline}{$$stack[-1]})
		{
			$self -> parse_end_tag($$stack[-1], $stack);
		}
	}

	# A self-closing tag (e.g. <li>) directly inside an open tag of the
	# same name ends that open tag first. The stack must be checked for
	# emptiness before looking at its top element, otherwise undef is
	# compared with eq and Perl warns about an uninitialized value.

	if (${$self -> self_close}{$tag_name} && @$stack && ($$stack[-1] eq $tag_name) )
	{
		$self -> parse_end_tag($tag_name, $stack);
	}

	$unary = ${$self -> empty}{$tag_name} || $unary;

	# Only tags which can have content are tracked as open.

	push @$stack, $tag_name if (! $unary);

	$self -> handle_start_tag($tag_name, $attributes, $unary);

} # End of parse_start_tag.

# -----------------------------------------------

sub _set_tagged_attribute
{
	# $new is treated as a Boolean: true selects the XHTML pattern.
	# $old is accepted but not used.
	# NOTE(review): the ($new, $old) signature looks like an attribute
	# trigger for xhtml() - confirm against the attribute declarations.

	my($self, $new, $old) = @_;

	# The two patterns differ only in the attribute-name character class:
	# the XHTML one also accepts ':', as in xml:lang.

	my($xhtml_pattern) = q#^(<(\w+)((?:\s+[-:\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#;
	my($html_pattern)  = q#^(<(\w+)((?:\s+[-\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#;

	$self -> tagged_attribute($new ? $xhtml_pattern : $html_pattern);

} # End of _set_tagged_attribute.

# -----------------------------------------------

# Recursively serialize $node and its descendants, appending the generated
# text to result(). Returns the invocant to allow method chaining.

sub traverse
{
	my($self, $node) = @_;
	my(@kid)         = $node -> getAllChildren;
	my($value)       = $node -> getNodeValue;
	my($chunk)       = $$value{'content'};
	my($tag)         = $$value{'name'};
	my($is_root)     = $tag eq 'root';

	# Fetch the content chunk stored at a given position, or the empty
	# string when that slot is missing or undefined.

	my($chunk_at) = sub
	{
		my($position) = @_;

		return ($position <= $#$chunk && defined($$chunk[$position]) ) ? $$chunk[$position] : '';
	};

	# The synthetic root node is never printed as a tag, but its content
	# (e.g. the DOCTYPE) still has to be output below.

	if (! $is_root)
	{
		$self -> result($self -> result . '<' . $tag . $$value{'attributes'} . '>');
	}

	# Chunk N of the content is output just before child N.

	my($position) = 0;

	for my $child (@kid)
	{
		$self -> result($self -> result . $chunk_at -> ($position) );
		$self -> traverse($child);

		$position++;
	}

	# $position now equals the number of children. That slot holds the
	# content after the last child was closed, which must be output
	# before the current node is closed.

	$self -> result($self -> result . $chunk_at -> ($position) );

	# Tags listed in empty() get no closing tag, and neither does the root.

	if (! ${$self -> empty}{$tag} && ! $is_root)
	{
		$self -> result($self -> result . "</$tag>");
	}

	return $self;

} # End of traverse.

# -----------------------------------------------

1;

=head1 NAME

HTML::Parser::Simple - Parse nice HTML files without needing a compiler

lib/HTML/Parser/Simple.pm  view on Meta::CPAN

=head1 Description

C<HTML::Parser::Simple> is a pure Perl module.

It parses HTML V 4 files, and generates a tree of nodes, with 1 node per HTML tag.

The data associated with each node is documented in the L</FAQ>.

See also L<HTML::Parser::Simple::Attributes> and L<HTML::Parser::Simple::Reporter>.

=head1 Distributions

This module is available as a Unix-style distro (*.tgz).

See L<http://savage.net.au/Perl-modules.html> for details.

See L<http://savage.net.au/Perl-modules/html/installing-a-module.html> for
help on unpacking and installing.

=head1 Constructor and initialization

new(...) returns an object of type C<HTML::Parser::Simple>.

This is the class constructor.

Usage: C<< HTML::Parser::Simple -> new >>.

This method takes a hash of options.

Call C<< new() >> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>.

Available options (each one of which is also a method):

=over 4

=item o input_file => $a_file_name

This takes the file name, including the path, of the input file.

Default: '' (the empty string).

=item o output_file => $a_file_name

This takes the file name, including the path, of the output file.

Default: '' (the empty string).

=item o verbose => $Boolean

This takes either a 0 or a 1.

Write more or less progress messages.

Default: 0.

=item o xhtml => $Boolean

This takes either a 0 or a 1.

0 means do not accept an XML declaration, such as <?xml version="1.0" encoding="UTF-8"?>
at the start of the input file, and some other XHTML features, explained next.

1 means accept XHTML input.

Default: 0.

The only XHTML changes to this code, so far, are:

=over 4

=item o Accept the XML declaration

E.g.: <?xml version="1.0" standalone='yes'?>.

=item o Accept attribute names containing the ':' char

E.g.: <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">.

=back

=back

=head1 Methods

=head2 block()

Returns a hashref where the keys are the names of block-level HTML tags.

The corresponding values in the hashref are just 1.

Typical keys: address, form, p, table, tr.

Note: Some keys, e.g. tr, are also returned by L</self_close()>.

=head2 current_node()

Returns the L<Tree::Simple> object which the parser calls the current node.

=head2 depth()

Returns the nesting depth of the current tag.

The method is just here in case you need it.

=head2 empty()

Returns a hashref where the keys are the names of HTML tags of type empty.

The corresponding values in the hashref are just 1.

Typical keys: area, base, input, wbr.

=head2 inline()

Returns a hashref where the keys are the names of HTML tags of type inline.

The corresponding values in the hashref are just 1.

Typical keys: a, em, img, textarea.

=head2 input_file($in_file_name)

Gets or sets the input file name used by L</parse_file($input_file_name, $output_file_name)>.

Note: The parameters passed in to L</parse_file($input_file_name, $output_file_name)>, take
precedence over the I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over
the internal values set with C<< input_file($in_file_name) >> and



( run in 1.968 second using v1.01-cache-2.11-cpan-119454b85a5 )