WebSource
view release on metacpan or search on metacpan
WebSource/Parser.pm view on Meta::CPAN
package WebSource::Parser;
use strict;
use XML::LibXML;
use HTML::TreeBuilder;
{
    # MyTreeBuilder - an HTML::TreeBuilder subclass that cleans up the token
    # stream (attribute names and text) before handing each token to the
    # parent class.
    package MyTreeBuilder;
    our @ISA = ('HTML::TreeBuilder');

    # start($tag, \%attr, \@attrseq, $origtext)
    #
    # Start-tag handler. Every attribute whose name contains a character
    # other than a word character, ':' or '-' is dropped; the remaining
    # attributes are forwarded, in their original order, to the parent
    # handler. A warning is emitted for each dropped attribute when the
    # object's 'verbose' field is true.
    sub start {
        my ($self, $tag, $attr, $attrseq, $origtext) = @_;
        my %clean_attr;
        my @clean_seq;

        # The loop variable is deliberately not called $a, which is the
        # special variable used by sort.
        foreach my $name (@$attrseq) {
            if ($name =~ m#[^\w_:\-]#) {
                $self->{verbose}
                    and warn "Bad attribute $name detected and removed";
            }
            else {
                push @clean_seq, $name;
                $clean_attr{$name} = $attr->{$name};
            }
        }
        return $self->SUPER::start($tag, \%clean_attr, \@clean_seq, $origtext);
    }

    # text($origtext, $iscdata)
    #
    # Text handler. For non-CDATA text, NUL characters and single-digit
    # numeric character references (&#0; .. &#9;) are stripped before the
    # text is forwarded to the parent handler. CDATA text is forwarded
    # untouched.
    sub text {
        my ($self, $origtext, $iscdata) = @_;
        if (!$iscdata) {
            if ($origtext =~ m/\0/) {
                $self->{verbose} and warn "Detected null char\n";
                $origtext =~ s/\0//g;
            }
            if ($origtext =~ m/\&\#[0-9]\;/) {
                # NOTE(review): unlike the other diagnostics this warning is
                # not gated on 'verbose'; kept as-is to preserve behaviour.
                warn "Bad entity detected";
                $origtext =~ s/\&\#[0-9]\;//g;
            }
        }
        return $self->SUPER::text($origtext, $iscdata);
    }
}
# The current package is a subclass of XML::LibXML.
our @ISA = qw(XML::LibXML);
=head1 NAME
WebSource::Parser - An XML/HTML parser extending XML::LibXML
=head1 DESCRIPTION
A simple XML::LibXML extension that makes HTML parsing more robust by
using HTML::TreeBuilder
=head1 SYNOPSIS
my $parser = WebSource::Parser->new;
=head1 METHODS
=over 2
=item B<< $parser = WebSource::Parser->new; >>
Create a new WebSource::Parser
=cut
# Constructor: forwards to the parent class constructor with
# 'verbose => 1' placed ahead of whatever options the caller supplied,
# and returns the object the parent built.
# NOTE(review): a caller-supplied 'verbose' pair comes later in the list
# and so presumably overrides the default - confirm against the parent.
sub new {
    my ($class, @options) = @_;
    my $parser = $class->SUPER::new(verbose => 1, @options);
    return $parser;
}
=item B<< $parser->parse_html_file($file); >>
Parse an html file
=cut
# parse_html_file($file)
#
# Parses the HTML in $file with MyTreeBuilder, serialises the resulting
# tree as XML and returns whatever the parent class's parse_string
# returns for that XML.
sub parse_html_file {
    my ($self, $file) = @_;
    my $tb = MyTreeBuilder->new;
    # $tb->xml_mode(1);

    # NOTE(review): the result of parse_file is not checked, so an
    # unreadable file is presumably turned into an empty document rather
    # than an error - confirm whether that is intended.
    $tb->parse_file($file);

    # Serialise first, then release the tree explicitly: HTML::TreeBuilder
    # asks for ->delete once a tree is no longer needed so that its
    # nodes are freed.
    my $xml = $tb->as_XML;
    $tb->delete;

    return $self->SUPER::parse_string($xml);
}
=item B<< $parser->parse_html_string($string); >>
Parse an html string
=cut
sub parse_html_string {
my $self = shift;
my $string = shift;
my $tb = MyTreeBuilder->new;
$tb->parse($string);
( run in 1.475 second using v1.01-cache-2.11-cpan-2398b32b56e )