Data-TableReader-Decoder-HTML
view release on metacpan or search on metacpan
lib/Data/TableReader/Decoder/HTML.pm view on Meta::CPAN
package Data::TableReader::Decoder::HTML;
$Data::TableReader::Decoder::HTML::VERSION = '0.020';
use Moo 2;
use Try::Tiny;
use Carp;
use IO::Handle;
use HTML::Parser;
extends 'Data::TableReader::Decoder';
# ABSTRACT: Access the tables of an HTML document
has _tables => ( is => 'lazy' );
sub parse {
shift->_tables;
return 1;
}
sub _build__tables {
my $self= shift;
# TODO: determine encoding from BOM, or from meta-equiv while parsing...
binmode $self->file_handle;
return $self->_parse_html_tables($self->file_handle);
}
sub _parse_html_tables {
my ($self, $handle)= @_;
# These variables track the state of the HTML parse.
# cur_row is only defined when we are in a table row, and $cur_cell
# is a scalar ref only defined when we are in a cell.
my (@tables, $cur_table, $cur_row, $cur_cell);
my $nested_tables= 0;
my $ignore_all= 0;
my $tag_start= sub {
next if $ignore_all;
my ($tagname, $attr)= (uc $_[0], $_[1]);
if ($tagname eq 'TABLE') {
if ($cur_table) {
$self->_log->('warn','tables within tables are currently returned as a single cell value');
$nested_tables++;
$ignore_all++;
}
else {
push @tables, ($cur_table= []);
}
}
elsif ($tagname eq 'TR') {
$cur_table or croak "found <tr> outside any <table>";
$cur_row and $self->_log->('warn', 'found <tr> before end of previous row');
push @$cur_table, ($cur_row= []);
}
elsif ($tagname eq 'TD' or $tagname eq 'TH') {
$cur_table or croak "found <$tagname> outside any <table>";
$cur_row or croak "found <$tagname> outside any <tr>";
$cur_cell and $self->_log->('warn', "found <$tagname> before previous </$tagname>");
push @$cur_row, '';
$cur_cell= \$cur_row->[-1];
}
};
my $content= sub {
my ($text)= @_;
if ($cur_cell) {
$$cur_cell .= $text
}
elsif ($cur_row && $text =~ /\S/) {
$self->_log->('warn', "Encountered text within a row but not in a cell: '$text'");
}
};
my $tag_end= sub {
my ($tagname)= (uc($_[0]));
if ($ignore_all) {
if ($tagname eq 'TABLE') {
--$nested_tables;
$ignore_all= 0 if $nested_tables <= 0;
}
}
elsif ($tagname eq 'TD' or $tagname eq 'TH') {
$cur_cell or $self->_log->('warn', "Found </$tagname> without matching <$tagname>");
$cur_cell= undef;
}
( run in 2.081 seconds using v1.01-cache-2.11-cpan-5837b0d9d2c )