Data-TableReader-Decoder-HTML

 view release on metacpan or  search on metacpan

lib/Data/TableReader/Decoder/HTML.pm  view on Meta::CPAN

package Data::TableReader::Decoder::HTML;
$Data::TableReader::Decoder::HTML::VERSION = '0.020';
use Moo 2;
use Try::Tiny;
use Carp;
use IO::Handle;
use HTML::Parser;
extends 'Data::TableReader::Decoder';

# ABSTRACT: Access the tables of an HTML document


has _tables => ( is => 'lazy' );
sub parse {
	shift->_tables;
	return 1;
}

sub _build__tables {
	my $self= shift;
	# TODO: determine encoding from BOM, or from meta-equiv while parsing...
	binmode $self->file_handle;
	return $self->_parse_html_tables($self->file_handle);
}

sub _parse_html_tables {
	my ($self, $handle)= @_;
	# These variables track the state of the HTML parse.
	# cur_row is only defined when we are in a table row, and $cur_cell
	# is a scalar ref only defined when we are in a cell.
	my (@tables, $cur_table, $cur_row, $cur_cell);
	my $nested_tables= 0;
	my $ignore_all= 0;

	my $tag_start= sub {
		next if $ignore_all;
		my ($tagname, $attr)= (uc $_[0], $_[1]);
		if ($tagname eq 'TABLE') {
			if ($cur_table) {
				$self->_log->('warn','tables within tables are currently returned as a single cell value');
				$nested_tables++;
				$ignore_all++;
			}
			else {
				push @tables, ($cur_table= []);
			}
		}
		elsif ($tagname eq 'TR') {
			$cur_table or croak "found <tr> outside any <table>";
			$cur_row and $self->_log->('warn', 'found <tr> before end of previous row');
			push @$cur_table, ($cur_row= []);
		}
		elsif ($tagname eq 'TD' or $tagname eq 'TH') {
			$cur_table or croak "found <$tagname> outside any <table>";
			$cur_row or croak "found <$tagname> outside any <tr>";
			$cur_cell and $self->_log->('warn', "found <$tagname> before previous </$tagname>");
			push @$cur_row, '';
			$cur_cell= \$cur_row->[-1];
		}
	};
	my $content= sub {
		my ($text)= @_;
		if ($cur_cell) {
			$$cur_cell .= $text
		}
		elsif ($cur_row && $text =~ /\S/) {
			$self->_log->('warn', "Encountered text within a row but not in a cell: '$text'");
		}
	};
	my $tag_end= sub {
		my ($tagname)= (uc($_[0]));
		if ($ignore_all) {
			if ($tagname eq 'TABLE') {
				--$nested_tables;
				$ignore_all= 0 if $nested_tables <= 0;
			}
		}
		elsif ($tagname eq 'TD' or $tagname eq 'TH') {
			$cur_cell or $self->_log->('warn', "Found </$tagname> without matching <$tagname>");
			$cur_cell= undef;
		}



( run in 2.081 seconds using v1.01-cache-2.11-cpan-5837b0d9d2c )