HTML-HTML5-ToText

 view release on metacpan or  search on metacpan

lib/HTML/HTML5/ToText.pm  view on Meta::CPAN

package HTML::HTML5::ToText;

use 5.010;
use common::sense;
use utf8;

# Distribution metadata. Assigned inside BEGIN so both package variables
# are already populated while the remainder of this file is being compiled.
BEGIN {
	# Identifier of the party responsible for this module.
	$HTML::HTML5::ToText::AUTHORITY = 'cpan:TOBYINK';
	# Module version string.
	$HTML::HTML5::ToText::VERSION   = '0.004';
}

use Moose;
with('MooseX::Traits');

# Redefine the default of the already-existing "_trait_namespace" attribute
# (the leading "+" modifies an attribute rather than declaring a new one) so
# that it is this package's name followed by "::Trait".
# NOTE(review): presumably this is the attribute MooseX::Traits consults to
# expand short trait names -- confirm against the MooseX::Traits docs.
has '+_trait_namespace' => (
	default => __PACKAGE__ . '::Trait',
);

use HTML::HTML5::Parser;
use XML::LibXML::PrettyPrint;

# Generates one method per known HTML element at compile time.
#
# Each element name is upper-cased to form the method name (e.g. "div"
# becomes DIV). Every generated method forwards to one of four handlers --
# _noshow, _empty, _inline or _block -- passing the lower-case element name
# as the first argument, followed by whatever arguments the caller supplied.
BEGIN
{
	my @noshow = qw[base basefont bgsound meta param script style];
	# "isindex" is the legacy HTML void element; the original list spelled
	# it as the two words "is index" (presumably a typo). The two stray
	# entries are kept so that the IS and INDEX methods continue to exist.
	my @empty  = qw[br canvas col command embed frame hr
	                img is index isindex keygen link];
	my @inline = qw[a abbr area b bdi bdo big button cite code dfn em font i
	                input kbd label mark meter nobr progress q rp rt ruby s
	                samp small span strike strong sub sup time tt u var wbr];
	my @block  = qw[address applet article aside audio blockquote body caption
	                center colgroup datalist del dir div dd details dl dt
	                fieldset figcaption figure footer form frameset h1 h2 h3
	                h4 h5 h6 head header hgroup html iframe ins legend li
	                listing map marquee menu nav noembed noframes noscript
	                object ol optgroup option p pre select section source summary
	                table tbody td tfoot th thead title tr track ul video];
	
	# Handler name => element list. Installation order is inline, block,
	# empty, noshow, so a name appearing in two lists would end up bound to
	# the handler of the later list.
	my @dispatch = (
		[ _inline => \@inline ],
		[ _block  => \@block  ],
		[ _empty  => \@empty  ],
		[ _noshow => \@noshow ],
	);
	
	foreach my $group (@dispatch)
	{
		my ($handler, $tags) = @$group;
		foreach my $tag (@$tags)
		{
			# Capture a private lexical copy of the tag name. A closure over
			# the global $_ would see whatever $_ happens to hold when the
			# method is eventually called, not the tag name.
			my $name = $tag;
			
			no strict 'refs';
			*{ uc $name } = sub { (shift)->$handler($name, @_) };
		}
	}
}

# Converts a DOM node to plain text.
#
# May be called on an instance or as a class method; in the latter case an
# instance is created with ->new first.
#
# Arguments:
#   $node     - the node to convert. A node whose nodeName is '#document' is
#               replaced by its documentElement.
#   $no_clone - when true, $node itself is processed; otherwise a deep copy
#               (cloneNode(1)) is processed instead.
#               NOTE(review): presumably the copy protects the caller's tree
#               from the whitespace stripping done below -- confirm.
#
# Returns:
#   * for an XML::LibXML::Element: the string produced by the method named
#     after the upper-cased element name, with leading and trailing newlines
#     removed and exactly one "\n" appended;
#   * for a '#text' node: the node's data, unchanged;
#   * an empty string for a document without a root element or for any
#     other kind of node.
sub process
{
	my ($self, $node, $no_clone) = @_;
	$self = $self->new unless ref $self;
	
	if ($node->nodeName eq '#document')
	{
		$node = $node->documentElement;
		
		# A document with no root element has no text; bail out rather than
		# calling methods on an undefined value below.
		return '' unless defined $node;
	}
	
	$node = $node->cloneNode(1) unless $no_clone;
	
	if ($node->isa('XML::LibXML::Element'))
	{
		XML::LibXML::PrettyPrint->new_for_html->strip_whitespace($node);
		
		# Dispatch to the per-element method, e.g. DIV for a <div> element.
		my $method = uc $node->nodeName;
		my $text   = $self->$method($node);
		
		# Trim newlines at both ends, then terminate with a single newline.
		$text =~ s{ \A \n+ | \n+ \z }{}gx;
		return "$text\n";
	}
	
	return $node->data if $node->nodeName eq '#text';
	
	# Any other node kind yields no text.
	return '';
}

# Parses a string of HTML and converts the resulting document to text.
#
# Arguments (positional): the invocant, the HTML string, and an optional
# URI which is handed to the parser as its "URI" option.
#
# The string is parsed with HTML::HTML5::Parser->load_html and the result is
# passed to ->process with the 'no_clone' flag set, so the freshly parsed
# tree is processed directly rather than copied. Returns whatever ->process
# returns.
sub process_string
{
	my ($self, $html, $uri) = @_;
	
	return $self->process(
		HTML::HTML5::Parser->load_html(string => $html, URI => $uri),
		'no_clone',
	);
}

# Returns the text of a text node: the value of the node's ->data method,
# unmodified.
#
# Any arguments after the node are read as key/value pairs; this
# implementation accepts them but does not use them.
sub textnode
{
	my $self = shift;
	my $node = shift;
	my %args = @_;
	
	return $node->data;
}



( run in 1.889 second using v1.01-cache-2.11-cpan-5b529ec07f3 )