HTML-ExtractText
view release on metacpan or search on metacpan
lib/HTML/ExtractText.pm view on Meta::CPAN
my ( $self, $error ) = @_;
$self->error( $error );
return;
}
# Produce the text representation of the element currently held in $_.
#
# The element is read from $_ (anything in @_ is ignored); _extract calls
# this from inside a ->map callback.
#
# Returns:
#   * <img> and <input type="image">: the "alt" attribute, or '' if absent;
#   * any other <input>:              the "value" attribute, or '' if absent;
#   * every other element:            the result of _all_text() on it.
sub _process {
    my $elem = $_;
    my $tag  = $elem->tag // '';

    # Compare tag names exactly. An unanchored /input|img/ match also catches
    # tags that merely contain those words (e.g. <custom-input>) and would
    # wrongly return their "value" attribute instead of their text.
    return _all_text($elem) unless $tag eq 'input' or $tag eq 'img';

    # Here $tag is either 'img' or 'input'; image-like elements use "alt".
    return $elem->attr('alt') // ''
        if $tag eq 'img'
        or ( $elem->attr('type') // '' ) eq 'image';

    return $elem->attr('value') // '';
}
# Run one selector against the DOM and return the extracted text of every
# match.
#
# Arguments:
#   $self     - invocant; its _process method is called for each match
#   $dom      - object providing ->find
#   $selector - key into %$what
#   $what     - hash reference mapping keys to the selector strings passed
#               to ->find
#
# Returns whatever ->each yields on the mapped collection, in the caller's
# context.
sub _extract {
    my ( $self, $dom, $selector, $what ) = @_;

    my $matches   = $dom->find( $what->{$selector} );
    my $extracted = $matches->map( sub { $self->_process(@_) } );

    return $extracted->each;
}
# The _all_text & _text functions copied from Mojo::DOM 6.66.
# Return the text content of a node as flattened by _text().
#
# Whitespace trimming is enabled unless the node itself, or one of the nodes
# returned by Mojo::DOM::_ancestors, is a "pre" element.
#
# Arguments:
#   $dom - object providing ->tree (the node's underlying tree array ref)
sub _all_text {
    my ( $dom ) = @_;

    my $tree = $dom->tree;
    my $trim = 1;

    # A root node carries no tag name, so there is nothing to inspect;
    # otherwise look for "pre" among the ancestors and the node itself.
    if ( $tree->[0] ne 'root' ) {
        for my $candidate ( Mojo::DOM::_ancestors( $dom ), $tree ) {
            $trim = 0 if $candidate->[1] eq 'pre';
        }
    }

    return _text( [ Mojo::DOM::_nodes($tree) ], $trim );
}
# Flatten a list of Mojo::DOM tree nodes into a single text string.
#
# Arguments:
#   $nodes - array reference of node array refs. Element 0 of a node is its
#            type ('text', 'cdata', 'raw', 'tag', ...); element 1 is the
#            content, or the tag name for 'tag' nodes. The array is modified
#            in place when adjacent text nodes are merged.
#   $trim  - true to strip leading/trailing whitespace from each text node
#            and collapse internal whitespace runs into a single space.
#
# Returns the concatenated text (possibly the empty string).
sub _text {
    my ( $nodes, $trim ) = @_;

    # Merge successive text nodes.
    my $i = 0;
    while ( my $next = $nodes->[ $i + 1 ] ) {
        ++$i and next unless $nodes->[$i][0] eq 'text' && $next->[0] eq 'text';
        splice @$nodes, $i, 2, [ 'text', $nodes->[$i][1] . $next->[1] ];
    }

    my $text = '';
    for my $node ( @$nodes ) {
        my $type  = $node->[0];
        my $chunk = '';

        # Text.
        if ( $type eq 'text' ) {
            $chunk = $node->[1];
            if ( $trim ) {
                $chunk =~ s/^\s+//;
                $chunk =~ s/\s+$//;
                $chunk =~ s/\s+/ /g;
            }
        }

        # CDATA or raw text.
        elsif ( $type eq 'cdata' || $type eq 'raw' ) {
            $chunk = $node->[1];
        }

        # Nested tag.
        elsif ( $type eq 'tag' ) {
            no warnings 'recursion';

            # Trimming is switched off inside "pre" and otherwise inherited.
            # This sub takes ( $nodes, $trim ) only: passing an extra leading
            # flag here would land in $trim and force trimming everywhere.
            $chunk = _text(
                [ Mojo::DOM::_nodes($node) ],
                $node->[1] eq 'pre' ? 0 : $trim,
            );
        }

        # Add leading whitespace if punctuation allows it.
        $chunk = " $chunk" if $text =~ /\S\z/ && $chunk =~ /^[^.!?,;:\s]+/;

        # Trim whitespace blocks.
        $text .= $chunk if $chunk =~ /\S+/ || !$trim;
    }

    return $text;
}
# Final statement before __END__: a non-empty quoted string, which is a true
# value (used here in place of the conventional `1;`).
q|
Programming is 10% science, 20% ingenuity,
and 70% getting the ingenuity to work with the science.
|;
__END__
=encoding utf8
=for stopwords Znet Zoffix errored html
=head1 NAME
HTML::ExtractText - extract multiple text strings from HTML content, using CSS selectors
=head1 SYNOPSIS
=for test_synopsis no strict qw/vars/; no warnings;
At its simplest, use CSS selectors:
=for html <div style="display: table; height: 91px; background: url(http://zoffix.com/CPAN/Dist-Zilla-Plugin-Pod-Spiffy/icons/section-code.png) no-repeat left; padding-left: 120px;" ><div style="display: table-cell; vertical-align: middle;">
use HTML::ExtractText;
my $ext = HTML::ExtractText->new;
$ext->extract({ page_title => 'title' }, $html) or die "Error: $ext";
print "Page title is $ext->{page_title}\n";
=for html </div></div>
We can go fancy pants with selectors as well as
extract more than one bit of text:
=for html <div style="display: table; height: 91px; background: url(http://zoffix.com/CPAN/Dist-Zilla-Plugin-Pod-Spiffy/icons/section-code.png) no-repeat left; padding-left: 120px;" ><div style="display: table-cell; vertical-align: middle;">
use HTML::ExtractText;
my $ext = HTML::ExtractText->new;
$ext->extract(
{
article => 'article#main_content',
irc_links => 'article#main_content a[href^="irc://"]',
( run in 3.750 seconds using v1.01-cache-2.11-cpan-5b529ec07f3 )