HTML-ExtractText

 view release on metacpan or  search on metacpan

lib/HTML/ExtractText.pm  view on Meta::CPAN

    my ( $self, $error ) = @_;
    $self->error( $error );
    return;
}

# Produce the extracted string for a single matched element.
#
# _extract calls this as a method and forwards its callback arguments, so the
# element is taken from the argument list. When no element is passed, the
# current $_ is used instead, which is what the previous implementation
# always relied on.
#
# Returns:
#   - the "alt" attribute for <img> and for <input type="image">,
#   - the "value" attribute for any other <input>,
#   - the element's text content (via _all_text) for every other tag.
# A missing attribute yields an empty string.
sub _process {
    my ( $self, $el ) = @_;
    $el //= $_;

    my $tag = $el->tag // '';

    # Compare the whole tag name. An unanchored /input|img/ match would also
    # catch tags that merely contain those letters (e.g. a custom
    # <my-input>) and return their "value" attribute instead of their text.
    return _all_text($el) unless $tag eq 'input' or $tag eq 'img';

    # Image-like elements carry their text in "alt".
    return $el->attr('alt') // ''
        if $tag eq 'img'
        or ( $el->attr('type') // '' ) eq 'image';

    # Any remaining <input>.
    return $el->attr('value') // '';
}

# Run one named selector against a document and return the processed matches.
#
#   $dom      - object to search; only ->find is called on it here
#               (presumably a Mojo::DOM - confirm against the caller)
#   $selector - key into %$what
#   $what     - hashref whose value at $selector is handed to ->find
#
# Every match is passed through $self->_process; the return value is
# whatever ->each yields for the resulting collection.
sub _extract {
    my ( $self, $dom, $selector, $what ) = @_;

    my $css     = $what->{ $selector };
    my $matches = $dom->find( $css );
    my $texts   = $matches->map( sub { $self->_process( @_ ) } );

    return $texts->each;
}

# The _all_text & _text functions copied from Mojo::DOM 6.66.
# Return the text content found beneath $dom.
#
# Whitespace trimming is on by default. It is switched off when the node
# itself, or any node returned by Mojo::DOM::_ancestors for it, is a "pre"
# tag. A tree whose type is 'root' is never checked for "pre".
sub _all_text {
    my ( $dom ) = @_;

    my $tree = $dom->tree;
    my $trim = 1;

    # Detect "pre" tag
    if ( $tree->[0] ne 'root' ) {
        for my $node ( Mojo::DOM::_ancestors( $dom ), $tree ) {
            $trim = 0 if $node->[1] eq 'pre';
        }
    }

    my @children = Mojo::DOM::_nodes( $tree );
    return _text( \@children, $trim );
}

# Flatten a list of DOM tree nodes into one string.
#
#   $nodes - arrayref of nodes; each node is an arrayref whose first element
#            is its type ('text', 'cdata', 'raw' or 'tag') and whose second
#            element is the text (or, for tags, the tag name). The array is
#            modified in place when adjacent text nodes are merged.
#   $trim  - true to strip and collapse whitespace in text nodes and to drop
#            whitespace-only chunks
#
# Returns the concatenated text. Nested tags are processed recursively;
# entering a "pre" tag switches trimming off for everything beneath it.
sub _text {
    my ( $nodes, $trim ) = @_;

    # Merge successive text nodes.
    my $i = 0;
    while ( my $next = $nodes->[$i + 1] ) {
        ++$i and next unless $nodes->[$i][0] eq 'text' && $next->[0] eq 'text';
        splice @$nodes, $i, 2, ['text', $nodes->[$i][1] . $next->[1]];
    }

    my $text = '';
    for my $node ( @$nodes ) {
        my $type = $node->[0];
        my $chunk = '';

        # Text.
        if ( $type eq 'text' ) {
            $chunk = $node->[1];
            if ( $trim ) {
                $chunk =~ s/^\s+//;
                $chunk =~ s/\s+$//;
                $chunk =~ s/\s+/ /g;
            }
        }
        # CDATA or raw text.
        elsif ( $type eq 'cdata' || $type eq 'raw' ) {
            $chunk = $node->[1];
        }
        # Nested tag.
        elsif ( $type eq 'tag' ) {
            no warnings 'recursion';

            # This function takes ( $nodes, $trim ). A leftover "recurse"
            # flag used to sit in the $trim slot here, which forced trimming
            # on for every nested tag and discarded the "pre" check.
            $chunk = _text(
                [Mojo::DOM::_nodes($node)],
                $node->[1] eq 'pre' ? 0 : $trim,
            );
        }

        # Add leading whitespace if punctuation allows it.
        $chunk = " $chunk" if $text =~ /\S\z/ && $chunk =~ /^[^.!?,;:\s]+/;

        # Trim whitespace blocks.
        $text .= $chunk if $chunk =~ /\S+/ || !$trim;
    }

    return $text;
}


# The last statement before __END__: a non-empty string, so the file
# evaluates to a true value when it is loaded. The quote itself has no
# runtime meaning.
q|
Programming is 10% science, 20% ingenuity,
and 70% getting the ingenuity to work with the science.
|;

__END__

=encoding utf8

=for stopwords Znet Zoffix errored  html

=head1 NAME

HTML::ExtractText - extract multiple text strings from HTML content, using CSS selectors

=head1 SYNOPSIS

=for test_synopsis no strict qw/vars/; no warnings;

At its simplest, use CSS selectors:

=for html  <div style="display: table; height: 91px; background: url(http://zoffix.com/CPAN/Dist-Zilla-Plugin-Pod-Spiffy/icons/section-code.png) no-repeat left; padding-left: 120px;" ><div style="display: table-cell; vertical-align: middle;">

    use HTML::ExtractText;
    my $ext = HTML::ExtractText->new;
    $ext->extract({ page_title => 'title' }, $html) or die "Error: $ext";
    print "Page title is $ext->{page_title}\n";

=for html  </div></div>

We can go fancy pants with selectors as well as
extract more than one bit of text:

=for html  <div style="display: table; height: 91px; background: url(http://zoffix.com/CPAN/Dist-Zilla-Plugin-Pod-Spiffy/icons/section-code.png) no-repeat left; padding-left: 120px;" ><div style="display: table-cell; vertical-align: middle;">

    use HTML::ExtractText;
    my $ext = HTML::ExtractText->new;
    $ext->extract(
        {
            article   => 'article#main_content',
            irc_links => 'article#main_content a[href^="irc://"]',



( run in 3.750 seconds using v1.01-cache-2.11-cpan-5b529ec07f3 )