cdata results from the CPAN

HTML-ExtractText
view release on metacpan or search on metacpan
lib/HTML/ExtractText.pm view on Meta::CPAN
    return _all_text($_) unless $tag =~ /input|img/;

    return $_->attr('alt')//''
        if $tag eq 'img' or
        ($tag eq 'input' and ($_->attr('type')//'') eq 'image');

    return $_->attr('value')//'';
}

sub _extract {
    my ( $self, $dom, $selector, $what ) = @_;

    # $what maps result names to CSS selectors; $selector is the name
    # whose CSS selector should be applied to $dom.
    my $css = $what->{ $selector };

    # Run every matching element through _process(). The callback forwards
    # its own @_ untouched, exactly as the collection's map() supplies it.
    my $processed = $dom->find( $css )
        ->map( sub { $self->_process( @_ ) } );

    # each() without a callback hands back the collection's contents
    # (presumably the extracted strings -- see Mojo::Collection).
    return $processed->each;
}

# The _all_text & _text functions were copied from Mojo::DOM 6.66 and adapted
# to avoid Mojo::DOM's private _nodes()/_ancestors() internals, whose return
# conventions changed in later Mojo::DOM releases (which silently broke text
# extraction). _child_nodes() reads the stable ->tree node structure directly.
sub _child_nodes {
    my ( $tree ) = @_;

    # Nothing to walk when no tree node was supplied.
    return unless $tree;

    # A 'tag' node keeps its children from index 4 onwards; any other node
    # type handled here keeps them from index 1 onwards.
    my $first_child = $tree->[0] eq 'tag' ? 4 : 1;

    return @{ $tree }[ $first_child .. $#{ $tree } ];
}

sub _all_text {
    my ( $dom ) = @_;

    # Whitespace must be kept verbatim when the element is a "pre" tag
    # itself or has a "pre" tag among its ancestors.
    my $inside_pre = $dom->tag eq 'pre' || $dom->ancestors('pre')->size;
    my $trim = $inside_pre ? 0 : 1;

    # Hand _text() a fresh array of the element's child nodes.
    my @children = _child_nodes( $dom->tree );

    return _text( \@children, $trim );
}

# Flattens a list of DOM tree nodes into a single text string.
#
# Arguments:
#   $nodes - array ref of node array refs; element 0 of each node is its
#            type ('text', 'cdata', 'raw', 'tag', ...) and, for text-like
#            nodes, element 1 is the content. The array is modified in
#            place when adjacent text nodes are merged.
#   $trim  - true to strip/collapse whitespace in text nodes and to drop
#            whitespace-only chunks; false to keep text verbatim.
#
# Returns the concatenated text of the nodes, including nested tags.
sub _text {
    my ( $nodes, $trim ) = @_;

    # Merge successive text nodes.
    my $i = 0;
    while ( my $next = $nodes->[ $i + 1 ] ) {
        if ( $nodes->[$i][0] eq 'text' && $next->[0] eq 'text' ) {
            # Replace the pair with one combined node and re-check the same
            # index, since the following node may be text as well.
            splice @$nodes, $i, 2, [ 'text', $nodes->[$i][1] . $next->[1] ];
        }
        else {
            ++$i;
        }
    }

    my $text = '';
    for my $node ( @$nodes ) {
        my $type = $node->[0];
        my $chunk = '';

        # Text.
        if ( $type eq 'text' ) {
            $chunk = $node->[1];
            if ( $trim ) {
                $chunk =~ s/^\s+//;
                $chunk =~ s/\s+$//;
                $chunk =~ s/\s+/ /g;
            }
        }
        # CDATA or raw text.
        elsif ( $type eq 'cdata' || $type eq 'raw' ) {
            $chunk = $node->[1];
        }
        # Nested tag.
        elsif ( $type eq 'tag' ) {
            no warnings 'recursion';

            # A nested "pre" switches trimming off for its subtree; any
            # other tag inherits the current setting. _text() takes only
            # ( $nodes, $trim ), so the trim flag must be the second
            # argument.
            my $child_trim = $node->[1] eq 'pre' ? 0 : $trim;
            $chunk = _text( [ _child_nodes($node) ], $child_trim );
        }

        # Add leading whitespace if punctuation allows it.
        $chunk = " $chunk" if $text =~ /\S\z/ && $chunk =~ /^[^.!?,;:\s]+/;

        # Trim whitespace blocks.
        $text .= $chunk if $chunk =~ /\S+/ || !$trim;
    }

    return $text;
}


# Module return value: a file loaded via `use`/`require` must finish by
# evaluating to a true value, and this non-empty string is that value.
q|
Programming is 10% science, 20% ingenuity,
and 70% getting the ingenuity to work with the science.
|;

__END__

=encoding utf8

=for stopwords Znet Zoffix errored  html

=head1 NAME

HTML::ExtractText - extract multiple text strings from HTML content, using CSS selectors

=head1 SYNOPSIS

=for test_synopsis no strict qw/vars/; no warnings;

At its simplest, use CSS selectors:

=for html  <div style="display: table; height: 91px; background: url(http://zoffix.com/CPAN/Dist-Zilla-Plugin-Pod-Spiffy/icons/section-code.png) no-repeat left; padding-left: 120px;" ><div style="display: table-cell; vertical-align: middle;">

    use HTML::ExtractText;
    my $ext = HTML::ExtractText->new;
    $ext->extract({ page_title => 'title' }, $html) or die "Error: $ext";
    print "Page title is $ext->{page_title}\n";

=for html  </div></div>

We can go fancy pants with selectors as well as
extract more than one bit of text:

=for html  <div style="display: table; height: 91px; background: url(http://zoffix.com/CPAN/Dist-Zilla-Plugin-Pod-Spiffy/icons/section-code.png) no-repeat left; padding-left: 120px;" ><div style="display: table-cell; vertical-align: middle;">

    use HTML::ExtractText;
    my $ext = HTML::ExtractText->new;
    $ext->extract(
        {
            article   => 'article#main_content',
            irc_links => 'article#main_content a[href^="irc://"]',
( run in 0.814 second using v1.01-cache-2.11-cpan-a9496e3eb41 )