HTML-CruftText

 view release on metacpan or  search on metacpan

lib/HTML/CruftText.pm  view on Meta::CPAN

use 5.012;
use strict;
use warnings;

use Time::HiRes;
use List::MoreUtils qw(first_index indexes last_index);

# STATICS

# markers -- patterns used to find lines than can help find the text
my $_MARKER_PATTERNS = {
    startclickprintinclude => qr/<\!--\s*startclickprintinclude/pi,
    endclickprintinclude   => qr/<\!--\s*endclickprintinclude/pi,
    startclickprintexclude => qr/<\!--\s*startclickprintexclude/pi,
    endclickprintexclude   => qr/<\!--\s*endclickprintexclude/pi,
    sphereitbegin          => qr/<\!--\s*DISABLEsphereit\s*start/i,
    sphereitend            => qr/<\!--\s*DISABLEsphereit\s*end/i,
    body                   => qr/<body/i,
    comment                => qr/(id|class)="[^"]*comment[^"]*"/i,
};

#TODO handle sphereit like we're now handling CLickprint.

# blank everything within these elements
my $_SCRUB_TAGS = [ qw/script style frame applet textarea/ ];

sub _remove_everything_except_newlines($)
{
    my $data = shift;

    # Retain the number of newlines
    my $newlines = ($data =~ tr/\n//);

    return "\n" x $newlines;    
}


my $_process_html_comment_regex_clickprint_comments = qr/^\s*(start|end)clickprint(in|ex)clude/ios;
my $_process_html_comment_regex_brackets = qr/[<>]/os;

sub _process_html_comment($)
{
    my $data = shift;

    # Don't touch clickprint comments
    if ($data =~ $_process_html_comment_regex_clickprint_comments) {
        return $data;
    }

    # Replace ">" and "<" to "|"
    $data =~ s/$_process_html_comment_regex_brackets/|/g;

    # Prepend every line with comment (not precompiled because trivial)
    $data =~ s/\n/ -->\n<!-- /gs;

    return $data;
}

# remove >'s from inside comments so the simple line density scorer
# doesn't get confused about where tags end.
# also, split multiline comments into multiple single line comments
my $_remove_tags_in_comments_regex_html_comment = qr/<!--(.*?)-->/ios;

sub _remove_tags_in_comments($)
{
    my $lines = shift;

    my $html = join("\n", @{ $lines });

    # Remove ">" and "<" in comments
    $html =~ s/$_remove_tags_in_comments_regex_html_comment/'<!--'._process_html_comment($1).'-->'/eg;

    $lines = [ split("\n", $html) ];

    return $lines;
}

# make sure that all tags start and close on one line
# by adding false <>s as necessary, eg:
#
# <foo
# bar>
#
# becomes
#
# <foo>
# <tag bar>
#
sub _fix_multiline_tags
{
    my ( $lines ) = @_;

    my $add_start_tag;
    for ( my $i = 0 ; $i < @{ $lines } ; $i++ )
    {
        if ( $add_start_tag )
        {
            $lines->[ $i ] = "<$add_start_tag " . $lines->[ $i ];
            $add_start_tag = undef;
        }

        if ( $lines->[ $i ] =~ /<([^ >]*)[^>]*$/ )
        {
            $add_start_tag = $1;
            $lines->[ $i ] .= ' >';
        }
    }
}

#remove all text not within the <body> tag
#Note: Some badly formated web pages will have multiple <body> tags or will not have an open tag.
#We go the conservative thing of only deleting stuff before the first <body> tag and stuff after the last </body> tag.
sub _remove_nonbody_text
{
    my ( $lines ) = @_;

    my $add_start_tag;

    my $state = 'before_body';

    my $body_open_tag_line_number = first_index { $_ =~ /<body/i } @{ $lines };



( run in 1.281 second using v1.01-cache-2.11-cpan-71847e10f99 )