HTML-CruftText
view release on metacpan or search on metacpan
lib/HTML/CruftText.pm view on Meta::CPAN
use 5.012;
use strict;
use warnings;
use Time::HiRes;
use List::MoreUtils qw(first_index indexes last_index);
# STATICS
# markers -- patterns used to find lines than can help find the text
my $_MARKER_PATTERNS = {
startclickprintinclude => qr/<\!--\s*startclickprintinclude/pi,
endclickprintinclude => qr/<\!--\s*endclickprintinclude/pi,
startclickprintexclude => qr/<\!--\s*startclickprintexclude/pi,
endclickprintexclude => qr/<\!--\s*endclickprintexclude/pi,
sphereitbegin => qr/<\!--\s*DISABLEsphereit\s*start/i,
sphereitend => qr/<\!--\s*DISABLEsphereit\s*end/i,
body => qr/<body/i,
comment => qr/(id|class)="[^"]*comment[^"]*"/i,
};
#TODO handle sphereit like we're now handling CLickprint.
# blank everything within these elements
my $_SCRUB_TAGS = [ qw/script style frame applet textarea/ ];
sub _remove_everything_except_newlines($)
{
my $data = shift;
# Retain the number of newlines
my $newlines = ($data =~ tr/\n//);
return "\n" x $newlines;
}
my $_process_html_comment_regex_clickprint_comments = qr/^\s*(start|end)clickprint(in|ex)clude/ios;
my $_process_html_comment_regex_brackets = qr/[<>]/os;
sub _process_html_comment($)
{
my $data = shift;
# Don't touch clickprint comments
if ($data =~ $_process_html_comment_regex_clickprint_comments) {
return $data;
}
# Replace ">" and "<" to "|"
$data =~ s/$_process_html_comment_regex_brackets/|/g;
# Prepend every line with comment (not precompiled because trivial)
$data =~ s/\n/ -->\n<!-- /gs;
return $data;
}
# remove >'s from inside comments so the simple line density scorer
# doesn't get confused about where tags end.
# also, split multiline comments into multiple single line comments
my $_remove_tags_in_comments_regex_html_comment = qr/<!--(.*?)-->/ios;
sub _remove_tags_in_comments($)
{
my $lines = shift;
my $html = join("\n", @{ $lines });
# Remove ">" and "<" in comments
$html =~ s/$_remove_tags_in_comments_regex_html_comment/'<!--'._process_html_comment($1).'-->'/eg;
$lines = [ split("\n", $html) ];
return $lines;
}
# make sure that all tags start and close on one line
# by adding false <>s as necessary, eg:
#
# <foo
# bar>
#
# becomes
#
# <foo>
# <tag bar>
#
sub _fix_multiline_tags
{
my ( $lines ) = @_;
my $add_start_tag;
for ( my $i = 0 ; $i < @{ $lines } ; $i++ )
{
if ( $add_start_tag )
{
$lines->[ $i ] = "<$add_start_tag " . $lines->[ $i ];
$add_start_tag = undef;
}
if ( $lines->[ $i ] =~ /<([^ >]*)[^>]*$/ )
{
$add_start_tag = $1;
$lines->[ $i ] .= ' >';
}
}
}
#remove all text not within the <body> tag
#Note: Some badly formated web pages will have multiple <body> tags or will not have an open tag.
#We go the conservative thing of only deleting stuff before the first <body> tag and stuff after the last </body> tag.
sub _remove_nonbody_text
{
my ( $lines ) = @_;
my $add_start_tag;
my $state = 'before_body';
my $body_open_tag_line_number = first_index { $_ =~ /<body/i } @{ $lines };
( run in 1.281 second using v1.01-cache-2.11-cpan-71847e10f99 )