App-SpamcupNG
view release on metacpan or search on metacpan
lib/App/SpamcupNG/HTMLParse.pm view on Meta::CPAN
sub find_best_contacts {
my $content_ref = shift;
confess "Must receive an scalar reference as parameter"
unless ( ref($content_ref) eq 'SCALAR' );
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_content($content_ref);
my @nodes = $tree->findnodes('//div[@id="content"]');
foreach my $node (@nodes) {
for my $html_element ( $node->content_list ) {
# only text
next if ref($html_element);
$html_element =~ s/^\s+//;
if ( index( $html_element, 'Using best contacts' ) == 0 ) {
my @tokens = split( /\s/, $html_element );
splice( @tokens, 0, 3 );
return \@tokens;
}
}
}
return [];
}
=head2 find_spam_header
Expects as parameter a scalar reference of the HTML page.
You can optionally pass a second parameter that defines if each line should be
prefixed with a tab character. The default value is false.
Tries to find the e-mail header of the SPAM reported.
Returns an array reference with all the lines of the e-mail header found.
=cut
sub find_spam_header {
my $content_ref = shift;
confess "Must receive an scalar reference as parameter"
unless ( ref($content_ref) eq 'SCALAR' );
my $formatted //= 0;
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_content($content_ref);
my @nodes = $tree->findnodes('/html/body/div[5]/p[1]/strong');
my $expected = 'Please make sure this email IS spam:';
my $parent = undef;
foreach my $node (@nodes) {
if ( $node->as_trimmed_text eq $expected ) {
$parent = $node->parent;
last;
}
}
if ($parent) {
$parent->parent(undef);
@nodes = $parent->findnodes('//font');
if ( ( scalar(@nodes) != 1 )
or ( ref( $nodes[0] ) ne 'HTML::Element' ) )
{
confess 'Unexpected content of SPAM header: ' . Dumper(@nodes);
}
my @lines;
my $header = $nodes[0]->content;
for ( my $i = 0 ; $i <= scalar( @{$header} ) ; $i++ ) {
if ( ref( $header->[$i] ) eq 'HTML::Element' ) {
$header->[$i]->parent(undef);
# just want text here
next unless $header->[$i]->content;
my $content = ( $header->[$i]->content )->[0];
next unless $content;
next if ( ref($content) );
$header->[$i] = $content;
}
next unless $header->[$i];
# removing Unicode spaces in place
$header->[$i] =~ s/^\s++//u;
if ($formatted) {
push( @lines, "\t$header->[$i]" );
}
else {
push( @lines, $header->[$i] );
}
}
return \@lines;
}
return [];
}
=head2 find_receivers
Expects as parameter a scalar reference of the HTML page.
Tries to find all the receivers of the SPAM report, even if those were not real
e-mail address, only internal identifiers for Spamcop to store statistics.
Returns an array reference, where each item is a string.
=cut
sub find_receivers {
my $content_ref = shift;
confess "Must receive an scalar reference as parameter"
unless ( ref($content_ref) eq 'SCALAR' );
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_content($content_ref);
my @nodes = $tree->findnodes('//*[@id="content"]');
my @receivers;
( run in 1.390 second using v1.01-cache-2.11-cpan-ceb78f64989 )