App-SpamcupNG

 view release on metacpan or  search on metacpan

lib/App/SpamcupNG/HTMLParse.pm  view on Meta::CPAN

sub find_best_contacts {
    my $content_ref = shift;
    confess "Must receive an scalar reference as parameter"
      unless ( ref($content_ref) eq 'SCALAR' );
    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse_content($content_ref);
    my @nodes = $tree->findnodes('//div[@id="content"]');

    foreach my $node (@nodes) {
        for my $html_element ( $node->content_list ) {

            # only text
            next if ref($html_element);
            $html_element =~ s/^\s+//;
            if ( index( $html_element, 'Using best contacts' ) == 0 ) {
                my @tokens = split( /\s/, $html_element );
                splice( @tokens, 0, 3 );
                return \@tokens;
            }
        }

    }

    return [];
}

=head2 find_spam_header

Expects as parameter a scalar reference of the HTML page.

You can optionally pass a second parameter that defines if each line should be
prefixed with a tab character. The default value is false.

Tries to find the e-mail header of the SPAM reported.

Returns an array reference with all the lines of the e-mail header found.

=cut

sub find_spam_header {
    my $content_ref = shift;
    confess "Must receive an scalar reference as parameter"
      unless ( ref($content_ref) eq 'SCALAR' );
    my $formatted //= 0;
    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse_content($content_ref);

    my @nodes    = $tree->findnodes('/html/body/div[5]/p[1]/strong');
    my $expected = 'Please make sure this email IS spam:';
    my $parent   = undef;

    foreach my $node (@nodes) {
        if ( $node->as_trimmed_text eq $expected ) {
            $parent = $node->parent;
            last;
        }
    }

    if ($parent) {
        $parent->parent(undef);
        @nodes = $parent->findnodes('//font');

        if (   ( scalar(@nodes) != 1 )
            or ( ref( $nodes[0] ) ne 'HTML::Element' ) )
        {
            confess 'Unexpected content of SPAM header: ' . Dumper(@nodes);
        }

        my @lines;
        my $header = $nodes[0]->content;

        for ( my $i = 0 ; $i <= scalar( @{$header} ) ; $i++ ) {
            if ( ref( $header->[$i] ) eq 'HTML::Element' ) {
                $header->[$i]->parent(undef);

                # just want text here
                next unless $header->[$i]->content;
                my $content = ( $header->[$i]->content )->[0];
                next unless $content;
                next if ( ref($content) );
                $header->[$i] = $content;
            }
            next unless $header->[$i];

            # removing Unicode spaces in place
            $header->[$i] =~ s/^\s++//u;

            if ($formatted) {
                push( @lines, "\t$header->[$i]" );

            }
            else {
                push( @lines, $header->[$i] );

            }
        }
        return \@lines;
    }

    return [];
}

=head2 find_receivers

Expects as parameter a scalar reference of the HTML page.

Tries to find all the receivers of the SPAM report, even if those were not real
e-mail address, only internal identifiers for Spamcop to store statistics.

Returns an array reference, where each item is a string.

=cut

sub find_receivers {
    my $content_ref = shift;
    confess "Must receive an scalar reference as parameter"
      unless ( ref($content_ref) eq 'SCALAR' );
    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse_content($content_ref);
    my @nodes = $tree->findnodes('//*[@id="content"]');
    my @receivers;



( run in 1.390 second using v1.01-cache-2.11-cpan-ceb78f64989 )