App-SpamcupNG

 view release on metacpan or  search on metacpan

lib/App/SpamcupNG/HTMLParse.pm  view on Meta::CPAN


=head2 find_header_info

Finds information from the e-mail header of the received SPAM and returns it.

Returns a hash reference with the following keys:

=over

=item mailer: the X-Mailer header, if available

=item content_type: the Content-Type, if available

=back

There is an attempt to normalize the C<Content-Type> header, by removing extra
spaces and using just the first two entries, also making everything as lower
case.

=cut

sub find_header_info {
    my $content_ref = shift;
    confess "Must receive an scalar reference as parameter"
      unless ( ref($content_ref) eq 'SCALAR' );
    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse_content($$content_ref);
    my @nodes = $tree->findnodes('/html/body/div[@id="content"]/pre');
    my %info  = (
        mailer       => undef,
        content_type => undef
    );
    my $mailer_regex       = qr/^X-Mailer:/;
    my $content_type_regex = qr/^Content-Type:/;

    foreach my $node (@nodes) {

        foreach my $content ( split( "\n", $node->as_text() ) ) {
            $content =~ s/^\s+//;
            $content =~ s/\s+$//;
            next if ( $content eq '' );

            if ( $content =~ $mailer_regex ) {
                my $wanted = ( split( ':', $content ) )[1];
                $wanted =~ s/^\s+//;
                $info{mailer} = $wanted;
                next;
            }

            if ( $content =~ $content_type_regex ) {
                my $wanted = ( split( ':', $content ) )[1];
                $wanted =~ s/^\s+//;
                my @wanted = split( ';', $wanted );

                if ( scalar(@wanted) > 1 ) {
                    my $encoding = lc( $wanted[0] );
                    my $charset  = lc( $wanted[1] );
                    $charset =~ s/^\s+//;
                    $charset =~ tr/"//d;

                    my $not_useful = 'boundary';

                    if (
                        substr( $charset, 0, length($not_useful) ) eq
                        $not_useful )
                    {
                        $info{content_type} = $encoding;
                        $info{charset}      = undef;
                    }
                    else {
                        $info{content_type} = $encoding;
                        $info{charset}      = ( split( '=', $charset ) )[1];
                    }
                }
                else {
                    chop $wanted if ( substr( $wanted, -1 ) eq ';' );
                    $info{content_type} = $wanted;
                }

                next;
            }

            last if ( $info{mailer} and $info{content_type} );
        }
    }

    return \%info;

}

=head2 find_message_age

Find and return the SPAM message age information.

Returns an array reference, with the zero index as an integer with the age, and
the index 1 as the age unit (possibly "hour");

If nothing is found, returns C<undef>;

=cut

sub find_message_age {
    my $content_ref = shift;
    confess "Must receive an scalar reference as parameter"
      unless ( ref($content_ref) eq 'SCALAR' );
    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse_content($$content_ref);
    my @nodes = $tree->findnodes('/html/body/child::div[@id="content"]');

    foreach my $node (@nodes) {
        foreach my $content ( $node->content_refs_list ) {
            next unless ( ref($content) eq 'SCALAR' );
            $$content =~ s/^\s+//;
            $$content =~ s/\s+$//;
            next if ( $$content eq '' );

            if ( $$content =~ $regexes{message_age} ) {
                my ( $age, $unit ) = ( $1, $2 );
                chop $unit if ( substr( $unit, -1 ) eq 's' );
                return [ $age, $unit ];
            }
        }
    }

    return undef;



( run in 1.596 second using v1.01-cache-2.11-cpan-39bf76dae61 )