App-SpamcupNG
view release on metacpan or search on metacpan
lib/App/SpamcupNG/HTMLParse.pm view on Meta::CPAN
=head2 find_header_info
Finds information from the e-mail header of the received SPAM and returns it.
Returns a hash reference with the following keys:
=over
=item mailer: the X-Mailer header, if available
=item content_type: the Content-Type, if available
=back
There is an attempt to normalize the C<Content-Type> header, by removing extra
spaces and using just the first two entries, also making everything as lower
case.
=cut
sub find_header_info {
my $content_ref = shift;
confess "Must receive an scalar reference as parameter"
unless ( ref($content_ref) eq 'SCALAR' );
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_content($$content_ref);
my @nodes = $tree->findnodes('/html/body/div[@id="content"]/pre');
my %info = (
mailer => undef,
content_type => undef
);
my $mailer_regex = qr/^X-Mailer:/;
my $content_type_regex = qr/^Content-Type:/;
foreach my $node (@nodes) {
foreach my $content ( split( "\n", $node->as_text() ) ) {
$content =~ s/^\s+//;
$content =~ s/\s+$//;
next if ( $content eq '' );
if ( $content =~ $mailer_regex ) {
my $wanted = ( split( ':', $content ) )[1];
$wanted =~ s/^\s+//;
$info{mailer} = $wanted;
next;
}
if ( $content =~ $content_type_regex ) {
my $wanted = ( split( ':', $content ) )[1];
$wanted =~ s/^\s+//;
my @wanted = split( ';', $wanted );
if ( scalar(@wanted) > 1 ) {
my $encoding = lc( $wanted[0] );
my $charset = lc( $wanted[1] );
$charset =~ s/^\s+//;
$charset =~ tr/"//d;
my $not_useful = 'boundary';
if (
substr( $charset, 0, length($not_useful) ) eq
$not_useful )
{
$info{content_type} = $encoding;
$info{charset} = undef;
}
else {
$info{content_type} = $encoding;
$info{charset} = ( split( '=', $charset ) )[1];
}
}
else {
chop $wanted if ( substr( $wanted, -1 ) eq ';' );
$info{content_type} = $wanted;
}
next;
}
last if ( $info{mailer} and $info{content_type} );
}
}
return \%info;
}
=head2 find_message_age
Find and return the SPAM message age information.
Returns an array reference, with the zero index as an integer with the age, and
the index 1 as the age unit (possibly "hour");
If nothing is found, returns C<undef>;
=cut
sub find_message_age {
my $content_ref = shift;
confess "Must receive an scalar reference as parameter"
unless ( ref($content_ref) eq 'SCALAR' );
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_content($$content_ref);
my @nodes = $tree->findnodes('/html/body/child::div[@id="content"]');
foreach my $node (@nodes) {
foreach my $content ( $node->content_refs_list ) {
next unless ( ref($content) eq 'SCALAR' );
$$content =~ s/^\s+//;
$$content =~ s/\s+$//;
next if ( $$content eq '' );
if ( $$content =~ $regexes{message_age} ) {
my ( $age, $unit ) = ( $1, $2 );
chop $unit if ( substr( $unit, -1 ) eq 's' );
return [ $age, $unit ];
}
}
}
return undef;
( run in 1.596 second using v1.01-cache-2.11-cpan-39bf76dae61 )