Mail-SpamAssassin
view release on metacpan or search on metacpan
lib/Mail/SpamAssassin/Plugin/PDFInfo.pm view on Meta::CPAN
$self->register_eval_rule ("pdf_image_size_exact", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_image_size_range", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_named", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_name_regex", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_image_to_text_ratio", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_match_md5", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_match_fuzzy_md5", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_match_details", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_is_encrypted", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_has_form", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_has_script", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_has_auto_script", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self->register_eval_rule ("pdf_is_empty_body", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
# lower priority for add_uri_detail_list to work
$self->register_method_priority ("parsed_metadata", -1);
return $self;
}
# Handler for the "parsed_metadata" plugin method: looks for PDF attachments
# in the message and publishes summary tags about them.
#
# Arguments:
#   $self - the plugin object (not used directly here)
#   $opts - hash ref of call options; only the 'permsgstatus' entry is read
#
# The per-message counters kept under $pms->{pdfinfo} are reset, every
# qualifying MIME part is handed to _get_pdf_details(), and the PDFCOUNT,
# PDFIMGCOUNT and PDFURICOUNT tags are then set from those counters.
sub parsed_metadata {
  my ($self, $opts) = @_;
  my $pms = $opts->{permsgstatus};

  # every message starts from zeroed counters
  $pms->{pdfinfo}->{$_} = 0 for qw(count_pdf count_pdf_images count_pdf_uris);

  # candidate parts: image/* or application/* with subtype pdf or octet-stream
  my @candidates =
    $pms->{msg}->find_parts(qr@^(image|application)/(pdf|octet\-stream)$@, 1);
  my $part_count = @candidates;
  dbg("pdfinfo: Identified $part_count possible mime parts that need checked for PDF content");

  for my $part (@candidates) {
    my $type = $part->{type} || '';
    my $name = $part->{name} || '';
    dbg("pdfinfo: found part, type=$type file=$name");

    # A part qualifies when its file name ends in .pdf or .fdf (case
    # insensitive), or when its content type ends in /pdf.
    my $looks_like_pdf = $name =~ /\.[fp]df$/i || $type =~ m@/pdf$@;
    next if !$looks_like_pdf;

    _get_pdf_details($pms, $part);

    # counted for every qualifying part, whatever _get_pdf_details() found
    $pms->{pdfinfo}->{count_pdf}++;
  }

  # expose the final counters as tags
  my $counts = $pms->{pdfinfo};
  _set_tag($pms, 'PDFCOUNT',    $counts->{count_pdf});
  _set_tag($pms, 'PDFIMGCOUNT', $counts->{count_pdf_images});
  _set_tag($pms, 'PDFURICOUNT', $counts->{count_pdf_uris});
}
sub _get_pdf_details {
my ($pms, $part) = @_;
my $data = $part->decode();
# Remove UTF-8 BOM
$data =~ s/^\xef\xbb\xbf//;
# Search magic in first 1024 bytes
if ($data !~ /^.{0,1024}\%PDF\-(\d\.\d)/s) {
dbg("pdfinfo: PDF magic header not found, invalid file?");
return;
}
my $version = $1;
_set_tag($pms, 'PDFVERSION', $version);
# dbg("pdfinfo: pdf version = $version");
my ($fuzzy_data, $pdf_tags);
my ($md5, $fuzzy_md5) = ('','');
my ($total_height, $total_width, $total_area, $line_count) = (0,0,0,0);
my $name = $part->{name} || '';
_set_tag($pms, 'PDFNAME', $name);
# store the file name so we can check pdf_named() or pdf_name_regex() later.
$pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;
my $no_more_fuzzy = 0;
my $got_image = 0;
my $encrypted = 0;
my $has_form = 0;
my $has_script = 0;
my $has_auto_script = 0;
my %uris;
while ($data =~ /([^\n]+)/g) {
# dbg("pdfinfo: line=$1");
my $line = $1;
if (!$no_more_fuzzy && ++$line_count < 70) {
if ($line !~ m/^\%/ && $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/ && $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
$line =~ s/\s+$//; # strip off whitespace at end.
$fuzzy_data .= $line;
}
# once we hit the first stream, we stop collecting data for fuzzy md5
$no_more_fuzzy = 1 if index($line, 'stream') >= 0;
}
$got_image = 1 if index($line, '/Image') >= 0;
if (!$encrypted && index($line, '/Encrypt') == 0) {
# store encrypted flag.
$encrypted = $pms->{pdfinfo}->{encrypted} = 1;
}
# Detect if the PDF file has an embedded form
if (!$has_form && index($line, '/AcroForm') == 0) {
# PDF has a Form.
$has_form = $pms->{pdfinfo}->{has_form} = 1;
}
# Detect if the PDF file has Javascript code that can optionally be started automatically
if (!$has_script && index($line, '/JS') == 0) {
# PDF has Javascript code.
$has_script = $pms->{pdfinfo}->{has_script} = 1;
}
if (!$has_auto_script && index($line, '/AA') == 0) {
$has_auto_script++;
( run in 1.060 second using v1.01-cache-2.11-cpan-39bf76dae61 )