Mail-SpamAssassin

 view release on metacpan or  search on metacpan

lib/Mail/SpamAssassin/Plugin/PDFInfo.pm  view on Meta::CPAN

  $self->register_eval_rule ("pdf_image_size_exact", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_image_size_range", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_named", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_name_regex", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_image_to_text_ratio", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_match_md5", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_match_fuzzy_md5", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_match_details", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_is_encrypted", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_has_form", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_has_script", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_has_auto_script", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  $self->register_eval_rule ("pdf_is_empty_body", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);

  # lower priority for add_uri_detail_list to work
  $self->register_method_priority ("parsed_metadata", -1);

  return $self;
}

# parsed_metadata($self, $opts)
#
# Plugin callback that scans the message held in $opts->{permsgstatus}
# for PDF attachments.
#
#   - Resets the per-message counters kept under $pms->{pdfinfo}.
#   - Collects every image/* or application/* part whose subtype is
#     "pdf" or "octet-stream".
#   - Passes each part that looks like a PDF (file name ending in
#     .pdf/.fdf, or a MIME type ending in /pdf) to _get_pdf_details().
#   - Publishes the resulting counters as the PDFCOUNT, PDFIMGCOUNT and
#     PDFURICOUNT tags.
#
# Note that count_pdf is bumped for every part that passes the name/type
# filter, regardless of what _get_pdf_details() decides about its content.
#
# The value of the final _set_tag() call is what this sub returns.
sub parsed_metadata {
  my ($self, $opts) = @_;
  my $pms = $opts->{permsgstatus};

  # start every message with zeroed counters
  $pms->{pdfinfo}->{$_} = 0
    for qw(count_pdf count_pdf_images count_pdf_uris);

  my @candidates = $pms->{msg}->find_parts(qr@^(image|application)/(pdf|octet\-stream)$@, 1);
  my $candidate_total = @candidates;

  dbg("pdfinfo: Identified $candidate_total possible mime parts that need checked for PDF content");

  for my $part (@candidates) {
    my $mime_type = $part->{type} || '';
    my $filename  = $part->{name} || '';

    dbg("pdfinfo: found part, type=$mime_type file=$filename");

    # A part qualifies when its file name ends with .pdf, or when the
    # MIME type itself is pdf (some Windows MUAs wrap a PDF inside a
    # .dat file).  Names ending in .fdf are accepted too, to catch
    # phoney PDFs (added in v0.8).
    my $looks_like_pdf = $filename =~ /\.[fp]df$/i
                      || $mime_type =~ m@/pdf$@;
    next if !$looks_like_pdf;

    _get_pdf_details($pms, $part);
    $pms->{pdfinfo}->{count_pdf}++;
  }

  # expose the totals as template tags
  _set_tag($pms, 'PDFCOUNT', $pms->{pdfinfo}->{count_pdf});
  _set_tag($pms, 'PDFIMGCOUNT', $pms->{pdfinfo}->{count_pdf_images});
  return _set_tag($pms, 'PDFURICOUNT', $pms->{pdfinfo}->{count_pdf_uris});
}

sub _get_pdf_details {
  my ($pms, $part) = @_;

  my $data = $part->decode();

  # Remove UTF-8 BOM
  $data =~ s/^\xef\xbb\xbf//;

  # Search magic in first 1024 bytes
  if ($data !~ /^.{0,1024}\%PDF\-(\d\.\d)/s) {
    dbg("pdfinfo: PDF magic header not found, invalid file?");
    return;
  }
  my $version = $1;
  _set_tag($pms, 'PDFVERSION', $version);
  # dbg("pdfinfo: pdf version = $version");

  my ($fuzzy_data, $pdf_tags);
  my ($md5, $fuzzy_md5) = ('','');
  my ($total_height, $total_width, $total_area, $line_count) = (0,0,0,0);

  my $name = $part->{name} || '';
  _set_tag($pms, 'PDFNAME', $name);
  # store the file name so we can check pdf_named() or pdf_name_regex() later.
  $pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;

  my $no_more_fuzzy = 0;
  my $got_image = 0;
  my $encrypted = 0;
  my $has_form = 0;
  my $has_script = 0;
  my $has_auto_script = 0;
  my %uris;

  while ($data =~ /([^\n]+)/g) {
    # dbg("pdfinfo: line=$1");
    my $line = $1;

    if (!$no_more_fuzzy && ++$line_count < 70) {
      if ($line !~ m/^\%/ && $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/ && $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
        $line =~ s/\s+$//;  # strip off whitespace at end.
        $fuzzy_data .= $line;
      }
      # once we hit the first stream, we stop collecting data for fuzzy md5
      $no_more_fuzzy = 1  if index($line, 'stream') >= 0;
    }

    $got_image = 1  if index($line, '/Image') >= 0;
    if (!$encrypted && index($line, '/Encrypt') == 0) {
      # store encrypted flag.
      $encrypted = $pms->{pdfinfo}->{encrypted} = 1;
    }

    # Detect if the PDF file has an embedded form
    if (!$has_form && index($line, '/AcroForm') == 0) {
      # PDF has a Form.
      $has_form = $pms->{pdfinfo}->{has_form} = 1;
    }

    # Detect if the PDF file has Javascript code that can optionally be started automatically
    if (!$has_script && index($line, '/JS') == 0) {
      # PDF has Javascript code.
      $has_script = $pms->{pdfinfo}->{has_script} = 1;
    }
    if (!$has_auto_script && index($line, '/AA') == 0) {
      $has_auto_script++;



( run in 1.060 second using v1.01-cache-2.11-cpan-39bf76dae61 )