Mail-SpamAssassin

 view release on metacpan or  search on metacpan

lib/Mail/SpamAssassin/Message.pm  view on Meta::CPAN

	  # allowed WSP before a colon.
	  if (++$hdr_errors <= 3) {
	    # just consume but ignore a few invalid header fields
	  } else {  # enough is enough...
	    $self->{'missing_head_body_separator'} = 1;
	    unshift(@message, $current);
 	    last;
	  }
	}
      }

      # start collecting a new header field
      $header = $current;
      $self->{'pristine_headers'} .= $current;
    }
  }
  undef $header;

  # Store the pristine body for later -- store as a copy since @message
  # will get modified below
  $self->{'pristine_body'} = join('', @message);

  if (!defined $self->{pristine_body_length}) {
    $self->{'pristine_body_length'} = length $self->{'pristine_body'};
  }

  # Store complete message, get_pristine() is used a lot, avoid making copies
  $self->{'pristine_msg'} = $self->{'pristine_headers'} . $self->{'pristine_body'};

  # iterate over lines in reverse order
  # merge multiple blank lines into a single one
  my $start;
  for (my $cnt=$#message; $cnt>=0; $cnt--) {
    # CRLF -> LF line-endings conversion if necessary
    $message[$cnt] =~ s/\015\012\z/\012/  if $squash_crlf;

    # line is blank
    if ($message[$cnt] =~ /^\s*$/) {
      # /^\s*$/ is about 5% faster then !/\S/, but still expensive here
      if (!defined $start) {
        $start=$cnt;
      }
      next unless $cnt == 0;
    }

    # line is not blank, or we've reached the beginning

    # if we've got a series of blank lines, get rid of them
    if (defined $start) {
      my $max_blank_lines = 20;
      my $num = $start-$cnt;
      if ($num > $max_blank_lines) {
        splice @message, $cnt+2, $num-$max_blank_lines;
      }
      undef $start;
    }
  }

  # Figure out the boundary
  my ($boundary);
  ($self->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($self->header('content-type'));
  dbg("message: main message type: ".$self->{'type'});

#  dbg("message: \$message[0]: \"" . $message[0] . "\"");

  # bug 6845: if main message type is multipart and the message body does not begin with
  # either a blank line or the boundary (if defined), insert a blank line
  # to ensure proper parsing - do not consider MIME headers at the beginning of the body
  # to be part of the message headers.
  if (index($self->{'type'}, 'multipart/') == 0 && $#message > 0 && $message[0] =~ /\S/)
  {
    if (!defined $boundary || $message[0] !~ /^--\Q$boundary\E/)
    {
      dbg("message: Inserting blank line at top of body to ensure correct multipart MIME parsing");
      unshift(@message, "\012");
    }
  }

#  dbg("message: \$message[0]: \"" . $message[0] . "\"");
#  dbg("message: \$message[1]: \"" . $message[1] . "\"");

  # parse queue, simple array of parts to parse:
  # 0: part object, already in the tree
  # 1: boundary used to focus body parsing
  # 2: message content
  # 3: how many MIME subparts to parse down
  #
  $self->{'parse_queue'} = [ [ $self, $boundary, \@message, $subparse ] ];

  # If the message does need to get parsed, save off a copy of the body
  # in a format we can easily parse later so we don't have to rip from
  # pristine_body ...  If we do want to parse now, go ahead and do so ...
  #
  if ($parsenow) {
    $self->parse_body();
  }

  $self;
}

# ---------------------------------------------------------------------------

=item find_parts()

Used to search the tree for specific MIME parts.  See
I<Mail::SpamAssassin::Message::Node> for more details.

=cut

# Used to find any MIME parts whose simple content-type matches a given regexp
# Searches it's own and any children parts.  Returns an array of MIME
# objects which match.
#
sub find_parts {
  my $self = shift;

  # ok, we need to do the parsing now...
  $self->parse_body() if (exists $self->{'parse_queue'});

  # and pass through to the Message::Node version of the method
  return $self->SUPER::find_parts(@_);
}

# ---------------------------------------------------------------------------

=item get_pristine_header()

Returns pristine headers of the message.  If no specific header name
is given as a parameter (case-insensitive), then all headers will be
returned as a scalar, including the blank line at the end of the headers.

If called in an array context, an array will be returned with each
specific header in a different element.  In a scalar context, the last
specific header is returned.

ie: If 'Subject' is specified as the header, and there are 2 Subject
headers in a message, the last/bottom one in the message is returned in
scalar context or both are returned in array context.

Btw, returning the last header field (not the first) happens to be consistent
with DKIM signatures, which search for and cover multiple header fields
bottom-up according to the 'h' tag. Let's keep it this way.

Note: the returned header will include the ending newline and any embedded
whitespace folding.

=cut

sub get_pristine_header {
  my ($self, $hdr) = @_;
  
  return $self->{pristine_headers} if !defined $hdr || $hdr eq '';
  my(@ret) =
    $self->{pristine_headers} =~ /^\Q$hdr\E[ \t]*:[ \t]*(.*?\n(?![ \t]))/smgi;
  # taintedness is retained by "use re 'taint'" (fix in bug 5283 now redundant)
  if (!@ret) {
    return $self->get_header($hdr);
  } elsif (wantarray) {
    return @ret;
  } else {
    return $ret[-1];
  }
}

=item get_mbox_separator()

Returns the mbox separator found in the message, or undef if there
wasn't one.

=cut

lib/Mail/SpamAssassin/Message.pm  view on Meta::CPAN

    # Else, there's no boundary, so leave the whole part...
  }

  # prepare a new tree node
  my $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
  my $in_body = 0;
  my $header;
  my $part_array;
  my $found_end_boundary;
  my $found_last_end_boundary;
  my $partcnt = 0;

  my $line_count = @{$body};
  foreach ( @{$body} ) {
    # if we're on the last body line, or we find any boundary marker,
    # deal with the mime part;
    # a triage before an unlikely-to-match regexp avoids a CPU hotspot
    $found_end_boundary = defined $boundary && substr($_,0,2) eq '--'
                          && /^--\Q$boundary\E(--)?\s*$/;
    $found_last_end_boundary = $found_end_boundary && $1;
    if ($found_end_boundary && $nested_boundary) {
      $found_end_boundary = 0;
      $nested_boundary = 0 if ($found_last_end_boundary); # bug 7358 - handle one level of non-unique boundary string
    }
    if ( --$line_count == 0 || $found_end_boundary ) {
      my $line = $_; # remember the last line

      # If at last line and no end boundary found, the line belongs to body
      # TODO:
      #  Is $self->{mime_boundary_state}->{$boundary}-- needed here?
      #  Could "missing end boundary" be a useful rule? Mark it somewhere?
      #  If SA processed truncated message from amavis etc, this could also
      #  be hit legimately..
      if (!$found_end_boundary) {
        # TODO: This is duplicate code from few pages down below..
        while (length ($_) > MAX_BODY_LINE_LENGTH) {
          push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
          substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
        }
        push ( @{$part_array}, $_ );
      }
      # per rfc 1521, the CRLF before the boundary is part of the boundary:
      # NOTE: The CRLF preceding the encapsulation line is conceptually
      # attached to the boundary so that it is possible to have a part
      # that does not end with a CRLF (line break). Body parts that must
      # be considered to end with line breaks, therefore, must have two
      # CRLFs preceding the encapsulation line, the first of which is part
      # of the preceding body part, and the second of which is part of the
      # encapsulation boundary.
      elsif ($part_array) {
        chomp( $part_array->[-1] );  # trim the CRLF that's part of the boundary
        splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...
      }
      else {
        # Invalid parts can have no body, so fake in a blank body
	# in that case.
        $part_array = [];
      }

      ($part_msg->{'type'}, my $p_boundary, undef, undef, my $ct_was_missing) =
          Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));

      # bug 5741: if ct was missing and parent == multipart/digest, then
      # type should be set as message/rfc822
      if ($ct_was_missing) {
        if ($msg->{'type'} eq 'multipart/digest') {
          dbg("message: missing type, setting multipart/digest child as message/rfc822");
          $part_msg->{'type'} = 'message/rfc822';
        } else {
          dbg("message: missing type, setting as default text/plain");
        }
      }

      $p_boundary ||= $boundary;
      dbg("message: found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));

      # we've created a new node object, so add it to the queue along with the
      # text that belongs to that part, then add the new part to the current
      # node to create the tree.
      push(@{$self->{'parse_queue'}}, [ $part_msg, $p_boundary, $part_array, $subparse ]);
      $msg->add_body_part($part_msg);

      if (defined $boundary) {
        if ($found_last_end_boundary) {
	  # Make a note that we've seen the end boundary
	  $self->{mime_boundary_state}->{$boundary}--;
          last;
        }
	elsif ($line_count && $body->[-$line_count] !~ /^[\041-\071\073-\176]+:/) {
          # if we aren't on an end boundary and there are still lines left, it
	  # means we hit a new start boundary.  therefore, the next line ought
	  # to be a mime header.  if it's not, mark it.
	  $self->{'missing_mime_headers'} = 1;
	}
      }

      # Maximum parts to process, simply skip the rest of the parts
      if (++$partcnt == 1000) {
        dbg("message: mimepart limit exceeded, stopping parsing");
        $self->{'mimepart_limit_exceeded'} = 1;
        return;
      }

      # make sure we start with a new clean node
      $in_body  = 0;
      $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
      undef $part_array;
      undef $header;

      next;
    }

    if (!$in_body) {
      # s/\s+$//;   # bug 5127: don't clean this up (yet)
      # no re "strict";  # since perl 5.21.8: Ranges of ASCII printables...
      if (/^[\041-\071\073-\176]+[ \t]*:/) {
        if ($header) {
          my ( $key, $value ) = split ( /:\s*/, $header, 2 );
          $part_msg->header( $key, $value );
        }
        $header = $_;
	next;
      }
      elsif (/^[ \t]/ && $header) {
        # $_ =~ s/^\s*//;   # bug 5127, again
        $header .= $_;
	next;
      }
      else {
        if ($header) {
          my ( $key, $value ) = split ( /:\s*/, $header, 2 );
          $part_msg->header( $key, $value );
          if (defined $boundary && lc $key eq 'content-type') {
	    my (undef, $nested_bound) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
            if (defined $nested_bound && $nested_bound eq $boundary) {
       	      $nested_boundary = 1;
            }
          }
        }
        $in_body = 1;

	# if there's a blank line separator, that's good.  if there isn't,
	# it's a body line, so drop through.
	if (/^\r?$/) {
	  next;
	}
	else {
          $self->{'missing_mime_head_body_separator'} = 1;
	}
      }
    }

    # we run into a perl bug if the lines are astronomically long (probably
    # due to lots of regexp backtracking); so split any individual line
    # over MAX_BODY_LINE_LENGTH bytes in length.  This can wreck HTML
    # totally -- but IMHO the only reason a luser would use
    # MAX_BODY_LINE_LENGTH-byte lines is to crash filters, anyway.
    while (length ($_) > MAX_BODY_LINE_LENGTH) {
      push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
      substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
    }
    push ( @{$part_array}, $_ );
  }

  # Look for a message epilogue
  # originally ignored whitespace:   0.185   0.2037   0.0654    0.757   0.00   0.00  TVD_TAB
  # ham FPs were all "." on a line by itself.
  # spams seem to only have NULL chars afterwards ?
  if ($line_count) {
    for(; $line_count > 0; $line_count--) {
      if ($body->[-$line_count] =~ /[^\s.]/) {
        $self->{mime_epilogue_exists} = 1;
        last;
      }
    }
  }

}

=item _parse_normal()

Generate a leaf node and add it to the parent.

=cut

sub _parse_normal {
  my($self, $toparse) = @_;

  my ($msg, $boundary, $body) = @{$toparse};

  dbg("message: parsing normal part");

  # 0: content-type, 1: boundary, 2: charset, 3: filename 4: ct_missing
  my @ct = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));

  # multipart sections are required to have a boundary set ...  If this
  # one doesn't, assume it's malformed and revert to text/plain
  # bug 5741: don't overwrite the default type assigned by _parse_multipart()
  if (!$ct[4]) {
    $msg->{'type'} = (index($ct[0], 'multipart/') != 0 || defined $boundary) ?
      $ct[0] : 'text/plain'
  } else {
    dbg("message: missing type, setting previous multipart type: %s", $msg->{'type'});
  }
  $msg->{'charset'} = $ct[2];

  # parse content-disposition header
  if ( my $disp = $msg->header('content-disposition') ) {
    $disp = Mail::SpamAssassin::Header::ParameterHeader->new($disp);
    $msg->{'disposition'} = $disp->value();
    $msg->{'name'} = $disp->parameter('filename');
  }
  # if the content-disposition header doesn't have a filename, try to get
  # the filename from the content-type header
  $msg->{'name'} = $ct[3] unless defined $msg->{'name'};

  $msg->{'boundary'} = $boundary;

  # If the part type is not one that we're likely to want to use, go
  # ahead and write the part data out to a temp file -- why keep sucking
  # up RAM with something we're not going to use?
  #
  my $type = $msg->effective_type();
  unless ($type eq 'text/plain' || $type eq 'text/html' ||
          index($type, 'message/') == 0) {
    my($filepath, $fh);
    eval {
      ($filepath, $fh) = Mail::SpamAssassin::Util::secure_tmpfile();  1;
    } or do {
      my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
      info("message: failed to create a temp file: %s", $eval_stat);
    };
    if ($fh) {
      # The temp file was created, add it to the list of pending deletions
      # we cannot just delete immediately in the POSIX idiom, as this is
      # unportable (to win32 at least)
      push @{$self->{tmpfiles}}, $filepath;
      dbg("message: storing a message part to file %s", $filepath);
      $fh->print(@{$body})  or die "error writing to $filepath: $!";
      $fh->flush  or die "error writing (flush) to $filepath: $!";
      $msg->{'raw'} = $fh;
    }
  }

  # if the part didn't get a temp file, go ahead and store the data in memory
  if (!defined $msg->{'raw'}) {
    dbg("message: storing a body to memory");
    $msg->{'raw'} = $body;
  }
}

# ---------------------------------------------------------------------------

sub get_mimepart_digests {
  my ($self) = @_;

  if (!exists $self->{mimepart_digests}) {
    # traverse all parts which are leaves, recursively
    $self->{mimepart_digests} =
      [ map(sha1_hex($_->decode) . ':' . ($_->{type}||''),
            $self->find_parts(qr/^/,1,1)) ];
  }
  return $self->{mimepart_digests};
}

# ---------------------------------------------------------------------------

# common code for get_rendered_body_text_array,
# get_visible_rendered_body_text_array, get_invisible_rendered_body_text_array
#
sub get_body_text_array_common {
  my ($self, $method_name) = @_;

  my $key = 'text_' . $method_name;



( run in 0.464 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )