BOM results from the CPAN

Mail-SpamAssassin

    }

    my $encoding = lc $self->header('content-transfer-encoding') || '';

    if ( $encoding eq 'quoted-printable' ) {
      dbg("message: decoding quoted-printable");
      $self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw);
      $self->{'decoded'} =~ s/\015\012/\012/gs;
    }
    elsif ( $encoding eq 'base64' ) {
      dbg("message: decoding base64");

      # if it's not defined or is 0, do the whole thing, otherwise only decode
      # a portion
      if ($bytes) {
        return Mail::SpamAssassin::Util::base64_decode($raw, $bytes);
      }
      else {
        # Generate the decoded output
        $self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw);
      }

      if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) {
        $self->{'decoded'} =~ s/\015\012/\012/gs;
      }
    }
    else {
      # Encoding is one of 7bit, 8bit, binary or x-something
      if ( $encoding ) {
        dbg("message: decoding other encoding type ($encoding), ignoring");
      }
      else {
        dbg("message: no encoding detected");
      }
      $self->{'decoded'} = $raw;
    }
  }

  if ( !defined $bytes || $bytes ) {
    if ( !defined $bytes ) {
      # force a copy
      return '' . $self->{'decoded'};
    }
    else {
      return substr($self->{'decoded'}, 0, $bytes);
    }
  }
}

# Detect endianness of UTF-16 encoded data
sub detect_utf16 {
	my $data = $_[0];  # could not avoid copying large strings
	my $utf16le_clues = 0;
	my $utf16be_clues = 0;
	my $sum_h_e = 0;
	my $sum_h_o = 0;
	my $sum_l_e = 0;
	my $sum_l_o = 0;
	my $decoder = undef;

	# avoid scan if BOM present
	if( $data =~ /^(?:\xff\xfe|\xfe\xff)/ ) {
		dbg( "message: detect_utf16: found BOM" );
		return;	# let perl figure it out from the BOM
	}
	
	my @msg_h = unpack 'H' x length( $data ), $data;
	my @msg_l = unpack 'h' x length( $data ), $data;

	for( my $i = 0; $i < length( $data ); $i+=2 ) {
		my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] );
		$sum_h_e += hex $msg_h[$i];
		$sum_h_o += hex $msg_h[$i+1];
		$sum_l_e += hex $msg_l[$i];
		$sum_l_o += hex $msg_l[$i+1];
		if (index($check_char, '20 00') >= 0) {
			# UTF-16LE space char detected
			$utf16le_clues++;
		}
		if (index($check_char, '00 20') >= 0) {
			# UTF-16BE space char detected
			$utf16be_clues++;
		}
	}

	# If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE
	$utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 );

	# If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE
	$utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) );

	if( $utf16le_clues > $utf16be_clues ) {
		dbg( "message: detect_utf16: UTF-16LE" );
		$decoder = Encode::find_encoding("UTF-16LE");
	} elsif( $utf16be_clues > $utf16le_clues ) {
		dbg( "message: detect_utf16: UTF-16BE" );
		$decoder = Encode::find_encoding("UTF-16BE");
	} else {
		dbg( "message: detect_utf16: Could not detect UTF-16 endianness" );
	}

	return $decoder;
}

# Look at a text scalar and determine whether it should be rendered
# as text/html.
#
# This is not a public function.
# 
sub _html_render {
  if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is)
  {
    my $pad = $1;
    my $count = 0;
    $count += ($pad =~ tr/\n//d) * 2;
    $count += ($pad =~ tr/\n//cd);
    return ($count < 24);
  }
  return 0;
}

# Decode character set of a given text to perl characters (Unicode),
# then encode into UTF-8 octets if requested.
#

lib/Mail/SpamAssassin/Message/Node.pm view on Meta::CPAN

    if utf8::is_utf8($_[0]);

  # workaround for Encode::decode taint laundering bug [rt.cpan.org #84879]
  my $data_taint = substr($_[0], 0, 0);  # empty string, tainted like $data

  # number of characters with code above 127
  my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c;

  if (!$cnt_8bits &&
      $charset_declared =~
        /^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) |
              ISO646-US )\z/xsi)
  { # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is
    dbg("message: contains only US-ASCII characters, declared %s, not decoding",
      $charset_declared);
    return $_[0];  # is all-ASCII, no need for decoding
  }

  if (!$cnt_8bits &&
      $charset_declared =~
        /^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} |
              UTF-?8 | (KOI8|EUC)-[A-Z]{1,2} |
              Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi)
  { # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII
    dbg("message: contains only US-ASCII characters, declared %s, not decoding",
      $charset_declared);
    return $_[0];  # is all-ASCII, no need for decoding
  }

  # Try first to strictly decode based on a declared character set.

  my $rv;

  # Try first as UTF-8 ignoring declaring?
  my $tried_utf8;
  if ($cnt_8bits && !$insist_on_declared_charset) {
    if (eval { $rv = $enc_utf8->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
      dbg("message: decoded as charset UTF-8, declared %s",
        $charset_declared);
      return $_[0]  if !$return_decoded;
      $rv .= $data_taint;  # carry taintedness over, avoid Encode bug
      return $rv;  # decoded
    } else {
      my $err = '';
      if ($@) {
        $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
        $err = " ($err)";
      }
      dbg("message: failed decoding as charset UTF-8, declared %s%s",
        $charset_declared, $err);
      $tried_utf8 = 1;
    }
  }

  if ($charset_declared =~ /^(?:US-)?ASCII\z/i
           && !$insist_on_declared_charset) {
    # declared as US-ASCII but contains 8-bit characters, makes no sense
    # to attempt decoding first as strict US-ASCII as we know it would fail

  } elsif ($charset_declared =~ /^UTF[ -]?16/i) {
    # Handle cases where spammers use UTF-16 encoding without including a BOM
    # or declaring endianness as reported at:
    # https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252

    my $decoder = detect_utf16( $_[0] );
    if (defined $decoder) {
      if (eval { $rv = $decoder->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
        dbg("message: decoded as charset %s, declared %s",
          $decoder->name, $charset_declared);
        utf8::encode($rv) if !$return_decoded;
        $rv .= $data_taint;  # carry taintedness over, avoid Encode bug
        return $rv;  # decoded
      } else {
        my $err = '';
        if ($@) {
          $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
          $err = " ($err)";
        }
        dbg("message: failed decoding as charset %s, declared %s%s",
          $decoder->name, $charset_declared, $err);
      }
    };
  } else {
    # try decoding as a declared character set

    # ->  http://en.wikipedia.org/wiki/Windows-1252
    # Windows-1252 character encoding is a superset of ISO 8859-1, but differs
    # from the IANA's ISO-8859-1 by using displayable characters rather than
    # control characters in the 80 to 9F (hex) range. [...]
    # It is very common to mislabel Windows-1252 text with the charset label
    # ISO-8859-1. A common result was that all the quotes and apostrophes
    # (produced by "smart quotes" in word-processing software) were replaced
    # with question marks or boxes on non-Windows operating systems, making
    # text difficult to read. Most modern web browsers and e-mail clients
    # treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate
    # such mislabeling. This is now standard behavior in the draft HTML 5
    # specification, which requires that documents advertised as ISO-8859-1
    # actually be parsed with the Windows-1252 encoding.
    #
    my($chset, $decoder);
    if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) {
      $chset = 'Windows-1252'; $decoder = $enc_w1252;
    } elsif ($charset_declared =~ /^UTF-?8\z/i) {
      $chset = 'UTF-8'; $decoder = $enc_utf8;
    } else {
      $chset = $charset_declared;
      $decoder = Encode::find_encoding($chset);
      if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
        $decoder = Encode::find_encoding('GBK');  # a subset of GB18030
        dbg("message: no decoder for a declared charset %s, using GBK",
            $chset)  if $decoder;
      }
    }
    if (!$decoder) {
      dbg("message: failed decoding, no decoder for a declared charset %s",
          $chset);
    }
    else {
      my $check_flags = Encode::LEAVE_SRC;
      $check_flags |= Encode::FB_CROAK  unless $insist_on_declared_charset || ($tried_utf8 && $chset eq 'UTF-8');
      my $err = '';

( run in 1.122 second using v1.01-cache-2.11-cpan-c966e8aa7e8 )