Mail-SpamAssassin
view release on metacpan or search on metacpan
lib/Mail/SpamAssassin/Message/Node.pm view on Meta::CPAN
}
my $encoding = lc $self->header('content-transfer-encoding') || '';
if ( $encoding eq 'quoted-printable' ) {
dbg("message: decoding quoted-printable");
$self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw);
$self->{'decoded'} =~ s/\015\012/\012/gs;
}
elsif ( $encoding eq 'base64' ) {
dbg("message: decoding base64");
# if it's not defined or is 0, do the whole thing, otherwise only decode
# a portion
if ($bytes) {
return Mail::SpamAssassin::Util::base64_decode($raw, $bytes);
}
else {
# Generate the decoded output
$self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw);
}
if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) {
$self->{'decoded'} =~ s/\015\012/\012/gs;
}
}
else {
# Encoding is one of 7bit, 8bit, binary or x-something
if ( $encoding ) {
dbg("message: decoding other encoding type ($encoding), ignoring");
}
else {
dbg("message: no encoding detected");
}
$self->{'decoded'} = $raw;
}
}
if ( !defined $bytes || $bytes ) {
if ( !defined $bytes ) {
# force a copy
return '' . $self->{'decoded'};
}
else {
return substr($self->{'decoded'}, 0, $bytes);
}
}
}
# Detect endianness of UTF-16 encoded data
sub detect_utf16 {
my $data = $_[0]; # could not avoid copying large strings
my $utf16le_clues = 0;
my $utf16be_clues = 0;
my $sum_h_e = 0;
my $sum_h_o = 0;
my $sum_l_e = 0;
my $sum_l_o = 0;
my $decoder = undef;
# avoid scan if BOM present
if( $data =~ /^(?:\xff\xfe|\xfe\xff)/ ) {
dbg( "message: detect_utf16: found BOM" );
return; # let perl figure it out from the BOM
}
my @msg_h = unpack 'H' x length( $data ), $data;
my @msg_l = unpack 'h' x length( $data ), $data;
for( my $i = 0; $i < length( $data ); $i+=2 ) {
my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] );
$sum_h_e += hex $msg_h[$i];
$sum_h_o += hex $msg_h[$i+1];
$sum_l_e += hex $msg_l[$i];
$sum_l_o += hex $msg_l[$i+1];
if (index($check_char, '20 00') >= 0) {
# UTF-16LE space char detected
$utf16le_clues++;
}
if (index($check_char, '00 20') >= 0) {
# UTF-16BE space char detected
$utf16be_clues++;
}
}
# If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE
$utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 );
# If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE
$utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) );
if( $utf16le_clues > $utf16be_clues ) {
dbg( "message: detect_utf16: UTF-16LE" );
$decoder = Encode::find_encoding("UTF-16LE");
} elsif( $utf16be_clues > $utf16le_clues ) {
dbg( "message: detect_utf16: UTF-16BE" );
$decoder = Encode::find_encoding("UTF-16BE");
} else {
dbg( "message: detect_utf16: Could not detect UTF-16 endianness" );
}
return $decoder;
}
# Look at a text scalar and determine whether it should be rendered
# as text/html.
#
# This is not a public function.
#
sub _html_render {
if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is)
{
my $pad = $1;
my $count = 0;
$count += ($pad =~ tr/\n//d) * 2;
$count += ($pad =~ tr/\n//cd);
return ($count < 24);
}
return 0;
}
# Decode character set of a given text to perl characters (Unicode),
# then encode into UTF-8 octets if requested.
#
lib/Mail/SpamAssassin/Message/Node.pm view on Meta::CPAN
if utf8::is_utf8($_[0]);
# workaround for Encode::decode taint laundering bug [rt.cpan.org #84879]
my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data
# number of characters with code above 127
my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c;
if (!$cnt_8bits &&
$charset_declared =~
/^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) |
ISO646-US )\z/xsi)
{ # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is
dbg("message: contains only US-ASCII characters, declared %s, not decoding",
$charset_declared);
return $_[0]; # is all-ASCII, no need for decoding
}
if (!$cnt_8bits &&
$charset_declared =~
/^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} |
UTF-?8 | (KOI8|EUC)-[A-Z]{1,2} |
Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi)
{ # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII
dbg("message: contains only US-ASCII characters, declared %s, not decoding",
$charset_declared);
return $_[0]; # is all-ASCII, no need for decoding
}
# Try first to strictly decode based on a declared character set.
my $rv;
# Try first as UTF-8 ignoring declaring?
my $tried_utf8;
if ($cnt_8bits && !$insist_on_declared_charset) {
if (eval { $rv = $enc_utf8->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
dbg("message: decoded as charset UTF-8, declared %s",
$charset_declared);
return $_[0] if !$return_decoded;
$rv .= $data_taint; # carry taintedness over, avoid Encode bug
return $rv; # decoded
} else {
my $err = '';
if ($@) {
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
$err = " ($err)";
}
dbg("message: failed decoding as charset UTF-8, declared %s%s",
$charset_declared, $err);
$tried_utf8 = 1;
}
}
if ($charset_declared =~ /^(?:US-)?ASCII\z/i
&& !$insist_on_declared_charset) {
# declared as US-ASCII but contains 8-bit characters, makes no sense
# to attempt decoding first as strict US-ASCII as we know it would fail
} elsif ($charset_declared =~ /^UTF[ -]?16/i) {
# Handle cases where spammers use UTF-16 encoding without including a BOM
# or declaring endianness as reported at:
# https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252
my $decoder = detect_utf16( $_[0] );
if (defined $decoder) {
if (eval { $rv = $decoder->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
dbg("message: decoded as charset %s, declared %s",
$decoder->name, $charset_declared);
utf8::encode($rv) if !$return_decoded;
$rv .= $data_taint; # carry taintedness over, avoid Encode bug
return $rv; # decoded
} else {
my $err = '';
if ($@) {
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
$err = " ($err)";
}
dbg("message: failed decoding as charset %s, declared %s%s",
$decoder->name, $charset_declared, $err);
}
};
} else {
# try decoding as a declared character set
# -> http://en.wikipedia.org/wiki/Windows-1252
# Windows-1252 character encoding is a superset of ISO 8859-1, but differs
# from the IANA's ISO-8859-1 by using displayable characters rather than
# control characters in the 80 to 9F (hex) range. [...]
# It is very common to mislabel Windows-1252 text with the charset label
# ISO-8859-1. A common result was that all the quotes and apostrophes
# (produced by "smart quotes" in word-processing software) were replaced
# with question marks or boxes on non-Windows operating systems, making
# text difficult to read. Most modern web browsers and e-mail clients
# treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate
# such mislabeling. This is now standard behavior in the draft HTML 5
# specification, which requires that documents advertised as ISO-8859-1
# actually be parsed with the Windows-1252 encoding.
#
my($chset, $decoder);
if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) {
$chset = 'Windows-1252'; $decoder = $enc_w1252;
} elsif ($charset_declared =~ /^UTF-?8\z/i) {
$chset = 'UTF-8'; $decoder = $enc_utf8;
} else {
$chset = $charset_declared;
$decoder = Encode::find_encoding($chset);
if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
$decoder = Encode::find_encoding('GBK'); # a subset of GB18030
dbg("message: no decoder for a declared charset %s, using GBK",
$chset) if $decoder;
}
}
if (!$decoder) {
dbg("message: failed decoding, no decoder for a declared charset %s",
$chset);
}
else {
my $check_flags = Encode::LEAVE_SRC;
$check_flags |= Encode::FB_CROAK unless $insist_on_declared_charset || ($tried_utf8 && $chset eq 'UTF-8');
my $err = '';
( run in 2.787 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )