Mail-SpamAssassin
view release on metacpan or search on metacpan
lib/Mail/SpamAssassin/HTML.pm view on Meta::CPAN
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
# HTML decoding TODOs
# - add URIs to list for faster URI testing
package Mail::SpamAssassin::HTML;
use strict;
use warnings;
use re 'taint';
use HTML::Parser 3.43 ();
use Mail::SpamAssassin::Logger;
use Mail::SpamAssassin::Constants qw(:sa);
use Mail::SpamAssassin::Util qw(untaint_var);
use Mail::SpamAssassin::HTML::Color;
our @ISA = qw(HTML::Parser);
# lookup table of known element names: each name listed below maps to 1
# elements defined by the HTML 4.01 and XHTML 1.0 DTDs (do not change them!)
# does not include XML
# the list is built from three groups: strict, loose, and non-standard tags
# NOTE(review): the strict word list appears cut off ("optgro...") in this
# copy of the file -- confirm against the upstream source before relying on it
my %elements = map {; $_ => 1 }
# strict
qw( a abbr acronym address area b base bdo big blockquote body br button caption cite code col colgroup dd del dfn div dl dt em fieldset form h1 h2 h3 h4 h5 h6 head hr html i img input ins kbd label legend li link map meta noscript object ol optgro...
# loose
qw( applet basefont center dir font frame frameset iframe isindex menu noframes s strike u ),
# non-standard tags
qw( nobr x-sigsep x-tab ),
;
# elements that we want to render, but not count as valid
# (lookup table: each listed tag name maps to 1)
my %tricks;
for my $tag (
  # non-standard and non-valid tags
  qw( bgsound embed listing plaintext xmp ),
  # other non-standard tags handled in popfile
  # blink ilayer multicol noembed nolayer spacer wbr
) {
  $tricks{$tag} = 1;
}
# elements that change text style
# (lookup table: each listed tag name maps to 1)
my %elements_text_style;
{
  my @style_tags = qw(
    body font table tr th td big small marquee span p div a strong em b i sup sub
  );
  # hash-slice assignment: one value of 1 per tag name
  @elements_text_style{@style_tags} = (1) x @style_tags;
}
# elements that insert whitespace
# (lookup table: each listed tag name maps to 1)
my %elements_whitespace;
{
  my @block_like = qw( br div li th td dt dd p hr blockquote pre
                       embed listing plaintext xmp title );
  my @headings   = qw( h1 h2 h3 h4 h5 h6 );
  $elements_whitespace{$_} = 1 for @block_like, @headings;
}
# elements that push URIs
# (lookup table: each listed tag name maps to 1)
my %elements_uri;
{
  my @uri_tags = qw(
    body table tr td a area link img frame iframe embed script form base bgsound meta
  );
  # hash-slice assignment: one value of 1 per tag name
  @elements_uri{@uri_tags} = (1) x @uri_tags;
}
# style attribute not accepted
#my %elements_no_style = map {; $_ => 1 }
# qw( base basefont head html meta param script style title ),
#;
# permitted element attributes
# $ok_attributes{element}{attribute} is 1 for every attribute listed for
# that element in the table built below
my %ok_attributes;
{
  # elements with their own attribute lists
  my %attributes_for = (
    body    => [ qw( text bgcolor link alink vlink background ) ],
    font    => [ qw( color face size ) ],
    marquee => [ qw( bgcolor background ) ],
  );

  # table-related elements all share the same pair of attributes
  $attributes_for{$_} = [ qw( bgcolor style ) ]
    for qw( table td th tr );

  # the remaining elements only permit "style"
  $attributes_for{$_} = [ qw( style ) ]
    for qw( span p div a strong em b i big small sup sub );

  # expand the table into the nested lookup hash; iteration order is
  # irrelevant because every (element, attribute) pair is set independently
  while (my ($element, $attributes) = each %attributes_for) {
    $ok_attributes{$element}{$_} = 1 for @$attributes;
  }
}
( run in 1.192 second using v1.01-cache-2.11-cpan-39bf76dae61 )