Mail-SpamAssassin

 view release on metacpan or  search on metacpan

lib/Mail/SpamAssassin/HTML.pm  view on Meta::CPAN

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# HTML decoding TODOs
# - add URIs to list for faster URI testing

package Mail::SpamAssassin::HTML;

use strict;
use warnings;
use re 'taint';

use HTML::Parser 3.43 ();
use Mail::SpamAssassin::Logger;
use Mail::SpamAssassin::Constants qw(:sa);
use Mail::SpamAssassin::Util qw(untaint_var);
use Mail::SpamAssassin::HTML::Color;

our @ISA = qw(HTML::Parser);

# Lookup hash (element name => 1) of the elements defined by the HTML 4.01
# and XHTML 1.0 DTDs (do not change them!).
# Does not include XML.
# The leading ";" inside the map braces makes Perl parse them as a code
# block rather than as an anonymous hash constructor.
my %elements = map {; $_ => 1 }
  # elements from the strict DTD
  qw( a abbr acronym address area b base bdo big blockquote body br button caption cite code col colgroup dd del dfn div dl dt em fieldset form h1 h2 h3 h4 h5 h6 head hr html i img input ins kbd label legend li link map meta noscript object ol optgro...
  # additional elements from the loose DTD
  qw( applet basefont center dir font frame frameset iframe isindex menu noframes s strike u ),
  # non-standard tags that are nevertheless listed alongside the DTD elements
  qw( nobr x-sigsep x-tab ),
;

# Lookup hash (element name => 1) of elements that should be rendered even
# though they are not counted as valid.
my %tricks;
# non-standard and non-valid tags
$tricks{$_} = 1 for qw( bgsound embed listing plaintext xmp );
# other non-standard tags handled in popfile (intentionally not listed here):
#   blink ilayer multicol noembed nolayer spacer wbr

# Lookup hash (element name => 1) of the elements that change text style.
my %elements_text_style = map { ($_, 1) } qw(
  body font table tr th td
  big small marquee span p div a
  strong em b i sup sub
);

# Lookup hash (element name => 1) of the elements that insert whitespace.
my %elements_whitespace;
{
  # block-level and preformatted-style tags, followed by the headings
  my @whitespace_tags = (
    qw( br div li th td dt dd p hr blockquote pre ),
    qw( embed listing plaintext xmp title ),
    qw( h1 h2 h3 h4 h5 h6 ),
  );
  # hash slice: give every listed tag the value 1 in a single assignment
  @elements_whitespace{@whitespace_tags} = (1) x @whitespace_tags;
}

# Lookup hash (element name => 1) of the elements that push URIs.
my %elements_uri;
foreach my $uri_tag (qw( body table tr td a area link img
                         frame iframe embed script form base bgsound meta )) {
  $elements_uri{$uri_tag} = 1;
}

# style attribute not accepted
#my %elements_no_style = map {; $_ => 1 }
#  qw( base basefont head html meta param script style title ),
#;

# Permitted element attributes, stored as a two-level lookup:
#   $ok_attributes{element}{attribute} = 1
my %ok_attributes;
{
  # Each row pairs a group of elements with the attributes permitted on
  # every element of that group.
  my @permitted = (
    [ [qw( body )],           [qw( text bgcolor link alink vlink background )] ],
    [ [qw( font )],           [qw( color face size )] ],
    [ [qw( marquee )],        [qw( bgcolor background )] ],
    [ [qw( table td th tr )], [qw( bgcolor style )] ],
    [ [qw( span p div a strong em b i big small sup sub )], [qw( style )] ],
  );

  for my $row (@permitted) {
    my ($tags, $attrs) = @{$row};
    for my $tag (@{$tags}) {
      # the inner hash for $tag is autovivified on first assignment
      $ok_attributes{$tag}{$_} = 1 for @{$attrs};
    }
  }
}



( run in 1.192 second using v1.01-cache-2.11-cpan-39bf76dae61 )