HTML-Detoxifier

 view release on metacpan or  search on metacpan

lib/HTML/Detoxifier.pm  view on Meta::CPAN

# -----------------------------------------------------------------------------
#  HTML::Detoxifier - strips harmful HTML from user input   v0.02 - 03/01/2004
#
#  Copyright (c) 2004 Patrick Walton <pwalton@metajournal.net>
#  but freely redistributable under the same terms as Perl itself.
# -----------------------------------------------------------------------------

package HTML::Detoxifier;

use strict;
use warnings FATAL => 'all';
use HTML::TokeParser;
use HTML::Entities;

use base qw<Exporter>;
@HTML::Detoxifier::EXPORT_OK = qw(detoxify); 

$HTML::Detoxifier::VERSION = 0.01;

=head1 NAME

HTML::Detoxifier - practical module to strip harmful HTML

=head1 SYNOPSIS

	use HTML::Detoxifier qw<detoxify>;
	
	my $clean_html = detoxify $html;
	
	my $cleaner_html = detoxify($html, disallow =>
		[qw(dynamic images document)]);
	
	my $stripped_html = detoxify($html, disallow => [qw(everything)]);

=head1 DESCRIPTION

HTML::Detoxifier is a practical module to remove harmful tags from HTML input.
It's intended to be used for web sites that accept user input in the form of
HTML and then present that information in some form.

Accepting all HTML from untrusted users is generally a very bad idea;
typically, all HTML should be run through some kind of filter before being
presented to end users. Cross-site scripting (XSS) vulnerabilities can run
rampant without a filter. The most common and obvious HTML vulnerability lies
in stealing users' login cookies through JavaScript.

Unlike other modules, HTML::Detoxifier is intended to be a practical solution
that abstracts away all the specifics of whitelisting certain tags easily 
and securely. Tags are divided into functional groups, each of which can be
disallowed or allowed as you wish. Additionally, HTML::Detoxifier knows how to
clean inline CSS; with HTML::Detoxifier, you can securely allow users to use
style sheets without allowing cross-site scripting vulnerabilities. (Yes, it is
possible to execute JavaScript from CSS!)

In addition to this main purpose, HTML::Detoxifier cleans up some common
mistakes with HTML: all tags are closed, empty tags are converted to valid
XML (that is, with a trailing /), and images without ALT text as required in
HTML 4.0 are given a plain ALT tag. The module does its best to emit valid
XHTML 1.0; it even adds XML declarations and DOCTYPE elements where needed.

=cut

use constant TAG_GROUPS => {
	links => {
		a => undef,
		area => undef,
		link => undef,
		map => undef
	},
	document => {
		base => undef,
		basefont => undef,
		bdo => undef,
		head => undef,
		body => undef,
		html => undef,
		link => undef,
		meta => undef,
		style => undef,
		title => undef
	},
	aesthetic => {
		b => undef,
		basefont => undef,
		big => undef,
		blink => undef,
		em => undef,
		h1 => undef,
		h2 => undef,
		h3 => undef,
		h4 => undef,
		h5 => undef,
		h6 => undef,
		i => undef,
		kbd => undef,
		marquee => undef,
		pre => undef,
		s => undef,
		small => undef,
		strike => undef,
		strong => undef,
		style => undef,
		'sub' => undef,
		sup => undef,
		tt => undef,
		u => undef,
		var => undef
	},
	'size-changing' => {
		big => undef,
		h1 => undef,
		h2 => undef,
		h3 => undef,
		h4 => undef,
		h5 => undef,
		h6 => undef,
		small => undef,
		style => undef,
		'sub' => undef,

lib/HTML/Detoxifier.pm  view on Meta::CPAN

	}

	my $styles_allowed = 1;
	foreach my $restriction (keys %{$opts{disallow}}) {
		$styles_allowed = 0, last if exists STYLES_ALLOWED_IF->{$restriction}
	}

	TOKEN: while (my $token = get_token $parser) {
		if ($token->[0] eq 'S') {
			next TOKEN if exists $opts{disallow}{everything};
			next TOKEN unless exists TAGS->{lc $token->[1]};

			foreach my $restriction (keys %{$opts{disallow}}) {
				next TOKEN if
					exists TAG_GROUPS->{$restriction}{lc $token->[1]}
			}

			my %attrs;
			while (my ($key, $value) = each %{$token->[2]}) {
				next unless $key =~ /^[a-z]/i;

				if (exists $opts{disallow}{dynamic}) {
					next if $key =~ /^on/is;
					next if lc($key) eq 'href' and
						$value =~ /^[a-z]+?script:/is;
				}

				$attrs{lc $key} = $value
			}

			# As a special case, external style sheets must be disabled if
			# dynamic content is disallowed.
			next TOKEN if lc $token->[1] eq 'link' and (
				exists $attrs{rel} && lc $attrs{rel} =~
				/^\s*style\s*sheet\s*$/is or
				exists $attrs{type} && lc $attrs{type} =~
				m(^\s*text/css\s*$));

			# If this is a style declaration and dynamic content is
			# disallowed, we need to flag it for checking.
			$checkcss = 1 if lc $token->[1] eq 'style' and exists
				$opts{disallow}{dynamic};

			# Add an ALT tag to images if it's needed.
			$attrs{alt} = '[' .
				(($attrs{src} =~ m{([^/.]*)\.[a-z]+$}gi)[0] or 'image') .
				']' if lc $token->[1] eq 'img' and $attrs{src} and not
				$attrs{alt};

			if (not $styles_allowed) {
				delete $attrs{style} if exists $attrs{style};
				delete $attrs{class} if exists $attrs{class};
				delete $attrs{id} if exists $attrs{id}
			} elsif (exists $opts{disallow}{dynamic}) {
				$attrs{style} = remove_scripts_from_css $attrs{style} if
					$attrs{style}
			}
			
			if (lc $token->[1] eq 'html') {	
				# Add a valid XML declaration and a doctype. HTML::Detoxifier
				# converts everything to XHTML 1.0, so we might as well
				# qualify it!

				$out = <<"ENDDECL" . $out;
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
ENDDECL

				$attrs{xmlns} = "http://www.w3.org/1999/xhtml"
					unless $attrs{xmlns};
				$attrs{lang} = "en-US" unless $attrs{lang};	
			}

			$out .= "<" . lc $token->[1];
			while (my ($key, $value) = each %attrs) {
				$value = encode_entities $value;
				$out .= qq( $key="$value");
			}

			if (exists EMPTY_ELEMENTS->{lc $token->[1]}) {
				$out .= " />";
			} else {
				unshift @tagstack, $token->[1];
				$out .= ">";
			}
		} elsif ($token->[0] eq 'E') {
			next TOKEN unless exists TAGS->{lc $token->[1]};
			foreach my $restriction (keys %{$opts{disallow}}) {
				next TOKEN if
					exists TAG_GROUPS->{$restriction}{lc $token->[1]}
			}

			while (@tagstack) {
				my $tag = shift @tagstack;
				$out .= "</$tag>";
				last if $tag eq lc $token->[1]; 	
			}

			$checkcss = 0 if lc $token->[1] eq 'style' and exists
				$opts{disallow}{dynamic};  
		} elsif ($token->[0] eq 'T') {
			local $_ = $token->[1];
			$_ = remove_scripts_from_css $_ if $checkcss;
			
			$out .= $_;
		} elsif ($token->[0] eq 'C') {
			local $_ = $token->[1];
			$_ = remove_scripts_from_css $_ if $checkcss;

			s/(?:<!--\s*|\s*-->)//g;

			$out .= "<!-- $_ -->" unless exists $opts{disallow}{comments}
				or exists $opts{disallow}{everything};
		}
	}	

	if (not exists $opts{section} or $opts{section} eq 'last') { 
		foreach my $unclosedtag (@tagstack) {
			$out .= "</$unclosedtag>";
		}

		@tagstack = @oldtagstacks ? @{pop @oldtagstacks} : ();
	}

	$out;
}



( run in 0.478 second using v1.01-cache-2.11-cpan-119454b85a5 )