Labyrinth
view release on metacpan or search on metacpan
lib/Labyrinth/MLUtils.pm view on Meta::CPAN
# -------------------------------------
# Library Modules
use Encode::ZapCP1252;
use HTML::Entities;
use Regexp::Common qw /profanity/;
use Labyrinth::Audit;
use Labyrinth::Variables;
# -------------------------------------
# Variables
my $DEFAULTTAGS = 'p,a,br,b,strong,center,hr,ol,ul,li,i,img,u,em,strike,h1,h2,h3,h4,h5,h6,table,thead,tr,th,tbody,td,sup,address,pre';
my ($HTMLTAGS,%HTMLTAGS);
# -------------------------------------
# The Public Interface Subs
=head1 FUNCTIONS
=head2 HTML Tag handling
=over 4
=item LegalTag
Returns TRUE or FALSE as to whether the given HTML tag is accepted by the
system.
=item LegalTags
Returns the list of HTML tags that are accepted by the system.
=item CleanTags
For a given text string, attempts to clean the use of any HTML tags. Any HTML
tags found that are not accepted by the system are encoded into HTML entities.
=item CleanHTML
For a given text string, removes all existence of any HTML tag. Mostly used in
input text box cleaning.
=item SafeHTML
For a given text string, encodes all HTML tags to HTML entities. Mostly used in
input textarea edit preparation.
=item CleanLink
Attempts to remove known spam style links.
=item CleanWords
Attempts to remove known profanity words.
=item LinkTitles
Given a XHTML snippet, will look for basic links and add title attributes.
Titles are of rhe format 'External Site: $domain', where $domain is the domain
used in the link.
=back
=cut
sub LegalTag {
my $tag = lc shift;
my %tags = _buildtags();
return 1 if($tags{$tag});
return 0;
}
sub LegalTags {
my %tags = _buildtags();
my $tags = join(", ", sort keys %tags);
$tags =~ s/, ([^,]+)$/ and $1/;
return $tags;
}
sub CleanTags {
my $text = shift;
return '' unless($text);
$text =~ s!</?(span|tbody)[^>]*>!!sig;
$text =~ s!<(br|hr)>!<$1 />!sig;
$text =~ s!<p>(?:\s| )+(?:</p>)?<(table|p|ul|ol|div|pre)!<$1!sig;
$text =~ s!\s+&\s+! & !sg;
$text =~ s!&[lr]squo;!"!mg;
$text =~ s{&(?!\#\d+;|[a-z0-9]+;)}{&}sig;
# decode TinyMCE encodings
$text =~ s!<(.*?)>!<$1>!sig;
# clean paragraphs
$text =~ s!</p>\s+<p>!</p><p>!sig;
$text =~ s!\s*<br /><br />\s*!</p><p>!sig;
my %tags = _buildtags();
my @found = ($text =~ m!</?(\w+)(?:\s+[^>]*)?>!gm);
for my $tag (@found) {
$tag = lc $tag;
next if($tags{$tag});
$text =~ s!<(/?$tag(?:[^>]*)?)>!<$1>!igm;
$tags{$tag} = 1;
}
process_html($text,0,1);
}
sub CleanHTML {
my $text = shift;
return '' unless($text);
$text =~ s!<[^>]+>!!gm; # remove any tags
$text =~ s!\s{2,}! !mg;
$text =~ s!&[lr]squo;!"!mg;
lib/Labyrinth/MLUtils.pm view on Meta::CPAN
=item cleanup_attr_scriptlang
=item cleanup_attr_scripttype
=item strip_nonprintable
=back
=cut
# Configuration
my $allow_html = 0;
my $line_breaks = 1;
# End configuration
##################################################################
#
# HTML handling code
#
# The code below provides some functions for manipulating HTML.
#
# process_html ( INPUT [,LINE_BREAKS [,ALLOW]] )
#
# Returns a modified version of the HTML string INPUT, with
# any potentially malicious HTML constructs (such as java,
# javascript and IMG tags) removed.
#
# If the LINE_BREAKS parameter is present and true then
# line breaks in the input will be converted to html <br />
# tags in the output.
#
# If the ALLOW parameter is present and true then most
# harmless tags will be left in, otherwise all tags will be
# removed.
#
# escape_html ( INPUT )
#
# Returns a copy of the string INPUT with any HTML
# metacharacters replaced with character escapes.
#
# unescape_html ( INPUT )
#
# Returns a copy of the string INPUT with HTML character
# entities converted to literal characters where possible.
# Note that some entites have no 8-bit character equivalent,
# see "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent"
# for some examples. unescape_html() leaves these entities
# in their encoded form.
#
use vars qw(%html_entities $html_safe_chars %escape_html_map $escape_html_map);
use vars qw(%safe_tags %safe_style %tag_is_empty %closetag_is_optional
%closetag_is_dependent %force_closetag %transpose_tag
$convert_nl %auto_deinterleave $auto_deinterleave_pattern);
# check the validity of a URL.
sub process_html {
my ($text, $line_breaks, $allow_html) = @_;
# cleanup erroneous XHTML patterns
if($text) {
$text =~ s!</pre><pre>!<br />!gsi;
$text =~ s!<ul>\s*<br />!<ul>!gsi;
$text =~ s!<br />\s*</ul>!</ul>!gsi;
$text =~ s!<ul>\s*</ul>!!gsi;
$text =~ s!<ol>\s*</ol>!!gsi;
}
# clean text of any nasties
#$text =~ s/[\x201A\x2018\x2019`]/'/g; # nasty single quotes
#$text =~ s/[\x201E\x201C\x201D]/"/g; # nasty double quotes
cleanup_html( $text, $line_breaks, ($allow_html ? \%safe_tags : {}));
}
BEGIN
{
%html_entities = (
'lt' => '<',
'gt' => '>',
'quot' => '"',
'amp' => '&',
'nbsp' => "\240", 'iexcl' => "\241",
'cent' => "\242", 'pound' => "\243",
'curren' => "\244", 'yen' => "\245",
'brvbar' => "\246", 'sect' => "\247",
'uml' => "\250", 'copy' => "\251",
'ordf' => "\252", 'laquo' => "\253",
'not' => "\254", 'shy' => "\255",
'reg' => "\256", 'macr' => "\257",
'deg' => "\260", 'plusmn' => "\261",
'sup2' => "\262", 'sup3' => "\263",
'acute' => "\264", 'micro' => "\265",
'para' => "\266", 'middot' => "\267",
'cedil' => "\270", 'supl' => "\271",
'ordm' => "\272", 'raquo' => "\273",
'frac14' => "\274", 'frac12' => "\275",
'frac34' => "\276", 'iquest' => "\277",
'Agrave' => "\300", 'Aacute' => "\301",
'Acirc' => "\302", 'Atilde' => "\303",
'Auml' => "\304", 'Aring' => "\305",
'AElig' => "\306", 'Ccedil' => "\307",
'Egrave' => "\310", 'Eacute' => "\311",
'Ecirc' => "\312", 'Euml' => "\313",
'Igrave' => "\314", 'Iacute' => "\315",
'Icirc' => "\316", 'Iuml' => "\317",
'ETH' => "\320", 'Ntilde' => "\321",
'Ograve' => "\322", 'Oacute' => "\323",
'Ocirc' => "\324", 'Otilde' => "\325",
'Ouml' => "\326", 'times' => "\327",
'Oslash' => "\330", 'Ugrave' => "\331",
'Uacute' => "\332", 'Ucirc' => "\333",
'Uuml' => "\334", 'Yacute' => "\335",
'THORN' => "\336", 'szlig' => "\337",
'agrave' => "\340", 'aacute' => "\341",
'acirc' => "\342", 'atilde' => "\343",
'auml' => "\344", 'aring' => "\345",
lib/Labyrinth/MLUtils.pm view on Meta::CPAN
sub cleanup_html {
local ($_, $convert_nl, $safe_tags) = @_;
local @stack = ();
return '' unless($_);
my $ignore_comments = 0;
if($ignore_comments) {
s[
(?: <!--.*?--> ) |
(?: <[?!].*?> ) |
(?: <([a-z0-9]+)\b((?:[^>'"]|"[^"]*"|'[^']*')*)> ) |
(?: </([a-z0-9]+)> ) |
(?: (.[^<]*) )
][
defined $1 ? cleanup_tag(lc $1, $2) :
defined $3 ? cleanup_close(lc $3) :
defined $4 ? cleanup_cdata($4) :
''
]igesx;
} else {
s[
(?: (<!--.if.*?endif.-->) ) |
(?: <!--.*?--> ) |
(?: <[?!].*?> ) |
(?: <([a-z0-9]+)\b((?:[^>'"]|"[^"]*"|'[^']*')*)> ) |
(?: </([a-z0-9]+)> ) |
(?: (.[^<]*) )
][
defined $1 ? $1 :
defined $2 ? cleanup_tag(lc $2, $3) :
defined $4 ? cleanup_close(lc $4) :
defined $5 ? cleanup_cdata($5) :
''
]igesx;
}
# Close anything that was left open
$_ .= join '', map "</$_->{NAME}>", @stack;
# Where we turned <i><b>foo</i></b> into <i><b>foo</b></i><b></b>,
# take out the pointless <b></b>.
1 while s#<($auto_deinterleave_pattern)\b[^>]*>( |\s)*</\1>##go;
# cleanup p elements
s!\s+</p>!</p>!g;
s!<p></p>!!g;
# Element pre is not declared in p list of possible children
s!<p>\s*(<pre>.*?</pre>)\s*</p>!$1!g;
return $_;
}
sub cleanup_tag {
my ($tag, $attrs) = @_;
unless (exists $safe_tags->{$tag}) {
return '';
}
# for XHTML conformity
$tag = $transpose_tag{$tag} if($transpose_tag{$tag});
my $html = '';
if($force_closetag{$tag}) {
while (scalar @stack and $force_closetag{$tag}{$stack[0]{NAME}}) {
$html = cleanup_close($stack[0]{NAME});
}
}
my $t = $safe_tags->{$tag};
my $safe_attrs = '';
while ($attrs =~ s#^\s*(\w+)(?:\s*=\s*(?:([^"'>\s]+)|"([^"]*)"|'([^']*)'))?##) {
my $attr = lc $1;
my $val = ( defined $2 ? $2 :
defined $3 ? unescape_html($3) :
defined $4 ? unescape_html($4) :
'$attr'
);
unless (exists $t->{$attr}) {
next;
}
if (defined $t->{$attr}) {
local $_ = $val;
my $cleaned = &{ $t->{$attr} }();
if (defined $cleaned) {
$safe_attrs .= qq| $attr="${\( escape_html($cleaned) )}"|;
}
} else {
$safe_attrs .= " $attr";
}
}
my $str;
if (exists $tag_is_empty{$tag}) {
$str = "$html<$tag$safe_attrs />";
} elsif (exists $closetag_is_optional{$tag}) {
$str = "$html<$tag$safe_attrs>";
# } elsif (exists $closetag_is_dependent{$tag} && $safe_attrs =~ /$closetag_is_dependent{$tag}=/) {
# return "$html<$tag$safe_attrs />";
} else {
my $full = "<$tag$safe_attrs>";
unshift @stack, { NAME => $tag, FULL => $full };
$str = "$html$full";
}
#LogDebug("cleanup_tag: str=$str");
return $str;
}
sub cleanup_close {
my $tag = shift;
# for XHTML conformity
$tag = $transpose_tag{$tag} if($transpose_tag{$tag});
# Ignore a close without an open
unless (grep {$_->{NAME} eq $tag} @stack) {
return '';
}
# Close open tags up to the matching open
my @close = ();
while (scalar @stack and $stack[0]{NAME} ne $tag) {
push @close, shift @stack;
}
push @close, shift @stack;
my $html = join '', map {"</$_->{NAME}>"} @close;
# Reopen any we closed early if all that were closed are
# configured to be auto deinterleaved.
unless (grep {! exists $auto_deinterleave{$_->{NAME}} } @close) {
pop @close;
$html .= join '', map {$_->{FULL}} reverse @close;
unshift @stack, @close;
}
return $html;
}
sub cleanup_cdata {
local $_ = shift;
return $_ if(scalar @stack and $stack[0]{NAME} eq 'script');
s[ (?: & (
[a-zA-Z0-9]{2,15} |
[#][0-9]{2,6} |
[#][xX][a-fA-F0-9]{2,6} | ) \b ;?
) | ($escape_html_map) | (.)
][
defined $1 ? "&$1;" : defined $2 ? $2 : $3
]gesx;
# substitute newlines in the input for html line breaks if required.
s%\cM?\n%<br />\n%g if $convert_nl;
return $_;
}
# subroutine to escape the necessary characters to the appropriate HTML
# entities
sub escape_html {
my $str = shift or return '';
$str = encode_entities($str);
$str =~ s/&(#x?\d+;)/&$1/g; # avoid double encoding of hex/dec characters
return $str;
}
# subroutine to unescape escaped HTML entities. Note that some entites
# have no 8-bit character equivalent, see
# "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent" for some examples.
( run in 1.101 second using v1.01-cache-2.11-cpan-39bf76dae61 )