Labyrinth

 view release on metacpan or  search on metacpan

lib/Labyrinth/MLUtils.pm  view on Meta::CPAN


# -------------------------------------
# Library Modules

use Encode::ZapCP1252;
use HTML::Entities;
use Regexp::Common  qw /profanity/;

use Labyrinth::Audit;
use Labyrinth::Variables;

# -------------------------------------
# Variables

my $DEFAULTTAGS = 'p,a,br,b,strong,center,hr,ol,ul,li,i,img,u,em,strike,h1,h2,h3,h4,h5,h6,table,thead,tr,th,tbody,td,sup,address,pre';
my ($HTMLTAGS,%HTMLTAGS);

# -------------------------------------
# The Public Interface Subs

=head1 FUNCTIONS

=head2 HTML Tag handling

=over 4

=item LegalTag

Returns TRUE or FALSE as to whether the given HTML tag is accepted by the
system.

=item LegalTags

Returns the list of HTML tags that are accepted by the system.

=item CleanTags

For a given text string, attempts to clean the use of any HTML tags. Any HTML
tags found that are not accepted by the system are encoded into HTML entities.

=item CleanHTML

For a given text string, removes all existence of any HTML tag. Mostly used in
input text box cleaning.

=item SafeHTML

For a given text string, encodes all HTML tags to HTML entities. Mostly used in
input textarea edit preparation.

=item CleanLink

Attempts to remove known spam style links.

=item CleanWords

Attempts to remove known profanity words.

=item LinkTitles

Given a XHTML snippet, will look for basic links and add title attributes.
Titles are of rhe format 'External Site: $domain', where $domain is the domain
used in the link.

=back

=cut

sub LegalTag {
    my $tag = lc shift;

    my %tags = _buildtags();
    return 1    if($tags{$tag});
    return 0;
}

sub LegalTags {
    my %tags = _buildtags();
    my $tags = join(", ", sort keys %tags);
    $tags =~ s/, ([^,]+)$/ and $1/;
    return $tags;
}

sub CleanTags {
    my $text = shift;
    return ''   unless($text);

    $text =~ s!</?(span|tbody)[^>]*>!!sig;
    $text =~ s!<(br|hr)>!<$1 />!sig;
    $text =~ s!<p>(?:\s|&nbsp;)+(?:</p>)?<(table|p|ul|ol|div|pre)!<$1!sig;
    $text =~ s!\s+&\s+! &amp; !sg;
    $text =~ s!&[lr]squo;!&quot;!mg;
    $text =~ s{&(?!\#\d+;|[a-z0-9]+;)}{&amp;}sig;

    # decode TinyMCE encodings
    $text =~ s!&lt;(.*?)&gt;!<$1>!sig;

    # clean paragraphs
    $text =~ s!</p>\s+<p>!</p><p>!sig;
    $text =~ s!\s*<br /><br />\s*!</p><p>!sig;

    my %tags = _buildtags();
    my @found = ($text =~ m!</?(\w+)(?:\s+[^>]*)?>!gm);
    for my $tag (@found) {
        $tag = lc $tag;
        next    if($tags{$tag});

        $text =~ s!<(/?$tag(?:[^>]*)?)>!&lt;$1&gt;!igm;
        $tags{$tag} = 1;
    }

    process_html($text,0,1);
}

sub CleanHTML {
    my $text = shift;
    return ''   unless($text);

    $text =~ s!<[^>]+>!!gm; # remove any tags
    $text =~ s!\s{2,}! !mg;
    $text =~ s!&[lr]squo;!&quot;!mg;

lib/Labyrinth/MLUtils.pm  view on Meta::CPAN

=item cleanup_attr_scriptlang

=item cleanup_attr_scripttype

=item strip_nonprintable

=back

=cut

# Configuration
my $allow_html  = 0;
my $line_breaks = 1;
# End configuration

##################################################################
#
# HTML handling code
#
# The code below provides some functions for manipulating HTML.
#
#  process_html ( INPUT [,LINE_BREAKS [,ALLOW]] )
#
#    Returns a modified version of the HTML string INPUT, with
#    any potentially malicious HTML constructs (such as java,
#    javascript and IMG tags) removed.
#
#    If the LINE_BREAKS parameter is present and true then
#    line breaks in the input will be converted to html <br />
#    tags in the output.
#
#    If the ALLOW parameter is present and true then most
#    harmless tags will be left in, otherwise all tags will be
#    removed.
#
#  escape_html ( INPUT )
#
#    Returns a copy of the string INPUT with any HTML
#    metacharacters replaced with character escapes.
#
#  unescape_html ( INPUT )
#
#    Returns a copy of the string INPUT with HTML character
#    entities converted to literal characters where possible.
#    Note that some entites have no 8-bit character equivalent,
#    see "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent"
#    for some examples.  unescape_html() leaves these entities
#    in their encoded form.
#

use vars qw(%html_entities $html_safe_chars %escape_html_map $escape_html_map);
use vars qw(%safe_tags %safe_style %tag_is_empty %closetag_is_optional
            %closetag_is_dependent %force_closetag %transpose_tag 
            $convert_nl %auto_deinterleave $auto_deinterleave_pattern);

# check the validity of a URL.

sub process_html {
    my ($text, $line_breaks, $allow_html) = @_;

    # cleanup erroneous XHTML patterns
    if($text) {
        $text =~ s!</pre><pre>!<br />!gsi;
        $text =~ s!<ul>\s*<br />!<ul>!gsi;
        $text =~ s!<br />\s*</ul>!</ul>!gsi;
        $text =~ s!<ul>\s*</ul>!!gsi;
        $text =~ s!<ol>\s*</ol>!!gsi;
    }

    # clean text of any nasties
    #$text =~ s/[\x201A\x2018\x2019`]/&#39;/g;   # nasty single quotes
    #$text =~ s/[\x201E\x201C\x201D]/&quot;/g;   # nasty double quotes

    cleanup_html( $text, $line_breaks, ($allow_html ? \%safe_tags : {}));
}

BEGIN
{
    %html_entities = (
        'lt'     => '<',
        'gt'     => '>',
        'quot'   => '"',
        'amp'    => '&',

        'nbsp'   => "\240", 'iexcl'  => "\241",
        'cent'   => "\242", 'pound'  => "\243",
        'curren' => "\244", 'yen'    => "\245",
        'brvbar' => "\246", 'sect'   => "\247",
        'uml'    => "\250", 'copy'   => "\251",
        'ordf'   => "\252", 'laquo'  => "\253",
        'not'    => "\254", 'shy'    => "\255",
        'reg'    => "\256", 'macr'   => "\257",
        'deg'    => "\260", 'plusmn' => "\261",
        'sup2'   => "\262", 'sup3'   => "\263",
        'acute'  => "\264", 'micro'  => "\265",
        'para'   => "\266", 'middot' => "\267",
        'cedil'  => "\270", 'supl'   => "\271",
        'ordm'   => "\272", 'raquo'  => "\273",
        'frac14' => "\274", 'frac12' => "\275",
        'frac34' => "\276", 'iquest' => "\277",

        'Agrave' => "\300", 'Aacute' => "\301",
        'Acirc'  => "\302", 'Atilde' => "\303",
        'Auml'   => "\304", 'Aring'  => "\305",
        'AElig'  => "\306", 'Ccedil' => "\307",
        'Egrave' => "\310", 'Eacute' => "\311",
        'Ecirc'  => "\312", 'Euml'   => "\313",
        'Igrave' => "\314", 'Iacute' => "\315",
        'Icirc'  => "\316", 'Iuml'   => "\317",
        'ETH'    => "\320", 'Ntilde' => "\321",
        'Ograve' => "\322", 'Oacute' => "\323",
        'Ocirc'  => "\324", 'Otilde' => "\325",
        'Ouml'   => "\326", 'times'  => "\327",
        'Oslash' => "\330", 'Ugrave' => "\331",
        'Uacute' => "\332", 'Ucirc'  => "\333",
        'Uuml'   => "\334", 'Yacute' => "\335",
        'THORN'  => "\336", 'szlig'  => "\337",

        'agrave' => "\340", 'aacute' => "\341",
        'acirc'  => "\342", 'atilde' => "\343",
        'auml'   => "\344", 'aring'  => "\345",

lib/Labyrinth/MLUtils.pm  view on Meta::CPAN

sub cleanup_html {
    local ($_, $convert_nl, $safe_tags) = @_;
    local @stack = ();

    return ''   unless($_);

    my $ignore_comments = 0;
    if($ignore_comments) {
        s[
            (?: <!--.*?-->                                   ) |
            (?: <[?!].*?>                                    ) |
            (?: <([a-z0-9]+)\b((?:[^>'"]|"[^"]*"|'[^']*')*)> ) |
            (?: </([a-z0-9]+)>                               ) |
            (?: (.[^<]*)                                     )
        ][
            defined $1 ? cleanup_tag(lc $1, $2)              :
            defined $3 ? cleanup_close(lc $3)                :
            defined $4 ? cleanup_cdata($4)                   :
            ''
        ]igesx;
    } else {
        s[
            (?: (<!--.if.*?endif.-->)                        ) |
            (?: <!--.*?-->                                   ) |
            (?: <[?!].*?>                                    ) |
            (?: <([a-z0-9]+)\b((?:[^>'"]|"[^"]*"|'[^']*')*)> ) |
            (?: </([a-z0-9]+)>                               ) |
            (?: (.[^<]*)                                     )
        ][
            defined $1 ? $1                                  :
            defined $2 ? cleanup_tag(lc $2, $3)              :
            defined $4 ? cleanup_close(lc $4)                :
            defined $5 ? cleanup_cdata($5)                   :
            ''
        ]igesx;
    }

    # Close anything that was left open
    $_ .= join '', map "</$_->{NAME}>", @stack;

    # Where we turned <i><b>foo</i></b> into <i><b>foo</b></i><b></b>,
    # take out the pointless <b></b>.
    1 while s#<($auto_deinterleave_pattern)\b[^>]*>(&nbsp;|\s)*</\1>##go;

    # cleanup p elements
    s!\s+</p>!</p>!g;
    s!<p></p>!!g;

    # Element pre is not declared in p list of possible children
    s!<p>\s*(<pre>.*?</pre>)\s*</p>!$1!g;

    return $_;
}

sub cleanup_tag {
    my ($tag, $attrs) = @_;
    unless (exists $safe_tags->{$tag}) {
        return '';
    }

    # for XHTML conformity
    $tag = $transpose_tag{$tag} if($transpose_tag{$tag});

    my $html = '';
    if($force_closetag{$tag}) {
        while (scalar @stack and $force_closetag{$tag}{$stack[0]{NAME}}) {
            $html = cleanup_close($stack[0]{NAME});
        }
    }

    my $t = $safe_tags->{$tag};
    my $safe_attrs = '';
    while ($attrs =~ s#^\s*(\w+)(?:\s*=\s*(?:([^"'>\s]+)|"([^"]*)"|'([^']*)'))?##) {
        my $attr = lc $1;
        my $val = ( defined $2 ? $2                :
                    defined $3 ? unescape_html($3) :
                    defined $4 ? unescape_html($4) :
                    '$attr'
        );
        unless (exists $t->{$attr}) {
            next;
        }
        if (defined $t->{$attr}) {
            local $_ = $val;
            my $cleaned = &{ $t->{$attr} }();
            if (defined $cleaned) {
                $safe_attrs .= qq| $attr="${\( escape_html($cleaned) )}"|;
            }
        } else {
            $safe_attrs .= " $attr";
        }
    }

    my $str;
    if (exists $tag_is_empty{$tag}) {
        $str = "$html<$tag$safe_attrs />";
    } elsif (exists $closetag_is_optional{$tag}) {
        $str = "$html<$tag$safe_attrs>";
#   } elsif (exists $closetag_is_dependent{$tag} && $safe_attrs =~ /$closetag_is_dependent{$tag}=/) {
#       return "$html<$tag$safe_attrs />";
    } else {
        my $full = "<$tag$safe_attrs>";
        unshift @stack, { NAME => $tag, FULL => $full };
        $str = "$html$full";
    }
#LogDebug("cleanup_tag: str=$str");
    return $str;
}

sub cleanup_close {
    my $tag = shift;

    # for XHTML conformity
    $tag = $transpose_tag{$tag} if($transpose_tag{$tag});

    # Ignore a close without an open
    unless (grep {$_->{NAME} eq $tag} @stack) {
        return '';
    }

    # Close open tags up to the matching open
    my @close = ();
    while (scalar @stack and $stack[0]{NAME} ne $tag) {
        push @close, shift @stack;
    }
    push @close, shift @stack;

    my $html = join '', map {"</$_->{NAME}>"} @close;

    # Reopen any we closed early if all that were closed are
    # configured to be auto deinterleaved.
    unless (grep {! exists $auto_deinterleave{$_->{NAME}} } @close) {
        pop @close;
        $html .= join '', map {$_->{FULL}} reverse @close;
        unshift @stack, @close;
    }

    return $html;
}

sub cleanup_cdata {
    local $_ = shift;

    return $_   if(scalar @stack and $stack[0]{NAME} eq 'script');

    s[ (?: & ( 
        [a-zA-Z0-9]{2,15}       |
        [#][0-9]{2,6}           |
        [#][xX][a-fA-F0-9]{2,6} | ) \b ;?
        ) | ($escape_html_map) | (.)
    ][
        defined $1 ? "&$1;" : defined $2 ? $2 : $3
    ]gesx;

    # substitute newlines in the input for html line breaks if required.
    s%\cM?\n%<br />\n%g if $convert_nl;

    return $_;
}

# subroutine to escape the necessary characters to the appropriate HTML
# entities

sub escape_html {
    my $str = shift or return '';
    $str = encode_entities($str);
    $str =~ s/&amp;(#x?\d+;)/&$1/g;  # avoid double encoding of hex/dec characters
    return $str;
}

# subroutine to unescape escaped HTML entities.  Note that some entites
# have no 8-bit character equivalent, see
# "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent" for some examples.



( run in 1.101 second using v1.01-cache-2.11-cpan-39bf76dae61 )