HTML-Defang

 view release on metacpan or  search on metacpan

lib/HTML/Defang.pm  view on Meta::CPAN

  # Example content callback: deletes the first occurrence of the text
  # "remove this content" from the content buffer. The buffer is passed as a
  # scalar reference and is edited in place; the sub's return value is the
  # result of the substitution.
  sub DefangContentCallback {
    my ($Self, $Defang, $ContentR) = @_;

    ${$ContentR} =~ s{remove this content}{};
  }

=head1 DESCRIPTION

This module accepts an input HTML and/or CSS string and removes any executable code including scripting, embedded objects, applets, etc., and neutralises any XSS attacks. A whitelist based approach is used which means only HTML known to be safe is al...

HTML::Defang uses a custom HTML tag parser. The parser has been designed and tested to work with nasty real-world HTML and to try to emulate as closely as possible what browsers actually do with strange-looking constructs. The test suite has been buil...

HTML::Defang can make callbacks to client code when it encounters the following:

=over 4

=item *

When a specified tag is parsed

=item *

When a specified attribute is parsed

=item *

When a URL is parsed as part of an HTML attribute, or CSS property value.

=item *

When style data is parsed, as part of an HTML style attribute, or as part of an HTML <style> tag.

=back

The callbacks include details about the current tag/attribute that is being parsed, and also give a scalar reference to the input HTML. Querying pos() on the input HTML should indicate where the module is with parsing. This gives the client code fle...

HTML::Defang can defang whole tags, any attribute in a tag, any URL that appears as an attribute or style property, or any CSS declaration in a declaration block in a style rule. This helps to precisely block the most specific unwanted elements in the...

=cut

use 5.008;
use strict;
use warnings;

# Exporter set-up: nothing is exported by default. @FormTags and the three
# DEFANG_* constants can be imported by name or all together via ':all'.
use Exporter;
our @ISA = ('Exporter');
our %EXPORT_TAGS = (all => [qw(@FormTags DEFANG_NONE DEFANG_ALWAYS DEFANG_DEFAULT)]);
Exporter::export_ok_tags('all');

our $VERSION=1.08;

# Exported tri-state values (0, 1, 2).
use constant DEFANG_NONE => 0;
use constant DEFANG_ALWAYS => 1;
use constant DEFANG_DEFAULT => 2;

use Encode;

# True (1) when the optional Scalar::Readonly module could be loaded and its
# readonly_on() imported, false (0) otherwise.
#
# The variable is deliberately declared WITHOUT an initialiser: an "= 0" on
# the declaration runs at run time, i.e. after the BEGIN block below, and
# would throw away the detection result. The detection also uses a block
# eval ending in an explicit 1, because a string eval containing only a
# "use" statement has no run-time value and therefore evaluates as false
# even when the module loaded fine.
my $HasScalarReadonly;
BEGIN {
  local $@;  # do not leak a load failure into the caller's $@
  $HasScalarReadonly = eval {
    require Scalar::Readonly;
    Scalar::Readonly->import('readonly_on');
    1;
  } ? 1 : 0;
}

# Exportable list of form-related tag names.
our @FormTags = qw(form input textarea select option button fieldset label legend multicol nextid optgroup);

# Some regexps for matching HTML tags + key=value attributes
#
# Start of an attribute key: either a run of one or more characters that are
# not "=", "<", ">", whitespace, "/" or "\", or a single "/" that is not
# followed (after optional whitespace) by ">" - i.e. not the slash of "/>".
my $AttrKeyStartLineRE = qr/(?:[^=<>\s\/\\]{1,}|[\/](?!\s*>))/;
# Attribute key elsewhere in a tag: the same pattern, but only accepted when
# the preceding character is whitespace, a single/double quote or "/".
my $AttrKeyRE = qr/(?<=[\s'"\/])$AttrKeyStartLineRE/;
# Attribute value: an unquoted token (must not start with ">", whitespace, a
# quote or a backtick, and runs until ">" or whitespace), or a string
# delimited by single quotes, double quotes or backticks.
my $AttrValRE = qr/[^>\s'"`][^>\s]*|'[^']*?'|"[^"]*?"|`[^`]*?`/;
# Any number of attributes. Both the key and the "=value" part are optional,
# so bare keys as well as bare "=value" fragments are accepted.
my $AttributesRE = qr/(?:(?:$AttrKeyRE\s*)?(?:=\s*$AttrValRE\s*)?)*/;
# Tag name: an ASCII letter followed by letters, digits or any of # & ; : ! _ -
my $TagNameRE = qr/[A-Za-z][A-Za-z0-9\#\&\;\:\!_-]*/;

# Regexps for picking apart CSS text.
#
# Selector text: starts with a character that is neither a brace nor
# whitespace, followed (non-greedily) by at most 1024 non-brace characters.
my $StyleSelectors = qr/[^{}\s][^{}]{0,1024}?/;
# Property name: starts with anything except ":", "}" or whitespace, followed
# (non-greedily) by characters other than ":", "{" and "}".
my $StyleName = qr/[^:}\s][^:{}]*?/;
# Property value: text that does not start with ";", "}" or whitespace and
# runs up to the next ";" or "}"; failing that, whatever ".*$" matches.
my $StyleValue = qr/[^;}\s][^;}]*|.*$/; 
# One declaration: name, ":", value, with optional whitespace around them.
my $StyleRule = qr/$StyleName\s*:\s*$StyleValue\s*/;
# A list of declarations: optional leading whitespace, an optional first
# declaration, further declarations each introduced by ";", and any number of
# trailing ";" separators.
my $StyleRules = qr/\s*(?:$StyleRule)?(?:;\s*$StyleRule)*(?:;\s*)*/;
# "@media" followed by everything up to (but not including) the next "{".
my $StyleMediaSelector = qr/\@media\b[^{]*/;
# The three patterns below are anchored with \G, so they match at the current
# pos() of the string being scanned.
#
# Opening of an @media block. Captures: leading whitespace, the @media
# selector, the "{", trailing whitespace.
my $RECStyleMediaSelector = qr{\G(\s*)($StyleMediaSelector)(\{)(\s*)}so;
# Declarations with no selector and no braces. Groups 2, 3, 4 and 6 are empty,
# which gives this pattern the same seven capture groups as $RECStyleSelected
# (presumably so both can be processed by the same code - confirm at the use
# site).
my $RECStyleNaked = qr/\G(\s*)()()()($StyleRules)()(\s*)/o;
# A full rule. Captures: leading whitespace, optional selector, whitespace,
# "{", the declarations, "}", trailing whitespace.
my $RECStyleSelected = qr/\G(\s*)((?:$StyleSelectors)?)(\s*)(\{)($StyleRules)(\})(\s*)/o;

# A font name: a captured run of letters, digits, whitespace and hyphens,
# optionally preceded and/or followed by a single or double quote.
my $Fonts            = qr/["']?([A-Za-z0-9\s-]+)["']?/;
# Captured alignment keyword.
my $Alignments       = qr/(absbottom|absmiddle|all|autocentre|baseline|bottom|center|justify|left|middle|none|right|texttop|top)/;

# Pattern text (a plain string, not a compiled regexp) that is interpolated
# into $SrcBanStd below. It matches either ".com" preceded by any character
# other than "@", or any text followed by a dot and one of the listed
# extensions. Note that "wm." is "wm" plus any single character.
my $Executables = '([^@]\.com|'.
                  '.*\.(exe|cmd|bat|pif|scr|sys|sct|lnk|dll'.
                  '|vbs?|vbe|hta|shb|shs|hlp|chm|eml|wsf|wsh|js'.
                  '|asx|wm.|mdb|mht|msi|msp|cpl|lib|reg))';
# Case-insensitive pattern anchored at the start of the string. It matches
# values that begin with "<letters>script", contain "&{", or begin with
# mocha, about, opera, "mailto:", "hcp:", "/dev", "/proc", a backslash, file,
# smb, or "cid:" followed by the $Executables pattern and then "@", "?" or
# the end of the string. Judging by the name this is the list of banned "src"
# values - confirm at the use site.
# NOTE(review): because "|" binds loosest, ${Executables} and the trailing
# (@|\?|$) belong to the "cid:" alternative only; confirm that executable
# extensions were not meant to be banned independently of "cid:".
my $SrcBanStd      = qr/^([A-Za-z]*script|.*\&\{|mocha|about|opera|mailto:|hcp:|\/(dev|proc)|\\|file|smb|cid:${Executables}(@|\?|$))/i;

my %Rules = 
(
  # Disallow unknown tags by default
  "_unknown"     => qr/.*/,
  "align"        => qr/^${Alignments}$/i,
  "alnum"        => qr/^[A-Za-z0-9_.-]+$/,
  "boolean"      => qr/^(0|1|true|yes|no|false)$/,
  "charset"      => qr/^[A-Za-z0-9_][A-Za-z0-9_.-]*$/,
  "class"        => qr/^[A-Za-z0-9_.:\s-]*$/,
  "color"        => qr/^#?[0-9A-Z]+$/i,
  "coords"       => qr/^(\d+,)+\d+$/i,
  "datetime"     => qr/^\d\d\d\d-\d\d-\d\d.{0,5}\d\d:\d\d:\d\d.{0,5}$/,
  "dir"          => qr/^(ltr|rtl)$/i,
  "empty"        => qr/^$/i,
  "eudora"       => qr/^(autourl)$/i,
  "font-face"    => qr/^((${Fonts})[,\s]*)+$/i,
  "form-enctype" => qr/^(application\/x-www-form-urlencoded|multipart\/form-data)$/i,
  "form-method"  => qr/^(get|post)$/i,
  "frame"        => qr/^(void|above|below|hsides|vsides|lhs|rhs|box|border)$/i,
  # href: Not javascript, vbs or vbscript
  "href"         => [ qr/^((?:[a-z]*script|mocha|opera|about|data|tcl)\s*:|.*\&\{|hcp|smb|\/dev\/|<)/i ],
  "usemap-href"  => qr/^#[A-Za-z0-9_.-]+$/,  # this is not really a href at all!
  "input-size"   => qr/^(\d{1,4})$/, # some browsers freak out with very large widgets
  "input-type"   => qr/^(button|checkbox|file|hidden|image|password|radio|readonly|reset|submit|text)$/i,
  "integer"      => qr/^(-|\+)?\d+$/,
  "number"       => qr/^(-|\+)?[\d.,]+$/,
  # language: Not javascript, vbs or vbscript
  "language"     => qr/^(XML)$/i, 
  "media"        => qr/^((screen|print|projection|braille|speech|all)[,\s]*)+$/i,
  "meta:name"    => qr/^(author|progid|originator|generator|keywords|description|content-type|pragma|expires)$/i,
  # mime-type: Not javascript
  "mime-type"    => qr/^(cite|text\/(plain|css|html|xml))$/i,
  # list-type: the allowed values are alternatives and must be separated by
  #  "|". Written with commas, the pattern only matched the complete
  #  comma-joined text, so every real value (e.g. "disc") was rejected.
  "list-type"    => qr/^(none|a|i|upper-alpha|lower-alpha|upper-roman|lower-roman|decimal|disc|square|circle|round)$/i,

lib/HTML/Defang.pm  view on Meta::CPAN

  "h2"     => 1,
  "h3"     => 1,
  "h4"     => 1,
  "h5"     => 1,
  "h6"     => 1,
  "iframe" => 0,
  "ilayer" => 0,
  "img" =>
  {
    "alt"      => "anything",
    "border"   => "size",
    "dynsrc"   => "src",
    "hspace"   => "size",
    "ismap"    => "anything",
    "loop"     => "alnum",
    "lowsrc"   => "src",
    "nosend"   => "alnum",
    "src"      => "src",
    "start"    => "alnum",
    "usemap"   => "usemap-href",
    "vspace"   => "size",
  },
  "inlineinput" => 0,
  "input" => # FORM
  {
    "type"     => "input-type",
    "disabled" => "anything",
    "value"    => "anything",
    "maxlength" => "input-size",
    "size"     => "input-size",
    "readonly" => "anything",
    "tabindex" => "number",
    "checked"  => "anything",
    "accept"   => "anything",
    # for type "image":
    "alt"      => "anything",
    "border"   => "size",
    "dynsrc"   => "src",
    "hspace"   => "size",
    "ismap"    => "anything",
    "loop"     => "alnum",
    "lowsrc"   => "src",
    "nosend"   => "alnum",
    "src"      => "src",
    "start"    => "alnum",
    "usemap"   => "usemap-href",
    "vspace"   => "size",
  },
  "ins" =>
  {
    "cite" => "href",
    "datetime" => "datetime",
  },
  "isindex" => 0,
  "keygen"  => 0,
  "label"   => # FORM
  {
    "for"  => "alnum",
  },
  "layer"   => 0,
  "legend"  => 1, # FORM
  "li" => {
    "value" => "integer",
  },
  "listing"  => 0,
  "map"      => 1,
  "marquee"  => 0,
  "menu"     => \%ListAttributes,
  "multicol" => 0,
  "nextid"   => 0,
  "nobr"     => 0,
  "noembed"  => 1,
  "nolayer"  => 1,
  # Pretend our defang result is going into a non-scripting environment,
  #  even though javascript is likely enabled, so just defang all noscript tags
  "noscript" => 0,
  "noembed"  => 1,
  "object"   => 0,
  "ol"       => \%ListAttributes,
  "optgroup" => # FORM
  {
    "disabled" => "anything",
    "label"    => "anything",
  },
  "option"   => # FORM
  {
    "disabled" => "anything",
    "label"    => "anything",
    "selected" => "anything",
    "value"    => "anything",
  },
  "o:p"      => 1,
  "p"        => 1,
  "param"    => 0,
  "plaintext"=> 0,
  "pre"      => 1,
  "rt"       => 0,
  "ruby"     => 0,
  "section"  => 1,
  "select"   => # FORM
  {
    "disabled" => "anything",
    "multiple" => "anything",
    "size"     => "input-size",
    "tabindex" => "number",
  },
  "spacer"   => 0,
  "span"     => 1,
  "spell"    => 0,
  "sound" => 
  {
    "delay" => "number",
    "loop"  => "integer",
    "src"   => "src",
  },
  "table"  => \%TableAttributes,
  "tbody"  => \%TableAttributes,
  "textarea" => # FORM
  {
    "cols"     => "input-size",
    "rows"     => "input-size",



( run in 2.131 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )