HTML-Laundry
view release on metacpan or search on metacpan
lib/HTML/Laundry/Rules.pm view on Meta::CPAN
81828384858687888990919293949596979899100=cut
sub acceptable_e {
my $self = shift;
my @acceptable = qw(
a abbr acronym address area b bdo big blockquote
br button caption center cite code col colgroup dd
del dfn dir div dl dt em fieldset font form
h1 h2 h3 h4 h5 h6 hr i img input ins kbd
label legend li map menu ol optgroup option p
pre q s samp select small span strike strong
sub sup table tbody td textarea tfoot th thead
tr tt u ul var wbr
);
my %acceptable = map { ( $_, 1 ) } @acceptable;
return \%acceptable;
}
=head2 empty_e
t/parser_default.t view on Meta::CPAN
5678910111213141516171819202122232425require_ok(
'HTML::Laundry'
);
my
$l1
= HTML::Laundry->new({
notidy
=> 1 });
isa_ok(
$l1
,
'HTML::Laundry'
,
'New object isa HTML::Laundry'
);
my
$plaintext
=
'She was the youngest of the two daughters of a most affectionate, indulgent father...'
;
is(
$l1
->clean(
$plaintext
),
$plaintext
,
'Short plain text passes through cleanly'
);
$plaintext
= 'She was the youngest of the two daughters of a most affectionate, indulgent father; and had, in consequence of her sister\'s marriage, been mistress of his house from a very early period. Her mother had died too long ago
for
her to have...
is(
$l1
->clean(
$plaintext
),
$plaintext
,
'Longer plain text passes through cleanly'
);
my
$kurosawa
=
q[Akira Kurosawa (Kyūjitai: 黒澤 明, Shinjitai: 黒沢 明 Kurosawa Akira, 23 March 1910 – 6 September 1998) was a legendary Japanese filmmaker, producer, screenwriter and editor]
;
is(
$l1
->clean(
$kurosawa
),
$kurosawa
,
'UTF-8 text passes through cleanly'
);
my
$valid
= '<p class=
"opening"
>She was the youngest of the two daughters of a most affectionate, indulgent <a href=
"#footnote1"
>father</a>; and had, in consequence of her sister\'s marriage, been mistress of his house from a very early period. Her m...
is(
$l1
->clean(
$valid
),
$valid
,
'Validating text passes through cleanly'
);
is(
$l1
->clean(
'<p></p>'
),
'<p></p>'
,
'Non-empty tag passes through cleanly'
);
is(
$l1
->clean(
'<br />'
),
'<br />'
,
'Empty tag passes through cleanly'
);
is(
$l1
->clean(
'<br / >'
),
'<br />'
,
'Empty tag with whitespace passes through cleanly'
);
is(
$l1
->clean(
'<p />'
),
'<p></p>'
,
'Non-empty tag passed in as empty is normalized to non-empty format'
);
is(
$l1
->clean(
'<br></br>'
),
'<br />'
,
'Empty tag passed in as non-empty is normalized to empty format'
);
is(
$l1
->clean(
'<br class="foo" />'
),
'<br class="foo" />'
,
'Empty tag attribute is preserved'
);
is(
$l1
->clean(
'<p class="foo"></p>'
),
'<p class="foo"></p>'
,
'Non-empty tag attribute is preserved'
);
t/ruleset_minimal.t view on Meta::CPAN
101112131415161718192021222324252627282930
rules
=>
'HTML::Laundry::Rules::Minimal'
});
my
@ok
=
qw( a b br blockquote code em i li ol p pre strong u ul )
;
my
%ok
=
map
{
$_
=> 1 }
@ok
;
my
@e
= (
'a'
,
'abbr'
,
'acronym'
,
'address'
,
'area'
,
'b'
,
'bdo'
,
'big'
,
'blockquote'
,
'br'
,
'button'
,
'caption'
,
'center'
,
'cite'
,
'code'
,
'col'
,
'colgroup'
,
'dd'
,
'del'
,
'dfn'
,
'dir'
,
'div'
,
'dl'
,
'dt'
,
'em'
,
'fieldset'
,
'font'
,
'form'
,
'h1'
,
'h2'
,
'h3'
,
'h4'
,
'h5'
,
'h6'
,
'hr'
,
'i'
,
'img'
,
'input'
,
'ins'
,
'kbd'
,
'label'
,
'legend'
,
'li'
,
'map'
,
'menu'
,
'ol'
,
'optgroup'
,
'option'
,
'p'
,
'pre'
,
'q'
,
's'
,
'samp'
,
'select'
,
'small'
,
'span'
,
'strike'
,
'strong'
,
'sub'
,
'sup'
,
'table'
,
'tbody'
,
'td'
,
'textarea'
,
'tfoot'
,
'th'
,
'thead'
,
'tr'
,
'tt'
,
'u'
,
'ul'
,
'var'
,
'wbr'
);
foreach
my
$e
(
@e
) {
if
(
$ok
{
$e
} and
$e
ne
'br'
) {
# The only allowed empty element in this ruleset is <br />
is(
$l1
->clean(
"<$e></$e>"
),
"<$e></$e>"
,
"element $e is not sanitized"
);
}
elsif
(
$ok
{
$e
} ) {
t/sanitize_default.t view on Meta::CPAN
222324252627282930313233343536373839404142
''
,
'DOCTYPE declaration is stripped'
);
is(
$l1
->clean(
'<p class="xyzzy" plugh="plover">Her situation in life, the character of her father and mother, her own person and disposition, were all equally against her.</p>'
),
'<p class="xyzzy">Her situation in life, the character of her father and mother, her own person and disposition, were all equally against her.</p>'
,
'Unknown attribute is stripped, but known attribute remains'
);
my
@e
= (
'a'
,
'abbr'
,
'acronym'
,
'address'
,
'area'
,
'b'
,
'bdo'
,
'big'
,
'blockquote'
,
'br'
,
'button'
,
'caption'
,
'center'
,
'cite'
,
'code'
,
'col'
,
'colgroup'
,
'dd'
,
'del'
,
'dfn'
,
'dir'
,
'div'
,
'dl'
,
'dt'
,
'em'
,
'fieldset'
,
'font'
,
'form'
,
'h1'
,
'h2'
,
'h3'
,
'h4'
,
'h5'
,
'h6'
,
'hr'
,
'i'
,
'img'
,
'input'
,
'ins'
,
'kbd'
,
'label'
,
'legend'
,
'li'
,
'map'
,
'menu'
,
'ol'
,
'optgroup'
,
'option'
,
'p'
,
'pre'
,
'q'
,
's'
,
'samp'
,
'select'
,
'small'
,
'span'
,
'strike'
,
'strong'
,
'sub'
,
'sup'
,
'table'
,
'tbody'
,
'td'
,
'textarea'
,
'tfoot'
,
'th'
,
'thead'
,
'tr'
,
'tt'
,
'u'
,
'ul'
,
'var'
,
'wbr'
);
my
%empty
= (
area
=> 1,
br
=> 1,
col
=> 1,
hr
=> 1,
t/tidy_default.t view on Meta::CPAN
192021222324252627282930313233343536373839my
$plaintext
=
'She was the youngest of the two daughters of a most affectionate, indulgent father...'
;
is(
$l
->clean(
$plaintext
),
$plaintext
,
'Short plain text passes through cleanly'
);
$plaintext
=
q{She had been a friend and companion such as few possessed: intelligent, well-informed, useful, gentle, knowing all the ways of the family, interested in all its concerns, and peculiarly interested in herself, in every pleasure, eve...
is( $l->clean($plaintext), $plaintext, 'Longer plain text passes through cleanly');
TODO: {
# HTML::Tidy 1.56 fixes unicode support
local $TODO = "HTML::Tidy version dependent. Install HTML::Tidy 1.56 or greater"
unless eval { HTML::Tidy->VERSION(1.56) }
;
my
$kurosawa_chars
= Encode::encode(
'UTF-8'
,
q[Akira Kurosawa (Kyūjitai: 黒澤 明, Shinjitai: 黒沢 明 Kurosawa Akira, 23 March 1910 – 6 September 1998) was a legendary Japanese filmmaker, producer, screenwriter and editor]
);
my
$kurosawa_bytes
= Encode::decode(
'UTF-8'
,
$kurosawa_chars
);
is(
$l
->clean(
$kurosawa_chars
),
$kurosawa_bytes
,
'UTF-8 text passes through cleanly'
);
};
my
$valid
=
q{<p>}
.
$plaintext
.
q{</p>}
;
is(
$l
->clean(
$valid
),
$valid
,
'Validating HTML passes through cleanly'
);
TODO: {
local
$TODO
=
"libtidy version dependent - figure out how to check"
;
is(
$l
->clean(
'<div></div>'
),
q{}
,
'No-content elements are stripped...'
);
is(
$l
->clean(
'<div foo="bar"></div>'
),
q{<div id="foo"></div>}
,
'...unless they have attributes'
);
t/tidy_libxml.t view on Meta::CPAN
262728293031323334353637383940414243444546
};
}
my
$l
= HTML::Laundry->new();
is(
$l
->{tidy_engine},
q{HTML::Tidy::libXML}
,
'Laundry uses HTML::Tidy::libXML as tidying engine if HTML::Tidy is unavailable'
);
my
$plaintext
=
'She was the youngest of the two daughters of a most affectionate, indulgent father...'
;
is(
$l
->clean(
$plaintext
),
$plaintext
,
'Short plain text passes through cleanly'
);
$plaintext
=
q{She had been a friend and companion such as few possessed: intelligent, well-informed, useful, gentle, knowing all the ways of the family, interested in all its concerns, and peculiarly interested in herself, in every pleasure, eve...
is( $l->clean($plaintext), $plaintext, 'Longer plain text passes through cleanly');
my $kurosawa = q[Akira Kurosawa (Kyūjitai: 黒澤 明, Shinjitai: 黒沢 明 Kurosawa Akira, 23 March 1910 – 6 September 1998) was a legendary Japanese filmmaker, producer, screenwriter and editor];
is( $l->clean($kurosawa), $kurosawa, 'UTF-8 text passes through cleanly');
my $valid = q{<p>}
.
$plaintext
.
q{</p>}
;
is(
$l
->clean(
$valid
),
$valid
,
'Validating HTML passes through cleanly'
);
is(
$l
->clean(
'<p></p>'
),
'<p/>'
,
'libXML collapses no-content paragraph tags to empty element'
);
is(
$l
->clean(
'<br />'
),
'<br/>'
,
'Empty tag passes through cleanly'
);
is(
$l
->clean(
'<br / >'
),
'<br/>'
,
'Empty tag with whitespace passes through cleanly'
);
is(
$l
->clean(
'<br></br>'
),
'<br/>'
,
'Empty tag passed in as non-empty is normalized to empty format'
);
is(
$l
->clean(
'<br class="foo" />'
),
'<br class="foo"/>'
,
'Empty tag attribute is preserved'
);
is(
$l
->clean(
'<p class="foo"></p>'
),
'<p class="foo"/>'
,
'Non-empty tag attribute is preserved'
);
# Actual tidying begins
( run in 1.312 second using v1.01-cache-2.11-cpan-49f99fa48dc )