HTML-Laundry
view release on metacpan or search on metacpan
lib/HTML/Laundry.pm
lib/HTML/Laundry/Rules.pm
lib/HTML/Laundry/Rules/Default.pm
lib/HTML/Laundry/Rules/Minimal.pm
t/00-use.t
t/add_attributes.t
t/add_elements.t
t/add_unacceptable.t
t/callbacks.t
t/lib/TestHelpers.pm
t/parser_cdata.t
t/parser_default.t
t/pod.t
t/pod-coverage.t
t/rebase.t
t/ruleset.t
t/ruleset_minimal.t
t/sanitize_default.t
t/tag_whitespace.t
t/tidy_default.t
t/tidy_libxml.t
lib/HTML/Laundry.pm view on Meta::CPAN
require HTML::Parser;
use HTML::Entities qw(encode_entities encode_entities_numeric);
use URI;
use URI::Escape qw(uri_unescape uri_escape uri_escape_utf8);
use URI::Split qw();
use Scalar::Util 'blessed';
my @fragments;
my $unacceptable_count;
my $local_unacceptable_count;
my $cdata_dirty;
my $in_cdata;
my $tag_leading_whitespace = qr/
(?<=<) # Left bracket followed by
\s* # any amount of whitespace
(\/?) # optionally with a forward slash
\s* # and then more whitespace
/x;
=head1 FUNCTIONS
=head2 new
lib/HTML/Laundry.pm view on Meta::CPAN
$self->clear_callback('start_tag');
$self->clear_callback('end_tag');
$self->clear_callback('uri');
$self->clear_callback('text');
$self->clear_callback('output');
$self->{parser} = HTML::Parser->new(
api_version => 3,
utf8_mode => 1,
start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ],
end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ],
text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ],
empty_element_tags => 1,
marked_sections => 1,
);
$self->{cdata_parser} = HTML::Parser->new(
api_version => 3,
utf8_mode => 1,
start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ],
end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ],
text_h => [ sub { $self->_text_handler(@_) }, 'dtext' ],
empty_element_tags => 1,
unbroken_text => 1,
marked_sections => 0,
);
$self->initialize($args);
lib/HTML/Laundry.pm view on Meta::CPAN
=cut
sub clean {
my ( $self, $chunk, $args ) = @_;
$self->_reset_state();
if ( $self->{trim_tag_whitespace} ) {
$chunk =~ s/$tag_leading_whitespace/$1/gs;
}
my $p = $self->{parser};
my $cp = $self->{cdata_parser};
$p->parse($chunk);
if ( !$in_cdata && !$unacceptable_count ) {
$p->eof();
}
if ( $in_cdata && !$local_unacceptable_count ) {
$cp->eof();
}
my $output = $self->gen_output;
$cp->eof(); # Clear buffer if we haven't already
if ($cdata_dirty) { # Overkill to get out of CDATA parser state
$self->{parser} = HTML::Parser->new(
api_version => 3,
start_h =>
[ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ],
end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ],
text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ],
empty_element_tags => 1,
marked_sections => 1,
);
}
else {
$p->eof(); # Clear buffer if we haven't already
}
return $output;
}
lib/HTML/Laundry.pm view on Meta::CPAN
1;
};
}
}
sub _reset_state {
my ($self) = @_;
@fragments = ();
$unacceptable_count = 0;
$local_unacceptable_count = 0;
$in_cdata = 0;
$cdata_dirty = 0;
return;
}
sub _tag_start_handler {
my ( $self, $tagname, $attr ) = @_;
if ( !$self->_run_callbacks( q{start_tag}, \$tagname, $attr ) ) {
return;
}
if ( !$in_cdata ) {
$cdata_dirty = 0;
}
my @attributes;
foreach my $k ( keys %{$attr} ) {
if ( $self->{acceptable_a}->{$k} ) {
if ( grep {/^$k$/} @{ $self->{uri_list}->{$tagname} } ) {
$self->_uri_handler( $tagname, \$k, \$attr->{$k},
$self->{base_uri} );
}
# Allow uri handler to suppress insertion
lib/HTML/Laundry.pm view on Meta::CPAN
}
else {
if ($attributes) {
$attributes = q{ } . $attributes;
}
push @fragments, "<$tagname$attributes>";
}
}
else {
if ( $self->{unacceptable_e}->{$tagname} ) {
if ($in_cdata) {
$local_unacceptable_count += 1;
}
else {
$unacceptable_count += 1;
}
}
}
return;
}
sub _tag_end_handler {
my ( $self, $tagname ) = @_;
if ( !$self->_run_callbacks( q{end_tag}, \$tagname ) ) {
return;
}
if ( !$in_cdata ) {
$cdata_dirty = 0;
}
if ( $self->{acceptable_e}->{$tagname} ) {
if ( !$self->{empty_e}->{$tagname} ) {
push @fragments, "</$tagname>";
}
}
else {
if ( $self->{unacceptable_e}->{$tagname} ) {
if ($in_cdata) {
$local_unacceptable_count -= 1;
$local_unacceptable_count = 0
if ( $local_unacceptable_count < 0 );
}
else {
$unacceptable_count -= 1;
$unacceptable_count = 0 if ( $unacceptable_count < 0 );
}
}
}
return;
}
sub _text_handler {
my ( $self, $text, $is_cdata ) = @_;
if ( $in_cdata && $local_unacceptable_count ) {
return;
}
if ($unacceptable_count) {
return;
}
if ($is_cdata) {
my $cp = $self->{cdata_parser};
$in_cdata = 1;
$cp->parse($text);
if ( !$local_unacceptable_count ) {
$cp->eof();
}
$cdata_dirty = 1;
$in_cdata = 0;
return;
}
else {
if ( !$self->_run_callbacks( q{text}, \$text, $is_cdata ) ) {
return q{};
}
$text = encode_entities( $text, '<>&"' );
$cdata_dirty = 0;
}
push @fragments, $text;
return;
}
sub _uri_handler {
my ( $self, $tagname, $attr_ref, $value_ref, $base ) = @_;
my ( $attr, $value ) = ( ${$attr_ref}, ${$value_ref} );
$value =~ s/[`\x00-\x1f\x7f]+//g;
$value =~ s/\ufffd//g;
t/callbacks.t view on Meta::CPAN
my $tag = ${$tagref};
isa_ok( $laundry, 'HTML::Laundry', 'Laundry object is passed into end_tag callback' );
is($tag, q{p}, 'Tag is passed correctly to end_tag callback');
ok( ! $attrref, 'Attributes not passed to end_tag callback');
my $newtag = q{span};
${$tagref} = $newtag;
return 1;
}
sub text_test {
my ( $laundry, $textref, $iscdata ) = @_;
isa_ok( $laundry, 'HTML::Laundry', 'Laundry object is passed into text callback' );
my $text = ${$textref};
my $expected = q{Sixteen years had Miss Taylor been in Mr. Woodhouse's family, less as a governess than a friend, very fond of both daughters, but particularly of Emma.};
is($text, $expected, 'Text is passed correctly to text callback');
${$textref} = 'The family of Dashwood had been long settled in Sussex.';
return 1;
}
sub entity_test {
my ( $laundry, $textref, $iscdata ) = @_;
my $text = ${$textref};
ok($text !~ q{lt;}, 'Text is passed before entity escaping has occured');
return 1;
}
sub output_test {
my ( $laundry, $fragsref ) = @_;
isa_ok( $laundry, 'HTML::Laundry', 'Laundry object is passed into output callback' );
my @fragments = @{$fragsref};
is(scalar @fragments, 3, 'Fragments array is passed via reference, has right number of elements');
t/callbacks.t view on Meta::CPAN
$l->clear_callback('text');
$l->add_callback('text', \&entity_test );
$l->clean(q{1 < 2});
$l->clear_callback('text');
$l->add_callback('text', \&cancel );
$output = $l->clean($austen);
is( $output, q{<p id="foo"></p>}, 'Text callback allows forced non-parsing of text via false return ');
$l->clear_callback('text');
$l->clear_callback('uri');
$l->add_callback('text', sub {
my ( $laundry, $textref, $iscdata ) = @_;
${$textref} =~ s/a//g;
});
$l->add_callback('text', sub {
my ( $laundry, $textref, $iscdata ) = @_;
${$textref} =~ s/e/ee/g;
});
$l->add_callback('text', sub {
my ( $laundry, $textref, $iscdata ) = @_;
${$textref} =~ s/qu/kw/g;
});
$output = $l->clean(q{<p><em>The quick brown fox jumped over the lazy dogs.</em></p>});
is( $output,
q{<p><em>Thee kwick brown fox jumpeed oveer thee lzy dogs.</em></p>},
q{Text callbacks may be chained. (text)});
$l->clear_callback('text');
$output = $l->clean($austen);
is( $output, $austen, 'Cleared text callback turns off callback');
( run in 0.247 second using v1.01-cache-2.11-cpan-454fe037f31 )