HTML-Laundry

 view release on metacpan or  search on metacpan

MANIFEST  view on Meta::CPAN

lib/HTML/Laundry.pm
lib/HTML/Laundry/Rules.pm
lib/HTML/Laundry/Rules/Default.pm
lib/HTML/Laundry/Rules/Minimal.pm
t/00-use.t
t/add_attributes.t
t/add_elements.t
t/add_unacceptable.t
t/callbacks.t
t/lib/TestHelpers.pm
t/parser_cdata.t
t/parser_default.t
t/pod.t
t/pod-coverage.t
t/rebase.t
t/ruleset.t
t/ruleset_minimal.t
t/sanitize_default.t
t/tag_whitespace.t
t/tidy_default.t
t/tidy_libxml.t

lib/HTML/Laundry.pm  view on Meta::CPAN

require HTML::Parser;
use HTML::Entities qw(encode_entities encode_entities_numeric);
use URI;
use URI::Escape qw(uri_unescape uri_escape uri_escape_utf8);
use URI::Split qw();
use Scalar::Util 'blessed';

my @fragments;
my $unacceptable_count;
my $local_unacceptable_count;
my $cdata_dirty;
my $in_cdata;
my $tag_leading_whitespace = qr/
    (?<=<)  # Left bracket followed by
    \s*     # any amount of whitespace
    (\/?)   # optionally with a forward slash
    \s*     # and then more whitespace
/x;

=head1 FUNCTIONS

=head2 new

lib/HTML/Laundry.pm  view on Meta::CPAN

    $self->clear_callback('start_tag');
    $self->clear_callback('end_tag');
    $self->clear_callback('uri');
    $self->clear_callback('text');
    $self->clear_callback('output');
    $self->{parser} = HTML::Parser->new(
        api_version => 3,
        utf8_mode   => 1,
        start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ],
        end_h  => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ],
        text_h => [ sub { $self->_text_handler(@_) },    'dtext,is_cdata' ],
        empty_element_tags => 1,
        marked_sections    => 1,
    );
    $self->{cdata_parser} = HTML::Parser->new(
        api_version => 3,
        utf8_mode   => 1,
        start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ],
        end_h  => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ],
        text_h => [ sub { $self->_text_handler(@_) },    'dtext' ],
        empty_element_tags => 1,
        unbroken_text      => 1,
        marked_sections    => 0,
    );
    $self->initialize($args);

lib/HTML/Laundry.pm  view on Meta::CPAN


=cut

sub clean {
    my ( $self, $chunk, $args ) = @_;
    $self->_reset_state();
    if ( $self->{trim_tag_whitespace} ) {
        $chunk =~ s/$tag_leading_whitespace/$1/gs;
    }
    my $p  = $self->{parser};
    my $cp = $self->{cdata_parser};
    $p->parse($chunk);
    if ( !$in_cdata && !$unacceptable_count ) {
        $p->eof();
    }
    if ( $in_cdata && !$local_unacceptable_count ) {
        $cp->eof();
    }
    my $output = $self->gen_output;
    $cp->eof();    # Clear buffer if we haven't already
    if ($cdata_dirty) {    # Overkill to get out of CDATA parser state
        $self->{parser} = HTML::Parser->new(
            api_version => 3,
            start_h =>
                [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ],
            end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ],
            text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ],
            empty_element_tags => 1,
            marked_sections    => 1,
        );
    }
    else {
        $p->eof();         # Clear buffer if we haven't already
    }
    return $output;
}

lib/HTML/Laundry.pm  view on Meta::CPAN

            1;
        };
    }
}

sub _reset_state {
    my ($self) = @_;
    @fragments                = ();
    $unacceptable_count       = 0;
    $local_unacceptable_count = 0;
    $in_cdata                 = 0;
    $cdata_dirty              = 0;
    return;
}

sub _tag_start_handler {
    my ( $self, $tagname, $attr ) = @_;
    if ( !$self->_run_callbacks( q{start_tag}, \$tagname, $attr ) ) {
        return;
    }
    if ( !$in_cdata ) {
        $cdata_dirty = 0;
    }
    my @attributes;
    foreach my $k ( keys %{$attr} ) {
        if ( $self->{acceptable_a}->{$k} ) {
            if ( grep {/^$k$/} @{ $self->{uri_list}->{$tagname} } ) {
                $self->_uri_handler( $tagname, \$k, \$attr->{$k},
                    $self->{base_uri} );
            }

            # Allow uri handler to suppress insertion

lib/HTML/Laundry.pm  view on Meta::CPAN

        }
        else {
            if ($attributes) {
                $attributes = q{ } . $attributes;
            }
            push @fragments, "<$tagname$attributes>";
        }
    }
    else {
        if ( $self->{unacceptable_e}->{$tagname} ) {
            if ($in_cdata) {
                $local_unacceptable_count += 1;
            }
            else {
                $unacceptable_count += 1;
            }
        }
    }
    return;
}

sub _tag_end_handler {
    my ( $self, $tagname ) = @_;
    if ( !$self->_run_callbacks( q{end_tag}, \$tagname ) ) {
        return;
    }
    if ( !$in_cdata ) {
        $cdata_dirty = 0;
    }
    if ( $self->{acceptable_e}->{$tagname} ) {
        if ( !$self->{empty_e}->{$tagname} ) {
            push @fragments, "</$tagname>";
        }
    }
    else {
        if ( $self->{unacceptable_e}->{$tagname} ) {
            if ($in_cdata) {
                $local_unacceptable_count -= 1;
                $local_unacceptable_count = 0
                    if ( $local_unacceptable_count < 0 );
            }
            else {
                $unacceptable_count -= 1;
                $unacceptable_count = 0 if ( $unacceptable_count < 0 );
            }
        }
    }
    return;
}

sub _text_handler {
    my ( $self, $text, $is_cdata ) = @_;
    if ( $in_cdata && $local_unacceptable_count ) {
        return;
    }
    if ($unacceptable_count) {
        return;
    }
    if ($is_cdata) {
        my $cp = $self->{cdata_parser};
        $in_cdata = 1;
        $cp->parse($text);
        if ( !$local_unacceptable_count ) {
            $cp->eof();
        }
        $cdata_dirty = 1;
        $in_cdata    = 0;
        return;
    }
    else {
        if ( !$self->_run_callbacks( q{text}, \$text, $is_cdata ) ) {
            return q{};
        }
        $text = encode_entities( $text, '<>&"' );
        $cdata_dirty = 0;
    }
    push @fragments, $text;
    return;
}

sub _uri_handler {
    my ( $self, $tagname, $attr_ref, $value_ref, $base ) = @_;
    my ( $attr, $value ) = ( ${$attr_ref}, ${$value_ref} );
    $value =~ s/[`\x00-\x1f\x7f]+//g;
    $value =~ s/\ufffd//g;

t/callbacks.t  view on Meta::CPAN

    my $tag = ${$tagref};
    isa_ok( $laundry, 'HTML::Laundry', 'Laundry object is passed into end_tag callback' );
    is($tag, q{p}, 'Tag is passed correctly to end_tag callback');
    ok( ! $attrref, 'Attributes not passed to end_tag callback');
    my $newtag = q{span};
    ${$tagref} = $newtag;
    return 1;
}

sub text_test {
    my ( $laundry, $textref, $iscdata ) = @_;
    isa_ok( $laundry, 'HTML::Laundry', 'Laundry object is passed into text callback' );
    my $text = ${$textref};
    my $expected = q{Sixteen years had Miss Taylor been in Mr. Woodhouse's family, less as a governess than a friend, very fond of both daughters, but particularly of Emma.};
    is($text, $expected, 'Text is passed correctly to text callback');
    ${$textref} = 'The family of Dashwood had been long settled in Sussex.';
    return 1;
}

sub entity_test {
    my ( $laundry, $textref, $iscdata ) = @_;
    my $text = ${$textref};
    ok($text !~ q{lt;}, 'Text is passed before entity escaping has occured');
    return 1;
}

sub output_test {
    my ( $laundry, $fragsref ) = @_;
    isa_ok( $laundry, 'HTML::Laundry', 'Laundry object is passed into output callback' );
    my @fragments = @{$fragsref};
    is(scalar @fragments, 3, 'Fragments array is passed via reference, has right number of elements');

t/callbacks.t  view on Meta::CPAN

$l->clear_callback('text');
$l->add_callback('text', \&entity_test );
$l->clean(q{1 < 2});
$l->clear_callback('text');
$l->add_callback('text', \&cancel );
$output = $l->clean($austen);
is( $output, q{<p id="foo"></p>}, 'Text callback allows forced non-parsing of text via false return ');
$l->clear_callback('text');
$l->clear_callback('uri');
$l->add_callback('text', sub {
    my ( $laundry, $textref, $iscdata ) = @_;
    ${$textref} =~ s/a//g;
});
$l->add_callback('text', sub {
    my ( $laundry, $textref, $iscdata ) = @_;
    ${$textref} =~ s/e/ee/g;
});
$l->add_callback('text', sub {
    my ( $laundry, $textref, $iscdata ) = @_;
    ${$textref} =~ s/qu/kw/g;
});
$output = $l->clean(q{<p><em>The quick brown fox jumped over the lazy dogs.</em></p>});
is( $output,
    q{<p><em>Thee kwick brown fox jumpeed oveer thee lzy dogs.</em></p>},
    q{Text callbacks may be chained. (text)});
$l->clear_callback('text');
$output = $l->clean($austen);
is( $output, $austen, 'Cleared text callback turns off callback');



( run in 0.256 second using v1.01-cache-2.11-cpan-454fe037f31 )