HTML-Laundry

 view release on metacpan or  search on metacpan

lib/HTML/Laundry/Rules.pm  view on Meta::CPAN


=cut

sub acceptable_e {
    my $self       = shift;
    my @acceptable = qw(
        a abbr acronym address area b bdo big blockquote
        br button caption center cite code col colgroup dd
        del dfn dir div dl dt em fieldset font form
        h1 h2 h3 h4 h5 h6 hr i img input ins kbd
        label legend li map menu ol optgroup option p
        pre q s samp select small span strike strong
        sub sup table tbody td textarea tfoot th thead
        tr tt u ul var wbr
    );
    my %acceptable = map { ( $_, 1 ) } @acceptable;
    return \%acceptable;
}

=head2 empty_e

t/parser_default.t  view on Meta::CPAN


require_ok('HTML::Laundry');

my $l1 = HTML::Laundry->new({ notidy => 1 });
isa_ok($l1, 'HTML::Laundry', 'New object isa HTML::Laundry');

my $plaintext = 'She was the youngest of the two daughters of a most affectionate, indulgent father...';
is( $l1->clean($plaintext), $plaintext, 'Short plain text passes through cleanly');
$plaintext = 'She was the youngest of the two daughters of a most affectionate, indulgent father; and had, in consequence of her sister\'s marriage, been mistress of his house from a very early period. Her mother had died too long ago for her to have...
is( $l1->clean($plaintext), $plaintext, 'Longer plain text passes through cleanly');
my $kurosawa = q[Akira Kurosawa (Kyūjitai: 黒澤 明, Shinjitai: 黒沢 明 Kurosawa Akira, 23 March 1910 – 6 September 1998) was a legendary Japanese filmmaker, producer, screenwriter and editor];
is( $l1->clean($kurosawa), $kurosawa, 'UTF-8 text passes through cleanly');
my $valid = '<p class="opening">She was the youngest of the two daughters of a most affectionate, indulgent <a href="#footnote1">father</a>; and had, in consequence of her sister\'s marriage, been mistress of his house from a very early period. Her m...
is( $l1->clean($valid), $valid, 'Validating text passes through cleanly');
is( $l1->clean('<p></p>'), '<p></p>', 'Non-empty tag passes through cleanly');
is( $l1->clean('<br />'), '<br />', 'Empty tag passes through cleanly');
is( $l1->clean('<br /   >'), '<br />', 'Empty tag with whitespace passes through cleanly');
is( $l1->clean('<p />'), '<p></p>', 'Non-empty tag passed in as empty is normalized to non-empty format');
is( $l1->clean('<br></br>'), '<br />', 'Empty tag passed in as non-empty is normalized to empty format');
is( $l1->clean('<br class="foo" />'), '<br class="foo" />', 'Empty tag attribute is preserved');
is( $l1->clean('<p class="foo"></p>'), '<p class="foo"></p>', 'Non-empty tag attribute is preserved');

t/ruleset_minimal.t  view on Meta::CPAN

    rules => 'HTML::Laundry::Rules::Minimal' });

my @ok = qw( a b br blockquote code em i li ol p pre strong u ul );
my %ok = map { $_ => 1 } @ok;

my @e = (
  'a', 'abbr', 'acronym', 'address', 'area', 'b', 'bdo', 'big', 'blockquote',
  'br', 'button', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd',
  'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', 'font', 'form',
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd',
  'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p',
  'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', 'strong',
  'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead',
  'tr', 'tt', 'u', 'ul', 'var', 'wbr'
);

foreach my $e ( @e ) {
    if ( $ok{$e} and $e ne 'br' ) {
        # The only allowed empty element in this ruleset is <br />
        is( $l1->clean("<$e></$e>"), "<$e></$e>", "element $e is not sanitized");
    } elsif ( $ok{$e} ) {

t/sanitize_default.t  view on Meta::CPAN

    '', 'DOCTYPE declaration is stripped');
is( $l1->clean('<p class="xyzzy" plugh="plover">Her situation in life, the character of her father and mother, her own person and disposition, were all equally against her.</p>'),
    '<p class="xyzzy">Her situation in life, the character of her father and mother, her own person and disposition, were all equally against her.</p>',
    'Unknown attribute is stripped, but known attribute remains' );

my @e = (
  'a', 'abbr', 'acronym', 'address', 'area', 'b', 'bdo', 'big', 'blockquote',
  'br', 'button', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd',
  'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', 'font', 'form',
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd',
  'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p',
  'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', 'strong',
  'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead',
  'tr', 'tt', 'u', 'ul', 'var', 'wbr'
);

my %empty = (
    area => 1,
    br => 1,
    col => 1,
    hr => 1,

t/tidy_default.t  view on Meta::CPAN

    my $plaintext = 'She was the youngest of the two daughters of a most affectionate, indulgent father...';
    is( $l->clean($plaintext), $plaintext, 'Short plain text passes through cleanly');
    $plaintext = q{She had been a friend and companion such as few possessed: intelligent, well-informed, useful, gentle, knowing all the ways of the family, interested in all its concerns, and peculiarly interested in herself, in every pleasure, eve...
    is( $l->clean($plaintext), $plaintext, 'Longer plain text passes through cleanly');

    TODO: {
        # HTML::Tidy 1.56 fixes unicode support
        local $TODO = "HTML::Tidy version dependent. Install HTML::Tidy 1.56 or greater"
            unless eval { HTML::Tidy->VERSION(1.56) };

        my $kurosawa_chars = Encode::encode('UTF-8', q[Akira Kurosawa (Kyūjitai: 黒澤 明, Shinjitai: 黒沢 明 Kurosawa Akira, 23 March 1910 – 6 September 1998) was a legendary Japanese filmmaker, producer, screenwriter and editor]);
        my $kurosawa_bytes = Encode::decode('UTF-8', $kurosawa_chars);
        is( $l->clean($kurosawa_chars), $kurosawa_bytes, 'UTF-8 text passes through cleanly');
    };

    my $valid = q{<p>} . $plaintext . q{</p>};
    is( $l->clean($valid), $valid, 'Validating HTML passes through cleanly');
    TODO: {
        local $TODO = "libtidy version dependent - figure out how to check";
        is( $l->clean('<div></div>'), q{}, 'No-content elements are stripped...');
        is( $l->clean('<div foo="bar"></div>'), q{<div id="foo"></div>}, '...unless they have attributes');

t/tidy_libxml.t  view on Meta::CPAN

        };
    }
    my $l = HTML::Laundry->new();
    is( $l->{tidy_engine}, q{HTML::Tidy::libXML},
        'Laundry uses HTML::Tidy::libXML as tidying engine if HTML::Tidy is unavailable'
        );
    my $plaintext = 'She was the youngest of the two daughters of a most affectionate, indulgent father...';
    is( $l->clean($plaintext), $plaintext, 'Short plain text passes through cleanly');
    $plaintext = q{She had been a friend and companion such as few possessed: intelligent, well-informed, useful, gentle, knowing all the ways of the family, interested in all its concerns, and peculiarly interested in herself, in every pleasure, eve...
    is( $l->clean($plaintext), $plaintext, 'Longer plain text passes through cleanly');
    my $kurosawa = q[Akira Kurosawa (Kyūjitai: 黒澤 明, Shinjitai: 黒沢 明 Kurosawa Akira, 23 March 1910 – 6 September 1998) was a legendary Japanese filmmaker, producer, screenwriter and editor];
    is( $l->clean($kurosawa), $kurosawa, 'UTF-8 text passes through cleanly');
    my $valid = q{<p>} . $plaintext . q{</p>};
    is( $l->clean($valid), $valid, 'Validating HTML passes through cleanly');
    is( $l->clean('<p></p>'), '<p/>', 'libXML collapses no-content paragraph tags to empty element');
    is( $l->clean('<br />'), '<br/>', 'Empty tag passes through cleanly');
    is( $l->clean('<br /   >'), '<br/>', 'Empty tag with whitespace passes through cleanly');
    is( $l->clean('<br></br>'), '<br/>', 'Empty tag passed in as non-empty is normalized to empty format');
    is( $l->clean('<br class="foo" />'), '<br class="foo"/>', 'Empty tag attribute is preserved');
    is( $l->clean('<p class="foo"></p>'), '<p class="foo"/>', 'Non-empty tag attribute is preserved');
    # Actual tidying begins



( run in 1.374 second using v1.01-cache-2.11-cpan-49f99fa48dc )