HTML-Tidy-libXML

 view release on metacpan or  search on metacpan

lib/HTML/Tidy/libXML.pm  view on Meta::CPAN

#
# $Id: libXML.pm,v 0.2 2009/02/21 11:47:58 dankogai Exp dankogai $
#
package HTML::Tidy::libXML;
use warnings;
use strict;
use Encode;
use XML::LibXML;

# Derive the module version from the RCS "Revision" keyword: the match in list
# context yields the digit runs (major, minor) and sprintf zero-pads the minor
# part to two digits, so revision 0.2 becomes "0.02".
our $VERSION = sprintf "%d.%02d", q$Revision: 0.2 $ =~ /(\d+)/g;

# Constructor.  Creates an XML::LibXML parser with validation switched off and
# silent recovery switched on, and keeps it in the object's "lx" slot.
sub new {
    my ($class) = @_;

    my $parser = XML::LibXML->new;
    $parser->validation(0);
    $parser->recover_silently(1);

    my $self = { lx => $parser };
    return bless $self, $class;
}

# Parse an HTML byte string into a DOM.
#
#   $html     - the HTML source, as bytes in $encoding
#   $encoding - name of the source encoding; 'iso-8859-1' when omitted or false
#
# Line endings (CRLF or bare CR) are normalized to LF, the bytes are decoded
# to Perl characters, and the result is handed to the parser kept in
# $self->{lx}.  Returns whatever parse_html_string() returns.
sub html2dom {
    my $self     = shift;
    my $html     = shift;
    my $encoding = shift || 'iso-8859-1';

    ( my $normalized = $html ) =~ s/\r\n?/\n/msg;    # CRLF / CR => LF
    my $characters = decode( $encoding, $normalized );    # utf8 flag stays on

    return $self->{lx}->parse_html_string($characters);
}

# Serialize a parsed HTML document as an XHTML 1.0 Transitional string.
#
#   $dom   - document object, e.g. the one produced by html2dom()
#   $level - optional; when greater than 0 the DOM is additionally passed
#            through _tidy_dom() before serialization.  Defaults to 0.
#
# The DOM is modified in place: the <html> element receives the XHTML
# namespace and any Content-Type <meta http-equiv> is rewritten to announce
# UTF-8.  Returns the XML declaration, the DOCTYPE and the serialized root
# element.  Dies if the document has no <html> root element.
sub dom2xml {
    my ( $self, $dom, $level ) = @_;

    # html2xml() may be called without a level; avoid comparing undef.
    $level = 0 unless defined $level;

    my $root = $dom->findnodes('/html')->shift;
    defined $root
      or die "dom2xml: document has no <html> root element\n";
    $root->setAttribute( xmlns => 'http://www.w3.org/1999/xhtml' );

    for my $meta ( $dom->findnodes('//meta[@http-equiv!=""]') ) {

        # Only the Content-Type declaration carries the charset; other
        # http-equiv directives (refresh etc.) must keep their content.
        next unless lc( $meta->getAttribute('http-equiv') ) eq 'content-type';
        $meta->setAttribute( content => 'text/html; charset=utf-8' );
    }

    _tidy_dom($dom) if $level > 0;

    my $xhtml = $root->toString( 0, 'utf-8' );    # utf8 flag off
    return <<EOT;
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
$xhtml
EOT
}

# Convenience wrapper: parse $html (bytes in $encoding) with html2dom() and
# serialize the resulting DOM with dom2xml(), forwarding $level unchanged.
# Returns whatever dom2xml() returns.
sub html2xml {
    my $self = shift;
    my ( $html, $encoding, $level ) = @_;

    my $document = $self->html2dom( $html, $encoding );
    return $self->dom2xml( $document, $level );
}

# Clean up a parsed HTML DOM in place so that it serializes as usable XHTML.
#
#   $dom - the document object (this is a plain function, not a method)
#
# Passes, in order: drop empty attributes, normalize <script> and <style>,
# give <img> elements an alt text, and keep <a name="..."> anchors from
# collapsing into self-closing tags.  The return value is not meaningful.
sub _tidy_dom {
    my $dom = shift;

    # remove empty attributes (like <br clear="">)
    for my $node ( $dom->findnodes('//*[attribute::*=""]') ) {
        for my $attr ( $node->attributes ) {
            my $value = $attr->getValue;

            # Test for the empty string, not for truth: a value of "0"
            # (e.g. border="0") is meaningful and must survive.
            next if defined $value && $value ne '';
            $node->removeAttribute( $attr->getName );
        }
    }

    # handle <script>
    for my $script ( $dom->findnodes('//script') ) {
        $script->getAttribute('type')
          or $script->setAttribute( type => "text/javascript" );
        if ( $script->hasChildNodes ) {

            # Put a "//" in front of the existing content and a "\n//" at
            # the end of its last node.
            $script->insertBefore( $dom->createTextNode("//"),
                $script->firstChild );
            $script->lastChild->appendData("\n//");
        }
        else {    # <script src="..."/> => <script src=""></script>
            $script->appendChild( $dom->createTextNode("") );
        }
    }

    # handle <style>
    for my $style ( $dom->findnodes('//style') ) {
        $style->getAttribute('type')
          or $style->setAttribute( type => "text/css" );
        if ( $style->hasChildNodes ) {    # this one is trickier

            # CSS has no line comments, so both ends of the content get a
            # "/*" ... "*/" pair split across the inserted text nodes and
            # the existing last node.  Statement order matters here.
            $style->insertBefore( $dom->createTextNode("/*"),
                $style->firstChild );
            $style->lastChild->insertData( 0, "*/" );
            $style->lastChild->appendData("/*");
            $style->appendChild( $dom->createTextNode("*/") );
        }
        else {    # <style/> => <style></style>
            $style->appendChild( $dom->createTextNode("") );
        }
    }

    # fix <img>: supply an alt text where none is present
    for my $img ( $dom->findnodes('//img') ) {

        # Never overwrite an alt the author provided.
        next if $img->hasAttribute('alt');

        my $src = $img->getAttribute('src');
        my $alt = defined $src ? $src : '';    # <img> without src
        $alt =~ s{.*/}{};                       # basename only
        $img->setAttribute( alt => length($alt) ? $alt : 'img' );
    }

    # <a name="foo"/> => <a name="foo"></a>
    for my $anchor ( $dom->findnodes('//a[@name!=""]') ) {
        $anchor->appendChild( $dom->createTextNode("") );
    }

    return;
}



( run in 0.746 second using v1.01-cache-2.11-cpan-524268b4103 )