HTML-Tidy-libXML
view release on metacpan or search on metacpan
lib/HTML/Tidy/libXML.pm view on Meta::CPAN
#
# $Id: libXML.pm,v 0.2 2009/02/21 11:47:58 dankogai Exp dankogai $
#
package HTML::Tidy::libXML;
use warnings;
use strict;
use Encode;
use XML::LibXML;
# Module version, derived from the RCS Revision keyword below: the global
# match returns every run of digits in list context, and sprintf formats
# the first two as "major.minor" with a two-digit minor part.
our $VERSION = sprintf "%d.%02d", q$Revision: 0.2 $ =~ /(\d+)/g;
# new()
#   Constructor.  Creates an XML::LibXML parser with validation switched
#   off and silent recovery switched on, and stores it under the 'lx' key
#   of the returned object.
sub new {
    my ($class) = @_;

    my $parser = XML::LibXML->new;
    $parser->validation(0);
    $parser->recover_silently(1);

    my $self = { lx => $parser };
    return bless $self, $class;
}
# html2dom($html [, $encoding])
#   Parses an HTML string and returns whatever the parser's
#   parse_html_string() returns for it.
#
#   $html     - HTML source; it is run through Encode::decode(), so it is
#               expected to be in the given encoding
#   $encoding - name of that encoding; 'iso-8859-1' when omitted or false
sub html2dom {
    my ( $self, $html, $encoding ) = @_;

    $encoding = 'iso-8859-1' unless $encoding;

    # Normalize CRLF and bare CR line endings to LF before decoding.
    $html =~ s/\r\n?/\n/g;

    # decode() hands back a character string (utf8 flag left on).
    my $text = decode( $encoding, $html );

    return $self->{lx}->parse_html_string($text);
}
# dom2xml($dom [, $level])
#   Serializes a parsed HTML document as an XHTML 1.0 Transitional string
#   (XML declaration + DOCTYPE + the serialized <html> element).
#
#   $dom   - document object providing findnodes(); its <html> element and
#            Content-Type <meta> elements are modified in place
#   $level - when greater than 0, _tidy_dom() is applied to the document
#            before serialization; omitted/undef means "no tidying"
#
#   Croaks if the document has no <html> root element.
sub dom2xml {
    my ( $self, $dom, $level ) = @_;

    my $root = $dom->findnodes('/html')->shift;
    if ( !defined $root ) {
        require Carp;
        Carp::croak('dom2xml: document has no <html> root element');
    }
    $root->setAttribute( xmlns => 'http://www.w3.org/1999/xhtml' );

    # The output is declared as UTF-8, so rewrite the Content-Type
    # declaration accordingly.  Other http-equiv metas (refresh,
    # X-UA-Compatible, ...) carry unrelated content and must be left alone.
    for my $meta ( $dom->findnodes('//meta[@http-equiv!=""]') ) {
        next if lc $meta->getAttribute('http-equiv') ne 'content-type';
        $meta->setAttribute( content => 'text/html; charset=utf-8' );
    }

    # $level is optional; guard against undef to avoid a numeric-comparison
    # warning under "use warnings".
    _tidy_dom($dom) if defined $level && $level > 0;

    my $xhtml = $root->toString( 0, 'utf-8' );    # utf8 flag off
    return <<EOT;
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
$xhtml
EOT
}
# html2xml($html [, $encoding [, $level]])
#   Convenience wrapper: parses $html with html2dom() and passes the
#   resulting document, together with $level, to dom2xml().
sub html2xml {
    my $self = shift;
    my ( $html, $encoding, $level ) = @_;

    my $document = $self->html2dom( $html, $encoding );
    return $self->dom2xml( $document, $level );
}
# _tidy_dom($dom)
#   Private helper: cleans up a parsed HTML document in place.
#     - removes attributes whose value is the empty string
#     - gives <script>/<style> a default type and wraps their content in
#       comment markers; empty ones get an empty text child
#     - gives every <img> without an alt attribute one derived from src
#     - gives named anchors an empty text child
#   Returns nothing.
sub _tidy_dom {
    my $dom = shift;

    # remove empty attributes (like <br clear="">)
    for my $node ( $dom->findnodes('//*[attribute::*=""]') ) {
        for my $attr ( $node->attributes ) {
            my $value = $attr->getValue;

            # Compare against '' instead of testing truth, so that a
            # legitimate value of "0" (border="0", value="0") survives.
            next if defined $value && $value ne '';
            $node->removeAttribute( $attr->getName );
        }
    }

    # handle <script>
    # NOTE(review): presumably the content node is serialized as a CDATA
    # section, so the "//" markers comment out its delimiters for script
    # engines -- confirm against the parser's output.
    for my $script ( $dom->findnodes('//script') ) {
        $script->getAttribute('type')
            or $script->setAttribute( type => "text/javascript" );
        if ( $script->hasChildNodes ) {
            $script->insertBefore( $dom->createTextNode("//"),
                $script->firstChild );
            $script->lastChild->appendData("\n//");
        }
        else {    # <script src="..."/> => <script src=""></script>
            $script->appendChild( $dom->createTextNode("") );
        }
    }

    # handle <style>
    # Same idea as <script>, but with /* ... */ markers placed before,
    # inside (at both ends of) and after the content node.
    for my $style ( $dom->findnodes('//style') ) {
        $style->getAttribute('type')
            or $style->setAttribute( type => "text/css" );
        if ( $style->hasChildNodes ) {    # this one is trickier
            $style->insertBefore( $dom->createTextNode("/*"),
                $style->firstChild );
            $style->lastChild->insertData( 0, "*/" );
            $style->lastChild->appendData("/*");
            $style->appendChild( $dom->createTextNode("*/") );
        }
        else {
            $style->appendChild( $dom->createTextNode("") );
        }
    }

    # fix <img>: add an alt attribute where none is present
    for my $img ( $dom->findnodes('//img') ) {

        # Keep an alt text the author already supplied.
        my $existing = $img->getAttribute('alt');
        next if defined $existing && $existing ne '';

        my $alt = $img->getAttribute('src');
        $alt = '' unless defined $alt;    # <img> without src
        $alt =~ s{.*/}{};                 # basename only
        $img->setAttribute( alt => ( $alt ne '' ? $alt : 'img' ) );
    }

    # <a name="foo"/> => <a name="foo"></a>
    for my $anchor ( $dom->findnodes('//a[@name!=""]') ) {
        $anchor->appendChild( $dom->createTextNode("") );
    }

    return;
}
( run in 0.746 second using v1.01-cache-2.11-cpan-524268b4103 )