HTML-Diff
view release on metacpan or search on metacpan
lib/HTML/Diff.pm view on Meta::CPAN
package HTML::Diff;
our $VERSION = '0.60';
use 5.006;
use strict;
use warnings;
use Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(line_diff word_diff html_word_diff);
# This list of tags is taken from the XHTML spec and includes
# all those for which no closing tag is expected. In addition
# the pattern below matches any tag which ends with a slash /
our @UNBALANCED_TAGS = qw(br hr p li base basefont meta link
col colgroup frame input isindex area
embed img bgsound marquee);
use Algorithm::Diff 'sdiff';
sub member {
my ($item, @list) = @_;
return scalar(grep {$_ eq $item} @list);
}
sub html_word_diff {
my ($left, $right) = @_;
# Split the two texts into words and tags.
my (@leftchks) = $left =~ m/(<[^>]*>\s*|[^<]+)/gm;
my (@rightchks) = $right =~ m/(<[^>]*>\s*|[^<]+)/gm;
@leftchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) }
@leftchks;
@rightchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) }
@rightchks;
# Remove blanks; maybe the above regexes could handle this?
@leftchks = grep { $_ ne '' } @leftchks;
@rightchks = grep { $_ ne '' } @rightchks;
# Now we process each segment by turning it into a pair. The first element
# is the text as we want it to read in the result. The second element is
# the value we will to use in comparisons. It contains an identifier
# for each of the balanced tags that it lies within.
# This subroutine holds state in the tagstack variable
my $tagstack = [];
my $smear_tags = sub {
if ($_ =~ /^<.*>/) {
if ($_ =~ m|^</|) {
my ($tag) = m|^</\s*([^ \t\n\r>]*)|;
$tag = lc $tag;
# print STDERR "Found closer of $tag with " . (scalar @$tagstack) . " stack items\n";
# If we found the closer for the tag on top
# of the stack, pop it off.
if ((scalar @$tagstack) > 0 && $$tagstack[-1] eq $tag) {
my $stacktag = pop @$tagstack;
}
return [$_, $tag];
} else {
my ($tag) = m|^<\s*([^\s>]*)|;
$tag = lc $tag;
# print STDERR "Found opener of $tag with " . (scalar @$tagstack) . " stack items\n";
if (member($tag, @UNBALANCED_TAGS) || $tag =~ m#/\s*>$#)
{ # (tags without correspond closer tags)
return [$_, $tag];
} else {
push @$tagstack, $tag;
}
( run in 1.603 second using v1.01-cache-2.11-cpan-119454b85a5 )