HTML-Diff

 view release on metacpan or  search on metacpan

lib/HTML/Diff.pm  view on Meta::CPAN

package HTML::Diff;

our $VERSION = '0.60';

use 5.006;
use strict;
use warnings;

use Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(line_diff word_diff html_word_diff);

# This list of tags is taken from the XHTML spec and includes
# all those for which no closing tag is expected. In addition
# the pattern below matches any tag which ends with a slash /

our @UNBALANCED_TAGS = qw(br hr p li base basefont meta link 
			  col colgroup frame input isindex area 
			  embed img bgsound marquee);

use Algorithm::Diff 'sdiff';

sub member {
    my ($item, @list) = @_;

    return scalar(grep {$_ eq $item} @list);
}

sub html_word_diff {
    my ($left, $right) = @_;

    # Split the two texts into words and tags.
    my (@leftchks) = $left =~ m/(<[^>]*>\s*|[^<]+)/gm;
    my (@rightchks) = $right =~ m/(<[^>]*>\s*|[^<]+)/gm;
    
    @leftchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) }
                    @leftchks;
    @rightchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) } 
                     @rightchks;

    # Remove blanks; maybe the above regexes could handle this?
    @leftchks = grep { $_ ne '' } @leftchks;
    @rightchks = grep { $_ ne '' } @rightchks;

    # Now we process each segment by turning it into a pair. The first element
    # is the text as we want it to read in the result. The second element is
    # the value we will to use in comparisons. It contains an identifier
    # for each of the balanced tags that it lies within.

    # This subroutine holds state in the tagstack variable
    my $tagstack = [];
    my $smear_tags = sub {
	if ($_ =~ /^<.*>/) {
	    if ($_ =~ m|^</|) {
		my ($tag) = m|^</\s*([^ \t\n\r>]*)|;
		$tag = lc $tag;
#                print STDERR "Found closer of $tag with " . (scalar @$tagstack) . " stack items\n";
		# If we found the closer for the tag on top 
		# of the stack, pop it off.
		if ((scalar @$tagstack) > 0 && $$tagstack[-1] eq $tag) {
                    my $stacktag = pop @$tagstack;
                }
		return [$_, $tag];
	    } else {
		my ($tag) = m|^<\s*([^\s>]*)|;
		$tag = lc $tag;
#                print STDERR "Found opener of $tag with " . (scalar @$tagstack) . " stack items\n";
		if (member($tag, @UNBALANCED_TAGS) || $tag =~ m#/\s*>$#)
		{	                # (tags without correspond closer tags)
		    return [$_, $tag];
		} else {
		    push @$tagstack, $tag;
		}



( run in 1.603 second using v1.01-cache-2.11-cpan-119454b85a5 )