Algorithm-Diff
view release on metacpan or search on metacpan
package Hunk;
sub new {
# Arg1 is output from &LCS::diff (which corresponds to one Block)
# Arg2 is the number of items (lines, e.g.,) of context around each block
#
# This subroutine changes $File_Length_Difference
#
# Fields in a Hunk:
# blocks - a list of Block objects
# start - index in file 1 where first block of the hunk starts
# end - index in file 1 where last block of the hunk ends
#
# Variables:
# before_diff - how much longer file 2 is than file 1 due to all hunks
# until but NOT including this one
# after_diff - difference due to all hunks including this one
my ($class, $piece, $context_items) = @_;
my $block = new Block ($piece); # this modifies $FLD!
my $before_diff = $File_Length_Difference; # BEFORE this hunk
my $after_diff = $before_diff + $block->{"length_diff"};
$File_Length_Difference += $block->{"length_diff"};
# @remove_array and @insert_array hold the items to insert and remove
# Save the start & beginning of each array. If the array doesn't exist
# though (e.g., we're only adding items in this block), then figure
# out the line number based on the line number of the other file and
# the current difference in file lengths
my @remove_array = $block->remove;
my @insert_array = $block->insert;
my ($a1, $a2, $b1, $b2, $start1, $start2, $end1, $end2);
$a1 = @remove_array ? $remove_array[0 ]->{"item_no"} : -1;
$a2 = @remove_array ? $remove_array[-1]->{"item_no"} : -1;
$b1 = @insert_array ? $insert_array[0 ]->{"item_no"} : -1;
$b2 = @insert_array ? $insert_array[-1]->{"item_no"} : -1;
$start1 = $a1 == -1 ? $b1 - $before_diff : $a1;
$end1 = $a2 == -1 ? $b2 - $after_diff : $a2;
$start2 = $b1 == -1 ? $a1 + $before_diff : $b1;
$end2 = $b2 == -1 ? $a2 + $after_diff : $b2;
# At first, a hunk will have just one Block in it
my $hunk = {
"start1" => $start1,
"start2" => $start2,
"end1" => $end1,
"end2" => $end2,
"blocks" => [$block],
};
bless $hunk, $class;
$hunk->flag_context($context_items);
return $hunk;
}
# Change the "start" and "end" fields to note that context should be added
# to this hunk
sub flag_context {
my ($hunk, $context_items) = @_;
return unless $context_items; # no context
# add context before
my $start1 = $hunk->{"start1"};
my $num_added = $context_items > $start1 ? $start1 : $context_items;
$hunk->{"start1"} -= $num_added;
$hunk->{"start2"} -= $num_added;
# context after
my $end1 = $hunk->{"end1"};
$num_added = ($end1+$context_items > $#f1) ?
$#f1 - $end1 :
$context_items;
$hunk->{"end1"} += $num_added;
$hunk->{"end2"} += $num_added;
}
# Is there an overlap between hunk arg0 and old hunk arg1?
# Note: if end of old hunk is one less than beginning of second, they overlap
sub does_overlap {
my ($hunk, $oldhunk) = @_;
return "" unless $oldhunk; # first time through, $oldhunk is empty
# Do I actually need to test both?
return ($hunk->{"start1"} - $oldhunk->{"end1"} <= 1 ||
$hunk->{"start2"} - $oldhunk->{"end2"} <= 1);
}
# Prepend hunk arg1 to hunk arg0
# Note that arg1 isn't updated! Only arg0 is.
sub prepend_hunk {
my ($hunk, $oldhunk) = @_;
$hunk->{"start1"} = $oldhunk->{"start1"};
$hunk->{"start2"} = $oldhunk->{"start2"};
unshift (@{$hunk->{"blocks"}}, @{$oldhunk->{"blocks"}});
}
# DIFF OUTPUT ROUTINES. THESE ROUTINES CONTAIN DIFF FORMATTING INFO...
sub output_diff {
if (defined $main::opt_u) {&output_unified_diff(@_)}
elsif (defined $main::opt_c) {&output_context_diff(@_)}
else {die "unknown diff"}
sub output_unified_diff {
my ($hunk, $fileref1, $fileref2) = @_;
my @blocklist;
# Calculate item number range.
my $range1 = $hunk->unified_range(1);
my $range2 = $hunk->unified_range(2);
print "@@ -$range1 +$range2 @@\n";
# Outlist starts containing the hunk of file 1.
# Removing an item just means putting a '-' in front of it.
# Inserting an item requires getting it from file2 and splicing it in.
# We splice in $num_added items. Remove blocks use $num_added because
# splicing changed the length of outlist.
# We remove $num_removed items. Insert blocks use $num_removed because
# their item numbers---corresponding to positions in file *2*--- don't take
# removed items into account.
my $low = $hunk->{"start1"};
my $hi = $hunk->{"end1"};
my ($num_added, $num_removed) = (0,0);
my @outlist = @$fileref1[$low..$hi];
map {s/^/ /} @outlist; # assume it's just context
foreach my $block (@{$hunk->{"blocks"}}) {
foreach my $item ($block->remove) {
my $op = $item->{"sign"}; # -
my $offset = $item->{"item_no"} - $low + $num_added;
$outlist[$offset] =~ s/^ /$op/;
$num_removed++;
}
foreach my $item ($block->insert) {
my $op = $item->{"sign"}; # +
my $i = $item->{"item_no"};
my $offset = $i - $hunk->{"start2"} + $num_removed;
splice(@outlist,$offset,0,"$op$$fileref2[$i]");
$num_added++;
}
}
map {s/$/\n/} @outlist; # add \n's
print @outlist;
}
my @blocklist;
print "***************\n";
# Calculate item number range.
my $range1 = $hunk->context_range(1);
my $range2 = $hunk->context_range(2);
# Print out file 1 part for each block in context diff format if there are
# any blocks that remove items
print "*** $range1 ****\n";
my $low = $hunk->{"start1"};
my $hi = $hunk->{"end1"};
if (@blocklist = grep {$_->remove} @{$hunk->{"blocks"}}) {
my @outlist = @$fileref1[$low..$hi];
map {s/^/ /} @outlist; # assume it's just context
foreach my $block (@blocklist) {
my $op = $block->op; # - or !
foreach my $item ($block->remove) {
$outlist[$item->{"item_no"} - $low] =~ s/^ /$op/;
}
}
map {s/$/\n/} @outlist; # add \n's
print @outlist;
}
print "--- $range2 ----\n";
$low = $hunk->{"start2"};
$hi = $hunk->{"end2"};
if (@blocklist = grep {$_->insert} @{$hunk->{"blocks"}}) {
my @outlist = @$fileref2[$low..$hi];
map {s/^/ /} @outlist; # assume it's just context
foreach my $block (@blocklist) {
my $op = $block->op; # + or !
foreach my $item ($block->insert) {
$outlist[$item->{"item_no"} - $low] =~ s/^ /$op/;
}
}
map {s/$/\n/} @outlist; # add \n's
print @outlist;
}
}
sub context_range {
# Generate a range of item numbers to print. Only print 1 number if the range
# has only one item in it. Otherwise, it's 'start,end'
my ($hunk, $flag) = @_;
my ($start, $end) = ($hunk->{"start$flag"},$hunk->{"end$flag"});
$start++; $end++; # index from 1, not zero
my $range = ($start < $end) ? "$start,$end" : $end;
return $range;
}
sub unified_range {
# Generate a range of item numbers to print for unified diff
# Print number where block starts, followed by number of lines in the block
# (don't print number of lines if it's 1)
my ($hunk, $flag) = @_;
my ($start, $end) = ($hunk->{"start$flag"},$hunk->{"end$flag"});
$start++; $end++; # index from 1, not zero
my $length = $end - $start + 1;
my $first = $length < 2 ? $end : $start; # strange, but correct...
my $range = $length== 1 ? $first : "$first,$length";
return $range;
}
} # end Package Hunk
# Package Block. A block is an operation removing, adding, or changing
# a group of items. Basically, this is just a list of changes, where each
# change adds or deletes a single item.
# (Change could be a separate class, but it didn't seem worth it)
{
package Hunk;
sub new {
# Arg1 is output from &LCS::diff (which corresponds to one Block)
# Arg2 is the number of items (lines, e.g.,) of context around each block
#
# This subroutine changes $File_Length_Difference
#
# Fields in a Hunk:
# blocks - a list of Block objects
# start - index in file 1 where first block of the hunk starts
# end - index in file 1 where last block of the hunk ends
#
# Variables:
# before_diff - how much longer file 2 is than file 1 due to all hunks
# until but NOT including this one
# after_diff - difference due to all hunks including this one
my ($class, $piece, $context_items) = @_;
my $block = new Block ($piece); # this modifies $FLD!
my $before_diff = $File_Length_Difference; # BEFORE this hunk
my $after_diff = $before_diff + $block->{"length_diff"};
$File_Length_Difference += $block->{"length_diff"};
# @remove_array and @insert_array hold the items to insert and remove
# Save the start & beginning of each array. If the array doesn't exist
# though (e.g., we're only adding items in this block), then figure
# out the line number based on the line number of the other file and
# the current difference in file lenghts
my @remove_array = $block->remove;
my @insert_array = $block->insert;
my ($a1, $a2, $b1, $b2, $start1, $start2, $end1, $end2);
$a1 = @remove_array ? $remove_array[0 ]->{"item_no"} : -1;
$a2 = @remove_array ? $remove_array[-1]->{"item_no"} : -1;
$b1 = @insert_array ? $insert_array[0 ]->{"item_no"} : -1;
$b2 = @insert_array ? $insert_array[-1]->{"item_no"} : -1;
$start1 = $a1 == -1 ? $b1 - $before_diff : $a1;
$end1 = $a2 == -1 ? $b2 - $after_diff : $a2;
$start2 = $b1 == -1 ? $a1 + $before_diff : $b1;
$end2 = $b2 == -1 ? $a2 + $after_diff : $b2;
# At first, a hunk will have just one Block in it
my $hunk = {
"start1" => $start1,
"start2" => $start2,
"end1" => $end1,
"end2" => $end2,
"blocks" => [$block],
};
bless $hunk, $class;
$hunk->flag_context($context_items);
return $hunk;
}
# Change the "start" and "end" fields to note that context should be added
# to this hunk
sub flag_context {
my ($hunk, $context_items) = @_;
return unless $context_items; # no context
# add context before
my $start1 = $hunk->{"start1"};
my $num_added = $context_items > $start1 ? $start1 : $context_items;
$hunk->{"start1"} -= $num_added;
$hunk->{"start2"} -= $num_added;
# context after
my $end1 = $hunk->{"end1"};
$num_added = ($end1+$context_items > $#f1) ?
$#f1 - $end1 :
$context_items;
$hunk->{"end1"} += $num_added;
$hunk->{"end2"} += $num_added;
}
# Is there an overlap between hunk arg0 and old hunk arg1?
# Note: if end of old hunk is one less than beginning of second, they overlap
sub does_overlap {
my ($hunk, $oldhunk) = @_;
return "" unless $oldhunk; # first time through, $oldhunk is empty
# Do I actually need to test both?
return ($hunk->{"start1"} - $oldhunk->{"end1"} <= 1 ||
$hunk->{"start2"} - $oldhunk->{"end2"} <= 1);
}
# Prepend hunk arg1 to hunk arg0
# Note that arg1 isn't updated! Only arg0 is.
sub prepend_hunk {
my ($hunk, $oldhunk) = @_;
$hunk->{"start1"} = $oldhunk->{"start1"};
$hunk->{"start2"} = $oldhunk->{"start2"};
unshift (@{$hunk->{"blocks"}}, @{$oldhunk->{"blocks"}});
}
# DIFF OUTPUT ROUTINES. THESE ROUTINES CONTAIN DIFF FORMATTING INFO...
sub output_diff {
# First arg is the current hunk of course
# Next args are refs to the files
# last arg is type of diff
# old diff range is just like a context diff range, except the ranges
# are on one line with the action between them.
my $range1 = $hunk->context_range(1);
my $range2 = $hunk->context_range(2);
my $action = $op_hash{$op} || warn "unknown op $op";
print "$range1$action$range2\n";
# If removing anything, just print out all the remove lines in the hunk
# which is just all the remove lines in the block
if ($block->remove) {
my @outlist = @$fileref1[$hunk->{"start1"}..$hunk->{"end1"}];
map {$_ = "< $_\n"} @outlist; # all lines will be '< text\n'
print @outlist;
}
print "---\n" if $op eq '!'; # only if inserting and removing
if ($block->insert) {
my @outlist = @$fileref2[$hunk->{"start2"}..$hunk->{"end2"}];
map {$_ = "> $_\n"} @outlist; # all lines will be '> text\n'
print @outlist;
}
}
sub output_unified_diff {
my ($hunk, $fileref1, $fileref2) = @_;
my @blocklist;
# Calculate item number range.
my $range1 = $hunk->unified_range(1);
my $range2 = $hunk->unified_range(2);
print "@@ -$range1 +$range2 @@\n";
# Outlist starts containing the hunk of file 1.
# Removing an item just means putting a '-' in front of it.
# Inserting an item requires getting it from file2 and splicing it in.
# We splice in $num_added items. Remove blocks use $num_added because
# splicing changed the length of outlist.
# We remove $num_removed items. Insert blocks use $num_removed because
# their item numbers---corresponding to positions in file *2*--- don't take
# removed items into account.
my $low = $hunk->{"start1"};
my $hi = $hunk->{"end1"};
my ($num_added, $num_removed) = (0,0);
my @outlist = @$fileref1[$low..$hi];
map {s/^/ /} @outlist; # assume it's just context
foreach my $block (@{$hunk->{"blocks"}}) {
foreach my $item ($block->remove) {
my $op = $item->{"sign"}; # -
my $offset = $item->{"item_no"} - $low + $num_added;
$outlist[$offset] =~ s/^ /$op/;
$num_removed++;
}
foreach my $item ($block->insert) {
my $op = $item->{"sign"}; # +
my $i = $item->{"item_no"};
my $offset = $i - $hunk->{"start2"} + $num_removed;
splice(@outlist,$offset,0,"$op$$fileref2[$i]");
$num_added++;
}
}
map {s/$/\n/} @outlist; # add \n's
print @outlist;
}
my @blocklist;
print "***************\n";
# Calculate item number range.
my $range1 = $hunk->context_range(1);
my $range2 = $hunk->context_range(2);
# Print out file 1 part for each block in context diff format if there are
# any blocks that remove items
print "*** $range1 ****\n";
my $low = $hunk->{"start1"};
my $hi = $hunk->{"end1"};
if (@blocklist = grep {$_->remove} @{$hunk->{"blocks"}}) {
my @outlist = @$fileref1[$low..$hi];
map {s/^/ /} @outlist; # assume it's just context
foreach my $block (@blocklist) {
my $op = $block->op; # - or !
foreach my $item ($block->remove) {
$outlist[$item->{"item_no"} - $low] =~ s/^ /$op/;
}
}
map {s/$/\n/} @outlist; # add \n's
print @outlist;
}
print "--- $range2 ----\n";
$low = $hunk->{"start2"};
$hi = $hunk->{"end2"};
if (@blocklist = grep {$_->insert} @{$hunk->{"blocks"}}) {
my @outlist = @$fileref2[$low..$hi];
map {s/^/ /} @outlist; # assume it's just context
foreach my $block (@blocklist) {
my $op = $block->op; # + or !
foreach my $item ($block->insert) {
$outlist[$item->{"item_no"} - $low] =~ s/^ /$op/;
}
}
# Calculate item number range.
# old diff range is just like a context diff range, except the ranges
# are on one line with the action between them.
my $range1 = $hunk->context_range(1);
$range1 =~ s/,/ / if $diff_type eq "REVERSE_ED";
my $action = $op_hash{$op} || warn "unknown op $op";
print ($diff_type eq "ED" ? "$range1$action\n" : "$action$range1\n");
if ($block->insert) {
my @outlist = @$fileref2[$hunk->{"start2"}..$hunk->{"end2"}];
map {s/$/\n/} @outlist; # add \n's
print @outlist;
print ".\n"; # end of ed 'c' or 'a' command
}
}
sub context_range {
# Generate a range of item numbers to print. Only print 1 number if the range
# has only one item in it. Otherwise, it's 'start,end'
# Flag is the number of the file (1 or 2)
my ($hunk, $flag) = @_;
my ($start, $end) = ($hunk->{"start$flag"},$hunk->{"end$flag"});
$start++; $end++; # index from 1, not zero
my $range = ($start < $end) ? "$start,$end" : $end;
return $range;
}
sub unified_range {
# Generate a range of item numbers to print for unified diff
# Print number where block starts, followed by number of lines in the block
# (don't print number of lines if it's 1)
my ($hunk, $flag) = @_;
my ($start, $end) = ($hunk->{"start$flag"},$hunk->{"end$flag"});
$start++; $end++; # index from 1, not zero
my $length = $end - $start + 1;
my $first = $length < 2 ? $end : $start; # strange, but correct...
my $range = $length== 1 ? $first : "$first,$length";
return $range;
}
} # end Package Hunk
########
# Package Block. A block is an operation removing, adding, or changing
# a group of items. Basically, this is just a list of changes, where each
# change adds or deletes a single item.
# (Change could be a separate class, but it didn't seem worth it)
htmldiff.pl view on Meta::CPAN
PRE {
margin-left: 24pt;
font-size: 12pt;
font-family: Courier, monospaced;
white-space: pre
}
PRE.onlyA { color: red }
PRE.onlyB { color: blue }
EOS
# Print out the starting HTML
print
# header(),
start_html(
{
-title => "$ARGV[0] vs. $ARGV[1]",
-style => { -code => $style }
}
),
h1(
{ -style => 'margin-left: 24pt' },
span( { -style => 'color: red' }, $ARGV[0] ),
span(" <i>vs.</i> "),
span( { -style => 'color: blue' }, $ARGV[1] )
lib/Algorithm/Diff.pm view on Meta::CPAN
traverse_sequences traverse_balanced
);
# McIlroy-Hunt diff algorithm
# Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
# by Ned Konz, perl@bike-nomad.com
# Updates by Tye McQueen, http://perlmonks.org/?node=tye
# Create a hash that maps each element of $aCollection to the set of
# positions it occupies in $aCollection, restricted to the elements
# within the range of indexes specified by $start and $end.
# The fourth parameter is a subroutine reference that will be called to
# generate a string to use as a key.
# Additional parameters, if any, will be passed to this subroutine.
#
# my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
sub _withPositionsOfInInterval
{
my $aCollection = shift; # array ref
my $start = shift;
my $end = shift;
my $keyGen = shift;
my %d;
my $index;
for ( $index = $start ; $index <= $end ; $index++ )
{
my $element = $aCollection->[$index];
my $key = &$keyGen( $element, @_ );
if ( exists( $d{$key} ) )
{
unshift ( @{ $d{$key} }, $index );
}
else
{
$d{$key} = [$index];
lib/Algorithm/Diff.pm view on Meta::CPAN
This module solves the LCS problem. It also includes a canned function
to generate C<diff>-like output.
It might seem from the example above that the LCS of two sequences is
always pretty obvious, but that's not always the case, especially when
the two sequences have many repeated elements. For example, consider
a x b y c z p d q
a b c a x b y c z
A naive approach might start by matching up the C<a> and C<b> that
appear at the beginning of each sequence, like this:
a x b y c z p d q
a b c a b y c z
This finds the common subsequence C<a b c z>. But actually, the LCS
is C<a x b y c z>:
a x b y c z p d q
a b c a x b y c z
lib/Algorithm/Diff.pm view on Meta::CPAN
Passing in C<undef> for an optional argument is always treated the same
as if no argument were passed in.
=item C<Next>
$pos = $diff->Next(); # Move forward 1 hunk
$pos = $diff->Next( 2 ); # Move forward 2 hunks
$pos = $diff->Next(-5); # Move backward 5 hunks
C<Next> moves the object to point at the next hunk. The object starts
out "reset", which means it isn't pointing at any hunk. If the object
is reset, then C<Next()> moves to the first hunk.
C<Next> returns a true value iff the move didn't go past the last hunk.
So C<Next(0)> will return true iff the object is not reset.
Actually, C<Next> returns the object's new position, which is a number
between 1 and the number of hunks (inclusive), or returns a false value.
=item C<Prev>
lib/Algorithm/Diff.pm view on Meta::CPAN
=item C<Copy>
$copy = $diff->Copy( $newPos, $newBase );
C<Copy> returns a copy of the object. The copy and the original object
share most of their data, so making copies takes very little memory.
The copy maintains its own position (separate from the original), which
is the main purpose of copies. It also maintains its own base.
By default, the copy's position starts out the same as the original
object's position. But C<Copy> takes an optional first argument to set the
new position, so the following three snippets are equivalent:
$copy = $diff->Copy($pos);
$copy = $diff->Copy();
$copy->Reset($pos);
$copy = $diff->Copy()->Reset($pos);
lib/Algorithm/Diff.pm view on Meta::CPAN
C<compact_diff> is much like C<sdiff> except it returns a much more
compact description consisting of just one flat list of indices. An
example helps explain the format:
my @a = qw( a b c e h j l m n p );
my @b = qw( b c d e f j k l m r s t );
@cdiff = compact_diff( \@a, \@b );
# Returns:
# @a @b @a @b
# start start values values
( 0, 0, # =
0, 0, # a !
1, 0, # b c = b c
3, 2, # ! d
3, 3, # e = e
4, 4, # f ! h
5, 5, # j = j
6, 6, # ! k
6, 7, # l m = l m
8, 9, # n p ! r s t
10, 12, #
);
The 0th, 2nd, 4th, etc. entries are all indices into @seq1 (@a in the
above example) indicating where a hunk begins. The 1st, 3rd, 5th, etc.
entries are all indices into @seq2 (@b in the above example) indicating
where the same hunk begins.
So each pair of indices (except the last pair) describes where a hunk
begins (in each sequence). Since each hunk must end at the item just
before the item that starts the next hunk, the next pair of indices can
be used to determine where the hunk ends.
So, the first 4 entries (0..3) describe the first hunk. Entries 0 and 1
describe where the first hunk begins (and so are always both 0).
Entries 2 and 3 describe where the next hunk begins, so subtracting 1
from each tells us where the first hunk ends. That is, the first hunk
contains items C<$diff[0]> through C<$diff[2] - 1> of the first sequence
and contains items C<$diff[1]> through C<$diff[3] - 1> of the second
sequence.
In other words, the first hunk consists of the following two lists of items:
# 1st pair 2nd pair
# of indices of indices
@list1 = @a[ $cdiff[0] .. $cdiff[2]-1 ];
@list2 = @b[ $cdiff[1] .. $cdiff[3]-1 ];
# Hunk start Hunk end
Note that the hunks will always alternate between those that are part of
the LCS (those that contain unchanged items) and those that contain
changes. This means that all we need to be told is whether the first
hunk is a 'same' or 'diff' hunk and we can determine which of the other
hunks contain 'same' items or 'diff' items.
By convention, we always make the first hunk contain unchanged items.
So the 1st, 3rd, 5th, etc. hunks (all odd-numbered hunks if you start
counting from 1) all contain unchanged items. And the 2nd, 4th, 6th,
etc. hunks (all even-numbered hunks if you start counting from 1) all
contain changed items.
Since @a and @b don't begin with the same value, the first hunk in our
example is empty (otherwise we'd violate the above convention). Note
that the first 4 index values in our example are all zero. Plug these
values into our previous code block and we get:
@hunk1a = @a[ 0 .. 0-1 ];
@hunk1b = @b[ 0 .. 0-1 ];
( run in 0.895 second using v1.01-cache-2.11-cpan-0d8aa00de5b )