LCS-Similar
view release on metacpan or search on metacpan
LCS(\@a,\@b,\&similarity,$threshold)
Finds a Longest Common Subsequence, taking two arrayrefs as method
arguments. It returns an array reference of corresponding indices,
which are represented by 2-element array refs.
The third argument is the reference of a subroutine comparing two
elements and returning a number between 0 and 1. Where 0 means
unequal and 1 means equal.
Without a subroutine the module falls back to string comparison.
The fourth argument is a threshold passed to the subroutine.
max($number1, $number2)
Returns the maximum of two numbers.
max3($number1, $number2, $number3)
Returns the maximum of three numbers.
EXPORT
None by design.
EXAMPLES
Aline two textfiles
use LCS::Similar;
use LCS;
binmode(STDOUT,":encoding(UTF-8)");
open(my $in1,"<:encoding(UTF-8)",'file1')
or die "cannot open file1: $!";
open(my $in2,"<:encoding(UTF-8)",'file2')
or die "cannot open file2: $!";
my $lines1 = [<$in1>];
my $lines2 = [<$in2>];
sub similarity {
my ($a, $b, $threshold) = @_;
$a //= '';
$b //= '';
$threshold //= 0.7;
return 1 if ($a eq $b);
return 1 if (!$a eq !$b); # avoid division by zero
# length of LCS
my $llcs = LCS->LLCS(
[split(//,$a)],
[split(//,$b)],
);
# the standard formula
my $similarity = (2 * $llcs) / (length($a) + length($b));
return $similarity if ($similarity >= $threshold);
return 0;
}
# aligned indices of elements similar more than $threshold 0.5
my $lcs = LCS::Similar->LCS( $lines1, $lines2, \&similarity, 0.5 );
# map indices and not so similar elements into AoA of lines
my $aligned = LCS->lcs2align( $lines1, $lines2, $lcs );
# print them
for my $chunk (@$aligned) {
print 'a: ',$chunk->[0];
print 'b: ',$chunk->[1];
print "\n";
}
Aline two words
use LCS::Similar;
use LCS;
my $word1 = [ split(//, 'eonnnnnicaio') ];
my $word1 = [ split(//, 'communicato' ) ];
sub confusable {
my ($a, $b, $threshold) = @_;
$a //= '';
$b //= '';
$threshold //= 0.7;
return 1 if ($a eq $b);
return 1 if (!$a && !$b);
my $map = {
'e' => 'c',
'c' => 'e',
'm' => 'n',
'n' => 'm',
'i' => 't',
't' => 'i',
};
return $threshold if (exists $map->{$a} && $map->{$a} eq $b);
return 0;
}
my $aligned = [
LCS->align2strings(
LCS->lcs2align(
$word1,
$word2,
LCS::Similar->LCS( $word1, $word2, \&confusable, 0.7 )
)
)
];
print 'a: ',$aligned->[0],"\n"; # eonnnnnicaio
print 'b: ',$aligned->[1],"\n"; # commu_nicato
( run in 2.326 seconds using v1.01-cache-2.11-cpan-98e64b0badf )