LCS-Similar

 view release on metacpan or  search on metacpan

README  view on Meta::CPAN


    LCS(\@a,\@b,\&similarity,$threshold)

      Finds a Longest Common Subsequence, taking two arrayrefs as method
      arguments. It returns an array reference of corresponding indices,
      which are represented by 2-element array refs.

      The third argument is the reference of a subroutine comparing two
      elements and returning a number between 0 and 1. Where 0 means
      unequal and 1 means equal.

      Without a subroutine the module falls back to string comparison.

      The fourth argument is a threshold passed to the subroutine.

    max($number1, $number2)

      Returns the maximum of two numbers.

    max3($number1, $number2, $number3)

      Returns the maximum of three numbers.

 EXPORT

    None by design.

EXAMPLES

 Aline two textfiles

      use LCS::Similar;
      use LCS;
    
      binmode(STDOUT,":encoding(UTF-8)");
    
      open(my $in1,"<:encoding(UTF-8)",'file1')
        or die "cannot open file1: $!";
      open(my $in2,"<:encoding(UTF-8)",'file2')
        or die "cannot open file2: $!";
    
      my $lines1 = [<$in1>];
      my $lines2 = [<$in2>];
    
      sub similarity {
        my ($a, $b, $threshold) = @_;
    
        $a //= '';
        $b //= '';
        $threshold //= 0.7;
    
        return 1 if ($a eq $b);
        return 1 if (!$a eq !$b); # avoid division by zero
    
        # length of LCS
        my $llcs = LCS->LLCS(
          [split(//,$a)],
          [split(//,$b)],
        );
    
        # the standard formula
        my $similarity = (2 * $llcs) / (length($a) + length($b));
        return $similarity if ($similarity >= $threshold);
        return 0;
      }
    
      # aligned indices of elements similar more than $threshold 0.5
      my $lcs = LCS::Similar->LCS( $lines1, $lines2, \&similarity, 0.5 );
    
      # map indices and not so similar elements into AoA of lines
      my $aligned = LCS->lcs2align( $lines1, $lines2, $lcs );
    
      # print them
      for my $chunk (@$aligned) {
        print 'a: ',$chunk->[0];
        print 'b: ',$chunk->[1];
        print "\n";
      }

 Aline two words

      use LCS::Similar;
      use LCS;
    
      my $word1 = [ split(//, 'eonnnnnicaio') ];
      my $word1 = [ split(//, 'communicato' ) ];
    
      sub confusable {
        my ($a, $b, $threshold) = @_;
    
        $a //= '';
        $b //= '';
        $threshold //= 0.7;
    
        return 1 if ($a eq $b);
        return 1 if (!$a && !$b);
    
        my $map = {
          'e' => 'c',
          'c' => 'e',
          'm' => 'n',
          'n' => 'm',
          'i' => 't',
          't' => 'i',
        };
    
        return $threshold if (exists $map->{$a} && $map->{$a} eq $b);
        return 0;
      }
    
      my $aligned = [
        LCS->align2strings(
          LCS->lcs2align(
            $word1,
            $word2,
            LCS::Similar->LCS( $word1, $word2, \&confusable, 0.7 )
          )
        )
      ];
      print 'a: ',$aligned->[0],"\n"; # eonnnnnicaio
      print 'b: ',$aligned->[1],"\n"; # commu_nicato



( run in 2.326 seconds using v1.01-cache-2.11-cpan-98e64b0badf )