BioPerl

 view release on metacpan or  search on metacpan

Bio/Align/ProteinStatistics.pm  view on Meta::CPAN

 Usage   : my $matrix = $pepstats->D_Kimura($aln);
 Function: Calculate Kimura protein distance (Kimura 1983) which 
           approximates PAM distance
           D = -ln ( 1 - p - 0.2 * p^2 )
 Returns : L<Bio::Matrix::PhylipDist>
 Args    : L<Bio::Align::AlignI>


=cut

# Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP, Cambridge.

sub D_Kimura{
   my ($self,$aln) = @_;
   return 0 unless $self->_check_arg($aln);
   # ambiguities ignored at this point
   my (@seqs,@names,@values,%dist);
   my $seqct = 0;
   foreach my $seq ( $aln->each_seq) {
       push @names, $seq->display_id;
       push @seqs, uc $seq->seq();
       $seqct++;
   }
   my $len = $aln->length;
   my $precisionstr = "%.$Precision"."f";

   for( my $i = 0; $i < $seqct-1; $i++ ) {
       # (diagonals) distance is 0 for same sequence
       $dist{$names[$i]}->{$names[$i]} = [$i,$i];
       $values[$i][$i] = sprintf($precisionstr,0);
       for( my $j = $i+1; $j < $seqct; $j++ ) {
	   my ($scored,$match) = (0,0);
	   for( my $k=0; $k < $len; $k++ ) {
	       my $m1 = substr($seqs[$i],$k,1);
	       my $m2 = substr($seqs[$j],$k,1);
	       if( $m1 ne '-' && $m2 ne '-' ) {
		   # score is number of scored bases (alignable bases)
		   # it could have also come from 
		   # my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
		   # match is number of matches weighting ambiguity bases
		   # as well
		   $match += _check_ambiguity_protein($m1,$m2);
		   $scored++;
	       }
	   }
	   # From Felsenstein's PHYLIP documentation:
	   # This is very quick to do but has some obvious
	   # limitations. It does not take into account which amino
	   # acids differ or to what amino acids they change, so some
	   # information is lost. The units of the distance measure
	   # are fraction of amino acids differing, as also in the
	   # case of the PAM distance. If the fraction of amino acids
	   # differing gets larger than 0.8541 the distance becomes
	   # infinite.

	   my $D = 1 - ( $match / $scored );
	   if( $D < 0.8541 ) {
	       $D = - log ( 1 - $D - (0.2 * ($D ** 2)));
	       $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
	   } else { 
	       $values[$j][$i] = $values[$i][$j] = '    NaN';
	   }
	   # fwd and rev lookup
	   $dist{$names[$i]}->{$names[$j]} = [$i,$j];
	   $dist{$names[$j]}->{$names[$i]} = [$i,$j];	   

           # (diagonals) distance is 0 for same sequence
	   $dist{$names[$j]}->{$names[$j]} = [$j,$j];	   
	   $values[$j][$j] = sprintf($precisionstr,0); 

       }
   }
   return Bio::Matrix::PhylipDist->new(-program => 'bioperl_PEPstats',
				       -matrix  => \%dist,
				       -names   => \@names,
				       -values  => \@values); 
   
}

# some methods from EMBOSS distmat
sub _check_ambiguity_protein
{
    my ($t1,$t2) = @_;
    my $n = 0;

    if( $t1 ne 'X' && $t1 eq $t2 ) { 
        $n = 1.0;
    } elsif(  (($t1 eq 'B' && $t2 =~ /[DN]/ ) ||
	       ($t2 eq 'B' && $t1 =~ /[DN]/ )) ||
	      
	      (($t1 eq 'Z' && $t2 =~ /[EQ]/) ||
	       ($t2 eq 'Z' && $t1 =~ /[EQ]/ ))) {
        $n = 0.5;
    } elsif ( $t1 eq 'X' && $t2 eq 'X' ) {
        $n = 0.0025;
    } elsif(  $t1 eq 'X' || $t2 eq 'X' ) {
        $n = 0.05;
    }
    return $n;
}

=head2 Data Methods

=cut

=head2 pairwise_stats

 Title   : pairwise_stats
 Usage   : $obj->pairwise_stats($newval)
 Function: 
 Returns : value of pairwise_stats
 Args    : newvalue (optional)


=cut

sub pairwise_stats{
   my ($self,$value) = @_;
   if( defined $value) {
      $self->{'_pairwise_stats'} = $value;
    }



( run in 0.931 second using v1.01-cache-2.11-cpan-39bf76dae61 )