BioPerl
view release on metacpan or search on metacpan
Bio/Align/ProteinStatistics.pm view on Meta::CPAN
Usage : my $matrix = $pepstats->D_Kimura($aln);
Function: Calculate Kimura protein distance (Kimura 1983) which
approximates PAM distance
D = -ln ( 1 - p - 0.2 * p^2 )
Returns : L<Bio::Matrix::PhylipDist>
Args : L<Bio::Align::AlignI>
=cut
# Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP, Cambridge.
sub D_Kimura{
my ($self,$aln) = @_;
return 0 unless $self->_check_arg($aln);
# ambiguities ignored at this point
my (@seqs,@names,@values,%dist);
my $seqct = 0;
foreach my $seq ( $aln->each_seq) {
push @names, $seq->display_id;
push @seqs, uc $seq->seq();
$seqct++;
}
my $len = $aln->length;
my $precisionstr = "%.$Precision"."f";
for( my $i = 0; $i < $seqct-1; $i++ ) {
# (diagonals) distance is 0 for same sequence
$dist{$names[$i]}->{$names[$i]} = [$i,$i];
$values[$i][$i] = sprintf($precisionstr,0);
for( my $j = $i+1; $j < $seqct; $j++ ) {
my ($scored,$match) = (0,0);
for( my $k=0; $k < $len; $k++ ) {
my $m1 = substr($seqs[$i],$k,1);
my $m2 = substr($seqs[$j],$k,1);
if( $m1 ne '-' && $m2 ne '-' ) {
# score is number of scored bases (alignable bases)
# it could have also come from
# my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
# match is number of matches weighting ambiguity bases
# as well
$match += _check_ambiguity_protein($m1,$m2);
$scored++;
}
}
# From Felsenstein's PHYLIP documentation:
# This is very quick to do but has some obvious
# limitations. It does not take into account which amino
# acids differ or to what amino acids they change, so some
# information is lost. The units of the distance measure
# are fraction of amino acids differing, as also in the
# case of the PAM distance. If the fraction of amino acids
# differing gets larger than 0.8541 the distance becomes
# infinite.
my $D = 1 - ( $match / $scored );
if( $D < 0.8541 ) {
$D = - log ( 1 - $D - (0.2 * ($D ** 2)));
$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
} else {
$values[$j][$i] = $values[$i][$j] = ' NaN';
}
# fwd and rev lookup
$dist{$names[$i]}->{$names[$j]} = [$i,$j];
$dist{$names[$j]}->{$names[$i]} = [$i,$j];
# (diagonals) distance is 0 for same sequence
$dist{$names[$j]}->{$names[$j]} = [$j,$j];
$values[$j][$j] = sprintf($precisionstr,0);
}
}
return Bio::Matrix::PhylipDist->new(-program => 'bioperl_PEPstats',
-matrix => \%dist,
-names => \@names,
-values => \@values);
}
# some methods from EMBOSS distmat
sub _check_ambiguity_protein
{
my ($t1,$t2) = @_;
my $n = 0;
if( $t1 ne 'X' && $t1 eq $t2 ) {
$n = 1.0;
} elsif( (($t1 eq 'B' && $t2 =~ /[DN]/ ) ||
($t2 eq 'B' && $t1 =~ /[DN]/ )) ||
(($t1 eq 'Z' && $t2 =~ /[EQ]/) ||
($t2 eq 'Z' && $t1 =~ /[EQ]/ ))) {
$n = 0.5;
} elsif ( $t1 eq 'X' && $t2 eq 'X' ) {
$n = 0.0025;
} elsif( $t1 eq 'X' || $t2 eq 'X' ) {
$n = 0.05;
}
return $n;
}
=head2 Data Methods
=cut
=head2 pairwise_stats
Title : pairwise_stats
Usage : $obj->pairwise_stats($newval)
Function:
Returns : value of pairwise_stats
Args : newvalue (optional)
=cut
sub pairwise_stats{
my ($self,$value) = @_;
if( defined $value) {
$self->{'_pairwise_stats'} = $value;
}
( run in 0.931 second using v1.01-cache-2.11-cpan-39bf76dae61 )