FAST
view release on metacpan or search on metacpan
}
## THIS FUNCTION WILL BE MAPPED OVER ALIGNMENT COLUMNS TO COUNT STATES
sub get_pattern {
my @chars = @{ $_[0] };
my $count = {};
map {$$count{$_}++} @chars;
return $count;
}
## THIS APPROXIMATION FOR THE LOG FACTORIAL IS DUE TO RAMANUJAN TAKEN FROM WIKIPEDIA ON MARCH 13 2009
## NEEDS TO BE VERIFIED AND COMPARED TO THE MATH::GSL::SF AND BIGFLOAT FUNCTIONS
## INPUT: INTEGER, FRACTIONAL PART WILL BE FLOORED AWAY
sub Ramanujan_logfact{
my $lf;
my $n = shift;
$lf = ($n * log $n) - $n;
$lf += log ($n + (4 * $n ** 2) + (8 * $n ** 3)) / 6;
$lf += hlPI; #this constant is half-log PI defined at the top
}
sub calculate {
my $ua = shift;
my $output = shift;
if ($output && $output !~ /[pwd]/) {
die "$NAME: Error, output (third parameter to -w) must be equal to \'p\', \'w\', or \'d\´.\n";
}
my $gfs = $ua->gap_free_sites();
my $gf = FAST::Bio::UnivAln->new('-seqs' => $gfs);
$L = $gf->width(); ## NOT my BECAUSE IT IS PRINTED GLOBAL
my $nseqs = $gf->height(); ## LOCAL DISTINCT VARIABLE BECAUSE OF PAIRWISE MODE
my $vs = $gf->var_sites();
my $v = FAST::Bio::UnivAln->new('-seqs' => $vs);
$S = $v->width();
$s = $S / ($L || 1);
$Theta_w = $S / $a1;
$Theta_w_per_site = $Theta_w / ($L || 1);
return $Theta_w_per_site if ($output && $output eq 'w');
print join ' ','',$S,"\n" if ($upblue);
## NEW IMPLEMENTATION $opt_u NOT TESTED; REQUIRES PRINTING NUMBER PW DIFF EACH PAIR OF SEQS!?
my @seqs = split /\n/,$vs; ## NEWLINES GO AWAY
my %alleles;
for my $i (0..$#seqs) {
$alleles{$seqs[$i]}++; ## FOR THE ALLELE CONFIGURATION
}
## COMPUTE PROBABILITY OF ALLELIC CONFIGURATION USING KARLIN MACGREGOR 1972
$Num_alleles = scalar keys %alleles;
$H = 1;
my %a;
map {$H -= (($_/$nseqs) ** 2)} values %alleles;
if (!$slide_window and !$pairwise and $Theta_w > 0) {
$Exp_num_alleles = 1 / $Theta_w;
for my $i (1 .. ($#seqs )) {
$Exp_num_alleles += 1 / ($Theta_w + $i);
}
$Exp_num_alleles *= $Theta_w; ## Ewen's Sampling formula
map {$a{$_}++} values %alleles; ## THIS MAKES THE a_i
my $log_prob = $Num_alleles * log ($Theta_w);
# $log_prob += Math::Gsl::Sf::sf_lnfact($Num_alleles);
$log_prob += Ramanujan_logfact($Num_alleles);
foreach my $i (0 .. ($nseqs - 1)) {
$log_prob -= log ($Theta_w + $i);
}
foreach (keys %a) {
# $log_prob -= ($a{$_} * log ($_)) + Math::Gsl::Sf::sf_lnfact($a{$_});
$log_prob -= ($a{$_} * log ($_)) + Ramanujan_logfact($a{$_});
}
$Prob_allele_partition = exp $log_prob;
}
else {
$Exp_num_alleles = 1;
$Prob_allele_partition = 1;
}
undef %alleles;
## end COMPUTE PROBABILITY OF ALLELIC CONFIGURATION USING KARLIN MACGREGOR 1972
$Pi = 0;
$eta_S = 0; ## FOR fu li D* and F*
$eta = 0;
my @pats = $v->map_c(\&get_pattern);
foreach my $pat (@pats) {
$eta_S += scalar grep {$_ == 1} values %$pat;
$eta += (scalar values %$pat) - 1;
my $F = 0;
map {$F += (($_ / $nseqs) ** 2)} values %$pat;
$Pi += (1 - $F);
}
$Pi and $Pi *= ($nseqs / ($nseqs - 1)); ## for stochastic and sampling variance see Tajima 93
## USED TO BE $nseqs_stat -- for pairwise pi mode. Why?
$pi = $Pi / ($L || 1); ## nuc diversity
return $pi if ($output && $output eq 'p');
my $Tajima_D_num = $Pi - $Theta_w;
my $Tajima_D_den = sqrt ( ($e1 * $S) + ($e2 * $S * ($S - 1)));
$Tajima_D = $Tajima_D_num / ($Tajima_D_den || 1);
return $Tajima_D if ($output && $output eq 'd');
if ($nseq > 2) {
my $cn = ($nseq == 2 ? 1 : (2 * ((($nseq * $a1) - (2 * ($nseq - 1)))
/ (($nseq - 1) * ($nseq - 2))))); ## FU LI eq 14. p 695
$anp1 = $a1 + (1 / ($nseq));
my $dn = ($cn +
(($nseq - 2) / (($nseq - 1) ** 2)) +
((2 / ($nseq - 1)) *
(3/2 - (((2 * $anp1) - 3) / ($nseq - 2)) - (1 / $nseq))
)
);
my $v_D_st = (((($nseq / ($nseq - 1)) ** 2) * $a2) +
(($a1 ** 2) * $dn) -
((2 * $nseq * $a1 * ($a1 + 1)) / (($nseq - 1) ** 2)))
/
( ($a1 ** 2) + $a2 );
( run in 0.588 second using v1.01-cache-2.11-cpan-39bf76dae61 )