FAST

 view release on metacpan or  search on metacpan

bin/alnpi  view on Meta::CPAN

}


## THIS FUNCTION WILL BE MAPPED OVER ALIGNMENT COLUMNS TO COUNT STATES
sub get_pattern {
  my @chars = @{ $_[0] };
  my $count = {};
  map {$$count{$_}++} @chars;
  return $count;
}

## THIS APPROXIMATION FOR THE LOG FACTORIAL IS DUE TO RAMANUJAN TAKEN FROM WIKIPEDIA ON MARCH 13 2009
## NEEDS TO BE VERIFIED AND COMPARED TO THE MATH::GSL::SF AND BIGFLOAT FUNCTIONS
## INPUT: INTEGER, FRACTIONAL PART WILL BE FLOORED AWAY
sub Ramanujan_logfact{
  my $lf;
  my $n = shift; 
  $lf = ($n * log $n) - $n;
  $lf += log ($n + (4 * $n ** 2) + (8 * $n ** 3)) / 6;
  $lf += hlPI; #this constant is half-log PI defined at the top
}

sub calculate {
  my $ua = shift;
  my $output = shift;
  if ($output && $output !~ /[pwd]/) { 
    die "$NAME: Error, output (third parameter to -w) must be equal to \'p\', \'w\', or \'d\´.\n"; 
  }

  my $gfs = $ua->gap_free_sites();
  my $gf = FAST::Bio::UnivAln->new('-seqs' => $gfs);
  $L =  $gf->width(); ## NOT my BECAUSE IT IS PRINTED GLOBAL
  my $nseqs = $gf->height(); ## LOCAL DISTINCT VARIABLE BECAUSE OF PAIRWISE MODE
  my $vs = $gf->var_sites();
  my $v =  FAST::Bio::UnivAln->new('-seqs' => $vs);

  $S  = $v->width();
  $s = $S / ($L || 1);
  $Theta_w = $S / $a1;
  $Theta_w_per_site = $Theta_w / ($L || 1);
  return $Theta_w_per_site if ($output && $output eq 'w');

  print join ' ','',$S,"\n" if ($upblue); 
  ## NEW IMPLEMENTATION $opt_u NOT TESTED; REQUIRES PRINTING NUMBER PW DIFF EACH PAIR OF SEQS!?

  my @seqs = split /\n/,$vs;       ## NEWLINES GO AWAY
  my %alleles;
  for my $i (0..$#seqs) {      
    $alleles{$seqs[$i]}++;         ## FOR THE ALLELE CONFIGURATION
  }
  ## COMPUTE PROBABILITY OF ALLELIC CONFIGURATION USING KARLIN MACGREGOR 1972
  $Num_alleles = scalar keys %alleles;
  $H = 1;
  my %a;
  map {$H -= (($_/$nseqs) ** 2)} values %alleles;
  if (!$slide_window and !$pairwise and $Theta_w > 0) {
    $Exp_num_alleles = 1 / $Theta_w;
    for my $i (1 .. ($#seqs )) {
      $Exp_num_alleles += 1 / ($Theta_w + $i);
    }
    $Exp_num_alleles *= $Theta_w; ## Ewen's Sampling formula
    map {$a{$_}++} values %alleles; ## THIS MAKES THE a_i
    my $log_prob   = $Num_alleles * log ($Theta_w);
    #      $log_prob  += Math::Gsl::Sf::sf_lnfact($Num_alleles);
    $log_prob  += Ramanujan_logfact($Num_alleles);
    
    foreach my $i  (0 .. ($nseqs - 1)) {
      $log_prob -= log ($Theta_w + $i);
    }
    foreach (keys %a) {
      #   $log_prob -= ($a{$_} * log ($_)) + Math::Gsl::Sf::sf_lnfact($a{$_});
      $log_prob -= ($a{$_} * log ($_)) + Ramanujan_logfact($a{$_});
    }
    $Prob_allele_partition = exp $log_prob;
  }
  else {
    $Exp_num_alleles = 1;
    $Prob_allele_partition = 1;
  }
  undef %alleles;
  ## end COMPUTE PROBABILITY OF ALLELIC CONFIGURATION USING KARLIN MACGREGOR 1972
  $Pi = 0;
  $eta_S = 0; ## FOR fu li D* and F*
  $eta   = 0;
  my @pats = $v->map_c(\&get_pattern);
  foreach my $pat (@pats) { 
    $eta_S += scalar grep {$_ == 1} values %$pat;
    $eta   += (scalar values %$pat) - 1;
    my $F = 0;
    map {$F += (($_ / $nseqs) ** 2)} values %$pat;
    $Pi += (1 - $F);
  }
  
  $Pi and $Pi *= ($nseqs / ($nseqs - 1)); ## for stochastic and sampling variance see Tajima 93
  ## USED TO BE $nseqs_stat -- for pairwise pi mode. Why?
  $pi = $Pi / ($L || 1); ## nuc diversity 
  return $pi if ($output && $output eq 'p');

  my $Tajima_D_num = $Pi - $Theta_w;
  my $Tajima_D_den = sqrt ( ($e1 * $S) + ($e2 * $S * ($S - 1)));
  $Tajima_D     = $Tajima_D_num / ($Tajima_D_den || 1);
  return $Tajima_D if ($output && $output eq 'd');
  
  if ($nseq > 2) {  
    
    my $cn          = ($nseq == 2 ? 1 : (2 * ((($nseq * $a1) - (2 * ($nseq - 1))) 
					      / (($nseq - 1) * ($nseq - 2))))); ## FU LI eq 14. p 695
    $anp1 = $a1 + (1 / ($nseq));
    
    my $dn          = ($cn + 
		       (($nseq - 2) / (($nseq - 1) ** 2)) + 
		       ((2 / ($nseq - 1)) * 
			(3/2 - (((2 * $anp1) - 3) / ($nseq - 2)) - (1 / $nseq))
		       )
		      ); 
    
    my $v_D_st      = (((($nseq / ($nseq - 1)) ** 2) * $a2) + 
		       (($a1 ** 2) * $dn) - 
		       ((2 * $nseq * $a1 * ($a1 + 1)) / (($nseq - 1) ** 2)))
      /
	( ($a1 ** 2) + $a2 );



( run in 0.588 second using v1.01-cache-2.11-cpan-39bf76dae61 )