BioPerl
view release on metacpan or search on metacpan
Bio/Align/DNAStatistics.pm view on Meta::CPAN
############## primary internal subs for alignment comparisons ########################
sub _run_bootstrap {
### generates sampled sequences, calculates Ds and Dn values,
### then calculates variance of sampled sequences and add results to results hash
###
my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
my @seqs = @$seq_ref;
my @btstrp_aoa; # to hold array of array of nucleotides for resampling
my %bootstrap_values = (ds => [], dn =>[]); # to hold list of av values
#1st make alternative array of codons;
my $c = 0;
while ($c < length $seqs[0]{'seq'}) {
for (0..$#seqs) {
push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
}
$c+=3;
}
for (1..$bootstrap_rpt) {
my $sampled = _resample (\@btstrp_aoa);
my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
push @{$bootstrap_values{'ds'}}, $ds;
push @{$bootstrap_values{'dn'}}, $dn;
}
$results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
$results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
$results->{'z_score'} = ($results->{'D_n'} - $results->{'D_s'}) /
sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
#print "bootstrapped var_syn = $results->{'D_s_var'} \n" ;
#print "bootstrapped var_nc = $results->{'D_n_var'} \n";
#print "z is $results->{'z_score'}\n"; ### end of global set up of/perm look up data
}
sub _resample {
my $ref = shift;
my $codon_num = scalar (@{$ref->[0]});
my @altered;
for (0..$codon_num -1) { #for each codon
my $rand = int (rand ($codon_num));
for (0..$#$ref) {
push @{$altered[$_]}, $ref->[$_][$rand];
}
}
my @stringed = map {join '', @$_}@altered;
my @return;
#now out in random name to keep other subs happy
for (@stringed) {
push @return, {id=>'1', seq=> $_};
}
return \@return;
}
sub _get_av_ds_dn {
# takes array of hashes of sequence strings and ids #
my $self = shift;
my $seq_ref = shift;
my $result = shift if @_;
my @caller = caller(1);
my @seqarray = @$seq_ref;
my $bootstrap_score_list;
#for a multiple alignment considers all pairwise combinations#
my %dsfor_average = (ds => [], dn => []);
for (my $i = 0; $i < scalar @seqarray; $i++) {
for (my $j = $i +1; $j<scalar @seqarray; $j++ ){
# print "comparing $i and $j\n";
if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
$self->warn(" aligned sequences must be of equal length!");
next;
}
my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
# print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
#get averages
my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
#calculate ps and pn (p54)
my $syn_prop = $syn_count / $av_s_site;
my $nc_prop = $non_syn_count / $av_ns_syn_site ;
#now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
my $d_syn = $self->jk($syn_prop);
my $d_nc = $self->jk($nc_prop);
#JK calculation must succeed for continuation of calculation
#ret_value = -1 if error
next unless $d_nc >=0 && $d_syn >=0;
push @{$dsfor_average{'ds'}}, $d_syn;
push @{$dsfor_average{'dn'}}, $d_nc;
#if not doing bootstrap, calculate the pairwise comparisin stats
if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
#now calculate variances assuming large sample
my $d_syn_var = jk_var($syn_prop, length($seqarray[$i]{'seq'}) - $gap_cnt );
my $d_nc_var = jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
#now calculate z_value
#print "d_syn_var is $d_syn_var,and d_nc_var is $d_nc_var\n";
#my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
my $z = ($d_syn_var + $d_nc_var) ?
($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
# print "z is $z\n";
push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
S_d => $syn_count, N_d =>$non_syn_count,
P_s => $syn_prop, P_n=>$nc_prop,
D_s => @{$dsfor_average{'ds'}}[-1],
D_n => @{$dsfor_average{'dn'}}[-1],
D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
Seq1 => $seqarray[$i]{'id'},
Seq2 => $seqarray[$j]{'id'},
z_score => $z,
};
$self->warn (" number of mutations too small to justify normal test for $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
}#endif
( run in 2.454 seconds using v1.01-cache-2.11-cpan-75ffa21a3d4 )