BioPerl

 view release on metacpan or  search on metacpan

Bio/Align/DNAStatistics.pm  view on Meta::CPAN

############## primary internal subs for alignment comparisons ########################

sub _run_bootstrap {
    ### generates sampled sequences, calculates Ds and Dn values,
    ### then calculates variance of sampled sequences and add results to results hash
    ### 
    my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;	
    my @seqs = @$seq_ref;
    my @btstrp_aoa; # to hold array of array of nucleotides for resampling
    my %bootstrap_values = (ds => [], dn =>[]);	# to hold list of av values 

    #1st make alternative array of codons;
    my $c = 0;
    while ($c < length $seqs[0]{'seq'}) {
	for (0..$#seqs) {
	    push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
	}
	$c+=3;
    }

    for (1..$bootstrap_rpt) {
	my $sampled = _resample (\@btstrp_aoa);
	my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
	push @{$bootstrap_values{'ds'}}, $ds;
	push @{$bootstrap_values{'dn'}}, $dn;
    }	

    $results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
    $results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
    $results->{'z_score'} = 	($results->{'D_n'} - $results->{'D_s'}) / 
	sqrt($results->{'D_s_var'} + $results->{'D_n_var'} ); 
    #print "bootstrapped var_syn = 	$results->{'D_s_var'} \n" ;
    #print "bootstrapped var_nc = 	$results->{'D_n_var'} \n"; 
    #print "z is $results->{'z_score'}\n";	### end of global set up of/perm look up data
}

sub _resample {
    my $ref = shift;
    my $codon_num = scalar (@{$ref->[0]});
    my @altered;
    for (0..$codon_num -1) {	#for each codon
	my $rand = int (rand ($codon_num));
	for (0..$#$ref) {
	    push @{$altered[$_]}, $ref->[$_][$rand];
	}
    }
    my @stringed = map {join '', @$_}@altered;
    my @return;
    #now out in random name to keep other subs happy
    for (@stringed) {
	push @return, {id=>'1', seq=> $_};
    }
    return \@return;
}

sub _get_av_ds_dn {
    # takes array of hashes of sequence strings and ids   #
    my $self = shift;
    my $seq_ref = shift;
    my $result = shift if @_;
    my @caller = caller(1);
    my @seqarray = @$seq_ref;
    my $bootstrap_score_list;
    #for a multiple alignment considers all pairwise combinations#
    my %dsfor_average = (ds => [], dn => []); 
    for (my $i = 0; $i < scalar @seqarray; $i++) {
	for (my $j = $i +1; $j<scalar @seqarray; $j++ ){
#			print "comparing $i and $j\n";
	    if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
		$self->warn(" aligned sequences must be of equal length!");
		next;
	    }

	    my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
	    my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
#			print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
	    my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});	
	    #get averages
	    my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
	    my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;

	    #calculate ps and pn  (p54)
	    my $syn_prop = $syn_count / $av_s_site;
	    my $nc_prop = $non_syn_count / $av_ns_syn_site	;

	    #now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
	    my $d_syn = $self->jk($syn_prop);
	    my $d_nc = $self->jk($nc_prop);

	    #JK calculation must succeed for continuation of calculation
	    #ret_value = -1 if error
	    next unless $d_nc >=0 && $d_syn >=0;


	    push @{$dsfor_average{'ds'}}, $d_syn;
	    push @{$dsfor_average{'dn'}}, $d_nc;

	    #if not doing bootstrap, calculate the pairwise comparisin stats
	    if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
				#now calculate variances assuming large sample
		my $d_syn_var =  jk_var($syn_prop, length($seqarray[$i]{'seq'})  - $gap_cnt );
		my $d_nc_var =  jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
		#now calculate z_value
		#print "d_syn_var is  $d_syn_var,and d_nc_var is $d_nc_var\n";
		#my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
		my $z = ($d_syn_var + $d_nc_var) ? 
		  ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
		#	print "z is $z\n";
		push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
				 S_d => $syn_count, N_d =>$non_syn_count,
				 P_s => $syn_prop, P_n=>$nc_prop,
				 D_s => @{$dsfor_average{'ds'}}[-1],
				 D_n => @{$dsfor_average{'dn'}}[-1],
				 D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
				 Seq1 => $seqarray[$i]{'id'},
				 Seq2 => $seqarray[$j]{'id'},
				 z_score => $z,
			     };
		$self->warn (" number of mutations too small to justify normal test for  $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
		    if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
	    }#endif



( run in 2.454 seconds using v1.01-cache-2.11-cpan-75ffa21a3d4 )