Bio-MaxQuant-Evidence-Statistics
view release on metacpan or search on metacpan
lib/Bio/MaxQuant/Evidence/Statistics.pm view on Meta::CPAN
mean2 => $d2->{median},
usv1 => $d1->{usv_mad},
usv2 => $d2->{usv_mad},
n1 => $d1->{n},
n2 => $d2->{n},
);
$tm->{p} = ($d1->{n} && $d2->{n}) ? Statistics::Distributions::tprob(int ($tm->{df}), $tm->{t}) : '';
my $r = {
stats1 => $d1, stats2 => $d2, ttest => $tt, ttest_mad => $tm
};
$o->{cache}->{ttests}->{$cachekey} = $r;
return $r;
}
=head2 welchs_ttest
performs Welch's ttest, given mean1, mean2, usv1, usv2, n1 and n2 in a hash.
e.g.
$o->welchs_ttest( mean1 => 4, mean2 => 3, # sample mean
usv1 => 1, usv2 => 1.1, # unbiased sample variance (returned as usv from $o->sd
n1 => 4, n2=> 7 # number of observations
also performs Welch-Satterthwaite to calculate degrees of freedom (to look up in t-statistic table)
Returns hashref containing t and df.
=cut
sub welchs_ttest {
my ($o, %t) = @_;
my ($x1,$x2,$v1,$v2,$n1,$n2) = map {$t{$_}} qw/mean1 mean2 usv1 usv2 n1 n2/;
my ($vn1,$vn2) = ($v1/$n1, $v2/$n2);
my $t = abs($x1 - $x2) / sqrt( $vn1 + $vn2 );
my $df = ($vn1 + $vn2)**2 / ( $vn1**2/($n1-1) + $vn2**2/($n2-1) );
return {t => $t, df => $df};
}
=head2 replicateMedianSubtractions
Logs data, if not already done, calculates median for each replicate, and subtracts median from each evidence in that replicate.
=cut
sub replicateMedianSubtractions {
my ($o, %opts) = @_; # can set filter here
$o->logRatios();
foreach my $replicate($o->experiments()){
my $median = $o->replicateMedian(%opts, experiment=>$replicate);
my $p = $o->filter(experiment=>$replicate);
foreach my $pgid(keys %{$p->{data}->{$replicate}}){
foreach my $i(0.. $#{$p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}}){
if($p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] =~ /\d/){
$p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] -= $median;
}
}
}
}
# i guess we should do something better with generating this status:
return 1;
}
=head2 median
given a list of numbers, returns the median... assumes all items are numbers!
=cut
sub median {
my $o = shift;
my @list = sort {$a <=> $b} @_;
my $n = scalar @list;
if($n % 2){ # remainder on division by two -> it's odd!
return $list[($n-1)/2]; # index of last over 2, e.g. 21 items, last index 20, return 10.
}
else { # it's not odd... so it's even
return ($list[$n/2 - 1] + $list[$n/2]) / 2; # length over 2 and the same minus 1, e.g. 20 items, we want 9 and 10.
}
}
=head2 experimentMaximumPvalue
=cut
sub experimentMaximumPvalue {
my ($o,%opts) = @_;
# run through experiments and collect replicate names for comparisons...
# this should be filtered for individual proteins using the leadingProteins option.
my @reps1 = ();
my @reps2 = ();
foreach my $rep($o->experiments){
if($rep =~ /^$opts{experiment1}/){
push @reps1, $rep;
}
if($rep =~ /^$opts{experiment2}/){
push @reps2, $rep;
}
}
# now there must be enough replicates with enough observations in each...
$opts{minimum_observations} = 2 unless exists $opts{minimum_observations};
$opts{minimum_replicates} = 2 unless exists $opts{minimum_replicates};
my $reps1 = 0;
my $reps2 = 0;
foreach my $rep(@reps1){
my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
my @values = $f->extractColumnValues(column => 'Ratio H/L');
$reps1 ++ if scalar(@values) > $opts{minimum_observations};
}
foreach my $rep(@reps2){
my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
my @values = $f->extractColumnValues(column => 'Ratio H/L', nodup => 0);
$reps2 ++ if scalar(@values) > $opts{minimum_observations};
}
return {p_max => -1, p_mad_max => -1} if $reps1 < $opts{minimum_replicates} || $reps2 < $opts{minimum_replicates};
# compare each combination of replicates
my $p_max = 0;
my $p_mad_max = 0;
( run in 2.645 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )