Bio-MaxQuant-Evidence-Statistics
view release on metacpan or search on metacpan
lib/Bio/MaxQuant/Evidence/Statistics.pm view on Meta::CPAN
=cut
sub logRatios {
my $o = shift;
return 0 if $o->{logged};
$o->{logged} = 1;
my $data = $o->{data};
foreach my $exptname(keys %$data){
my $experiment = $data->{$exptname};
foreach my $proteinGroupId(keys %$experiment){
my $proteinGroup = $experiment->{$proteinGroupId};
my $ratios = $proteinGroup->{'Ratio H/L'};
my @newRatios = ();
foreach (0..$#$ratios){
$ratios->[$_] = $ratios->[$_] =~ /^\d+\.?\d*$/
? log($ratios->[$_])/log(2)
: '';
}
}
}
return 1;
}
=head2 filter
returns a set of protein records based on filter parameters...
=head3 options
=over
=item experiment - regular expression to match experiment name
=item proteinGroupId - regular expression to match protein group id
=item leadingProteins - regular expression to match leading protein ids
=item notLeadingProteins - regular expression to not match leading protein ids
=back
Returns a filtered object of the same type, with relevant flags set (e.g. whether
data has been logged, etc).
Warning, intentionally does not perform a deep clone!
=cut
sub filter {
my ($o,%opts) = @_;
# options :
# use Data::Dumper;
# print STDERR 'OPTS: ', Dumper \%opts;
my $data = $o->{data};
my $result = {};
foreach my $experiment(keys %$data){
if(! exists $opts{experiment} || $experiment =~ /$opts{experiment}/){
$result->{$experiment} = {};
my $exptdata = $data->{$experiment};
foreach my $pgid(keys %$exptdata){
if(! exists $opts{proteinGroupId} || $pgid =~ /$opts{proteinGroupId}/){
my $pgdata = $exptdata->{$pgid};
if(! exists $opts{leadingProteins} || $pgdata->{'Leading Proteins'} =~ /$opts{leadingProteins}/){
if(! exists $opts{notLeadingProteins} || $pgdata->{'Leading Proteins'} !~ /$opts{notLeadingProteins}/){
$result->{$experiment}->{$pgid} = $pgdata;
}
}
}
}
}
}
# print STDERR Dumper $result if $opts{experiment} eq qr/^LCC1.nE.r2$/;
my $p = $o->new;
%$p = %$o;
$p->{data} = $result;
$o->{lastfiltered} = $p;
return $p;
}
=head2 replicateMedian
options are passed to filter.
=cut
sub replicateMedian {
my ($o,%opts) = @_;
my $f = $o->filter(%opts);
return $f->median(
$f->extractColumnValues(
column => 'Ratio H/L',
nodup => 0,
)
);
}
=head2 deviations
returns an hashref with the following keys
=over
=item n - the number of items
=item sd - the standard deviation (from the mean)
=item mad - the median absolute deviation (from the median)
=item sd_via_mad - the standard deviation estimated from the median absolute deviation
=back
=cut
sub deviations {
my ($o,%opts) = @_;
# cache
$o->{cache}->{deviations} = {} unless exists $o->{cache}->{deviations};
my $cachekey = join('::', map {"$_=$opts{$_}"} sort keys %opts);
# print STDERR "$cachekey\n";
return $o->{cache}->{deviations}->{$cachekey} if exists $o->{cache}->{deviations}->{$cachekey};
##
my $f = $o->filter(%opts);
my @values = $f->extractColumnValues(
column => 'Ratio H/L',
lib/Bio/MaxQuant/Evidence/Statistics.pm view on Meta::CPAN
usv1 => $d1->{usv},
usv2 => $d2->{usv},
n1 => $d1->{n},
n2 => $d2->{n},
);
$tt->{p} = ($d1->{n} && $d2->{n}) ? Statistics::Distributions::tprob(int ($tt->{df}), $tt->{t}) : '';
my $tm = $o->welchs_ttest(
mean1 => $d1->{median},
mean2 => $d2->{median},
usv1 => $d1->{usv_mad},
usv2 => $d2->{usv_mad},
n1 => $d1->{n},
n2 => $d2->{n},
);
$tm->{p} = ($d1->{n} && $d2->{n}) ? Statistics::Distributions::tprob(int ($tm->{df}), $tm->{t}) : '';
my $r = {
stats1 => $d1, stats2 => $d2, ttest => $tt, ttest_mad => $tm
};
$o->{cache}->{ttests}->{$cachekey} = $r;
return $r;
}
=head2 welchs_ttest
performs Welch's ttest, given mean1, mean2, usv1, usv2, n1 and n2 in a hash.
e.g.
$o->welchs_ttest( mean1 => 4, mean2 => 3, # sample mean
usv1 => 1, usv2 => 1.1, # unbiased sample variance (returned as usv from $o->sd
n1 => 4, n2=> 7 # number of observations
also performs Welch-Satterthwaite to calculate degrees of freedom (to look up in t-statistic table)
Returns hashref containing t and df.
=cut
sub welchs_ttest {
my ($o, %t) = @_;
my ($x1,$x2,$v1,$v2,$n1,$n2) = map {$t{$_}} qw/mean1 mean2 usv1 usv2 n1 n2/;
my ($vn1,$vn2) = ($v1/$n1, $v2/$n2);
my $t = abs($x1 - $x2) / sqrt( $vn1 + $vn2 );
my $df = ($vn1 + $vn2)**2 / ( $vn1**2/($n1-1) + $vn2**2/($n2-1) );
return {t => $t, df => $df};
}
=head2 replicateMedianSubtractions
Logs data, if not already done, calculates median for each replicate, and subtracts median from each evidence in that replicate.
=cut
sub replicateMedianSubtractions {
my ($o, %opts) = @_; # can set filter here
$o->logRatios();
foreach my $replicate($o->experiments()){
my $median = $o->replicateMedian(%opts, experiment=>$replicate);
my $p = $o->filter(experiment=>$replicate);
foreach my $pgid(keys %{$p->{data}->{$replicate}}){
foreach my $i(0.. $#{$p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}}){
if($p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] =~ /\d/){
$p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] -= $median;
}
}
}
}
# i guess we should do something better with generating this status:
return 1;
}
=head2 median
given a list of numbers, returns the median... assumes all items are numbers!
=cut
sub median {
my $o = shift;
my @list = sort {$a <=> $b} @_;
my $n = scalar @list;
if($n % 2){ # remainder on division by two -> it's odd!
return $list[($n-1)/2]; # index of last over 2, e.g. 21 items, last index 20, return 10.
}
else { # it's not odd... so it's even
return ($list[$n/2 - 1] + $list[$n/2]) / 2; # length over 2 and the same minus 1, e.g. 20 items, we want 9 and 10.
}
}
=head2 experimentMaximumPvalue
=cut
sub experimentMaximumPvalue {
my ($o,%opts) = @_;
# run through experiments and collect replicate names for comparisons...
# this should be filtered for individual proteins using the leadingProteins option.
my @reps1 = ();
my @reps2 = ();
foreach my $rep($o->experiments){
if($rep =~ /^$opts{experiment1}/){
push @reps1, $rep;
}
if($rep =~ /^$opts{experiment2}/){
push @reps2, $rep;
}
}
# now there must be enough replicates with enough observations in each...
$opts{minimum_observations} = 2 unless exists $opts{minimum_observations};
$opts{minimum_replicates} = 2 unless exists $opts{minimum_replicates};
my $reps1 = 0;
my $reps2 = 0;
foreach my $rep(@reps1){
my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
my @values = $f->extractColumnValues(column => 'Ratio H/L');
$reps1 ++ if scalar(@values) > $opts{minimum_observations};
}
foreach my $rep(@reps2){
my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
my @values = $f->extractColumnValues(column => 'Ratio H/L', nodup => 0);
$reps2 ++ if scalar(@values) > $opts{minimum_observations};
}
( run in 1.917 second using v1.01-cache-2.11-cpan-97f6503c9c8 )