Bio-MaxQuant-Evidence-Statistics

 view release on metacpan or  search on metacpan

lib/Bio/MaxQuant/Evidence/Statistics.pm  view on Meta::CPAN

        mean2 => $d2->{median},
        usv1  => $d1->{usv_mad},
        usv2  => $d2->{usv_mad},
        n1    => $d1->{n},
        n2    => $d2->{n},
    );
    $tm->{p} = ($d1->{n} && $d2->{n}) ? Statistics::Distributions::tprob(int ($tm->{df}), $tm->{t}) : '';
    
    my $r =   {
        stats1 => $d1, stats2 => $d2, ttest => $tt, ttest_mad => $tm 
    };
    $o->{cache}->{ttests}->{$cachekey} = $r;
    return $r;
}

=head2 welchs_ttest

performs Welch's ttest, given mean1, mean2, usv1, usv2, n1 and n2 in a hash.

e.g. 

    $o->welchs_ttest( mean1 => 4, mean2 => 3,  # sample mean
                      usv1 => 1,  usv2 => 1.1, # unbiased sample variance (returned as usv from $o->sd
                      n1 => 4,    n2=> 7       # number of observations
                      
also performs Welch-Satterthwaite to calculate degrees of freedom (to look up in t-statistic table)

Returns hashref containing t and df.

=cut

sub welchs_ttest {
    my ($o, %t) = @_;
    my ($x1,$x2,$v1,$v2,$n1,$n2) = map {$t{$_}} qw/mean1 mean2 usv1 usv2 n1 n2/;
    my ($vn1,$vn2) = ($v1/$n1,  $v2/$n2);
    my $t = abs($x1 - $x2) / sqrt( $vn1 + $vn2 );
    my $df = ($vn1 + $vn2)**2 / (  $vn1**2/($n1-1) + $vn2**2/($n2-1)  );
    return {t => $t, df => $df};
}

=head2 replicateMedianSubtractions 

Logs data, if not already done, calculates median for each replicate, and subtracts median from each evidence in that replicate.

=cut

sub replicateMedianSubtractions {
    my ($o, %opts) = @_; # can set filter here
    $o->logRatios();
    foreach my $replicate($o->experiments()){
        my $median = $o->replicateMedian(%opts, experiment=>$replicate);
        my $p = $o->filter(experiment=>$replicate);
        foreach my $pgid(keys %{$p->{data}->{$replicate}}){
            foreach my $i(0.. $#{$p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}}){
                if($p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] =~ /\d/){
                    $p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] -= $median;
                }
            }
        }
    }
    # i guess we should do something better with generating this status:
    return 1;
}

=head2 median 

given a list of numbers, returns the median... assumes all items are numbers!

=cut

sub median {
    my $o = shift;
    my @list = sort {$a <=> $b} @_;
    my $n = scalar @list;
    if($n % 2){ # remainder on division by two -> it's odd!
        return $list[($n-1)/2]; # index of last over 2, e.g. 21 items, last index 20, return 10.
    }
    else { # it's not odd... so it's even
        return ($list[$n/2 - 1] + $list[$n/2]) / 2; # length over 2 and the same minus 1, e.g. 20 items, we want 9 and 10.  
    }
}


=head2 experimentMaximumPvalue 

=cut

sub experimentMaximumPvalue {
    my ($o,%opts) = @_;
    # run through experiments and collect replicate names for comparisons...
    # this should be filtered for individual proteins using the leadingProteins option.
    my @reps1 = ();
    my @reps2 = ();
    foreach my $rep($o->experiments){
        if($rep =~ /^$opts{experiment1}/){
            push @reps1, $rep;
        }
        if($rep =~ /^$opts{experiment2}/){
            push @reps2, $rep;
        }
    }
    # now there must be enough replicates with enough observations in each...
    $opts{minimum_observations} = 2 unless exists $opts{minimum_observations};
    $opts{minimum_replicates} = 2 unless exists $opts{minimum_replicates};
    my $reps1 = 0;
    my $reps2 = 0;
    foreach my $rep(@reps1){
        my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
        my @values = $f->extractColumnValues(column => 'Ratio H/L');
        $reps1 ++ if scalar(@values) > $opts{minimum_observations};
    }
    foreach my $rep(@reps2){
        my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
        my @values = $f->extractColumnValues(column => 'Ratio H/L', nodup  => 0);
        $reps2 ++ if scalar(@values) > $opts{minimum_observations};
    }
    return {p_max => -1, p_mad_max => -1} if $reps1 < $opts{minimum_replicates} || $reps2 < $opts{minimum_replicates};
    
    # compare each combination of replicates
    my $p_max = 0;
    my $p_mad_max = 0;



( run in 2.645 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )