Bio-MaxQuant-Evidence-Statistics

 view release on metacpan or  search on metacpan

lib/Bio/MaxQuant/Evidence/Statistics.pm  view on Meta::CPAN


=cut

sub logRatios {
    my $o = shift;
    return 0 if $o->{logged};
    $o->{logged} = 1;
    my $data = $o->{data};
    foreach my $exptname(keys %$data){
        my $experiment = $data->{$exptname};
        foreach my $proteinGroupId(keys %$experiment){
            my $proteinGroup = $experiment->{$proteinGroupId};
            my $ratios = $proteinGroup->{'Ratio H/L'};
            my @newRatios = ();
            foreach (0..$#$ratios){
                $ratios->[$_] = $ratios->[$_] =~ /^\d+\.?\d*$/
                    ? log($ratios->[$_])/log(2)
                    : '';
            }
        }
    }
    return 1;
}

=head2 filter

returns a set of protein records based on filter parameters...

=head3 options

=over

=item experiment - regular expression to match experiment name

=item proteinGroupId - regular expression to match protein group id 

=item leadingProteins - regular expression to match leading protein ids

=item notLeadingProteins - regular expression to not match leading protein ids

=back

Returns a filtered object of the same type, with relevant flags set (e.g. whether
data has been logged, etc).

Warning, intentionally does not perform a deep clone!

=cut

sub filter {
    my ($o,%opts) = @_;
    # options : 
#    use Data::Dumper;
#    print STDERR 'OPTS: ', Dumper \%opts;
    my $data = $o->{data};
    my $result = {};
    foreach my $experiment(keys %$data){
        if(! exists $opts{experiment} || $experiment =~ /$opts{experiment}/){
            $result->{$experiment} = {};
            my $exptdata = $data->{$experiment};
            foreach my $pgid(keys %$exptdata){
                if(! exists $opts{proteinGroupId} || $pgid =~ /$opts{proteinGroupId}/){
                    my $pgdata = $exptdata->{$pgid};
                    if(! exists $opts{leadingProteins} || $pgdata->{'Leading Proteins'} =~ /$opts{leadingProteins}/){
                        if(! exists $opts{notLeadingProteins} || $pgdata->{'Leading Proteins'} !~ /$opts{notLeadingProteins}/){
                            $result->{$experiment}->{$pgid} = $pgdata;
                        }
                    }
                }
            }
        }
    }
#   print STDERR Dumper $result if $opts{experiment} eq qr/^LCC1.nE.r2$/;
    my $p = $o->new;
    %$p = %$o;
    $p->{data} = $result;
    $o->{lastfiltered} = $p;
    return $p;
}

=head2 replicateMedian

options are passed to filter.

=cut

sub replicateMedian {
    my ($o,%opts) = @_;
    my $f = $o->filter(%opts);
    return $f->median(
        $f->extractColumnValues(
            column => 'Ratio H/L',
            nodup  => 0,
        )
    );
}

=head2 deviations 

returns an hashref with the following keys

=over

=item n - the number of items

=item sd - the standard deviation (from the mean)

=item mad - the median absolute deviation (from the median)

=item sd_via_mad - the standard deviation estimated from the median absolute deviation

=back

=cut

sub deviations {
    my ($o,%opts) = @_;
    # cache
    $o->{cache}->{deviations} = {} unless exists $o->{cache}->{deviations};
    my $cachekey = join('::', map {"$_=$opts{$_}"} sort keys %opts);
   # print STDERR "$cachekey\n";
    return $o->{cache}->{deviations}->{$cachekey} if exists $o->{cache}->{deviations}->{$cachekey};
    ##
    my $f = $o->filter(%opts);
    my @values = $f->extractColumnValues(
            column => 'Ratio H/L',

lib/Bio/MaxQuant/Evidence/Statistics.pm  view on Meta::CPAN

        usv1  => $d1->{usv},
        usv2  => $d2->{usv},
        n1    => $d1->{n},
        n2    => $d2->{n},
    );
    $tt->{p} = ($d1->{n} && $d2->{n}) ? Statistics::Distributions::tprob(int ($tt->{df}), $tt->{t}) : '';
    my $tm = $o->welchs_ttest(
        mean1 => $d1->{median},
        mean2 => $d2->{median},
        usv1  => $d1->{usv_mad},
        usv2  => $d2->{usv_mad},
        n1    => $d1->{n},
        n2    => $d2->{n},
    );
    $tm->{p} = ($d1->{n} && $d2->{n}) ? Statistics::Distributions::tprob(int ($tm->{df}), $tm->{t}) : '';
    
    my $r =   {
        stats1 => $d1, stats2 => $d2, ttest => $tt, ttest_mad => $tm 
    };
    $o->{cache}->{ttests}->{$cachekey} = $r;
    return $r;
}

=head2 welchs_ttest

performs Welch's ttest, given mean1, mean2, usv1, usv2, n1 and n2 in a hash.

e.g. 

    $o->welchs_ttest( mean1 => 4, mean2 => 3,  # sample mean
                      usv1 => 1,  usv2 => 1.1, # unbiased sample variance (returned as usv from $o->sd
                      n1 => 4,    n2=> 7       # number of observations
                      
also performs Welch-Satterthwaite to calculate degrees of freedom (to look up in t-statistic table)

Returns hashref containing t and df.

=cut

sub welchs_ttest {
    my ($o, %t) = @_;
    my ($x1,$x2,$v1,$v2,$n1,$n2) = map {$t{$_}} qw/mean1 mean2 usv1 usv2 n1 n2/;
    my ($vn1,$vn2) = ($v1/$n1,  $v2/$n2);
    my $t = abs($x1 - $x2) / sqrt( $vn1 + $vn2 );
    my $df = ($vn1 + $vn2)**2 / (  $vn1**2/($n1-1) + $vn2**2/($n2-1)  );
    return {t => $t, df => $df};
}

=head2 replicateMedianSubtractions 

Logs data, if not already done, calculates median for each replicate, and subtracts median from each evidence in that replicate.

=cut

sub replicateMedianSubtractions {
    my ($o, %opts) = @_; # can set filter here
    $o->logRatios();
    foreach my $replicate($o->experiments()){
        my $median = $o->replicateMedian(%opts, experiment=>$replicate);
        my $p = $o->filter(experiment=>$replicate);
        foreach my $pgid(keys %{$p->{data}->{$replicate}}){
            foreach my $i(0.. $#{$p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}}){
                if($p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] =~ /\d/){
                    $p->{data}->{$replicate}->{$pgid}->{'Ratio H/L'}->[$i] -= $median;
                }
            }
        }
    }
    # i guess we should do something better with generating this status:
    return 1;
}

=head2 median 

given a list of numbers, returns the median... assumes all items are numbers!

=cut

sub median {
    my $o = shift;
    my @list = sort {$a <=> $b} @_;
    my $n = scalar @list;
    if($n % 2){ # remainder on division by two -> it's odd!
        return $list[($n-1)/2]; # index of last over 2, e.g. 21 items, last index 20, return 10.
    }
    else { # it's not odd... so it's even
        return ($list[$n/2 - 1] + $list[$n/2]) / 2; # length over 2 and the same minus 1, e.g. 20 items, we want 9 and 10.  
    }
}


=head2 experimentMaximumPvalue 

=cut

sub experimentMaximumPvalue {
    my ($o,%opts) = @_;
    # run through experiments and collect replicate names for comparisons...
    # this should be filtered for individual proteins using the leadingProteins option.
    my @reps1 = ();
    my @reps2 = ();
    foreach my $rep($o->experiments){
        if($rep =~ /^$opts{experiment1}/){
            push @reps1, $rep;
        }
        if($rep =~ /^$opts{experiment2}/){
            push @reps2, $rep;
        }
    }
    # now there must be enough replicates with enough observations in each...
    $opts{minimum_observations} = 2 unless exists $opts{minimum_observations};
    $opts{minimum_replicates} = 2 unless exists $opts{minimum_replicates};
    my $reps1 = 0;
    my $reps2 = 0;
    foreach my $rep(@reps1){
        my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
        my @values = $f->extractColumnValues(column => 'Ratio H/L');
        $reps1 ++ if scalar(@values) > $opts{minimum_observations};
    }
    foreach my $rep(@reps2){
        my $f = $o->filter(experiment=>$rep, leadingProteins=>$opts{filter});
        my @values = $f->extractColumnValues(column => 'Ratio H/L', nodup  => 0);
        $reps2 ++ if scalar(@values) > $opts{minimum_observations};
    }



( run in 1.917 second using v1.01-cache-2.11-cpan-97f6503c9c8 )