Bio-ToolBox
view release on metacpan or search on metacpan
scripts/correlate_position_data.pl view on Meta::CPAN
}
if ($parallel) {
$cpu ||= 4;
}
else {
# disable cores
print " WARNING: disabling parallel CPU execution, no support present\n" if $cpu;
$cpu = 0;
}
unless ($outfile) {
$outfile = $infile;
}
}
sub validate_or_request_dataset {
    # Validate the reference and test datasets, prompting the user to
    # choose interactively when a dataset was not provided or cannot be
    # verified against the database. Dies if either remains invalid.

    # reference dataset
    $refDataSet = verify_or_request_feature_types(
        'db'      => $ddb || $Data->database,
        'feature' => $refDataSet,
        'prompt'  => "Enter the reference data set ",
        'single'  => 1,
    );
    die " A valid reference data set must be provided!\n" unless $refDataSet;

    # test dataset
    $testDataSet = verify_or_request_feature_types(
        'db'      => $ddb || $Data->database,
        'feature' => $testDataSet,
        'prompt'  => "Enter the test data set ",
        'single'  => 1,
    );
    die " A valid test data set must be provided!\n" unless $testDataSet;
}
sub parallel_execution {
# Run the correlation collection in parallel across $cpu forked children.
# Each child works on its share of the data table, writes its portion to a
# temporary child file, and returns its summary statistics to the parent,
# which reloads the child files into the output and prints the summary.
my $pm = Parallel::ForkManager->new($cpu);
$pm->run_on_start( sub { sleep 1; } );
# give a chance for child to start up and open databases, files, etc
# without creating race conditions
# Prepare new columns
# columns are added BEFORE forking so every child shares an identical table
my ( $r_i, $p_i, $shift_i, $shiftr_i ) = add_new_columns();
# generate base name for child processes
my $child_base_name = $outfile . ".$PID";
# variables for reporting the summary of results
# these accumulate each child's returned results array, which has the same
# six-element layout as the list returned by collect_correlations()
my @correlations;
my @optimal_shifts;
my @optimal_correlations;
my $count = 0;
my $not_enough_data = 0;
my $no_variance = 0;
$pm->run_on_finish(
sub {
my ( $pid, $exit_code, $ident, $exit_signal, $core_dump, $result ) = @_;
# merge this child's statistics into the parent totals
push @correlations, @{ $result->[0] };
push @optimal_shifts, @{ $result->[1] };
push @optimal_correlations, @{ $result->[2] };
$count += $result->[3];
$not_enough_data += $result->[4];
$no_variance += $result->[5];
}
);
# Split the input data into parts and execute in parallel in separate forks
print " Forking into $cpu children for parallel correlation collections\n";
for my $i ( 1 .. $cpu ) {
$pm->start and next;
#### In child ####
# splice the data structure
# NOTE(review): split_data($i, $cpu) presumably keeps only the $i-th
# partition of rows in this child -- confirm in Bio::ToolBox::Data
$Data->split_data( $i, $cpu );
# re-open database objects to make them clone safe
my $db = $Data->open_database(1);
if ($data_database) {
$ddb = open_db_connection( $data_database, 1 );
}
# collect
my @results = collect_correlations( $r_i, $p_i, $shift_i, $shiftr_i );
# write output file
# the zero-padded numeric suffix keeps the child files uniquely named
my $success = $Data->write_file(
'filename' => sprintf( "$child_base_name.%03s", $i ),
'gz' => 0, # faster to write without compression
);
if ($success) {
print " wrote output file $success\n";
}
else {
# failure! the subroutine will have printed error messages
die " unable to write file!\n";
# no need to continue
}
# Finished
# the results arrayref is serialized back to the parent's
# run_on_finish callback as the $result argument
$pm->finish( 0, \@results );
}
$pm->wait_all_children;
# reassemble children files into output file
my @files = glob "$child_base_name.*";
unless (@files) {
die "unable to find children files!\n";
}
unless ( scalar @files == $cpu ) {
die "only found " . scalar(@files) . " child files when there should be $cpu!\n";
}
my $new_count = $Data->reload_children(@files);
printf " reloaded %s features from children\n", format_with_commas($new_count);
# summarize the results
summarize_results( \@correlations, \@optimal_shifts, \@optimal_correlations,
$count, $not_enough_data, $no_variance );
}
sub single_execution {
    # Perform the correlation analysis serially in the current process:
    # add the output columns, collect correlations for every feature,
    # then print the summary statistics.
    my @column_indices = add_new_columns();
    print " Collecting correlations....\n";
    summarize_results( collect_correlations(@column_indices) );
}
sub collect_correlations {
# Walk through every feature in the data table, collect positioned scores
# for the reference and test datasets, and store the Pearson correlation
# in the column at index $r_i. Optionally also stores a -log10 ANOVA
# P-value ($p_i) and an optimal shift with its correlation ($shift_i,
# $shiftr_i). Rows without data or without variance get '.' placeholders.
# Returns a six-element list consumed by summarize_results():
# (\@correlations, \@optimal_shifts, \@optimal_correlations,
#  $count, $not_enough_data, $no_variance)
my ( $r_i, $p_i, $shift_i, $shiftr_i ) = @_;
# Set variables for summary analysis
my @correlations;
my @optimal_shifts;
my @optimal_correlations;
my $count = 0;
my $not_enough_data = 0;
my $no_variance = 0;
# check that we can collect information
unless ( $Data->feature_type eq 'named' or $Data->feature_type eq 'coordinate' ) {
die " Unable to identify the type of features in the input file."
. " File must be a recognizable format and/or have column header names\n"
. " for chromosome, start, stop; or named database features\n";
}
# Walk through features
my $stream = $Data->row_stream;
while ( my $row = $stream->next_row ) {
# Collect positioned data
# position => score hashes for each dataset
my %ref_pos2data;
my %test_pos2data;
if ($radius) {
# collecting data in a radius around a reference point
# collect data
%ref_pos2data = $row->get_relative_point_position_scores(
ddb => $ddb,
dataset => $refDataSet,
position => $position,
extend => ( 2 * $radius ),
);
%test_pos2data = $row->get_relative_point_position_scores(
ddb => $ddb,
dataset => $testDataSet,
position => $position,
extend => ( 2 * $radius ),
);
}
else {
# just collect data over the region
# collect data
%ref_pos2data = $row->get_region_position_scores(
ddb => $ddb,
dataset => $refDataSet,
position => $position,
);
%test_pos2data = $row->get_region_position_scores(
ddb => $ddb,
dataset => $testDataSet,
position => $position,
);
}
# Verify minimum data count
# sum of absolute values is 0 when the hash is empty or all scores are 0
if ( sum0( map {abs} values %ref_pos2data ) == 0
or sum0( map {abs} values %test_pos2data ) == 0 )
{
# not enough data points to work with
# record '.' placeholders in every requested column and skip the row
$row->value( $r_i, '.' );
$row->value( $p_i, '.' ) if ($find_pvalue);
if ($find_shift) {
$row->value( $shift_i, '.' );
$row->value( $shiftr_i, '.' );
}
$not_enough_data++;
next;
}
# Calculate Paired T-Test P value
# going to try this before normalizing
if ($find_pvalue) {
$row->value( $p_i, calculate_ttest( \%ref_pos2data, \%test_pos2data ) );
}
# Normalize the data
# there are couple ways to do this
if ( $norm_method and $norm_method =~ /rank/i ) {
# convert to rank values
# essentially a Spearman's rank correlation
normalize_values_by_rank( \%ref_pos2data );
normalize_values_by_rank( \%test_pos2data );
}
elsif ( $norm_method and $norm_method =~ /sum/i ) {
# normalize the sums of both
# good for read counts
normalize_values_by_sum( \%ref_pos2data );
normalize_values_by_sum( \%test_pos2data );
}
# Collect the (normalized) data from the position data
# this may be all of the data or a portion of it, depending
# build dense parallel arrays; positions with no score default to 0
my @ref_data;
my @test_data;
if ($radius) {
for my $i ( ( 0 - $radius ) .. $radius ) {
# get values for each position, default 0
push @ref_data, $ref_pos2data{$i} || 0;
push @test_data, $test_pos2data{$i} || 0;
}
}
else {
# NOTE(review): loop covers positions 1 .. length-1; confirm whether the
# final position ($row->length) should also be included
for ( my $i = 1; $i < $row->length; $i++ ) {
# get values for each position, default 0
push @ref_data, $ref_pos2data{$i} || 0;
push @test_data, $test_pos2data{$i} || 0;
}
}
# verify the reference data
if ( min(@ref_data) == max(@ref_data) ) {
# no variance in the data! cannot calculate Pearson
$row->value( $r_i, '.' );
$row->value( $p_i, '.' ) if ($find_pvalue);
if ($find_shift) {
$row->value( $shift_i, '.' );
$row->value( $shiftr_i, '.' );
}
$no_variance++;
next;
}
# Calculate the Pearson correlation between test and reference
my $r;
if ( min(@test_data) == max(@test_data) ) {
# test has no variance
# report a Pearson of 0
$r = 0;
# this may not be a total loss, as there may be data points
# outside of the window that would generate an optimal Pearson
# so we will continue rather than bailing
}
else {
# test has variance, calculate Pearson
$r = calculate_pearson( \@ref_data, \@test_data ) || 0;
}
$row->value( $r_i, $r );
# a correlation of exactly 0 is excluded from the summary statistics
push @correlations, $r if $r;
# Determine optimal shift if requested
if ($find_shift) {
# find optimal shift
my ( $best_shift, $best_r ) =
calculate_optimum_shift( \%test_pos2data, \@ref_data, $r, );
# change strand
if ($set_strand) {
# user is imposing a strand
if ( $row->strand < 0 ) {
# only change shift direction if reverse strand
$best_shift *= -1;
}
}
# record in data table
# the shift correlation is reported only when it improves on the
# unshifted correlation; otherwise a '.' placeholder is recorded
$row->value( $shift_i, $best_shift );
$row->value( $shiftr_i, $best_r > $r ? $best_r : '.' );
# record for summary analyses
push @optimal_shifts, $best_shift if ( $best_r > $r );
push @optimal_correlations, $best_r if ( $best_r > $r );
}
# Finished with this row
$count++;
}
# Return correlation results for summary
return ( \@correlations, \@optimal_shifts, \@optimal_correlations,
$count, $not_enough_data, $no_variance );
}
sub add_new_columns {
    # Append the analysis columns to the data table with their metadata.
    # Returns a four-element list of column indices:
    # (Pearson correlation, P-value, optimal shift, shift correlation).
    # The P-value and shift indices are undef when those options are off.

    # the Pearson correlation column is always added
    my $r_i = $Data->add_column('Pearson_correlation');
    $Data->metadata( $r_i, 'reference', $refDataSet );
    $Data->metadata( $r_i, 'test',      $testDataSet );
    $Data->metadata( $r_i, 'data_database', $data_database ) if $data_database;
    $Data->metadata( $r_i, 'normalization', $norm_method )   if $norm_method;
    if ($radius) {
        $Data->metadata( $r_i, 'radius',          $radius );
        $Data->metadata( $r_i, 'reference_point', _reference_point_label() );
    }

    # optional shift and shift-correlation columns share most metadata,
    # but only the correlation column records the normalization method
    my ( $shift_i, $shiftr_i );
    if ($find_shift) {
        foreach my $spec (
            [ 'optimal_shift',     \$shift_i,  0 ],
            [ 'shift_correlation', \$shiftr_i, 1 ],
            )
        {
            my ( $column_name, $index_ref, $add_norm ) = @{$spec};
            my $i = $Data->add_column($column_name);
            $Data->metadata( $i, 'reference', $refDataSet );
            $Data->metadata( $i, 'test',      $testDataSet );
            $Data->metadata( $i, 'radius',    $radius );
            $Data->metadata( $i, 'data_database', $data_database )
                if $data_database;
            $Data->metadata( $i, 'reference_point', _reference_point_label() );
            $Data->metadata( $i, 'normalization', $norm_method )
                if $add_norm and $norm_method;
            ${$index_ref} = $i;
        }
    }

    # optional ANOVA P-value column
    my $p_i;
    if ($find_pvalue) {
        $p_i = $Data->add_column('ANOVA_Pval');
        $Data->metadata( $p_i, 'reference', $refDataSet );
        $Data->metadata( $p_i, 'test',      $testDataSet );
        $Data->metadata( $p_i, 'transform', '-Log10' );
    }

    # return the column indices
    return ( $r_i, $p_i, $shift_i, $shiftr_i );
}

sub _reference_point_label {
    # Translate the numeric $position code into its metadata label.
    return
          $position == 5 ? '5_prime'
        : $position == 4 ? 'midpoint'
        :                  '3_prime';
}
sub calculate_optimum_shift {
# Calculate an optimal shift that maximizes the Pearson correlation
scripts/correlate_position_data.pl view on Meta::CPAN
my $ref_scale_factor = 100 / median( map {abs} values %{$data} );
map { $data->{$_} = $ref_scale_factor * $data->{$_} } keys %{$data};
}
sub calculate_pearson {
    # Calculate the Pearson correlation coefficient between two arrays of
    # positioned scores (array refs of equal length), taken from the r
    # term of a least-squares linear fit of test against reference.
    my ( $ref_values, $test_values ) = @_;
    my $stats = Statistics::Descriptive::Full->new();
    $stats->add_data($ref_values);
    # least_squares_fit returns (intercept, slope, r, root-mean-square error);
    # we only want the correlation coefficient r
    my $r = ( $stats->least_squares_fit( @{$test_values} ) )[2];
    return $r;
}
sub calculate_ttest {
# Compare the reference and test positioned-score hashes and return a
# -log10 transformed P-value, or '.' when none could be computed.
# NOTE(review): despite the sub name, this runs a one-way ANOVA (see the
# $aov->anova call below), not a paired t-test -- confirm intent.
# generate the data arrays
my ( $ref, $test ) = @_;
my @ref_data;
my @test_data;
# load the data arrays
# iterate over the union of observed positions; positions missing from a
# hash contribute a default score of 0
my $min_value = min( min( keys %{$ref} ), min( keys %{$test} ) );
my $max_value = max( max( keys %{$ref} ), max( keys %{$test} ) );
for my $i ( $min_value .. $max_value ) {
push @ref_data, $ref->{$i} || 0;
push @test_data, $test->{$i} || 0;
}
# calculate ANOVA
# wrapped in eval because the ANOVA may fail on degenerate data;
# failure simply leaves $p_value undefined and '.' is returned
my $p_value;
my $aov = Statistics::ANOVA->new();
eval {
$aov->load(
{
'test' => \@test_data,
'ref' => \@ref_data,
}
);
$aov->anova(
'independent' => $INDEPENDENCE,
'parametric' => $PARAMETRIC,
'ordinal' => $ORDINAL,
);
# NOTE(review): reaches into Statistics::ANOVA internals rather than
# using a public accessor -- confirm this key still exists in the
# installed module version
$p_value = $aov->{_stat}{p_value};
};
# transform to -log10; skipped entirely when the raw P-value is 0 or undef
# (log of 0 is undefined)
$p_value = -1 * ( log($p_value) / LOG10 ) if $p_value;
# NOTE(review): a raw P-value of exactly 1 transforms to 0, which is then
# reported as '.' by the || fallback -- confirm that is intended
return $p_value || '.';
}
sub summarize_results {
    # Print the summary statistics from the correlation analysis:
    # mean correlation, optional shift statistics, and counts of features
    # that were skipped for lack of data or lack of variance.
    my ( $r_values, $shifts, $shift_r_values,
        $n_features, $n_no_data, $n_no_variance )
        = @_;

    printf " Correlated %s features\n", format_with_commas($n_features);
    printf " Mean Pearson correlation was %.3f +/- %.3f\n",
        mean( @{$r_values} ), stddevp( @{$r_values} );
    if ($find_shift) {
        # shifts are summarized by magnitude, ignoring direction
        my @abs_shifts = map {abs} @{$shifts};
        printf " Mean absolute optimal shift was %.0f +/- %.0f bp\n",
            mean(@abs_shifts), stddevp(@abs_shifts);
        printf " Mean optimal Pearson correlation was %.3f +/- %.3f\n",
            mean( @{$shift_r_values} ), stddevp( @{$shift_r_values} );
    }
    printf( " %s features did not have enough data points\n",
        format_with_commas($n_no_data) )
        if $n_no_data;
    printf( " %s features had no variance in the reference data points\n",
        format_with_commas($n_no_variance) )
        if $n_no_variance;
}
sub mean {
    # Arithmetic mean of the argument list; returns 0 for an empty list.
    my $n = scalar @_;
    return 0 unless $n;
    return sum0(@_) / $n;
}
__END__
=head1 NAME
correlate_position_data.pl
A script to calculate correlations between two datasets along the length of a feature.
=head1 SYNOPSIS
correlate_position_data.pl [--options] <filename>
Options for data files:
-i --in <filename> input file: txt bed etc
-o --out <filename> optional output file, default overwrite
-d --db <name> alternate annotation database
Options for data sources
-D --ddb <name|file> data or BigWigSet database
-r --ref <dataset|filename> reference data: bw, name, etc
-t --test <dataset|filename> test data: bw, name, etc
Options for correlating data
--pval calculate P-value by ANOVA
--shift determine optimal shift to match datasets
--radius <integer> radius in bp around reference point to calculate
-p --pos [5|m|3] reference point to measure correlation (m)
--norm [rank|sum] normalization method between datasets
--force_strand force an alternate strand
General options:
-c --cpu <integer> number of threads (4)
-z --gz compress output with gz
-v --version print version and exit
-h --help show extended documentation
=head1 OPTIONS
The command line flags and descriptions:
=head2 Options for data files
=over 4
=item --in <filename>
Specify the input file of features. The features may be comprised of
name and type, or chromosome, start, and stop. Strand information is
optional, but is respected if included. A feature list may be
( run in 0.442 second using v1.01-cache-2.11-cpan-39bf76dae61 )