Bio-ToolBox
view release on metacpan or search on metacpan
scripts/correlate_position_data.pl view on Meta::CPAN
my ( $r_i, $p_i, $shift_i, $shiftr_i ) = add_new_columns();
# generate base name for child processes
my $child_base_name = $outfile . ".$PID";
# variables for reporting the summary of results
my @correlations;
my @optimal_shifts;
my @optimal_correlations;
my $count = 0;
my $not_enough_data = 0;
my $no_variance = 0;
$pm->run_on_finish(
sub {
my ( $pid, $exit_code, $ident, $exit_signal, $core_dump, $result ) = @_;
push @correlations, @{ $result->[0] };
push @optimal_shifts, @{ $result->[1] };
push @optimal_correlations, @{ $result->[2] };
$count += $result->[3];
$not_enough_data += $result->[4];
$no_variance += $result->[5];
}
);
# Split the input data into parts and execute in parallel in separate forks
print " Forking into $cpu children for parallel correlation collections\n";
for my $i ( 1 .. $cpu ) {
$pm->start and next;
#### In child ####
scripts/correlate_position_data.pl view on Meta::CPAN
die "unable to find children files!\n";
}
unless ( scalar @files == $cpu ) {
die "only found " . scalar(@files) . " child files when there should be $cpu!\n";
}
my $new_count = $Data->reload_children(@files);
printf " reloaded %s features from children\n", format_with_commas($new_count);
# summarize the results
summarize_results( \@correlations, \@optimal_shifts, \@optimal_correlations,
$count, $not_enough_data, $no_variance );
}
sub single_execution {
# run in a single thread
# Prepare new columns
my ( $r_i, $p_i, $shift_i, $shiftr_i ) = add_new_columns();
# collect
scripts/correlate_position_data.pl view on Meta::CPAN
}
sub collect_correlations {
my ( $r_i, $p_i, $shift_i, $shiftr_i ) = @_;
# Set variables for summary analysis
my @correlations;
my @optimal_shifts;
my @optimal_correlations;
my $count = 0;
my $not_enough_data = 0;
my $no_variance = 0;
# check that we can collect information
unless ( $Data->feature_type eq 'named' or $Data->feature_type eq 'coordinate' ) {
die " Unable to identify the type of features in the input file."
. " File must be a recognizable format and/or have column header names\n"
. " for chromosome, start, stop; or named database features\n";
}
# Walk through features
scripts/correlate_position_data.pl view on Meta::CPAN
if ( sum0( map {abs} values %ref_pos2data ) == 0
or sum0( map {abs} values %test_pos2data ) == 0 )
{
# not enough data points to work with
$row->value( $r_i, '.' );
$row->value( $p_i, '.' ) if ($find_pvalue);
if ($find_shift) {
$row->value( $shift_i, '.' );
$row->value( $shiftr_i, '.' );
}
$not_enough_data++;
next;
}
# Calculate Paired T-Test P value
# going to try this before normalizing
if ($find_pvalue) {
$row->value( $p_i, calculate_ttest( \%ref_pos2data, \%test_pos2data ) );
}
# Normalize the data
scripts/correlate_position_data.pl view on Meta::CPAN
push @optimal_shifts, $best_shift if ( $best_r > $r );
push @optimal_correlations, $best_r if ( $best_r > $r );
}
# Finished with this row
$count++;
}
# Return correlation results for summary
return ( \@correlations, \@optimal_shifts, \@optimal_correlations,
$count, $not_enough_data, $no_variance );
}
sub add_new_columns {
# Required columns
# the Pearson column
my $r_i = $Data->add_column('Pearson_correlation');
$Data->metadata( $r_i, 'reference', $refDataSet );
$Data->metadata( $r_i, 'test', $testDataSet );
scripts/correlate_position_data.pl view on Meta::CPAN
};
$p_value = -1 * ( log($p_value) / LOG10 ) if $p_value;
return $p_value || '.';
}
sub summarize_results {
# the results from the correlation analysis
my ( $correlations, $optimal_shifts, $optimal_correlations,
$count, $not_enough_data, $no_variance )
= @_;
# Summary analyses
printf " Correlated %s features\n", format_with_commas($count);
printf " Mean Pearson correlation was %.3f +/- %.3f\n",
mean( @{$correlations} ), stddevp( @{$correlations} );
if ($find_shift) {
printf " Mean absolute optimal shift was %.0f +/- %.0f bp\n",
mean( map {abs} @{$optimal_shifts} ),
stddevp( map {abs} @{$optimal_shifts} );
printf " Mean optimal Pearson correlation was %.3f +/- %.3f\n",
mean( @{$optimal_correlations} ), stddevp( @{$optimal_correlations} );
}
if ($not_enough_data) {
printf " %s features did not have enough data points\n",
format_with_commas($not_enough_data);
}
if ($no_variance) {
printf " %s features had no variance in the reference data points\n",
format_with_commas($no_variance);
}
}
sub mean {
return sum0(@_) / ( scalar @_ || 1 );
}
( run in 0.228 second using v1.01-cache-2.11-cpan-0a987023a57 )