Bio-ToolBox

 view release on metacpan or  search on metacpan

scripts/correlate_position_data.pl  view on Meta::CPAN

	my ( $r_i, $p_i, $shift_i, $shiftr_i ) = add_new_columns();

	# generate base name for child processes
	my $child_base_name = $outfile . ".$PID";

	# variables for reporting the summary of results
	my @correlations;
	my @optimal_shifts;
	my @optimal_correlations;
	my $count           = 0;
	my $not_enough_data = 0;
	my $no_variance     = 0;
	$pm->run_on_finish(
		sub {
			my ( $pid, $exit_code, $ident, $exit_signal, $core_dump, $result ) = @_;
			push @correlations,         @{ $result->[0] };
			push @optimal_shifts,       @{ $result->[1] };
			push @optimal_correlations, @{ $result->[2] };
			$count           += $result->[3];
			$not_enough_data += $result->[4];
			$no_variance     += $result->[5];
		}
	);

	# Split the input data into parts and execute in parallel in separate forks
	print " Forking into $cpu children for parallel correlation collections\n";
	for my $i ( 1 .. $cpu ) {
		$pm->start and next;

		#### In child ####

scripts/correlate_position_data.pl  view on Meta::CPAN

		die "unable to find children files!\n";
	}
	unless ( scalar @files == $cpu ) {
		die "only found " . scalar(@files) . " child files when there should be $cpu!\n";
	}
	my $new_count = $Data->reload_children(@files);
	printf " reloaded %s features from children\n", format_with_commas($new_count);

	# summarize the results
	summarize_results( \@correlations, \@optimal_shifts, \@optimal_correlations,
		$count, $not_enough_data, $no_variance );
}

sub single_execution {

	# run in a single thread

	# Prepare new columns
	my ( $r_i, $p_i, $shift_i, $shiftr_i ) = add_new_columns();

	# collect

scripts/correlate_position_data.pl  view on Meta::CPAN

}

sub collect_correlations {
	my ( $r_i, $p_i, $shift_i, $shiftr_i ) = @_;

	# Set variables for summary analysis
	my @correlations;
	my @optimal_shifts;
	my @optimal_correlations;
	my $count           = 0;
	my $not_enough_data = 0;
	my $no_variance     = 0;

	# check that we can collect information
	unless ( $Data->feature_type eq 'named' or $Data->feature_type eq 'coordinate' ) {
		die " Unable to identify the type of features in the input file."
			. " File must be a recognizable format and/or have column header names\n"
			. " for chromosome, start, stop; or named database features\n";
	}

	# Walk through features

scripts/correlate_position_data.pl  view on Meta::CPAN

		if (   sum0( map {abs} values %ref_pos2data ) == 0
			or sum0( map {abs} values %test_pos2data ) == 0 )
		{
			# not enough data points to work with
			$row->value( $r_i, '.' );
			$row->value( $p_i, '.' ) if ($find_pvalue);
			if ($find_shift) {
				$row->value( $shift_i,  '.' );
				$row->value( $shiftr_i, '.' );
			}
			$not_enough_data++;
			next;
		}

		# Calculate Paired T-Test P value
		# going to try this before normalizing
		if ($find_pvalue) {
			$row->value( $p_i, calculate_ttest( \%ref_pos2data, \%test_pos2data ) );
		}

		# Normalize the data

scripts/correlate_position_data.pl  view on Meta::CPAN

			push @optimal_shifts,       $best_shift if ( $best_r > $r );
			push @optimal_correlations, $best_r     if ( $best_r > $r );
		}

		# Finished with this row
		$count++;
	}

	# Return correlation results for summary
	return ( \@correlations, \@optimal_shifts, \@optimal_correlations,
		$count, $not_enough_data, $no_variance );
}

sub add_new_columns {

	# Required columns
	# the Pearson column
	my $r_i = $Data->add_column('Pearson_correlation');
	$Data->metadata( $r_i, 'reference', $refDataSet );
	$Data->metadata( $r_i, 'test',      $testDataSet );

scripts/correlate_position_data.pl  view on Meta::CPAN

	};

	$p_value = -1 * ( log($p_value) / LOG10 ) if $p_value;
	return $p_value || '.';
}

sub summarize_results {

	# the results from the correlation analysis
	my ( $correlations, $optimal_shifts, $optimal_correlations,
		$count, $not_enough_data, $no_variance )
		= @_;

	# Summary analyses
	printf " Correlated %s features\n", format_with_commas($count);
	printf " Mean Pearson correlation was %.3f  +/- %.3f\n",
		mean( @{$correlations} ), stddevp( @{$correlations} );

	if ($find_shift) {
		printf " Mean absolute optimal shift was %.0f +/- %.0f bp\n",
			mean( map {abs} @{$optimal_shifts} ),
			stddevp( map {abs} @{$optimal_shifts} );
		printf " Mean optimal Pearson correlation was %.3f  +/- %.3f\n",
			mean( @{$optimal_correlations} ), stddevp( @{$optimal_correlations} );
	}
	if ($not_enough_data) {
		printf " %s features did not have enough data points\n",
			format_with_commas($not_enough_data);
	}
	if ($no_variance) {
		printf " %s features had no variance in the reference data points\n",
			format_with_commas($no_variance);
	}
}

sub mean {
	return sum0(@_) / ( scalar @_ || 1 );
}



( run in 0.228 second using v1.01-cache-2.11-cpan-0a987023a57 )