Bio-ToolBox

scripts/correlate_position_data.pl

	}

	if ($parallel) {
		$cpu ||= 4;
	}
	else {
		# disable cores
		print " WARNING: disabling parallel CPU execution, no support present\n" if $cpu;
		$cpu = 0;
	}

	unless ($outfile) {
		$outfile = $infile;
	}

}

sub validate_or_request_dataset {

	# Process the reference dataset
	$refDataSet = verify_or_request_feature_types(
		'db'      => $ddb || $Data->database,
		'feature' => $refDataSet,
		'prompt'  => "Enter the reference data set  ",
		'single'  => 1,
	);
	unless ($refDataSet) {
		die " A valid reference data set must be provided!\n";
	}

	# Process the test dataset
	$testDataSet = verify_or_request_feature_types(
		'db'      => $ddb || $Data->database,
		'feature' => $testDataSet,
		'prompt'  => "Enter the test data set  ",
		'single'  => 1,
	);
	unless ($testDataSet) {
		die " A valid test data set must be provided!\n";
	}
}

sub parallel_execution {
	my $pm = Parallel::ForkManager->new($cpu);
	$pm->run_on_start( sub { sleep 1; } );

	# give each child a chance to start up and open databases, files, etc.
	# without creating race conditions

	# Prepare new columns
	my ( $r_i, $p_i, $shift_i, $shiftr_i ) = add_new_columns();

	# generate base name for child processes
	my $child_base_name = $outfile . ".$PID";

	# variables for reporting the summary of results
	my @correlations;
	my @optimal_shifts;
	my @optimal_correlations;
	my $count           = 0;
	my $not_enough_data = 0;
	my $no_variance     = 0;
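
	# when each child finishes, merge its returned results into the summary variables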
	$pm->run_on_finish(
		sub {
			my ( $pid, $exit_code, $ident, $exit_signal, $core_dump, $result ) = @_;
			push @correlations,         @{ $result->[0] };
			push @optimal_shifts,       @{ $result->[1] };
			push @optimal_correlations, @{ $result->[2] };
			$count           += $result->[3];
			$not_enough_data += $result->[4];
			$no_variance     += $result->[5];
		}
	);

	# Split the input data into parts and execute in parallel in separate forks
	print " Forking into $cpu children for parallel correlation collections\n";
	for my $i ( 1 .. $cpu ) {
		$pm->start and next;

		#### In child ####

		# splice the data structure
		$Data->split_data( $i, $cpu );

		# re-open database objects to make them clone safe
		my $db = $Data->open_database(1);
		if ($data_database) {
			$ddb = open_db_connection( $data_database, 1 );
		}

		# collect
		my @results = collect_correlations( $r_i, $p_i, $shift_i, $shiftr_i );

		# write output file
		my $success = $Data->write_file(
			'filename' => sprintf( "$child_base_name.%03s", $i ),
			'gz'       => 0,    # faster to write without compression
		);
		if ($success) {
			print " wrote output file $success\n";
		}
		else {
			# failure! the subroutine will have printed error messages
			die " unable to write file!\n";

			# no need to continue
		}

		# Finished
		$pm->finish( 0, \@results );
	}
	$pm->wait_all_children;

	# reassemble children files into output file
	my @files = glob "$child_base_name.*";
	unless (@files) {
		die "unable to find children files!\n";
	}
	unless ( scalar @files == $cpu ) {
		die "only found " . scalar(@files) . " child files when there should be $cpu!\n";
	}
	my $new_count = $Data->reload_children(@files);
	printf " reloaded %s features from children\n", format_with_commas($new_count);

	# summarize the results
	summarize_results( \@correlations, \@optimal_shifts, \@optimal_correlations,
		$count, $not_enough_data, $no_variance );
}

sub single_execution {

	# run in a single thread

	# Prepare new columns
	my ( $r_i, $p_i, $shift_i, $shiftr_i ) = add_new_columns();

	# collect
	print " Collecting correlations....\n";
	my @results = collect_correlations( $r_i, $p_i, $shift_i, $shiftr_i );

	# summarize
	summarize_results(@results);
}

sub collect_correlations {
	my ( $r_i, $p_i, $shift_i, $shiftr_i ) = @_;

	# Set variables for summary analysis
	my @correlations;
	my @optimal_shifts;
	my @optimal_correlations;
	my $count           = 0;
	my $not_enough_data = 0;
	my $no_variance     = 0;

	# check that we can collect information
	unless ( $Data->feature_type eq 'named' or $Data->feature_type eq 'coordinate' ) {
		die " Unable to identify the type of features in the input file."
			. " File must be a recognizable format and/or have column header names\n"
			. " for chromosome, start, stop; or named database features\n";
	}

	# Walk through features
	my $stream = $Data->row_stream;
	while ( my $row = $stream->next_row ) {

		# Collect positioned data
		my %ref_pos2data;
		my %test_pos2data;
		if ($radius) {

			# collecting data in a radius around a reference point

			# collect data
			%ref_pos2data = $row->get_relative_point_position_scores(
				ddb      => $ddb,
				dataset  => $refDataSet,
				position => $position,
				extend   => ( 2 * $radius ),
			);
			%test_pos2data = $row->get_relative_point_position_scores(
				ddb      => $ddb,
				dataset  => $testDataSet,
				position => $position,
				extend   => ( 2 * $radius ),
			);
		}
		else {
			# just collect data over the region
			# collect data
			%ref_pos2data = $row->get_region_position_scores(
				ddb      => $ddb,
				dataset  => $refDataSet,
				position => $position,
			);
			%test_pos2data = $row->get_region_position_scores(
				ddb      => $ddb,
				dataset  => $testDataSet,
				position => $position,
			);
		}

		# Verify minimum data count
		if (   sum0( map {abs} values %ref_pos2data ) == 0
			or sum0( map {abs} values %test_pos2data ) == 0 )
		{
			# not enough data points to work with
			$row->value( $r_i, '.' );
			$row->value( $p_i, '.' ) if ($find_pvalue);
			if ($find_shift) {
				$row->value( $shift_i,  '.' );
				$row->value( $shiftr_i, '.' );
			}
			$not_enough_data++;
			next;
		}

		# Calculate the ANOVA P-value (via calculate_ttest)
		# do this on the raw data, before normalizing
		if ($find_pvalue) {
			$row->value( $p_i, calculate_ttest( \%ref_pos2data, \%test_pos2data ) );
		}

		# Normalize the data
		# there are a couple of ways to do this
		if ( $norm_method and $norm_method =~ /rank/i ) {

			# convert to rank values
			# essentially a Spearman's rank correlation
			normalize_values_by_rank( \%ref_pos2data );
			normalize_values_by_rank( \%test_pos2data );
		}
		elsif ( $norm_method and $norm_method =~ /sum/i ) {

			# normalize the sums of both
			# good for read counts
			normalize_values_by_sum( \%ref_pos2data );
			normalize_values_by_sum( \%test_pos2data );
		}

		# Collect the (normalized) data from the position data
		# this may be all of the data or a portion of it, depending on whether a radius was given
		my @ref_data;
		my @test_data;
		if ($radius) {
			for my $i ( ( 0 - $radius ) .. $radius ) {

				# get values for each position, default 0
				push @ref_data,  $ref_pos2data{$i}  || 0;
				push @test_data, $test_pos2data{$i} || 0;
			}
		}
		else {
			for ( my $i = 1; $i < $row->length; $i++ ) {

				# get values for each position, default 0
				push @ref_data,  $ref_pos2data{$i}  || 0;
				push @test_data, $test_pos2data{$i} || 0;
			}
		}

		# verify the reference data
		if ( min(@ref_data) == max(@ref_data) ) {

			# no variance in the data! cannot calculate Pearson
			$row->value( $r_i, '.' );
			$row->value( $p_i, '.' ) if ($find_pvalue);
			if ($find_shift) {
				$row->value( $shift_i,  '.' );
				$row->value( $shiftr_i, '.' );
			}
			$no_variance++;
			next;
		}

		# Calculate the Pearson correlation between test and reference
		my $r;
		if ( min(@test_data) == max(@test_data) ) {

			# test has no variance
			# report a Pearson of 0
			$r = 0;

			# this may not be a total loss, as there may be data points
			# outside of the window that would generate an optimal Pearson
			# so we will continue rather than bailing
		}
		else {
			# test has variance, calculate Pearson
			$r = calculate_pearson( \@ref_data, \@test_data ) || 0;
		}
		$row->value( $r_i, $r );
		push @correlations, $r if $r;

		# Determine optimal shift if requested
		if ($find_shift) {

			# find optimal shift
			my ( $best_shift, $best_r ) =
				calculate_optimum_shift( \%test_pos2data, \@ref_data, $r, );

			# change strand
			if ($set_strand) {

				# user is imposing a strand
				if ( $row->strand < 0 ) {

					# only change shift direction if reverse strand
					$best_shift *= -1;
				}
			}

			# record in data table
			$row->value( $shift_i,  $best_shift );
			$row->value( $shiftr_i, $best_r > $r ? $best_r : '.' );

			# record for summary analyses
			push @optimal_shifts,       $best_shift if ( $best_r > $r );
			push @optimal_correlations, $best_r     if ( $best_r > $r );
		}

		# Finished with this row
		$count++;
	}

	# Return correlation results for summary
	return ( \@correlations, \@optimal_shifts, \@optimal_correlations,
		$count, $not_enough_data, $no_variance );
}

sub add_new_columns {

	# Required columns
	# the Pearson column
	my $r_i = $Data->add_column('Pearson_correlation');
	$Data->metadata( $r_i, 'reference', $refDataSet );
	$Data->metadata( $r_i, 'test',      $testDataSet );

	# extra information
	$Data->metadata( $r_i, 'data_database', $data_database ) if $data_database;
	$Data->metadata( $r_i, 'normalization', $norm_method )   if $norm_method;
	if ($radius) {
		$Data->metadata( $r_i, 'radius', $radius );
		$Data->metadata( $r_i, 'reference_point',
			$position == 5 ? '5_prime' : $position == 4 ? 'midpoint' : '3_prime' );
	}

	# Optional columns if we're calculating an optimal shift
	my ( $shift_i, $shiftr_i );
	if ($find_shift) {

		# optimal test-reference shift value
		$shift_i = $Data->add_column('optimal_shift');
		$Data->metadata( $shift_i, 'reference',     $refDataSet );
		$Data->metadata( $shift_i, 'test',          $testDataSet );
		$Data->metadata( $shift_i, 'radius',        $radius );
		$Data->metadata( $shift_i, 'data_database', $data_database ) if $data_database;
		$Data->metadata( $shift_i, 'reference_point',
			$position == 5 ? '5_prime' : $position == 4 ? 'midpoint' : '3_prime' );

		# correlation value at the optimal test-reference shift
		$shiftr_i = $Data->add_column('shift_correlation');
		$Data->metadata( $shiftr_i, 'reference',     $refDataSet );
		$Data->metadata( $shiftr_i, 'test',          $testDataSet );
		$Data->metadata( $shiftr_i, 'radius',        $radius );
		$Data->metadata( $shiftr_i, 'data_database', $data_database ) if $data_database;
		$Data->metadata( $shiftr_i, 'reference_point',
			$position == 5 ? '5_prime' : $position == 4 ? 'midpoint' : '3_prime' );
		$Data->metadata( $shiftr_i, 'normalization', $norm_method ) if $norm_method;
	}

	# P-value column, calculated by ANOVA and reported as -Log10
	my $p_i;
	if ($find_pvalue) {
		$p_i = $Data->add_column('ANOVA_Pval');
		$Data->metadata( $p_i, 'reference', $refDataSet );
		$Data->metadata( $p_i, 'test',      $testDataSet );
		$Data->metadata( $p_i, 'transform', '-Log10' );
	}

	# record the indices
	return ( $r_i, $p_i, $shift_i, $shiftr_i );
}

sub calculate_optimum_shift {

	# Calculate an optimal shift that maximizes the Pearson correlation

	# ... (code elided) ...

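	# scale all values so that the median absolute value becomes 100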
	my $ref_scale_factor = 100 / median( map {abs} values %{$data} );
	map { $data->{$_} = $ref_scale_factor * $data->{$_} } keys %{$data};
}

sub calculate_pearson {

	# references to the arrays of reference and test scores at each position
	my ( $ref, $test ) = @_;

	# calculate correlation
	# 	my $stat = Statistics::LineFit->new();
	# 	$stat->setData($ref, $test) or warn " bad data!\n";
	# 	return $stat->rSquared();
	my $stat = Statistics::Descriptive::Full->new();
	$stat->add_data($ref);
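	# least_squares_fit() fits a line through the paired reference and test values
	# and returns the intercept, slope, Pearson correlation coefficient, and
	# root-mean-square error; only the correlation coefficient is used here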
	my ( $q, $m, $r, $rms ) = $stat->least_squares_fit( @{$test} );
	return $r;
}

sub calculate_ttest {

	# generate the data arrays
	my ( $ref, $test ) = @_;
	my @ref_data;
	my @test_data;

	# load the data arrays
	my $min_value = min( min( keys %{$ref} ), min( keys %{$test} ) );
	my $max_value = max( max( keys %{$ref} ), max( keys %{$test} ) );
	for my $i ( $min_value .. $max_value ) {
		push @ref_data,  $ref->{$i}  || 0;
		push @test_data, $test->{$i} || 0;
	}

	# calculate ANOVA
	my $p_value;
	my $aov = Statistics::ANOVA->new();
	eval {
		$aov->load(
			{
				'test' => \@test_data,
				'ref'  => \@ref_data,
			}
		);
		$aov->anova(
			'independent' => $INDEPENDENCE,
			'parametric'  => $PARAMETRIC,
			'ordinal'     => $ORDINAL,
		);
		$p_value = $aov->{_stat}{p_value};
	};

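	# convert to -log10(P) to match the column's -Log10 transform metadata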
	$p_value = -1 * ( log($p_value) / LOG10 ) if $p_value;
	return $p_value || '.';
}

sub summarize_results {

	# the results from the correlation analysis
	my ( $correlations, $optimal_shifts, $optimal_correlations,
		$count, $not_enough_data, $no_variance )
		= @_;

	# Summary analyses
	printf " Correlated %s features\n", format_with_commas($count);
	printf " Mean Pearson correlation was %.3f  +/- %.3f\n",
		mean( @{$correlations} ), stddevp( @{$correlations} );

	if ($find_shift) {
		printf " Mean absolute optimal shift was %.0f +/- %.0f bp\n",
			mean( map {abs} @{$optimal_shifts} ),
			stddevp( map {abs} @{$optimal_shifts} );
		printf " Mean optimal Pearson correlation was %.3f  +/- %.3f\n",
			mean( @{$optimal_correlations} ), stddevp( @{$optimal_correlations} );
	}
	if ($not_enough_data) {
		printf " %s features did not have enough data points\n",
			format_with_commas($not_enough_data);
	}
	if ($no_variance) {
		printf " %s features had no variance in the reference data points\n",
			format_with_commas($no_variance);
	}
}

sub mean {
	return sum0(@_) / ( scalar @_ || 1 );
}

__END__

=head1 NAME

correlate_position_data.pl

A script to calculate correlations between two datasets along the length of a feature.

=head1 SYNOPSIS

correlate_position_data.pl [--options] <filename>
  
  Options for data files:
  -i --in <filename>               input file: txt bed etc
  -o --out <filename>              optional output file, default overwrites input
  -d --db <name>                   alternate annotation database
  
  Options for data sources:
  -D --ddb <name|file>             data or BigWigSet database
  -r --ref <dataset|filename>      reference data: bw, name, etc
  -t --test <dataset|filename>     test data: bw, name, etc
  
  Options for correlating data:
  --pval                           calculate P-value by ANOVA
  --shift                          determine optimal shift to match datasets
  --radius <integer>               radius in bp around reference point to calculate
  -p --pos [5|m|3]                 reference point to measure correlation (m)
  --norm [rank|sum]                normalization method between datasets
  --force_strand                   force an alternate strand
  
  General options:
  -c --cpu <integer>               number of threads (4)
  -z --gz                          compress output with gz
  -v --version                     print version and exit
  -h --help                        show extended documentation

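For example, to correlate a test dataset against a reference within 500 bp of
feature midpoints and also report the optimal shift, an invocation might look
like this (the file and dataset names are only illustrative):

  correlate_position_data.pl --in features.bed --ref ref.bw --test test.bw \
  --radius 500 --shift --out correlations.txt
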
=head1 OPTIONS

The command line flags and descriptions:

=head2 Options for data files

=over 4

=item --in <filename>

Specify the input file of features. The features may consist of
name and type, or chromosome, start, and stop. Strand information is
optional, but is respected if included. A feature list may be 


