Bio-ToolBox

 view release on metacpan or  search on metacpan

scripts/get_binned_data.pl  view on Meta::CPAN


			# but first, put in null values
			for my $c ( $startcolumn .. ( $Data->last_column ) ) {
				$row->value( $c, '.' );
			}
			next;
		}

		# calculate the actual extension in bp
		# this is determined from the extension_size or the feature length
		# and multiplied by the number of extensions requested
		my $extra;
		if ($extension) {
			if ($extension_size) {

				# extension is specific bp in size
				$extra = $extension_size * $extension;
			}
			else {
				# extension is dependent on feature length
				$extra = int( ( $extension * $binsize * 0.01 * $length ) + 0.5 );
			}
		}

		my $regionscores = $row->get_region_position_scores(
			'ddb'        => $ddb,
			'dataset'    => $dataset,
			'method'     => $method,
			'extend'     => $extra,
			'stranded'   => $stranded,
			'subfeature' => $subfeature,
			'length'     => $length,
		);

		# record the scores for each bin
		record_the_bin_values( $row, $length, $regionscores );
	}
}

sub collect_binned_long_data {
	my ( $binsize, $dataset ) = @_;

	## Collect the data
	my $stream = $Data->row_stream;
	while ( my $row = $stream->next_row ) {

		# identify the feature or use the row
		my $fstart = $row->start;
		my $fstop  = $row->end;
		my $strand = $row->strand;
		my $length = $row->length;   # subfeatures not allowed here, so use feature length

		# collect the scores to the bins in the region
		for my $column ( $startcolumn .. ( $Data->last_column ) ) {

			# we will step through each data column, representing each window (bin)
			# across the feature's region
			# any scores within this window will be collected and the mean
			# value reported

			# convert the window start and stop coordinates (as percentages) to
			# actual bp
			# this depends on whether the binsize is explicitly defined in bp or
			# is a fraction of the feature length
			my ( $start, $stop );
			if ( $Data->metadata( $column, 'bin_size' ) =~ /bp$/ ) {

				# the bin size is explicitly defined

				# the start and stop points are relative to either the feature
				# start (always 0) or the end (the feature length), depending
				# upon whether the 5' or 3' end of the feature

				# determine this by the sign of the start position
				if ( $Data->metadata( $column, 'start' ) < 0 and $strand >= 0 ) {

					# the start position is less than 0, implying the 5' end
					# the reference position will be the feature start on plus strand
					$start = $fstart + $Data->metadata( $column, 'start' );
					$stop  = $fstart + $Data->metadata( $column, 'stop' );
				}
				elsif ( $Data->metadata( $column, 'start' ) < 0 and $strand < 0 ) {

					# the start position is less than 0, implying the 5' end
					# the reference position will be the feature end on minus strand
					$start = $fstop - $Data->metadata( $column, 'start' );
					$stop  = $fstop - $Data->metadata( $column, 'stop' );
				}
				elsif ( $Data->metadata( $column, 'start' ) >= 0 and $strand >= 0 ) {

					# the start position is greather than 0, implying the 3' end
					# the reference position will be the feature start on plus strand
					$start = $fstop + $Data->metadata( $column, 'start' );
					$stop  = $fstop + $Data->metadata( $column, 'stop' );
				}
				elsif ( $Data->metadata( $column, 'start' ) >= 0 and $strand < 0 ) {

					# the start position is greather than 0, implying the 3' end
					# the reference position will be the feature end on minus strand
					$start = $fstart - $Data->metadata( $column, 'start' );
					$stop  = $fstart - $Data->metadata( $column, 'stop' );
				}
				else {
					warn " unable to unable to identify region orientation: start "
						. $Data->metadata( $column, 'start' )
						. ", strand $strand\n";
					return;
				}
			}
			else {
				# otherwise the bin size is based on feature length
				if ( $strand >= 0 ) {

					# forward plus strand
					$start =
						int( $fstart +
							( $Data->metadata( $column, 'start' ) * 0.01 * $length ) +
							0.5 );
					$stop =
						int( $fstart +
							( $Data->metadata( $column, 'stop' ) * 0.01 * $length ) - 1 +
							0.5 );
				}
				else {
					# reverse minus strand
					$start =
						int( $fstop -
							( $Data->metadata( $column, 'start' ) * 0.01 * $length ) +
							0.5 );
					$stop =
						int( $fstop -
							( $Data->metadata( $column, 'stop' ) * 0.01 * $length ) + 1 +
							0.5 );
				}
			}

			# collect the data for this bin
			my $score = $row->get_score(
				'ddb'      => $ddb,
				'dataset'  => $dataset,
				'start'    => $start,
				'stop'     => $stop,
				'method'   => $method,
				'stranded' => $stranded,
			);
			if ( $formatter and looks_like_number($score) ) {
				$score = sprintf( $formatter, $score );
			}
			$row->value( $column, $score );
		}
	}
}

sub record_the_bin_values {

	# get the passed values
	my ( $row, $length, $regionscores ) = @_;

	# assign the scores to the bins in the region
	for my $column ( $startcolumn .. ( $Data->last_column ) ) {

		# we will step through each data column, representing each window (bin)
		# across the feature's region
		# any scores within this window will be collected and the mean
		# value reported

		# record nulls if no data returned
		unless ( scalar keys %{$regionscores} ) {
			$row->value( $column, calculate_score( $method, undef ) );
			next;
		}

		# convert the window start and stop coordinates (as percentages) to
		# actual bp
		# this depends on whether the binsize is explicitly defined in bp or
		# is a fraction of the feature length
		my ( $start, $stop );
		if ( $Data->metadata( $column, 'bin_size' ) =~ /bp$/ ) {

			# the bin size is explicitly defined

			# the start and stop points are relative to either the feature
			# start (always 0) or the end (the feature length), depending
			# upon whether the 5' or 3' end of the feature

			# determine this by the sign of the start position
			if ( $Data->metadata( $column, 'start' ) < 0 ) {

				# the start position is less than 0, implying the 5' end
				# the reference position will be the feature start, or 0
				$start = $Data->metadata( $column, 'start' );
				$stop  = $Data->metadata( $column, 'stop' );
			}
			else {
				# the start position is greather than 0, implying the 3' end
				# the reference position will be the feature end, or length
				$start = $Data->metadata( $column, 'start' ) + $length;
				$stop  = $Data->metadata( $column, 'stop' ) + $length;
			}
		}
		else {
			# otherwise the bin size is based on feature length
			$start = sprintf "%.0f",
				( $Data->metadata( $column, 'start' ) * 0.01 * $length ) + 1;
			$stop = sprintf "%.0f",
				( $Data->metadata( $column, 'stop' ) * 0.01 * $length );
		}

		# collect the scores for this window
		my @scores = map { $regionscores->{$_} }
			grep { $_ >= $start and $_ <= $stop }
			keys %{$regionscores};

		# calculate the value
		my $window_score = calculate_score( $method, \@scores );
		if ( $formatter and looks_like_number($window_score) ) {
			$window_score = sprintf( $formatter, $window_score );
		}

		# record the value
		$row->value( $column, $window_score );
	}
}

## Interpolate the '.' values with the mean of the neighbors
sub go_interpolate_values {

	# determine counts
	my $lastwindow = $Data->last_column;

	# lastwindow is the index of the last column

	# walk through each data line and then each window
	my $stream = $Data->row_stream;
	while ( my $row = $stream->next_row ) {
		my $col = $startcolumn + 1;
		while ( $col < $lastwindow ) {

			# walk through the windows of a data row
			# skipping the very first and last windows (columns)
			# we will look for null values
			# if one is found, interpolate from neighbors
			if ( $row->value($col) eq '.' and $row->value( $col - 1 ) ne '.' ) {

				# we will interpolate the value from the neighbors
				# first, need to check that the neighbors have values

				# find the next real value
				my $next_i;
				for ( my $i = $col + 1; $col <= $lastwindow; $i++ ) {
					if ( $row->value($i) ne '.' ) {
						$next_i = $i;
						last;
					}
				}
				next unless defined $next_i;

				# determine fractional value
				my $initial = $row->value( $col - 1 );
				my $fraction =
					( $row->value($next_i) - $initial ) / ( $next_i - $col + 1 );

				# apply fractional values
				for ( my $n = $col; $n < $next_i; $n++ ) {
					my $score = $initial + ( $fraction * ( $n - $col + 1 ) );
					if ( $formatter and looks_like_number($score) ) {
						$score = sprintf( $formatter, $score );
					}
					$row->value( $n, $score );
				}

				# jump ahead
				$col = $next_i;
			}
			$col++;
		}
	}
}

### Prepare all of the bin columns and their metadata
sub prepare_bins {

	my ( $binsize, $dataset ) = @_;

	# the size of the bin in percentage units, default would be 10%
	# each bin will be titled the starting and ending point for that bin in
	# percentage units
	# for example, -20..-10,-10..0,0..10,10..20

	# if $extension is defined, then it will add the appropriate flanking bins,
	# otherwise it should skip them

	# bin(s) on 5' flank
	if ($extension) {

		if ($extension_size) {

			# extended bins should be of specific bp size
			for ( my $i = $extension; $i > 0; $i-- ) {
				my $start = 0 - ( $extension_size * $i );
				my $stop  = 0 - ( $extension_size * ( $i - 1 ) );
				_set_metadata( $start, $stop, $extension_size, 'bp', $dataset );
			}
		}
		else {
			# extended bin size will be based on feature length
			for ( my $i = $extension; $i > 0; $i-- ) {
				my $start = 0 - ( $binsize * $i );
				my $stop  = 0 - ( $binsize * ( $i - 1 ) );
				_set_metadata( $start, $stop, $binsize, '%', $dataset );
			}
		}
	}

	# bins over the gene body
	for ( my $i = 0; $i < $bins; $i++ ) {
		my $start = ( $i * $binsize );
		my $stop  = ( $i + 1 ) * $binsize;
		_set_metadata( $start, $stop, $binsize, '%', $dataset );
	}

	# bin(s) on 3' flank
	if ($extension) {

		if ($extension_size) {

			# extended bins should be of specific bp size
			for ( my $i = 0; $i < $extension; $i++ ) {
				my $start = ( $extension_size * $i );
				my $stop  = ( $extension_size * ( $i + 1 ) );
				_set_metadata( $start, $stop, $extension_size, 'bp', $dataset );
			}
		}
		else {
			# extended bin size will be based on feature length
			for ( my $i = 0; $i < $extension; $i++ ) {
				my $start = 100 + ( $binsize * $i );
				my $stop  = 100 + ( $binsize * ( $i + 1 ) );
				_set_metadata( $start, $stop, $binsize, '%', $dataset );
			}
		}
	}
}

### Set the metadata for a new data table column (dataset)
sub _set_metadata {

scripts/get_binned_data.pl  view on Meta::CPAN


Counts unique names. Useful when spliced alignments overlap more 
than one exon and you want to avoid double-counting.

=back

=item --strand [all|sense|antisense]

Specify whether stranded data should be collected. Three values are 
allowed: all datasets should be collected (default), only sense 
datasets, or only antisense datasets should be collected.

=item --subfeature [ exon | cds | 5p_utr | 3p_utr ]

Optionally specify the type of subfeature to collect from, rather than 
the entire gene. If the parent feature is gene and the subfeature is exon, 
then all transcripts of the gene will be collapsed. The other subfeatures 
(cds, 5p_utr, and 3p_utr) will not work with gene features but only with 
coding mRNA transcripts. Note that the long option is incompatible. 
Default is null. 

=item --exons

Legacy option for specifying --subfeature exon.

=item --long

Indicate that data should be collected independently for each long 
window. This may be enabled automatically if the sum of the entire 
window length passes a predefined threshold. The default for 'short' 
windows is to collect all of the point data from the dataset first, and 
then divide the results into the different windows. Datasets consisting 
of "long" features, for example long alignments, may be counted more 
than once in long mode when they span multiple windows. Not compatible 
when subfeatures are enabled.

=item --format E<lt>integerE<gt>

Specify the number of decimal positions to format the collected scores. 
Default is not to format, often leading to more than the intended 
significant digits.

=item --mapq E<lt>integer<gt>

Specify the minimum mapping quality of alignments to be considered when
counting from a Bam file. Default is 0, which will include all alignments,
including multi-mapping (typically MAPQ of 0). Set to an integer in range
of 0..255. Only affects count methods, including C<count>, C<ncount>, and
C<pcount>. Other methods involving coverage, e.g. C<mean>, do not filter
alignments.

=back

=head2 Bin specification

=over 4

=item --bins E<lt>integerE<gt>

Specify the number of bins that will be generated over the length 
of the feature. The size of the feature is a percentage of the 
feature length. The default number is 10, which results in bins of 
size equal to 10% of the feature length. 

=item --ext E<lt>integerE<gt>

Specify the number of extended bins on either side of the feature. 
The bins are of the same size as determined by the feature 
length and the --bins value. The default is 0. 

=item --extsize E<lt>integerE<gt>

Specify the exact bin size in bp of the extended bins rather than
using a percentage of feature of length.

=item --min E<lt>integerE<gt>

Specify the minimum feature size to be averaged. Features with a
length below this value will not be skipped (all bins will have
null values). This is to avoid having bin sizes below the average 
microarray tiling distance. The default is undefined (no limit).

=back

=head2 Post-processing

=over 4

=item --sum

Indicate that the data should be averaged across all features at
each position, suitable for graphing. A separate text file will be
written with the suffix '_summed' with the averaged data. The default 
is false.

=item --smooth

Indicate that windows without values should (not) be interpolated
from neighboring values. The default is false.

=back

=head2 General options

=over 4

=item --groups

Optionally write a secondary file with the list of column group names and 
their corresponding dataset group. This can be used to assist in designating 
the metadata when plotting files, for example in R with pheatmap. The 
file is named the output basename appended with F<.col_groups.txt>.

=item --gz

Specify whether (or not) the output file should be compressed with gzip.

=item --cpu E<lt>integerE<gt>

Specify the number of CPU cores to execute in parallel. This requires 
the installation of Parallel::ForkManager. With support enabled, the 
default is 4. Disable multi-threaded execution by setting to 1. 

=item --noparse

Prevent input annotation files from being automatically parsed into sequence 
features. Coordinates will be used as is and new data columns will be appended 
to the input file. 

=item --version

Print the version number.

=item --help

This help text.

=back

=head1 DESCRIPTION

This program will collect data across a gene or feature body into numerous 
percentile bins. It is used to determine if there is a spatial 
distribution preference for the dataset over gene bodies. The number 
of bins may be specified as a command argument (default 10). Additionally, 
extra bins may be extended on either side of the gene (default 0 on either 
side). The bin size is determined as a percentage of gene length.

=head1 EXAMPLES

These are some examples of some common scenarios for collecting data.

=over 4

=item Collect scores in intervals

You want to collect the mean score from a bigWig file in 10% intervals 
across each feature in a Bed file.

  get_binned_data.pl --data scores.bw --in input.bed

=item Collect scores in intervals plus extended regions

You want to collect the maximum score in 5% intervals across each each 
feature as well as five 100 bp intervals outside of each interval.

  get_binned_data.pl --bins 20 --method max --ext 5 --extsize 100 --data \
  scores.bw --in input.txt

=item Collect scores in intervals for genes

You want to collect stranded alignment counts from a Bam file for genes 
in an annotation database.

  get_binned_data.pl --db annotation --feature gene --strand sense \
  --method count --data alignments.bam --out gene_profile --sum
  
=back

=head1 AUTHOR

 Timothy J. Parnell, PhD
 Howard Hughes Medical Institute
 Dept of Oncological Sciences
 Huntsman Cancer Institute
 University of Utah
 Salt Lake City, UT, 84112

This package is free software; you can redistribute it and/or modify
it under the terms of the Artistic License 2.0.  



( run in 1.358 second using v1.01-cache-2.11-cpan-39bf76dae61 )