Bio-ToolBox
view release on metacpan or search on metacpan
scripts/get_binned_data.pl view on Meta::CPAN
# but first, put in null values
for my $c ( $startcolumn .. ( $Data->last_column ) ) {
$row->value( $c, '.' );
}
next;
}
# calculate the actual extension in bp
# this is determined from the extension_size or the feature length
# and multiplied by the number of extensions requested
my $extra;
if ($extension) {
if ($extension_size) {
# extension is specific bp in size
$extra = $extension_size * $extension;
}
else {
# extension is dependent on feature length
$extra = int( ( $extension * $binsize * 0.01 * $length ) + 0.5 );
}
}
my $regionscores = $row->get_region_position_scores(
'ddb' => $ddb,
'dataset' => $dataset,
'method' => $method,
'extend' => $extra,
'stranded' => $stranded,
'subfeature' => $subfeature,
'length' => $length,
);
# record the scores for each bin
record_the_bin_values( $row, $length, $regionscores );
}
}
sub collect_binned_long_data {
my ( $binsize, $dataset ) = @_;
## Collect the data
my $stream = $Data->row_stream;
while ( my $row = $stream->next_row ) {
# identify the feature or use the row
my $fstart = $row->start;
my $fstop = $row->end;
my $strand = $row->strand;
my $length = $row->length; # subfeatures not allowed here, so use feature length
# collect the scores to the bins in the region
for my $column ( $startcolumn .. ( $Data->last_column ) ) {
# we will step through each data column, representing each window (bin)
# across the feature's region
# any scores within this window will be collected and the mean
# value reported
# convert the window start and stop coordinates (as percentages) to
# actual bp
# this depends on whether the binsize is explicitly defined in bp or
# is a fraction of the feature length
my ( $start, $stop );
if ( $Data->metadata( $column, 'bin_size' ) =~ /bp$/ ) {
# the bin size is explicitly defined
# the start and stop points are relative to either the feature
# start (always 0) or the end (the feature length), depending
# upon whether the 5' or 3' end of the feature
# determine this by the sign of the start position
if ( $Data->metadata( $column, 'start' ) < 0 and $strand >= 0 ) {
# the start position is less than 0, implying the 5' end
# the reference position will be the feature start on plus strand
$start = $fstart + $Data->metadata( $column, 'start' );
$stop = $fstart + $Data->metadata( $column, 'stop' );
}
elsif ( $Data->metadata( $column, 'start' ) < 0 and $strand < 0 ) {
# the start position is less than 0, implying the 5' end
# the reference position will be the feature end on minus strand
$start = $fstop - $Data->metadata( $column, 'start' );
$stop = $fstop - $Data->metadata( $column, 'stop' );
}
elsif ( $Data->metadata( $column, 'start' ) >= 0 and $strand >= 0 ) {
# the start position is greather than 0, implying the 3' end
# the reference position will be the feature start on plus strand
$start = $fstop + $Data->metadata( $column, 'start' );
$stop = $fstop + $Data->metadata( $column, 'stop' );
}
elsif ( $Data->metadata( $column, 'start' ) >= 0 and $strand < 0 ) {
# the start position is greather than 0, implying the 3' end
# the reference position will be the feature end on minus strand
$start = $fstart - $Data->metadata( $column, 'start' );
$stop = $fstart - $Data->metadata( $column, 'stop' );
}
else {
warn " unable to unable to identify region orientation: start "
. $Data->metadata( $column, 'start' )
. ", strand $strand\n";
return;
}
}
else {
# otherwise the bin size is based on feature length
if ( $strand >= 0 ) {
# forward plus strand
$start =
int( $fstart +
( $Data->metadata( $column, 'start' ) * 0.01 * $length ) +
0.5 );
$stop =
int( $fstart +
( $Data->metadata( $column, 'stop' ) * 0.01 * $length ) - 1 +
0.5 );
}
else {
# reverse minus strand
$start =
int( $fstop -
( $Data->metadata( $column, 'start' ) * 0.01 * $length ) +
0.5 );
$stop =
int( $fstop -
( $Data->metadata( $column, 'stop' ) * 0.01 * $length ) + 1 +
0.5 );
}
}
# collect the data for this bin
my $score = $row->get_score(
'ddb' => $ddb,
'dataset' => $dataset,
'start' => $start,
'stop' => $stop,
'method' => $method,
'stranded' => $stranded,
);
if ( $formatter and looks_like_number($score) ) {
$score = sprintf( $formatter, $score );
}
$row->value( $column, $score );
}
}
}
sub record_the_bin_values {
# get the passed values
my ( $row, $length, $regionscores ) = @_;
# assign the scores to the bins in the region
for my $column ( $startcolumn .. ( $Data->last_column ) ) {
# we will step through each data column, representing each window (bin)
# across the feature's region
# any scores within this window will be collected and the mean
# value reported
# record nulls if no data returned
unless ( scalar keys %{$regionscores} ) {
$row->value( $column, calculate_score( $method, undef ) );
next;
}
# convert the window start and stop coordinates (as percentages) to
# actual bp
# this depends on whether the binsize is explicitly defined in bp or
# is a fraction of the feature length
my ( $start, $stop );
if ( $Data->metadata( $column, 'bin_size' ) =~ /bp$/ ) {
# the bin size is explicitly defined
# the start and stop points are relative to either the feature
# start (always 0) or the end (the feature length), depending
# upon whether the 5' or 3' end of the feature
# determine this by the sign of the start position
if ( $Data->metadata( $column, 'start' ) < 0 ) {
# the start position is less than 0, implying the 5' end
# the reference position will be the feature start, or 0
$start = $Data->metadata( $column, 'start' );
$stop = $Data->metadata( $column, 'stop' );
}
else {
# the start position is greather than 0, implying the 3' end
# the reference position will be the feature end, or length
$start = $Data->metadata( $column, 'start' ) + $length;
$stop = $Data->metadata( $column, 'stop' ) + $length;
}
}
else {
# otherwise the bin size is based on feature length
$start = sprintf "%.0f",
( $Data->metadata( $column, 'start' ) * 0.01 * $length ) + 1;
$stop = sprintf "%.0f",
( $Data->metadata( $column, 'stop' ) * 0.01 * $length );
}
# collect the scores for this window
my @scores = map { $regionscores->{$_} }
grep { $_ >= $start and $_ <= $stop }
keys %{$regionscores};
# calculate the value
my $window_score = calculate_score( $method, \@scores );
if ( $formatter and looks_like_number($window_score) ) {
$window_score = sprintf( $formatter, $window_score );
}
# record the value
$row->value( $column, $window_score );
}
}
## Interpolate the '.' values with the mean of the neighbors
sub go_interpolate_values {
# determine counts
my $lastwindow = $Data->last_column;
# lastwindow is the index of the last column
# walk through each data line and then each window
my $stream = $Data->row_stream;
while ( my $row = $stream->next_row ) {
my $col = $startcolumn + 1;
while ( $col < $lastwindow ) {
# walk through the windows of a data row
# skipping the very first and last windows (columns)
# we will look for null values
# if one is found, interpolate from neighbors
if ( $row->value($col) eq '.' and $row->value( $col - 1 ) ne '.' ) {
# we will interpolate the value from the neighbors
# first, need to check that the neighbors have values
# find the next real value
my $next_i;
for ( my $i = $col + 1; $col <= $lastwindow; $i++ ) {
if ( $row->value($i) ne '.' ) {
$next_i = $i;
last;
}
}
next unless defined $next_i;
# determine fractional value
my $initial = $row->value( $col - 1 );
my $fraction =
( $row->value($next_i) - $initial ) / ( $next_i - $col + 1 );
# apply fractional values
for ( my $n = $col; $n < $next_i; $n++ ) {
my $score = $initial + ( $fraction * ( $n - $col + 1 ) );
if ( $formatter and looks_like_number($score) ) {
$score = sprintf( $formatter, $score );
}
$row->value( $n, $score );
}
# jump ahead
$col = $next_i;
}
$col++;
}
}
}
### Prepare all of the bin columns and their metadata
sub prepare_bins {
my ( $binsize, $dataset ) = @_;
# the size of the bin in percentage units, default would be 10%
# each bin will be titled the starting and ending point for that bin in
# percentage units
# for example, -20..-10,-10..0,0..10,10..20
# if $extension is defined, then it will add the appropriate flanking bins,
# otherwise it should skip them
# bin(s) on 5' flank
if ($extension) {
if ($extension_size) {
# extended bins should be of specific bp size
for ( my $i = $extension; $i > 0; $i-- ) {
my $start = 0 - ( $extension_size * $i );
my $stop = 0 - ( $extension_size * ( $i - 1 ) );
_set_metadata( $start, $stop, $extension_size, 'bp', $dataset );
}
}
else {
# extended bin size will be based on feature length
for ( my $i = $extension; $i > 0; $i-- ) {
my $start = 0 - ( $binsize * $i );
my $stop = 0 - ( $binsize * ( $i - 1 ) );
_set_metadata( $start, $stop, $binsize, '%', $dataset );
}
}
}
# bins over the gene body
for ( my $i = 0; $i < $bins; $i++ ) {
my $start = ( $i * $binsize );
my $stop = ( $i + 1 ) * $binsize;
_set_metadata( $start, $stop, $binsize, '%', $dataset );
}
# bin(s) on 3' flank
if ($extension) {
if ($extension_size) {
# extended bins should be of specific bp size
for ( my $i = 0; $i < $extension; $i++ ) {
my $start = ( $extension_size * $i );
my $stop = ( $extension_size * ( $i + 1 ) );
_set_metadata( $start, $stop, $extension_size, 'bp', $dataset );
}
}
else {
# extended bin size will be based on feature length
for ( my $i = 0; $i < $extension; $i++ ) {
my $start = 100 + ( $binsize * $i );
my $stop = 100 + ( $binsize * ( $i + 1 ) );
_set_metadata( $start, $stop, $binsize, '%', $dataset );
}
}
}
}
### Set the metadata for a new data table column (dataset)
sub _set_metadata {
scripts/get_binned_data.pl view on Meta::CPAN
Counts unique names. Useful when spliced alignments overlap more
than one exon and you want to avoid double-counting.
=back
=item --strand [all|sense|antisense]
Specify whether stranded data should be collected. Three values are
allowed: all datasets should be collected (default), only sense
datasets, or only antisense datasets should be collected.
=item --subfeature [ exon | cds | 5p_utr | 3p_utr ]
Optionally specify the type of subfeature to collect from, rather than
the entire gene. If the parent feature is gene and the subfeature is exon,
then all transcripts of the gene will be collapsed. The other subfeatures
(cds, 5p_utr, and 3p_utr) will not work with gene features but only with
coding mRNA transcripts. Note that the long option is incompatible.
Default is null.
=item --exons
Legacy option for specifying --subfeature exon.
=item --long
Indicate that data should be collected independently for each long
window. This may be enabled automatically if the sum of the entire
window length passes a predefined threshold. The default for 'short'
windows is to collect all of the point data from the dataset first, and
then divide the results into the different windows. Datasets consisting
of "long" features, for example long alignments, may be counted more
than once in long mode when they span multiple windows. Not compatible
when subfeatures are enabled.
=item --format E<lt>integerE<gt>
Specify the number of decimal positions to format the collected scores.
Default is not to format, often leading to more than the intended
significant digits.
=item --mapq E<lt>integer<gt>
Specify the minimum mapping quality of alignments to be considered when
counting from a Bam file. Default is 0, which will include all alignments,
including multi-mapping (typically MAPQ of 0). Set to an integer in range
of 0..255. Only affects count methods, including C<count>, C<ncount>, and
C<pcount>. Other methods involving coverage, e.g. C<mean>, do not filter
alignments.
=back
=head2 Bin specification
=over 4
=item --bins E<lt>integerE<gt>
Specify the number of bins that will be generated over the length
of the feature. The size of the feature is a percentage of the
feature length. The default number is 10, which results in bins of
size equal to 10% of the feature length.
=item --ext E<lt>integerE<gt>
Specify the number of extended bins on either side of the feature.
The bins are of the same size as determined by the feature
length and the --bins value. The default is 0.
=item --extsize E<lt>integerE<gt>
Specify the exact bin size in bp of the extended bins rather than
using a percentage of feature of length.
=item --min E<lt>integerE<gt>
Specify the minimum feature size to be averaged. Features with a
length below this value will not be skipped (all bins will have
null values). This is to avoid having bin sizes below the average
microarray tiling distance. The default is undefined (no limit).
=back
=head2 Post-processing
=over 4
=item --sum
Indicate that the data should be averaged across all features at
each position, suitable for graphing. A separate text file will be
written with the suffix '_summed' with the averaged data. The default
is false.
=item --smooth
Indicate that windows without values should (not) be interpolated
from neighboring values. The default is false.
=back
=head2 General options
=over 4
=item --groups
Optionally write a secondary file with the list of column group names and
their corresponding dataset group. This can be used to assist in designating
the metadata when plotting files, for example in R with pheatmap. The
file is named the output basename appended with F<.col_groups.txt>.
=item --gz
Specify whether (or not) the output file should be compressed with gzip.
=item --cpu E<lt>integerE<gt>
Specify the number of CPU cores to execute in parallel. This requires
the installation of Parallel::ForkManager. With support enabled, the
default is 4. Disable multi-threaded execution by setting to 1.
=item --noparse
Prevent input annotation files from being automatically parsed into sequence
features. Coordinates will be used as is and new data columns will be appended
to the input file.
=item --version
Print the version number.
=item --help
This help text.
=back
=head1 DESCRIPTION
This program will collect data across a gene or feature body into numerous
percentile bins. It is used to determine if there is a spatial
distribution preference for the dataset over gene bodies. The number
of bins may be specified as a command argument (default 10). Additionally,
extra bins may be extended on either side of the gene (default 0 on either
side). The bin size is determined as a percentage of gene length.
=head1 EXAMPLES
These are some examples of some common scenarios for collecting data.
=over 4
=item Collect scores in intervals
You want to collect the mean score from a bigWig file in 10% intervals
across each feature in a Bed file.
get_binned_data.pl --data scores.bw --in input.bed
=item Collect scores in intervals plus extended regions
You want to collect the maximum score in 5% intervals across each each
feature as well as five 100 bp intervals outside of each interval.
get_binned_data.pl --bins 20 --method max --ext 5 --extsize 100 --data \
scores.bw --in input.txt
=item Collect scores in intervals for genes
You want to collect stranded alignment counts from a Bam file for genes
in an annotation database.
get_binned_data.pl --db annotation --feature gene --strand sense \
--method count --data alignments.bam --out gene_profile --sum
=back
=head1 AUTHOR
Timothy J. Parnell, PhD
Howard Hughes Medical Institute
Dept of Oncological Sciences
Huntsman Cancer Institute
University of Utah
Salt Lake City, UT, 84112
This package is free software; you can redistribute it and/or modify
it under the terms of the Artistic License 2.0.
( run in 1.358 second using v1.01-cache-2.11-cpan-39bf76dae61 )