Bio-ViennaNGS
view release on metacpan or search on metacpan
lib/Bio/ViennaNGS/Peak.pm view on Meta::CPAN
use Path::Class;
use List::Util qw(sum sum0 min max first);
use Bio::ViennaNGS::Util qw(sortbed);
use namespace::autoclean;
use File::Temp qw(tempfile);
use version; our $VERSION = version->declare("$Bio::ViennaNGS::VERSION");
has 'data' => (
is => 'ro',
isa => 'HashRef',
predicate => 'has_data',
default => sub { {} },
);
has 'region' => (
is => 'ro',
isa => 'HashRef',
predicate => 'has_region',
default => sub { {} },
);
has 'peaks' => (
is => 'ro',
isa => 'HashRef',
predicate => 'has_peaks',
default => sub { {} },
);
has 'winsize' => (
is => 'ro',
isa => 'Int',
predicate => 'has_winsize',
);
has 'interval' => (
is => 'ro',
isa => 'Int',
predicate => 'has_interval',
);
has 'mincov' => (
is => 'ro',
isa => 'Int',
predicate => 'has_mincov',
);
has 'length' => (
is => 'ro',
isa => 'Int',
predicate => 'has_length',
);
has 'threshold' => (
is => 'ro',
isa => 'Value',
predicate => 'has_threshold',
);
sub populate_data {
my ($self,$filep,$filen) = @_;
my $this_function = (caller(0))[3];
my ($i,$element,$chr,$start,$end,$val,$lastend);
my $have_lastend = 0;
# parse [+] strand
foreach $element (@{$filep->data}){
unless (exists ${$self->region}{$element->chromosome}{pos}{start}){
${$self->region}{$element->chromosome}{pos}{start} = $element->start;
}
if($have_lastend == 1 && $element->start>$lastend){ # fill the gap with 0s
for($i=$lastend;$i<$element->start;$i++){
${$self->data}{$element->chromosome}{pos}[$i] = 0.;
}
}
for ($i=$element->start;$i<$element->end;$i++){
${$self->data}{$element->chromosome}{pos}[$i]=$element->dataValue;
}
$lastend = $element->end;
$have_lastend = 1;
}
$lastend = 0; # reset lastend before parsing [-] strand
$have_lastend = 1; # needed to get the values parsed correctly below
# parse [-] strand
foreach $element (@{$filen->data}){
if ($element->{dataValue} <= 0){
$element->{dataValue} *= -1;
}
unless (exists ${$self->region}{$element->chromosome}{neg}{start}){
${$self->region}{$element->chromosome}{neg}{start} = $element->start;
}
if($have_lastend == 1 && $element->start>$lastend){ # fill the gap with 0s
for($i=$lastend;$i<$element->start;$i++){
${$self->data}{$element->chromosome}{neg}[$i] = 0.;
}
}
for ($i=$element->start;$i<$element->end;$i++){
${$self->data}{$element->chromosome}{neg}[$i]=$element->dataValue;
}
$lastend = $element->end;
$have_lastend = 1;
}
return;
}
sub raw_peaks {
my ($self,$dest,$prefix,$log) = @_;
my ($fn,$tfn,$tfh,$maxidx,$have_pstart,$pstart,$pend,$chr);
my ($winstart, $winend, $winsum, $mean, $lastmax, $from, $to);
my ($index_of_maxwin);
my $suffix = "rawpeaks.bed";
my $this_function = (caller(0))[3];
my $flex = 10;
croak "ERROR [$this_function]: $dest does not exist\n"
unless (-d $dest);
if (defined $log){open(LOG, ">", $log) or croak "$!"}
else {croak "ERROR [$this_function] \$log not defined";}
$fn = $prefix.".".$suffix; # filename of final, sorted RAWPEAKS bed file
(undef,$tfn) = tempfile('RAWPEAKS_XXXXXXX',UNLINK=>0);
croak "ERROR [$this_function]: $self->data not available\n"
unless ($self->has_data);
croak "ERROR [$this_function]: $self->region not available\n"
unless ($self->has_region);
open ($tfh, ">", $tfn) or croak $!;
$lastmax = 0;
foreach $chr (keys (%{$self->data})){
# pos strand
$maxidx = $#{$self->data->{$chr}{pos}} ; # largest index in array
$have_pstart = 0;
$pstart = -1;
$pend = -1;
if (defined $log){print LOG "processing chr $chr [+] strand $maxidx\n"}
croak "$chr has not been parsed in input data"
unless (exists $self->region->{$chr}{pos}{start});
for ($winstart=$self->region->{$chr}{pos}{start};$winstart<($maxidx-$self->winsize);$winstart+=$self->interval){
# NOTE: does not handle the last window properly - > ignore this
$winend = $winstart + $self->winsize - 1;
$winsum = sum0 @{$self->data->{$chr}{pos}}[$winstart..$winend];
$mean = $winsum/$self->winsize;
#print LOG "**processing $winstart - $winend sum=$winsum mean=$mean lastmax=$lastmax\n";
if ($mean > $lastmax){
$lastmax = $mean;
$index_of_maxwin = $winstart;
unless ($have_pstart == 1) {
$pstart = $winstart;
$have_pstart = 1;
#print LOG "Peak start $chr:$winstart -- ";
}
}
else {
if ($mean > 0. && $mean <= $lastmax*$self->threshold ){ # peak stop criterion
$pend = $winend;
$have_pstart = 0;
$from = min($pstart,$pend);
$to = max($pstart, $pend);
next if ($from<0);
next if ($to<0);
# now add +- $flex nt to raw peaks, required fro better peak boundary determination below
if ($from-$flex > 0){$from = $from - $flex;}
$to= $to+$flex; # we dont know the chromsize here, hence override it in case
#print LOG "$pend end\n";
my %pk = (
chr => $chr,
lib/Bio/ViennaNGS/Peak.pm view on Meta::CPAN
for ($winstart=$maxidx;$winstart>($self->region->{$chr}{neg}{start}+$self->winsize);$winstart-=$self->interval){
# NOTE: does not handle the last window properly - > ignore this
$winend = $winstart - $self->winsize + 1;
$winsum = sum0 @{$self->data->{$chr}{neg}}[$winend..$winstart];
$mean = $winsum/$self->winsize;
#print "processing $winstart - $winend sum=$winsum mean=$mean lastmax=$lastmax\n";
if ($mean > $lastmax){
$lastmax = $mean;
$index_of_maxwin = $winend;
unless ($have_pstart == 1) {
$pstart = $winend;
$have_pstart = 1;
#print "Peak start $chr:$pstart -- ";
}
}
else {
if ($mean > 0. && $mean <= $lastmax*$self->threshold ){ # peak stop criterion
$pend = $winstart;
$have_pstart = 0;
$from = min($pstart,$pend);
$to = max($pstart, $pend);
next if ($from<0);
next if ($to<0);
# now add +- $flex nt to raw peaks, required fro better peak boundary determination below
if ($from-$flex > 0){$from = $from - $flex;}
$to = $to+$flex; # we dont know the chromsize here, hence override it in case
#print "$pend end\n";
my %pk = (
chr => $chr,
start => $from,
end => $to,
strand => 'neg',
maxindex => $index_of_maxwin,
);
push (@{ $self->peaks->{$chr} }, \%pk);
if (defined $log){print LOG "**PEAK $chr [-] $from-$to max at $index_of_maxwin\n"}
print $tfh "$chr\t$from\t$to\tRAW\t0\t-\n";
# reset parameters
$lastmax = 0;
$pstart = -1;
$pend = -1;
}
}
} # end for
#print LOG Dumper($self->peaks);
} # end foreach
close($tfh);
close(LOG);
sortbed($tfn,$dest,$fn,0,$log);
unlink($tfn);
}
sub final_peaks {
my ($self,$dest,$prefix,$log) = @_;
my ($fn,$tfn,$tfh,$strand,$max,$idx,$val,$peak,$position,$pleft,$pright,$str);
my $suffix = "candidatepeaks.bed";
my $this_function = (caller(0))[3];
croak "ERROR [$this_function]: $dest does not exist\n"
unless (-d $dest);
if (defined $log){open(LOG, ">>", $log) or croak "$!";}
else {croak "ERROR [$this_function] \$log not defined";}
$fn = $prefix.".".$suffix;
(undef,$tfn) = tempfile('CANDIDATEPEAKS_XXXXXXX',UNLINK=>0);
croak "ERROR [$this_function]: $self->data not available\n"
unless ($self->has_data);
croak "ERROR [$this_function]: $self->peaks not available\n"
unless ($self->has_peaks);
open($tfh, ">", $tfn) or croak $!;
foreach my $chr (keys (%{$self->peaks})){
if (defined $log){print LOG "filtering candidate peaks in $chr\n";}
foreach $peak ( @{$self->peaks->{$chr}}){
my $have_pleft = 0; # we have found a proper left end
my $have_pright = 0; # we have found a proper right end
$strand = $$peak{strand};
if (defined $log){print LOG ">>processing peak $$peak{start}-$$peak{end} [$strand] ";}
$max = max @{$self->data->{$chr}{$strand}}[$$peak{start}..$$peak{end}]; # max peak elevation
if (defined $log){print LOG "max is $max";}
if ($max < $self->mincov){print LOG " ..SKIPPING\n";next;}
$idx = first { @{$self->data->{$chr}{$strand}}[$_] eq $max } $$peak{start}..$$peak{end}; # coordinate
#print LOG Dumper(\$peak);
if (defined $log){print LOG " at $idx ..KEEPING\n";}
# process regions left/right of the maximum to identify peak boundaries
# validity check first
die "maximum at $idx is not within peak region"
if ( $idx<$$peak{start} || $idx>$$peak{end} );
# find left end
for ($position = $idx; $position>$$peak{start}; $position--){
$val = ${$self->data->{$chr}{$strand}}[$position];
if (defined $log){print LOG "** VAL_L at pos $position $val - \$have_pleft=$have_pleft ";}
if ( $val < $max*$self->threshold){
$have_pleft = 1;
if (defined $log){
print LOG "exiting 'cause $val < ".$max*$self->threshold." - \$have_pleft=$have_pleft\n";
}
last;
}
if (defined $log){print LOG "\n";}
}
$pleft = $position;
# find right end
for ($position = $idx; $position<=$$peak{end}; $position++){
$val = ${$self->data->{$chr}{$strand}}[$position];
if (defined $log){print LOG "** VAL_R at pos $position $val - \$have_pright=$have_pright ";}
if ( $val < $max*$self->threshold){
$have_pright = 1;
if (defined $log){
print LOG "exiting 'cause $val < ".$max*$self->threshold."- \$have_pright=$have_pright\n";
( run in 1.952 second using v1.01-cache-2.11-cpan-524268b4103 )