Bio-ViennaNGS

 view release on metacpan or  search on metacpan

lib/Bio/ViennaNGS/Peak.pm  view on Meta::CPAN

use Path::Class;
use List::Util qw(sum sum0 min max first);
use Bio::ViennaNGS::Util qw(sortbed);
use namespace::autoclean;
use File::Temp qw(tempfile);
use version; our $VERSION = version->declare("$Bio::ViennaNGS::VERSION");

has 'data' => (
	       is => 'ro',
	       isa => 'HashRef',
	       predicate => 'has_data',
	       default => sub { {} },
	      );

has 'region' => (
		 is => 'ro',
		 isa => 'HashRef',
		 predicate => 'has_region',
		 default => sub { {} },
		);

has 'peaks' => (
		 is => 'ro',
		 isa => 'HashRef',
		 predicate => 'has_peaks',
		 default => sub { {} },
	       );

has 'winsize' => (
		  is => 'ro',
		  isa => 'Int',
		  predicate => 'has_winsize',
		 );

has 'interval' => (
		   is => 'ro',
		   isa => 'Int',
		   predicate => 'has_interval',
		  );

has 'mincov' => (
		 is => 'ro',
		 isa => 'Int',
		 predicate => 'has_mincov',
		);

has 'length' => (
		 is => 'ro',
		 isa => 'Int',
		 predicate => 'has_length',
		);

has 'threshold' => (
		 is => 'ro',
		 isa => 'Value',
		 predicate => 'has_threshold',
		);

sub populate_data {
  my ($self,$filep,$filen) = @_;
  my $this_function = (caller(0))[3];
  my ($i,$element,$chr,$start,$end,$val,$lastend);
  my $have_lastend = 0;

  # parse [+] strand
  foreach $element (@{$filep->data}){
    unless (exists ${$self->region}{$element->chromosome}{pos}{start}){
      ${$self->region}{$element->chromosome}{pos}{start} = $element->start;
    }
    if($have_lastend == 1 && $element->start>$lastend){ # fill the gap with 0s
      for($i=$lastend;$i<$element->start;$i++){
	${$self->data}{$element->chromosome}{pos}[$i] = 0.;
      }
    }
    for ($i=$element->start;$i<$element->end;$i++){
      ${$self->data}{$element->chromosome}{pos}[$i]=$element->dataValue;
    }
    $lastend = $element->end;
    $have_lastend = 1;
  }

  $lastend = 0; # reset lastend before parsing [-] strand
  $have_lastend = 1; # needed to get the values parsed correctly below

  # parse [-] strand
  foreach $element (@{$filen->data}){
    if ($element->{dataValue} <= 0){
      $element->{dataValue} *= -1;
    }
    unless (exists ${$self->region}{$element->chromosome}{neg}{start}){
      ${$self->region}{$element->chromosome}{neg}{start} = $element->start;
    }
    if($have_lastend == 1 && $element->start>$lastend){ # fill the gap with 0s
      for($i=$lastend;$i<$element->start;$i++){
	${$self->data}{$element->chromosome}{neg}[$i] = 0.;
      }
    }
    for ($i=$element->start;$i<$element->end;$i++){
      ${$self->data}{$element->chromosome}{neg}[$i]=$element->dataValue;
    }
    $lastend = $element->end;
    $have_lastend = 1;
  }
  return;
}

sub raw_peaks {
  my ($self,$dest,$prefix,$log) = @_;
  my ($fn,$tfn,$tfh,$maxidx,$have_pstart,$pstart,$pend,$chr);
  my ($winstart, $winend, $winsum, $mean, $lastmax, $from, $to);
  my ($index_of_maxwin);
  my $suffix = "rawpeaks.bed";
  my $this_function = (caller(0))[3];
  my $flex = 10;

  croak "ERROR [$this_function]: $dest does not exist\n"
    unless (-d $dest);

  if (defined $log){open(LOG, ">", $log) or croak "$!"}
  else {croak "ERROR [$this_function] \$log not defined";}

  $fn = $prefix.".".$suffix;   # filename of final, sorted RAWPEAKS bed file
  (undef,$tfn) = tempfile('RAWPEAKS_XXXXXXX',UNLINK=>0);

  croak "ERROR [$this_function]: $self->data not available\n"
    unless ($self->has_data);

  croak "ERROR [$this_function]: $self->region not available\n"
    unless ($self->has_region);

  open ($tfh, ">", $tfn) or croak $!;
  $lastmax = 0;
  foreach $chr (keys (%{$self->data})){

    # pos strand
    $maxidx      = $#{$self->data->{$chr}{pos}} ; # largest index in array
    $have_pstart = 0;
    $pstart      = -1;
    $pend        = -1;
    if (defined $log){print LOG "processing chr $chr [+] strand $maxidx\n"}
    croak "$chr has not been parsed in input data"
      unless (exists $self->region->{$chr}{pos}{start});

    for ($winstart=$self->region->{$chr}{pos}{start};$winstart<($maxidx-$self->winsize);$winstart+=$self->interval){
      # NOTE: does not handle the last window properly - > ignore this
      $winend = $winstart + $self->winsize - 1;
      $winsum = sum0 @{$self->data->{$chr}{pos}}[$winstart..$winend];
      $mean = $winsum/$self->winsize;
       #print LOG "**processing $winstart - $winend sum=$winsum mean=$mean lastmax=$lastmax\n";

      if ($mean > $lastmax){
	$lastmax = $mean;
	$index_of_maxwin = $winstart;
	unless ($have_pstart == 1) {
	  $pstart = $winstart;
	  $have_pstart = 1;
	  #print LOG "Peak start $chr:$winstart -- ";
	}
      }
      else {
	if ($mean > 0. && $mean <= $lastmax*$self->threshold ){ # peak stop criterion
	  $pend = $winend;
	  $have_pstart = 0;
	  $from = min($pstart,$pend);
	  $to   = max($pstart, $pend);
	  next if ($from<0);
	  next if ($to<0);
	  # now add +- $flex nt to raw peaks, required fro better peak boundary determination below
	  if ($from-$flex > 0){$from = $from - $flex;}
	  $to= $to+$flex; # we dont know the chromsize here, hence override it in case
	  #print LOG "$pend end\n";
	  my %pk = (
		    chr      => $chr,

lib/Bio/ViennaNGS/Peak.pm  view on Meta::CPAN

    for ($winstart=$maxidx;$winstart>($self->region->{$chr}{neg}{start}+$self->winsize);$winstart-=$self->interval){
      # NOTE: does not handle the last window properly - > ignore this
      $winend = $winstart - $self->winsize + 1;
      $winsum = sum0 @{$self->data->{$chr}{neg}}[$winend..$winstart];
      $mean = $winsum/$self->winsize;
      #print "processing $winstart - $winend sum=$winsum mean=$mean lastmax=$lastmax\n";

      if ($mean > $lastmax){
	$lastmax = $mean;
	$index_of_maxwin = $winend;
	unless ($have_pstart == 1) {
	  $pstart = $winend;
	  $have_pstart = 1;
	  #print "Peak start $chr:$pstart -- ";
	}
      }
      else {
	if ($mean > 0. && $mean <= $lastmax*$self->threshold ){ # peak stop criterion
	  $pend = $winstart;
	  $have_pstart = 0;
	  $from = min($pstart,$pend);
	  $to   = max($pstart, $pend);
	  next if ($from<0);
	  next if ($to<0);
	  # now add +- $flex nt to raw peaks, required fro better peak boundary determination below
	  if ($from-$flex > 0){$from = $from - $flex;}
	  $to = $to+$flex; # we dont know the chromsize here, hence override it in case
	  #print "$pend end\n";
	  my %pk = (
		    chr    => $chr,
		    start  => $from,
		    end    => $to,
		    strand => 'neg',
		    maxindex => $index_of_maxwin,
		   );
	  push (@{ $self->peaks->{$chr} }, \%pk);
	  if (defined $log){print LOG "**PEAK $chr [-] $from-$to max at $index_of_maxwin\n"}
	  print $tfh "$chr\t$from\t$to\tRAW\t0\t-\n";

	  # reset parameters
	  $lastmax = 0;
	  $pstart = -1;
	  $pend   = -1;
	}
      }
    } # end for

    #print LOG Dumper($self->peaks);

  } # end foreach
  close($tfh);
  close(LOG);
  sortbed($tfn,$dest,$fn,0,$log);
  unlink($tfn);
}

sub final_peaks {
  my ($self,$dest,$prefix,$log) = @_;
  my ($fn,$tfn,$tfh,$strand,$max,$idx,$val,$peak,$position,$pleft,$pright,$str);
  my $suffix = "candidatepeaks.bed";
  my $this_function = (caller(0))[3];

  croak "ERROR [$this_function]: $dest does not exist\n"
    unless (-d $dest);

  if (defined $log){open(LOG, ">>", $log) or croak "$!";}
  else {croak "ERROR [$this_function] \$log not defined";}

  $fn   = $prefix.".".$suffix;
  (undef,$tfn) = tempfile('CANDIDATEPEAKS_XXXXXXX',UNLINK=>0);

  croak "ERROR [$this_function]: $self->data not available\n"
    unless ($self->has_data);

  croak "ERROR [$this_function]: $self->peaks not available\n"
    unless ($self->has_peaks);

  open($tfh, ">", $tfn) or croak $!;
  foreach my $chr (keys (%{$self->peaks})){
    if (defined $log){print LOG "filtering candidate peaks in $chr\n";}

    foreach $peak ( @{$self->peaks->{$chr}}){
      my $have_pleft = 0; # we have found a proper left end
      my $have_pright = 0; # we have found a proper right end
      $strand = $$peak{strand};
      if (defined $log){print LOG ">>processing peak $$peak{start}-$$peak{end} [$strand] ";}
      $max = max @{$self->data->{$chr}{$strand}}[$$peak{start}..$$peak{end}]; # max peak elevation
      if (defined $log){print LOG "max is $max";}
      if ($max < $self->mincov){print LOG " ..SKIPPING\n";next;}
      $idx = first { @{$self->data->{$chr}{$strand}}[$_] eq $max } $$peak{start}..$$peak{end}; # coordinate
      #print LOG Dumper(\$peak);
      if (defined $log){print LOG " at $idx ..KEEPING\n";}

      # process regions left/right of the maximum to identify peak boundaries
      # validity check first
      die "maximum at $idx is not within peak region"
	if ( $idx<$$peak{start} || $idx>$$peak{end} );

      # find left end
      for ($position = $idx; $position>$$peak{start}; $position--){
	$val =  ${$self->data->{$chr}{$strand}}[$position];
	if (defined $log){print LOG "** VAL_L at pos $position $val - \$have_pleft=$have_pleft ";}
	if ( $val < $max*$self->threshold){
	  $have_pleft = 1;
	  if (defined $log){
	    print LOG "exiting 'cause $val < ".$max*$self->threshold." - \$have_pleft=$have_pleft\n";
	  }
	  last;
	}
	if (defined $log){print LOG "\n";}
      }
      $pleft = $position;

      # find right end
      for ($position = $idx; $position<=$$peak{end}; $position++){
	$val =  ${$self->data->{$chr}{$strand}}[$position];
	if (defined $log){print LOG "** VAL_R at pos $position $val - \$have_pright=$have_pright ";}
	if ( $val < $max*$self->threshold){
	  $have_pright = 1;
	  if (defined $log){
	    print LOG "exiting 'cause $val < ".$max*$self->threshold."- \$have_pright=$have_pright\n";



( run in 1.952 second using v1.01-cache-2.11-cpan-524268b4103 )