Bio-ViennaNGS

 view release on metacpan or  search on metacpan

lib/Bio/ViennaNGS/Util.pm  view on Meta::CPAN

# -*-CPerl-*-
# Last changed Time-stamp: <2017-06-10 19:12:00 michl>

package Bio::ViennaNGS::Util;

use Bio::ViennaNGS;
use Exporter;
use strict;
use warnings;
use Data::Dumper;
use File::Basename qw(fileparse basename);
use IPC::Cmd qw(can_run run);
use Path::Class qw(dir file);
use File::Path qw(make_path remove_tree);
use Math::Round;
use Carp;
use Env qw(HOME);
use version; our $VERSION = version->declare("$Bio::ViennaNGS::VERSION");

our @ISA = qw(Exporter);
our @EXPORT = ();

our @EXPORT_OK = qw ( bed_or_bam2bw sortbed bed2bigBed unique_array
		      kmer_enrichment fetch_chrom_sizes mkdircheck rmdircheck);


#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
#^^^^^^^^^^ Variables ^^^^^^^^^^^#
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#

my %unique = ();

#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
#^^^^^^^^^^^ Subroutines ^^^^^^^^^^#
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#

sub bed_or_bam2bw {
  my ($type,$infile,$chromsizes,$strand,$dest,$want_norm,$size,$scale,$log) = @_;
  my ($fn_bg_tmp,$fn_bg,$fn_bw,$fn,$bn,$path,$ext,$cmd);
  my @processed_files = ();
  my $factor = 1.;
  my $this_function = (caller(0))[3];

  croak "ERROR [$this_function] \$type is '$type', however it is expected to be either 'bam' or 'bed'\n"
    unless ($type eq "bam") || ($type eq "bed");

  my $genomeCoverageBed = can_run('genomeCoverageBed') or
    croak "ERROR [$this_function] genomeCoverageBed utility not found";
  my $bedGraphToBigWig = can_run('bedGraphToBigWig') or
    croak "ERROR [$this_function] bedGraphToBigWig utility not found";
  my $awk = can_run('awk') or
    croak "ERROR [$this_function] awk utility not found";

  if(defined $log){
    open(LOG, ">>", $log) or croak $!;
    print LOG "LOG [$this_function] \$infile: $infile\n";
    print LOG "LOG [$this_function] \$dest: $dest\n";
    print LOG "LOG [$this_function] \$chromsizes: $chromsizes\n";
  }

  croak "ERROR [$this_function] Cannot find $infile\n"
    unless (-e $infile);
  croak "ERROR [$this_function] $dest does not exist\n"
    unless (-d $dest);
  croak "ERROR [$this_function] Cannot find $chromsizes\n"
      unless (-e $chromsizes);

  if ($want_norm == 1){
    $factor = $scale/$size;
    print LOG "LOG [$this_function] normalization: $factor = ($scale/$size)\n"
      if(defined $log);
  }

  if ($type eq "bam"){$ext = ".bam";}
  else{$ext = ".bed";}
  $bn = basename($infile,$ext);
  $fn = fileparse($infile);

  $fn_bg_tmp  = file($dest,$fn.".tmp.bg");
  $fn_bg      = file($dest,$bn.".bg");
  if($strand eq "+"){
    $fn_bw  = file($dest,$bn.".pos.bw");
  }
  else {
    $fn_bw  = file($dest,$bn.".neg.bw");
  }

  $cmd = "$genomeCoverageBed -bg -scale $factor -split ";
  if ($type eq "bed"){ $cmd .= "-i $infile -g $chromsizes"; } # chrom.sizes only required for processing BED
  else { $cmd .= "-ibam $infile "; }
  $cmd .= " > $fn_bg_tmp";

  if($strand eq "-"){
    $cmd .= " && cat $fn_bg_tmp | $awk \'{ \$4 = - \$4 ; print \$0 }\' > $fn_bg";
  }
  else{
    $fn_bg = $fn_bg_tmp;
  }
  $cmd .= " && $bedGraphToBigWig $fn_bg $chromsizes $fn_bw";

  if (defined $log){ print LOG "LOG [$this_function] $cmd\n";}

  my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
    run( command => $cmd, verbose => 0 );

  if( !$success ) {
    print STDERR "ERROR [$this_function] External command call unsuccessful\n";
    print STDERR "ERROR: this is what the command printed:\n";
    print join "", @$full_buf;
    croak $!;
  }
  if (defined $log){ close(LOG); }

  unlink ($fn_bg_tmp);
  unlink ($fn_bg);
  return $fn_bw;
}

sub bed2bigBed {
  my ($infile,$chromsizes,$dest,$log) = @_;
  my ($bn,$path,$ext,$cmd,$outfile);
  my $this_function = (caller(0))[3];
  my $bed2bigBed = can_run('bedToBigBed') or
    croak "ERROR [$this_function] bedToBigBed utility not found";

  if (defined $log){
    open(LOG, ">>", $log) or croak $!;
    print LOG "LOG [$this_function] \$infile: $infile -- \$chromsizes: $chromsizes --\$dest: $dest\n";
  }

  croak "ERROR [$this_function] Cannot find $infile"
    unless (-e $infile);
  croak "ERROR [$this_function] Cannot find $chromsizes"
    unless (-e $chromsizes);
  croak "ERROR [$this_function] $dest does not exist"
    unless (-d $dest);

  # .bed6 .bed12 extensions are replaced by .bb
  ($bn,$path,$ext) = fileparse($infile, qr /\.bed[126]?/);
  $outfile = file($dest, "$bn.bb");

  $cmd = "$bed2bigBed $infile -extraIndex=name -tab $chromsizes $outfile";
  if (defined $log){ print LOG "LOG [$this_function] $cmd\n"; }
  my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
    run( command => $cmd, verbose => 0 );

  if( !$success ) {
    print STDERR "ERROR [$this_function] Call to $bed2bigBed unsuccessful\n";
    print STDERR "ERROR: this is what the command printed:\n";
    print join "", @$full_buf;
    croak $!;
  }

  if (defined $log){ close(LOG); }

  return $outfile;
}

sub sortbed {
  my ($infile,$dest,$outfile,$rm_orig,$log) = @_;
  my ($cmd,$out);
  my $this_function = (caller(0))[3];
  my $bedtools = can_run('bedtools') or
    croak "ERROR [$this_function] bedtools utility not found";

  croak "ERROR [$this_function] Cannot find $infile"
    unless (-e $infile);
  croak "ERROR [$this_function] $dest does not exist"
    unless (-d $dest);
  if (defined $log){open(LOG, ">>", $log) or croak $!;}

  $out = file($dest,$outfile);
  $cmd = "$bedtools sort -i $infile > $out";
  if (defined $log){ print LOG "LOG [$this_function] $cmd\n"; }
  my ( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
    run( command => $cmd, verbose => 0 );
  if( !$success ) {
    print STDERR "ERROR [$this_function] Call to $bedtools unsuccessful\n";
    print STDERR "ERROR: this is what the command printed:\n";
    print join "", @$full_buf;
    croak $!;
  }

  if($rm_orig){
    unlink($infile) or
      carp "WARN [$this_function] Could not unlink $infile";
    if (defined $log){
      print LOG "[$this_function] removed $infile $!\n";
    }
  }

  if (defined $log){ close(LOG); }
}

sub unique_array{

    my $arrayref = shift;
    my @array = @{$arrayref};

    foreach my $item (@array)
    {
	$unique{$item} ++;
    }
    my @arrayuid = sort {$a cmp $b} keys %unique;

    return(\@arrayuid);
}

sub kmer_enrichment{

    my @seqs =  @{$_[0]};
    my $klen     = $_[1];
#    my @seq = split( //, $read_tmp );
    my $kstring ='';
#return variables
    my %km;
    foreach my $sequences (@seqs){
#      print STDERR $sequences,"\n";
      my @seq = split( //, $sequences );
      for ( my $seq_pos = 0; $seq_pos <= $#seq-$klen ; $seq_pos++ ) {
	for (my $i=$seq_pos;$i<=$seq_pos+($klen-1);$i++){
	  $kstring .= $seq[$i]; 
	}
	$km{$kstring}++;
	$kstring = "";
      }
    }
    return( \%km );
}

sub fetch_chrom_sizes{
  my $species = shift;
  my %sizes;
  my @chromsize;
  my $this_function = (caller(0))[3];

  my $test_fetchChromSizes = can_run('fetchChromSizes') or
    croak "ERROR [$this_function] fetchChromSizes utility not found";

  my $cmd = "fetchChromSizes $species";
  my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) = 
    run(command => $cmd, verbose => 0);
  if ($success){
    @chromsize = @{$stdout_buf};
  }
  else{
    carp "WARN [$this_function] Using UCSCs fetchChromSizes failed, trying alternative mysql fetch!\n";
    my $test_fetchChromSizes = can_run('mysql') or
      croak "ERROR [$this_function] mysql utility not found";
    $cmd = "mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \"select chrom, size from $species.chromInfo\"";  ### Alternative to UCSC fetchChromSizes, has mysql dependency
    my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
      run(command => $cmd, verbose => 0);
    if ($success){
      @chromsize = @{$stdout_buf};
    }
    else{
      carp "ERROR [$this_function] External command call unsuccessful\n";
      carp "ERROR [$this_function] this is what the command printed:\n";
      print join "", @$full_buf;
      confess "Fetching of chromosome sizes failed, please either download fetchChromSizes from the UCSC script collection, or install mysql!\n";
    }
  }

  foreach (@chromsize){
    chomp($_);
    foreach (split(/\n/,$_)){
      my ($chr,$size)=split (/\t/,$_);
      $sizes{$chr}=$size;
    }
  }

  return(\%sizes);
}

sub mkdircheck {
  my @dirstocreate=();
  my $this_function = (caller(0))[3];
  while(@_){
    push @dirstocreate, shift(@_);
  }
  foreach (@dirstocreate){
    my @total = split(/[\/\\]/,$_);
    if ($total[0] eq '~'){
      $total[0]=$HOME;
      carp "WARNING [$this_function] replacing \'~\' in path with \'$HOME\'";
    }
    my $dir;
    while(@total){
      $dir = dir(shift(@total)) unless (defined $dir);
      $dir = dir($dir,shift(@total));
    }
    my $check = $dir->stringify();
    return if (-d $check);
    make_path($dir,{ verbose => 0 }) or
      croak "ERROR [$this_function] Cannot create directory $check";
  }
}

sub rmdircheck {
  my @dirstorm=();
  my $this_function = (caller(0))[3];
  while(@_){
    push @dirstorm, shift(@_);
  }
  foreach (@dirstorm){
    my @total = split(/[\/\\]/,$_);
    my $dir;
    while(@total){
      $dir = dir(shift(@total)) unless (defined $dir);
      $dir = dir($dir,shift(@total));
    }
    my $check = $dir->stringify();
    return if (!-d $check);
    remove_tree($dir,{ verbose => 0 }) or
      croak "ERROR [$this_function] Cannot remove directory $check";
  }
}

1;
__END__


=head1 NAME

Bio::ViennaNGS::Util - Utility routines for Next-Generation Sequencing data
analysis

=head1 SYNOPSIS

  use Bio::ViennaNGS::Util;

  # make bigWig from BED or BAM
  $type = "bam";
  $strand = "+";
  $bwfile = bed_or_bam2bw($type,$infile,$cs_in,$strand,$destdir,$wantnorm,$size_p,$scale,$logfile);

  # make bigBed from BED
  my $bb = bed2bigBed($bed_in,$cs_in,$destdir,$logfile);

  # sort a BED file 
  sortbed($bed_in,$destdir,$bed_out,$rm_orig,$logfile)

=head1 DESCRIPTION

Bio::ViennaNGS::Util is a collection of utility subroutines for
building efficient Next-Generation Sequencing (NGS) data analysis
pipelines.

=head2 ROUTINES

=over

=item bed_or_bam2bw($type,$infile,$chromsizes,$strand,$dest,$want_norm,$size,$scale,$log)

Creates stranded, normalized BigWig coverage profiles from
strand-specific BAM or BED files (provided via C<$infile>). The
routine expects a file type 'bam' or 'bed' via the C<$type>
argument. C<$chromsizes> is the chromosome.sizes files, C<$strand> is
either "+" or "-" and C<$dest> contains the output path for
results. For normlization of bigWig profiles, additional attributes
are required: C<$want_norm> triggers normalization with values 0 or



( run in 1.370 second using v1.01-cache-2.11-cpan-97f6503c9c8 )