Bio-ViennaNGS

 view release on metacpan or  search on metacpan

lib/Bio/ViennaNGS/SpliceJunc.pm  view on Meta::CPAN

# -*-CPerl-*-
# Last changed Time-stamp: <2017-06-10 19:08:57 michl>

package Bio::ViennaNGS::SpliceJunc;

use Bio::ViennaNGS;
use Exporter;
use strict;
use warnings;
use Data::Dumper;
use Bio::ViennaNGS::Fasta;
use IPC::Cmd qw(can_run run);
use File::Basename;
use Path::Class;
use Carp;
use version; our $VERSION = version->declare("$Bio::ViennaNGS::VERSION");

our @ISA = qw(Exporter);
our @EXPORT = ();
our @EXPORT_OK = qw(bed6_ss_from_bed12 bed6_ss_from_rnaseq
		 bed6_ss_to_bed12 intersect_sj ss_isCanonical);

# bed6_ss_from_bed12( $bed12,$dest,$window,$can,$fastaO)
#
# Extracts splice junctions from BED12 annotation.
#
# Writes a BED6 file for each transcript found in the BED12, listing
# all splice sites of this transcript, optionally flanking it with a
# window of +/-$window nt.
sub bed6_ss_from_bed12{
  my ($bed12,$dest,$window,$can,$fastaobjR) = @_;
  my ($i,$tr_name,$pos5,$pos3);
  my ($splicesites,$c,$totalsj,$cansj) = (0)x4;
  my @bedline = ();
  my $this_function = (caller(0))[3];

  croak "ERROR [$this_function] $bed12 does not exists\n"
    unless (-e $bed12);
  croak "ERROR [$this_function] $dest does not exist"
    unless (-d $dest);

  open(BED12IN, "< $bed12") or croak $!;
  while(<BED12IN>){
    chomp;
    my ($chr,$chromStart,$chromEnd,$name,$score,$strand,$thickStart,
	$thickEnd,$itemRgb,$blockCount,$blockSizes,$blockStarts) = split("\t");
    my @blockSize  = split(/,/,$blockSizes);
    my @blockStart = split(/,/,$blockStarts);
    unless (scalar @blockSize == scalar @blockStart){
      croak "ERROR: unequal element count in blockStarts and blockSizes\n";
    }
    my $fn = sprintf("%s_%d-%d_%s.annotatedSS.bed6",$chr,$chromStart,$chromEnd,$name);
    my $bed6_fn = file($dest,$fn);
    my $tr_count = 1;

    if ($blockCount >1){ # only transcripts with 2 or more exons (!!!)

      open(BED6OUT, "> $bed6_fn") or croak "cannot open BED6OUT $!";

      for ($i=0;$i<$blockCount-1;$i++){
	$totalsj++;
	$pos5 = $chromStart+$blockStart[$i]+$blockSize[$i];
	$pos3 = $chromStart+$blockStart[$i+1];
	if($can){
	  $c = ss_isCanonical($chr,$pos5,$pos3,$fastaobjR);
	  $cansj++;
	}
	$tr_name = sprintf("%s.%02d",$name,$tr_count++);
	@bedline = join("\t",$chr,eval($pos5-$window),
			eval($pos5+$window),$tr_name,$c,$strand);
	print BED6OUT "@bedline\n";
	@bedline = join("\t",$chr,eval($pos3-$window),
			eval($pos3+$window),$tr_name,$c,$strand);
	print BED6OUT "@bedline\n";
      } # end for

      close(BED6OUT);
    } # end if
  } # end while
  close(BED12IN);
}

# bed6_ss_from_rnaseq ($bed_in,$dest,$window,$mincov,$can,$fastaO)
#
# Extracts splice junctions from mapped RNA-seq data
#
# Writes a BED6 file for each splice junction present in the input,
# optionally flanking it with a window of +/-$window nt. Only splice
# junctions supported by at least $mcov reads are considered.
sub bed6_ss_from_rnaseq{
  my ($bed_in,$dest,$window,$mcov,$can,$fastaobjR) = @_;
  my ($reads,$proper,$passed,$pos5,$pos3);
  my $c = 0;
  my @bedline = ();
  my $this_function = (caller(0))[3];

  croak "ERROR [$this_function] $bed_in does not exist\n"
    unless (-e $bed_in);
  croak "ERROR [$this_function] $dest does not exist"
    unless (-d $dest);

  open(INBED, "< $bed_in") or croak $!;
  while(<INBED>){
    chomp;
    my ($chr, $start, $end, $info, $score, $strand) = split("\t");
    $end = $end-2; # required for segemehl's BED6 files

    if ($info =~ /^splits\:(\d+)\:(\d+)\:(\d+):(\w):(\w)/){
      $reads = $1;
      $proper = $4;
      $passed = $5;
      next unless ($proper eq 'N');
      next unless ($passed =~ /[PFM]$/);
      next if($reads < $mcov);
    }
    else {
      croak "ERROR [$this_function] unsupported INFO field in input BED:\n$info\n";
    }
    $pos5 = $start;
    $pos3 = $end;
    if($can){$c = ss_isCanonical($chr,$pos5,$pos3,$fastaobjR);}
    my $fn = sprintf("%s_%d-%d.mappedSS.bed6",$chr,$start,$end);
    my $bed6_fn = file($dest,$fn);
    open(BED6OUT, "> $bed6_fn");
    @bedline = join("\t",$chr,eval($start-$window),
		    eval($start+$window),$info,$c,$strand);
    print BED6OUT "@bedline\n";
    @bedline = join("\t",$chr,eval($end-$window),
		    eval($end+$window),$info,$c,$strand);
    print BED6OUT "@bedline\n";
    close(BED6OUT);
  }
  close(INBED);
}

# bed6_ss_to_bed12 ($bed_in,$dest,$window,$mcov)
#
# Produce BED12 from BED6 file holdig splice junctions from mapped
# RNA-seq data
sub bed6_ss_to_bed12{
  my ($bed_in,$dest,$window,$mcov,$circ) = @_;
  my ($reads,$proper,$passed,$pos5,$pos3,$basename,$fn,$bed12_fn);
  my @result = ();
  my @bedline = ();
  my $this_function = (caller(0))[3];

  croak "ERROR [$this_function] $bed_in does not exist\n"
    unless (-e $bed_in);
  croak "ERROR [$this_function] $dest does not exist"
    unless (-d $dest);

  $basename = fileparse($bed_in,qr/\.[^.]*/);
  $fn = $basename.".bed12";
  $bed12_fn = file($dest,$fn);
  open(BED12OUT, "> $bed12_fn");

  open(INBED, "< $bed_in") or croak $!;
  while(<INBED>){
    chomp;
    my ($chr, $start, $end, $info, $score, $strand) = split("\t");

    if ($info =~ /^splits\:(\d+)\:(\d+)\:(\d+):(\w):(\w)/){
      $reads = $1;
      $proper = $4;
      $passed = $5;
      if ($circ == 1){ # skip 'N' (normal), 'L' (left) and 'R' (right)
	next unless ($proper =~ /[C]$/);
      }
      else { # skip 'L', 'R' and 'C'
	next unless ($proper =~ /[N]$/);
      }
      next unless ($passed =~ /[PM]$/); # ignore 'F'
      next if($reads < $mcov);
    }
    else {
      croak "ERROR [$this_function] unsupported INFO field in input BED:\n$info\n";
    }
    $pos5 = $start;
    $pos3 = $end;

    @bedline = join("\t",$chr,eval($start-$window),
		    eval($end+$window),$info,$score,$strand,$start,$end,
		    "0","2","1,1","0,".eval($end-$start-1));
    print BED12OUT "@bedline\n";
  }
  close(INBED);
  close(BED12OUT);

  push (@result, $bed12_fn);
  return @result;
}

# intersect_sj($p_annot,$p_mapped,$dest,$prefix,$window,$mil)
#
# Intersect splice junctions determined by RNA-seq with annotated
# splice junctions. Determine novel and existing splice junctions.
#
# Writes BED6 files for existing and novel splice junctions to $dest
# and reurns an array with the absolute path to the two resulting BED
# files
sub intersect_sj{
  my ($p_annot,$p_mapped,$dest,$prefix,$window,$mil) = @_;
  my ($dha,$dhm);
  my $processed = 0;
  my @junctions = ();
  my @transcript_beds = ();
  my %asj = (); # annotated splice junctions hash
  my $this_function = (caller(0))[3];

  my $bedtools = can_run('bedtools') or
    croak "ERROR [$this_function] bedtools not found";
  my $sortBed  = can_run('sortBed') or
    croak "ERROR [$this_function] sortBed not found";

  croak "ERROR [$this_function] $p_annot does not exist\n"
    unless (-d $p_annot);
  croak "ERROR [$this_function] $p_mapped does not exist\n"
    unless (-d $p_mapped);
  croak "ERROR [$this_function] $dest does not exist\n"
    unless (-d $dest);

  # get a list of all files in $p_annot
  opendir($dha, $p_annot) or croak "Cannot opendir $p_annot: $!";
  while(readdir $dha) { push @transcript_beds, $_; }
  closedir($dha);

  # get a list of all splice junctions seen in RNA-seq data
  opendir($dhm, $p_mapped) or croak "Cannot opendir $p_mapped: $!";
  @junctions = grep { /^(chr\d+)\_(\d+)-(\d+)\.mappedSS\.bed6/ } readdir($dhm);

  # process splice junctions seen in RNA-seq data
  foreach my $file (@junctions){
    $processed++;
    croak "Unexpected file name pattern\n" unless ($file =~ /(chr\d+)\_(\d+)-(\d+)/);
    my $sc = $1;
    my $s5 = $2;
    my $s3 = $3;
    my $pattern = $sc."_".$s5."-".$s3;

    my @annotated_beds = grep { /^(chr\d+)\_(\d+)-(\d+)/ && $2<=$s5 && $3>=$s3 && $sc eq $1} @transcript_beds;
    #print"\t intersecting against ".(eval $#annotated_beds+1)." transcripts: @annotated_beds \n";

    # intersect currently opened SJ against all transcripts in @annotated_beds
    foreach my $i (@annotated_beds){
      my $a = file($p_mapped,$file);
      my $b = file($p_annot,$i);
      my $intersect_cmd = "$bedtools intersect -a $a -b $b -c -nobuf";
      open(INTERSECT, $intersect_cmd."|");
      while(<INTERSECT>){
	chomp;
	if ($_ =~ /1$/) { $asj{$pattern} = 1;}
      }
      close(INTERSECT);
    }
    if ( $processed % 1000 == 0){
      print STDERR "processed $processed splice junctions\n";
    }
  }
  closedir($dhm);

  # go through the mapped splice junctions files once again and
  # separate novel from existing splice junctions
  if (length($prefix)>0){$prefix .= ".";}
  my $outname_exist   = file($dest, $prefix."exist.SS.bed");
  my $outname_exist_u = file($dest, $prefix."exist.SS.u.bed");
  my $outname_novel   = file($dest, $prefix."novel.SS.bed");
  my $outname_novel_u = file($dest, $prefix."novel.SS.u.bed");
  open (EXISTOUT, "> $outname_exist_u");

lib/Bio/ViennaNGS/SpliceJunc.pm  view on Meta::CPAN

       } # end while
       close(SJ);
     } # end if
    else{ carp "Error with parsing BED6 junction file names __FILE__ __LINE__\n";}
  }
  close(EXISTOUT);
  close(NOVELOUT);

  # sort the resulting bed files
  my $cmd = "$bedtools sort -i  $outname_exist_u > $outname_exist";
  my ( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
    run( command => $cmd, verbose => 0 );
  if( !$success ) {
    print STDERR "ERROR [$this_function] Call to $bedtools  unsuccessful\n";
    print STDERR "ERROR: this is what the command printed:\n";
    print join "", @$full_buf;
    croak $!;
  }
  unlink($outname_exist_u);

  $cmd = "$bedtools sort -i  $outname_novel_u > $outname_novel";
  ( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
    run( command => $cmd, verbose => 0 );
  if( !$success ) {
    print STDERR "ERROR [$this_function] Call to $bedtools  unsuccessful\n";
    print STDERR "ERROR: this is what the command printed:\n";
    print join "", @$full_buf;
    croak $!;
  }
  unlink($outname_novel_u);

  printf STDERR "processed $processed splice junctions\n";
  my @result = ();
  push(@result, $outname_exist);
  push(@result, $outname_novel);
  return @result;
}

# ss_isCanonical ( $chr,$p5,$p3,$fastaO )

# Checks whether a given splice junction is canonical, ie. whether the
# first and last two nucleotides of the enclosed intron correspond to
# a certain nucleotide motif. $chr is the chromosome name, $p5 and $p3
# the 5' and 3' ends of the splice junction and $fastaobjR is a
# Bio::PrimarySeq::Fasta object holding the underlying reference
# genome.
#
# Th most common canonical splice junction motif is GT-AG (shown
# below). Other canonical motifs are GC->AG and AT->AC. 
#
#   ------------------>
# 5'===]GT..........AG[====3'
#
#   <-----------------
# 3'===]GA..........TG[====5'
#
sub ss_isCanonical{
  my ($chr,$p5,$p3,$fo) = @_;
  my ($seqL,$seqR,$pattern);
  my $ss_motif_length = 2;
  my $this_function = (caller(0))[3];
  my $c = -1;

  $seqL = $fo->stranded_subsequence($chr,$p5+1,$p5+$ss_motif_length,"+");
  $seqR = $fo->stranded_subsequence($chr,$p3-$ss_motif_length+1,$p3,"+");

  $pattern = sprintf("%s|%s",$seqL,$seqR);
  #print STDERR "[$this_function] p5->p3 ($p5 -- $p3) $seqL -- $seqR\n";

  if ($pattern =~ /^GT|AG$/) { $c = 1000;}
  elsif ($pattern =~ /^CT|AC$/) { $c = 1000; }
  elsif ($pattern =~ /^GC|AG$/) { $c = 1000; }
  elsif ($pattern =~ /^CT|GC$/) { $c = 1000; }
  elsif ($pattern =~ /^AT|AC$/) { $c = 1000; }
  elsif ($pattern =~ /^GT|AT$/) { $c = 1000; }
  else { $c = 0;}

  return $c;
}

1;
__END__

=head1 NAME

Bio::ViennaNGS::SpliceJunc - Perl extension for alternative splicing
analysis

=head1 SYNOPSIS

  use Bio::ViennaNGS::SpliceJunc;
  use Bio::ViennaNGS::Fasta;

  # get a Bio::ViennaNGS::Fasta object
  my $fastaO = Bio::ViennaNGS::Fasta->new($fasta_in);

  # Extract annotated splice sites from BED12
  bed6_ss_from_bed12($bed12_in,$dest_annot,$window,$want_canonical,$fastaO);

  # Extract mapped splice junctions from RNA-seq data
  bed6_ss_from_rnaseq($bed_in,$dest_ss,$window,$mincov,$want_canonical,$fastaO);

  # Check for each splice junction seen in RNA-seq if it overlaps with
  # any annotated splice junction
  @res = intersect_sj($dest_annot,$dest_ss,$dest,$prefix,$window,$mil);

  # Convert splice junctions seen in RNA-seq data to BED12
  @res = bed6_ss_to_bed12($s_in,$outdir,$window,$mincov,$want_circular);

  # Check whether a splice junction is canonical
  $c = ss_isCanonical($chr,$pos5,$pos3,$fastaO);

=head1 DESCRIPTION

L<Bio::ViennaNGS::SpliceJunc> is a Perl module for alternative
splicing (AS) analysis. It provides routines for identification,
characterization and visualization of novel and existing (annotated)
splice junctions from RNA-seq data.

Identification of novel splice junctions is based on intersecting
potentially novel splice junctions from RNA-seq data with annotated



( run in 0.523 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )