Bio-ViennaNGS
view release on metacpan or search on metacpan
lib/Bio/ViennaNGS/SpliceJunc.pm view on Meta::CPAN
# -*-CPerl-*-
# Last changed Time-stamp: <2017-06-10 19:08:57 michl>
package Bio::ViennaNGS::SpliceJunc;
use Bio::ViennaNGS;
use Exporter;
use strict;
use warnings;
use Data::Dumper;
use Bio::ViennaNGS::Fasta;
use IPC::Cmd qw(can_run run);
use File::Basename;
use Path::Class;
use Carp;
use version; our $VERSION = version->declare("$Bio::ViennaNGS::VERSION");
our @ISA = qw(Exporter);
our @EXPORT = ();
our @EXPORT_OK = qw(bed6_ss_from_bed12 bed6_ss_from_rnaseq
bed6_ss_to_bed12 intersect_sj ss_isCanonical);
# bed6_ss_from_bed12( $bed12,$dest,$window,$can,$fastaO)
#
# Extracts splice junctions from BED12 annotation.
#
# Writes a BED6 file for each transcript found in the BED12, listing
# all splice sites of this transcript, optionally flanking it with a
# window of +/-$window nt.
sub bed6_ss_from_bed12{
my ($bed12,$dest,$window,$can,$fastaobjR) = @_;
my ($i,$tr_name,$pos5,$pos3);
my ($splicesites,$c,$totalsj,$cansj) = (0)x4;
my @bedline = ();
my $this_function = (caller(0))[3];
croak "ERROR [$this_function] $bed12 does not exists\n"
unless (-e $bed12);
croak "ERROR [$this_function] $dest does not exist"
unless (-d $dest);
open(BED12IN, "< $bed12") or croak $!;
while(<BED12IN>){
chomp;
my ($chr,$chromStart,$chromEnd,$name,$score,$strand,$thickStart,
$thickEnd,$itemRgb,$blockCount,$blockSizes,$blockStarts) = split("\t");
my @blockSize = split(/,/,$blockSizes);
my @blockStart = split(/,/,$blockStarts);
unless (scalar @blockSize == scalar @blockStart){
croak "ERROR: unequal element count in blockStarts and blockSizes\n";
}
my $fn = sprintf("%s_%d-%d_%s.annotatedSS.bed6",$chr,$chromStart,$chromEnd,$name);
my $bed6_fn = file($dest,$fn);
my $tr_count = 1;
if ($blockCount >1){ # only transcripts with 2 or more exons (!!!)
open(BED6OUT, "> $bed6_fn") or croak "cannot open BED6OUT $!";
for ($i=0;$i<$blockCount-1;$i++){
$totalsj++;
$pos5 = $chromStart+$blockStart[$i]+$blockSize[$i];
$pos3 = $chromStart+$blockStart[$i+1];
if($can){
$c = ss_isCanonical($chr,$pos5,$pos3,$fastaobjR);
$cansj++;
}
$tr_name = sprintf("%s.%02d",$name,$tr_count++);
@bedline = join("\t",$chr,eval($pos5-$window),
eval($pos5+$window),$tr_name,$c,$strand);
print BED6OUT "@bedline\n";
@bedline = join("\t",$chr,eval($pos3-$window),
eval($pos3+$window),$tr_name,$c,$strand);
print BED6OUT "@bedline\n";
} # end for
close(BED6OUT);
} # end if
} # end while
close(BED12IN);
}
# bed6_ss_from_rnaseq ($bed_in,$dest,$window,$mincov,$can,$fastaO)
#
# Extracts splice junctions from mapped RNA-seq data
#
# Writes a BED6 file for each splice junction present in the input,
# optionally flanking it with a window of +/-$window nt. Only splice
# junctions supported by at least $mcov reads are considered.
sub bed6_ss_from_rnaseq{
my ($bed_in,$dest,$window,$mcov,$can,$fastaobjR) = @_;
my ($reads,$proper,$passed,$pos5,$pos3);
my $c = 0;
my @bedline = ();
my $this_function = (caller(0))[3];
croak "ERROR [$this_function] $bed_in does not exist\n"
unless (-e $bed_in);
croak "ERROR [$this_function] $dest does not exist"
unless (-d $dest);
open(INBED, "< $bed_in") or croak $!;
while(<INBED>){
chomp;
my ($chr, $start, $end, $info, $score, $strand) = split("\t");
$end = $end-2; # required for segemehl's BED6 files
if ($info =~ /^splits\:(\d+)\:(\d+)\:(\d+):(\w):(\w)/){
$reads = $1;
$proper = $4;
$passed = $5;
next unless ($proper eq 'N');
next unless ($passed =~ /[PFM]$/);
next if($reads < $mcov);
}
else {
croak "ERROR [$this_function] unsupported INFO field in input BED:\n$info\n";
}
$pos5 = $start;
$pos3 = $end;
if($can){$c = ss_isCanonical($chr,$pos5,$pos3,$fastaobjR);}
my $fn = sprintf("%s_%d-%d.mappedSS.bed6",$chr,$start,$end);
my $bed6_fn = file($dest,$fn);
open(BED6OUT, "> $bed6_fn");
@bedline = join("\t",$chr,eval($start-$window),
eval($start+$window),$info,$c,$strand);
print BED6OUT "@bedline\n";
@bedline = join("\t",$chr,eval($end-$window),
eval($end+$window),$info,$c,$strand);
print BED6OUT "@bedline\n";
close(BED6OUT);
}
close(INBED);
}
# bed6_ss_to_bed12 ($bed_in,$dest,$window,$mcov)
#
# Produce BED12 from BED6 file holdig splice junctions from mapped
# RNA-seq data
sub bed6_ss_to_bed12{
my ($bed_in,$dest,$window,$mcov,$circ) = @_;
my ($reads,$proper,$passed,$pos5,$pos3,$basename,$fn,$bed12_fn);
my @result = ();
my @bedline = ();
my $this_function = (caller(0))[3];
croak "ERROR [$this_function] $bed_in does not exist\n"
unless (-e $bed_in);
croak "ERROR [$this_function] $dest does not exist"
unless (-d $dest);
$basename = fileparse($bed_in,qr/\.[^.]*/);
$fn = $basename.".bed12";
$bed12_fn = file($dest,$fn);
open(BED12OUT, "> $bed12_fn");
open(INBED, "< $bed_in") or croak $!;
while(<INBED>){
chomp;
my ($chr, $start, $end, $info, $score, $strand) = split("\t");
if ($info =~ /^splits\:(\d+)\:(\d+)\:(\d+):(\w):(\w)/){
$reads = $1;
$proper = $4;
$passed = $5;
if ($circ == 1){ # skip 'N' (normal), 'L' (left) and 'R' (right)
next unless ($proper =~ /[C]$/);
}
else { # skip 'L', 'R' and 'C'
next unless ($proper =~ /[N]$/);
}
next unless ($passed =~ /[PM]$/); # ignore 'F'
next if($reads < $mcov);
}
else {
croak "ERROR [$this_function] unsupported INFO field in input BED:\n$info\n";
}
$pos5 = $start;
$pos3 = $end;
@bedline = join("\t",$chr,eval($start-$window),
eval($end+$window),$info,$score,$strand,$start,$end,
"0","2","1,1","0,".eval($end-$start-1));
print BED12OUT "@bedline\n";
}
close(INBED);
close(BED12OUT);
push (@result, $bed12_fn);
return @result;
}
# intersect_sj($p_annot,$p_mapped,$dest,$prefix,$window,$mil)
#
# Intersect splice junctions determined by RNA-seq with annotated
# splice junctions. Determine novel and existing splice junctions.
#
# Writes BED6 files for existing and novel splice junctions to $dest
# and reurns an array with the absolute path to the two resulting BED
# files
sub intersect_sj{
my ($p_annot,$p_mapped,$dest,$prefix,$window,$mil) = @_;
my ($dha,$dhm);
my $processed = 0;
my @junctions = ();
my @transcript_beds = ();
my %asj = (); # annotated splice junctions hash
my $this_function = (caller(0))[3];
my $bedtools = can_run('bedtools') or
croak "ERROR [$this_function] bedtools not found";
my $sortBed = can_run('sortBed') or
croak "ERROR [$this_function] sortBed not found";
croak "ERROR [$this_function] $p_annot does not exist\n"
unless (-d $p_annot);
croak "ERROR [$this_function] $p_mapped does not exist\n"
unless (-d $p_mapped);
croak "ERROR [$this_function] $dest does not exist\n"
unless (-d $dest);
# get a list of all files in $p_annot
opendir($dha, $p_annot) or croak "Cannot opendir $p_annot: $!";
while(readdir $dha) { push @transcript_beds, $_; }
closedir($dha);
# get a list of all splice junctions seen in RNA-seq data
opendir($dhm, $p_mapped) or croak "Cannot opendir $p_mapped: $!";
@junctions = grep { /^(chr\d+)\_(\d+)-(\d+)\.mappedSS\.bed6/ } readdir($dhm);
# process splice junctions seen in RNA-seq data
foreach my $file (@junctions){
$processed++;
croak "Unexpected file name pattern\n" unless ($file =~ /(chr\d+)\_(\d+)-(\d+)/);
my $sc = $1;
my $s5 = $2;
my $s3 = $3;
my $pattern = $sc."_".$s5."-".$s3;
my @annotated_beds = grep { /^(chr\d+)\_(\d+)-(\d+)/ && $2<=$s5 && $3>=$s3 && $sc eq $1} @transcript_beds;
#print"\t intersecting against ".(eval $#annotated_beds+1)." transcripts: @annotated_beds \n";
# intersect currently opened SJ against all transcripts in @annotated_beds
foreach my $i (@annotated_beds){
my $a = file($p_mapped,$file);
my $b = file($p_annot,$i);
my $intersect_cmd = "$bedtools intersect -a $a -b $b -c -nobuf";
open(INTERSECT, $intersect_cmd."|");
while(<INTERSECT>){
chomp;
if ($_ =~ /1$/) { $asj{$pattern} = 1;}
}
close(INTERSECT);
}
if ( $processed % 1000 == 0){
print STDERR "processed $processed splice junctions\n";
}
}
closedir($dhm);
# go through the mapped splice junctions files once again and
# separate novel from existing splice junctions
if (length($prefix)>0){$prefix .= ".";}
my $outname_exist = file($dest, $prefix."exist.SS.bed");
my $outname_exist_u = file($dest, $prefix."exist.SS.u.bed");
my $outname_novel = file($dest, $prefix."novel.SS.bed");
my $outname_novel_u = file($dest, $prefix."novel.SS.u.bed");
open (EXISTOUT, "> $outname_exist_u");
lib/Bio/ViennaNGS/SpliceJunc.pm view on Meta::CPAN
} # end while
close(SJ);
} # end if
else{ carp "Error with parsing BED6 junction file names __FILE__ __LINE__\n";}
}
close(EXISTOUT);
close(NOVELOUT);
# sort the resulting bed files
my $cmd = "$bedtools sort -i $outname_exist_u > $outname_exist";
my ( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
run( command => $cmd, verbose => 0 );
if( !$success ) {
print STDERR "ERROR [$this_function] Call to $bedtools unsuccessful\n";
print STDERR "ERROR: this is what the command printed:\n";
print join "", @$full_buf;
croak $!;
}
unlink($outname_exist_u);
$cmd = "$bedtools sort -i $outname_novel_u > $outname_novel";
( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
run( command => $cmd, verbose => 0 );
if( !$success ) {
print STDERR "ERROR [$this_function] Call to $bedtools unsuccessful\n";
print STDERR "ERROR: this is what the command printed:\n";
print join "", @$full_buf;
croak $!;
}
unlink($outname_novel_u);
printf STDERR "processed $processed splice junctions\n";
my @result = ();
push(@result, $outname_exist);
push(@result, $outname_novel);
return @result;
}
# ss_isCanonical ( $chr,$p5,$p3,$fastaO )
# Checks whether a given splice junction is canonical, ie. whether the
# first and last two nucleotides of the enclosed intron correspond to
# a certain nucleotide motif. $chr is the chromosome name, $p5 and $p3
# the 5' and 3' ends of the splice junction and $fastaobjR is a
# Bio::PrimarySeq::Fasta object holding the underlying reference
# genome.
#
# Th most common canonical splice junction motif is GT-AG (shown
# below). Other canonical motifs are GC->AG and AT->AC.
#
# ------------------>
# 5'===]GT..........AG[====3'
#
# <-----------------
# 3'===]GA..........TG[====5'
#
sub ss_isCanonical{
my ($chr,$p5,$p3,$fo) = @_;
my ($seqL,$seqR,$pattern);
my $ss_motif_length = 2;
my $this_function = (caller(0))[3];
my $c = -1;
$seqL = $fo->stranded_subsequence($chr,$p5+1,$p5+$ss_motif_length,"+");
$seqR = $fo->stranded_subsequence($chr,$p3-$ss_motif_length+1,$p3,"+");
$pattern = sprintf("%s|%s",$seqL,$seqR);
#print STDERR "[$this_function] p5->p3 ($p5 -- $p3) $seqL -- $seqR\n";
if ($pattern =~ /^GT|AG$/) { $c = 1000;}
elsif ($pattern =~ /^CT|AC$/) { $c = 1000; }
elsif ($pattern =~ /^GC|AG$/) { $c = 1000; }
elsif ($pattern =~ /^CT|GC$/) { $c = 1000; }
elsif ($pattern =~ /^AT|AC$/) { $c = 1000; }
elsif ($pattern =~ /^GT|AT$/) { $c = 1000; }
else { $c = 0;}
return $c;
}
1;
__END__
=head1 NAME
Bio::ViennaNGS::SpliceJunc - Perl extension for alternative splicing
analysis
=head1 SYNOPSIS
use Bio::ViennaNGS::SpliceJunc;
use Bio::ViennaNGS::Fasta;
# get a Bio::ViennaNGS::Fasta object
my $fastaO = Bio::ViennaNGS::Fasta->new($fasta_in);
# Extract annotated splice sites from BED12
bed6_ss_from_bed12($bed12_in,$dest_annot,$window,$want_canonical,$fastaO);
# Extract mapped splice junctions from RNA-seq data
bed6_ss_from_rnaseq($bed_in,$dest_ss,$window,$mincov,$want_canonical,$fastaO);
# Check for each splice junction seen in RNA-seq if it overlaps with
# any annotated splice junction
@res = intersect_sj($dest_annot,$dest_ss,$dest,$prefix,$window,$mil);
# Convert splice junctions seen in RNA-seq data to BED12
@res = bed6_ss_to_bed12($s_in,$outdir,$window,$mincov,$want_circular);
# Check whether a splice junction is canonical
$c = ss_isCanonical($chr,$pos5,$pos3,$fastaO);
=head1 DESCRIPTION
L<Bio::ViennaNGS::SpliceJunc> is a Perl module for alternative
splicing (AS) analysis. It provides routines for identification,
characterization and visualization of novel and existing (annotated)
splice junctions from RNA-seq data.
Identification of novel splice junctions is based on intersecting
potentially novel splice junctions from RNA-seq data with annotated
( run in 0.523 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )