Bio-ViennaNGS
view release on metacpan or search on metacpan
lib/Bio/ViennaNGS/UCSC.pm view on Meta::CPAN
# -*-CPerl-*-
# Last changed Time-stamp: <2017-06-26 17:28:26 mtw>
package Bio::ViennaNGS::UCSC;
use Bio::ViennaNGS;
use Exporter;
use strict;
use warnings;
use Template;
use Cwd;
use File::Basename;
use File::Find;
use IPC::Cmd qw(can_run run);
use File::Share ':all';
use Path::Class;
use Data::Dumper;
use Carp;
use Bio::ViennaNGS::Util qw(fetch_chrom_sizes bed2bigBed);
use version; our $VERSION = version->declare("$Bio::ViennaNGS::VERSION");
our @ISA = qw(Exporter);
our @EXPORT_OK = qw( make_assembly_hub make_track_hub retrieve_color
make_track make_multi_bigwig_container_track
make_bigwig_container_track retrieve_bigwig_tracks
retrieve_bigbed_url_tracks parse_fasta_header valid_ncbi_accession
modify_fasta_header make_group retrieve_chromosome_size
write_chromosome_size_file convert_tracks);
our @EXPORT = ();
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
#^^^^^^^^^^^ Subroutines ^^^^^^^^^^#
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
sub make_assembly_hub{
my ($fasta_path, $filesdir, $basedir, $baseURL, $big_wig_ids, $log) = @_;
my ($basename,$dir,$ext);
my $this_function = (caller(0))[3];
#check arguments
croak ("ERROR [$this_function] \$fasta_path does not exist\n")
unless (-e $fasta_path);
croak ("ERROR [$this_function] \$basedir does not exist\n")
unless (-d $basedir);
croak ("ERROR [$this_function]: no URL (network location for upload to UCSC) provided")
unless(defined $baseURL);
unless ($baseURL =~ /\/$/) { $baseURL .= "/"; }
my $tmp_path = dist_file('Bio-ViennaNGS', "hub.txt" );
($basename,$dir,$ext) = fileparse($tmp_path,qr/\..*/);
my $template_path = dir($dir,"template-AssemblyHub");
croak ("ERROR [$this_function] template directory not found\n")
unless (-d $template_path);
my $faToTwoBit = can_run('faToTwoBit') or
croak ("ERROR [$this_function] faToTwoBit is not installed!");
# bedfiles path
my $parsedHeaderRef = parse_fasta_header($fasta_path,$this_function);
my @parsedHeader = @$parsedHeaderRef;
my $unchecked_accession = $parsedHeader[0];
my $scientificName = $parsedHeader[1];
my $accession = valid_ncbi_accession($unchecked_accession);
# create assembly hub directory structure
my $assembly_hub_name = "assemblyHub";
my $assembly_hub_directory = dir($basedir, $assembly_hub_name);
my $genome_assembly_name = $accession;
my $genome_assembly_directory = dir($assembly_hub_directory,$genome_assembly_name);
mkdir $assembly_hub_directory;
mkdir $genome_assembly_directory;
if (defined $log){
open(LOG, ">>", $log) or croak "$!";
print LOG "LOG Base directory: $assembly_hub_directory\n";
print LOG "LOG Assembly Hub directory: $genome_assembly_directory\n";
close(LOG);
}
#2-bit fasta file conversion
my $fa_modified = file($assembly_hub_directory, $accession.".fa");
modify_fasta_header($fasta_path,$fa_modified,$accession);
my $twoBit = file($genome_assembly_directory, $accession.".2bit");
my $fastaToTwobit_cmd = "$faToTwoBit $fa_modified $twoBit";
if (defined $log){
open(LOG, ">>", $log) or croak "$!";
print LOG "LOG [$this_function] $fastaToTwobit_cmd\n";
close(LOG);
}
my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
run( command => $fastaToTwobit_cmd, verbose => 0 );
if( !$success ) {
print STDERR "ERROR [$this_function] External command call unsuccessful\n";
print STDERR "ERROR: this is what the command printed:\n";
print join "", @$full_buf;
croak $!;
}
lib/Bio/ViennaNGS/UCSC.pm view on Meta::CPAN
bioProjectInformationDescription => "bioProjectInformationDescription",
sequenceAnnotationLink => "sequenceAnnotationLink"
};
$template->process($description_html_file,$description_html_vars,$description_html_path) or
croak "Template process failed: ", $template->error(), "\n";
my $groups = make_group("annotation", "Annotation", "1", "0");
#construct group.txt
my $group_txt_path = file($genome_assembly_directory, "groups.txt")->stringify;
my $group_txt_file = 'groups.txt';
my $group_txt_vars =
{
groups => "$groups",
};
$template->process($group_txt_file,$group_txt_vars,$group_txt_path) or
croak "Template process failed: ", $template->error(), "\n";
#construct big bed
my $tracksList;
#Bigbeds are only created from infolder
unless($filesdir =~ /-/){
my $chromosome_size = retrieve_chromosome_size($fasta_path);
my $chromosome_size_filepath = file($genome_assembly_directory,"$accession.chrom.sizes");
write_chromosome_size_file($chromosome_size_filepath,$accession,$chromosome_size);
convert_tracks($filesdir, $genome_assembly_directory, $chromosome_size_filepath, $log);
my @trackfiles = retrieve_tracks($genome_assembly_directory, $baseURL, $assembly_hub_name, $accession);
foreach my $track (@trackfiles){
my $trackString = make_track(@$track);
$tracksList .= $trackString;
}
}
#big wigs
my $bigwig_tracks_string = "";
unless($big_wig_ids=~/^-$/){
$bigwig_tracks_string = retrieve_bigwig_tracks($genome_assembly_directory, $baseURL, $assembly_hub_name, $accession, $big_wig_ids);
}
$tracksList .= $bigwig_tracks_string;
#construct trackDb.txt
my $trackDb_txt_path = file($genome_assembly_directory, "trackDb.txt")->stringify;
my $trackDb_txt_file = 'trackDb.txt';
my $trackDb_txt_vars =
{
tracks => "$tracksList"
};
$template->process($trackDb_txt_file,$trackDb_txt_vars,$trackDb_txt_path) or
croak "Template process failed: ", $template->error(), "\n";
if (defined $log){
open(LOG, ">>", $log) or croak "$!";
print LOG "LOG Assembly Hub created\n";
close(LOG);
}
}
sub make_track_hub{
my ($species, $basedir, $baseURL, $big_bed_ids, $big_wig_ids, $log) = @_;
my ($basename,$dir,$ext);
my $this_function = (caller(0))[3];
#check arguments
croak ("ERROR [$this_function] \no species provided\n")
unless ($species);
croak ("ERROR [$this_function] \$basedir does not exist\n")
unless (-d $basedir);
croak ("ERROR [$this_function]: no URL (network location for upload to UCSC) provided")
unless(defined $baseURL);
if (defined $log){
open(LOG, ">>", $log) or croak "$!";
}
unless ($baseURL =~ /\/$/) { $baseURL .= "/"; }
my $tmp_path = dist_file('Bio-ViennaNGS', "hub.txt" );
($basename,$dir,$ext) = fileparse($tmp_path,qr/\..*/);
my $template_path = dir($dir,"template-TrackHub");
croak ("ERROR [$this_function] template directory not found\n")
unless (-d $template_path);
my $faToTwoBit = can_run('faToTwoBit') or
croak ("ERROR [$this_function] faToTwoBit is not installed!");
# create track hub directory structure
my $track_hub_name = "trackHub";
my $track_hub_directory = dir($basedir, $track_hub_name);
my $genome_assembly_name = $species;
my $genome_assembly_directory = dir($track_hub_directory,$genome_assembly_name);
mkdir $track_hub_directory;
mkdir $genome_assembly_directory;
if (defined $log){
print LOG "LOG Base directory: $track_hub_directory\n";
print LOG "LOG Track Hub directory: $genome_assembly_directory\n";
}
#template definition
my $template = Template->new({
INCLUDE_PATH => ["$template_path"],
RELATIVE=>1,
});
#construct hub.txt
my $hubtxt_path = file($track_hub_directory,"hub.txt")->stringify;
my $hubtxt_file = "hub.txt";
my $hubtxt_vars =
{
hubName => $species,
shortLabel => $species,
longLabel => $species,
genomesFile => "genome.txt",
email => 'email',
descriptionURL => "$baseURL" . "description.html"
};
$template->process($hubtxt_file,$hubtxt_vars,"$hubtxt_path") ||
croak "Template process failed: ", $template->error(), "\n";
#construct genome.txt
my $genometxt_path = file($track_hub_directory, "genome.txt")->stringify;
my $genometxt_file = "genome.txt";
my $genometxt_vars =
{
genome => $species,
trackDb => file($species, "trackDb.txt"),
groups => file($species, "groups.txt"),
description => "$species",
twoBitPath => file($species,$species.".2bit"),
organism => "organism",
defaultPos => $species,
orderKey => "10",
scientificName => "scientificName",
htmlPath => file($species,"description.html")
};
$template->process($genometxt_file,$genometxt_vars,$genometxt_path) or
croak "Template process failed: ", $template->error(), "\n";
my $tracksList;
#bigbeds are only created from urls
my $bigbeds_tracks_string = "";
unless($big_bed_ids=~/^-$/){
$bigbeds_tracks_string = retrieve_bigbed_url_tracks($genome_assembly_directory, $baseURL, $track_hub_name, $big_bed_ids);
}
$tracksList .= $bigbeds_tracks_string;
#big wigs
my $bigwig_tracks_string = "";
unless($big_wig_ids=~/^-$/){
$bigwig_tracks_string = retrieve_bigwig_tracks($genome_assembly_directory, $baseURL, $track_hub_name, $species, $big_wig_ids);
}
$tracksList .= $bigwig_tracks_string;
#construct trackDb.txt
my $trackDb_txt_path = file($genome_assembly_directory, "trackDb.txt")->stringify;
my $trackDb_txt_file = 'trackDb.txt';
my $trackDb_txt_vars =
{
tracks => "$tracksList"
};
$template->process($trackDb_txt_file,$trackDb_txt_vars,$trackDb_txt_path) or
croak "Template process failed: ", $template->error(), "\n";
if (defined $log){
print LOG "LOG Track Hub created\n";
close(LOG);
}
}
sub convert_tracks{
my ($filesdir,$genome_assembly_directory,$chromosome_size_filepath,$log) = @_;
my $this_function = (caller(0))[3];
my @bedfiles = ();
find( sub {
return unless -f;
return unless /\.bed$/;
push @bedfiles, $File::Find::name;
} , $filesdir);
if (defined $log){
open(LOG, ">>", $log) or croak $!;
foreach my $bedfile (@bedfiles){
print LOG "LOG [$this_function] found file $bedfile\n";
}
close(LOG);
}
foreach my $bedfile (@bedfiles){
bed2bigBed($bedfile,$chromosome_size_filepath,$genome_assembly_directory,$log);
}
return 1;
}
sub retrieve_tracks{
my ($directoryPath,$baseURL,$assembly_hub_name,$accession) = @_;
my $currentDirectory = getcwd;
chdir $directoryPath or croak $!;
my @trackfiles = <*.bb>;
my @tracks;
my $counter = 0;
foreach my $trackfile (@trackfiles){
my $filename = $trackfile;
$filename =~ s/.bb$//;
my @filenameSplit = split(/\./, $filename);
my $id = lc($filename);
my $tag = $id;
my $track = $id . "_bed";
my $dir = dir($baseURL,$assembly_hub_name,$accession);
my $bigDataUrl = file($trackfile);
my $shortLabel = $id;
my $longLabel = $id;
my $type = "bigBed 12 .";
my $autoScale = "off";
my $bedNameLabel = "Gene Id";
my $searchIndex = "name";
my $colorByStrand = retrieve_color($counter);
my $visibility = "pack";
my $group = "annotation";
my $priority = "10";
my @track = ($tag,$track,$bigDataUrl,$shortLabel,$longLabel,$type,$autoScale,$bedNameLabel,$searchIndex,$colorByStrand,$visibility,$group,$priority);
my $trackreference = \@track;
push(@tracks, $trackreference);
$counter++;
}
chdir $currentDirectory or croak $!;
return @tracks;
}
sub retrieve_bigbed_url_tracks{
my ($directoryPath,$baseURL,$assembly_hub_name,$bigbedpaths) = @_;
my $currentDirectory = getcwd;
lib/Bio/ViennaNGS/UCSC.pm view on Meta::CPAN
$color[5] = "220,51,47 220,51,47";
#orange
$color[6] = "203,76,22 203,76,22";
#yellow
$color[7] = "181,138,0 181,138,0";
#black
$color[8] = "0,44,54 0,44,54";
#grey
$color[9] = "88,111,117 88,111,117";
return $color[$colorcode];
}
sub make_group{
my ($name, $label, $priority, $defaultIsClosed) = @_;
my $group ="name $name\nlabel $label\npriority $priority\ndefaultIsClosed $defaultIsClosed\n";
return $group;
}
sub make_track{
my ($tag, $track, $bigDataUrl, $shortLabel, $longLabel, $type, $autoScale, $bedNameLabel, $searchIndex, $colorByStrand, $visibility, $group, $priority) = @_;
my $trackEntry ="#$tag\ntrack $track\nbigDataUrl $bigDataUrl\nshortLabel $shortLabel\nlongLabel $longLabel\ntype $type\nautoScale $autoScale\nbedNameLabel $bedNameLabel\nsearchIndex $searchIndex\ncolorByStrand $colorByStrand\nvisibility $visibility...
return $trackEntry;
}
sub make_multi_bigwig_container_track{
my ($tag, $track, $shortLabel, $longLabel, $type, $autoScale, $visibility, $priority) = @_;
my $trackEntry ="#$tag\ntrack $track\ncontainer multiWig\nnoInherit on\nshortLabel $shortLabel\nlongLabel $longLabel\ntype $type\nconfigureable on\nvisibility $visibility\naggregate transparentOverlay\nshowSubtrackColorOnUi on\nautoScale $autoScale...
return $trackEntry;
}
sub make_bigwig_container_track{
my ($track, $bigDataUrl, $shortLabel, $longLabel, $type, $parent, $color) = @_;
my $trackEntry = "track $track\nbigDataUrl $bigDataUrl\nshortLabel $shortLabel\nlongLabel $longLabel\ntype $type\nparent $parent\ncolor $color\n\n";
return $trackEntry;
}
sub make_bigwig_track{
my ($tag, $track, $bigDataUrl, $shortLabel, $longLabel, $type, $autoScale, $visibility, $priority, $color) = @_;
my $trackEntry ="#$tag\ntrack $track\nbigDataUrl $bigDataUrl\nshortLabel $shortLabel\nlongLabel $longLabel\ntype $type\nvisibility $visibility\nautoScale $autoScale\npriority $priority\nalwaysZero on\nyLineMark 0\nyLineOnOff on\nmaxHeightPixels 125...
return $trackEntry;
}
sub valid_ncbi_accession{
# receives a NCBI accession ID, with or without version number
# checks for validity and returns the accession number as is
my $acc = shift;
if ($acc =~ /^(N[CSTZ]\_[A-Z]*\d{6})\.\d+?$/){
return $acc;
}
elsif ($acc =~ /^(N[CSTZ]\_[A-Z]*\d{6})$/){
return $1;
}
else {
return 0;
}
}
sub parse_fasta_header{
my $filepath = shift;
my $this_function = (caller(0))[3];
open my $file, '<', "$filepath" or die $!;
my $fastaheader = <$file>;
chomp $fastaheader;
close $file;
my @ids = ();
#>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655
if($fastaheader=~/^>gi/){
my @headerfields = split(/\|/, $fastaheader);
my $accession = $headerfields[3];
my $scientificName = $headerfields[4];
push(@ids,$accession);
push(@ids,$scientificName);
return \@ids;
}else{
$fastaheader=~s/^>//;
carp "INFO [$this_function] It looks like your input Fasta header does not contain a valid NCBI accession number. Continuing with:\n $fastaheader\n"
unless ( valid_ncbi_accession($fastaheader) );
push(@ids,$fastaheader);
push(@ids,"scientific name not set");
return \@ids;
}
}
sub write_chromosome_size_file{
my $filepath = shift;
my $chromosome_name = shift;
my $chromosome_size = shift;
my $entry = $chromosome_name . "\t" . $chromosome_size . "\n";
open CHROMFILE, '>', "$filepath" or die $!;
print CHROMFILE $entry;
close CHROMFILE;
return 1;
}
sub write_chromosome_sizes_file{
my $filepath = shift;
my $chromosome_sizes_reference = shift;
my %chromosome_sizes = %{$chromosome_sizes_reference};
open CHROMFILE, '>', "$filepath" or die $!;
foreach my $chromosome_name ( keys %chromosome_sizes){
my $chromosome_size = $chromosome_sizes{$chromosome_name};
my $entry = $chromosome_name . "\t" . $chromosome_size . "\n";
print CHROMFILE $entry;
}
close CHROMFILE;
return 1;
}
sub retrieve_chromosome_size{
my $inputFilepath = shift;
open INFILE, '<', $inputFilepath;
my @newfasta;
my $chromosome_size = 0;
my $header_skipped = 0;
while (<INFILE>) {
if($header_skipped){
chomp;
$chromosome_size += length($_);
}else{
$header_skipped = 1;
( run in 1.780 second using v1.01-cache-2.11-cpan-5b529ec07f3 )