Bio-Graphics
view release on metacpan or search on metacpan
scripts/coverage_to_topoview.pl view on Meta::CPAN
# This script will process the output of bam_coverage_windows.pl
# to the data structure needed by the topoview glyph
# Sheldon McKay (sheldon.mckay@gmail.com)
my ($log,$outdir,$help);
GetOptions (
"log" => \$log,
"output-dir=s" => \$outdir,
"help" => \$help
);
my $usage = q(
Usage: perl coverage_to_topoview.pl [-o output_dir] [-h] [-l] file1.wig.gz file2.wig.gz ...
-o output directory (default 'topoview')
-l use log2 for read counts (recommended)
-h this help message
);
die $usage if !@ARGV || $help;
$outdir ||= 'topoview';
my (%bdb_hash,$max_signal,@SubsetNames);
system "mkdir -p $outdir";
my $outfile = "$outdir/data.cat";
unlink $outfile if -e $outfile;
open( COV, '>' . $outfile ) || die "Cannot open $outfile!";
my $hashfile = "$outdir/index.bdbhash";
unlink $hashfile if -e $hashfile;
%bdb_hash = ();
tie(%bdb_hash, "BerkeleyDB::Hash",
-Filename => $hashfile,
-Flags => DB_CREATE);
$max_signal = 0;
@SubsetNames = ();
for my $file ( sort @ARGV ) {
next unless is_bed4($file);
indexCoverageFile($file);
}
$bdb_hash{'subsets'} = join( "\t", @SubsetNames );
$bdb_hash{'max_signal'} = $max_signal;
my @all_keys = keys %bdb_hash;
for my $kkey ( sort @all_keys ) {
print "\t$kkey => " . $bdb_hash{$kkey} . "\n";
}
if ( $max_signal > 10000 ) {
warn "WARNING: max_signal=$max_signal - TOO HIGH. Consider log2?\n";
}
untie %bdb_hash;
chmod( 0666, $hashfile ); # ! sometimes very important
close COV;
sub is_bed4 {
my $file = shift;
my $cat = $file =~ /gz$/ ? 'zcat'
: $file =~ /bz2/ ? 'bzcat'
: 'cat';
open WIG, "$cat $file |" or die "could not open $file: $!";
my $idx;
while (<WIG>) {
next if /^track/;
last if ++$idx > 9;
my ($ref,$start,$end,$score,@other) = split "\t";
if (@other > 0) {
die "Extra fields, I was expecting BED4";
}
unless ($ref && $start && $end && $score) {
die "Not enough fields, I was expecting BED4";
}
unless (is_numeric($start) && is_numeric($end)) {
die "start ($start) and end ($end) are supposed to be numbers";
}
unless (is_numeric($score)) {
die "score ($score) is not numeric"
}
}
return 1;
}
sub is_numeric {
no warnings;
return defined eval { $_[ 0] == 0 };
}
sub indexCoverageFile {
my $file = shift;
my $zcat = get_zcat($file);
open( INF, "$zcat $file |" ) || die "Can't open $file";
chomp(my $SubsetName = `basename $file .wig.gz`);
print STDERR "Subset=$SubsetName\n";
push( @SubsetNames, $SubsetName );
my $old_ref = "";
my @offsets = ();
my $step = 1000;
my $coordstep = 5000;
my $counter = 0;
my $offset = tell(COV);
$bdb_hash{$SubsetName} = $offset; # record offset where new subset data starts
( run in 1.187 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )