Bio-Graphics

 view release on metacpan or  search on metacpan

scripts/coverage_to_topoview.pl  view on Meta::CPAN

# This script will process the output of bam_coverage_windows.pl
# to the data structure needed by the topoview glyph
# Sheldon McKay (sheldon.mckay@gmail.com)

my ($log,$outdir,$help);
GetOptions (
    "log"           => \$log,
    "output-dir=s"  => \$outdir,
    "help"          => \$help
);

my $usage  = q(
Usage: perl coverage_to_topoview.pl [-o output_dir] [-h] [-l] file1.wig.gz file2.wig.gz ...
    -o output directory (default 'topoview')
    -l use log2 for read counts (recommended)
    -h this help message
);

die $usage if !@ARGV || $help;

$outdir ||= 'topoview';

my (%bdb_hash,$max_signal,@SubsetNames);

system "mkdir -p $outdir";

my $outfile = "$outdir/data.cat";
unlink $outfile if -e $outfile;

open( COV, '>' . $outfile ) || die "Cannot open $outfile!";
my $hashfile = "$outdir/index.bdbhash";
unlink $hashfile if -e $hashfile;

%bdb_hash = ();
tie(%bdb_hash, "BerkeleyDB::Hash",
    -Filename => $hashfile,
    -Flags    => DB_CREATE);

$max_signal  = 0;
@SubsetNames = ();

for my $file ( sort @ARGV ) {
    next unless is_bed4($file);
    indexCoverageFile($file);
}

$bdb_hash{'subsets'} = join( "\t", @SubsetNames );
$bdb_hash{'max_signal'} = $max_signal;

my @all_keys = keys %bdb_hash;

for my $kkey ( sort @all_keys ) {
    print "\t$kkey => " . $bdb_hash{$kkey} . "\n";
}

if ( $max_signal > 10000 ) {
    warn "WARNING: max_signal=$max_signal - TOO HIGH.  Consider log2?\n";
}

untie %bdb_hash;
chmod( 0666, $hashfile );     # ! sometimes very important

close COV;


sub is_bed4 {
    my $file = shift;
    my $cat  = $file =~ /gz$/ ? 'zcat' 
             : $file =~ /bz2/ ? 'bzcat'
             : 'cat';
    open WIG, "$cat $file |" or die "could not open $file: $!";
    
    my $idx;
    while (<WIG>) {
	next if /^track/;
	last if ++$idx > 9;

	my ($ref,$start,$end,$score,@other) = split "\t";
	if (@other > 0) {
	    die "Extra fields, I was expecting BED4";
	}
	unless ($ref && $start && $end && $score) {
	    die "Not enough fields, I was expecting BED4";
	}
	unless (is_numeric($start) && is_numeric($end)) {
	    die "start ($start) and end ($end) are supposed to be numbers";
	}
	unless (is_numeric($score)) {
	    die "score ($score) is not numeric"
	}
    }
    
    return 1;
}

sub is_numeric {
    no warnings;
    return defined eval { $_[ 0] == 0 };
}

sub indexCoverageFile {
    my $file = shift;
    my $zcat = get_zcat($file);

    open( INF, "$zcat $file |" ) || die "Can't open $file";
    
    chomp(my $SubsetName = `basename $file .wig.gz`);

    print STDERR "Subset=$SubsetName\n";

    push( @SubsetNames, $SubsetName );

    my $old_ref = "";
    my @offsets    = ();

    my $step      = 1000; 
    my $coordstep = 5000;
    my $counter   = 0;
    my $offset    = tell(COV);

    $bdb_hash{$SubsetName} = $offset;    # record offset where new subset data starts



( run in 1.187 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )