Bio-Graphics
view release on metacpan or search on metacpan
scripts/index_cov_files.pl view on Meta::CPAN
#
# 2009-2010 Victor Strelets, FlyBase.org
##$testing= 300000;
#$debug= 1; $do_only_chr= '2L';
#$debug= 1; $do_only_chr= '2L'; $do_only_subset= 'BS40_all_unique';
#$debug= 1; $do_only_subset= 'Female_Heads'; $do_only_chr= '2';
$apply_log= 0;
$log2= log(2);
%LogTabs= ( 0, 0, 1, 0 ); # for our purposes, these start values are right
$log_magnifier= 1.0;
if( @ARGV && $ARGV[0]=~/^\-?log/i ) { $apply_log= 1; print "applying log\n"; }
print "\nIndexing COV files for use with the fb_shmiggle glyph\n";
eval {use lib("/LS/common/system-local/perl/lib"); };
use BerkeleyDB;
my $filesmask= '*.cov*';
$filemask= shift(@ARGV) if @ARGV;
$filemask=~s/\./\\\./g;
$filemask=~s/[\*]/\\*/g;
indexfeatdir('./',$filesmask) ;
exit();
#*************************************************************************
#
#*************************************************************************
sub indexfeatdir
{
my($dir,$mask)= @_;
local(*D); opendir(D, $dir) || warn "can't open $dir";
my @files= grep( /\.(cov|wig)/i, readdir(D));
closedir(D);
my $datfilename= 'data.cat';
system("rm $datfilename") if -e $datfilename;
open(OUTDATF,'>'.$datfilename) || die "Cannot open $datfilename!";
my $bdbfilename= 'index.bdbhash';
unlink($bdbfilename) if( -e $bdbfilename );
%ResIndexHash= (); # !!GLOBAL
tie %ResIndexHash, "BerkeleyDB::Hash", -Filename => $bdbfilename, -Flags => DB_CREATE;
$max_signal = 0;
@SubsetNames= ();
foreach my $file (sort @files) {
indexCoverageFile($file); # coverage files are in fact wiggle files..
}
$ResIndexHash{'subsets'}= join("\t",@SubsetNames); # record subsets, just in case..
$ResIndexHash{'max_signal'}= $max_signal;
my @all_keys= keys %ResIndexHash;
foreach my $kkey ( sort @all_keys ) { print "\t$kkey => ".$ResIndexHash{$kkey}."\n"; }
if( $max_signal>10000 ) { print "WARNING: max_signal=$max_signal - TOO HIGH!! Re-run with '-log' option\n"; }
untie %ResIndexHash;
chmod(0666,$bdbfilename); # ! sometimes very important
close(OUTDATF);
return;
}
#*************************************************************************
#
#*************************************************************************
sub indexCoverageFile
{
my($file)= @_;
if( $debug && defined $do_only_subset ) {
return unless $file=~/^${do_only_subset}\./; }
my $zcat= get_zcat($file);
local(*INF);
open(INF,"$zcat $file |") || die "Can't open $file";
print "\t$file\n";
my $SubsetName= ($file=~/^([^\.]+)\./) ? $1 : $file;
push(@SubsetNames,$SubsetName);
my $chromosome= "";
my @offsets= ();
# following setting is very important for performance (in some cases)
# value 1000 (otherwise good) on K.White dataset was causing start of reading 100K before the actually required point..
my $step= 1000; # step in coverage file lines ()signal reads to save start-offset
my $coordstep= 20000; # step in coords to save start-offset
my $counter= 0;
my $offset= tell(OUTDATF);
$ResIndexHash{$SubsetName}= $offset; # record offset where new subset data starts
my $old_signal= 0;
my $oldcoord= -200000;
my $FileFormat= 1;
my $StartCoord= 0;
my $lastRecordedCoord= -200000;
while( (my $str= <INF>) ) {
$offset= tell(OUTDATF);
# correct variant of GEO preferred subset spec
if( $str=~m/^(track[ \t]+type=wiggle_0)\s*\n$/i ) { # new subset starting
$str= $1 . ' name="' . $SubsetName ."\"\n"; }
# following is a GEO preferred subset spec
if( $str=~m/^track[ \t]+type=wiggle_0[ \t]+name="([^"]+)"/i ) { # new subset starting
$FileFormat= 4;
$SubsetName= $1;
#$chromosome= "";
next; # because it is not a signal, should not be printed in this data loop
}
# fix for K.White files
elsif( $str=~m/^track[ \t]+type=bedGraph[ \t]+name="([^"]+)"/i ) { # new subset starting
$FileFormat= 4;
$SubsetName=~s/_(combined|coverage)$//i; $SubsetName=~s/_(combined|coverage)$//i; $SubsetName=~s/^G[A-Z]{2}\d+[_\-]//;
#$chromosome= "";
next; # because it is not a signal, should not be printed in this data loop
}
elsif( $str=~m/^variableStep[ \t]+(chr(om(osome)?)?|arm)=(\w+)/i ) { # potentially new arm starting
$FileFormat= 4;
my $new_chromosome= $4;
( run in 0.419 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )