Bio-Graphics

 view release on metacpan or  search on metacpan

scripts/index_cov_files.pl  view on Meta::CPAN

#
# 2009-2010 Victor Strelets, FlyBase.org 

##$testing= 300000;

#$debug= 1; $do_only_chr= '2L';  
#$debug= 1; $do_only_chr= '2L'; $do_only_subset= 'BS40_all_unique'; 
#$debug= 1; $do_only_subset= 'Female_Heads'; $do_only_chr= '2';

$apply_log= 0;
$log2= log(2);
%LogTabs= ( 0, 0, 1, 0 ); # for our purposes, these start values are right	
$log_magnifier= 1.0;

	if( @ARGV && $ARGV[0]=~/^\-?log/i ) { $apply_log= 1; print "applying log\n"; }
	
	print "\nIndexing COV files for use with the fb_shmiggle glyph\n";

	eval {use lib("/LS/common/system-local/perl/lib"); };
	use BerkeleyDB;

	my $filesmask= '*.cov*';
	$filemask= shift(@ARGV) if @ARGV;
	$filemask=~s/\./\\\./g;
	$filemask=~s/[\*]/\\*/g;
	indexfeatdir('./',$filesmask) ; 

	exit();

#*************************************************************************
#
#*************************************************************************

sub indexfeatdir 
{
	my($dir,$mask)= @_;

	local(*D); opendir(D, $dir) || warn "can't open $dir";
	my @files= grep( /\.(cov|wig)/i, readdir(D));
	closedir(D);
	
	my $datfilename= 'data.cat';
	system("rm $datfilename") if -e $datfilename;
	open(OUTDATF,'>'.$datfilename) || die "Cannot open $datfilename!";
	my $bdbfilename= 'index.bdbhash';
	unlink($bdbfilename) if( -e $bdbfilename );
	%ResIndexHash= (); # !!GLOBAL
	tie %ResIndexHash, "BerkeleyDB::Hash", -Filename => $bdbfilename, -Flags => DB_CREATE;

	$max_signal = 0;
	@SubsetNames= ();
	foreach my $file (sort @files) { 
		indexCoverageFile($file); # coverage files are in fact wiggle files.. 
		}
	$ResIndexHash{'subsets'}= join("\t",@SubsetNames); # record subsets, just in case..
	$ResIndexHash{'max_signal'}= $max_signal;
	my @all_keys= keys %ResIndexHash;
	foreach my $kkey ( sort @all_keys ) { print "\t$kkey => ".$ResIndexHash{$kkey}."\n"; }
	if( $max_signal>10000 ) { print "WARNING: max_signal=$max_signal - TOO HIGH!! Re-run with '-log' option\n"; } 
  untie %ResIndexHash;
	chmod(0666,$bdbfilename); # ! sometimes very important
	close(OUTDATF);

	return;
}
 
#*************************************************************************
#
#*************************************************************************

sub indexCoverageFile 
{
  my($file)= @_;
	if( $debug && defined $do_only_subset ) {
		return unless $file=~/^${do_only_subset}\./; }
	my $zcat= get_zcat($file);
	local(*INF);
	open(INF,"$zcat $file |") || die "Can't open $file";
	print "\t$file\n";
	my $SubsetName= ($file=~/^([^\.]+)\./) ? $1 : $file;
	push(@SubsetNames,$SubsetName);
  my $chromosome= "";
	my @offsets= ();
	# following setting is very important for performance (in some cases)
	# value 1000 (otherwise good) on K.White dataset was causing start of reading 100K before the actually required point..
	my $step= 1000; # step in coverage file lines ()signal reads to save start-offset
	my $coordstep= 20000; # step in coords to save start-offset
	my $counter= 0;
	my $offset= tell(OUTDATF);
	$ResIndexHash{$SubsetName}= $offset; # record offset where new subset data starts
	my $old_signal= 0;
	my $oldcoord= -200000;
	my $FileFormat= 1;
	my $StartCoord= 0;
	my $lastRecordedCoord= -200000;
	while( (my $str= <INF>) ) {
		$offset= tell(OUTDATF);

		# correct variant of GEO preferred subset spec
		if( $str=~m/^(track[ \t]+type=wiggle_0)\s*\n$/i ) { # new subset starting
			$str= $1 . ' name="' . $SubsetName ."\"\n"; }

		# following is a GEO preferred subset spec
		if( $str=~m/^track[ \t]+type=wiggle_0[ \t]+name="([^"]+)"/i ) { # new subset starting
			$FileFormat= 4;
			$SubsetName= $1;
			#$chromosome= ""; 
			next; # because it is not a signal, should not be printed in this data loop
			}

		# fix for K.White files
		elsif( $str=~m/^track[ \t]+type=bedGraph[ \t]+name="([^"]+)"/i ) { # new subset starting
			$FileFormat= 4;
			$SubsetName=~s/_(combined|coverage)$//i; $SubsetName=~s/_(combined|coverage)$//i; $SubsetName=~s/^G[A-Z]{2}\d+[_\-]//;
			#$chromosome= ""; 
			next; # because it is not a signal, should not be printed in this data loop
			}

		elsif( $str=~m/^variableStep[ \t]+(chr(om(osome)?)?|arm)=(\w+)/i ) { # potentially new arm starting
			$FileFormat= 4;
			my $new_chromosome= $4; 



( run in 0.419 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )