endpoint results from the CPAN

endpoint

Bio-ToolBox

view release on metacpan or search on metacpan

href="/search?qci=&q=endpoint&qft=&qd=Bio-ToolBox&qifl=&f=scripts%2Fbam2wig.pl" title="grep on this file">scripts/bam2wig.pl href="https://metacpan.org/release/Bio-ToolBox/source/scripts/bam2wig.pl#L101" title="view on Meta::CPAN"> view on Meta::CPAN data-start="41" data-line="101"> $parallel = 1; This program will convert bam alignments to wig data\n"; command line options are present line options and initialize values $position, $splice, $paired, $chr_number, $correlation_min, $zmax, $model, $do_strand, $min_mapq, $nosecondary, $noduplicate, $min_isize, $first_read, $multi_hit_scale, $splice_scale, $rpm, $chrnorm, $chrapply, $chr_exclude, $dec_precison, $bigwig, $bwapp, $cpu, $max_intron, $verbose, $gz, $tempdir, = 0; # these need to be explicitly set to zero because I sum them = 0; # to verify user input => \@bamfiles, # one or more bam files => \$outfile, # name of output file => \$use_start, # record start point => \$use_mid, # record mid point => \$use_span, # record span => \$use_cspan, # record center span => \$use_extend, # extend read => \$use_smartpe, # smart paired coverage => \$use_ends, # record paired-end endpoints => \$use_coverage, # calculate coverage => \$position, # legacy option => \$splice, # split splices => \$paired, # paired-end alignments => \$fastpaired, # fast paired-end alignments => \$shift, # shift coordinates 3' => \$shift_value, # value to shift coordinates => \$extend_value, # value to extend reads => \$chr_number, # number of chromosomes to sample => \$correlation_min, # R minimum value for shift => \$zmin, # minimum z-score interval for calculating shift => \$zmax, # maximum z-score interval for calculating shift => \$model, # write the strand shift model data => \$do_strand, # separate strands => \$flip, # flip the strands => \$min_mapq, # minimum mapping quality => \$nosecondary, # skip secondary alignments => \$noduplicate, # skip duplicate alignments => \$nosupplementary, # skip supplementary alignments => \$max_isize, # maximum paired insert size to accept => \$min_isize, # minimum paired insert size to accept => \$first_read, # only take first read => \$second_read, # only take second read => \$multi_hit_scale, # scale by number of hits => \$splice_scale, # divide counts by number of spliced segments => \$rpm, # calculate reads per million => \$do_mean, # rpm scale separately => \@scale_values, # user specified scale value => \$chrnorm, # chromosome-specific normalization => \$chrapply, # chromosome-specific normalization regex => \$chr_exclude, # regex for skipping chromosomes => \$exclude_file, # file for skipping regions => \$bin_size, # size for binning the data => \$dec_precison, # format to number of decimal positions => \$bigwig, # generate bigwig file => \$bwapp, # utility to generate a bigwig file => \$do_bedgraph, # write a bedgraph output => \$do_fixstep, # write a fixedStep output => \$do_varstep, # write a varStep output => \$no_zero, # do not write zero coverage => \$gz, # compress text output => \$cpu, # number of cpu cores to use => \$max_intron, # maximum intron size to allow => \$window, # window size to control memory usage => \$verbose, # print sample correlations => \$tempdir, # directory to write temp files => \$BAM_ADAPTER, # explicitly set the adapter version => \$help, # request help => \$print_version, # print the version unrecognized option(s)!! please refer to the help documentation\n\n";


  

  
    scripts/bam2wig.pl
     view on Meta::CPAN
    
    
		else {
			print STDERR
" FATAL: incompatible mode with paired and splices enabled! Try --smartcov\n";
			exit 1;
		}
	}
	$max_intron ||= 0;

	# check paired-end requirements
	$paired = 1 if $fastpaired;    # for purposes here, fastpair is paired
	if ( ( $paired or $use_smartpe ) and ( $first_read or $second_read ) ) {
		$paired      = 0;          # not necessary
		$use_smartpe = 0;
	}
	if ($use_smartpe) {
		$paired = 1;
		$splice = 1;

		# warnings
		if ($fastpaired) {
			print " WARNING: disabling fast-paired mode with smart-paired coverage\n";
			$fastpaired = 0;
		}
		if ($splice_scale) {
			print " WARNING: disabling splice scaling with smart-paired coverage\n";
			$splice_scale = 0;
		}

		# required modules
		my $fast = 0;
		eval {
			# required for conveniently assembling overlapping and spliced coverage
			require Set::IntSpan::Fast;
			$fast = 1;
		};
		unless ($fast) {
			print STDERR
" FATAL: Module Set::IntSpan::Fast is required for smart-paired coverage\n";
			exit 1;
		}
	}
	if ($use_ends) {
		$paired = 1;
	}

	# incompatible options
	if ( $splice and ( $use_extend or $extend_value ) ) {
		print " WARNING: disabling splices when extend is defined\n";
		$splice = 0;
	}
	if ( $shift and $splice ) {
		print
" WARNING: enabling both shift and splices is currently not allowed. Pick one.\n";
		exit 1;
	}
	if ( $shift and $paired ) {
		print " WARNING: disabling shift with paired reads\n";
		undef $shift;
	}
	if ( $use_ends and $shift ) {
		print " WARNING: disabling shift when recording paired endpoints\n";
		undef $shift;
	}
	if ( $use_ends and $splice ) {
		print " WARNING: disabling splices when recording paired endpoints\n";
		undef $splice;
	}

	# check to shift position or not
	if ( $shift or $use_extend ) {

		# set parameters for calculating shift value
		unless ( $shift_value or $extend_value ) {

			# not provided by user, empirical calculation required
			my $stat = 0;
			eval {
				# required for calculating shift
				require Statistics::Descriptive;
				$stat = 1;
			};
			unless ($stat) {
				print STDERR
" FATAL: Provide a shift value or install the Perl module Statistics::Descriptive\n"
					. " to empirically determine the shift value.\n";
				exit 1;
			}
		}

		if ( defined $correlation_min ) {
			if ( $correlation_min <= 0 or $correlation_min >= 1 ) {
				print STDERR
" FATAL: cannot use minimum correlation value of $correlation_min!\n use --help for more information\n";
				exit 1;
			}
		}
		else {
			$correlation_min = 0.5;
		}
		$chr_number ||= 4;
		$zmin       ||= 3;
		$zmax       ||= 10;
	}

	# center span should have extend value
	if ( $use_cspan and not $extend_value ) {
		print STDERR
" FATAL: please use the --extval option to define an extend value when using center span\n";
		exit 1;
	}

	# check mapping quality
	if ( defined $min_mapq ) {
		if ( $min_mapq > 255 or $min_mapq < 0 ) {
			print STDERR " FATAL: quality score must be 0-255!\n";
			exit 1;
		}
	}
	else {
		$min_mapq = 0;
	}

	# check paired-end insert size
	unless ( defined $max_isize ) {
		if ($use_smartpe) {

    
  
  

  
    scripts/bam2wig.pl
     view on Meta::CPAN
    
    
	elsif ( not $paired and $do_strand and not $shift and $use_extend ) {
		$callback = \&se_strand_extend;
		print " Recording single-end stranded, extended alignment span\n";
	}
	elsif ( not $paired and $do_strand and $shift and $use_extend ) {
		$callback = \&se_shift_strand_extend;
		print " Recording single-end shifted, stranded, extended alignment span\n";
	}

	# paired-end start
	elsif ( $paired and not $do_strand and $use_start ) {
		$callback = \&pe_start;
		print " Recording paired-end fragment start\n";
	}
	elsif ( $paired and $do_strand and $use_start ) {
		$callback = \&pe_strand_start;
		print " Recording paired-end stranded fragment start\n";
	}

	# paired-end midpoint
	elsif ( $paired and not $do_strand and $use_mid ) {
		$callback = \&pe_mid;
		print " Recording paired-end fragment midpoint\n";
	}
	elsif ( $paired and $do_strand and $use_mid ) {
		$callback = \&pe_strand_mid;
		print " Recording paired-end stranded fragment midpoint\n";
	}

	# paired-end span
	elsif ( $paired and not $do_strand and $use_span ) {
		$callback = \&pe_span;
		print " Recording paired-end fragment span\n";
	}
	elsif ( $paired and $do_strand and $use_span ) {
		$callback = \&pe_strand_span;
		print " Recording paired-end stranded, fragment span\n";
	}

	# paired-end center span
	elsif ( $paired and not $do_strand and $use_cspan ) {
		$callback = \&pe_center_span;
		print " Recording paired-end fragment center-span\n";
	}
	elsif ( $paired and $do_strand and $use_cspan ) {
		$callback = \&pe_strand_center_span;
		print " Recording paired-end stranded, fragment center-span\n";
	}

	# paired-end smart coverage
	elsif ( $paired and not $do_strand and $use_smartpe ) {
		$callback = \&smart_pe;
		print " Recording smart paired-end coverage\n";
	}
	elsif ( $paired and $do_strand and $use_smartpe ) {
		$callback = \&smart_stranded_pe;
		print " Recording stranded, smart paired-end coverage\n";
	}
	elsif ( $paired and not $do_strand and $use_ends ) {
		$callback = \&pe_ends;
		print " Recording paired-end fragment endpoints\n";
	}
	elsif ( $paired and $do_strand and $use_ends ) {
		$callback = \&pe_strand_ends;
		print " Recording stranded, paired-end fragment endpoints\n";
	}
	else {
		die "programmer error!\n" unless $shift;    # special exception
	}

	# summary of wig file being written
	if ($do_bedgraph) {
		print " Writing bedGraph format in $bin_size bp increments\n";
	}
	elsif ($do_varstep) {
		printf " Writing variableStep format in $bin_size bp bins\n";
	}
	elsif ($do_fixstep) {
		printf " Writing fixedStep format in $bin_size bp bins\n";
	}
}

sub process_exclusion_list {
	if ( $exclude_file and -e $exclude_file ) {
		my $i = 0;
		eval { require Set::IntervalTree; $i = 1; };
		unless ($i) {
			print " WARNING! Please install Set::IntervalTree to use exclusion lists\n";
			undef $exclude_file;
			return;
		}
		my %list_hash = map { $_ => [] } @seq_list;
		my $Data      = Bio::ToolBox->load_file($exclude_file)
			or die "unable to read exclusion list file '$exclude_file'\n";
		unless ( $Data->feature_type eq 'coordinate' ) {
			printf
" WARNING! Exclusion list file '%s' does not have coordinates! Ignoring.\n",
				$exclude_file;
			return \%list_hash;
		}
		$Data->iterate(
			sub {
				my $row = shift;
				push @{ $list_hash{ $row->seq_id } }, [ $row->start - 1, $row->end ]
					if exists $list_hash{ $row->seq_id };
			}
		);
		printf " Loaded %s exclusion list regions\n",
			format_with_commas( $Data->number_rows );
		return \%list_hash;
	}
	return;
}

### Determine the shift value
sub determine_shift_value {

	# identify top regions to score
	# we will walk through the largest chromosome(s) looking for the top
	# 500 bp regions containing the highest unstranded coverage to use
	print "  sampling the top coverage regions "
		. "on the largest $chr_number chromosomes...\n";

	# first sort the chromosomes by size
	# this is assuming all the chromosomes have different sizes ;-)

    
  
  

  
    scripts/bam2wig.pl
     view on Meta::CPAN
    
    
		my %files;
		my @filelist = glob("$outbase.*.temp.bin");
		my $expected = scalar(@sams) * scalar(@seq_list) * ( $do_strand ? 2 : 1 );
		if ( scalar(@filelist) != $expected ) {
			die sprintf(
" unable to find all the temporary binary children files! Only found %d, expected %d\n",
				scalar(@filelist), $expected );
		}
		foreach my $file (@filelist) {

			# each file name is basename.samid.seqid.count.strand.bin.gz
			if ( $file =~ /$outbase \.(\d+) \.(.+) \.0 \.f \.temp \.bin \Z/x ) {
				my $samid = $1;
				my $seqid = $2;
				$files{$seqid}{$samid} = $file;
			}
		}

		# check to see how we combine the multiple source sam file coverages
		my @norms;
		if ($do_mean) {

			# just need to take an average
			foreach (@sams) { push @norms, ( 1 / scalar(@sams) ) }
		}

		# otherwise we just add the samples

		# merge the samples
		foreach my $seq_id (@seq_list) {
			$pm->start and next;
			merge_bin_files( $seq_id, 'f', 0, $files{$seq_id}, \@norms );
			$pm->finish;
		}
		$pm->wait_all_children;
	}

	# merge and write final wig file
	return write_final_wig_file();
}

sub process_bam_coverage_on_chromosome {
	my ( $samid, $sam, $seq_id, $do_temp_bin ) = @_;
	my $chr_start_time = time;

	# identify chromosome target id
	my ( $tid, undef, undef ) = $sam->header->parse_region($seq_id);
	unless ( defined $tid ) {
		printf " WARNING: unable to parse target id for '%s' from bam file %s!\n",
			$seq_id, $bamfiles[$samid];

		# we continue but this will silently fail the low level fetch below
		# but at least an empty temp file will be written allowing program to finish
	}
	my $seq_length = $seq_name2length{$seq_id};

	# walk through the chromosome in 1 kb increments
	my $chrom_data;
	for ( my $start = 0; $start < $seq_length; $start += $coverage_dump ) {

		# set endpoint
		my $end = $start + $coverage_dump;
		$end = $seq_length if $end > $seq_length;

		# using the low level interface for a little more performance
		my $coverage = low_level_bam_coverage( $sam, $tid, $start, $end );

		# record the coverage
		&{$coverage_sub}( $coverage, \$chrom_data );
	}

	# write out chromosome binary file, set count to arbitrary 0
	if ($do_temp_bin) {
		write_bin_file( \$chrom_data,
			join( '.', $outbase, $samid, $seq_id, 0, 'f', 'temp.bin' ) );
	}
	else {
		&{$wig_writer}(
			\$chrom_data, $binpack, $seq_id, $seq_length,
			join( '.', $outbase, $samid, $seq_id, 0, 'f', 'temp.wig' )
		);
	}

	# verbose status line
	if ($verbose) {
		printf " Generated read coverage on $seq_id in %d seconds\n",
			time - $chr_start_time;
	}
}

sub process_alignments {

	# flag to write temporary binary files or go straight to wig files
	my $do_temp_bin = scalar(@sams) > 1 ? 1 : $rpm ? 1 : @scale_values ? 1 : 0;

	# walk through each bam file and chromosome one a time
	for my $samid ( 0 .. $#sams ) {
		foreach my $seq_id (@seq_list) {
			process_alignments_on_chromosome( $samid, $sams[$samid], $seq_id,
				$do_temp_bin );
		}
	}

	if ($verbose) {
		printf " Finished converting $items in %.3f minutes\n",
			( time - $start_time ) / 60;
	}

	# find and merge binary files
	# we can do this in parallel too!
	if ($do_temp_bin) {

		# find the children
		my @totals;
		my %seq_totals;
		my %files;
		my @filelist = glob("$outbase.*.temp.bin");
		my $expected = scalar(@sams) * scalar(@seq_list) * ( $do_strand ? 2 : 1 );
		if ( scalar(@filelist) != $expected ) {
			die sprintf(
" unable to find all the temporary binary children files! Only found %d, expected %d\n",

    
  
  

  
    scripts/bam2wig.pl
     view on Meta::CPAN
    
    

sub pe_strand_ends {
	my ( $a, $data, $score ) = @_;

	# we always receive the forward read, never reverse, from pe_callback
	# therefore the first and second read flag indicates orientation

	# calculate both positions
	my $flag = $a->flag;
	if ( $flag & 0x0040 ) {

		# first read implies forward orientation
		# first position is forward
		$data->{f}->[ int( $a->pos / $bin_size ) - $data->{f_offset} ] += $score;

		# second position is reverse
		my $pos = int( ( $a->pos + $a->isize ) / $bin_size );
		$data->{r}->[ $pos - $data->{r_offset} ] += $score;
	}
	elsif ( $flag & 0x0080 ) {

		# second read implies reverse orientation
		# first position is reverse
		$data->{r}->[ int( $a->pos / $bin_size ) - $data->{r_offset} ] += $score;

		# second position is forward
		my $pos = int( ( $a->pos + $a->isize ) / $bin_size );
		$data->{f}->[ $pos - $data->{f_offset} ] += $score;
	}
	else {
		die sprintf(
" Paired-end flags are set incorrectly; neither 0x040 or 0x080 are set for paired-end read %s.",
			$a->query->name );
	}
}

__END__

=head1 NAME

bam2wig.pl

A program to convert Bam alignments into a wig representation file.

=head1 SYNOPSIS

bam2wig.pl [--options...] E<lt>file.bamE<gt>

bam2wig.pl --extend --rpm --mean --out file --bw file1.bam file2.bam
  
 Required options:
  -i --in <filename.bam>        repeat if multiple bams, or comma-delimited list
 
 Reporting options (pick one):
  -s --start                    record at 5' position
  -d --mid                      record at midpoint of alignment or pair
  -a --span                     record across entire alignment or pair
  -e --extend                   extend alignment (record predicted fragment)
  --cspan                       record a span centered on midpoint
  --smartcov                    record paired coverage without overlaps, splices
  --ends                        record paired endpoints
  --coverage                    raw alignment coverage
 
 Alignment reporting options:
  -l --splice                   split alignment at N splices
  -t --strand                   record separate strands as two wig files
  --flip                        flip the strands for convenience
  
 Paired-end alignments:
  -p --pe                       process paired-end alignments, both are checked
  -P --fastpe                   process paired-end alignments, only F are checked
  --minsize <integer>           minimum allowed insertion size (30)
  --maxsize <integer>           maximum allowed insertion size (600)
  --first                       only process paired first read (0x40) as single-end
  --second                      only process paired second read (0x80) as single-end
  
 Alignment filtering options:
  -K --chrskip <regex>          regular expression to skip chromosomes
  -B --exclude <file>           interval file of regions to skip (bed, gff, txt)
  -q --qual <integer>           minimum mapping quality (0)          
  -S --nosecondary              ignore secondary (0x100) alignments (false)
  -D --noduplicate              ignore duplicate (0x400) alignments (false)
  -U --nosupplementary          ignore supplementary (0x800) alignments (false)
  --intron <integer>            maximum allowed gap (intron) size in bp (none)
  
  Shift options:
  -I --shift                    shift reads in the 3' direction
  -x --extval <integer>         explicit extension size in bp (default is to calculate)
  -H --shiftval <integer>       explicit shift value in bp (default is to calculate) 
  --chrom <integer>             number of chromosomes to sample (4)
  --minr <float>                minimum pearson correlation to calculate shift (0.5)
  --zmin <float>                minimum z-score from average to test peak for shift (3)
  --zmax <float>                maximum z-score from average to test peak for shift (10)
  -M --model                    write peak shift model file for graphing
  
 Score options:
  -r --rpm                      scale depth to Reads Per Million mapped
  -m --mean                     average multiple bams (default is addition)
  --scale <float>               explicit scaling factor, repeat for each bam file
  --fraction                    assign fractional counts to multi-mapped alignments
  --splfrac                     assign fractional count to each spliced segment
  --format <integer>            number of decimal positions (4)
  --chrnorm <float>             use chromosome-specific normalization factor
  --chrapply <regex>            regular expression to apply chromosome-specific factor
 
 Wig format:
  --bin <integer>               bin size for span or extend mode (10)
  --bdg                         bedGraph, default for span and extend at bin 1
  --fix                         fixedStep, default for bin > 1
  --var                         varStep, default for start, mid
  --nozero                      do not write zero score intervals in bedGraph
  
 Output options:
  -o --out <filename>           output file name, default is bam file basename
  -b --bw                       convert to bigWig format (supports bdg, fix, var)
  --bwapp /path/to/wigToBigWig  path to external converter (default searches \$PATH)
  -z --gz                       gzip compress text output 
  
 General options:
  -c --cpu <integer>            number of parallel processes (4)
  --temp <directory>            directory to write temporary files (output path)
  -V --verbose                  report additional information
  -v --version                  print version information
  -h --help                     show full documentation

=head1 OPTIONS

The command line flags and descriptions:

=head2 Input

=over 4

=item --in E<lt>filenameE<gt>

Specify the input Bam alignment file. More than one file may be 
specified, either with repeated options, a comma-delimited list, 
or simply appended to the command. Bam files will be automatically 
indexed if necessary.

=back

=head2 Reporting Options

=over 4

=item --start

Specify that the 5' position should be recorded in the wig file.

=item --mid

Specify that the midpoint of the alignment (single-end) or fragment 
(paired-end) will be recorded in the wig file.

=item --span

Specify that the entire span of the alignment (single-end) or 
fragment (paired-end) will be recorded in the wig file. 

=item --extend

Specify that the alignment should be extended in the 3' direction 
and that the entire length of the extension be recorded in the wig 
file. The extension may be defined by the user or empirically 
determined.

=item --cspan

Specify that a defined span centered at the alignment (single-end) 
or fragment (paired-end) midpoint will be recorded in the wig file.
The span is defined by the extension value.

=item --smartcov

Smart alignment coverage of paired-end alignments without 
double-counting overlaps or recording gaps (intron splices). 

=item --ends

Record both endpoints of paired-end fragments, i.e. the outermost 
or 5' ends of properly paired fragments. This may be useful with 
ATAC-Seq, Cut&Run-Seq, or other cleavage experiments where you want 
to record the locations of cutting yet retain the ability to filter 
paired-end fragment sizes.

=item --coverage

Specify that the raw alignment coverage be calculated and reported 
in the wig file. This utilizes a special low-level operation and 
precludes any alignment filtering or post-normalization methods. 
Counting overlapping bases in paired-end alignments are dependent on 
the bam adapter (older versions would double-count).

=item --position [start|mid|span|extend|cspan|coverage]

Legacy option for supporting previous versions of bam2wig. 

=back

=head2 Alignment reporting options

=over 4

=item --splice

Indicate that the bam file contains alignments with splices, such as 
from RNASeq experiments. Alignments will be split on cigar N operations 
and each sub fragment will be recorded. This only works with single-end 
alignments, and is disabled for paired-end reads (just treat as single-end). 
Only start and span recording options are supported.

=item --strand

Indicate that separate wig files should be written for each strand. 
The output file basename is appended with either '_f' or '_r' for 
both files. Strand for paired-end alignments are determined by the 
strand of the first read.

=item --flip

Flip the strand of the output files when generating stranded wig files. 
Do this when RNA-Seq alignments map to the opposite strand of the 
coding sequence, depending on the library preparation method. 

=back

=head2 Paired-end alignments

=over 4

=item --pe

The Bam file consists of paired-end alignments, and only properly 
mapped pairs of alignments will be counted. Properly mapped pairs 
include FR reads on the same chromosome, and not FF, RR, RF, or 
pairs aligning to separate chromosomes. Both alignments are required 
to be present before the pair is counted. The default is to treat 
all alignments as single-end.

=item --fastpe








        
        
        ( run in 1.413 second using v1.01-cache-2.11-cpan-9581c071862 )