Bio-ToolBox
view release on metacpan or search on metacpan
scripts/bam2wig.pl view on Meta::CPAN
$parallel = 1;
};
our $VERSION = '2.03';
print "\n This program will convert bam alignments to wig data\n";
### Quick help
unless (@ARGV) {
# when no command line options are present
# print SYNOPSIS
pod2usage(
{
'-verbose' => 0,
'-exitval' => 1,
}
);
}
### Get command line options and initialize values
my (
$outfile, $position, $splice, $paired,
$fastpaired, $shift, $chr_number, $correlation_min,
$zmin, $zmax, $model, $do_strand,
$flip, $min_mapq, $nosecondary, $noduplicate,
$nosupplementary, $max_isize, $min_isize, $first_read,
$second_read, $multi_hit_scale, $splice_scale, $rpm,
$do_mean, $chrnorm, $chrapply, $chr_exclude,
$exclude_file, $bin_size, $dec_precison, $bigwig,
$no_zero, $bwapp, $cpu, $max_intron,
$window, $verbose, $gz, $tempdir,
$help, $print_version,
);
my $use_start = 0; # these need to be explicitly set to zero because I sum them
my $use_mid = 0; # to verify user input
my $use_span = 0;
my $use_cspan = 0;
my $use_extend = 0;
my $use_smartpe = 0;
my $use_ends = 0;
my $use_coverage = 0;
my $do_fixstep = 0;
my $do_varstep = 0;
my $do_bedgraph = 0;
my $extend_value = 0;
my $shift_value = 0;
my @bamfiles;
my @scale_values;
# Command line options
GetOptions(
'i|in=s' => \@bamfiles, # one or more bam files
'o|out=s' => \$outfile, # name of output file
's|start!' => \$use_start, # record start point
'd|mid!' => \$use_mid, # record mid point
'a|span!' => \$use_span, # record span
'cspan!' => \$use_cspan, # record center span
'e|extend!' => \$use_extend, # extend read
'smartcov!' => \$use_smartpe, # smart paired coverage
'ends!' => \$use_ends, # record paired-end endpoints
'coverage!' => \$use_coverage, # calculate coverage
'position=s' => \$position, # legacy option
'l|splice|split!' => \$splice, # split splices
'p|pe!' => \$paired, # paired-end alignments
'P|fastpe!' => \$fastpaired, # fast paired-end alignments
'I|shift!' => \$shift, # shift coordinates 3'
'H|shiftval=i' => \$shift_value, # value to shift coordinates
'x|extval=i' => \$extend_value, # value to extend reads
'chrom=i' => \$chr_number, # number of chromosomes to sample
'minr=f' => \$correlation_min, # R minimum value for shift
'zmin=f' => \$zmin, # minimum z-score interval for calculating shift
'zmax=f' => \$zmax, # maximum z-score interval for calculating shift
'M|model!' => \$model, # write the strand shift model data
't|strand!' => \$do_strand, # separate strands
'flip!' => \$flip, # flip the strands
'q|qual=i' => \$min_mapq, # minimum mapping quality
'S|nosecondary' => \$nosecondary, # skip secondary alignments
'D|noduplicate' => \$noduplicate, # skip duplicate alignments
'U|nosupplementary' => \$nosupplementary, # skip supplementary alignments
'maxsize=i' => \$max_isize, # maximum paired insert size to accept
'minsize=i' => \$min_isize, # minimum paired insert size to accept
'first!' => \$first_read, # only take first read
'second!' => \$second_read, # only take second read
'fraction!' => \$multi_hit_scale, # scale by number of hits
'splfrac!' => \$splice_scale, # divide counts by number of spliced segments
'r|rpm!' => \$rpm, # calculate reads per million
'm|separate|mean!' => \$do_mean, # rpm scale separately
'scale=s' => \@scale_values, # user specified scale value
'chrnorm=f' => \$chrnorm, # chromosome-specific normalization
'chrapply=s' => \$chrapply, # chromosome-specific normalization regex
'K|chrskip=s' => \$chr_exclude, # regex for skipping chromosomes
'exclude|blacklist=s' => \$exclude_file, # file for skipping regions
'bin=i' => \$bin_size, # size for binning the data
'format=i' => \$dec_precison, # format to number of decimal positions
'b|bw!' => \$bigwig, # generate bigwig file
'bwapp=s' => \$bwapp, # utility to generate a bigwig file
'bdg!' => \$do_bedgraph, # write a bedgraph output
'fix!' => \$do_fixstep, # write a fixedStep output
'var!' => \$do_varstep, # write a varStep output
'nozero!' => \$no_zero, # do not write zero coverage
'z|gz!' => \$gz, # compress text output
'c|cpu=i' => \$cpu, # number of cpu cores to use
'intron=i' => \$max_intron, # maximum intron size to allow
'window=i' => \$window, # window size to control memory usage
'V|verbose!' => \$verbose, # print sample correlations
'temp=s' => \$tempdir, # directory to write temp files
'adapter=s' => \$BAM_ADAPTER, # explicitly set the adapter version
'h|help' => \$help, # request help
'v|version' => \$print_version, # print the version
) or die " unrecognized option(s)!! please refer to the help documentation\n\n";
# Print help
if ($help) {
# print entire POD
pod2usage(
{
'-verbose' => 2,
'-exitval' => 1,
}
scripts/bam2wig.pl view on Meta::CPAN
else {
print STDERR
" FATAL: incompatible mode with paired and splices enabled! Try --smartcov\n";
exit 1;
}
}
$max_intron ||= 0;
# check paired-end requirements
$paired = 1 if $fastpaired; # for purposes here, fastpair is paired
if ( ( $paired or $use_smartpe ) and ( $first_read or $second_read ) ) {
$paired = 0; # not necessary
$use_smartpe = 0;
}
if ($use_smartpe) {
$paired = 1;
$splice = 1;
# warnings
if ($fastpaired) {
print " WARNING: disabling fast-paired mode with smart-paired coverage\n";
$fastpaired = 0;
}
if ($splice_scale) {
print " WARNING: disabling splice scaling with smart-paired coverage\n";
$splice_scale = 0;
}
# required modules
my $fast = 0;
eval {
# required for conveniently assembling overlapping and spliced coverage
require Set::IntSpan::Fast;
$fast = 1;
};
unless ($fast) {
print STDERR
" FATAL: Module Set::IntSpan::Fast is required for smart-paired coverage\n";
exit 1;
}
}
if ($use_ends) {
$paired = 1;
}
# incompatible options
if ( $splice and ( $use_extend or $extend_value ) ) {
print " WARNING: disabling splices when extend is defined\n";
$splice = 0;
}
if ( $shift and $splice ) {
print
" WARNING: enabling both shift and splices is currently not allowed. Pick one.\n";
exit 1;
}
if ( $shift and $paired ) {
print " WARNING: disabling shift with paired reads\n";
undef $shift;
}
if ( $use_ends and $shift ) {
print " WARNING: disabling shift when recording paired endpoints\n";
undef $shift;
}
if ( $use_ends and $splice ) {
print " WARNING: disabling splices when recording paired endpoints\n";
undef $splice;
}
# check to shift position or not
if ( $shift or $use_extend ) {
# set parameters for calculating shift value
unless ( $shift_value or $extend_value ) {
# not provided by user, empirical calculation required
my $stat = 0;
eval {
# required for calculating shift
require Statistics::Descriptive;
$stat = 1;
};
unless ($stat) {
print STDERR
" FATAL: Provide a shift value or install the Perl module Statistics::Descriptive\n"
. " to empirically determine the shift value.\n";
exit 1;
}
}
if ( defined $correlation_min ) {
if ( $correlation_min <= 0 or $correlation_min >= 1 ) {
print STDERR
" FATAL: cannot use minimum correlation value of $correlation_min!\n use --help for more information\n";
exit 1;
}
}
else {
$correlation_min = 0.5;
}
$chr_number ||= 4;
$zmin ||= 3;
$zmax ||= 10;
}
# center span should have extend value
if ( $use_cspan and not $extend_value ) {
print STDERR
" FATAL: please use the --extval option to define an extend value when using center span\n";
exit 1;
}
# check mapping quality
if ( defined $min_mapq ) {
if ( $min_mapq > 255 or $min_mapq < 0 ) {
print STDERR " FATAL: quality score must be 0-255!\n";
exit 1;
}
}
else {
$min_mapq = 0;
}
# check paired-end insert size
unless ( defined $max_isize ) {
if ($use_smartpe) {
scripts/bam2wig.pl view on Meta::CPAN
elsif ( not $paired and $do_strand and not $shift and $use_extend ) {
$callback = \&se_strand_extend;
print " Recording single-end stranded, extended alignment span\n";
}
elsif ( not $paired and $do_strand and $shift and $use_extend ) {
$callback = \&se_shift_strand_extend;
print " Recording single-end shifted, stranded, extended alignment span\n";
}
# paired-end start
elsif ( $paired and not $do_strand and $use_start ) {
$callback = \&pe_start;
print " Recording paired-end fragment start\n";
}
elsif ( $paired and $do_strand and $use_start ) {
$callback = \&pe_strand_start;
print " Recording paired-end stranded fragment start\n";
}
# paired-end midpoint
elsif ( $paired and not $do_strand and $use_mid ) {
$callback = \&pe_mid;
print " Recording paired-end fragment midpoint\n";
}
elsif ( $paired and $do_strand and $use_mid ) {
$callback = \&pe_strand_mid;
print " Recording paired-end stranded fragment midpoint\n";
}
# paired-end span
elsif ( $paired and not $do_strand and $use_span ) {
$callback = \&pe_span;
print " Recording paired-end fragment span\n";
}
elsif ( $paired and $do_strand and $use_span ) {
$callback = \&pe_strand_span;
print " Recording paired-end stranded, fragment span\n";
}
# paired-end center span
elsif ( $paired and not $do_strand and $use_cspan ) {
$callback = \&pe_center_span;
print " Recording paired-end fragment center-span\n";
}
elsif ( $paired and $do_strand and $use_cspan ) {
$callback = \&pe_strand_center_span;
print " Recording paired-end stranded, fragment center-span\n";
}
# paired-end smart coverage
elsif ( $paired and not $do_strand and $use_smartpe ) {
$callback = \&smart_pe;
print " Recording smart paired-end coverage\n";
}
elsif ( $paired and $do_strand and $use_smartpe ) {
$callback = \&smart_stranded_pe;
print " Recording stranded, smart paired-end coverage\n";
}
elsif ( $paired and not $do_strand and $use_ends ) {
$callback = \&pe_ends;
print " Recording paired-end fragment endpoints\n";
}
elsif ( $paired and $do_strand and $use_ends ) {
$callback = \&pe_strand_ends;
print " Recording stranded, paired-end fragment endpoints\n";
}
else {
die "programmer error!\n" unless $shift; # special exception
}
# summary of wig file being written
if ($do_bedgraph) {
print " Writing bedGraph format in $bin_size bp increments\n";
}
elsif ($do_varstep) {
printf " Writing variableStep format in $bin_size bp bins\n";
}
elsif ($do_fixstep) {
printf " Writing fixedStep format in $bin_size bp bins\n";
}
}
sub process_exclusion_list {
if ( $exclude_file and -e $exclude_file ) {
my $i = 0;
eval { require Set::IntervalTree; $i = 1; };
unless ($i) {
print " WARNING! Please install Set::IntervalTree to use exclusion lists\n";
undef $exclude_file;
return;
}
my %list_hash = map { $_ => [] } @seq_list;
my $Data = Bio::ToolBox->load_file($exclude_file)
or die "unable to read exclusion list file '$exclude_file'\n";
unless ( $Data->feature_type eq 'coordinate' ) {
printf
" WARNING! Exclusion list file '%s' does not have coordinates! Ignoring.\n",
$exclude_file;
return \%list_hash;
}
$Data->iterate(
sub {
my $row = shift;
push @{ $list_hash{ $row->seq_id } }, [ $row->start - 1, $row->end ]
if exists $list_hash{ $row->seq_id };
}
);
printf " Loaded %s exclusion list regions\n",
format_with_commas( $Data->number_rows );
return \%list_hash;
}
return;
}
### Determine the shift value
sub determine_shift_value {
# identify top regions to score
# we will walk through the largest chromosome(s) looking for the top
# 500 bp regions containing the highest unstranded coverage to use
print " sampling the top coverage regions "
. "on the largest $chr_number chromosomes...\n";
# first sort the chromosomes by size
# this is assuming all the chromosomes have different sizes ;-)
scripts/bam2wig.pl view on Meta::CPAN
my %files;
my @filelist = glob("$outbase.*.temp.bin");
my $expected = scalar(@sams) * scalar(@seq_list) * ( $do_strand ? 2 : 1 );
if ( scalar(@filelist) != $expected ) {
die sprintf(
" unable to find all the temporary binary children files! Only found %d, expected %d\n",
scalar(@filelist), $expected );
}
foreach my $file (@filelist) {
# each file name is basename.samid.seqid.count.strand.bin.gz
if ( $file =~ /$outbase \.(\d+) \.(.+) \.0 \.f \.temp \.bin \Z/x ) {
my $samid = $1;
my $seqid = $2;
$files{$seqid}{$samid} = $file;
}
}
# check to see how we combine the multiple source sam file coverages
my @norms;
if ($do_mean) {
# just need to take an average
foreach (@sams) { push @norms, ( 1 / scalar(@sams) ) }
}
# otherwise we just add the samples
# merge the samples
foreach my $seq_id (@seq_list) {
$pm->start and next;
merge_bin_files( $seq_id, 'f', 0, $files{$seq_id}, \@norms );
$pm->finish;
}
$pm->wait_all_children;
}
# merge and write final wig file
return write_final_wig_file();
}
sub process_bam_coverage_on_chromosome {
my ( $samid, $sam, $seq_id, $do_temp_bin ) = @_;
my $chr_start_time = time;
# identify chromosome target id
my ( $tid, undef, undef ) = $sam->header->parse_region($seq_id);
unless ( defined $tid ) {
printf " WARNING: unable to parse target id for '%s' from bam file %s!\n",
$seq_id, $bamfiles[$samid];
# we continue but this will silently fail the low level fetch below
# but at least an empty temp file will be written allowing program to finish
}
my $seq_length = $seq_name2length{$seq_id};
# walk through the chromosome in 1 kb increments
my $chrom_data;
for ( my $start = 0; $start < $seq_length; $start += $coverage_dump ) {
# set endpoint
my $end = $start + $coverage_dump;
$end = $seq_length if $end > $seq_length;
# using the low level interface for a little more performance
my $coverage = low_level_bam_coverage( $sam, $tid, $start, $end );
# record the coverage
&{$coverage_sub}( $coverage, \$chrom_data );
}
# write out chromosome binary file, set count to arbitrary 0
if ($do_temp_bin) {
write_bin_file( \$chrom_data,
join( '.', $outbase, $samid, $seq_id, 0, 'f', 'temp.bin' ) );
}
else {
&{$wig_writer}(
\$chrom_data, $binpack, $seq_id, $seq_length,
join( '.', $outbase, $samid, $seq_id, 0, 'f', 'temp.wig' )
);
}
# verbose status line
if ($verbose) {
printf " Generated read coverage on $seq_id in %d seconds\n",
time - $chr_start_time;
}
}
sub process_alignments {
# flag to write temporary binary files or go straight to wig files
my $do_temp_bin = scalar(@sams) > 1 ? 1 : $rpm ? 1 : @scale_values ? 1 : 0;
# walk through each bam file and chromosome one a time
for my $samid ( 0 .. $#sams ) {
foreach my $seq_id (@seq_list) {
process_alignments_on_chromosome( $samid, $sams[$samid], $seq_id,
$do_temp_bin );
}
}
if ($verbose) {
printf " Finished converting $items in %.3f minutes\n",
( time - $start_time ) / 60;
}
# find and merge binary files
# we can do this in parallel too!
if ($do_temp_bin) {
# find the children
my @totals;
my %seq_totals;
my %files;
my @filelist = glob("$outbase.*.temp.bin");
my $expected = scalar(@sams) * scalar(@seq_list) * ( $do_strand ? 2 : 1 );
if ( scalar(@filelist) != $expected ) {
die sprintf(
" unable to find all the temporary binary children files! Only found %d, expected %d\n",
scripts/bam2wig.pl view on Meta::CPAN
sub pe_strand_ends {
my ( $a, $data, $score ) = @_;
# we always receive the forward read, never reverse, from pe_callback
# therefore the first and second read flag indicates orientation
# calculate both positions
my $flag = $a->flag;
if ( $flag & 0x0040 ) {
# first read implies forward orientation
# first position is forward
$data->{f}->[ int( $a->pos / $bin_size ) - $data->{f_offset} ] += $score;
# second position is reverse
my $pos = int( ( $a->pos + $a->isize ) / $bin_size );
$data->{r}->[ $pos - $data->{r_offset} ] += $score;
}
elsif ( $flag & 0x0080 ) {
# second read implies reverse orientation
# first position is reverse
$data->{r}->[ int( $a->pos / $bin_size ) - $data->{r_offset} ] += $score;
# second position is forward
my $pos = int( ( $a->pos + $a->isize ) / $bin_size );
$data->{f}->[ $pos - $data->{f_offset} ] += $score;
}
else {
die sprintf(
" Paired-end flags are set incorrectly; neither 0x040 or 0x080 are set for paired-end read %s.",
$a->query->name );
}
}
__END__
=head1 NAME
bam2wig.pl
A program to convert Bam alignments into a wig representation file.
=head1 SYNOPSIS
bam2wig.pl [--options...] E<lt>file.bamE<gt>
bam2wig.pl --extend --rpm --mean --out file --bw file1.bam file2.bam
Required options:
-i --in <filename.bam> repeat if multiple bams, or comma-delimited list
Reporting options (pick one):
-s --start record at 5' position
-d --mid record at midpoint of alignment or pair
-a --span record across entire alignment or pair
-e --extend extend alignment (record predicted fragment)
--cspan record a span centered on midpoint
--smartcov record paired coverage without overlaps, splices
--ends record paired endpoints
--coverage raw alignment coverage
Alignment reporting options:
-l --splice split alignment at N splices
-t --strand record separate strands as two wig files
--flip flip the strands for convenience
Paired-end alignments:
-p --pe process paired-end alignments, both are checked
-P --fastpe process paired-end alignments, only F are checked
--minsize <integer> minimum allowed insertion size (30)
--maxsize <integer> maximum allowed insertion size (600)
--first only process paired first read (0x40) as single-end
--second only process paired second read (0x80) as single-end
Alignment filtering options:
-K --chrskip <regex> regular expression to skip chromosomes
-B --exclude <file> interval file of regions to skip (bed, gff, txt)
-q --qual <integer> minimum mapping quality (0)
-S --nosecondary ignore secondary (0x100) alignments (false)
-D --noduplicate ignore duplicate (0x400) alignments (false)
-U --nosupplementary ignore supplementary (0x800) alignments (false)
--intron <integer> maximum allowed gap (intron) size in bp (none)
Shift options:
-I --shift shift reads in the 3' direction
-x --extval <integer> explicit extension size in bp (default is to calculate)
-H --shiftval <integer> explicit shift value in bp (default is to calculate)
--chrom <integer> number of chromosomes to sample (4)
--minr <float> minimum pearson correlation to calculate shift (0.5)
--zmin <float> minimum z-score from average to test peak for shift (3)
--zmax <float> maximum z-score from average to test peak for shift (10)
-M --model write peak shift model file for graphing
Score options:
-r --rpm scale depth to Reads Per Million mapped
-m --mean average multiple bams (default is addition)
--scale <float> explicit scaling factor, repeat for each bam file
--fraction assign fractional counts to multi-mapped alignments
--splfrac assign fractional count to each spliced segment
--format <integer> number of decimal positions (4)
--chrnorm <float> use chromosome-specific normalization factor
--chrapply <regex> regular expression to apply chromosome-specific factor
Wig format:
--bin <integer> bin size for span or extend mode (10)
--bdg bedGraph, default for span and extend at bin 1
--fix fixedStep, default for bin > 1
--var varStep, default for start, mid
--nozero do not write zero score intervals in bedGraph
Output options:
-o --out <filename> output file name, default is bam file basename
-b --bw convert to bigWig format (supports bdg, fix, var)
--bwapp /path/to/wigToBigWig path to external converter (default searches \$PATH)
-z --gz gzip compress text output
General options:
-c --cpu <integer> number of parallel processes (4)
--temp <directory> directory to write temporary files (output path)
-V --verbose report additional information
-v --version print version information
-h --help show full documentation
=head1 OPTIONS
The command line flags and descriptions:
=head2 Input
=over 4
=item --in E<lt>filenameE<gt>
Specify the input Bam alignment file. More than one file may be
specified, either with repeated options, a comma-delimited list,
or simply appended to the command. Bam files will be automatically
indexed if necessary.
=back
=head2 Reporting Options
=over 4
=item --start
Specify that the 5' position should be recorded in the wig file.
=item --mid
Specify that the midpoint of the alignment (single-end) or fragment
(paired-end) will be recorded in the wig file.
=item --span
Specify that the entire span of the alignment (single-end) or
fragment (paired-end) will be recorded in the wig file.
=item --extend
Specify that the alignment should be extended in the 3' direction
and that the entire length of the extension be recorded in the wig
file. The extension may be defined by the user or empirically
determined.
=item --cspan
Specify that a defined span centered at the alignment (single-end)
or fragment (paired-end) midpoint will be recorded in the wig file.
The span is defined by the extension value.
=item --smartcov
Smart alignment coverage of paired-end alignments without
double-counting overlaps or recording gaps (intron splices).
=item --ends
Record both endpoints of paired-end fragments, i.e. the outermost
or 5' ends of properly paired fragments. This may be useful with
ATAC-Seq, Cut&Run-Seq, or other cleavage experiments where you want
to record the locations of cutting yet retain the ability to filter
paired-end fragment sizes.
=item --coverage
Specify that the raw alignment coverage be calculated and reported
in the wig file. This utilizes a special low-level operation and
precludes any alignment filtering or post-normalization methods.
Counting overlapping bases in paired-end alignments are dependent on
the bam adapter (older versions would double-count).
=item --position [start|mid|span|extend|cspan|coverage]
Legacy option for supporting previous versions of bam2wig.
=back
=head2 Alignment reporting options
=over 4
=item --splice
Indicate that the bam file contains alignments with splices, such as
from RNASeq experiments. Alignments will be split on cigar N operations
and each sub fragment will be recorded. This only works with single-end
alignments, and is disabled for paired-end reads (just treat as single-end).
Only start and span recording options are supported.
=item --strand
Indicate that separate wig files should be written for each strand.
The output file basename is appended with either '_f' or '_r' for
both files. Strand for paired-end alignments are determined by the
strand of the first read.
=item --flip
Flip the strand of the output files when generating stranded wig files.
Do this when RNA-Seq alignments map to the opposite strand of the
coding sequence, depending on the library preparation method.
=back
=head2 Paired-end alignments
=over 4
=item --pe
The Bam file consists of paired-end alignments, and only properly
mapped pairs of alignments will be counted. Properly mapped pairs
include FR reads on the same chromosome, and not FF, RR, RF, or
pairs aligning to separate chromosomes. Both alignments are required
to be present before the pair is counted. The default is to treat
all alignments as single-end.
=item --fastpe
( run in 1.008 second using v1.01-cache-2.11-cpan-39bf76dae61 )