BioPerl

 view release on metacpan or  search on metacpan

Bio/Assembly/IO/bowtie.pm  view on Meta::CPAN


package Bio::Assembly::IO::bowtie;
use strict;
use warnings;

# Object preamble - inherits from Bio::Root::Root

use Bio::SeqIO;
use Bio::Tools::Run::Samtools;
use Bio::Assembly::IO;

use base qw( Bio::Assembly::IO );

our $HD = "\@HD\tVN:1.0\tSO:unsorted\n";
our $PG = "\@PG\tID=Bowtie\n";

our $HAVE_IO_UNCOMPRESS;
our $HAVE_BOWTIE;

BEGIN {
# check requirements
    eval "require Bio::Tools::Run::Bowtie; \$HAVE_BOWTIE = 1";
    unless ( eval "require IO::Uncompress::Gunzip; \$HAVE_IO_UNCOMPRESS = 1") {
        Bio::Root::Root->warn("IO::Uncompress::Gunzip is not available; you'll have to do your decompression by hand.");
    }
}

=head2 new()

 Title   : new
 Usage   : my $obj = new Bio::Assembly::IO::bowtie();
 Function: Builds a new Bio::Assembly::IO object
 Returns : an instance of Bio::Assembly::IO
 Args    : hash of options:

            -file    => bowtie_output_file
            -index   => bowtie_index or fasta_file used to create index
            -no_head => boolean skip SAM header
            -no_sq   => boolean skip SQ lines of SAM header

 Note    : bowtie_output and fasta files may be gzipped

=cut

sub new {
    my $class = shift;
    my @args = @_;
    my $self = $class->SUPER::new(@args);
    $self->_initialize(@args);
    $self->{'_tempdir'} = $self->tempdir(CLEANUP=>1);
    my ($file, $index, $no_head, $no_sq) = $self->_rearrange([qw(FILE INDEX NO_HEAD NO_SQ)], @args);
    $file =~ s/^<//;
    $self->{'_no_head'} = $no_head;
    $self->{'_no_sq'} = $no_sq;

    # get the sequence so Bio::DB::Sam can work with it
    my $refdb;
    my $inspector;
    if (-e $index && -r _ ) {
        $refdb = ($index =~ m/\.gz[^.]*$/) ? $self->_uncompress($index) : $index;
        my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$refdb);
        $self->throw("'$index' is not a fasta file.")
            unless $guesser->guess =~ m/^fasta$/;
    } elsif ($HAVE_BOWTIE) {
        $inspector = Bio::Tools::Run::Bowtie->new( -command => 'inspect' );
        $refdb = $inspector->run($index);
    } else {
        $self->throw("Bio::Tools::Run::Bowtie is not available - cannot extract refdb from index.");
    }

    my $bam_file = $self->_make_bam($self->_bowtie_to_sam($file, $refdb));
    my $sam = Bio::Assembly::IO->new( -file => "<$bam_file", -refdb => $refdb , -format => 'sam' );

    return $sam;
}

sub _bowtie_to_sam {
    my ($self, $file, $refdb) = @_;

    $self->throw("'$file' does not exist or is not readable.")
        unless ( -e $file && -r _ );

    if ($file =~ m/\.gz[^.]*$/) {
        $file = $self->_uncompress($file);
        $self->close;
        open my $fh, '<', $file or $self->throw("Could not read file '$file': $!");
        $self->file($file);
        $self->_fh($fh);
    }

    my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$file);
    $self->throw("'$file' is not a bowtie formatted file.") unless $guesser->guess =~ m/^bowtie$/;

    my %SQ;
    my $mapq = 255;
    my $in_pair;
    my @mate_line;
    my $mlen;

    # create temp file for working
    my ($sam_tmp_h, $sam_tmp_f) = $self->tempfile( -dir => $self->{'_tempdir'}, -suffix => '.sam' );
    
    while (my $line = $self->_readline) {
        chomp($line);
        my ($qname,$strand,$rname,$pos,$seq,$qual,$m,$details)=split("\cI",$line);
        $SQ{$rname} = 1;
        
        my $paired_f =  ($qname =~ m#/[12]#) ? 0x03 : 0;
        my $strand_f = ($strand eq '-') ? 0x10 : 0;
        my $op_strand_f = ($strand eq '+' && $paired_f) ? 0x20 : 0;
        my $first_f =  ($qname =~ m#/1#) ? 0x40 : 0;
        my $second_f = ($qname =~ m#/2#) ? 0x80 : 0;
        my $flag = $paired_f | $strand_f | $op_strand_f | $first_f | $second_f;

        $pos++;
        my $len = length $seq;
        die unless $len == length $qual;
        my $cigar = $len.'M';
        my @detail = split(',',$details);
        my $dist = 'NM:i:'.scalar @detail;

        my @mismatch;
        my $last_pos = 0;
        for (@detail) {
            m/(\d+):(\w)>\w/;
            my $err = ($1-$last_pos);
            $last_pos = $1+1;
            push @mismatch,($err,$2);
        }
        push @mismatch, $len-$last_pos;
        @mismatch = reverse @mismatch if $strand eq '-';
        my $mismatch = join('',('MD:Z:',@mismatch));

        if ($paired_f) {
            my $mrnm = '=';
            if ($in_pair) {
                my $mpos = $mate_line[3];
                $mate_line[7] = $pos;
                my $isize = $mpos-$pos-$len;
                $mate_line[8] = -$isize;
                print $sam_tmp_h join("\t",@mate_line),"\n";
                print $sam_tmp_h join("\t",$qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, $mpos, $isize, $seq, $qual, $mismatch, $dist),"\n";
                $in_pair = 0;
            } else {
                $mlen = $len;
                @mate_line = ($qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, undef, undef, $seq, $qual, $mismatch, $dist);
                $in_pair = 1;
            }
        } else {
            my $mrnm = '*';
            my $mpos = 0;
            my $isize = 0;



( run in 1.214 second using v1.01-cache-2.11-cpan-39bf76dae61 )