Bio-Pipeline-Comparison

 view release on metacpan or  search on metacpan

lib/FaSlice.pm  view on Meta::CPAN

#



package FaSlice;

use strict;
use warnings;
use Carp;


sub new
{
    my ($class,@args) = @_;
    my $self = @args ? {@args} : {};
    bless $self, ref($class) || $class;
    if ( !$$self{file} ) { $self->throw("Missing the parameter file\n"); }
    $$self{chr}  = undef;
    $$self{from} = undef;
    $$self{to}   = undef;
    if ( !$$self{size} ) { $$self{size}=1_000_000; }
    $$self{ncache_missed} = 0;
    $$self{nqueries} = 0;
    if ( !exists($$self{oob}) ) { $$self{oob}=''; }
    if ( $$self{oob} ne '' && $$self{oob} ne 'throw' && $$self{oob} ne 'N' ) { $self->throw("The value of oob not recognised: [$$self{oob}]"); }
    $self->chromosome_naming($$self{file});
    return $self;
}

sub throw
{
    my ($self,@msg) = @_;
    confess(@msg);
}

sub cmd
{
    my ($self,$cmd) = @_;
    my @out = `$cmd`;
    if ( $? )
    {
        my @msg = ();
        push @msg, qq[The command "$cmd" returned non-zero status $?];
        if ( $! )
        {
            push @msg, ": $!\n";
        }
        else
        {
            push @msg, ".\n";
        }
        if ( scalar @out )
        {
            push @msg, @out;
        }
        $self->throw(@msg);
    }
    return (@out);
}

# Read the first file of the fasta file and make a guess: Are all chromosomes
#   names as 'chr1','chr2',etc or just '1','2',...?
# Future TODO: more robust chromosome name mapping?
sub chromosome_naming
{
    my ($self,$fa_file) = @_;
    open(my $fh,'<',"$fa_file.fai") or $self->throw("$fa_file.fai: $!");
    my $line=<$fh>;
    if ( !($line=~/^(chr)?\S+\t/) ) { chomp($line); $self->throw("FIXME: the sequence names not in '>(chr)?\\S+' format [$line] ... $fa_file.fai\n"); }
    close($fh); 
    $$self{chr_naming} = defined $1 ? $1 : '';
}


sub read_chunk
{
    my ($self,$chr,$pos) = @_;
    $$self{chr}  = $chr;
    $chr =~ s/^chr//;
    $chr = $$self{chr_naming}.$chr;
    my $to = $pos + $$self{size};
    my $cmd = "samtools faidx $$self{file} $chr:$pos-$to";
    my @out = $self->cmd($cmd) or $self->throw("$cmd: $!");
    my $line = shift(@out);
    if ( !($line=~/^>$chr:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); }
    $$self{from} = $1;
    my $chunk = '';
    while ($line=shift(@out))
    {
        chomp($line);
        $chunk .= $line;
    }
    $$self{to} = $$self{from} + length($chunk) - 1;
    $$self{chunk} = $chunk;
    $$self{ncache_missed}++;
    return;
}


sub get_base
{
    my ($self,$chr,$pos) = @_;
    if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} )
    {
        $self->read_chunk($chr,$pos);
    }
    $$self{nqueries}++;
    my $idx = $pos - $$self{from};
    if ( $$self{from}>$$self{to} ) 
    { 
        if ( $$self{oob} eq '' ) { return ''; }
        elsif ( $$self{oob} eq 'N' ) { return 'N'; }
        $self->throw("No such site $chr:$pos in $$self{file}\n"); 
    }
    return substr($$self{chunk},$idx,1);
}



sub get_slice
{



( run in 1.390 second using v1.01-cache-2.11-cpan-39bf76dae61 )