Bio-Pipeline-Comparison
view release on metacpan or search on metacpan
lib/FaSlice.pm view on Meta::CPAN
#
package FaSlice;
use strict;
use warnings;
use Carp;
sub new
{
my ($class,@args) = @_;
my $self = @args ? {@args} : {};
bless $self, ref($class) || $class;
if ( !$$self{file} ) { $self->throw("Missing the parameter file\n"); }
$$self{chr} = undef;
$$self{from} = undef;
$$self{to} = undef;
if ( !$$self{size} ) { $$self{size}=1_000_000; }
$$self{ncache_missed} = 0;
$$self{nqueries} = 0;
if ( !exists($$self{oob}) ) { $$self{oob}=''; }
if ( $$self{oob} ne '' && $$self{oob} ne 'throw' && $$self{oob} ne 'N' ) { $self->throw("The value of oob not recognised: [$$self{oob}]"); }
$self->chromosome_naming($$self{file});
return $self;
}
sub throw
{
my ($self,@msg) = @_;
confess(@msg);
}
sub cmd
{
my ($self,$cmd) = @_;
my @out = `$cmd`;
if ( $? )
{
my @msg = ();
push @msg, qq[The command "$cmd" returned non-zero status $?];
if ( $! )
{
push @msg, ": $!\n";
}
else
{
push @msg, ".\n";
}
if ( scalar @out )
{
push @msg, @out;
}
$self->throw(@msg);
}
return (@out);
}
# Read the first file of the fasta file and make a guess: Are all chromosomes
# names as 'chr1','chr2',etc or just '1','2',...?
# Future TODO: more robust chromosome name mapping?
sub chromosome_naming
{
my ($self,$fa_file) = @_;
open(my $fh,'<',"$fa_file.fai") or $self->throw("$fa_file.fai: $!");
my $line=<$fh>;
if ( !($line=~/^(chr)?\S+\t/) ) { chomp($line); $self->throw("FIXME: the sequence names not in '>(chr)?\\S+' format [$line] ... $fa_file.fai\n"); }
close($fh);
$$self{chr_naming} = defined $1 ? $1 : '';
}
sub read_chunk
{
my ($self,$chr,$pos) = @_;
$$self{chr} = $chr;
$chr =~ s/^chr//;
$chr = $$self{chr_naming}.$chr;
my $to = $pos + $$self{size};
my $cmd = "samtools faidx $$self{file} $chr:$pos-$to";
my @out = $self->cmd($cmd) or $self->throw("$cmd: $!");
my $line = shift(@out);
if ( !($line=~/^>$chr:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); }
$$self{from} = $1;
my $chunk = '';
while ($line=shift(@out))
{
chomp($line);
$chunk .= $line;
}
$$self{to} = $$self{from} + length($chunk) - 1;
$$self{chunk} = $chunk;
$$self{ncache_missed}++;
return;
}
sub get_base
{
my ($self,$chr,$pos) = @_;
if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} )
{
$self->read_chunk($chr,$pos);
}
$$self{nqueries}++;
my $idx = $pos - $$self{from};
if ( $$self{from}>$$self{to} )
{
if ( $$self{oob} eq '' ) { return ''; }
elsif ( $$self{oob} eq 'N' ) { return 'N'; }
$self->throw("No such site $chr:$pos in $$self{file}\n");
}
return substr($$self{chunk},$idx,1);
}
sub get_slice
{
( run in 1.390 second using v1.01-cache-2.11-cpan-39bf76dae61 )