Bio-BLAST
view release on metacpan or search on metacpan
lib/Bio/BLAST/Database.pm view on Meta::CPAN
package Bio::BLAST::Database;
BEGIN {
$Bio::BLAST::Database::AUTHORITY = 'cpan:RBUELS';
}
BEGIN {
$Bio::BLAST::Database::VERSION = '0.4';
}
# ABSTRACT: work with formatted BLAST databases
use strict;
use warnings;
use POSIX;
use IO::Pipe;
use IPC::Cmd qw/ can_run /;
use Carp;
use Memoize;
use File::Basename;
use File::Copy;
use File::Path;
use File::Slurp qw/slurp/;
use File::Spec::Functions qw/ splitdir catdir devnull /;
use IPC::System::Simple 'systemx';
use List::Util qw/ min max /;
use List::MoreUtils qw/ all any /;
use Bio::BLAST::Database::Seq;
use namespace::clean;
use base qw/ Class::Accessor::Fast /;
sub new { croak "use open(), not new()" }
sub open {
my $class = shift;
#validate the args
@_ % 2 and croak 'invalid args to open()';
my %args = @_;
my %valid_keys = map {$_ => 1} qw( full_file_basename type write create_dirs );
$valid_keys{$_} or croak "invalid param '$_' passed to open()" for keys %args;
my $self = $class->SUPER::new(\%args);
$self->full_file_basename or croak 'must provide a full_file_basename';
unless( $self->type ) {
$self->type( $self->_guess_type )
or croak 'type not provided, and could not guess it';
}
if( $self->write ) {
$self->create_dirs || -d dirname( $self->full_file_basename )
or croak 'either directory must exist, or create_dirs must be set true';
my $perm_error = $self->check_format_permissions;
croak $perm_error if $perm_error;
}
# set some of our attrs from the existing files
$self->_read_fastacmd_info;
if( $self->write ) {
return $self;
} else {
# open succeeds if all the files are there
return $self if $self->files_are_complete;
#carp "cannot open for reading, not a complete set of files:\n",
# map " - $_\n", $self->list_files;
return;
}
}
__PACKAGE__->mk_accessors('full_file_basename');
__PACKAGE__->mk_accessors('create_dirs');
__PACKAGE__->mk_accessors('write');
__PACKAGE__->mk_accessors('title');
sub indexed_seqs { #< indexed_seqs is read-only externally
my ($self,@args) = @_;
croak 'indexed_seqs() is read-only' if @args;
shift->_indexed_seqs;
}
__PACKAGE__->mk_accessors('_indexed_seqs');
sub type {
my $self = shift;
if( @_ ) {
my $type = shift;
!defined $type || $type eq 'nucleotide' || $type eq 'protein'
or croak "invalid type '$type'";
$self->{type} = $type;
}
return $self->{type};
}
lib/Bio/BLAST/Database.pm view on Meta::CPAN
return @myfiles;
}
__PACKAGE__->mk_accessors('sequences_count');
sub get_sequence {
my ($self, $seqname) = @_;
croak "cannot call get_sequence on an incomplete database!"
unless $self->files_are_complete;
croak "cannot call get_sequence on a database that has not been indexed for retrieval!"
unless $self->indexed_seqs;
return Bio::BLAST::Database::Seq->new(
-bdb => $self,
-id => $seqname,
);
}
# internal function to set the title, sequence count, type,
# format_time, and indexed_seqs from the set of files on disk and from
# the output of fastacmd
sub _read_fastacmd_info {
my ($self) = @_;
my @files = $self->list_files
or return;
$self->_check_external_tools;
my $ffbn = $self->full_file_basename;
my $cmd = "fastacmd -d $ffbn -I";
my $fastacmd = `$cmd 2>&1`;
#warn "$fastacmd";
my ($title) = $fastacmd =~ /Database:\s*([\s\S]+)sequences/
or die "could not parse output of fastacmd (0):\n$fastacmd";
$title =~ s/\s*[\d,]+\s*$//;
my ($seq_cnt) = $fastacmd =~ /([\d,]+)\s*sequences/
or die "could not parse output of fastacmd (1):\n$fastacmd";
$seq_cnt =~ s/,//g;
my ($datestr) =
$fastacmd =~ m(
Date: \s* ( \w [\S\ ]+ \w )
\s+
Version:
)x
or die "could not parse output of fastacmd (2):\n$fastacmd";
my $indexed = (any {/sd$/} @files) && (any {/si$/} @files);
### set our data
$self->type( $self->_guess_type )
or confess 'could not determine db type';
### type: $self->type
$self->format_time( _parse_datestr($datestr) ); #< will die on failure
$title =~ s/\s+$//;
$self->title( $title );
$self->_indexed_seqs( $indexed );
$self->sequences_count( $seq_cnt );
}
sub _guess_type {
my ($self) = @_;
my $saved_type = $self->type;
foreach my $guess (qw( protein nucleotide )) {
$self->type( $guess );
if( $self->files_are_complete ) {
$self->type( $saved_type );
return $guess;
}
}
$self->type( $saved_type );
return;
}
sub _parse_datestr {
my ($datestr) = @_;
my @split = split /\W+/,$datestr;
my ($mon,$d,$y,$h,$min,$ampm) = @split
or die "could not parse data string '$datestr'";
# warn "got $mon,$d,$y,$h,$min,$ampm\n";
my %months = do{ my $n = 0; map { $_ => $n++ } qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/};
exists $months{$mon} or return;
$mon = $months{$mon};
$h = 0 if $h == 12;
$h += 12 if lc($ampm) eq 'pm';
#warn "mktime $min,$h,$d,$mon,".($y-1900)."\n";
my $time = mktime(0,$min,$h,$d,$mon,$y-1900,0,0,-1);
# warn "$datestr => ".ctime($time)."\n";
return $time;
}
###
1;#do not remove
###
__END__
=pod
=encoding utf-8
=head1 NAME
Bio::BLAST::Database - work with formatted BLAST databases
=head1 SYNOPSIS
use Bio::BLAST::Database;
# open an existing bdb for reading
my $fs = Bio::BLAST::Database->open(
full_file_basename => '/path/to/my_bdb',
);
# will read from /path/to/my_bdb.nin, /path/to/my_bdb.nsq, etc
my @filenames = $fs->list_files;
#reopen it for writing
$fs = Bio::BLAST::Database->open(
full_file_basename => '/path/to/my_bdb',
write => 1,
);
# replace it with a different set of sequences
$fs->format_from_file('myseqs.seq');
# can also get some metadata about it
print "db's title is ".$fs->title;
( run in 2.718 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )