Bio-BLAST

 view release on metacpan or  search on metacpan

lib/Bio/BLAST/Database.pm  view on Meta::CPAN

package Bio::BLAST::Database;
BEGIN {
  $Bio::BLAST::Database::AUTHORITY = 'cpan:RBUELS';
}
BEGIN {
  $Bio::BLAST::Database::VERSION = '0.4';
}
# ABSTRACT: work with formatted BLAST databases

use strict;
use warnings;

use POSIX;

use IO::Pipe;
use IPC::Cmd qw/ can_run /;

use Carp;
use Memoize;

use File::Basename;
use File::Copy;
use File::Path;
use File::Slurp qw/slurp/;
use File::Spec::Functions qw/ splitdir catdir devnull /;

use IPC::System::Simple 'systemx';

use List::Util qw/ min max /;
use List::MoreUtils qw/ all any /;

use Bio::BLAST::Database::Seq;

use namespace::clean;


use base qw/ Class::Accessor::Fast /;



sub new { croak "use open(), not new()" }

sub open {
    my $class = shift;
    #validate the args
    @_ % 2 and croak 'invalid args to open()';
    my %args = @_;
    my %valid_keys = map {$_ => 1} qw( full_file_basename type write create_dirs );
    $valid_keys{$_} or croak "invalid param '$_' passed to open()" for keys %args;

    my $self = $class->SUPER::new(\%args);

    $self->full_file_basename or croak 'must provide a full_file_basename';

    unless( $self->type ) {
        $self->type( $self->_guess_type )
            or croak 'type not provided, and could not guess it';
    }

    if( $self->write ) {
        $self->create_dirs || -d dirname( $self->full_file_basename )
            or croak 'either directory must exist, or create_dirs must be set true';

        my $perm_error = $self->check_format_permissions;
        croak $perm_error if $perm_error;
    }

    # set some of our attrs from the existing files
    $self->_read_fastacmd_info;

    if( $self->write ) {
        return $self;
    } else {
        # open succeeds if all the files are there
        return $self if $self->files_are_complete;

        #carp "cannot open for reading, not a complete set of files:\n",
        #    map "  - $_\n", $self->list_files;
        return;
    }
}


__PACKAGE__->mk_accessors('full_file_basename');


__PACKAGE__->mk_accessors('create_dirs');


__PACKAGE__->mk_accessors('write');


__PACKAGE__->mk_accessors('title');


sub indexed_seqs { #< indexed_seqs is read-only externally
  my ($self,@args) = @_;
  croak 'indexed_seqs() is read-only' if @args;
  shift->_indexed_seqs;
}
__PACKAGE__->mk_accessors('_indexed_seqs');


sub type {
    my $self = shift;

    if( @_ ) {
        my $type = shift;
        !defined $type || $type eq 'nucleotide' || $type eq 'protein'
            or croak "invalid type '$type'";
        $self->{type} = $type;
    }

    return $self->{type};
}


lib/Bio/BLAST/Database.pm  view on Meta::CPAN


  return @myfiles;
}


__PACKAGE__->mk_accessors('sequences_count');



sub get_sequence {
    my ($self, $seqname) = @_;

    croak "cannot call get_sequence on an incomplete database!"
        unless $self->files_are_complete;

    croak "cannot call get_sequence on a database that has not been indexed for retrieval!"
        unless $self->indexed_seqs;

    return Bio::BLAST::Database::Seq->new(
        -bdb => $self,
        -id  => $seqname,
        );
}

# internal function to set the title, sequence count, type,
# format_time, and indexed_seqs from the set of files on disk and from
# the output of fastacmd
sub _read_fastacmd_info {
    my ($self) = @_;

    my @files = $self->list_files
        or return;

    $self->_check_external_tools;

    my $ffbn = $self->full_file_basename;
    my $cmd = "fastacmd -d $ffbn -I";
    my $fastacmd = `$cmd 2>&1`;
    #warn "$fastacmd";

    my ($title) = $fastacmd =~ /Database:\s*([\s\S]+)sequences/
      or die "could not parse output of fastacmd (0):\n$fastacmd";
    $title =~ s/\s*[\d,]+\s*$//;

    my ($seq_cnt) = $fastacmd =~ /([\d,]+)\s*sequences/
      or die "could not parse output of fastacmd (1):\n$fastacmd";
    $seq_cnt =~ s/,//g;

    my ($datestr) =
        $fastacmd =~ m(
                     Date: \s* ( \w [\S\ ]+ \w )
                       \s+
                     Version:
                      )x
                          or die "could not parse output of fastacmd (2):\n$fastacmd";


    my $indexed = (any {/sd$/} @files) && (any {/si$/} @files);

    ### set our data
    $self->type( $self->_guess_type )
        or confess 'could not determine db type';

    ### type: $self->type

    $self->format_time( _parse_datestr($datestr) ); #< will die on failure
    $title =~ s/\s+$//;
    $self->title( $title );
    $self->_indexed_seqs( $indexed );
    $self->sequences_count( $seq_cnt );
}
sub _guess_type {
    my ($self) = @_;
    my $saved_type = $self->type;

    foreach my $guess (qw( protein nucleotide )) {
        $self->type( $guess );
        if( $self->files_are_complete ) {
            $self->type( $saved_type );
            return $guess;
        }
    }

    $self->type( $saved_type );
    return;
}
sub _parse_datestr {
    my ($datestr) = @_;
    my @split = split /\W+/,$datestr;
    my ($mon,$d,$y,$h,$min,$ampm) = @split
        or die "could not parse data string '$datestr'";

    #  warn "got $mon,$d,$y,$h,$min,$ampm\n";
    my %months = do{ my $n = 0; map { $_ => $n++ } qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/};
    exists $months{$mon} or return;
    $mon = $months{$mon};
    $h = 0 if $h == 12;
    $h += 12 if lc($ampm) eq 'pm';
    #warn "mktime $min,$h,$d,$mon,".($y-1900)."\n";
    my $time = mktime(0,$min,$h,$d,$mon,$y-1900,0,0,-1);
    #  warn "$datestr => ".ctime($time)."\n";
    return $time;
}

###
1;#do not remove
###

__END__
=pod

=encoding utf-8

=head1 NAME

Bio::BLAST::Database - work with formatted BLAST databases

=head1 SYNOPSIS

  use Bio::BLAST::Database;

  # open an existing bdb for reading
  my $fs = Bio::BLAST::Database->open(
               full_file_basename => '/path/to/my_bdb',
             );
  # will read from /path/to/my_bdb.nin, /path/to/my_bdb.nsq, etc

  my @filenames = $fs->list_files;

  #reopen it for writing
  $fs = Bio::BLAST::Database->open(
            full_file_basename => '/path/to/my_bdb',
            write => 1,
          );

  # replace it with a different set of sequences
  $fs->format_from_file('myseqs.seq');

  # can also get some metadata about it
  print "db's title is ".$fs->title;



( run in 2.718 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )