BioPerl
view release on metacpan or search on metacpan
Bio/DB/Taxonomy/sqlite.pm view on Meta::CPAN
=cut
sub index_directory {
my $self = shift;
return $self->{'index_directory'} = shift if @_;
return $self->{'index_directory'};
}
=head2 db_name
Title : db_name
Function : Get/set the name of the SQLite3 database where data is stored
Usage : $obj->db_name($newval)
Returns : value of db_name (a scalar)
Args : on set, new value (a scalar or undef, optional)
=cut
# TODO: this may need some disambiguation w/ index_directory above; for now we
# assume this doesn't have a full path name (though I see no reason why this
# shouldn't allow that)
sub db_name {
my $self = shift;
return $self->{'db_name'} = shift if @_;
return $self->{'db_name'};
}
=head2 cache_size
Title : cache_size
Function : Get/set the cachesize used for loading the SQLite3 database
Usage : $obj->cache_size($newval)
Returns : value of cache_size (a scalar)
Args : on set, new value (a scalar or undef, optional)
Note : we do no checking on whether this value is an integer (SQLite does this for use)
=cut
sub cache_size {
my $self = shift;
return $self->{'cache_size'} = shift if defined($_[0]);
return $self->{'cache_size'};
}
# internal method which does the indexing
sub _build_index {
my ( $self, $nodesfile, $namesfile, $force ) = @_;
# TODO: need to disambiguate using index_directory here since we only have
# one file. Mayeb ignore it in favor of having full path for db_name?
my ($dir, $db_name) = ($self->index_directory, $self->db_name);
if (! -e $db_name || $force) {
# TODO: we're ignoring index_directory for now, may add support for this
# down the way
my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
$self->debug("Running SQLite version:".$dbh->{sqlite_version}."\n");
#$dbh->do('PRAGMA synchronous = 0'); # Non transaction safe!!!
if ($self->cache_size) {
my $cs = $self->cache_size;
$self->debug("Setting cache size $cs\n");
$dbh->do("PRAGMA cache_size = $cs")
}
$self->debug("Loading taxon table data\n");
$self->_init_db($dbh);
open my $NODES, '<', $nodesfile
or $self->throw("Could not read node file '$nodesfile': $!");
# TODO: this has the really unnecessary 'OR IGNORE' option added,
# apparently b.c the test data expects to handle cases where the TaxID
# is repeated in this table (which should never happen in this table). I
# will likely change this to throw under those circumstances
my $sth = $dbh->prepare_cached(<<SQL);
INSERT OR IGNORE INTO taxon (taxon_id, parent_id, rank, code, division_id, gencode_id, mito_id) VALUES (?,?,?,?,?,?,?)
SQL
$dbh->do("BEGIN");
while (<$NODES>) {
next if /^\s*$/;
chomp;
my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
next if $taxid == 1;
if ($parent == 1) {
$parent = undef;
}
$sth->execute($taxid, $parent, $rank, $code, $divid, $gen_code, $mito) or die $sth->errstr.": TaxID $taxid";
}
$dbh->do("COMMIT") or $self->throw($dbh->errstr);
close $NODES;
$self->debug("Loading name table data\n");
open my $NAMES, '<', $namesfile
or $self->throw("Could not read names file '$namesfile': $!");
my $sth = $dbh->prepare_cached(<<SQL) or $self->throw($dbh->errstr);
INSERT INTO names (taxon_id, name, uniq_name, class) VALUES (?,?,?,?)
SQL
$dbh->do("BEGIN");
while (<$NAMES>) {
next if /^$/;
chomp;
my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
# don't include the fake root node 'root' or 'all' with id 1
next if $taxid == 1;
$class =~ s/\s+\|\s*$//;
#if ($name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
# $name =~ s/\s+\(class\)$//;
#}
$sth->execute($taxid, $name, $unique_name, $class) or $self->throw($sth->errstr);
}
close $NAMES;
$dbh->do("COMMIT");
$self->debug("Creating taxon index\n");
$dbh->do("CREATE INDEX parent_idx ON taxon (parent_id)") or $self->throw($dbh->errstr);
$self->debug("Creating name index\n");
$dbh->do("CREATE INDEX name_idx ON names (name)") or $self->throw($dbh->errstr);
$self->debug("Creating taxon name table index\n");
$dbh->do("CREATE INDEX taxon_name_idx ON names (taxon_id)") or $self->throw($dbh->errstr);
$dbh->do("PRAGMA foreign_keys = ON");
#$dbh->do('PRAGMA synchronous = 1');
$self->{dbh} = $dbh;
$self->{'_initialized'} = 1;
}
1;
}
# connect the internal db handle
sub _db_connect {
my $self = shift;
return if $self->{'_initialized'};
my ($dir, $db_name) = ($self->index_directory, $self->db_name);
# TODO: we're ignoring index_directory for now, may add support for this
# down the way
my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
$dbh->do("PRAGMA foreign_keys = ON");
if ($self->cache_size) {
my $cs = $self->cache_size;
$self->debug("Setting cache size $cs\n");
$dbh->do("PRAGMA cache_size = $cs")
}
$self->{dbh} = $dbh;
$self->{'_initialized'} = 1;
}
sub _init_db {
my ($self, $dbh) = @_;
my $schema = $self->taxon_schema();
# TODO: set up handler parameters here
for my $table (sort keys %$schema) {
$dbh->do("DROP TABLE IF EXISTS $table") or $self->throw($dbh->errstr);
$dbh->do("CREATE TABLE $table ".$schema->{$table}) or $self->throw($dbh->errstr);
}
1;
}
sub _dbh_fetch {
my ($self, $sql) = @_;
# TODO: more sanity checks
my $rows = $self->{dbh}->selectrow_arrayref($sql) or $self->throw( $self->{dbh}->errstr );
return $rows;
}
sub _prepare_cached {
my ($self, $sql) = @_;
# TODO: more sanity checks
my $sth = $self->{dbh}->prepare_cached($sql) or $self->throw( $self->{dbh}->errstr );
$sth;
}
# TODO: check data size, this is a ballpark estimate (could be reduced)
sub taxon_schema {
my $self = shift;
return {
taxon => <<SCHEMA,
(
taxon_id INTEGER UNIQUE PRIMARY KEY NOT NULL,
( run in 0.785 second using v1.01-cache-2.11-cpan-39bf76dae61 )