BioPerl

 view release on metacpan or  search on metacpan

Bio/DB/Taxonomy/sqlite.pm  view on Meta::CPAN

=cut

sub index_directory {
    my $self = shift;
    return $self->{'index_directory'} = shift if @_;
    return $self->{'index_directory'};
}

=head2 db_name

 Title   : db_name
 Function : Get/set the name of the SQLite3 database where data is stored
 Usage   : $obj->db_name($newval)
 Returns : value of db_name (a scalar)
 Args    : on set, new value (a scalar or undef, optional)

=cut

# TODO: this may need some disambiguation w/ index_directory above; for now we
# assume this doesn't have a full path name (though I see no reason why this
# shouldn't allow that)

sub db_name {
    my $self = shift;
    return $self->{'db_name'} = shift if @_;
    return $self->{'db_name'};
}

=head2 cache_size

 Title   : cache_size
 Function : Get/set the cachesize used for loading the SQLite3 database
 Usage   : $obj->cache_size($newval)
 Returns : value of cache_size (a scalar)
 Args    : on set, new value (a scalar or undef, optional)
 Note    : we do no checking on whether this value is an integer (SQLite does this for use)

=cut

sub cache_size {
    my $self = shift;
    return $self->{'cache_size'} = shift if defined($_[0]);
    return $self->{'cache_size'};
}

# internal method which does the indexing
sub _build_index {
    my ( $self, $nodesfile, $namesfile, $force ) = @_;

    # TODO: need to disambiguate using index_directory here since we only have
    # one file.  Mayeb ignore it in favor of having full path for db_name?
    my ($dir, $db_name) = ($self->index_directory, $self->db_name);
    if (! -e $db_name || $force) {
        
        # TODO: we're ignoring index_directory for now, may add support for this
        # down the way
        my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
        
        $self->debug("Running SQLite version:".$dbh->{sqlite_version}."\n");
    
        #$dbh->do('PRAGMA synchronous = 0');      # Non transaction safe!!!
        
        if ($self->cache_size) {
            my $cs = $self->cache_size;
            $self->debug("Setting cache size $cs\n");
            $dbh->do("PRAGMA cache_size = $cs") 
        }

        $self->debug("Loading taxon table data\n");
        $self->_init_db($dbh);
        open my $NODES, '<', $nodesfile
            or $self->throw("Could not read node file '$nodesfile': $!");
    
        # TODO: this has the really unnecessary 'OR IGNORE' option added,
        # apparently b.c the test data expects to handle cases where the TaxID
        # is repeated in this table (which should never happen in this table). I
        # will likely change this to throw under those circumstances
        
        my $sth = $dbh->prepare_cached(<<SQL);
    INSERT OR IGNORE INTO taxon (taxon_id, parent_id, rank, code, division_id, gencode_id, mito_id) VALUES (?,?,?,?,?,?,?)
SQL
        $dbh->do("BEGIN");
        while (<$NODES>) {
            next if /^\s*$/;
            chomp;
            my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
            next if $taxid == 1;
            if ($parent == 1) {
                $parent = undef;
            }
            
            $sth->execute($taxid, $parent, $rank, $code, $divid, $gen_code, $mito) or die $sth->errstr.": TaxID $taxid";
        }
        $dbh->do("COMMIT") or $self->throw($dbh->errstr);
        
        close $NODES;
        
        $self->debug("Loading name table data\n");
        open my $NAMES, '<', $namesfile
            or $self->throw("Could not read names file '$namesfile': $!");
    
        my $sth = $dbh->prepare_cached(<<SQL) or $self->throw($dbh->errstr);
    INSERT INTO names (taxon_id, name, uniq_name, class) VALUES (?,?,?,?)
SQL
        $dbh->do("BEGIN");
        while (<$NAMES>) {
            next if /^$/;
            chomp;
            my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
            
            # don't include the fake root node 'root' or 'all' with id 1
            next if $taxid == 1;
            
            $class =~ s/\s+\|\s*$//;
            
            #if ($name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
            #    $name =~ s/\s+\(class\)$//;
            #}
            
            $sth->execute($taxid, $name, $unique_name, $class) or $self->throw($sth->errstr);
        }
        close $NAMES;

        $dbh->do("COMMIT");
        
        $self->debug("Creating taxon index\n");
        $dbh->do("CREATE INDEX parent_idx ON taxon (parent_id)") or $self->throw($dbh->errstr);
        $self->debug("Creating name index\n");
        $dbh->do("CREATE INDEX name_idx ON names (name)") or $self->throw($dbh->errstr);
        $self->debug("Creating taxon name table index\n");
        $dbh->do("CREATE INDEX taxon_name_idx ON names (taxon_id)") or $self->throw($dbh->errstr);

        $dbh->do("PRAGMA foreign_keys = ON");
        
        #$dbh->do('PRAGMA synchronous = 1');
        $self->{dbh} = $dbh;
        $self->{'_initialized'} = 1;
    }
    1;
}

# connect the internal db handle
sub _db_connect {
    my $self = shift;
    return if $self->{'_initialized'};

    my ($dir, $db_name) = ($self->index_directory, $self->db_name);
    
    # TODO: we're ignoring index_directory for now, may add support for this
    # down the way
    my $dbh = DBI->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
    $dbh->do("PRAGMA foreign_keys = ON");
    if ($self->cache_size) {
        my $cs = $self->cache_size;
        $self->debug("Setting cache size $cs\n");
        $dbh->do("PRAGMA cache_size = $cs") 
    }
    $self->{dbh} = $dbh;

    $self->{'_initialized'} = 1;
}

sub _init_db {
    my ($self, $dbh) = @_;
    my $schema = $self->taxon_schema();
    # TODO: set up handler parameters here
    for my $table (sort keys %$schema) {
        $dbh->do("DROP TABLE IF EXISTS $table") or $self->throw($dbh->errstr);
        $dbh->do("CREATE TABLE $table ".$schema->{$table}) or $self->throw($dbh->errstr);
    }
    1;
}

sub _dbh_fetch {
    my ($self, $sql) = @_;
    # TODO: more sanity checks
    my $rows = $self->{dbh}->selectrow_arrayref($sql) or $self->throw( $self->{dbh}->errstr );
    return $rows;
}

sub _prepare_cached {
    my ($self, $sql) = @_;
    # TODO: more sanity checks
    my $sth = $self->{dbh}->prepare_cached($sql) or $self->throw( $self->{dbh}->errstr );
    $sth;
}


# TODO: check data size, this is a ballpark estimate (could be reduced)
sub taxon_schema {
    my $self   = shift;
    return {
    taxon   => <<SCHEMA,
    (
        taxon_id        INTEGER UNIQUE PRIMARY KEY NOT NULL,



( run in 0.785 second using v1.01-cache-2.11-cpan-39bf76dae61 )