BioPerl
view release on metacpan or search on metacpan
Bio/DB/Flat/BinarySearch.pm view on Meta::CPAN
$index->build_index(@files);
Function: create a new Bio::DB::Flat::BinarySearch object
Returns : new Bio::DB::Flat::BinarySearch
Args : -directory Root directory for index files
-dbname Name of subdirectory containing indices
for named database
-write_flag Allow building index
-primary_pattern Regexp defining the primary id
-secondary_patterns A hash ref containing the secondary
patterns with the namespaces as keys
-primary_namespace A string defining what the primary key
is
Status : Public
=cut
sub new {
my ( $class, @args ) = @_;
my $self = $class->SUPER::new(@args);
bless $self, $class;
my ( $index_dir, $dbname, $format, $write_flag, $primary_pattern,
$primary_namespace, $start_pattern, $secondary_patterns )
= $self->_rearrange(
[
qw(DIRECTORY
DBNAME
FORMAT
WRITE_FLAG
PRIMARY_PATTERN
PRIMARY_NAMESPACE
START_PATTERN
SECONDARY_PATTERNS)
],
@args
);
$self->index_directory($index_dir);
$self->dbname($dbname);
if ( $self->index_directory && $self->read_config_file ) {
my $fh = $self->primary_index_filehandle;
my $record_width = $self->read_header($fh);
$self->record_size($record_width);
}
$format ||= DEFAULT_FORMAT;
$self->format($format);
$self->write_flag($write_flag);
if ( $self->write_flag && !$primary_namespace ) {
(
$primary_namespace, $primary_pattern,
$start_pattern, $secondary_patterns
) = $self->_guess_patterns( $self->format );
}
$self->primary_pattern($primary_pattern);
$self->primary_namespace($primary_namespace);
$self->start_pattern($start_pattern);
$self->secondary_patterns($secondary_patterns);
return $self;
}
sub new_from_registry {
my ( $self, %config ) = @_;
my $dbname = $config{'dbname'};
my $location = $config{'location'};
my $index = Bio::DB::Flat::BinarySearch->new(
-dbname => $dbname,
-index_dir => $location,
);
}
=head2 get_Seq_by_id
Title : get_Seq_by_id
Usage : $obj->get_Seq_by_id($newval)
Function:
Example :
Returns : value of get_Seq_by_id
Args : newvalue (optional)
=cut
sub get_Seq_by_id {
my ( $self, $id ) = @_;
# too many uninit variables...
local $^W = 0;
my ( $fh, $length ) = $self->get_stream_by_id($id);
unless ( defined( $self->format ) ) {
$self->throw("Can't create sequence - format is not defined");
}
return unless $fh;
unless ( defined( $self->{_seqio} ) ) {
$self->{_seqio} = Bio::SeqIO->new(
-fh => $fh,
-format => $self->format
);
}
else {
$self->{_seqio}->_fh($fh);
}
return $self->{_seqio}->next_seq;
}
Bio/DB/Flat/BinarySearch.pm view on Meta::CPAN
$self->make_config_file( \@files );
# And finally write out the indices
$self->write_primary_index;
$self->write_secondary_indices;
$entries;
}
=head2 _index_file
Title : _index_file
Usage : $obj->_index_file($newval)
Function:
Example :
Returns : value of _index_file
Args : newvalue (optional)
=cut
sub _index_file {
my ( $self, $file ) = @_;
my $v = $self->verbose;
open my $FILE, '<', $file or $self->throw("Could not read file '$file': $!");
my $recstart = 0;
my $fileid = $self->get_fileid_by_filename($file);
my $found = 0;
my $id;
my $count = 0;
my $primary = $self->primary_pattern;
my $start_pattern = $self->start_pattern;
my $pos = 0;
my $new_primary_entry;
my $length;
my $fh = $FILE;
my $done = -1;
my @secondary_names = $self->secondary_namespaces;
my %secondary_id;
my $last_one;
# In Windows, text files have '\r\n' as line separator, but when reading in
# text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
# "length $_" will report 4 although the line is 5 bytes in length.
# We assume that all lines have the same line separator and only read current line.
my $init_pos = tell($fh);
my $curr_line = <$fh>;
my $pos_diff = tell($fh) - $init_pos;
my $correction = $pos_diff - length $curr_line;
seek $fh, $init_pos, 0; # Rewind position to proceed to read the file
while (<$fh>) {
$last_one = $_;
$self->{alphabet} ||= $self->guess_alphabet($_);
if ( $_ =~ /$start_pattern/ ) {
if ( $done == 0 ) {
$id = $new_primary_entry;
$self->{alphabet} ||= $self->guess_alphabet($_);
my $tmplen = ( tell $fh ) - length($_) - $correction;
$length = $tmplen - $pos;
unless ( defined($id) ) {
$self->throw("No id defined for sequence");
}
unless ( defined($fileid) ) {
$self->throw("No fileid defined for file $file");
}
unless ( defined($pos) ) {
$self->throw( "No position defined for " . $id . "\n" );
}
unless ( defined($length) ) {
$self->throw( "No length defined for " . $id . "\n" );
}
$self->_add_id_position( $id, $pos, $fileid, $length,
\%secondary_id );
$pos = $tmplen;
if ( $count > 0 && $count % 1000 == 0 ) {
$self->debug("Indexed $count ids\n") if $v > 0;
}
$count++;
}
else {
$done = 0;
}
}
if ( $_ =~ /$primary/ ) {
$new_primary_entry = $1;
}
my $secondary_patterns = $self->secondary_patterns;
foreach my $sec (@secondary_names) {
my $pattern = $secondary_patterns->{$sec};
if ( $_ =~ /$pattern/ ) {
$secondary_id{$sec} = $1;
}
}
}
# Remember to add in the last one
$id = $new_primary_entry;
# my $tmplen = (tell $fh) - length($last_one);
my $tmplen = ( tell $fh );
$length = $tmplen - $pos;
if ( !defined($id) ) {
$self->throw("No id defined for sequence");
Bio/DB/Flat/BinarySearch.pm view on Meta::CPAN
-start_pattern => $start_pattern,
-secondary_patterns => \%secondary_patterns
);
$index->build_index(@files);
}
sub new_EMBL_index {
my ( $self, $index_dir, @files ) = @_;
my %secondary_patterns;
my $start_pattern = "^ID (\\S+)";
my $primary_pattern = "^AC (\\S+)\\;";
my $primary_namespace = "ACC";
$secondary_patterns{"ID"} = $start_pattern;
my $index = Bio::DB::Flat::BinarySearch->new(
-index_dir => $index_dir,
-format => 'embl',
-primary_pattern => $primary_pattern,
-primary_namespace => "ACC",
-start_pattern => $start_pattern,
-secondary_patterns => \%secondary_patterns
);
$index->build_index(@files);
return $index;
}
sub new_FASTA_index {
my ( $self, $index_dir, @files ) = @_;
my %secondary_patterns;
my $start_pattern = "^>";
my $primary_pattern = "^>(\\S+)";
my $primary_namespace = "ACC";
$secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
my $index = Bio::DB::Flat::BinarySearch->new(
-index_dir => $index_dir,
-format => 'fasta',
-primary_pattern => $primary_pattern,
-primary_namespace => "ACC",
-start_pattern => $start_pattern,
-secondary_patterns => \%secondary_patterns
);
$index->build_index(@files);
return $index;
}
# EVERYTHING THAT FOLLOWS THIS
# is an awful hack - in reality Michele's code needs to be rewritten
# to use Bio::SeqIO, but I have too little time to do this -- LS
sub guess_alphabet {
my $self = shift;
my $line = shift;
my $format = $self->format;
return 'protein' if $format eq 'swissprot';
if ( $format eq 'genbank' ) {
return unless $line =~ /^LOCUS/;
return 'dna' if $line =~ /\s+\d+\s+bp/i;
return 'protein';
}
if ( $format eq 'embl' ) {
return unless $line =~ /^ID/;
return 'dna' if $line =~ / DNA;/i;
return 'rna' if $line =~ / RNA;/i;
return 'protein';
}
return;
}
# return (namespace,primary_pattern,start_pattern,secondary_pattern)
sub _guess_patterns {
my $self = shift;
my $format = shift;
if ( $format =~ /swiss(prot)?/i ) {
return ( 'ID', "^ID (\\S+)", "^ID (\\S+)",
{ ACC => "^AC (\\S+);" } );
}
if ($format =~ /embl/i) {
return ('ID',
"^ID (\\S+[^; ])",
"^ID (\\S+[^; ])",
{
ACC => q/^AC (\S+);/,
VERSION => q/^SV\s+(\S+)/
});
}
if ( $format =~ /genbank/i ) {
return (
'ID',
q/^LOCUS\s+(\S+)/,
q/^LOCUS/,
{
ACC => q/^ACCESSION\s+(\S+)/,
VERSION => q/^VERSION\s+(\S+)/
}
);
}
if ( $format =~ /fasta/i ) {
return ( 'ACC', '^>(\S+)', '^>(\S+)', );
}
$self->throw("I can't handle format $format");
}
1;
( run in 2.193 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )