BioPerl

 view release on metacpan or  search on metacpan

Bio/DB/Flat/BinarySearch.pm  view on Meta::CPAN


             $index->build_index(@files);


 Function: create a new Bio::DB::Flat::BinarySearch object
 Returns : new Bio::DB::Flat::BinarySearch
 Args    : -directory          Root directory for index files
           -dbname             Name of subdirectory containing indices
                               for named database
           -write_flag         Allow building index
           -primary_pattern    Regexp defining the primary id
           -secondary_patterns A hash ref containing the secondary
                               patterns with the namespaces as keys
           -primary_namespace  A string defining what the primary key
                               is

 Status  : Public

=cut

sub new {
    my ( $class, @args ) = @_;

    my $self = $class->SUPER::new(@args);

    bless $self, $class;

    my ( $index_dir, $dbname, $format, $write_flag, $primary_pattern,
        $primary_namespace, $start_pattern, $secondary_patterns )
      = $self->_rearrange(
        [
            qw(DIRECTORY
              DBNAME
              FORMAT
              WRITE_FLAG
              PRIMARY_PATTERN
              PRIMARY_NAMESPACE
              START_PATTERN
              SECONDARY_PATTERNS)
        ],
        @args
      );

    $self->index_directory($index_dir);
    $self->dbname($dbname);

    if ( $self->index_directory && $self->read_config_file ) {

        my $fh           = $self->primary_index_filehandle;
        my $record_width = $self->read_header($fh);
        $self->record_size($record_width);
    }
    $format ||= DEFAULT_FORMAT;
    $self->format($format);
    $self->write_flag($write_flag);

    if ( $self->write_flag && !$primary_namespace ) {
        (
            $primary_namespace, $primary_pattern,
            $start_pattern,     $secondary_patterns
        ) = $self->_guess_patterns( $self->format );
    }

    $self->primary_pattern($primary_pattern);
    $self->primary_namespace($primary_namespace);
    $self->start_pattern($start_pattern);
    $self->secondary_patterns($secondary_patterns);

    return $self;
}

sub new_from_registry {
    my ( $self, %config ) = @_;

    my $dbname   = $config{'dbname'};
    my $location = $config{'location'};

    my $index = Bio::DB::Flat::BinarySearch->new(
        -dbname    => $dbname,
        -index_dir => $location,
    );
}

=head2 get_Seq_by_id

 Title   : get_Seq_by_id
 Usage   : $obj->get_Seq_by_id($newval)
 Function:
 Example :
 Returns : value of get_Seq_by_id
 Args    : newvalue (optional)

=cut

sub get_Seq_by_id {
    my ( $self, $id ) = @_;

    # too many uninit variables...
    local $^W = 0;

    my ( $fh, $length ) = $self->get_stream_by_id($id);

    unless ( defined( $self->format ) ) {
        $self->throw("Can't create sequence - format is not defined");
    }

    return unless $fh;

    unless ( defined( $self->{_seqio} ) ) {

        $self->{_seqio} = Bio::SeqIO->new(
            -fh     => $fh,
            -format => $self->format
        );
    }
    else {
        $self->{_seqio}->_fh($fh);
    }

    return $self->{_seqio}->next_seq;
}

Bio/DB/Flat/BinarySearch.pm  view on Meta::CPAN

    $self->make_config_file( \@files );

    # And finally write out the indices
    $self->write_primary_index;
    $self->write_secondary_indices;

    $entries;
}

=head2 _index_file

 Title   : _index_file
 Usage   : $obj->_index_file($newval)
 Function:
 Example :
 Returns : value of _index_file
 Args    : newvalue (optional)

=cut

sub _index_file {
    my ( $self, $file ) = @_;
    my $v = $self->verbose;
    open my $FILE, '<', $file or $self->throw("Could not read file '$file': $!");

    my $recstart = 0;
    my $fileid   = $self->get_fileid_by_filename($file);
    my $found    = 0;
    my $id;
    my $count = 0;

    my $primary       = $self->primary_pattern;
    my $start_pattern = $self->start_pattern;

    my $pos = 0;

    my $new_primary_entry;

    my $length;

    my $fh = $FILE;

    my $done = -1;

    my @secondary_names = $self->secondary_namespaces;
    my %secondary_id;
    my $last_one;

    # In Windows, text files have '\r\n' as line separator, but when reading in
    # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
    # "length $_" will report 4 although the line is 5 bytes in length.
    # We assume that all lines have the same line separator and only read current line.
    my $init_pos   = tell($fh);
    my $curr_line  = <$fh>;
    my $pos_diff   = tell($fh) - $init_pos;
    my $correction = $pos_diff - length $curr_line;
    seek $fh, $init_pos, 0; # Rewind position to proceed to read the file

    while (<$fh>) {
        $last_one = $_;
        $self->{alphabet} ||= $self->guess_alphabet($_);
        if ( $_ =~ /$start_pattern/ ) {
            if ( $done == 0 ) {
                $id = $new_primary_entry;
                $self->{alphabet} ||= $self->guess_alphabet($_);

                my $tmplen = ( tell $fh ) - length($_) - $correction;

                $length = $tmplen - $pos;

                unless ( defined($id) ) {
                    $self->throw("No id defined for sequence");
                }
                unless ( defined($fileid) ) {
                    $self->throw("No fileid defined for file $file");
                }
                unless ( defined($pos) ) {
                    $self->throw( "No position defined for " . $id . "\n" );
                }
                unless ( defined($length) ) {
                    $self->throw( "No length defined for " . $id . "\n" );
                }
                $self->_add_id_position( $id, $pos, $fileid, $length,
                    \%secondary_id );

                $pos = $tmplen;

                if ( $count > 0 && $count % 1000 == 0 ) {
                    $self->debug("Indexed $count ids\n") if $v > 0;
                }

                $count++;
            }
            else {
                $done = 0;
            }
        }

        if ( $_ =~ /$primary/ ) {
            $new_primary_entry = $1;
        }

        my $secondary_patterns = $self->secondary_patterns;

        foreach my $sec (@secondary_names) {
            my $pattern = $secondary_patterns->{$sec};

            if ( $_ =~ /$pattern/ ) {
                $secondary_id{$sec} = $1;
            }
        }

    }

    # Remember to add in the last one

    $id = $new_primary_entry;

    # my $tmplen = (tell $fh) - length($last_one);
    my $tmplen = ( tell $fh );

    $length = $tmplen - $pos;

    if ( !defined($id) ) {
        $self->throw("No id defined for sequence");

Bio/DB/Flat/BinarySearch.pm  view on Meta::CPAN

        -start_pattern      => $start_pattern,
        -secondary_patterns => \%secondary_patterns
    );

    $index->build_index(@files);
}

sub new_EMBL_index {
    my ( $self, $index_dir, @files ) = @_;

    my %secondary_patterns;

    my $start_pattern     = "^ID   (\\S+)";
    my $primary_pattern   = "^AC   (\\S+)\\;";
    my $primary_namespace = "ACC";

    $secondary_patterns{"ID"} = $start_pattern;

    my $index = Bio::DB::Flat::BinarySearch->new(
        -index_dir          => $index_dir,
        -format             => 'embl',
        -primary_pattern    => $primary_pattern,
        -primary_namespace  => "ACC",
        -start_pattern      => $start_pattern,
        -secondary_patterns => \%secondary_patterns
    );

    $index->build_index(@files);

    return $index;
}

sub new_FASTA_index {
    my ( $self, $index_dir, @files ) = @_;

    my %secondary_patterns;

    my $start_pattern     = "^>";
    my $primary_pattern   = "^>(\\S+)";
    my $primary_namespace = "ACC";

    $secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";

    my $index = Bio::DB::Flat::BinarySearch->new(
        -index_dir          => $index_dir,
        -format             => 'fasta',
        -primary_pattern    => $primary_pattern,
        -primary_namespace  => "ACC",
        -start_pattern      => $start_pattern,
        -secondary_patterns => \%secondary_patterns
    );

    $index->build_index(@files);

    return $index;
}

# EVERYTHING THAT FOLLOWS THIS
# is an awful hack - in reality Michele's code needs to be rewritten
# to use Bio::SeqIO, but I have too little time to do this -- LS
sub guess_alphabet {
    my $self = shift;
    my $line = shift;

    my $format = $self->format;
    return 'protein' if $format eq 'swissprot';

    if ( $format eq 'genbank' ) {
        return unless $line =~ /^LOCUS/;
        return 'dna' if $line =~ /\s+\d+\s+bp/i;
        return 'protein';
    }

    if ( $format eq 'embl' ) {
        return unless $line =~ /^ID/;
        return 'dna' if $line =~ / DNA;/i;
        return 'rna' if $line =~ / RNA;/i;
        return 'protein';
    }

    return;
}

# return (namespace,primary_pattern,start_pattern,secondary_pattern)
sub _guess_patterns {
    my $self   = shift;
    my $format = shift;
    if ( $format =~ /swiss(prot)?/i ) {
        return ( 'ID', "^ID   (\\S+)", "^ID   (\\S+)",
            { ACC => "^AC   (\\S+);" } );
    }

    if ($format =~ /embl/i) {
        return ('ID',
            "^ID   (\\S+[^; ])",
            "^ID   (\\S+[^; ])",
            {
             ACC     => q/^AC   (\S+);/,
             VERSION => q/^SV\s+(\S+)/
            });
     }

    if ( $format =~ /genbank/i ) {
        return (
            'ID',
            q/^LOCUS\s+(\S+)/,
            q/^LOCUS/,
            {
                ACC     => q/^ACCESSION\s+(\S+)/,
                VERSION => q/^VERSION\s+(\S+)/
            }
        );
    }

    if ( $format =~ /fasta/i ) {
        return ( 'ACC', '^>(\S+)', '^>(\S+)', );
    }

    $self->throw("I can't handle format $format");

}

1;



( run in 2.193 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )