BioPerl
view release on metacpan or search on metacpan
Bio/DB/Fasta.pm view on Meta::CPAN
#
# BioPerl module for Bio::DB::Fasta
#
# You may distribute this module under the same terms as perl itself
#
=head1 NAME
Bio::DB::Fasta - Fast indexed access to fasta files
=head1 SYNOPSIS
use Bio::DB::Fasta;
# Create database from a directory of Fasta files
my $db = Bio::DB::Fasta->new('/path/to/fasta/files/');
my @ids = $db->get_all_primary_ids;
# Simple access
my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
my $length = $db->length('CHROMOSOME_I');
my $header = $db->header('CHROMOSOME_I');
my $alphabet = $db->alphabet('CHROMOSOME_I');
# Access to sequence objects. See Bio::PrimarySeqI.
my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
my $seqstr = $seq->seq;
my $subseq = $seq->subseq(4_000_000 => 4_100_000);
my $trunc = $seq->trunc(4_000_000 => 4_100_000);
my $length = $seq->length;
# Loop through sequence objects
my $stream = $db->get_PrimarySeq_stream;
while (my $seq = $stream->next_seq) {
# Bio::PrimarySeqI stuff
}
# Filehandle access
my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
while (my $seq = <$fh>) {
# Bio::PrimarySeqI stuff
}
# Tied hash access
tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
print $sequences{'CHROMOSOME_I:1,20000'};
=head1 DESCRIPTION
Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
or a directory of files. It provides persistent random access to each sequence
entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
subsequences within each entry, allowing you to retrieve portions of very large
sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
based on Bio::DB::IndexedBase. See this module's documentation for details.
The Fasta files may contain any combination of nucleotide and protein sequences;
during indexing the module guesses the molecular type. Entries may have any line
length up to 65,536 characters, and different line lengths are allowed in the
same file. However, within a sequence entry, all lines must be the same length
except for the last. An error will be thrown if this is not the case.
The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence
from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
routine to reversibly modify this primary ID, e.g. if you wish to extract a
specific portion of the gi|gb|abc|xyz GenBank IDs.
=head1 DATABASE CREATION AND INDEXING
The object-oriented constructor is new(), the filehandle constructor is newFh()
and the tied hash constructor is tie(). They all allow one to index a single Fasta
file, several files, or a directory of files. See Bio::DB::IndexedBase.
=head1 SEE ALSO
L<Bio::DB::IndexedBase>
L<Bio::DB::Qual>
L<Bio::PrimarySeqI>
=head1 AUTHOR
Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
Copyright (c) 2001 Cold Spring Harbor Laboratory.
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. See DISCLAIMER.txt for
disclaimers of warranty.
=head1 APPENDIX
The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _
For BioPerl-style access, the following methods are provided:
=head2 get_Seq_by_id
Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
Usage : my $seq = $db->get_Seq_by_id($id);
Function: Given an ID, fetch the corresponding sequence from the database.
Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
only load the sequence string into memory when requested using seq().
See L<Bio::PrimarySeqI> for methods provided by the sequence objects
returned from get_Seq_by_id() and get_PrimarySeq_stream().
Args : ID
=head2 get_PrimarySeq_stream
Title : get_PrimarySeq_stream
Usage : my $stream = $db->get_PrimarySeq_stream();
Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
single method, next_seq(). Each call to next_seq() returns a new
Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
Returns : A Bio::DB::Indexed::Stream object
Bio/DB/Fasta.pm view on Meta::CPAN
my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
binmode $fh;
warn "Indexing $file\n" if $self->{debug};
my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
$last_line, %offsets);
my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
my $termination_length = $self->{termination_length};
while (my $line = <$fh>) {
# Account for crlf-terminated Windows files
if (index($line, '>') == 0) {
if ($line =~ /^>(\S+)/) {
print STDERR "Indexed $count sequences...\n"
if $self->{debug} && (++$count%1000) == 0;
# please, do not enforce arbitrary line length requirements.
# It's good practice but not enforced.
#$self->_check_linelength($linelen);
my $pos = tell($fh);
if (@ids) {
my $strlen = $pos - $offset - length($line);
$strlen -= $termination_length * $seq_lines;
my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen,
$linelen, $headerlen, $alphabet, $fileno);
$alphabet = Bio::DB::IndexedBase::NA;
for my $id (@ids) {
$offsets->{$id} = $ppos;
}
}
@ids = $self->_makeid($line);
($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
} else {
# Catch bad header lines, bug 3172
$self->throw("FASTA header doesn't match '>(\\S+)': $line");
}
} elsif ($line !~ /\S/) {
# Skip blank line
$blank_lines++;
next;
} else {
# Need to check every line :(
$l3_len = $l2_len;
$l2_len = $l_len;
$l_len = length $line;
if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
my $fap = substr($line, 0, 20)."..";
$self->throw("Each line of the fasta entry must be the same ".
"length except the last. Line above #$. '$fap' is $l2_len".
" != $l3_len chars.");
}
if ($blank_lines) {
# Blank lines not allowed in entry
$self->throw("Blank lines can only precede header lines, ".
"found preceding line #$.");
}
}
$linelen ||= length $line;
$alphabet ||= $self->_guess_alphabet($line);
$seq_lines++;
}
$last_line = $line;
}
# Process last entry
$self->_check_linelength($linelen);
my $pos = tell $fh;
if (@ids) {
my $strlen = $pos - $offset;
if ($linelen == 0) { # yet another pesky empty chr_random.fa file
$strlen = 0;
} else {
if ($last_line !~ /\s$/) {
$seq_lines--;
}
$strlen -= $termination_length * $seq_lines;
}
my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen, $linelen,
$headerlen, $alphabet, $fileno);
for my $id (@ids) {
$offsets->{$id} = $ppos;
}
}
return \%offsets;
}
=head2 seq
Title : seq, sequence, subseq
Usage : # Entire sequence string
my $seqstr = $db->seq($id);
# Subsequence
my $subseqstr = $db->seq($id, $start, $stop, $strand);
# or...
my $subseqstr = $db->seq($compound_id);
Function: Get a subseq of a sequence from the database. For your convenience,
the sequence to extract can be specified with any of the following
compound IDs:
$db->seq("$id:$start,$stop")
$db->seq("$id:$start..$stop")
$db->seq("$id:$start-$stop")
$db->seq("$id:$start,$stop/$strand")
$db->seq("$id:$start..$stop/$strand")
$db->seq("$id:$start-$stop/$strand")
$db->seq("$id/$strand")
In the case of DNA or RNA sequence, if $stop is less than $start,
then the reverse complement of the sequence is returned. Avoid using
it if possible since this goes against Bio::Seq conventions.
Returns : A string
Args : ID of sequence to retrieve
or
Compound ID of subsequence to fetch
or
ID, optional start (defaults to 1), optional end (defaults to length
of sequence) and optional strand (defaults to 1).
=cut
( run in 3.305 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )