BioPerl

 view release on metacpan or  search on metacpan

Bio/DB/IndexedBase.pm  view on Meta::CPAN

    my ($self, $index_file, $write) = @_;
    my %offsets;
    my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
    my @dbmargs = $self->dbmargs;
    tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
        or $self->throw( "Could not open index file $index_file: $!");
    return \%offsets;
}


sub _close_index {
    # Close index file
    my ($self, $index) = @_;
    untie %$index;
    return 1;
}

# Compiling the below regular expression speeds up _parse_compound_id
my $compound_id = qr/^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x;

sub _parse_compound_id {
    # Handle compound IDs:
    #     $db->seq($id)
    #     $db->seq($id, $start, $stop, $strand)
    #     $db->seq("$id:$start,$stop")
    #     $db->seq("$id:$start..$stop")
    #     $db->seq("$id:$start-$stop")
    #     $db->seq("$id:$start,$stop/$strand")
    #     $db->seq("$id:$start..$stop/$strand")
    #     $db->seq("$id:$start-$stop/$strand")
    #     $db->seq("$id/$strand")
    my ($self, $id, $start, $stop, $strand) = @_;

    if ( (not defined $start ) &&
         (not defined $stop  ) &&
         (not defined $strand) &&
         ($id =~ m{$compound_id}) ) {
        # Start, stop and strand not provided and ID looks like a compound ID
        ($id, $start, $stop, $strand) = ($1, $2, $3, $4);
    }

    # Start, stop and strand defaults
    $stop   ||= $self->length($id) || 0; # 0 if sequence not found in database
    $start  ||= ($stop > 0) ? 1 : 0;
    $strand ||= 1;

    # Convert numbers such as 1_000_000 to 1000000
    $start =~ s/_//g;
    $stop  =~ s/_//g;

    if ($start > $stop) {
        # Change the strand
        ($start, $stop) = ($stop, $start);
        $strand *= -1;
    }

    return $id, $start, $stop, $strand;
}


sub _guess_alphabet {
    # Determine the molecular type of the given sequence string:
    #    'dna', 'rna', 'protein' or '' (unknown/empty)
    my ($self, $string) = @_;
    # Handle IUPAC residues like PrimarySeq does
    my $alphabet = Bio::PrimarySeq::_guess_alphabet_from_string($self, $string, 1);
    return $alphabet eq 'dna' ? DNA
           : $alphabet eq 'rna' ? RNA
           : $alphabet eq 'protein' ? PROTEIN
           : NA;
}


sub _makeid {
    # Process the header line by applying any transformation given in -makeid
    my ($self, $header_line) = @_;
    return ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($header_line) : $1;
}


sub _check_linelength {
    # Check that the line length is valid. Generate an error otherwise.
    my ($self, $linelength) = @_;
    return if not defined $linelength;
    $self->throw(
        "Each line of the file must be less than 65,536 characters. Line ".
        "$. is $linelength chars."
    ) if $linelength > 65535;
}


sub _calc_termination_length {
    # Try the beginning of the file to determine termination length
    # Account for crlf-terminated Windows and Mac files
    my ($self, $file) = @_;
    my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");

    # In Windows, text files have '\r\n' as line separator, but when reading in
    # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
    # "length $_" will report 4 although the line is 5 bytes in length.
    # We assume that all lines have the same line separator and only read current line.
    my $init_pos   = tell($fh);
    my $curr_line  = <$fh>;
    my $pos_diff   = tell($fh) - $init_pos;
    my $correction = $pos_diff - length $curr_line;
    close $fh;

    $self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction;
    return $self->{termination_length};
}


sub _calc_offset {
    # Get the offset of the n-th residue of the sequence with the given ID
    # and termination length (tl)
    my ($self, $id, $n) = @_;
    my $tl = $self->{termination_length};
    $n--;
    my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
    $n = 0            if $n < 0;
    $n = $seqlen-1 if $n >= $seqlen;
    return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
}


sub _fh {



( run in 0.991 second using v1.01-cache-2.11-cpan-39bf76dae61 )