BioX-Seq

 view release on metacpan or  search on metacpan

lib/BioX/Seq/Fetch.pm  view on Meta::CPAN

            $ll_mismatch = 0;
        }
        elsif ($line =~ /^[A-Za-z\-\*\.]+(\r?\n?)$/) {
            if ($bl_mismatch) {
                close $idx;
                unlink $fn_idx;
                die "Base length mismatch\n";
            }
            if ($ll_mismatch) {
                close $idx;
                unlink $fn_idx;
                die "Line length mismatch\n";
            }
            my $ll  = length $line;
            my $bl  = $ll - length $1;
            $curr_len += $bl;
            $curr_bases //= $bl;
            $curr_bytes //= $ll;
            $bl_mismatch = 1 if ($bl != $curr_bases);
            $ll_mismatch = 1 if ($ll != $curr_bytes);
        }
        elsif ($line =~ /\S/) {
            close $idx;
            unlink $fn_idx;
            die "Unexpected content: $line";
        }
                
    }

    # write remaining index
    # uncoverable branch false
    if (defined $curr_id) {
        say {$idx} join "\t",
            $curr_id,
            $curr_len,
            $curr_offset,
            $curr_bases,
            $curr_bytes,
        ;
    }

    close $idx;

    #reset filehandle
    seek $self->{fh}, 0, 0;

    return 1;

}
     

sub _load_faidx {
    
    my ($self) = @_;

    my $fn_idx  = $self->{fn} . '.fai';
    $self->write_index if (! -e $fn_idx);

    # make sure FASTA file is not newer than its index; otherwise index file
    # may be out of date and cause silent corruption downstream
    my $mtime_fas = stat($self->{fn})->mtime;
    my $mtime_idx = stat($fn_idx)->mtime;
    if ($mtime_fas > $mtime_idx) {
        die "Index file exists but is older than FASTA file and probably out"
            . " of date. Refresh or remove the existing index before"
            . " proceeding.\n"
    }

    my @ids;

    open my $in, '<', $fn_idx
        or die "Error opening index file: $!\n";
    while (my $line = <$in>) {
        chomp $line;
        my ($name, $len, $offset, $bases_per_line, $bytes_per_line)
            = split "\t", $line;
        die "ERROR: $fn_idx contains duplicate entries"
            if ( defined $self->{idx}->{$name} );
        my $eol = $bytes_per_line - $bases_per_line;
        $self->{idx}->{$name} = [$len, $offset, $bases_per_line, $eol];
        push @ids, $name;
    }
    close $in;
    $self->{ids} = \@ids;

    return;

}

sub ids {

    my ($self) = @_;
    return @{ $self->{ids} };

}

sub length {

    my ($self, $id) = @_;
    return $self->{idx}->{$id}->[0];

}

sub fetch_seq {
    
    my ($self, $id, $start_bp, $end_bp) = @_;

    return undef if (! defined $self->{idx}->{$id});
    my ($len, $off, $bpl, $eol) = @{ $self->{idx}->{$id} };

    my $fh = $self->{fh};

    $start_bp = $start_bp // 1;
    $end_bp   = $end_bp   // $len;
    --$start_bp; #make 0-based
    die "Coordinates out of bounds" if ($start_bp < 0 || $end_bp > $len);
    my $l = $end_bp - $start_bp;

    seek $fh, $off + $start_bp + int(($start_bp)/$bpl)*$eol, 0;
    read($fh, my $seq, $l + int(($l+$start_bp%$bpl)/$bpl)*$eol);
    $seq =~ s/\s//g;



( run in 0.963 second using v1.01-cache-2.11-cpan-98e64b0badf )