Bio-SearchIO-hmmer
view release on metacpan or search on metacpan
lib/Bio/SearchIO/hmmer3.pm view on Meta::CPAN
$desc = join ' ', @hitline;
$desc = '' if ( !defined($desc) );
push @hit_list, [ $hitid, $desc, $eval, $score ];
$hitinfo{"$hitid.$hit_counter"} = $#hit_list;
}
}
# Complete sequence table data below inclusion threshold
elsif ( $buffer =~ /inclusion threshold/ ) {
while ( defined( $buffer = $self->_readline ) ) {
if ( $buffer =~ /Domain( and alignment)? annotation for each/
|| $buffer =~ /Internal pipeline statistics summary/
|| $buffer =~ /Annotation for each hit\s+\(and alignments\)/
)
{
$self->_pushback($buffer);
last;
}
elsif ( $buffer =~ m/inclusion threshold/
|| $buffer =~ m/^$/
)
{
next;
}
# Grab table data
$hit_counter++;
my ($eval_full, $score_full, $bias_full, $eval_best,
$score_best, $bias_best, $exp, $n,
$hitid, $desc, @hitline
);
@hitline = split( " ", $buffer );
$eval_full = shift @hitline;
$score_full = shift @hitline;
$bias_full = shift @hitline;
$eval_best = shift @hitline;
$score_best = shift @hitline;
$bias_best = shift @hitline;
$exp = shift @hitline;
$n = shift @hitline;
$hitid = shift @hitline;
$desc = join " ", @hitline;
$desc = '' if ( !defined($desc) );
push @hit_list,
[ $hitid, $desc, $eval_full, $score_full ];
$hitinfo{"$hitid.$hit_counter"} = $#hit_list;
}
}
# Domain annotation for each sequence table data,
# for hmmscan, hmmsearch & nhmmer
elsif ( $buffer =~ /Domain( and alignment)? annotation for each/
or $buffer =~ /Annotation for each hit\s+\(and alignments\)/
) {
@hsp_list = (); # Here for multi-query reports
my $name;
my $annot_counter = 0;
while ( defined( $buffer = $self->_readline ) ) {
if ( $buffer =~ /\[No targets detected/
|| $buffer =~ /Internal pipeline statistics/ )
{
$self->_pushback($buffer);
last;
}
if ( $buffer =~ m/^\>\>\s(\S*)\s+(.*)/ ) {
$name = $1;
my $desc = $2;
$annot_counter++;
$domaincounter{"$name.$annot_counter"} = 0;
# The Hit Description from the Scores table can be truncated if
# its too long, so use the '>>' line description when its longer
if (length $hit_list[
$hitinfo{"$name.$annot_counter"}
]
[1] < length $desc
) {
$hit_list[ $hitinfo{"$name.$annot_counter"} ][1] = $desc;
}
while ( defined( $buffer = $self->_readline ) ) {
if ( $buffer =~ m/Internal pipeline statistics/
|| $buffer =~ m/Alignments for each domain/
|| $buffer =~ m/^\s+Alignment:/
|| $buffer =~ m/^\>\>/ )
{
$self->_pushback($buffer);
last;
}
elsif ( $buffer =~ m/^\s+score\s+bias/
|| $buffer =~ m/^\s+\#\s+score/
|| $buffer =~ m/^\s+------\s+/
|| $buffer =~ m/^\s\-\-\-\s+/
|| $buffer =~ m/^$/
)
{
next;
}
# Grab hsp data from table, push into @hsp;
if ($self->{'_reporttype'} =~ m/(?:HMMSCAN|HMMSEARCH|PHMMER|NHMMER)/) {
my ( $domain_num, $score, $bias,
$ceval, $ieval,
$hmm_start, $hmm_stop, $hmm_cov,
$seq_start, $seq_stop, $seq_cov,
$env_start, $env_stop, $env_cov,
$hitlength, $acc );
my @vals;
if ( # HMMSCAN & HMMSEARCH
( $domain_num, $score, $bias,
$ceval, $ieval,
$hmm_start, $hmm_stop, $hmm_cov,
$seq_start, $seq_stop, $seq_cov,
$env_start, $env_stop, $env_cov,
$acc )
= ( $buffer =~
m|^\s+(\d+)\s\!*\?*\s+ # domain number
(\S+)\s+(\S+)\s+ # score, bias
(\S+)\s+(\S+)\s+ # c-eval, i-eval
(\d+)\s+(\d+)\s+(\S+)\s+ # hmm start, stop, coverage
(\d+)\s+(\d+)\s+(\S+)\s+ # seq start, stop, coverage
(\d+)\s+(\d+)\s+(\S+)\s+ # env start, stop, coverage
(\S+) # posterior probability accuracy
\s*$|ox
)
) {
# Values assigned when IF succeeded
# Try to get the Hit length from the alignment information
$hitlength = 0;
if ($self->{'_reporttype'} eq 'HMMSEARCH' || $self->{'_reporttype'} eq 'PHMMER') {
# For Hmmsearch, if seq coverage ends in ']' it means that the alignment
# runs until the end. In that case add the END coordinate to @hitinfo
# to use it as Hit Length
if ( $seq_cov =~ m/\]$/ ) {
$hitlength = $seq_stop;
}
}
elsif ($self->{'_reporttype'} eq 'HMMSCAN') {
# For Hmmscan, if hmm coverage ends in ']' it means that the alignment
# runs until the end. In that case add the END coordinate to @hitinfo
# to use it as Hit Length
if ( $hmm_cov =~ m/\]$/ ) {
$hitlength = $hmm_stop;
}
}
}
elsif ( # NHMMER
( $score, $bias, $ceval,
$hmm_start, $hmm_stop, $hmm_cov,
$seq_start, $seq_stop, $seq_cov,
$env_start, $env_stop, $env_cov,
$hitlength, $acc )
= ( $buffer =~
m|^\s+[!?]\s+
(\S+)\s+(\S+)\s+(\S+)\s+ # score, bias, evalue
(\d+)\s+(\d+)\s+(\S+)\s+ # hmm start, stop, coverage
(\d+)\s+(\d+)\s+(\S+)\s+ # seq start, stop, coverage
(\d+)\s+(\d+)\s+(\S+)\s+ # env start, stop, coverage
(\d+)\s+(\S+) # target length, pp accuracy
.*$|ox
)
) {
# Values assigned when IF succeeded
}
else {
print STDERR "Missed this line: $buffer\n";
next;
}
my $info = $hit_list[ $hitinfo{"$name.$annot_counter"} ];
if ( !defined $info ) {
$self->warn(
"Incomplete information: can't find HSP $name in list of hits\n"
);
next;
}
$domaincounter{"$name.$annot_counter"}++;
my $hsp_key
= $name . "_" . $domaincounter{"$name.$annot_counter"};
# Keep it simple for now. let's customize later
@vals = (
$hmm_start, $hmm_stop,
$seq_start, $seq_stop,
$score, $ceval,
$hitlength, '',
'', '',
'', ''
);
push @hsp_list, [ $name, @vals ];
$hspinfo{"$hsp_key.$annot_counter"} = $#hsp_list;
}
}
}
elsif ( $buffer =~ /Alignment(?:s for each domain)?:/ ) {
#line counter
my $count = 0;
# There's an optional block, so we sometimes need to
# count to 3, and sometimes to 4.
my $max_count = 3;
my $lastdomain;
my $hsp;
my ( $csline, $hline, $midline, $qline, $pline );
# To avoid deleting whitespaces from the homology line,
# keep track of the position and length of the alignment
# in each individual hline/qline, to take them as reference
# and use them in the homology line
my $align_offset = 0;
my $align_length = 0;
while ( defined( $buffer = $self->_readline ) ) {
if ( $buffer =~ m/^\>\>/
|| $buffer =~ m/Internal pipeline statistics/ )
{
$self->_pushback($buffer);
last;
}
elsif ($buffer =~ m/^$/ )
{
# Reset these scalars on empty lines to help
# distinguish between the consensus structure/reference
# tracks (CS|RF lines) and homology lines ending in
# CS or RF aminoacids
$align_offset = 0;
$align_length = 0;
next;
}
if ( $buffer =~ /\s\s\=\=\sdomain\s(\d+)\s+/
or $buffer =~ /\s\sscore:\s\S+\s+/
) {
my $domainnum = $1 || 1;
$count = 0;
my $key = $name . "_" . $domainnum;
$hsp = $hsp_list[ $hspinfo{"$key.$annot_counter"} ];
$csline = $$hsp[-5];
$hline = $$hsp[-4];
$midline = $$hsp[-3];
$qline = $$hsp[-2];
$pline = $$hsp[-1];
$lastdomain = $name;
}
# Consensus Structure or Reference track, some reports
# don't have it. Since it appears on top of the alignment,
# the reset of $align_length to 0 between alignment blocks
# avoid confusing homology lines with it.
elsif ( $buffer =~ m/\s+\S+\s(?:CS|RF)$/ and $align_length == 0 ) {
my @data = split( " ", $buffer );
$csline .= $data[-2];
$max_count++;
$count++;
next;
}
# Query line and Hit line swaps positions
# depending of the program
elsif ( $count == $max_count - 3
or $count == $max_count - 1
) {
my @data = split( " ", $buffer );
my $line_offset = 0;
# Use \Q\E on match to avoid errors on alignments
# that include stop codons (*)
while ($buffer =~ m/\Q$data[-2]\E/g) {
$line_offset = pos $buffer;
}
if ($line_offset != 0) {
$align_length = length $data[-2];
$align_offset = $line_offset - $align_length;
}
if ($self->{'_reporttype'} eq 'HMMSCAN') {
# hit sequence
$hline .= $data[-2] if ($count == $max_count - 3);
# query sequence
$qline .= $data[-2] if ($count == $max_count - 1);
}
else { # hmmsearch & nhmmer
# hit sequence
$hline .= $data[-2] if ($count == $max_count - 1);
# query sequence
$qline .= $data[-2] if ($count == $max_count - 3);
}
$count++;
next;
}
# conservation track
# storage isn't quite right - need to remove
# leading/lagging whitespace while preserving
# gap data (latter isn't done, former is)
elsif ( $count == $max_count - 2 ) {
$midline .= substr $buffer, $align_offset, $align_length;
$count++;
next;
( run in 0.765 second using v1.01-cache-2.11-cpan-99c4e6809bf )