Bio-SearchIO-hmmer

 view release on metacpan or  search on metacpan

lib/Bio/SearchIO/hmmer3.pm  view on Meta::CPAN

                    $desc    = join ' ', @hitline;

                    $desc = '' if ( !defined($desc) );

                    push @hit_list, [ $hitid, $desc, $eval, $score ];
                    $hitinfo{"$hitid.$hit_counter"} = $#hit_list;
                }
            }

            # Complete sequence table data below inclusion threshold
            elsif ( $buffer =~ /inclusion threshold/ ) {
                while ( defined( $buffer = $self->_readline ) ) {
                    if (   $buffer =~ /Domain( and alignment)? annotation for each/
                        || $buffer =~ /Internal pipeline statistics summary/
                        || $buffer =~ /Annotation for each hit\s+\(and alignments\)/
                        )
                    {
                        $self->_pushback($buffer);
                        last;
                    }
                    elsif (   $buffer =~ m/inclusion threshold/
                           || $buffer =~ m/^$/
                        )
                    {
                        next;
                    }

                    # Grab table data
                    $hit_counter++;
                    my ($eval_full,  $score_full, $bias_full, $eval_best,
                        $score_best, $bias_best,  $exp,       $n,
                        $hitid,      $desc,       @hitline
                    );
                    @hitline    = split( " ", $buffer );
                    $eval_full  = shift @hitline;
                    $score_full = shift @hitline;
                    $bias_full  = shift @hitline;
                    $eval_best  = shift @hitline;
                    $score_best = shift @hitline;
                    $bias_best  = shift @hitline;
                    $exp        = shift @hitline;
                    $n          = shift @hitline;
                    $hitid      = shift @hitline;
                    $desc       = join " ", @hitline;

                    $desc = '' if ( !defined($desc) );

                    push @hit_list,
                        [ $hitid, $desc, $eval_full, $score_full ];
                    $hitinfo{"$hitid.$hit_counter"} = $#hit_list;
                }
            }

            # Domain annotation for each sequence table data,
            # for hmmscan, hmmsearch & nhmmer
            elsif (   $buffer =~ /Domain( and alignment)? annotation for each/
                   or $buffer =~ /Annotation for each hit\s+\(and alignments\)/
                   ) {
                @hsp_list = ();    # Here for multi-query reports
                my $name;
                my $annot_counter = 0;

                while ( defined( $buffer = $self->_readline ) ) {
                    if (   $buffer =~ /\[No targets detected/
                        || $buffer =~ /Internal pipeline statistics/ )
                    {
                        $self->_pushback($buffer);
                        last;
                    }

                    if ( $buffer =~ m/^\>\>\s(\S*)\s+(.*)/ ) {
                        $name    = $1;
                        my $desc = $2;
                        $annot_counter++;
                        $domaincounter{"$name.$annot_counter"} = 0;

                        # The Hit Description from the Scores table can be truncated if
                        # its too long, so use the '>>' line description when its longer
                        if (length $hit_list[
                                             $hitinfo{"$name.$annot_counter"}
                                             ]
                                             [1] < length $desc
                            ) {
                            $hit_list[ $hitinfo{"$name.$annot_counter"} ][1] = $desc;
                        }

                        while ( defined( $buffer = $self->_readline ) ) {
                            if (   $buffer =~ m/Internal pipeline statistics/
                                || $buffer =~ m/Alignments for each domain/
                                || $buffer =~ m/^\s+Alignment:/
                                || $buffer =~ m/^\>\>/ )
                            {
                                $self->_pushback($buffer);
                                last;
                            }
                            elsif (   $buffer =~ m/^\s+score\s+bias/
                                   || $buffer =~ m/^\s+\#\s+score/
                                   || $buffer =~ m/^\s+------\s+/
                                   || $buffer =~ m/^\s\-\-\-\s+/
                                   || $buffer =~ m/^$/
                                )
                            {
                                next;
                            }

                            # Grab hsp data from table, push into @hsp;
                            if ($self->{'_reporttype'} =~ m/(?:HMMSCAN|HMMSEARCH|PHMMER|NHMMER)/) {
                                my ( $domain_num, $score,    $bias,
                                     $ceval,      $ieval,
                                     $hmm_start,  $hmm_stop, $hmm_cov,
                                     $seq_start,  $seq_stop, $seq_cov,
                                     $env_start,  $env_stop, $env_cov,
                                     $hitlength,  $acc );
                                my @vals;

                                if ( # HMMSCAN & HMMSEARCH
                                    ( $domain_num, $score,    $bias,
                                      $ceval,      $ieval,
                                      $hmm_start,  $hmm_stop, $hmm_cov,
                                      $seq_start,  $seq_stop, $seq_cov,
                                      $env_start,  $env_stop, $env_cov,
                                      $acc )
                                       = ( $buffer =~
                                            m|^\s+(\d+)\s\!*\?*\s+     # domain number
                                              (\S+)\s+(\S+)\s+         # score, bias
                                              (\S+)\s+(\S+)\s+         # c-eval, i-eval
                                              (\d+)\s+(\d+)\s+(\S+)\s+ # hmm start, stop, coverage
                                              (\d+)\s+(\d+)\s+(\S+)\s+ # seq start, stop, coverage
                                              (\d+)\s+(\d+)\s+(\S+)\s+ # env start, stop, coverage
                                              (\S+)                    # posterior probability accuracy
                                               \s*$|ox
                                          )
                                    ) {
                                    # Values assigned when IF succeeded

                                    # Try to get the Hit length from the alignment information
                                    $hitlength = 0;
                                    if ($self->{'_reporttype'} eq 'HMMSEARCH' || $self->{'_reporttype'} eq 'PHMMER') {
                                        # For Hmmsearch, if seq coverage ends in ']' it means that the alignment
                                        # runs until the end. In that case add the END coordinate to @hitinfo
                                        # to use it as Hit Length
                                        if ( $seq_cov =~ m/\]$/ ) {
                                            $hitlength = $seq_stop;
                                        }
                                    }
                                    elsif ($self->{'_reporttype'} eq 'HMMSCAN') {
                                        # For Hmmscan, if hmm coverage ends in ']' it means that the alignment
                                        # runs until the end. In that case add the END coordinate to @hitinfo
                                        # to use it as Hit Length
                                        if ( $hmm_cov =~ m/\]$/ ) {
                                            $hitlength = $hmm_stop;
                                        }
                                    }
                                }
                                elsif ( # NHMMER
                                       ( $score,     $bias,     $ceval,
                                         $hmm_start, $hmm_stop, $hmm_cov,
                                         $seq_start, $seq_stop, $seq_cov,
                                         $env_start, $env_stop, $env_cov,
                                         $hitlength, $acc )
                                          = ( $buffer =~
                                                m|^\s+[!?]\s+
                                                  (\S+)\s+(\S+)\s+(\S+)\s+ # score, bias, evalue
                                                  (\d+)\s+(\d+)\s+(\S+)\s+ # hmm start, stop, coverage
                                                  (\d+)\s+(\d+)\s+(\S+)\s+ # seq start, stop, coverage
                                                  (\d+)\s+(\d+)\s+(\S+)\s+ # env start, stop, coverage
                                                  (\d+)\s+(\S+)            # target length, pp accuracy
                                                   .*$|ox
                                             )
                                    ) {
                                    # Values assigned when IF succeeded
                                }
                                else {
                                    print STDERR "Missed this line: $buffer\n";
                                    next;
                                }

                                my $info = $hit_list[ $hitinfo{"$name.$annot_counter"} ];
                                if ( !defined $info ) {
                                    $self->warn(
                                        "Incomplete information: can't find HSP $name in list of hits\n"
                                    );
                                    next;
                                }

                                $domaincounter{"$name.$annot_counter"}++;
                                my $hsp_key
                                    = $name . "_" . $domaincounter{"$name.$annot_counter"};

                                # Keep it simple for now. let's customize later
                                @vals = (
                                    $hmm_start, $hmm_stop,
                                    $seq_start, $seq_stop,
                                    $score,     $ceval,
                                    $hitlength, '',
                                    '',         '',
                                    '',         ''
                                );
                                push @hsp_list, [ $name, @vals ];
                                $hspinfo{"$hsp_key.$annot_counter"} = $#hsp_list;
                            }
                        }
                    }
                    elsif ( $buffer =~ /Alignment(?:s for each domain)?:/ ) {
                        #line counter
                        my $count = 0;

                        # There's an optional block, so we sometimes need to
                        # count to 3, and sometimes to 4.
                        my $max_count = 3;
                        my $lastdomain;
                        my $hsp;
                        my ( $csline, $hline, $midline, $qline, $pline );

                        # To avoid deleting whitespaces from the homology line,
                        # keep track of the position and length of the alignment
                        # in each individual hline/qline, to take them as reference
                        # and use them in the homology line
                        my $align_offset = 0;
                        my $align_length = 0;

                        while ( defined( $buffer = $self->_readline ) ) {
                            if (   $buffer =~ m/^\>\>/
                                || $buffer =~ m/Internal pipeline statistics/ )
                            {
                                $self->_pushback($buffer);
                                last;
                            }
                            elsif ($buffer =~ m/^$/ )
                            {
                                # Reset these scalars on empty lines to help
                                # distinguish between the consensus structure/reference
                                # tracks (CS|RF lines) and homology lines ending in
                                # CS or RF aminoacids
                                $align_offset = 0;
                                $align_length = 0;
                                next;
                            }

                            if (   $buffer =~ /\s\s\=\=\sdomain\s(\d+)\s+/
                                or $buffer =~ /\s\sscore:\s\S+\s+/
                                ) {
                                my $domainnum = $1 || 1;
                                $count = 0;
                                my $key = $name . "_" . $domainnum;
                                $hsp        = $hsp_list[ $hspinfo{"$key.$annot_counter"} ];
                                $csline     = $$hsp[-5];
                                $hline      = $$hsp[-4];
                                $midline    = $$hsp[-3];
                                $qline      = $$hsp[-2];
                                $pline      = $$hsp[-1];
                                $lastdomain = $name;
                            }
                            # Consensus Structure or Reference track, some reports
                            # don't have it. Since it appears on top of the alignment,
                            # the reset of $align_length to 0 between alignment blocks
                            # avoid confusing homology lines with it.
                            elsif ( $buffer =~ m/\s+\S+\s(?:CS|RF)$/ and $align_length == 0 ) {
                                my @data = split( " ", $buffer );
                                $csline .= $data[-2];
                                $max_count++;
                                $count++;
                                next;
                            }
                            # Query line and Hit line swaps positions
                            # depending of the program
                            elsif (    $count == $max_count - 3
                                   or  $count == $max_count - 1
                                   ) {
                                my @data = split( " ", $buffer );

                                my $line_offset = 0;
                                # Use \Q\E on match to avoid errors on alignments
                                # that include stop codons (*)
                                while ($buffer =~ m/\Q$data[-2]\E/g) {
                                    $line_offset = pos $buffer;
                                }
                                if ($line_offset != 0) {
                                    $align_length = length $data[-2];
                                    $align_offset = $line_offset - $align_length;
                                }

                                if ($self->{'_reporttype'} eq 'HMMSCAN') {
                                    # hit sequence
                                    $hline .= $data[-2] if ($count == $max_count - 3);
                                    # query sequence
                                    $qline .= $data[-2] if ($count == $max_count - 1);
                                }
                                else { # hmmsearch & nhmmer
                                    # hit sequence
                                    $hline .= $data[-2] if ($count == $max_count - 1);
                                    # query sequence
                                    $qline .= $data[-2] if ($count == $max_count - 3);
                                }

                                $count++;
                                next;
                            }
                            # conservation track
                            # storage isn't quite right - need to remove
                            # leading/lagging whitespace while preserving
                            # gap data (latter isn't done, former is)
                            elsif ( $count == $max_count - 2 ) {
                                $midline .= substr $buffer, $align_offset, $align_length;
                                $count++;
                                next;



( run in 0.765 second using v1.01-cache-2.11-cpan-99c4e6809bf )