BioPerl

 view release on metacpan or  search on metacpan

Bio/SearchIO/blast.pm  view on Meta::CPAN

 Usage   : my $obj = Bio::SearchIO::blast->new(%args);
 Function: Builds a new Bio::SearchIO::blast object
 Returns : Bio::SearchIO::blast
 Args    : Key-value pairs:
           -fh/-file => filehandle/filename to BLAST file
           -format   => 'blast'
           -report_type => 'blastx', 'tblastn', etc -- only for bl2seq
                           reports when you want to distinguish between
                           tblastn and blastx reports (this only controls
                           where the frame information is put - on the query
                           or subject object).
           -inclusion_threshold => e-value threshold for inclusion in the
                                   PSI-BLAST score matrix model (blastpgp)
           -signif      => float or scientific notation number to be used
                           as a P- or Expect value cutoff
           -score       => integer or scientific notation number to be used
                           as a blast score value cutoff
           -bits        => integer or scientific notation number to be used
                           as a bit score value cutoff
           -hit_filter  => reference to a function to be used for
                           filtering hits based on arbitrary criteria.
                           All hits of each BLAST report must satisfy
                           this criterion to be retained.
                           If a hit fails this test, it is ignored.
                           This function should take a
                           Bio::Search::Hit::BlastHit.pm object as its first
                           argument and return true
                           if the hit should be retained.
                           Sample filter function:
                              -hit_filter => sub { my $hit = shift;
                                                   $hit->gaps == 0; },
                           (Note: -filt_func is synonymous with -hit_filter)
           -overlap     => integer. The amount of overlap to permit between
                           adjacent HSPs when tiling HSPs. A reasonable value is 2.
                           Default = $Bio::SearchIO::blast::MAX_HSP_OVERLAP.

            The following criteria are not yet supported:
            (these are probably best applied within this module rather than in the
             event handler since they would permit the parser to take some shortcuts.)

           -check_all_hits => boolean. Check all hits for significance against
                              significance criteria.  Default = false.
                              If false, stops processing hits after the first
                              non-significant hit or the first hit that fails
                              the hit_filter call. This speeds parsing,
                              taking advantage of the fact that the hits
                              are processed in the order they appear in the report.
           -min_query_len => integer to be used as a minimum for query sequence length.
                             Reports with query sequences below this length will
                             not be processed. Default = no minimum length.
           -best        => boolean. Only process the best hit of each report;
                           default = false.

=cut

# Object set-up hook: runs the parent class initialisation, installs the
# iteration-aware event handler and records an optional -report_type.
#
# Args    : the key-value argument list this method was called with
# Returns : the value of the final expression, i.e. the stored report type
#           when -report_type was supplied, otherwise the false result of
#           the defined() test
sub _initialize {
    my $self = shift;
    my @args = @_;

    $self->SUPER::_initialize(@args);

    # BLAST output may contain several iterations (PSI-BLAST), so a
    # specialised version of the SearchResultEventBuilder is needed.  It is
    # handed every argument unchanged.  Open issue carried over from the
    # original note: new default object factories should be set when none
    # are supplied.
    my $event_builder =
      Bio::SearchIO::IteratedSearchResultEventBuilder->new(@args);
    $self->attach_EventHandler($event_builder);

    # The handler is deliberately not cached here: since 2006-04-26 the
    # cache ($self->{'_handler_cache'}) is maintained by attach_EventHandler()
    # in this module, so that the handler can really be reset.

    my ($report_type) = $self->_rearrange( ['REPORT_TYPE'], @args );

    # Remember the report type only when the caller actually passed one.
    return defined $report_type
      && ( $self->{'_reporttype'} = $report_type );
}

=head2 attach_EventHandler

 Title   : attach_EventHandler
 Usage   : $searchio->attach_EventHandler($handler);
 Function: Attaches the event handler via the parent class and keeps a
           reference to it in $self->{'_handler_cache'}
 Returns : nothing
 Args    : the event handler object to attach

=cut

sub attach_EventHandler {
    my $self          = shift;
    my $event_handler = shift;

    $self->SUPER::attach_EventHandler($event_handler);

    # The handler is used a lot during the parse, so it is cached on the
    # object as an optimization.
    $self->{'_handler_cache'} = $event_handler;

    return;
}

=head2 next_result

 Title   : next_result
 Usage   : my $result = $searchio->next_result;
 Function: Returns the next Result from a search
 Returns : Bio::Search::Result::ResultI object
 Args    : none

=cut

sub next_result {
    my ($self) = @_;
    my $v      = $self->verbose;
    my $data   = '';
    my $flavor = '';
    $self->{'_seentop'} = 0;     # start next report at top

    my ( $reporttype, $seenquery, $reportline, $reportversion );
    my ( $seeniteration, $found_again );
    my $incl_threshold = $self->inclusion_threshold;
    my $bl2seq_fix;
    $self->start_document();  # let the fun begin...
    my (@hit_signifs);
    my $gapped_stats = 0;    # for switching between gapped/ungapped
                             # lambda, K, H

Bio/SearchIO/blast.pm  view on Meta::CPAN

            if ($reporttype =~ /RPS-BLAST/) {
                $reporttype .= '(BLASTP)'; # default RPS-BLAST type
            }
            $reportline = $_;   # to fix the fact that RPS-BLAST output is wrong
            $self->element(
                {
                    'Name' => 'BlastOutput_program',
                    'Data' => $reporttype
                }
            );

            $self->element(
                {
                    'Name' => 'BlastOutput_version',
                    'Data' => $reportversion
                }
            );
            $self->element(
                {
                    'Name' => 'BlastOutput_inclusion-threshold',
                    'Data' => $incl_threshold
                }
            );
        }
        # parse the BLAST algorithm reference
        elsif(/^Reference:\s+(.*)$/) {
            # want to preserve newlines for the BLAST algorithm reference
            my $algorithm_reference = "$1\n";
            $_ = $self->_readline;
            # while the current line, does not match an empty line, a RID:, a
            # Database:, or a query definition line (Query=) we are still
            # looking at the algorithm_reference, append it to what we parsed so
            # far
            while($_ !~ /^$/ && $_ !~ /^RID:/ && $_ !~ /^Database:/ && $_ !~ /^Query=/) {
                $algorithm_reference .= "$_";
                $_ = $self->_readline;
            }
            # if we exited the while loop, we saw an empty line, a RID:, or a
            # Database:, so push it back
            $self->_pushback($_);
            $self->element(
                {
                    'Name' => 'BlastOutput_algorithm-reference',
                    'Data' => $algorithm_reference
                }
            );
        }
        # parse BLAST RID (Request ID)
        elsif(/^RID:\s+(.*)$/) {
            my $rid = $1;
            $self->element(
                {
                    'Name' => 'BlastOutput_rid',
                    'Data' => $rid
                }
            );
        }
        # added Windows workaround for bug 1985
        elsif (/^(Searching|Results from round)/) {
            next unless $1 =~ /Results from round/;
            $self->debug("blast.pm: Possible psi blast iterations found...\n");

            $self->in_element('hsp')
              && $self->end_element( { 'Name' => 'Hsp' } );
            $self->in_element('hit')
              && $self->end_element( { 'Name' => 'Hit' } );
            if ( defined $seeniteration ) {
                $self->within_element('iteration')
                  && $self->end_element( { 'Name' => 'Iteration' } );
                $self->start_element( { 'Name' => 'Iteration' } );
            }
            else {
                $self->start_element( { 'Name' => 'Iteration' } );
            }
            $seeniteration = 1;
        }
        elsif (/^Query=\s*(.*)$/) {
            my $q    = $1;
            $self->debug("blast.pm: Query= found...$_\n");
            my $size = 0;
            if ( defined $seenquery ) {
                $self->_pushback($_);
                $self->_pushback($reportline) if $reportline;
                last PARSER;
            }
            if ( !defined $reporttype ) {
                $self->_start_blastoutput;
                if ( defined $seeniteration ) {
                    $self->in_element('iteration')
                      && $self->end_element( { 'Name' => 'Iteration' } );
                    $self->start_element( { 'Name' => 'Iteration' } );
                }
                else {
                    $self->start_element( { 'Name' => 'Iteration' } );
                }
                $seeniteration = 1;
            }
            $seenquery = $q;
            $_ = $self->_readline;
            while ( defined($_) ) {
                if (/^Database:/) {
                    $self->_pushback($_);
                    last;
                }
                # below line fixes length issue with BLAST v2.2.13; still works
                # with BLAST v2.2.12
                if ( /\((\-?[\d,]+)\s+letters.*\)/ || /^Length=(\-?[\d,]+)/ ) {
                    $size = $1;
                    $size =~ s/,//g;
                    last;
                }
                else {
                    # bug 2391
                    $q .= ($q =~ /\w$/ && $_ =~ /^\w/) ? " $_" : $_;
                    $q =~ s/\s+/ /g; # this catches the newline as well
                    $q =~ s/^ | $//g;
                }

                $_ = $self->_readline;
            }
            chomp($q);



( run in 0.568 second using v1.01-cache-2.11-cpan-71847e10f99 )