BioPerl

 view release on metacpan or  search on metacpan

Bio/Search/HSP/GenericHSP.pm  view on Meta::CPAN


=head2 Support 

Please direct usage questions or support issues to the mailing list:

I<bioperl-l@bioperl.org>

rather than to the module maintainer directly. Many experienced and 
reponsive experts will be able look at the problem and quickly 
address it. Please include a thorough description of the problem 
with code and data examples if at all possible.

=head2 Reporting Bugs

Report bugs to the Bioperl bug tracking system to help us keep track
of the bugs and their resolution. Bug reports can be submitted via the
web:

  https://github.com/bioperl/bioperl-live/issues

=head1 AUTHOR - Jason Stajich and Steve Chervitz

Email jason-at-bioperl.org
Email sac-at-bioperl.org

=head1 CONTRIBUTORS

Sendu Bala, bix@sendu.me.uk

=head1 APPENDIX

The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _

=cut

# Let the code begin...

package Bio::Search::HSP::GenericHSP;
use strict;

use Bio::Root::Root;
use Bio::SeqFeature::Similarity;

use base qw(Bio::Search::HSP::HSPI);

=head2 new

 Title   : new
 Usage   : my $obj = Bio::Search::HSP::GenericHSP->new();
 Function: Builds a new Bio::Search::HSP::GenericHSP object
 Returns : Bio::Search::HSP::GenericHSP
 Args    : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
           -evalue    => evalue
           -pvalue    => pvalue
           -bits      => bit value for HSP
           -score     => score value for HSP (typically z-score but depends on
                                              analysis)
           -hsp_length=> Length of the HSP (including gaps)
           -identical => # of residues that that matched identically
           -percent_identity => (optional) percent identity
           -conserved => # of residues that matched conservatively
                           (only protein comparisons;
                            conserved == identical in nucleotide comparisons)
           -hsp_gaps   => # of gaps in the HSP
           -query_gaps => # of gaps in the query in the alignment
           -hit_gaps   => # of gaps in the subject in the alignment
           -query_name  => HSP Query sequence name (if available)
           -query_start => HSP Query start (in original query sequence coords)
           -query_end   => HSP Query end (in original query sequence coords)
           -query_length=> total length of the query sequence
           -query_seq   => query sequence portion of the HSP
           -query_desc  => textual description of the query
           -hit_name    => HSP Hit sequence name (if available)
           -hit_start   => HSP Hit start (in original hit sequence coords)
           -hit_end     => HSP Hit end (in original hit sequence coords)
           -hit_length  => total length of the hit sequence
           -hit_seq     => hit sequence portion of the HSP
           -hit_desc    => textual description of the hit
           -homology_seq=> homology sequence for the HSP
           -hit_frame   => hit frame (only if hit is translated protein)
           -query_frame => query frame (only if query is translated protein)
           -rank        => HSP rank
           -links       => HSP links information (WU-BLAST only)
           -hsp_group   => HSP Group informat (WU-BLAST only)
           -gap_symbol  => symbol representing a gap (default = '-')
           -hit_features=> string of features found in or near HSP hit
                           region (reported in some BLAST text output,
                           v. 2.2.13 and up)
           -stranded    => If the algorithm isn't known (i.e. defaults to
                           'generic'), setting this will indicate start/end
                           coordinates are to be used to determine the strand
                           for 'query', 'subject', 'hit', 'both', or 'none'
                           (default = 'none')

=cut

sub new {
    my($class,%args) = @_;

    # don't pass anything to SUPER; complex hierarchy results in lots of work
    # for nothing
    
    my $self = $class->SUPER::new();

    # for speed, don't use _rearrange and just store all input data directly
    # with no method calls and no work done. work can be carried
    # out just-in-time later if desired
    while (my ($arg, $value) = each %args) {
        $arg =~ tr/a-z\055/A-Z/d;
        $self->{$arg} = $value;
    }
    my $bits = $self->{BITS};

    defined $self->{VERBOSE} && $self->verbose($self->{VERBOSE});
    if (exists $self->{GAP_SYMBOL}) {
        # not checking anything else but the length (must be 1 as only one gap
        # symbol allowed currently); can add in support for symbol checks or
        # multiple symbols later if needed
        $self->throw("Gap symbol must be of length 1") if
            CORE::length($self->{GAP_SYMBOL}) != 1;

Bio/Search/HSP/GenericHSP.pm  view on Meta::CPAN

        $value = $previous = '' unless defined $value;
        $self->{PP_SEQ} = $value;
        # do some housekeeping so we know when to
        # re-run _calculate_seq_positions
        $self->{'_sequenceschanged'} = 1;
    }
    return $previous;
}

=head2 length

 Title    : length
 Usage    : my $len = $hsp->length( ['query'|'hit'|'total'] );
 Function : Returns the length of the query or hit in the alignment
            (without gaps)
            or the aggregate length of the HSP (including gaps;
            this may be greater than either hit or query )
 Returns  : integer
 Args     : arg 1: 'query' = length of query seq (without gaps)
                   'hit'   = length of hit seq (without gaps) (synonyms: sbjct, subject)
                   'total' = length of alignment (with gaps)
                   default = 'total'
            arg 2: [optional] integer length value to set for specific type

=cut

sub length {

    my $self = shift;
    my $type = shift;

    $type = 'total' unless defined $type;
    $type = lc $type;

    if( $type =~ /^q/i ) {
        return $self->query()->length(shift);
    } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
        return $self->hit()->length(shift);
    } else {
        my $v = shift;
        if( defined $v ) {
            $self->{HSP_LENGTH} = $v;
        }
        return $self->{HSP_LENGTH};
   }
    return 0; # should never get here
}

=head2 hsp_length

 Title   : hsp_length
 Usage   : my $len = $hsp->hsp_length()
 Function: shortcut  length('hsp')
 Returns : floating point between 0 and 100
 Args    : none

=cut

sub hsp_length { return shift->length('hsp', shift); }

=head2 percent_identity

 Title   : percent_identity
 Usage   : my $percentid = $hsp->percent_identity()
 Function: Returns the calculated percent identity for an HSP
 Returns : floating point between 0 and 100
 Args    : none


=cut

sub percent_identity {
    my $self = shift;

    unless ($self->{_did_prepi}) {
        $self->_pre_pi;
    }

    return $self->SUPER::percent_identity(@_);
}

=head2 frame

 Title   : frame
 Usage   : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
 Function: Set the Frame for both query and subject and insure that
           they agree.
           This overrides the frame() method implementation in
           FeaturePair.
 Returns : array of query and subject frame if return type wants an array
           or query frame if defined or subject frame if not defined
 Args    : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
           'query' to retrieve the query frame 
           'list' or 'array' to retrieve both query and hit frames together
 Note    : Frames are stored in the GFF way (0-2) not 1-3
           as they are in BLAST (negative frames are deduced by checking
                                 the strand of the query or hit)

=cut

# Note: changed 4/19/08 - bug 2485
#
# frame() is supposed to be a getter/setter (as is implied by the Function desc
# above; this is also consistent with Bio::SeqFeature::SimilarityPair).  Also,
# the API is not consistent with the other HSP/SimilarityPair methods such as
# strand(), start(), end(), etc. 
#
# In order to make this consistent with other methods all work is now done
# when the features are instantiated and not delayed.  We compromise by
# defaulting back 'to hit' w/o passed args.  Setting is now allowed.  

sub frame {
    my $self = shift;
    my $val = shift;
    if (!defined $val) {
        # unfortunately, w/o args we need to warn about API changes
        $self->warn("API for frame() has changed.\n".
                    "Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
                    "returning query frame");
        $val = 'query';
    }
    $val =~ s/^\s+//;

    if( $val =~ /^q/i ) {
        return $self->query->frame(@_);
    } elsif( $val =~ /^hi|^s/i ) {
        return $self->hit->frame(@_);
    } elsif (  $val =~ /^list|array/i ) {
        return ($self->query->frame($_[0]), 
            $self->hit->frame($_[1]) );
    } elsif ( $val =~ /^\d+$/) {
        # old API i.e. frame($query_frame, $hit_frame). This catches all but one
        # case, where no arg is passed (so the hit is wanted).  
        $self->warn("API for frame() has changed.\n".
                    "Please refer to documentation for Bio::Search::HSP::GenericHSP");
        wantarray ? 
        return ($self->query->frame($val), 
            $self->hit->frame(@_) ) :
        return $self->hit->frame($val,@_);

Bio/Search/HSP/GenericHSP.pm  view on Meta::CPAN

            $strand = 1;
        }
        else {
            $strand = undef;
        }
    }
    else {
        if ($hitfactor) {
            $strand = -1;
        }
        else {
            $strand = undef;
        }
        ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
    }

    my $sim2 = $self->{_sim2} || Bio::SeqFeature::Similarity->new();
    $sim2->start($hs);
    $sim2->end($he);
    $sim2->significance($self->{EVALUE});
    $sim2->bits($self->{BITS});
    $sim2->score($self->{SCORE});
    $sim2->strand($strand);
    $sim2->seq_id($self->{HIT_NAME});
    $sim2->seqlength($self->{HIT_LENGTH});
    $sim2->source_tag($self->{ALGORITHM});
    $sim2->seqdesc($self->{HIT_DESC});
    $sim2->add_tag_value('meta', $self->{META}) if $self->can('meta');
    my $hframe = $self->{HIT_FRAME};
    
    if (defined $strand && !defined $hframe && $hitfactor) {
        $hframe = ( $hs % 3 ) * $strand;
    }
    elsif (!defined $strand) {
        $hframe = 0;
    }
    
    if( $hframe =~ /^([+-])?([0-3])/ ) {
        my $dir = $1 || '+';
        if($hframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
            $self->warn("Subject frame ($hframe) did not match strand of subject ($strand)");
        }
        $hframe = $2 != 0 ? $2 - 1 : $2;
    }  else {
        $self->warn("Unknown subject frame ($hframe)");
        $hframe = 0;
    }
    
    $sim2->frame($hframe);
    $self->SUPER::feature2($sim2);

    $self->{_created_sff} = 1;
    $self->{_making_sff} = 0;
}

# before calling the num_* methods
sub _pre_similar_stats {
    my $self = shift;
    my $identical = $self->{IDENTICAL};
    my $conserved = $self->{CONSERVED};
    my $percent_id = $self->{PERCENT_IDENTITY};

    if (! defined $identical) {
        if (! defined $percent_id) {
            $self->warn("Did not defined the number of identical matches or overall percent identity in the HSP; assuming 0");
            $identical = 0;
        }
        else {
            $identical = sprintf("%.0f",$percent_id * $self->{HSP_LENGTH});
        }
    }

    if (! defined $conserved) {
        $self->warn("Did not define the number of conserved matches in the HSP; assuming conserved == identical ($identical)")
            if( $self->{ALGORITHM} !~ /^((FAST|BLAST)N)|EXONERATE|SIM4|AXT|PSL|BLAT|BLASTZ|WABA/oi);
        $conserved = $identical;
    }
    $self->{IDENTICAL} = $identical;
    $self->{CONSERVED} = $conserved;
    $self->{_did_presimilar} = 1;
}

# before calling the frac_* methods

sub _pre_frac {
    my $self = shift;
    
    my $hsp_len = $self->{HSP_LENGTH};
    my $hit_len = $self->{HIT_LENGTH};
    my $query_len = $self->{QUERY_LENGTH};

    my $identical = $self->num_identical;
    my $conserved = $self->num_conserved;

    $self->{_did_prefrac} = 1;
    my $logical;
    if( $hsp_len ) {
        $self->length('total', $hsp_len);
        $logical = $self->_logical_length('total');
        $self->frac_identical( 'total', $identical / $hsp_len);
        $self->frac_conserved( 'total', $conserved / $hsp_len);
    }
    if( $hit_len ) {
        $logical = $self->_logical_length('hit');
        $self->frac_identical( 'hit', $identical / $logical);
        $self->frac_conserved( 'hit', $conserved / $logical);
    }
    if( $query_len ) {
        $logical = $self->_logical_length('query');
        $self->frac_identical( 'query', $identical / $logical) ;
        $self->frac_conserved( 'query', $conserved / $logical);
    }
}

# before calling gaps()
# This relies first on passed parameters (parser-dependent), then on gaps
# calculated by seq_inds() (if set), then falls back to directly checking
# for '-' or '.' as a last resort

sub _pre_gaps {
    my $self = shift;
    my $query_gaps = $self->{QUERY_GAPS};
    my $query_seq = $self->{QUERY_SEQ};
    my $hit_gaps = $self->{HIT_GAPS};
    my $hit_seq = $self->{HIT_SEQ};
    my $gaps = $self->{HSP_GAPS};

    $self->{_did_pregaps} = 1; # well, we're in the process; avoid recursion
    if( defined $query_gaps ) {
        $self->gaps('query', $query_gaps);
    } elsif( defined $query_seq ) {
        my $qg = (defined $self->{'_query_offset'}) ? $self->seq_inds('query','gaps')
               : ($self->algorithm eq 'ERPIN')      ? scalar( $hit_seq =~ tr/\-//)
               :  scalar( $query_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
        my $offset = $self->{'_query_offset'} || 1;
        $self->gaps('query', $qg/$offset);
    }
    if( defined $hit_gaps ) {
        $self->gaps('hit', $hit_gaps);
    } elsif( defined $hit_seq ) {
        my $hg = (defined $self->{'_sbjct_offset'}) ? $self->seq_inds('hit','gaps')
               : ($self->algorithm eq 'ERPIN')      ? scalar( $hit_seq =~ tr/\-//)
               :  scalar( $hit_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
        my $offset = $self->{'_sbjct_offset'} || 1;
        $self->gaps('hit', $hg/$offset);
    }
    if( ! defined $gaps ) {
        $gaps = $self->gaps("query") + $self->gaps("hit");
    }
    $self->gaps('total', $gaps);
}

# before percent_identity
sub _pre_pi {
    my $self = shift;
    $self->{_did_prepi} = 1;
    $self->percent_identity($self->{PERCENT_IDENTITY} || $self->frac_identical('total')*100) if( $self->{HSP_LENGTH} > 0 );
}

1;



( run in 1.290 second using v1.01-cache-2.11-cpan-39bf76dae61 )