Bio-MUST-Apps-FortyTwo

 view release on metacpan or  search on metacpan

lib/Bio/MUST/Apps/FortyTwo/OrgProcessor.pm  view on Meta::CPAN

    my $ap = $self->ali_proc;
    my $rp = $ap->run_proc;

    # ignore if no .para file in use
    my $blastdb = $ap->para_blastdb;
    return unless $blastdb;

    my $args = $rp->blast_args_for('templates') // {};
    $args->{-outfmt} = 6;
    $args->{-max_target_seqs} = 1;

    my $orthologous_seqs = $self->orthologous_seqs;
    $args->{-query_gencode} = $self->code           # if BLASTX
        if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
    my $parser = $blastdb->blast($orthologous_seqs, $args);
    ##### [ORG] BLASTX (or BLASTP/N): 'PARA ' . $parser->filename

    my %para_score_for;
    while (my $hsp = $parser->next_query) {
        my $transcript_acc = $orthologous_seqs->long_id_for( $hsp->query_id );
        my $hit_id = $blastdb->long_id_for( $hsp->hit_id );
        my $score = $hsp->bit_score;
        ######## [DEBUG] orthologue: $transcript_acc
        ######## [DEBUG] PARA hit: $hit_id
        ######## [DEBUG] score: $score
        $para_score_for{$transcript_acc} = $score;
    }
    $parser->remove unless $rp->debug_mode;

    return \%para_score_for;
}


sub _build_tol_scores {
    my $self = shift;

    my $ap = $self->ali_proc;
    my $rp = $ap->run_proc;

    # ignore if no tol_check in use
    my $blastdb = $rp->tol_blastdb;
    return unless $blastdb;

    my $args = $rp->blast_args_for('templates') // {};
    $args->{-outfmt} = 5;
    $args->{-max_target_seqs} = 250;

    my $orthologous_seqs = $self->orthologous_seqs;
    $args->{-query_gencode} = $self->code           # if BLASTX
        if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
    my $parser = $blastdb->blast($orthologous_seqs, $args);
    ##### [ORG] XML BLASTX (or BLASTP/N): 'TOL ' . $parser->filename

    # abort if no hit
    my $bo = $parser->blast_output;
    return unless $bo;

    my %tol_score_for;

    ORTHOLOGUE:
    for my $orthologue ($bo->all_iterations) {
        next ORTHOLOGUE unless $orthologue->count_hits;
        # Note: this should never happen...

        my $query_def = $orthologue->query_def;
        my $transcript_acc = $orthologous_seqs->long_id_for($query_def);
        ######## [DEBUG] orthologue: $transcript_acc

        TOL_HIT:
        for my $hit ($orthologue->all_hits) {

            # check taxonomy of hit: skip SELF-like hits
            my $hit_id = SeqId->new( full_id => $hit->id );
            ######## [DEBUG] TOL hit: $hit_id->full_id
            next TOL_HIT if $self->is_allowed($hit_id);

            # fetch score for first non-SELF-like hit
            my $score = $hit->get_hsp(0)->bit_score;
            ######## [DEBUG] score: $score
            $tol_score_for{$transcript_acc} = $score;
            next ORTHOLOGUE;
        }
    }
    $parser->remove unless $rp->debug_mode;

    return \%tol_score_for;
}


sub _build_aligned_seqs {               ## no critic (ProhibitExcessComplexity)
    my $self = shift;

    ##### [ORG] Aligning orthologues...

    my $ap = $self->ali_proc;
    my $rp = $ap->run_proc;

    my $args = $rp->blast_args_for('templates') // {};
    $args->{-outfmt} = 5;
    $args->{-max_target_seqs} = $rp->tax_max_hits * 2;

    my $orthologous_seqs = $self->orthologous_seqs;
    my $blastdb = $ap->blastdb;
    $args->{-query_gencode} = $self->code           # if BLASTX
        if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
    my $parser = $blastdb->blast($orthologous_seqs, $args);
    ##### [ORG] XML BLASTX (or BLASTP/N): $parser->filename

    my $aligned_seqs = Ali->new();

    # abort if no hit
    my $bo = $parser->blast_output;
    return $aligned_seqs unless $bo;

    ORTHOLOGUE:
    for my $orthologue ($bo->all_iterations) {
        unless ($orthologue->count_hits) {
            ###### [ORG] skipped orthologue due to lack of significant template
            next ORTHOLOGUE;
        }   # TODO: investigate why this should happen at all...

        # fetch tax_line from orthologue
        # Note: .para check is also done in this sub (hence the next below)
        my $tax_line = $self->_fetch_tax_line_for_transcript($orthologue);
        next ORTHOLOGUE unless $tax_line;

        # extract contam_org and transcript_id from tax_line...
        my $contam_org = $tax_line->contam_org;
        my $transcript_id = $tax_line->seq_id . '%s';   # chunk placeholder
           $transcript_id .= "...$contam_org" if $contam_org;
           $transcript_id .= '#NEW#';

        # ... and use it "as is" to store tax_line in tax_report (no chunk tag)
        my $tax_line_id = sprintf $transcript_id, q{};
        $ap->set_tax_line( $tax_line_id => $tax_line );

        # extract transcript_seq from tax_line
        my $transcript_seq = Seq->new(
            seq_id => $tax_line_id,     # same id without alignment
            seq    => $tax_line->seq
        );

        # skip both orthologue alignment and integration if metagenomic mode
        next ORTHOLOGUE if $rp->run_mode eq 'metagenomic';

        # optionally add transcript_seq as is (without alignment)
        if ($rp->aligner_mode eq 'off') {

            ####### [ORG] Adding: $transcript_seq->full_id

            $aligned_seqs->add_seq($transcript_seq);
            next ORTHOLOGUE;
        }

        # build a template_seq list as long as query coverage improves
        # exonerate will try to use the longest template for alignment
        # while BLAST will use each hit in turn (possibly as a fall-back)
        my @templates;
        my @template_seqs;
        my $best_coverage = 0;

        TEMPLATE:
        for my $template ($orthologue->all_hits) {

            # fetch template full_id
            my $template_id = SeqId->new(
                full_id => $blastdb->long_id_for($template->def)
            );

            # optionally skip template if from same org as the orthologue
            if ($rp->ali_skip_self eq 'on') {
                if ($template_id->full_org eq $transcript_seq->full_org) {
                    ###### [ORG] skipped same-org template due to ali_skip_self
                    next TEMPLATE;
                }
            }



( run in 0.652 second using v1.01-cache-2.11-cpan-96521ef73a4 )