Bio-MUST-Apps-FortyTwo
view release on metacpan or search on metacpan
lib/Bio/MUST/Apps/FortyTwo/OrgProcessor.pm view on Meta::CPAN
my $ap = $self->ali_proc;
my $rp = $ap->run_proc;
# ignore if no .para file in use
my $blastdb = $ap->para_blastdb;
return unless $blastdb;
my $args = $rp->blast_args_for('templates') // {};
$args->{-outfmt} = 6;
$args->{-max_target_seqs} = 1;
my $orthologous_seqs = $self->orthologous_seqs;
$args->{-query_gencode} = $self->code # if BLASTX
if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
my $parser = $blastdb->blast($orthologous_seqs, $args);
##### [ORG] BLASTX (or BLASTP/N): 'PARA ' . $parser->filename
my %para_score_for;
while (my $hsp = $parser->next_query) {
my $transcript_acc = $orthologous_seqs->long_id_for( $hsp->query_id );
my $hit_id = $blastdb->long_id_for( $hsp->hit_id );
my $score = $hsp->bit_score;
######## [DEBUG] orthologue: $transcript_acc
######## [DEBUG] PARA hit: $hit_id
######## [DEBUG] score: $score
$para_score_for{$transcript_acc} = $score;
}
$parser->remove unless $rp->debug_mode;
return \%para_score_for;
}
sub _build_tol_scores {
my $self = shift;
my $ap = $self->ali_proc;
my $rp = $ap->run_proc;
# ignore if no tol_check in use
my $blastdb = $rp->tol_blastdb;
return unless $blastdb;
my $args = $rp->blast_args_for('templates') // {};
$args->{-outfmt} = 5;
$args->{-max_target_seqs} = 250;
my $orthologous_seqs = $self->orthologous_seqs;
$args->{-query_gencode} = $self->code # if BLASTX
if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
my $parser = $blastdb->blast($orthologous_seqs, $args);
##### [ORG] XML BLASTX (or BLASTP/N): 'TOL ' . $parser->filename
# abort if no hit
my $bo = $parser->blast_output;
return unless $bo;
my %tol_score_for;
ORTHOLOGUE:
for my $orthologue ($bo->all_iterations) {
next ORTHOLOGUE unless $orthologue->count_hits;
# Note: this should never happen...
my $query_def = $orthologue->query_def;
my $transcript_acc = $orthologous_seqs->long_id_for($query_def);
######## [DEBUG] orthologue: $transcript_acc
TOL_HIT:
for my $hit ($orthologue->all_hits) {
# check taxonomy of hit: skip SELF-like hits
my $hit_id = SeqId->new( full_id => $hit->id );
######## [DEBUG] TOL hit: $hit_id->full_id
next TOL_HIT if $self->is_allowed($hit_id);
# fetch score for first non-SELF-like hit
my $score = $hit->get_hsp(0)->bit_score;
######## [DEBUG] score: $score
$tol_score_for{$transcript_acc} = $score;
next ORTHOLOGUE;
}
}
$parser->remove unless $rp->debug_mode;
return \%tol_score_for;
}
sub _build_aligned_seqs { ## no critic (ProhibitExcessComplexity)
my $self = shift;
##### [ORG] Aligning orthologues...
my $ap = $self->ali_proc;
my $rp = $ap->run_proc;
my $args = $rp->blast_args_for('templates') // {};
$args->{-outfmt} = 5;
$args->{-max_target_seqs} = $rp->tax_max_hits * 2;
my $orthologous_seqs = $self->orthologous_seqs;
my $blastdb = $ap->blastdb;
$args->{-query_gencode} = $self->code # if BLASTX
if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
my $parser = $blastdb->blast($orthologous_seqs, $args);
##### [ORG] XML BLASTX (or BLASTP/N): $parser->filename
my $aligned_seqs = Ali->new();
# abort if no hit
my $bo = $parser->blast_output;
return $aligned_seqs unless $bo;
ORTHOLOGUE:
for my $orthologue ($bo->all_iterations) {
unless ($orthologue->count_hits) {
###### [ORG] skipped orthologue due to lack of significant template
next ORTHOLOGUE;
} # TODO: investigate why this should happen at all...
# fetch tax_line from orthologue
# Note: .para check is also done in this sub (hence the next below)
my $tax_line = $self->_fetch_tax_line_for_transcript($orthologue);
next ORTHOLOGUE unless $tax_line;
# extract contam_org and transcript_id from tax_line...
my $contam_org = $tax_line->contam_org;
my $transcript_id = $tax_line->seq_id . '%s'; # chunk placeholder
$transcript_id .= "...$contam_org" if $contam_org;
$transcript_id .= '#NEW#';
# ... and use it "as is" to store tax_line in tax_report (no chunk tag)
my $tax_line_id = sprintf $transcript_id, q{};
$ap->set_tax_line( $tax_line_id => $tax_line );
# extract transcript_seq from tax_line
my $transcript_seq = Seq->new(
seq_id => $tax_line_id, # same id without alignment
seq => $tax_line->seq
);
# skip both orthologue alignment and integration if metagenomic mode
next ORTHOLOGUE if $rp->run_mode eq 'metagenomic';
# optionally add transcript_seq as is (without alignment)
if ($rp->aligner_mode eq 'off') {
####### [ORG] Adding: $transcript_seq->full_id
$aligned_seqs->add_seq($transcript_seq);
next ORTHOLOGUE;
}
# build a template_seq list as long as query coverage improves
# exonerate will try to use the longest template for alignment
# while BLAST will use each hit in turn (possibly as a fall-back)
my @templates;
my @template_seqs;
my $best_coverage = 0;
TEMPLATE:
for my $template ($orthologue->all_hits) {
# fetch template full_id
my $template_id = SeqId->new(
full_id => $blastdb->long_id_for($template->def)
);
# optionally skip template if from same org as the orthologue
if ($rp->ali_skip_self eq 'on') {
if ($template_id->full_org eq $transcript_seq->full_org) {
###### [ORG] skipped same-org template due to ali_skip_self
next TEMPLATE;
}
}
( run in 0.652 second using v1.01-cache-2.11-cpan-96521ef73a4 )