Bio-MUST-Apps-FortyTwo
view release on metacpan or search on metacpan
lib/Bio/MUST/Apps/Leel/OrgProcessor.pm view on Meta::CPAN
return Temporary->new( seqs => \@seqs );
}
sub _build_protein_seq_mapper {
my $self = shift;
my @protein_seq_ids = $self->protein_seqs->all_seq_ids;
# abbr_ids are protein ids turned to query_ids for BLAST report (seq1 etc)
my @abbr_ids = map {
$self->abbr_id_for( $_->full_id )
} @protein_seq_ids;
# TODO: put regex in common with Mick's one-on-one in BMC
# long_ids are bare accessions extracted from protein ids (Ahyp8251 etc)
# these protein accessions will be matched against transcript accessions
my @long_ids = map {
$_->accession
=~ s{ $TAIL_42 }{}xmsgr
} @protein_seq_ids;
return IdMapper->new(
long_ids => \@long_ids,
abbr_ids => \@abbr_ids,
);
}
sub _build_transcript_seqs {
my $self = shift;
##### [ORG] Searching for transcripts...
my $ap = $self->ali_proc;
my $rp = $ap->run_proc;
my $args = $rp->blast_args_for('transcripts') // {};
$args->{-outfmt} = 5;
$args->{-max_target_seqs} = 5
unless defined $args->{-max_target_seqs};
$args->{-db_gencode} = $self->code;
my @seqs;
for my $bank ($self->all_banks) {
###### [ORG] Processing BANK: $bank
my $file = file( $rp->bank_dir, $bank );
my $blastdb = Bio::MUST::Drivers::Blast::Database->new( file => $file );
my $parser = $blastdb->tblastn($self->protein_seqs, $args);
###### [ORG] XML TBLASTN: $bank . q{ } . $parser->filename
my $bo = $parser->blast_output;
# collect transcript ids and ranges covered by proteins
# tie might help making 1331 completely deterministic
tie my %mask_for, 'Tie::IxHash';
my %hit_for;
PROTEIN:
for my $protein ($bo->all_iterations) {
# get protein accession
my $protein_acc = $self->accession_for( $protein->query_def );
TRANSCRIPT:
for my $transcript ($protein->all_hits) {
# get (corresponding?) transcript accession
my @fields = split /\|/xms, $transcript->id;
my $transcript_acc = pop @fields;
# check that both accessions are identical
my $rank = $transcript->num;
unless ($protein_acc eq $transcript_acc) {
####### [ORG] ids do not match: "$protein_acc vs. $transcript_acc (hit $rank)"
# optionally try next transcript if accessions are different
next TRANSCRIPT if $rp->id_match_mode eq 'enforce';
}
# mention success only in case of previous failure
elsif ($rank > 1) {
####### [ORG] ids now do match: "$protein_acc (hit $rank)"
}
# collect protein id and mask for transcript
$self->_collect_hsps(
protein => $protein, # Blast::Xml::Iteration
transcript => $transcript, # Blast::Xml::Hit
mask_for => \%mask_for,
hit_for => \%hit_for,
);
# no need to examine remaining transcripts
next PROTEIN;
}
}
$parser->remove unless $rp->debug_mode;
# fetch (and optionally trim) transcripts to their max range
my @hits = $self->_fetch_and_trim_hits(
blastdb => $blastdb,
mask_for => \%mask_for,
hit_for => \%hit_for,
);
######## [DEBUG] transcripts: display( map { $_->seq } @hits )
# add transcripts from current bank to seqs
push @seqs, @hits
}
# check that all transcripts were indeed recovered
my $missing_n = $self->protein_seqs->count_seqs - @seqs;
carp "[ORG] Warning: cannot find $missing_n transcript(s):"
. ' skipping it (them)!' if $missing_n;
return Ali->new( seqs => \@seqs );
}
( run in 1.119 second using v1.01-cache-2.11-cpan-71847e10f99 )