Bio-MUST-Apps-FortyTwo

 view release on metacpan or  search on metacpan

lib/Bio/MUST/Apps/Leel/OrgProcessor.pm  view on Meta::CPAN

    return Temporary->new( seqs => \@seqs );
}


sub _build_protein_seq_mapper {
    my $self = shift;

    my @protein_seq_ids = $self->protein_seqs->all_seq_ids;

    # abbr_ids are protein ids turned to query_ids for BLAST report (seq1 etc)
    my @abbr_ids = map {
        $self->abbr_id_for( $_->full_id )
    } @protein_seq_ids;

    # TODO: put regex in common with Mick's one-on-one in BMC

    # long_ids are bare accessions extracted from protein ids (Ahyp8251 etc)
    # these protein accessions will be matched against transcript accessions
    my @long_ids = map {
        $_->accession
            =~ s{ $TAIL_42 }{}xmsgr
    } @protein_seq_ids;

    return IdMapper->new(
        long_ids => \@long_ids,
        abbr_ids => \@abbr_ids,
    );
}


sub _build_transcript_seqs {
    my $self = shift;

    ##### [ORG] Searching for transcripts...

    my $ap = $self->ali_proc;
    my $rp = $ap->run_proc;

    my $args = $rp->blast_args_for('transcripts') // {};
    $args->{-outfmt} = 5;
    $args->{-max_target_seqs} = 5
        unless defined $args->{-max_target_seqs};
    $args->{-db_gencode} = $self->code;

    my @seqs;
    for my $bank ($self->all_banks) {

        ###### [ORG] Processing BANK: $bank
        my $file = file( $rp->bank_dir, $bank );
        my $blastdb = Bio::MUST::Drivers::Blast::Database->new( file => $file );
        my $parser = $blastdb->tblastn($self->protein_seqs, $args);
        ###### [ORG] XML TBLASTN: $bank . q{ } . $parser->filename
        my $bo = $parser->blast_output;

        # collect transcript ids and ranges covered by proteins
        # tie might help making 1331 completely deterministic
        tie my %mask_for, 'Tie::IxHash';
        my %hit_for;

        PROTEIN:
        for my $protein ($bo->all_iterations) {

            # get protein accession
            my $protein_acc = $self->accession_for( $protein->query_def );

            TRANSCRIPT:
            for my $transcript ($protein->all_hits) {

                # get (corresponding?) transcript accession
                my @fields = split /\|/xms, $transcript->id;
                my $transcript_acc = pop @fields;

                # check that both accessions are identical
                my $rank = $transcript->num;
                unless ($protein_acc eq $transcript_acc) {
                    ####### [ORG] ids do not match: "$protein_acc vs. $transcript_acc (hit $rank)"

                    # optionally try next transcript if accessions are different
                    next TRANSCRIPT if $rp->id_match_mode eq 'enforce';
                }

                # mention success only in case of previous failure
                elsif ($rank > 1)  {
                    ####### [ORG] ids now do match: "$protein_acc (hit $rank)"
                }

                # collect protein id and mask for transcript
                $self->_collect_hsps(
                    protein    => $protein,         # Blast::Xml::Iteration
                    transcript => $transcript,      # Blast::Xml::Hit
                    mask_for   => \%mask_for,
                    hit_for    => \%hit_for,
                );

                # no need to examine remaining transcripts
                next PROTEIN;
            }
        }
        $parser->remove unless $rp->debug_mode;

        # fetch (and optionally trim) transcripts to their max range
        my @hits = $self->_fetch_and_trim_hits(
            blastdb  => $blastdb,
            mask_for => \%mask_for,
            hit_for  => \%hit_for,
        );
        ######## [DEBUG] transcripts: display( map { $_->seq } @hits )

        # add transcripts from current bank to seqs
        push @seqs, @hits
    }

    # check that all transcripts were indeed recovered
    my $missing_n = $self->protein_seqs->count_seqs - @seqs;
    carp "[ORG] Warning: cannot find $missing_n transcript(s):"
        . ' skipping it (them)!' if $missing_n;

    return Ali->new( seqs => \@seqs );
}




( run in 1.119 second using v1.01-cache-2.11-cpan-71847e10f99 )