Bio-MUST-Apps-FortyTwo

 view release on metacpan or  search on metacpan

lib/Bio/MUST/Apps/FortyTwo/OrgProcessor.pm  view on Meta::CPAN

                            $new_seq =     $new_seq->reverse_complemented_seq;
                        $subject_seq = $subject_seq->reverse_complemented_seq;
                    }

                    ####### [ORG] Adding: $new_seq->full_id

                    # align new_seq on template_seq using subject_seq as a guide
                    $aligned_seqs->add_seq(
                        $ap->integrator->align(
                             new_seq => $new_seq,
                             subject => $subject_seq,
                            template => $template_seq,
                               start => $hsp->hit_start,
                        )
                    );

                    ######## [DEBUG] aligned with BLAST
                }
            }
        }
    }

    $parser->remove unless $rp->debug_mode;

    return $aligned_seqs;
}

## use critic


sub _compress_seqs {
    my $self = shift;
    my $ali  = shift;

    my $rp = $self->ali_proc->run_proc;

    # first fetch all seqs...
    # ... then clear existing seqs
    my @seqs2cap = $ali->all_seqs;
    $ali->_set_seqs( [] );

    # proceed only if at least one seq
    return unless @seqs2cap;

    # Note: this might seem sub-optimal in case of singletons
    # but we follow as closely as possible the logic of compress-db.pl

    # proceed only if at least two seqs
    # otherwise add lone seq
    if (@seqs2cap < 2) {
        $ali->add_seq( shift @seqs2cap );
        return;
    }

    # TODO: add debugging comments?

    # try to cap seqs
    my $cap = Cap3->new(
        seqs      => \@seqs2cap,
        cap3_args => {
            -p => $rp->merge_min_ident * 100.0,     # CAP3 expects percents
            -o => $rp->merge_min_len,
        },
    );

    # add singlet seqs
    my @singlets = $cap->all_singlets;
    $ali->add_seq($_) for @singlets;

    # proceed only if contigs of seqs
    my @contigs  = $cap->all_contigs;
    return unless @contigs;

    for my $contig (@contigs) {
        my @ids = map { $_->full_id }
            @{ $cap->seq_ids_for( $contig->full_id ) };
        my $contig_id = shift(@ids) . ':::+' . @ids;
        my $contig_seq = $contig->seq;

        # add contig seq
        $ali->add_seq(
            Seq->new( seq_id => $contig_id, seq => $contig_seq )
        );
    }

    # Note: we do not need to return anything as we modified $ali in place
    return;
}


sub _fetch_tax_line_for_transcript {
    my $self       = shift;
    my $orthologue = shift;

    my $ap = $self->ali_proc;
    my $rp = $ap->run_proc;

    # fetch transcript accession (= query id) from BLAST report
    my $query_def = $orthologue->query_def;
    my $transcript_acc = $self->orthologous_seqs->long_id_for($query_def);

    # extract range and strand (if any) from transcript accession...
    # Note: this mostly makes sense for genomic contigs/scaffolds
    # ... and extract potential tail '+N' (if orthologues have been merged)
    # Note: this looks convoluted but it has to work in many different setups
    # depending on the value of trim_homologues and merge_orthologues:
    # - off/off: acc
    # - off/on:  acc:::+N
    # - on /off: acc:::start:::end:::strand
    # - on /on:  acc:::start:::end:::strand:::+N
    my @fields = split /:::/xms, $transcript_acc;
    my ($strip_acc, $start, $end, $strand, $more) = @fields;
    ($more, $start) = ($start, undef)
        if !defined $end && defined $start && $start =~ m/\A \+\d+ \z/xms;

    # set acc to transcript accession
    my ($first_chunk) = $strip_acc =~ m/^(\S+)/xms;
    my @parts = split /\|/xms, $first_chunk;
    my $acc = pop @parts;
    ###### [ORG] orthologous transcript: $acc



( run in 1.825 second using v1.01-cache-2.11-cpan-39bf76dae61 )