Bio-MUST-Core

 view release on metacpan or  search on metacpan

lib/Bio/MUST/Core/Taxonomy.pm  view on Meta::CPAN



sub compute_lca {                           ## no critic (RequireArgUnpacking)
    return shift->get_common_taxonomy_from_seq_ids(@_);
}

sub _common_taxonomy {                      ## no critic (RequireArgUnpacking)
    my $self      = shift;
    my $threshold = shift;
    my @lineages  = @_;

    # ignore missing lineages
    # TODO: decide whether missing lineages should be taken into account
    @lineages = grep { @{$_} ? $_ : () } @lineages;

    my @common_lineage;

    # original version: strict consensus
    # Note the use of a dynamically-sized multiple ArrayRef iterator
    # to walk down all lineages in parallel
    # my $ea = each_arrayref(@lineages);
    #
    # TAXON:
    # while ( my @taxa = $ea->() ) {
    #     no warnings 'uninitialized';  # avoid undef due to longer lineages
    #     last TAXON if uniq(@taxa) > 1;
    #     push @common_lineage, shift @taxa;
    # }

    # current version: threshold-based majority-rule consensus
    # algorithm: at each taxonomic rank count all seen taxa
    # if the most popular taxon is seen > threshold (w.r.t. all lineages)
    # then continue with the lineages featuring it
    # otherwise stop at previous taxon
    my $n = @lineages;

    TAXON:
    for (my $i = 0; $n; $i++) {
        my %count_for = count_by { $_->[$i] // q{} } @lineages;
        my $taxon = max_by { $count_for{$_} } keys %count_for;
        last TAXON unless $taxon;
        my $taxon_n = $count_for{$taxon};
        last TAXON if $taxon_n / $n < $threshold;
        push @common_lineage, $taxon;
        @lineages = grep { ( $_->[$i] // q{} ) eq $taxon } @lineages;
    }

    # examine context for returning plain array or ArrayRef
    return wantarray ? @common_lineage : \@common_lineage;
}

# tree annotation methods


sub attach_taxonomies_to_terminals {
    my $self = shift;
    my $tree = shift;

    #### ATTACHING TAXONOMIES TO TERMINALS...

    # transparently fetch Bio::Phylo component object
    $tree = $tree->tree if $tree->isa('Bio::MUST::Core::Tree');

    # store tip taxonomies in Bio::Phylo::Forest::Node generic attributes
    for my $tip ( @{ $tree->get_terminals } ) {

        # fetch taxonomy (and level list) from tip's seq id
        my @tax = $self->get_taxonomy_with_levels_from_seq_id($tip->get_name);

        # attach them as distinct ArrayRefs
        $tip->set_generic('taxonomy' => [ map { $_->[0] } @tax ] );
        $tip->set_generic('levels'   => [ map { $_->[1] } @tax ] );

        # Note: levels are needed for robust clade naming and collapsing.
        # Indeed, get_level_from_name() can return incorrect level when a name
        # exists at more than one taxonomic level in NCBI Taxonomy (e.g.,
        # Bacteria, which is also an insect genus).
    }

    return;
}



sub attach_taxonomies_to_internals {
    my $self = shift;
    my $tree = shift;

    # post-order tree traversal
    $tree->tree->visit_depth_first(

        # infer common taxonomy from direct children of each node
        -post => sub {
            my $node = shift;
            return if $node->is_terminal;

            # collect children lineages
            # ... and track the longest (better to be safe...) level list
            my @lineages;
            my @max_levels;
            for (my $i = 0; my $child = $node->get_child($i); $i++) {
                push @lineages, $child->get_generic('taxonomy');
                my @levels = @{ $child->get_generic('levels') };
                @max_levels = @levels if @levels > @max_levels;
            }

            # store into current node their common lineage
            # ... and cut down level list to match this common lineage
            my @common_lineage = $self->_common_taxonomy(1.0, @lineages);
            $node->set_generic('taxonomy' => \@common_lineage);
            $node->set_generic(
                'levels' => [ @max_levels[0..$#common_lineage] ]
            );

            return;
        },
     );

    return;
}



( run in 1.188 second using v1.01-cache-2.11-cpan-39bf76dae61 )