Bio-MUST-Core
view release on metacpan or search on metacpan
lib/Bio/MUST/Core/Taxonomy.pm view on Meta::CPAN
sub compute_lca { ## no critic (RequireArgUnpacking)
return shift->get_common_taxonomy_from_seq_ids(@_);
}
sub _common_taxonomy { ## no critic (RequireArgUnpacking)
my $self = shift;
my $threshold = shift;
my @lineages = @_;
# ignore missing lineages
# TODO: decide whether missing lineages should be taken into account
@lineages = grep { @{$_} ? $_ : () } @lineages;
my @common_lineage;
# original version: strict consensus
# Note the use of a dynamically-sized multiple ArrayRef iterator
# to walk down all lineages in parallel
# my $ea = each_arrayref(@lineages);
#
# TAXON:
# while ( my @taxa = $ea->() ) {
# no warnings 'uninitialized'; # avoid undef due to longer lineages
# last TAXON if uniq(@taxa) > 1;
# push @common_lineage, shift @taxa;
# }
# current version: threshold-based majority-rule consensus
# algorithm: at each taxonomic rank count all seen taxa
# if the most popular taxon is seen > threshold (w.r.t. all lineages)
# then continue with the lineages featuring it
# otherwise stop at previous taxon
my $n = @lineages;
TAXON:
for (my $i = 0; $n; $i++) {
my %count_for = count_by { $_->[$i] // q{} } @lineages;
my $taxon = max_by { $count_for{$_} } keys %count_for;
last TAXON unless $taxon;
my $taxon_n = $count_for{$taxon};
last TAXON if $taxon_n / $n < $threshold;
push @common_lineage, $taxon;
@lineages = grep { ( $_->[$i] // q{} ) eq $taxon } @lineages;
}
# examine context for returning plain array or ArrayRef
return wantarray ? @common_lineage : \@common_lineage;
}
# tree annotation methods
sub attach_taxonomies_to_terminals {
my $self = shift;
my $tree = shift;
#### ATTACHING TAXONOMIES TO TERMINALS...
# transparently fetch Bio::Phylo component object
$tree = $tree->tree if $tree->isa('Bio::MUST::Core::Tree');
# store tip taxonomies in Bio::Phylo::Forest::Node generic attributes
for my $tip ( @{ $tree->get_terminals } ) {
# fetch taxonomy (and level list) from tip's seq id
my @tax = $self->get_taxonomy_with_levels_from_seq_id($tip->get_name);
# attach them as distinct ArrayRefs
$tip->set_generic('taxonomy' => [ map { $_->[0] } @tax ] );
$tip->set_generic('levels' => [ map { $_->[1] } @tax ] );
# Note: levels are needed for robust clade naming and collapsing.
# Indeed, get_level_from_name() can return incorrect level when a name
# exists at more than one taxonomic level in NCBI Taxonomy (e.g.,
# Bacteria, which is also an insect genus).
}
return;
}
sub attach_taxonomies_to_internals {
my $self = shift;
my $tree = shift;
# post-order tree traversal
$tree->tree->visit_depth_first(
# infer common taxonomy from direct children of each node
-post => sub {
my $node = shift;
return if $node->is_terminal;
# collect children lineages
# ... and track the longest (better to be safe...) level list
my @lineages;
my @max_levels;
for (my $i = 0; my $child = $node->get_child($i); $i++) {
push @lineages, $child->get_generic('taxonomy');
my @levels = @{ $child->get_generic('levels') };
@max_levels = @levels if @levels > @max_levels;
}
# store into current node their common lineage
# ... and cut down level list to match this common lineage
my @common_lineage = $self->_common_taxonomy(1.0, @lineages);
$node->set_generic('taxonomy' => \@common_lineage);
$node->set_generic(
'levels' => [ @max_levels[0..$#common_lineage] ]
);
return;
},
);
return;
}
( run in 1.188 second using v1.01-cache-2.11-cpan-39bf76dae61 )