Bio-MUST-Apps-Physeter
view release on metacpan or search on metacpan
bin/physeter.pl view on Meta::CPAN
}
return \@subsets;
}
sub parse_report {
my $infile = shift;
my $query_org = shift;
my $taxids = shift // ();
# setup boolean filter for current database subset (if any)
my %unwanted = map { $_ => 1 } @{$taxids};
my $report = Table->new( file => $infile );
tie my %lca_for, 'Tie::IxHash';
my $curr_query = q{};
my $best_score;
my @relatives;
my $method = 'next_hit';
HIT:
while (my $hit = $report->$method) {
# at each new query...
if ($hit->query_id ne $curr_query) {
# store query LCA if enough relatives
# TODO: make Taxonomy:: class for collections of lineages
if (@relatives >= $ARGV_tax_min_hits) {
$lca_for{$curr_query} = {
taxon => scalar $tax->compute_lca(
$ARGV_tax_min_lca_freq, @relatives),
rel_n => scalar @relatives,
};
}
# prepare analysis of new query
$curr_query = q{};
$best_score = undef;
@relatives = ();
$method = 'next_hit';
}
# skip extra (unused) relatives
if (@relatives >= $ARGV_tax_max_hits) {
$method = 'next_query';
next HIT;
}
# skip weak hits (classical mode)
next HIT if $hit->hsp_length < $ARGV_tax_min_len;
next HIT if $hit->percent_identity < $ARGV_tax_min_ident;
next HIT if $hit->bit_score < $ARGV_tax_min_score;
# fetch hit taxonomy and org
# optimized code (requires taxon_id|accession seq_ids)
my $taxon_id = ( split m{\|}xms, $hit->hit_id )[0];
# k-folds mode (skip hits from current database subset)
next HIT if $ARGV_kfold && $unwanted{$taxon_id};
my @taxonomy = $tax->get_taxonomy($taxon_id);
# regular code (more general)
# my @taxonomy = $tax->get_taxonomy_from_seq_id( $hit->hit_id );
my $hit_org = $taxonomy[-1];
# skip self hits (corresponding to query_org)
next HIT if $hit_org eq $query_org;
# skip weak hits (MEGAN-like mode)
$best_score //= $hit->bit_score;
next HIT if $hit->bit_score < $ARGV_tax_score_mul * $best_score;
# retain hit taxonomy as relative
$curr_query = $hit->query_id;
push @relatives, \@taxonomy;
}
# store last query LCA if enough relatives
if ($curr_query && @relatives >= $ARGV_tax_min_hits) {
$lca_for{$curr_query} = {
taxon => scalar $tax->compute_lca(
$ARGV_tax_min_lca_freq, @relatives),
rel_n => scalar @relatives,
};
}
return \%lca_for;
}
sub auto_detect {
my $lca_for = shift;
# TODO: implement and compare the opposite approach where we first label
# then determine the exp_tax...
# count LCA lineage occurrences (i.e., sort | uniq -c)
my %count_for = count_by { join q{;}, @{ $_->{taxon} } } values %{$lca_for};
# compute LCA proportions
my @taxa = sort {
$count_for{$b} <=> $count_for{$a}
} keys %count_for;
my $exp_tax = $labeler->classify(
$taxa[0], { greedy => $ARGV_greedy_taxa }
);
return $exp_tax;
}
sub tabulate_lcas {
my $lca_for = shift;
my $exp_tax = shift;
my $seq_n = shift;
my $self_n = 0;
( run in 1.872 second using v1.01-cache-2.11-cpan-483215c6ad5 )