Bio-MUST-Apps-Physeter
view release on metacpan or search on metacpan
bin/physeter.pl view on Meta::CPAN
my $sub_size = ceil( @shuffle / 10 );
my @subsets;
while (@shuffle) {
push @subsets, [ splice @shuffle, 0, $sub_size ];
}
return \@subsets;
}
sub parse_report {
my $infile = shift;
my $query_org = shift;
my $taxids = shift // ();
# setup boolean filter for current database subset (if any)
my %unwanted = map { $_ => 1 } @{$taxids};
my $report = Table->new( file => $infile );
tie my %lca_for, 'Tie::IxHash';
my $curr_query = q{};
my $best_score;
my @relatives;
my $method = 'next_hit';
HIT:
while (my $hit = $report->$method) {
# at each new query...
if ($hit->query_id ne $curr_query) {
# store query LCA if enough relatives
# TODO: make Taxonomy:: class for collections of lineages
if (@relatives >= $ARGV_tax_min_hits) {
$lca_for{$curr_query} = {
taxon => scalar $tax->compute_lca(
$ARGV_tax_min_lca_freq, @relatives),
rel_n => scalar @relatives,
};
}
# prepare analysis of new query
$curr_query = q{};
$best_score = undef;
@relatives = ();
$method = 'next_hit';
}
# skip extra (unused) relatives
if (@relatives >= $ARGV_tax_max_hits) {
$method = 'next_query';
next HIT;
}
# skip weak hits (classical mode)
next HIT if $hit->hsp_length < $ARGV_tax_min_len;
next HIT if $hit->percent_identity < $ARGV_tax_min_ident;
next HIT if $hit->bit_score < $ARGV_tax_min_score;
# fetch hit taxonomy and org
# optimized code (requires taxon_id|accession seq_ids)
my $taxon_id = ( split m{\|}xms, $hit->hit_id )[0];
# k-folds mode (skip hits from current database subset)
next HIT if $ARGV_kfold && $unwanted{$taxon_id};
my @taxonomy = $tax->get_taxonomy($taxon_id);
# regular code (more general)
# my @taxonomy = $tax->get_taxonomy_from_seq_id( $hit->hit_id );
my $hit_org = $taxonomy[-1];
# skip self hits (corresponding to query_org)
next HIT if $hit_org eq $query_org;
# skip weak hits (MEGAN-like mode)
$best_score //= $hit->bit_score;
next HIT if $hit->bit_score < $ARGV_tax_score_mul * $best_score;
# retain hit taxonomy as relative
$curr_query = $hit->query_id;
push @relatives, \@taxonomy;
}
# store last query LCA if enough relatives
if ($curr_query && @relatives >= $ARGV_tax_min_hits) {
$lca_for{$curr_query} = {
taxon => scalar $tax->compute_lca(
$ARGV_tax_min_lca_freq, @relatives),
rel_n => scalar @relatives,
};
}
return \%lca_for;
}
sub auto_detect {
my $lca_for = shift;
# TODO: implement and compare the opposite approach where we first label
# then determine the exp_tax...
# count LCA lineage occurrences (i.e., sort | uniq -c)
my %count_for = count_by { join q{;}, @{ $_->{taxon} } } values %{$lca_for};
# compute LCA proportions
my @taxa = sort {
$count_for{$b} <=> $count_for{$a}
} keys %count_for;
my $exp_tax = $labeler->classify(
$taxa[0], { greedy => $ARGV_greedy_taxa }
);
return $exp_tax;
}
sub tabulate_lcas {
bin/physeter.pl view on Meta::CPAN
=for Euclid: file.type: writable
=for Euclid: file.type: readable
=item --taxdir=<dir>
Path to local mirror of the NCBI Taxonomy database.
=for Euclid: dir.type: string
=item --taxon-list=<file>
List of taxa to consider when looking for foreign sequences. This labeler file
is used throughout the program to truncate LCA lineages at specific taxonomic
levels (which can vary from one lineage to the other).
=back
=head1 OPTIONS
=over
=item --threads=<n>
Number of threads to run in parallel [default: n.default]. Parallelization is
achieved by processing several BLAST files in parallel using an internal queue.
Therefore, the specified number of threads should not be larger than the
number of input BLAST files.
=for Euclid: n.type: +int
n.default: 1
=item --fasta-dir=<dir>
Path to the directory holding the FASTA query files [default: dir.default].
FASTA files must have the same basenames as BLAST infiles.
=for Euclid: dir.type: readable
dir.default: './'
=item --exp[ected]-tax[on]=<string>
Organism taxon [default: automatic]. Use this option when the organism does not
have an assembly accession. The specified taxon must be in the file provided
with the C<--taxon-list> option.
=for Euclid: string.type: string
=item --auto-detect
Determine organism taxon based on BLAST infile [default: no].
=item --greedy-taxa
Enable greedy behavior when interpreting the ambiguous taxa provided in the
required argument C<--taxon-list> [default: no].
=item --tax-min-ident=<n>
Minimum identity percentage to consider a hit when computing a LCA [default:
n.default].
=for Euclid: n.type: +number
n.default: 0
=item --tax-min-len=<n>
Minimum alignment length to consider a hit when computing a LCA [default:
n.default].
=for Euclid: n.type: +integer
n.default: 0
=item --tax-min-score=<n>
Minimum bit score to consider a hit when computing a LCA [default: n.default].
=for Euclid: n.type: +integer
n.default: 80
=item --tax-score-mul=<n>
Bit score reduction allowed when accumulating hits for LCA inference (MEGAN-like
algorithm) [default: n.default].
=for Euclid: n.type: +number
n.default: 0.95
=item --tax-min-hits=<n>
Minimum number of hits to use when computing LCAs [default: n.default]. Must
be lower or equal to the next optional argument (C<--tax-max-hits>).
=for Euclid: n.type: 0+integer
n.default: 1
=item --tax-max-hits=<n>
Maximum number of hits to use when computing LCAs [default: n.default]. Must
be greater or equal to the previous optional argument (C<--tax-min-hits>).
=for Euclid: n.type: 0+integer
n.default: 1
=item --tax-min-lca-freq=<n>
Minimum frequency for the common taxon when computing LCA [default: n.default].
When specified and lower than 1.0, the LCA inference algorithm returns the
lowest taxon that is found in at least this fraction of lineages (instead of
returning the lowest taxon found in all lineages).
=for Euclid: n.type: 0+number
n.default: 1.0
=item --kraken
Write KRAKEN-like report file [default: no].
=item --anvio
( run in 1.724 second using v1.01-cache-2.11-cpan-39bf76dae61 )