Bio-MUST-Core
view release on metacpan or search on metacpan
0.252040 2025-07-23 23:59:32+02:00 Europe/Rome
- Additions
- Taxonomy::Criterion: added support for percentages
- Taxonomy: added class method no_warnings (to silence most Taxonomy-related warnings)
- Changes
- Taxonomy: updated ranks (e.g., domain) to match recent versions of NCBI Taxonomy
- Taxonomy: switched to https to download NCBI Taxonomy (setup-taxdir.pl)
- Taxonomy: improved handling of GCF numbers as GCA numbers with GTDB
0.251810 2025-06-30 15:44:10+02:00 Europe/Brussels
- Fixes
- Taxonomy: updated URL for GTDB download (used in setup-taxdir.pl)
bin/classify-ali.pl view on Meta::CPAN
Criteria may also have 'min_seq_count' and 'max_seq_count' arguments.
These respectively specify the minimum and maximum number of sequences that
must pass the tax_filter for the ALI to match the criterion. Minimum
defaults to 1, while there is no upper bound by default.
Other conditions are available: 'min_org_count' and 'max_org_count' deal with
organisms instead of sequences, whereas 'min_copy_mean' and 'max_copy_mean'
allow bounding the mean number of gene copies per organism. Moreover, there
exist 'perc' variants ('min_seq_perc', 'max_seq_perc', 'min_org_perc',
'max_org_perc') that count in percents (either of total sequences or total
organisms). All default to no bound.
An example YAML file follows:
categories:
- label: strict
description: strict species sampling
criteria:
- tax_filter: [ +Latimeria ]
min_seq_count: 1
bin/split-rates-ali.pl view on Meta::CPAN
my $outfile = secure_outfile(
change_suffix($infile, '.stats'), $ARGV_out_suffix
);
### Dumping site-wise stats to: $outfile
$rates->store($outfile);
}
}
### Computing masks from stats
my %args;
$args{percentile} = 1 if $ARGV_percentile;
$args{cumulative} = 1 if $ARGV_cumulative;
$args{descending} = 1 if $ARGV_descending;
my @masks = $rates->bin_rates_masks($ARGV_bin_number, \%args);
# output one Ali per bin
for my $i (0..$#masks) {
my $bin_ali = $masks[$i]->filtered_ali($ali);
my $outfile = secure_outfile($alifile, $ARGV_out_suffix . "-bin$i");
# Note: $ARGV_out_suffix defaults to q{} for smooth concatenation
bin/split-rates-ali.pl view on Meta::CPAN
either delta rates or chi-square statistics depending on the infiles type
(see C<--sitefreq> option).
=item --bin-number=<n>
Number of bins to define [default: 10].
=for Euclid: n.type: number
n.default: 10
=item --percentile
Define bins containing an equal number of sites rather than bins of equal width
in terms of rates [default: no].
=item --cumulative
Define bins including all previous bins [default: no]. This leads to ALI
outfiles of increasing width and only makes sense when slower sites are in
lower bins. If higher "rates" mean slower sites, use the --descending option.
lib/Bio/MUST/Core/Ali.pm view on Meta::CPAN
=head2 seq_len_stats
Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps).
The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max).
This method does not accept any arguments.
=head2 perc_miss
Returns the percentage of missing (and gap-like) character states in the Ali.
As this method internally calls C<Ali::width>, the remarks above also apply.
my $miss_level = $ali->perc_miss;
This method does not accept any arguments.
=head1 MUTATORS
=head2 uc_seqs
lib/Bio/MUST/Core/SeqMask/Rates.pm view on Meta::CPAN
# small delta for slightly increasing extreme bins
const my $DELTA => 1e-13;
sub bin_rates_masks {
my $self = shift;
my $bin_n = shift;
my $args = shift // {}; # HashRef (should not be empty...)
my $percentile = $args->{percentile} // 0;
my $cumulative = $args->{cumulative} // 0;
my $descending = $args->{descending} // 0;
my @masks;
# define bin bounds based on equal count (in terms of sites)
if ($percentile) {
# create rates-sorted index of sites (from slow to fast)
my @index = sort {
$self->state_at($a) <=> $self->state_at($b)
} 0..$self->mask_len-1;
# optionally reverse index: higher values mean slower rates (TIGER)
@index = reverse @index if $descending;
# compute masks from index slices
lib/Bio/MUST/Core/Taxonomy/Criterion.pm view on Meta::CPAN
return $self->is_allowed($listable);
}
# case 2: handle "true" listable objects
# get seq_ids passing tax_filter
my @seq_ids = $listable->all_seq_ids;
my @targets = grep { $self->is_allowed($_) } @seq_ids;
# return success if positively avoided taxa are indeed absent
# Note: no need to compute percentages yet
my $seq_n = @targets;
unless ($seq_n) {
return 1
if ( defined $self->max_seq_count && !$self->max_seq_count )
|| ( defined $self->max_org_count && !$self->max_org_count )
|| ( defined $self->max_seq_perc && !$self->max_seq_perc )
|| ( defined $self->max_org_perc && !$self->max_org_perc )
;
}
# return failure unless #seqs within allowed bounds
# by default there is no upper bound on #seqs
return 0 if $seq_n < $self->min_seq_count;
return 0 if defined $self->max_seq_count && $seq_n > $self->max_seq_count;
# return failure unless #seqs within allowed bounds (in percents)
my $seq_p = 100.0 * $seq_n / $listable->all_seq_ids; # *all* ids count
return 0 if defined $self->min_seq_perc && $seq_p < $self->min_seq_perc;
return 0 if defined $self->max_seq_perc && $seq_p > $self->max_seq_perc;
# return success if no more condition for criterion
# this is optimized for speed
return 1
unless defined $self->min_org_count || defined $self->max_org_count
|| defined $self->min_copy_mean || defined $self->max_copy_mean
|| defined $self->min_org_perc || defined $self->max_org_perc
lib/Bio/MUST/Core/Taxonomy/Criterion.pm view on Meta::CPAN
# return failure unless #orgs within allowed bounds
# by default there is no lower nor upper bound on #orgs
return 0 if defined $self->min_org_count && $org_n < $self->min_org_count;
return 0 if defined $self->max_org_count && $org_n > $self->max_org_count;
# return failure unless mean(copy/org) within allowed bounds
# by default there is no lower nor upper bound on mean(copy/org)
return 0 if defined $self->min_copy_mean && $cpy_n < $self->min_copy_mean;
return 0 if defined $self->max_copy_mean && $cpy_n > $self->max_copy_mean;
# return failure unless #orgs within allowed bounds (in percents)
# these statistics are in percents of *all* orgs found in the listable
my $org_p
= 100.0 * $org_n / uniq_by { $_->full_org // $_->taxon_id } @seq_ids;
return 0 if defined $self->min_org_perc && $org_p < $self->min_org_perc;
return 0 if defined $self->max_org_perc && $org_p > $self->max_org_perc;
# return success
return 1;
}
__PACKAGE__->meta->make_immutable;
basename => 'primate',
len => 898,
bin_n => 10,
args => { cumulative => 1, descending => 1 },
bins => [ 373, 373, 376, 376, 376, 444, 517, 554, 706, 898 ],
},
{
basename => 'primate',
len => 898,
bin_n => 5,
args => { percentile => 1, descending => 1 },
bins => [ 180, 180, 180, 180, 178 ],
},
{
basename => 'primate',
len => 898,
bin_n => 5,
args => { cumulative => 1, percentile => 1, descending => 1 },
bins => [ 180, 360, 540, 720, 898 ],
},
{
basename => 'thermus',
len => 1273,
bin_n => 10,
args => { descending => 1 },
bins => [ 833, 0, 0, 0, 0, 116, 87, 72, 96, 69 ],
},
{
basename => 'thermus',
len => 1273,
bin_n => 10,
args => { cumulative => 1, descending => 1 },
bins => [ 833, 833, 833, 833, 833, 949, 1036, 1108, 1204, 1273 ],
},
{
basename => 'thermus',
len => 1273,
bin_n => 6,
args => { percentile => 1, descending => 1 },
bins => [ 213, 213, 213, 213, 213, 208 ],
},
{
basename => 'thermus',
len => 1273,
bin_n => 6,
args => { percentile => 1, cumulative => 1, descending => 1 },
bins => [ 213, 426, 639, 852, 1065, 1273 ],
},
);
for my $data (@exp_data) {
my ($basename, $exp_len, $bin_n, $args, $exp_bins)
= @{$data}{ qw(basename len bin_n args bins) };
my $infile = file('test', "$basename.rates");
my $rates = $class->load($infile);
}
{
my $infile = file('test', 'supermatrix-CATG-A-sample.rate');
my $rates = $class->load($infile);
isa_ok $rates, $class, $infile;
cmp_ok $rates->mask_len, '==', 17167,
'read expected number of site rates';
my $bin_n = 10;
my $args = { percentile => 1 };
my @masks = $rates->bin_rates_masks($bin_n, $args);
my $alifile = file('test', 'supermatrix.ali');
my $ali = Bio::MUST::Core::Ali->load($alifile);
$ali->apply_mask( Bio::MUST::Core::SeqMask->variable_mask($ali) );
for my $i (0..$#masks) {
cmp_store(
obj => $masks[$i]->filtered_ali($ali),
method => 'store',
t/taxonomy.t view on Meta::CPAN
# read configuration file
my $cfgfile_perc = file('test', 'classifier-perc.yaml');
my $config_perc = Config::Any->load_files( {
files => [ $cfgfile_perc->stringify ],
flatten_to_hash => 1,
use_ext => 1,
} );
explain $config_perc->{$cfgfile_perc};
# build percent-oriented classifier
my $class_perc = $tax->tax_classifier( $config_perc->{$cfgfile_perc} );
# classify list
my $exp_cat_perc = 'dom-mamm';
my $got_cat_perc = $class_perc->classify($list) // q{undef};
cmp_ok $got_cat_perc, 'eq', $exp_cat_perc,
"rightly classified list as $got_cat_perc";
}
{
( run in 0.781 second using v1.01-cache-2.11-cpan-39bf76dae61 )