Bio-MUST-Core

 view release on metacpan or  search on metacpan

Changes  view on Meta::CPAN

0.252040  2025-07-23 23:59:32+02:00 Europe/Rome
    - Additions
      - Taxonomy::Criterion: added support for percentages
      - Taxonomy: added class method no_warnings (to silence most Taxonomy-related warnings)
    - Changes
      - Taxonomy: updated ranks (e.g., domain) to match recent versions of NCBI Taxonomy
      - Taxonomy: switched to https to download NCBI Taxonomy (setup-taxdir.pl)
      - Taxonomy: improved handling of GCF numbers as GCA numbers with GTDB

0.251810  2025-06-30 15:44:10+02:00 Europe/Brussels
    - Fixes
      - Taxonomy: updated URL for GTDB download (used in setup-taxdir.pl)

bin/classify-ali.pl  view on Meta::CPAN


Criteria may also have 'min_seq_count' and 'max_seq_count' arguments.
These respectively specify the minimum and maximum number of sequences that
must pass the tax_filter for the ALI to match the criterion. Minimum
defaults to 1, while there is no upper bound by default.

Other conditions are available: 'min_org_count' and 'max_org_count' deal with
organisms instead of sequences, whereas 'min_copy_mean' and 'max_copy_mean'
allow bounding the mean number of gene copies per organism. Moreover, there
exist 'perc' variants ('min_seq_perc', 'max_seq_perc', 'min_org_perc',
'max_org_perc') that count in percents (either of total sequences or total
organisms). All default to no bound.

An example YAML file follows:

    categories:
    - label: strict
      description: strict species sampling
      criteria:
      - tax_filter: [ +Latimeria ]
        min_seq_count: 1

bin/split-rates-ali.pl  view on Meta::CPAN

            my $outfile = secure_outfile(
                change_suffix($infile, '.stats'), $ARGV_out_suffix
            );
            ### Dumping site-wise stats to: $outfile
            $rates->store($outfile);
        }
    }

    ### Computing masks from stats
    my %args;
    $args{percentile} = 1 if $ARGV_percentile;
    $args{cumulative} = 1 if $ARGV_cumulative;
    $args{descending} = 1 if $ARGV_descending;
    my @masks = $rates->bin_rates_masks($ARGV_bin_number, \%args);

    # output one Ali per bin
    for my $i (0..$#masks) {
        my $bin_ali = $masks[$i]->filtered_ali($ali);
        my $outfile = secure_outfile($alifile, $ARGV_out_suffix . "-bin$i");
        # Note: $ARGV_out_suffix defaults to q{} for smooth concatenation

bin/split-rates-ali.pl  view on Meta::CPAN

either delta rates or chi-square statistics depending on the infile type
(see C<--sitefreq> option).

=item --bin-number=<n>

Number of bins to define [default: 10].

=for Euclid: n.type:    number
    n.default: 10

=item --percentile

Define bins containing an equal number of sites rather than bins of equal width
in terms of rates [default: no].

=item --cumulative

Define bins including all previous bins [default: no]. This leads to ALI
outfiles of increasing width and only makes sense when slower sites are in
lower bins. If higher "rates" mean slower sites, use the --descending option.

lib/Bio/MUST/Core/Ali.pm  view on Meta::CPAN


=head2 seq_len_stats

Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps).
The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max).

This method does not accept any arguments.

=head2 perc_miss

Returns the percentage of missing (and gap-like) character states in the Ali.

As this method internally calls C<Ali::width>, the remarks above also apply.

    my $miss_level = $ali->perc_miss;

This method does not accept any arguments.

=head1 MUTATORS

=head2 uc_seqs

lib/Bio/MUST/Core/SeqMask/Rates.pm  view on Meta::CPAN



# small delta for slightly increasing extreme bins
const my $DELTA => 1e-13;

sub bin_rates_masks {
    my $self  = shift;
    my $bin_n = shift;
    my $args  = shift // {};            # HashRef (should not be empty...)

    my $percentile = $args->{percentile} // 0;
    my $cumulative = $args->{cumulative} // 0;
    my $descending = $args->{descending} // 0;

    my @masks;

    # define bin bounds based on equal count (in terms of sites)
    if ($percentile) {

        # create rates-sorted index of sites (from slow to fast)
        my @index = sort {
            $self->state_at($a) <=> $self->state_at($b)
        } 0..$self->mask_len-1;

        # optionally reverse index: higher values mean slower rates (TIGER)
        @index = reverse @index if $descending;

        # compute masks from index slices

lib/Bio/MUST/Core/Taxonomy/Criterion.pm  view on Meta::CPAN

        return $self->is_allowed($listable);
    }

    # case 2: handle "true" listable objects

    # get seq_ids passing tax_filter
    my @seq_ids = $listable->all_seq_ids;
    my @targets = grep { $self->is_allowed($_) } @seq_ids;

    # return success if positively avoided taxa are indeed absent
    # Note: no need to compute percentages yet
    my $seq_n = @targets;
    unless ($seq_n) {
        return 1
            if ( defined $self->max_seq_count && !$self->max_seq_count )
            || ( defined $self->max_org_count && !$self->max_org_count )
            || ( defined $self->max_seq_perc  && !$self->max_seq_perc  )
            || ( defined $self->max_org_perc  && !$self->max_org_perc  )
        ;
    }

    # return failure unless #seqs within allowed bounds
    # by default there is no upper bound on #seqs
    return 0 if                                 $seq_n < $self->min_seq_count;
    return 0 if defined $self->max_seq_count && $seq_n > $self->max_seq_count;

    # return failure unless #seqs within allowed bounds (in percents)
    my $seq_p = 100.0 * $seq_n / $listable->all_seq_ids;    # *all* ids count
    return 0 if defined $self->min_seq_perc  && $seq_p < $self->min_seq_perc;
    return 0 if defined $self->max_seq_perc  && $seq_p > $self->max_seq_perc;

    # return success if no more condition for criterion
    # this is optimized for speed
    return 1
        unless defined $self->min_org_count || defined $self->max_org_count
            || defined $self->min_copy_mean || defined $self->max_copy_mean
            || defined $self->min_org_perc  || defined $self->max_org_perc

lib/Bio/MUST/Core/Taxonomy/Criterion.pm  view on Meta::CPAN

    # return failure unless #orgs within allowed bounds
    # by default there is no lower nor upper bound on #orgs
    return 0 if defined $self->min_org_count && $org_n < $self->min_org_count;
    return 0 if defined $self->max_org_count && $org_n > $self->max_org_count;

    # return failure unless mean(copy/org) within allowed bounds
    # by default there is no lower nor upper bound on mean(copy/org)
    return 0 if defined $self->min_copy_mean && $cpy_n < $self->min_copy_mean;
    return 0 if defined $self->max_copy_mean && $cpy_n > $self->max_copy_mean;

    # return failure unless #orgs within allowed bounds (in percents)
    # these statistics are in percents of *all* orgs found in the listable
    my $org_p
        = 100.0 * $org_n / uniq_by { $_->full_org // $_->taxon_id } @seq_ids;
    return 0 if defined $self->min_org_perc && $org_p < $self->min_org_perc;
    return 0 if defined $self->max_org_perc && $org_p > $self->max_org_perc;

    # return success
    return 1;
}

__PACKAGE__->meta->make_immutable;

t/rates.t  view on Meta::CPAN

        basename => 'primate',
        len => 898,
        bin_n => 10,
        args => { cumulative => 1, descending => 1 },
        bins => [ 373, 373, 376, 376, 376, 444, 517, 554, 706, 898 ],
    },
    {
        basename => 'primate',
        len => 898,
        bin_n => 5,
        args => { percentile => 1, descending => 1 },
        bins => [ 180, 180, 180, 180, 178 ],
    },
    {
        basename => 'primate',
        len => 898,
        bin_n => 5,
        args => { cumulative => 1, percentile => 1, descending => 1 },
        bins => [ 180, 360, 540, 720, 898 ],
    },
    {
        basename => 'thermus',
        len => 1273,
        bin_n => 10,
        args => { descending => 1 },
        bins => [ 833, 0, 0, 0, 0, 116, 87, 72, 96, 69 ],
    },
    {
        basename => 'thermus',
        len => 1273,
        bin_n => 10,
        args => { cumulative => 1, descending => 1 },
        bins => [ 833, 833, 833, 833, 833, 949, 1036, 1108, 1204, 1273 ],
    },
    {
        basename => 'thermus',
        len => 1273,
        bin_n => 6,
        args => { percentile => 1, descending => 1 },
        bins => [ 213, 213, 213, 213, 213, 208 ],
    },
    {
        basename => 'thermus',
        len => 1273,
        bin_n => 6,
        args => { percentile => 1, cumulative => 1, descending => 1 },
        bins => [ 213, 426, 639, 852, 1065, 1273 ],
    },
);

for my $data (@exp_data) {
    my ($basename, $exp_len, $bin_n, $args, $exp_bins)
        = @{$data}{ qw(basename len bin_n args bins) };

    my $infile = file('test', "$basename.rates");
    my $rates = $class->load($infile);

t/rates.t  view on Meta::CPAN

}

{
    my $infile = file('test', 'supermatrix-CATG-A-sample.rate');
    my $rates = $class->load($infile);
    isa_ok $rates, $class, $infile;
    cmp_ok $rates->mask_len, '==', 17167,
        'read expected number of site rates';

    my $bin_n = 10;
    my $args = { percentile => 1 };
    my @masks = $rates->bin_rates_masks($bin_n, $args);

    my $alifile = file('test', 'supermatrix.ali');
    my $ali = Bio::MUST::Core::Ali->load($alifile);
    $ali->apply_mask( Bio::MUST::Core::SeqMask->variable_mask($ali) );

    for my $i (0..$#masks) {
        cmp_store(
            obj    => $masks[$i]->filtered_ali($ali),
            method => 'store',

t/taxonomy.t  view on Meta::CPAN


    # read configuration file
    my $cfgfile_perc = file('test', 'classifier-perc.yaml');
    my $config_perc = Config::Any->load_files( {
        files           => [ $cfgfile_perc->stringify ],
        flatten_to_hash => 1,
        use_ext         => 1,
    } );
    explain $config_perc->{$cfgfile_perc};

    # build percent-oriented classifier
    my $class_perc = $tax->tax_classifier( $config_perc->{$cfgfile_perc} );

    # classify list
    my $exp_cat_perc = 'dom-mamm';
    my $got_cat_perc = $class_perc->classify($list) // q{undef};
    cmp_ok $got_cat_perc, 'eq', $exp_cat_perc,
        "rightly classified list as $got_cat_perc";
}

{



( run in 0.781 second using v1.01-cache-2.11-cpan-39bf76dae61 )