Bio-MUST-Core

 view release on metacpan or  search on metacpan

bin/split-rates-ali.pl  view on Meta::CPAN


use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(change_suffix secure_outfile);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::SeqMask';


# TODO: generalize this to other executables through Utils?
my ($load, $store) = qw(load store);
my $suffix = '.ali';
my %out_args;
if ($ARGV_phylip) {
    ### Infiles (and outfiles) are in PHYLIP format
    $load  .= '_phylip';
    $store .= '_phylip';
    $suffix = '.phy';
    $out_args{chunk} = -1;
}

my $class = 'Bio::MUST::Core::SeqMask::Rates';
my $delta = 'delta_rates';
if ($ARGV_sitefreq) {
    $class = 'Bio::MUST::Core::SeqMask::Pmsf';
    $delta = 'chi_square_stats';
}

# optionally load other rates file for deltas
my $other_rates;
   $other_rates = $class->load($ARGV_other_rates) if $ARGV_other_rates;

for my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $rates = $class->load($infile);

    $infile =~ s/$_//xms for @ARGV_in_strip;
    my $alifile = change_suffix($infile, $suffix);
    my $ali = Ali->$load($alifile);
    $ali->gapify_seqs if $ARGV_from_scafos;
    ### Infile: "$alifile had " . $ali->width . ' sites'

    # optionally delete constant sites
    $ali->apply_mask( SeqMask->variable_mask($ali) ) if $ARGV_del_const;

    # optionally use delta rates instead of raw rates if available
    if ($other_rates) {
        $rates = $rates->$delta($other_rates);

        # optionally dump stats resulting from delta operation
        if ($ARGV_dump_stats) {
            my $outfile = secure_outfile(
                change_suffix($infile, '.stats'), $ARGV_out_suffix
            );
            ### Dumping site-wise stats to: $outfile
            $rates->store($outfile);
        }
    }

    ### Computing masks from stats
    my %args;
    $args{percentile} = 1 if $ARGV_percentile;
    $args{cumulative} = 1 if $ARGV_cumulative;
    $args{descending} = 1 if $ARGV_descending;
    my @masks = $rates->bin_rates_masks($ARGV_bin_number, \%args);

    # output one Ali per bin
    for my $i (0..$#masks) {
        my $bin_ali = $masks[$i]->filtered_ali($ali);
        my $outfile = secure_outfile($alifile, $ARGV_out_suffix . "-bin$i");
        # Note: $ARGV_out_suffix defaults to q{} for smooth concatenation

        ### Outfile: "$outfile has " . $bin_ali->width . ' sites'
        $bin_ali->$store($outfile, \%out_args);
    }
}

__END__

=pod

=head1 NAME

split-rates-ali.pl - Split ALI files into subsets of sites based on site-wise statistics

=head1 VERSION

version 0.252040

=head1 USAGE

    split-rates-ali.pl <infiles> [optional arguments]

=head1 REQUIRED ARGUMENTS

=over

=item <infiles>

Path to input RATE(S) (or SITEFREQ) files [repeatable argument].

=for Euclid: infiles.type: readable
    repeatable

=back

=head1 OPTIONAL ARGUMENTS

=over

=item --in[-strip]=<str>

Substring(s) to strip from infile basenames before attempting to derive other
infile (e.g., ALI files) and outfile names [default: none].

=for Euclid: str.type: string
    repeatable

=item --out[-suffix]=<suffix>

Suffix to append to (possibly stripped) infile basenames for deriving
outfile names [default: none]. When not specified, outfile names are taken
from infiles but original infiles are preserved by being appended a .bak
suffix.

=for Euclid: suffix.type: string
    suffix.default: q{}

=item --from-scafos

Consider the input ALI file as generated by SCaFoS [default: no]. Currently,
specifying this option results in turning all ambiguous and missing character
states to gaps.

=item --del-const

Delete constant sites just as the C<-dc> option of PhyloBayes [default: no].

=item --phylip

Assume infiles and outfiles are in PHYLIP format (instead of ALI format)
[default: no].

=item --sitefreq

Assume infile are IQ-TREE SITEFREQ files instead of RATE(S) files [default: no].

=item --other-rates=<file>

Optional additional RATE(S) (or SITEFREQ) file of the same length as the main
infile(s) that will be used to compute rate deltas [default: none]. Currently,
rate deltas are defined as the absolute difference between the two rates at each
site. This could be improved by, e.g., computing relative deltas.

When SITEFREQ are provided instead of RATE(S), deltas correspond to chi-square
test statistics computed between the two SITEFREQ files at each site.

=for Euclid: file.type: readable

=item --dump-stats

Output site-wise stats resulting from the comparison of a pair of infiles of
the same length (see C<--other-rates> option) [default: no]. The values are
either delta rates or chi-square statistics dependending on the infiles type
(see C<--sitefreq> option).

=item --bin-number=<n>

Number of bins to define [default: 10].

=for Euclid: n.type:    number
    n.default: 10

=item --percentile

Define bins containing an equal number of sites rather than bins of equal width
in terms of rates [default: no].

=item --cumulative

Define bins including all previous bins [default: no]. This leads to ALI
outfiles of increasing width and only makes sense when slower sites are in
lower bins. If higher "rates" mean slower sites, use the --descending option.

=item --descending

Reverse bin order to accommodate "rates" files where higher values actually
correspond to slower sites, such as those produced by Carla Cummins' TIGER
[default: no].

=item --version

=item --usage

=item --help

=item --man

Print the usual program information

=back

=head1 AUTHOR

Denis BAURAIN <denis.baurain@uliege.be>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut



( run in 1.329 second using v1.01-cache-2.11-cpan-39bf76dae61 )