Bio-MUST-Core

 view release on metacpan or  search on metacpan

bin/classify-ali.pl  view on Meta::CPAN

#!/usr/bin/env perl
# PODNAME: classify-ali.pl
# ABSTRACT: Classify ALI files based on taxonomic filters

use Modern::Perl '2011';
use autodie;

use Config::Any;
use File::Basename;
use File::Find::Rule;
use Getopt::Euclid qw(:vars);
use Path::Class qw(dir file);
use Smart::Comments;

use Bio::MUST::Core;
use Bio::MUST::Core::Constants qw(:dirs);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::Taxonomy';


# Read the classifier definition from the file given by --config.
# With flatten_to_hash, Config::Any->load_files returns a hash reference
# keyed by file name. A file that does not exist is skipped, so the returned
# reference is true even when nothing was loaded: the per-file entry is the
# only reliable indicator of success.
my $config = Config::Any->load_files( {
    files           => [ $ARGV_config ],
    flatten_to_hash => 1,
    use_ext         => 1,
} );
### config: $config->{$ARGV_config}

# Abort early when no configuration could be loaded for the requested file,
# rather than handing an undefined specification to the classifier builder.
die "Error: cannot load config file '$ARGV_config'; aborting...\n"
    unless $config && $config->{$ARGV_config};

# build taxonomy and classifier objects
my $tax = Taxonomy->new_from_cache( tax_dir => $ARGV_taxdir );
my $classifier = $tax->tax_classifier( $config->{$ARGV_config} );

# Sort the ALI files of every input directory into per-category
# subdirectories of '<input directory basename>-classify', created relative
# to the current working directory.
for my $indir (@ARGV_indirs) {

    ### Processing: $indir

    # look only at plain files sitting directly in the input directory;
    # the name filter comes from $SUFFICES_FOR{Ali}
    # (presumably provided by Bio::MUST::Core::Constants -- TODO confirm)
    my $ali_rule = File::Find::Rule
        ->file()
        ->maxdepth(1)
        ->name( $SUFFICES_FOR{Ali} );
    my @ali_paths = $ali_rule->in($indir);

    # make one output subdirectory per category label, whether or not any
    # file ends up being stored in it
    my $out_root = dir($indir)->basename . '-classify';
    for my $category ( $classifier->all_categories ) {
        dir( $out_root, $category->label )->relative->mkpath();
    }

    for my $infile (@ali_paths) {

        ### Processing: $infile
        my $ali = Ali->load($infile);
        $ali->dont_guess;

        # ask the classifier for the label of the category of this Ali
        my $cat_label = $classifier->classify($ali);
        ### classified to: $cat_label

        # files for which the classifier yields no (false) label are skipped;
        # the others are stored under their original file name in the
        # subdirectory named after the label
        if ($cat_label) {
            my $target = file(
                dir( $out_root, $cat_label )->relative,
                ( fileparse($infile) )[0],
            );
            $ali->store($target);
        }
    }
}

__END__

=pod

=head1 NAME

classify-ali.pl - Classify ALI files based on taxonomic filters

=head1 VERSION

version 0.252040

=head1 USAGE

    classify-ali.pl <indirs> --config=<file> --taxdir=<dir>
        [optional arguments]

=head1 REQUIRED ARGUMENTS

=over

=item <indirs>

Path to input directories containing ALI files [repeatable argument].

=for Euclid: indirs.type: string
    repeatable

=item --config=<file>

Path to the configuration file specifying the classifier details.

In principle, several configuration file formats are available: XML, JSON,
YAML. However, this program was designed with YAML in mind.

The configuration file defines different 'categories'. The order of
definition is relevant. Hence, if an ALI matches more than one category, it
is classified according to the first one that was defined. Each category has
a 'label' that is used to create the corresponding subdirectory for sorting
ALI files.

A category is characterized by one or more 'criteria'. To match a category,
an ALI must satisfy all criteria. Criteria are thus linked by logical ANDs
(and their order of definition is irrelevant).

Each criterion has a 'tax_filter' describing its taxonomic requirements. Wanted



( run in 0.900 second using v1.01-cache-2.11-cpan-39bf76dae61 )