Bio-MUST-Core
view release on metacpan or search on metacpan
bin/classify-ali.pl view on Meta::CPAN
#!/usr/bin/env perl
# PODNAME: classify-ali.pl
# ABSTRACT: Classify ALI files based on taxonomic filters
use Modern::Perl '2011';
use autodie;
use Config::Any;
use File::Basename;
use File::Find::Rule;
use Getopt::Euclid qw(:vars);
use Path::Class qw(dir file);
use Smart::Comments;
use Bio::MUST::Core;
use Bio::MUST::Core::Constants qw(:dirs);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::Taxonomy';
# Read the classifier configuration file given by --config.
# With flatten_to_hash enabled, load_files returns a hash ref keyed by file
# name, so the parsed content of our file lives under $config->{$ARGV_config}.
my $config = Config::Any->load_files( {
    files           => [ $ARGV_config ],
    flatten_to_hash => 1,
    use_ext         => 1,
} );
### config: $config->{$ARGV_config}

# A hash ref is always true in Perl, even when the hash is empty, so testing
# $config alone can never detect a file that was not loaded. Check the entry
# for our own file instead, before it is handed to the classifier builder.
die "Error: no config file specified; aborting...\n"
    unless $config && $config->{$ARGV_config};

# Build the taxonomy object from the cache in --taxdir, then derive the
# classifier from the categories defined in the configuration file.
my $tax        = Taxonomy->new_from_cache( tax_dir => $ARGV_taxdir );
my $classifier = $tax->tax_classifier( $config->{$ARGV_config} );
# Classify the ALI files of each input directory in turn. Files are stored
# under '<basename of indir>-classify/<category label>/'.
for my $indir (@ARGV_indirs) {
    ### Processing: $indir

    # Collect the ALI files located directly in the input directory
    # (maxdepth 1: subdirectories are not searched).
    my $ali_rule  = File::Find::Rule->file->maxdepth(1)->name( $SUFFICES_FOR{Ali} );
    my @ali_paths = $ali_rule->in($indir);

    # Create one output subdirectory per category, whether or not any file
    # ends up being classified into it.
    my $outdir = dir($indir)->basename . '-classify';
    my @labels = map { $_->label } $classifier->all_categories;
    for my $label (@labels) {
        dir( $outdir, $label )->relative->mkpath();
    }

    for my $infile (@ali_paths) {
        ### Processing: $infile
        my $ali = Ali->load($infile);
        $ali->dont_guess;

        # Ask the classifier for the label of the matching category
        my $cat_label = $classifier->classify($ali);
        ### classified to: $cat_label

        # Files for which the classifier returns no (true) label are skipped;
        # the others are stored under their original file name.
        if ($cat_label) {
            my $target_dir = dir( $outdir, $cat_label )->relative;
            my $basename   = ( fileparse($infile) )[0];
            $ali->store( file( $target_dir, $basename ) );
        }
    }
}
__END__
=pod
=head1 NAME
classify-ali.pl - Classify ALI files based on taxonomic filters
=head1 VERSION
version 0.252040
=head1 USAGE
classify-ali.pl <indirs> --config=<file> --taxdir=<dir>
[optional arguments]
=head1 REQUIRED ARGUMENTS
=over
=item <indirs>
Path to input directories containing ALI files [repeatable argument].
=for Euclid: indirs.type: string
repeatable
=item --config=<file>
Path to the configuration file specifying the classifier details.
In principle, several configuration file formats are available: XML, JSON,
YAML. However, this program was designed with YAML in mind.
The configuration file defines different 'categories'. The order of
definition is relevant. Hence, if an ALI matches more than one category, it
is classified according to the first one that was defined. Each category has
a 'label' that is used to create the corresponding subdirectory for sorting
ALI files.
A category is characterized by one or more 'criteria'. To match a category,
an ALI must satisfy all criteria. Criteria are thus linked by logical ANDs
(and their order of definition is irrelevant).
Each criterion has a 'tax_filter' describing its taxonomic requirements. Wanted
( run in 0.900 second using v1.01-cache-2.11-cpan-39bf76dae61 )