Bio-MUST-Apps-FortyTwo

 view release on metacpan or  search on metacpan

bin/prune-outliers.pl  view on Meta::CPAN

use File::Find::Rule;
use Getopt::Euclid qw(:vars);
use Path::Class qw(dir file);
use Smart::Comments;

use Bio::MUST::Core;
use Bio::MUST::Core::Constants qw(:dirs);
use Bio::MUST::Drivers;

use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::IdList';

my $qr_class = 'Bio::MUST::Core::Ali::Temporary';
my $db_class = 'Bio::MUST::Drivers::Blast::Database';
my $db_tmp_class = 'Bio::MUST::Drivers::Blast::Database::Temporary';

for my $indir (@ARGV_indirs) {

    ### Processing: $indir
    my @infiles = File::Find::Rule
        ->file()
        ->name( $SUFFICES_FOR{Ali} )
        ->in($indir);

    for my $infile (@infiles) {
        ### Processing: $infile

        # BLASTP/N all versus all
        my $db = $db_tmp_class->new( seqs => file($infile) );

        # Note: this is a special case where query and database are identical
        # (but autodetection won't work because query is just a filename)
        my $query = $db->filename;
        my $pgm   = $db->type eq 'nucl' ? 'blastn' : 'blastp';

        my $parser = $db->$pgm($query, {
            -evalue => $ARGV_evalue,
            -outfmt => 6,
            $db->type eq 'nucl' ? ( -task => 'blastn' ) : ()
        } );

        # parsing
        my %count_for;
        while ( my $hit = $parser->next_hit ) {
            my $query_id = $db->long_id_for( $hit->query_id );
            $count_for{$query_id}++;
        }

        my $outdir = dir($indir)->basename . '-pruned';

        for (my $t = $ARGV_min_ident; $t <= $ARGV_max_ident; $t += 0.1) {

            my @ids
                = grep { ($count_for{$_} / keys %count_for) >= $t }
                  grep {  $count_for{$_} >= $ARGV_min_hits } keys %count_for
            ;

            ### threshold: $t . ' - ' . scalar @ids . ' seqs kept out of ' . scalar keys %count_for

            my $ali = Ali->load($infile);
            $ali->dont_guess if $ARGV_noguessing;

            my $list = IdList->new( ids => \@ids );
            my $new_ali = $list->filtered_ali($ali);

            # create output dirs named after input dir and identity threshold
            my $subdir = dir( $outdir, $t )->relative;
            $subdir->mkpath();

            # store Ali in corresponding dir
            my ($filename) = fileparse($infile);
            my $outfile = file($subdir, $filename)->stringify;
            $new_ali->store($outfile);
        }
    }

}

__END__

=pod

=head1 NAME

prune-outliers.pl - Identify and discard outliers based on all-versus-all BLAST searches

=head1 VERSION

version 0.213470

=head1 USAGE

	prune-outliers.pl <indirs> [options]

=head1 REQUIRED ARGUMENTS

=over

=item <indirs>

Path to input directories containing ALI (or FASTA) files [repeatable argument].

=for Euclid: indirs.type: string
    repeatable

=back

=head1 OPTIONS

=over

=item --evalue=<n>

evalue threshold for a hit to be considered during the all-versus-all BLAST
searches [default: n.default].

=for Euclid: n.type: num
    n.default: 1e-10

=item --min-ident=<n> | --min_ident=<n>

Minimal identity value used for selecting sequences that match at least this
proportion in the all-versus-all BLAST searches [default: n.default]. An output
dir will be created by step of 0.1 between the min threshold and max threshold.

=for Euclid: n.type: num
    n.default: 0.3

=item --max-ident=<n> | --max_ident=<n>

Maximum percent value used for selecting sequences that match at least this
proportion in the all versus all BLAST searches [default: n.default]. An output
dir will be created by step of 0.1 between the min threshold and max threshold.

=for Euclid: n.type: num
    n.default: 0.8

=item --min-hits=<n> | --min_hits=<n>

Minimum number of hits in the all-versus-all BLAST searches required for a
sequence to be retained in the output file [default: n.default].

=for Euclid: n.type: num
    n.default: 10

=item --[no]guessing

[Don't] guess whether sequences are aligned or not [default: yes].

=item --version

=item --usage

=item --help

=item --man

Print the usual program information

=back

=head1 AUTHOR

Denis BAURAIN <denis.baurain@uliege.be>

=head1 CONTRIBUTORS

=for stopwords Amandine BERTRAND Loic MEUNIER

=over 4

=item *

Amandine BERTRAND <amandine.bertrand@doct.uliege.be>

=item *

Loic MEUNIER <loic.meunier@doct.uliege.be>

=back

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut



( run in 0.754 second using v1.01-cache-2.11-cpan-ceb78f64989 )