Bio-MLST-Check

 view release on metacpan or  search on metacpan

lib/Bio/MLST/CheckMultipleSpecies.pm  view on Meta::CPAN

package Bio::MLST::CheckMultipleSpecies;
$Bio::MLST::CheckMultipleSpecies::VERSION = '2.1.1706216';
# ABSTRACT: High throughput multilocus sequence typing (MLST) checking against several MLST databases.



use Moose;
use Bio::MLST::Check;
use Bio::MLST::Databases;
use Parallel::ForkManager;
use File::Temp;
use Cwd;
use Text::CSV;

# ---- Search configuration ----
has 'species'               => ( is => 'ro', isa => 'ArrayRef', required => 1 ); # empty array searches against all databases
has 'base_directory'        => ( is => 'ro', isa => 'Str',      required => 1 ); # passed to Bio::MLST::Databases when the species list has to be discovered
has 'parallel_processes'    => ( is => 'ro', isa => 'Int',      default  => 1 ); # max parallel processes
has 'verbose'               => ( is => 'rw', isa => 'Bool',     default  => 0 ); # output search progress and number of matches
has 'report_all_mlst_db'    => ( is => 'rw', isa => 'Bool',     default  => 0 ); # report all mlst databases searched
has 'report_lowest_st'      => ( is => 'rw', isa => 'Bool',     default  => 0 ); # NOTE(review): not used in the visible code; presumably reports the lowest sequence type - confirm

# ---- Inputs, external tools and outputs ----
has 'raw_input_fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); # forwarded to Bio::MLST::Check to verify the input files exist
has 'makeblastdb_exec'      => ( is => 'ro', isa => 'Str',      required => 1 ); # presumably the makeblastdb executable to run - confirm against callers
has 'blastn_exec'           => ( is => 'ro', isa => 'Str',      required => 1 ); # presumably the blastn executable to run - confirm against callers
has 'output_directory'      => ( is => 'ro', isa => 'Str',      required => 1 ); 
has 'spreadsheet_basename'  => ( is => 'ro', isa => 'Str',      default  => 'mlst_results' ); 
# The two flags below are accepted but unsupported in this class:
# _check_fasta_phylip_options prints an explanation and returns 0 when either is set.
has 'output_fasta_files'    => ( is => 'ro', isa => 'Bool',     default  => 0 ); # output of fasta not supported
has 'output_phylip_files'   => ( is => 'ro', isa => 'Bool',     default  => 0 ); # output of phylip not supported
has 'show_contamination_instead_of_alt_matches' => ( is => 'ro', isa => 'Bool',   default => 1 ); # NOTE(review): usage not visible here; defaults to true

# ---- Private state ----
# Sorted list of species to search; built on first access by _build__species_list.
has '_species_list'         => ( is => 'ro', isa => 'ArrayRef', lazy_build => 1 );
# Scratch directory created under the current working directory; CLEANUP => 1 asks File::Temp to remove it.
has '_working_directory'    => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir(DIR => getcwd, CLEANUP => 1); });

# Builder for the lazy '_species_list' attribute.
#
# Returns a reference to a new array holding the species names to search,
# in ascending string order. When the 'species' attribute is an empty
# array, the names are taken from Bio::MLST::Databases (constructed with
# this object's base_directory) instead.
sub _build__species_list
{
    my($self) = @_;
    my @requested = @{$self->species};

    # Nothing requested explicitly: fall back to every known database.
    if(!@requested)
    {
        my $databases = Bio::MLST::Databases->new(
            base_directory => $self->base_directory,
            );
        @requested = @{$databases->database_names};
    }

    # Default sort is the same string comparison as { $a cmp $b }.
    return [ sort @requested ];
}

# Checks the raw input fasta files via Bio::MLST::Check.
#
# A Bio::MLST::Check object is constructed solely to reuse its
# input_fasta_files_exist method, so every other constructor argument is
# given an empty-string placeholder. Returns whatever
# input_fasta_files_exist returns.
sub _check_input_files_exist
{
    my($self) = @_;

    # Constructor arguments that are irrelevant to the existence check.
    my %placeholders = map { $_ => '' } qw(
        species
        base_directory
        makeblastdb_exec
        blastn_exec
        output_directory
    );

    my $checker = Bio::MLST::Check->new(
        raw_input_fasta_files => $self->raw_input_fasta_files,
        %placeholders,
    );

    return $checker->input_fasta_files_exist;
}

# Rejects the fasta/phylip output options, which this class does not support.
#
# Returns 1 when neither output_fasta_files nor output_phylip_files is set.
# Otherwise prints an explanatory message to the currently selected output
# handle and returns 0.
sub _check_fasta_phylip_options
{
    my($self) = @_;

    if ($self->output_fasta_files || $self->output_phylip_files)
    {
        # Leading empty element and two trailing ones give the message its
        # opening newline and closing blank line.
        my @message_lines = (
            '',
            ' The --output_fasta_files and --output_phylip_files options cannot be used when searching',
            ' against more than one MLST database as the alleles searched will differ between species.',
            '',
            ' To output fasta and phylip files, please search against a single MLST database.',
            '',
            '',
        );
        print join("\n", @message_lines);
        return 0;
    }

    return 1;
}

sub _run_mlst_for_species_list
{
    my ($self) = @_;

    # set parallel processes - if there are more processes than species then search input files in parallel.
    my $parallel_process_total   = $self->parallel_processes;
    my $parallel_process_species = @{$self->_species_list} < $self->parallel_processes ? @{$self->_species_list} : $self->parallel_processes;
    my $parallel_process_fa_file = int($self->parallel_processes/@{$self->_species_list}) ? int($self->parallel_processes/@{$self->_species_list}) : 1;

    # Run for each species - output to csv files named 0001,0002,etc.



( run in 2.254 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )