Bio-Roary
view release on metacpan or search on metacpan
lib/Bio/Roary/AssemblyStatistics.pm view on Meta::CPAN
package Bio::Roary::AssemblyStatistics;
$Bio::Roary::AssemblyStatistics::VERSION = '3.13.0';
# ABSTRACT: Given a spreadsheet of gene presence and absence calculate some statistics
use Moose;
use Bio::Roary::ExtractCoreGenesFromSpreadsheet;
use Log::Log4perl qw(:easy);
with 'Bio::Roary::SpreadsheetRole';
# ---------------------------------------------------------------------------
# Attributes
#
# The threshold attributes below hold fractions (0..1); create_summary_output
# multiplies them by 100 before printing them as percentages.
# ---------------------------------------------------------------------------

# Output file name; not referenced in the visible part of this file.
has output_filename => (
    is      => 'ro',
    isa     => 'Str',
    default => 'assembly_statistics.csv',
);

# Execution settings; not referenced in the visible part of this file.
has job_runner => (
    is      => 'ro',
    isa     => 'Str',
    default => 'Local',
);

has cpus => (
    is      => 'ro',
    isa     => 'Int',
    default => 1,
);

# Lower bound for the "core" category. _build_gene_category_count copies this
# value into _soft_core_percentage.
has core_definition => (
    is      => 'rw',
    isa     => 'Num',
    default => 0.99,
);

# Upper bound (exclusive) of the "cloud" category in the summary report.
has _cloud_percentage => (
    is      => 'rw',
    isa     => 'Num',
    default => 0.15,
);

# Upper bound (exclusive) of the "shell" category in the summary report.
# _build_gene_category_count lowers it to _soft_core_percentage - 0.01 when it
# is not already below _soft_core_percentage.
has _shell_percentage => (
    is      => 'rw',
    isa     => 'Num',
    default => 0.95,
);

# Upper bound (exclusive) of the "soft core" category in the summary report;
# overwritten with core_definition by _build_gene_category_count.
has _soft_core_percentage => (
    is      => 'rw',
    isa     => 'Num',
    default => 0.99,
);

has verbose => (
    is      => 'ro',
    isa     => 'Bool',
    default => 0,
);

has contiguous_window => (
    is      => 'ro',
    isa     => 'Int',
    default => 10,
);

# Lazily built via _build_ordered_genes.
has ordered_genes => (
    is      => 'ro',
    isa     => 'ArrayRef',
    lazy    => 1,
    builder => '_build_ordered_genes',
);

# Gene name => array ref of spreadsheet cells for that gene's row.
# Lazy, but BUILD forces it at construction time.
has _genes_to_rows => (
    is      => 'ro',
    isa     => 'HashRef',
    lazy    => 1,
    builder => '_build__genes_to_rows',
);

# Lazily built via _build_all_sample_statistics.
has all_sample_statistics => (
    is      => 'ro',
    isa     => 'HashRef',
    lazy    => 1,
    builder => '_build_all_sample_statistics',
);

# No default or builder here; _build_gene_category_count dereferences it as a
# hash to count the samples, so it must be populated before that runs.
has sample_names_to_column_index => (
    is  => 'rw',
    isa => 'Maybe[HashRef]',
);

# File that create_summary_output writes the category table to.
has summary_output_filename => (
    is      => 'ro',
    isa     => 'Str',
    default => 'summary_statistics.txt',
);

# Log::Log4perl logger, created on first use by _build_logger.
has logger => (
    is      => 'ro',
    lazy    => 1,
    builder => '_build_logger',
);

# Category name (core, soft_core, shell, cloud) => count, as read by
# create_summary_output. Lazy, but BUILD forces it at construction time.
has gene_category_count => (
    is      => 'ro',
    isa     => 'HashRef',
    lazy    => 1,
    builder => '_build_gene_category_count',
);
# Moose post-construction hook: force the two lazy attributes to be built
# immediately, rows first and category counts second.
sub BUILD {
    my $self = shift;

    $self->_genes_to_rows();
    return $self->gene_category_count();
}
# Builder for the lazy 'logger' attribute: initialises Log::Log4perl in easy
# mode at the ERROR level and returns the logger obtained from get_logger().
#
# NOTE(review): with an ERROR threshold, the warn() call in
# create_summary_output would appear to be filtered out unless the level is
# changed elsewhere -- confirm this is intended.
sub _build_logger
{
    # $ERROR and get_logger() are provided by "use Log::Log4perl qw(:easy)".
    Log::Log4perl->easy_init($ERROR);

    return scalar get_logger();
}
# Write the gene-category summary table to summary_output_filename.
#
# The file holds five tab-separated rows (core, soft core, shell, cloud and
# total genes), each made of a label, the strain-percentage range for that
# category, and the gene count. Categories missing from gene_category_count
# are reported as 0.
#
# Logs a warning through the 'logger' attribute when fewer than 100 core genes
# were counted.
#
# Returns 1 on success.
# Throws Bio::Roary::Exceptions::CouldntWriteToFile when the file cannot be
# opened, or when close() reports that the buffered output could not be
# written.
sub create_summary_output
{
    my ($self) = @_;

    my $filename = $self->summary_output_filename;
    open( my $fh, '>', $filename )
      or Bio::Roary::Exceptions::CouldntWriteToFile->throw( error => "Couldnt write to " . $filename );

    # Thresholds are stored as fractions; the report shows percentages.
    my $core_percentage      = $self->core_definition() * 100;
    my $soft_core_percentage = $self->_soft_core_percentage() * 100;
    my $shell_percentage     = $self->_shell_percentage() * 100;
    my $cloud_percentage     = $self->_cloud_percentage() * 100;

    # A category with no genes has no key in the hash; report it as 0.
    my $count_for       = $self->gene_category_count;
    my $core_genes      = $count_for->{core}      || 0;
    my $soft_core_genes = $count_for->{soft_core} || 0;
    my $shell_genes     = $count_for->{shell}     || 0;
    my $cloud_genes     = $count_for->{cloud}     || 0;
    my $total_genes     = $core_genes + $soft_core_genes + $shell_genes + $cloud_genes;

    if ( $core_genes < 100 ) {
        $self->logger->warn(
            "Very few core genes detected with the current settings. Try modifying the core definition ( -cd 90 ) and/or\n"
          . "the blast identity (-i 70) parameters. Also try checking for contamination (-qc) and ensure you only have one species."
        );
    }

    my @rows = (
        [ 'Core genes',      "($core_percentage% <= strains <= 100%)",                $core_genes ],
        [ 'Soft core genes', "($shell_percentage% <= strains < $soft_core_percentage%)", $soft_core_genes ],
        [ 'Shell genes',     "($cloud_percentage% <= strains < $shell_percentage%)",   $shell_genes ],
        [ 'Cloud genes',     "(0% <= strains < $cloud_percentage%)",                   $cloud_genes ],
        [ 'Total genes',     '(0% <= strains <= 100%)',                                $total_genes ],
    );
    for my $row (@rows) {
        print {$fh} join( "\t", @{$row} ) . "\n";
    }

    # Output is buffered, so a failed write (full disk, quota, ...) may only
    # surface when the handle is closed; do not report success in that case.
    close($fh)
      or Bio::Roary::Exceptions::CouldntWriteToFile->throw( error => "Couldnt write to " . $filename . ": $!" );

    return 1;
}
sub _build_gene_category_count {
my ($self) = @_;
my %gene_category_count;
$self->_soft_core_percentage($self->core_definition);
if ( $self->_soft_core_percentage <= $self->_shell_percentage ) {
$self->_shell_percentage( $self->_soft_core_percentage - 0.01 );
}
my $number_of_samples = keys %{ $self->sample_names_to_column_index };
for my $gene_name ( keys %{ $self->_genes_to_rows } ) {
my $isolates_with_gene = 0;
for ( my $i = $self->_num_fixed_headers ; $i < @{ $self->_genes_to_rows->{$gene_name} } ; $i++ ) {
$isolates_with_gene++
if ( defined( $self->_genes_to_rows->{$gene_name}->[$i] ) && $self->_genes_to_rows->{$gene_name}->[$i] ne "" );
( run in 0.370 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )