Bio-Roary

 view release on metacpan or  search on metacpan

lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm  view on Meta::CPAN

package Bio::Roary::ExtractCoreGenesFromSpreadsheet;
$Bio::Roary::ExtractCoreGenesFromSpreadsheet::VERSION = '3.13.0';
# ABSTRACT: Take in a spreadsheet produced by the pipeline and identify the core genes.


use Moose;
use Text::CSV;
use Bio::Roary::GroupStatistics;
use POSIX;

# --- Public attributes -------------------------------------------------------
# spreadsheet: required path string; it is what _build__input_spreadsheet_fh
# passes to open().
has 'spreadsheet'           => ( is => 'ro', isa  => 'Str',       required => 1 );
# _csv_parser: Text::CSV instance, created lazily by _build__csv_parser.
has '_csv_parser'           => ( is => 'ro', isa  => 'Text::CSV', lazy     => 1, builder => '_build__csv_parser' );
# _input_spreadsheet_fh: filehandle on 'spreadsheet', opened lazily on first
# access. No type constraint is declared for it.
has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1,           builder  => '_build__input_spreadsheet_fh' );
# ordered_core_genes: array reference built lazily by _build_ordered_core_genes
# (the builder is not shown in this part of the file).
has 'ordered_core_genes'    => ( is => 'ro', isa  => 'ArrayRef',  lazy     => 1, builder => '_build_ordered_core_genes' );
# core_definition: multiplier applied to the number of isolates to obtain
# _min_no_isolates_for_core. Presumably the fraction of isolates a gene must
# appear in to count as core (1 = all of them) -- TODO confirm against callers.
has 'core_definition'       => ( is => 'ro', isa  => 'Num',       default  => 1 );
# sample_names / sample_names_to_genes: read-write containers that start empty.
# NOTE(review): where they get populated is not visible here.
has 'sample_names'          => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef',  default => sub { {} } );
# allow_paralogs: boolean switch, off by default.
# NOTE(review): its effect is not visible in this part of the file.
has 'allow_paralogs'        => ( is => 'rw', isa => 'Bool',     default => 0 );

# --- Internal state ----------------------------------------------------------
# _number_of_isolates is set by _update_number_of_isolates from the header row.
has '_number_of_isolates'               => ( is => 'rw', isa => 'Int' );
# Integer column positions within a spreadsheet row. _gene_column is assigned
# in _setup_column_mappings; the remaining ones are presumably assigned there
# as well -- TODO confirm.
has '_gene_column'                      => ( is => 'rw', isa => 'Int' );
has '_num_isolates_column'              => ( is => 'rw', isa => 'Int' );
has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int' );
has '_genome_fragement_column'          => ( is => 'rw', isa => 'Int' );
has '_order_within_fragement_column'    => ( is => 'rw', isa => 'Int' );
# Lazily computed as _number_of_isolates * core_definition. The builder reads
# _number_of_isolates, so that attribute should be set before the first read.
has '_min_no_isolates_for_core'         => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' );

# Builder for '_min_no_isolates_for_core'.
# Returns the current number of isolates multiplied by 'core_definition'.
sub _build__min_no_isolates_for_core {
    my $self = shift;
    return $self->_number_of_isolates * $self->core_definition;
}

# Builder for '_csv_parser'.
# Returns a new Text::CSV object with the 'binary' and 'always_quote'
# options switched on.
sub _build__csv_parser {
    my $self = shift;
    return Text::CSV->new(
        {
            binary       => 1,
            always_quote => 1,
        }
    );
}

# Builder for '_input_spreadsheet_fh'.
#
# Opens the file named by the 'spreadsheet' attribute for reading and returns
# the lexical filehandle.
#
# The three-argument form of open is used so the path is always treated as a
# literal filename; with the two-argument form, leading or trailing characters
# such as '>', '<' or '|' in the path are interpreted as an open mode or as a
# command to run.
#
# Dies with the path and the OS error if the file cannot be opened, instead of
# returning a handle that was never opened.
sub _build__input_spreadsheet_fh {
    my ($self) = @_;
    my $spreadsheet = $self->spreadsheet;
    open( my $fh, '<', $spreadsheet )
      or die "Cannot open spreadsheet '$spreadsheet' for reading: $!";
    return $fh;
}

# Store the number of isolates implied by a header row.
#
# $header_row is an array reference of header cells. The count stored in
# '_number_of_isolates' is the number of header cells minus the number of
# entries returned by Bio::Roary::GroupStatistics->fixed_headers.
# Returns whatever the '_number_of_isolates' setter returns.
sub _update_number_of_isolates {
    my $self       = shift;
    my $header_row = shift;

    my $total_columns = scalar @{$header_row};
    my $fixed_columns = scalar @{ Bio::Roary::GroupStatistics->fixed_headers };

    return $self->_number_of_isolates( $total_columns - $fixed_columns );
}

sub _setup_column_mappings {
    my ( $self, $header_row ) = @_;

    # current ordering
    my %columns_of_interest_mappings = (
        'Gene'                      => 0,
        'No. isolates'              => 3,
        'Avg sequences per isolate' => 5,
        'Genome Fragment'           => 6,
        'Order within Fragment'     => 7,
        'QC'                        => 10,
    );

    # Dynamically overwrite the default ordering
    for ( my $i = 0 ; $i < @{$header_row} ; $i++ ) {
        for my $col_name (%columns_of_interest_mappings) {
            if ( $header_row->[$i] eq $col_name ) {
                $columns_of_interest_mappings{$col_name} = $i;
                last;
            }
        }
    }
    $self->_gene_column( $columns_of_interest_mappings{'Gene'} );



( run in 0.682 second using v1.01-cache-2.11-cpan-98e64b0badf )