Bio-Roary
view release on metacpan or search on metacpan
lib/Bio/Roary/AccessoryClustering.pm view on Meta::CPAN
package Bio::Roary::AccessoryClustering;
$Bio::Roary::AccessoryClustering::VERSION = '3.13.0';
# ABSTRACT: Take a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
use Moose;
use Bio::Roary::External::Cdhit;
with 'Bio::Roary::ClustersRole';
# Required. Handed to Bio::Roary::External::Cdhit as its input_file.
has 'input_file' => ( is => 'ro', isa => 'Str', required => 1 );
# Passed to the CD-hit wrapper as _sequence_identity_threshold; defaults to 0.9.
has 'identity' => ( is => 'ro', isa => 'Num', default => 0.9 );
# Passed to the CD-hit wrapper as its cpus option; defaults to 1.
has 'cpus' => ( is => 'ro', isa => 'Int', default => 1 );
# Output base name given to the CD-hit wrapper; clusters_filename is this
# value with '.clstr' appended.
has '_output_cd_hit_filename' => ( is => 'ro', isa => 'Str', default => '_accessory_clusters' );
# Lazy: cluster name => array ref of member names. The builder runs CD-hit and
# makes sure each cluster name is present in its own member list.
has 'clusters_to_samples' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_clusters_to_samples' );
# Lazy: inverse of clusters_to_samples, sample name => cluster name.
has 'samples_to_clusters' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_samples_to_clusters' );
# Lazy: sample name => 1 / (number of members in the sample's cluster).
has 'sample_weights' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_sample_weights' );
# Lazy: built as _output_cd_hit_filename . '.clstr'.
has 'clusters_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_clusters_filename' );
# NOTE(review): the builder '_build__clusters' is not defined in this file.
# Presumably it comes from Bio::Roary::ClustersRole, or this attribute is never
# read (it is lazy, so a missing builder only fails on first access) - confirm.
has 'clusters' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__clusters' );
# Builder for sample_weights.
#
# Every sample listed under a cluster in clusters_to_samples receives the
# weight 1 / N, where N is the number of samples listed for that cluster.
# A sample listed under more than one cluster keeps whichever weight was
# written last (hash iteration order).
#
# Returns a hash ref: sample name => weight.
sub _build_sample_weights {
    my ($self) = @_;

    my $samples_for = $self->clusters_to_samples;
    my %weight_of;

    for my $cluster ( keys %{$samples_for} ) {
        my @members = @{ $samples_for->{$cluster} };

        # An empty cluster contributes nothing; skipping it here also keeps
        # the division below away from zero.
        next if !@members;

        my $share = 1 / scalar @members;
        @weight_of{@members} = ($share) x @members;
    }

    return \%weight_of;
}
# Builder for samples_to_clusters.
#
# Inverts clusters_to_samples: each sample name listed under a cluster is
# mapped back to that cluster's name. A sample listed under more than one
# cluster ends up mapped to whichever cluster was visited last.
#
# Returns a hash ref: sample name => cluster name.
sub _build_samples_to_clusters {
    my ($self) = @_;

    my $samples_for = $self->clusters_to_samples;
    my %cluster_of;

    for my $cluster ( keys %{$samples_for} ) {
        $cluster_of{$_} = $cluster for @{ $samples_for->{$cluster} };
    }

    return \%cluster_of;
}
# Builder for clusters_filename.
#
# Returns the CD-hit output base name (_output_cd_hit_filename) with the
# '.clstr' suffix appended.
sub _build_clusters_filename {
    my ($self) = @_;

    my $output_base = $self->_output_cd_hit_filename;
    return "${output_base}.clstr";
}
# Builder for clusters_to_samples.
#
# Runs CD-hit on input_file through Bio::Roary::External::Cdhit, using
# _output_cd_hit_filename as the output base, a length difference cutoff of 1,
# the identity attribute as the sequence identity threshold and the cpus
# attribute for the cpus option. It then reads the hash from _clustered_genes
# (not defined in this file - presumably supplied by Bio::Roary::ClustersRole)
# and appends each cluster's own name to its member list when it is missing.
#
# Returns the very hash ref obtained from _clustered_genes, i.e. the member
# lists are extended in place rather than copied.
sub _build_clusters_to_samples {
    my ($self) = @_;

    Bio::Roary::External::Cdhit->new(
        input_file                   => $self->input_file,
        output_base                  => $self->_output_cd_hit_filename,
        _length_difference_cutoff    => 1,
        _sequence_identity_threshold => $self->identity,
        cpus                         => $self->cpus
    )->run();

    my $genes_for = $self->_clustered_genes;

  CLUSTER:
    for my $representative ( keys %{$genes_for} ) {
        for my $member ( @{ $genes_for->{$representative} } ) {

            # Already listed under its own name: nothing to add.
            next CLUSTER if $member eq $representative;
        }
        push @{ $genes_for->{$representative} }, $representative;
    }

    return $genes_for;
}
no Moose;
( run in 1.753 second using v1.01-cache-2.11-cpan-ceb78f64989 )