Bio-Roary
view release on metacpan or search on metacpan
lib/Bio/Roary/SplitGroups.pm view on Meta::CPAN
package Bio::Roary::SplitGroups;
$Bio::Roary::SplitGroups::VERSION = '3.13.0';
# ABSTRACT: split groups
use Moose;
use Bio::Roary::AnalyseGroups;
use File::Path qw(make_path remove_tree);
use File::Copy qw(move);
use File::Temp;
use File::Basename;
use File::Slurper 'read_lines';
use Cwd;
has 'groupfile' => ( is => 'ro', isa => 'Str', required => 1 );
has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'outfile' => ( is => 'ro', isa => 'Str', required => 1 );
has 'iterations' => ( is => 'ro', isa => 'Int', default => 5 );
has 'dont_delete' => ( is => 'ro', isa => 'Bool', default => 0 );
has '_neighbourhood_size' => ( is => 'ro', isa => 'Int', default => 5 );
has '_group_filelist' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 );
has '_tmp_dir_object' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_tmp_dir' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__tmp_dir' );
has '_analyse_groups_obj' => ( is => 'ro', lazy_build => 1 );
has '_genes_to_files' => ( is => 'ro', lazy_build => 1 );
has '_genes_to_groups' => ( is => 'rw', isa => 'HashRef' );
has '_first_gene_of_group_which_doesnt_have_paralogs' => ( is => 'rw', isa => 'HashRef', default => sub {{}} );
has '_genes_to_neighbourhood' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_neighbourhood' );
has '_gene_files_temp_dir_obj' =>
( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_do_sorting' => ( is => 'rw', isa => 'Bool', default => 0 ); # set to 1 for testing only
sub _build__tmp_dir {
my ($self) = @_;
return $self->_tmp_dir_object->dirname();
}
sub _build__analyse_groups_obj {
my ( $self ) = @_;
return Bio::Roary::AnalyseGroups->new(
fasta_files => $self->fasta_files,
groups_filename => $self->groupfile
);
}
sub _build__genes_to_files {
my ( $self ) = @_;
return $self->_analyse_groups_obj->_genes_to_file;
}
sub _build__group_filelist {
my ( $self ) = @_;
my $tmp = $self->_tmp_dir;
my @filelist = ( $self->groupfile );
for my $i ( 1..($self->iterations - 1) ){
push( @filelist, "$tmp/group_$i" );
}
push( @filelist, $self->outfile );
return \@filelist;
}
sub _build__genes_to_neighbourhood
{
my ( $self ) = @_;
my %genes_to_neighbourhood;
for my $fasta_file( @{$self->fasta_files})
{
my ( $filename, $directories, $suffix ) = fileparse( $fasta_file, qr/\.[^.]*/ );
system('grep \> '.$fasta_file.'| sed \'s/>//\' >'.$self->_gene_files_temp_dir_obj."/".$filename.$suffix ) ;
my @genes = read_lines($self->_gene_files_temp_dir_obj."/".$filename.$suffix );
for(my $i =0; $i< @genes; $i++)
{
for(my $offset = 1; $offset <= $self->_neighbourhood_size; $offset++)
{
if($i -$offset >= 0)
{
push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i - $offset ]);
}
if($i +$offset <@genes)
{
push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i + $offset ]);
}
}
}
}
return \%genes_to_neighbourhood;
}
sub split_groups {
my ( $self ) = @_;
# iteratively
for my $x ( 0..($self->iterations - 1) ){
my ( $in_groups, $out_groups ) = $self->_get_files_for_iteration( $x );
# read in groups, check paralogs and split
my @newgroups;
my $any_paralogs = 0;
$self->_set_genes_to_groups( $in_groups );
open( my $group_handle, '<', $in_groups );
while( my $line = <$group_handle> ){
my @group = split( /\s+/, $line );
if($self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]})
{
push( @newgroups, \@group );
}
elsif(@group == 1)
{
$self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
push( @newgroups, \@group );
}
elsif( $self->_contains_paralogs( \@group ) ){
my @true_orthologs = @{ $self->_true_orthologs( \@group ) };
push( @newgroups, @true_orthologs);
$any_paralogs = 1;
}
else {
$self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
push( @newgroups, \@group );
}
}
close( $group_handle );
# check if next iteration required, move output if not
unless ($any_paralogs){
move $in_groups, $self->outfile; # input file will be the same as new output file if no splitting has been performed
last;
}
# write split groups to file
open( my $outfile_handle, '>', $out_groups );
for my $g ( @newgroups ) {
my $group_str = join( "\t", @{ $g } ) . "\n";
print $outfile_handle $group_str;
}
close( $outfile_handle );
}
}
sub _set_genes_to_groups {
my ( $self, $groupfile ) = @_;
my %genes2groups;
my $c = 0;
open( my $gfh, '<', $groupfile );
while( my $line = <$gfh> ){
chomp $line;
my @genes = split( /\s+/, $line );
for my $g ( @genes ){
$genes2groups{$g} = $c;
}
$c++;
( run in 0.980 second using v1.01-cache-2.11-cpan-71847e10f99 )