Bio-Roary

 view release on metacpan or  search on metacpan

lib/Bio/Roary/SplitGroups.pm  view on Meta::CPAN

package Bio::Roary::SplitGroups;
$Bio::Roary::SplitGroups::VERSION = '3.13.0';
# ABSTRACT: split groups


use Moose;
use Bio::Roary::AnalyseGroups;
use File::Path qw(make_path remove_tree);
use File::Copy qw(move);
use File::Temp;
use File::Basename;
use File::Slurper 'read_lines';
use Cwd;


has 'groupfile'   => ( is => 'ro', isa => 'Str',      required => 1 );
has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'outfile'     => ( is => 'ro', isa => 'Str',      required => 1 );
has 'iterations'  => ( is => 'ro', isa => 'Int',      default  => 5 );
has 'dont_delete' => ( is => 'ro', isa => 'Bool',     default  => 0 );

has '_neighbourhood_size' => ( is => 'ro', isa => 'Int', default => 5 );

has '_group_filelist'  => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 );
has '_tmp_dir_object' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_tmp_dir'        => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__tmp_dir' );

has '_analyse_groups_obj' => ( is => 'ro', lazy_build => 1 );
has '_genes_to_files'     => ( is => 'ro', lazy_build => 1 );
has '_genes_to_groups'    => ( is => 'rw', isa => 'HashRef' );

has '_first_gene_of_group_which_doesnt_have_paralogs'    => ( is => 'rw', isa => 'HashRef', default => sub {{}} );

has '_genes_to_neighbourhood' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_neighbourhood' );


has '_gene_files_temp_dir_obj' =>
  ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );


has '_do_sorting' => ( is => 'rw', isa => 'Bool', default => 0 ); # set to 1 for testing only

sub _build__tmp_dir {
    my ($self) = @_;
    return $self->_tmp_dir_object->dirname();
}

sub _build__analyse_groups_obj {
	my ( $self ) = @_;
	
	return Bio::Roary::AnalyseGroups->new(
		fasta_files     => $self->fasta_files,
		groups_filename => $self->groupfile
	);
}

sub _build__genes_to_files {
	my ( $self ) = @_;
	return $self->_analyse_groups_obj->_genes_to_file;
}

sub _build__group_filelist {
	my ( $self ) = @_;
	my $tmp = $self->_tmp_dir;

	my @filelist = ( $self->groupfile );
	for my $i ( 1..($self->iterations - 1) ){
		push( @filelist, "$tmp/group_$i" );
	}
	push( @filelist, $self->outfile );

	return \@filelist;
}

sub _build__genes_to_neighbourhood
{
  my ( $self ) = @_;
  my %genes_to_neighbourhood;
  for my $fasta_file( @{$self->fasta_files})
  {
	my ( $filename, $directories, $suffix ) = fileparse( $fasta_file, qr/\.[^.]*/ );
  	system('grep \> '.$fasta_file.'| sed  \'s/>//\' >'.$self->_gene_files_temp_dir_obj."/".$filename.$suffix ) ;
	
	my @genes = read_lines($self->_gene_files_temp_dir_obj."/".$filename.$suffix );
	
	for(my $i =0; $i< @genes; $i++)
	{
		for(my $offset = 1; $offset <= $self->_neighbourhood_size; $offset++)
		{
			if($i -$offset >= 0)
			{
			   push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i - $offset ]);
		    }
			if($i +$offset <@genes)
			{
			   push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i + $offset ]);
		    }
		}
	}
  }
  return \%genes_to_neighbourhood;
}

sub split_groups {
	my ( $self ) = @_;

	# iteratively
	for my $x ( 0..($self->iterations - 1) ){
		my ( $in_groups, $out_groups ) = $self->_get_files_for_iteration( $x ); 

		# read in groups, check paralogs and split
		my @newgroups;
		my $any_paralogs = 0;
		$self->_set_genes_to_groups( $in_groups );
		open( my $group_handle, '<', $in_groups );
		while( my $line = <$group_handle> ){
			my @group = split( /\s+/, $line );

			if($self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]})
			{
				push( @newgroups, \@group );
			}
			elsif(@group == 1)
			{
				$self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
				push( @newgroups, \@group );
			}
			elsif( $self->_contains_paralogs( \@group ) ){
				my @true_orthologs = @{ $self->_true_orthologs( \@group ) };
				push( @newgroups,  @true_orthologs);
				$any_paralogs = 1;
			}
			else {
				$self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
				push( @newgroups, \@group );
			}
		}
		close( $group_handle );

		# check if next iteration required, move output if not
		unless ($any_paralogs){
			move $in_groups, $self->outfile; # input file will be the same as new output file if no splitting has been performed
			last;
		}

		# write split groups to file
		open( my $outfile_handle, '>', $out_groups );
		for my $g ( @newgroups ) {
			my $group_str = join( "\t", @{ $g } ) . "\n";
			print $outfile_handle $group_str;
		}
		close( $outfile_handle );
	}
}

sub _set_genes_to_groups {
	my ( $self, $groupfile ) = @_;

	my %genes2groups;
	my $c = 0;
	open( my $gfh, '<', $groupfile );
	while( my $line = <$gfh> ){
		chomp $line;
		my @genes = split( /\s+/, $line );
		for my $g ( @genes ){
			$genes2groups{$g} = $c;
		}
		$c++;



( run in 0.980 second using v1.01-cache-2.11-cpan-71847e10f99 )