Bio-MUST-Drivers
view release on metacpan or search on metacpan
lib/Bio/MUST/Drivers/CdHit.pm view on Meta::CPAN
for my $id ( @{ $self->seq_ids_for($repr) } ) {
$representative_for{ $id->full_id }
= $self->get_representative_with_id($repr);
}
}
return \%representative_for;
}
## use critic
sub BUILD {
my $self = shift;
# provision executable
my $app = use_module('Bio::MUST::Provision::CdHit')->new;
$app->meet();
# setup output files
my $infile = $self->filename;
my $basename = $infile . '.cdhit';
my $outfile = $basename . '.out';
my $outfile_clstr = $basename . '.out.clstr';
# format cd-hit (optional) arguments
my $args = $self->cdhit_args;
my $args_str = stringify_args($args);
# create cd-hit (or cd-hit-est) command
my $pgm = $self->type eq 'prot' ? 'cd-hit' : 'cd-hit-est';
my $cmd = "$pgm -i $infile -o $outfile $args_str > /dev/null 2> /dev/null";
#### $cmd
# try to robustly execute cd-hit
my $ret_code = system( [ 0, 127 ], $cmd);
if ($ret_code == 127) {
carp "[BMD] Warning: cannot execute $pgm command;"
. ' returning without contigs!';
return;
}
# TODO: try to bypass shell (need for absolute path to executable then)
# parse output file
my $parser = CdHit->new(file => $outfile_clstr);
my $mapper = $self->mapper;
# restore original ids for cluster members...
tie my %cluster_seq_ids, 'Tie::IxHash';
for my $abbr_id ( $parser->all_representatives ) {
my @member_ids = map {
SeqId->new( full_id => $mapper->long_id_for($_) )
} @{ $parser->members_for($abbr_id) // [] };
$cluster_seq_ids{ $mapper->long_id_for($abbr_id) } = \@member_ids;
}
# ... and store cluster members
$self->_set_cluster_seq_ids(\%cluster_seq_ids);
# read and store representative seqs...
my $representatives = Ali->load($outfile);
$representatives->dont_guess;
$representatives->restore_ids($mapper); # ... restoring original ids
$self->_set_representatives($representatives);
# unlink temp files
file($_)->remove for $outfile, $outfile_clstr;
return;
}
# TODO: test this
sub filter_clusters {
my $self = shift;
my $list = shift;
my $key = shift // 'classic';
my $cluster = 1;
my %new_cluster_for;
CLUSTER:
for my $repr ( $self->all_cluster_names ) {
my @members = map { $_->full_id } @{ $self->seq_ids_for($repr) };
# start to fill in the hash if no members
if ( scalar @members == 0 ) {
$new_cluster_for{$cluster} = {
representatives => [$repr ],
members => [@members]
};
$cluster++;
next CLUSTER;
}
# ... otherwise select a repr. seq. included in list
else {
# merge repr and members
push @members, $repr;
# and sort by sequence length
my @sorted_ids = map { $_->full_id } sort_by { length($_->seq) }
map { $self->seqs->get_seq_with_id($_) } @members;
my @new_reprs
= intersect( @$list, @sorted_ids ) && $key eq 'classic'
? ( intersect( @$list, @sorted_ids ) )[0] # @sorted_ids order
: intersect( @$list, @sorted_ids ) && $key eq 'all'
? intersect( @$list, @sorted_ids )
: $sorted_ids[0]
;
my @new_members
= $key eq 'classic' ? grep { $_ ne $new_reprs[0] } @sorted_ids
: $key eq 'all' ? array_minus(@sorted_ids, @new_reprs)
: ()
;
$new_cluster_for{$cluster} = {
representatives => [@new_reprs ],
members => [@new_members]
};
( run in 0.582 second using v1.01-cache-2.11-cpan-39bf76dae61 )