BioPerl-Run
view release on metacpan or search on metacpan
lib/Bio/Tools/Run/TribeMCL.pm view on Meta::CPAN
next CLUSTER;
}
#all the same desc
my %desc = ();
foreach my $desc (@desc) { $desc{$desc}++; }
if ( (keys %desc) == 1 ) {
my ($best_annotation,) = keys %desc;
my $n = grep($_ eq $best_annotation, @desc);
my $perc= int( 100*($n/$total_members) );
$consensus{$i}{desc} = $best_annotation;
$consensus{$i}{conf} = $perc;
next CLUSTER;
}
my %lcshash = ();
my %lcnext = ();
while (@desc) {
# do an all-against-all LCS (longest commong substring) of the
# descriptions of all members; take the resulting strings, and
# again do an all-against-all LCS on them, until we have nothing
# left. The LCS's found along the way are in lcshash.
#
# Incidentally, longest common substring is a misnomer, since it
# is not guaranteed to occur in either of the original strings. It
# is more like the common parts of a Unix diff ...
for (my $i=0;$i<@desc;$i++) {
for (my $j=$i+1;$j<@desc;$j++){
my @list1=split(" ",$desc[$i]);
my @list2=split(" ",$desc[$j]);
my @lcs=Algorithm::Diff::LCS(\@list1,\@list2);
my $lcs=join(" ",@lcs);
$lcshash{$lcs}=1;
$lcnext{$lcs}=1;
}
}
@desc=keys(%lcnext);
undef %lcnext;
}
my ($best_score, $best_perc)=(0, 0);
my @all_cands=sort {length($b) <=>length($a)} keys %lcshash ;
foreach my $candidate_consensus (@all_cands) {
my @temp=split(" ",$candidate_consensus);
my $length=@temp; # num of words in annotation
# see how many members of cluster contain this LCS:
my ($lcs_count)=0;
foreach my $orig_desc (@orig_desc) {
my @list1=split(" ",$candidate_consensus);
my @list2=split(" ",$orig_desc);
my @lcs=Algorithm::Diff::LCS(\@list1,\@list2);
my $lcs=join(" ",@lcs);
if ($lcs eq $candidate_consensus ||
index($orig_desc,$candidate_consensus) != -1 # addition;
# many good (single word) annotations fall out otherwise
) {
$lcs_count++;
# Following is occurs frequently, as LCS is _not_ the longest
# common substring ... so we can't use the shortcut either
# if ( index($orig_desc,$candidate_consensus) == -1 ) {
# warn "lcs:'$lcs' eq cons:'$candidate_consensus' and
# orig:'$orig_desc', but index == -1\n"
# }
}
}
my $perc_with_desc=(($lcs_count/$total_members))*100;
my $perc=($lcs_count/$total_members)*100;
my $score=$perc + ($length*14); # take length into account as well
$score = 0 if $length==0;
if (($perc_with_desc >= 40) && ($length >= 1)) {
if ($score > $best_score) {
$best_score=$score;
$best_perc=$perc;
$best_annotation=$candidate_consensus;
}
}
}
if ($best_perc==0 || $best_perc >= 100 ) {
$best_perc='NA';
}
if ($best_annotation eq '') {
$best_annotation = 'AMBIGUOUS';
}
$consensus{$i}{desc} = $best_annotation;
$consensus{$i}{conf} = $best_perc;
}
return %consensus;
}
sub _setup_description {
my ($self,$file) = @_;
my $goners='().-';
my $spaces= ' ' x length($goners);
my $filter = "tr '$goners' '$spaces' < $file";
open (FILE,"$filter | ") || die "$filter: $!";
my %description;
while(<FILE>){
chomp;
my ($db,$acc,$description,$taxon_str) = split("\t",$_);
$description || $self->throw("Wrongly formated description file");
$description = $self->_apply_edits($description);
if($description{$acc}){
$self->warn("Duplicated entry $acc in description file, overwriting..");
}
$description{$acc} = [$db,$description,$taxon_str];
}
$self->description(\%description);
}
sub as_words {
#add ^ and $ to regexp
my (@words);
my @newwords=();
foreach my $word (@words) { push @newwords, "^$word\$" };
}
( run in 2.398 seconds using v1.01-cache-2.11-cpan-5837b0d9d2c )