BioPerl-Run

 view release on metacpan or  search on metacpan

lib/Bio/Tools/Run/TribeMCL.pm  view on Meta::CPAN

            next CLUSTER;
        }

        #all the same desc
        my %desc = ();
        foreach my $desc (@desc) {        $desc{$desc}++;     }
        if ( (keys %desc) == 1 ) {
          my ($best_annotation,) = keys %desc;
          my $n = grep($_ eq $best_annotation, @desc);
          my $perc= int( 100*($n/$total_members) );
          $consensus{$i}{desc} = $best_annotation;
          $consensus{$i}{conf} = $perc;
          next CLUSTER;
        }
      
        my %lcshash = ();
        my %lcnext = ();
        while (@desc) {
          # do an all-against-all LCS (longest commong substring) of the
          # descriptions of all members; take the resulting strings, and
          # again do an all-against-all LCS on them, until we have nothing
          # left. The LCS's found along the way are in lcshash.
          #
          # Incidentally, longest common substring is a misnomer, since it
          # is not guaranteed to occur in either of the original strings. It
          # is more like the common parts of a Unix diff ...
          for (my $i=0;$i<@desc;$i++) {
            for (my $j=$i+1;$j<@desc;$j++){
                my @list1=split(" ",$desc[$i]);
                my @list2=split(" ",$desc[$j]);
                my @lcs=Algorithm::Diff::LCS(\@list1,\@list2);
                my $lcs=join(" ",@lcs);
                $lcshash{$lcs}=1;
                $lcnext{$lcs}=1;
            }
          }
          @desc=keys(%lcnext);
          undef %lcnext;
        }
        my ($best_score, $best_perc)=(0, 0);
        my @all_cands=sort {length($b) <=>length($a)} keys %lcshash ;
        foreach my $candidate_consensus (@all_cands) {
          my @temp=split(" ",$candidate_consensus);
          my $length=@temp;               # num of words in annotation

          # see how many members of cluster contain this LCS:

          my ($lcs_count)=0;
          foreach my $orig_desc (@orig_desc) {
            my @list1=split(" ",$candidate_consensus);
            my @list2=split(" ",$orig_desc);
            my @lcs=Algorithm::Diff::LCS(\@list1,\@list2);
            my $lcs=join(" ",@lcs);

            if ($lcs eq $candidate_consensus ||
                index($orig_desc,$candidate_consensus) != -1 # addition;
                # many good (single word) annotations fall out otherwise
               ) {
                $lcs_count++;

                # Following is occurs frequently, as LCS is _not_ the longest
                # common substring ... so we can't use the shortcut either

                # if ( index($orig_desc,$candidate_consensus) == -1 ) {
                #   warn "lcs:'$lcs' eq cons:'$candidate_consensus' and
                # orig:'$orig_desc', but index == -1\n"
                # }
            }
          }
          my $perc_with_desc=(($lcs_count/$total_members))*100;
          my $perc=($lcs_count/$total_members)*100;
          my $score=$perc + ($length*14); # take length into account as well
          $score = 0 if $length==0;
          if (($perc_with_desc >= 40) && ($length >= 1)) {
            if ($score > $best_score) {
                $best_score=$score;
                $best_perc=$perc;
                $best_annotation=$candidate_consensus;
            }
          }
      }
      if ($best_perc==0 || $best_perc >= 100 )  {
        $best_perc='NA';
      }
      if  ($best_annotation eq  '')  {
        $best_annotation = 'AMBIGUOUS';
      }
      $consensus{$i}{desc} = $best_annotation;
      $consensus{$i}{conf} = $best_perc;
  }
  return %consensus;
}

sub _setup_description {
    my ($self,$file) = @_;
    my $goners='().-';
    my $spaces= ' ' x length($goners);
    my $filter = "tr '$goners' '$spaces' < $file";
    open (FILE,"$filter | ") || die "$filter: $!";
    my %description;
    while(<FILE>){
        chomp;
        my ($db,$acc,$description,$taxon_str) = split("\t",$_);
        $description || $self->throw("Wrongly formated description file");
        $description = $self->_apply_edits($description);

        if($description{$acc}){
            $self->warn("Duplicated entry $acc in description file, overwriting..");
        }
        $description{$acc} = [$db,$description,$taxon_str];
    }
    $self->description(\%description);
}

sub as_words {
    #add ^ and $ to regexp
    my (@words);
    my @newwords=();

    foreach my $word (@words) { push @newwords, "^$word\$" };
}



( run in 2.398 seconds using v1.01-cache-2.11-cpan-5837b0d9d2c )