Bio-Gonzales

 view release on metacpan or  search on metacpan

lib/Bio/Gonzales/Domain/Group.pm  view on Meta::CPAN

    $i++;
  }
  return \%group;
}

sub filter_hits {
  my ($self) = @_;

  my $q          = $self->search_result;
  my $id_lookup  = { map { $_ => 1 } @{ $self->filter_ids } };
  my $group_idcs = $self->group_idcs;

  my @hit_col;
  while ( my ( $domain_id, $result ) = each %$q ) {
    next unless ( exists( $group_idcs->{$domain_id} ) );
    while ( my ( $seq_id, $hits ) = each %$result ) {
      next if ( $id_lookup && !exists( $id_lookup->{$seq_id} ) );
      for my $hit (@$hits) {
        $hit->{domain_id} = $domain_id;
        $hit->{seq_id}    = $seq_id;
        $hit->{group_id}  = $group_idcs->{$domain_id};
        push @hit_col, $hit;
      }
    }
  }
  return $self->_pick_best_domain_hits( \@hit_col )
    if ( $self->only_best_hit );

  return \@hit_col;
}

sub _pick_best_domain_hits {
  my ( $self, $hit_list ) = @_;

  my %group_collection;
  for my $h (@$hit_list) {
    my $id = $h->{seq_id} . "__//__" . $h->{group_id};
    $group_collection{$id} //= [];

    push @{ $group_collection{$id} }, $h;
  }

  my @best_ones;
  for my $h ( values %group_collection ) {
    my $max;

    #find the domain with the maximal score
    map { $max = $_ if ( !$max || $_->{score} > $max->{score} ) } @$h;
    push @best_ones, $max;
  }

  return \@best_ones;
}

sub to_gff {
  my ( $self, $dest ) = @_;
  my $hits = $self->filter_hits;

  my $gffout = Bio::Gonzales::Feat::IO::GFF3->new( file_or_fh => $dest, mode => '>' );

  my $gidcs = $self->group_idcs;
  while ( my ( $d, $idx ) = each %$gidcs ) {
    $gffout->write_comment(" Group $idx contains domain: $d");
  }

  my $i = 0;
  for my $max (@$hits) {

    $gffout->write_feat(
      Bio::Gonzales::Feat->new(
        seq_id     => $max->{seq_id},
        source     => 'hmmer',
        type       => 'region',
        start      => $max->{env_from},
        end        => $max->{env_to},
        strand     => 0,
        score      => $max->{score},
        attributes => {
          ID       => [ "hit_" . $i++ ],
          domainID => [ $max->{domain_id} ],
          groupID  => [ $max->{group_id} ]
        }
      )
    );
  }

  $gffout->close;
}

sub filter_ids {
  my ($self) = @_;

  my $q       = $self->search_result;
  my $rgroups = $self->required_domains;
  return unless ($rgroups);

  my %id_occurrence;

  # select the ids for every group by...
  for my $domain_synonyms (@$rgroups) {
    my @ids;
    # iterating through the domains ...
    for my $domain_synonym (@$domain_synonyms) {
      # and storing all ids in an group-wide id list
      push @ids, grep { @{ $q->{$domain_synonym}{$_} } > 0 } keys %{ $q->{$domain_synonym} };
    }

    # we need to eliminate duplicate ids (multiple synonyms per group)
    # and increment the occurrence of the found ids
    map { $id_occurrence{$_}++ } uniq @ids;

  }

  # if an id is in every group it occurred #num of rgroups times and is not forbidden, return it
  return [ grep { $id_occurrence{$_} == @$rgroups && !$self->_is_forbidden($_) } keys %id_occurrence ];
}

sub _is_forbidden {
  my ( $self, $id ) = @_;

  my $q       = $self->search_result;
  my $fgroups = $self->forbidden_domains;



( run in 1.171 second using v1.01-cache-2.11-cpan-5b529ec07f3 )