Bio-Roary

 view release on metacpan or  search on metacpan

lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm  view on Meta::CPAN

package Bio::Roary::ContigsToGeneIDsFromGFF;
$Bio::Roary::ContigsToGeneIDsFromGFF::VERSION = '3.13.0';
# ABSTRACT: Parse a GFF and efficiently and extract ordered gene ids on each contig


use Moose;
use Bio::Tools::GFF;
with 'Bio::Roary::ParseGFFAnnotationRole';

has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids');

has 'overlapping_hypothetical_protein_ids' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_overlapping_hypothetical_protein_ids');
has '_genes_annotation' => ( is => 'rw', isa => 'ArrayRef', default => sub{[]});

has '_min_nucleotide_overlap_percentage' => ( is => 'ro', isa => 'Int', default => 10);

# Manually parse the GFF file because the BioPerl module is too slow
sub _build_contig_to_ids
{
  my ($self) = @_;
  my %contigs_to_ids;
  my @genes_annotation;
  
  open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file";
  while(<$fh>)
  {
    chomp;
    my $line = $_;   
    my $id_name;
    if($line =~/ID=["']?([^;"']+)["']?;?/i)
    {
      $id_name= $1;
    }
    else
    {
      next;
    }
    
    my @annotation_elements = split(/\t/,$line);
    # Map gene IDs to the contig
    push(@{$contigs_to_ids{$annotation_elements[0]}}, $id_name);
    
    if($line =~/product=["']?([^;,"']+)[,"']?;?/i)
    {
	  my %gene_data; 
      $gene_data{product} = $1;
	  $gene_data{id_name} = $id_name;
      if($line =~ /UniProtKB/ || $line =~ /RefSeq/ || $line =~ /protein motif/)
      {
        $gene_data{database_annotation_exists} = 1;
      }
	  else
	  {
	  	$gene_data{database_annotation_exists} = 0;
	  }
      
      $gene_data{contig}  = $annotation_elements[0];
      $gene_data{start}   = $annotation_elements[1];
      $gene_data{end}     = $annotation_elements[2];
	  push(@genes_annotation,\%gene_data);
    }

  }
  close($fh);
  
  $self->_genes_annotation(\@genes_annotation);
  return \%contigs_to_ids;
}

sub _build_overlapping_hypothetical_protein_ids
{
  my ($self) = @_;
  $self->contig_to_ids;
  
  my %overlapping_protein_ids;
  
  #Checking to see if the current feature is hypotheitical and if the next one has annotation
  for(my $i = 0; $i< (@{$self->_genes_annotation} -1) ; $i++ )
  {
	  my $current_feature = $self->_genes_annotation->[$i];
	  my $next_feature = $self->_genes_annotation->[$i+1];
	  
	  next if($current_feature->{database_annotation_exists} == 1);
	  next unless($current_feature->{product} =~ /hypothetical/i);
	  next unless($next_feature->{database_annotation_exists} == 1);
	  
	  my $start_coord = $current_feature->{start} ;
      my $end_coord   = $current_feature->{end} ;
	  my $comparison_start_coord =$next_feature->{start} ;
	  my $comparison_end_coord   =$next_feature->{end} ;
      if($comparison_start_coord < $end_coord  && $comparison_end_coord > $start_coord )
      {
        my $percent_overlap = $self->_percent_overlap($start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord);
        if($percent_overlap >= $self->_min_nucleotide_overlap_percentage)
        {
          $overlapping_protein_ids{$current_feature->{id_name}}++;
        }
      }
  }
  
  return \%overlapping_protein_ids;
}

sub _percent_overlap
{
   my ($self, $start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord) = @_;
   my $size_of_hypothetical_gene =  $end_coord - $start_coord;
   
   my $lower_bound = $start_coord;
   if($comparison_start_coord > $start_coord)
   {
     $lower_bound = $comparison_start_coord;
   }
   my $upper_bound = $end_coord;
   if($comparison_end_coord < $end_coord   )
   {
      $upper_bound = $comparison_end_coord;
   }
   return (($upper_bound-$lower_bound)*100) / $size_of_hypothetical_gene;
}


sub _build__awk_filter {
    my ($self) = @_;
    return
        'awk \'BEGIN {FS="\t"};{ if ($3 ~/'
      . $self->_tags_to_filter
      . '/) print $1"\t"$4"\t"$5"\t"$9;}\' ';
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Bio::Roary::ContigsToGeneIDsFromGFF - Parse a GFF and efficiently and extract ordered gene ids on each contig

=head1 VERSION

version 3.13.0

=head1 SYNOPSIS

Parse a GFF and efficiently and extract ordered gene ids on each contig
   use Bio::Roary::ContigsToGeneIDsFromGFF;

   my $obj = Bio::Roary::ContigsToGeneIDsFromGFF->new(
     gff_file   => 'abc.gff'
   );
   $obj->contig_to_ids;

=head1 AUTHOR

Andrew J. Page <ap13@sanger.ac.uk>

=head1 COPYRIGHT AND LICENSE



( run in 0.623 second using v1.01-cache-2.11-cpan-ceb78f64989 )