Bio-Roary

 view release on metacpan or  search on metacpan

lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm  view on Meta::CPAN

package Bio::Roary::ContigsToGeneIDsFromGFF;
$Bio::Roary::ContigsToGeneIDsFromGFF::VERSION = '3.13.0';
# ABSTRACT: Parse a GFF and efficiently extract ordered gene ids on each contig


use Moose;
use Bio::Tools::GFF;
with 'Bio::Roary::ParseGFFAnnotationRole';

# Contig name => ArrayRef of gene IDs found on that contig.
# Built on first access by _build_contig_to_ids.
has 'contig_to_ids' => (
  is      => 'rw',
  isa     => 'HashRef',
  lazy    => 1,
  builder => '_build_contig_to_ids',
);

# Built on first access by _build_overlapping_hypothetical_protein_ids.
has 'overlapping_hypothetical_protein_ids' => (
  is      => 'ro',
  isa     => 'HashRef',
  lazy    => 1,
  builder => '_build_overlapping_hypothetical_protein_ids',
);

# One HashRef per annotated gene. Starts out empty and is only filled in as a
# side effect of _build_contig_to_ids, i.e. once contig_to_ids has been built.
has '_genes_annotation' => (
  is      => 'rw',
  isa     => 'ArrayRef',
  default => sub { [] },
);

# NOTE(review): presumably the overlap threshold, in percent, applied when
# looking for overlapping hypothetical proteins -- confirm against its users.
has '_min_nucleotide_overlap_percentage' => (
  is      => 'ro',
  isa     => 'Int',
  default => 10,
);

# Build the contig => [gene IDs] lookup by parsing the GFF stream by hand,
# because the BioPerl module is too slow.
#
# Reads, line by line, the output of the command returned by
# _gff_fh_input_string. Lines without an ID= attribute are skipped. For every
# other line the ID is appended, in input order, to the list kept for the
# contig named in the first tab-separated column.
#
# Side effect: every line that also carries a product= attribute is recorded
# as a hash (product, id_name, database_annotation_exists, contig, start, end)
# and the collected list is stored in _genes_annotation.
#
# Returns: HashRef mapping contig name to an ArrayRef of gene IDs.
# Dies:    if the input command cannot be started.
sub _build_contig_to_ids
{
  my ($self) = @_;
  my %contigs_to_ids;
  my @genes_annotation;

  open( my $fh, '-|', $self->_gff_fh_input_string )
    or die "Couldnt open GFF file: $!";

  # Read into a lexical instead of the global $_: a bare while(<$fh>) does not
  # localise $_, so a caller that is itself looping over $_ would have its
  # loop variable (and the aliased list element) overwritten.
  while ( my $line = <$fh> )
  {
    chomp $line;

    # Only lines carrying an ID attribute are of interest.
    my ($id_name) = $line =~ /ID=["']?([^;"']+)["']?;?/i;
    next unless defined $id_name;

    my @annotation_elements = split( /\t/, $line );
    my $contig = $annotation_elements[0];

    # Map gene IDs to the contig
    push( @{ $contigs_to_ids{$contig} }, $id_name );

    # Genes with a product description are additionally kept with their details.
    my ($product) = $line =~ /product=["']?([^;,"']+)[,"']?;?/i;
    next unless defined $product;

    # A reference to one of these sources counts as database-backed annotation.
    my $database_annotation_exists =
      ( $line =~ /UniProtKB/ || $line =~ /RefSeq/ || $line =~ /protein motif/ )
      ? 1
      : 0;

    # NOTE(review): columns 1 and 2 are taken as start and end, which would be
    # wrong for raw GFF. Presumably the command behind _gff_fh_input_string
    # has already reduced each record to contig/start/end/attributes --
    # confirm against that method.
    push(
      @genes_annotation,
      {
        product                    => $product,
        id_name                    => $id_name,
        database_annotation_exists => $database_annotation_exists,
        contig                     => $contig,
        start                      => $annotation_elements[1],
        end                        => $annotation_elements[2],
      }
    );
  }

  # The exit status of the pipeline is deliberately not checked, as before.
  close($fh);

  $self->_genes_annotation( \@genes_annotation );
  return \%contigs_to_ids;
}

sub _build_overlapping_hypothetical_protein_ids
{
  my ($self) = @_;



( run in 1.255 second using v1.01-cache-2.11-cpan-98e64b0badf )