Bio-Roary
view release on metacpan or search on metacpan
lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm view on Meta::CPAN
package Bio::Roary::ContigsToGeneIDsFromGFF;
$Bio::Roary::ContigsToGeneIDsFromGFF::VERSION = '3.13.0';
# ABSTRACT: Parse a GFF and efficiently extract ordered gene ids on each contig
use Moose;
use Bio::Tools::GFF;
with 'Bio::Roary::ParseGFFAnnotationRole';
# Contig name => arrayref of gene IDs found on that contig, in input order.
# Populated on first access by _build_contig_to_ids.
has 'contig_to_ids' => (
    is      => 'rw',
    isa     => 'HashRef',
    lazy    => 1,
    builder => '_build_contig_to_ids',
);

# Read-only hashref, computed on first access by
# _build_overlapping_hypothetical_protein_ids.
has 'overlapping_hypothetical_protein_ids' => (
    is      => 'ro',
    isa     => 'HashRef',
    lazy    => 1,
    builder => '_build_overlapping_hypothetical_protein_ids',
);

# One hashref per annotated gene. Starts out empty and is replaced wholesale
# by _build_contig_to_ids once the GFF has been read.
has '_genes_annotation' => (
    is      => 'rw',
    isa     => 'ArrayRef',
    default => sub { [] },
);

# Integer threshold, 10 unless overridden at construction time.
# NOTE(review): presumably a percentage used when deciding whether two genes
# overlap -- confirm against the overlap builder.
has '_min_nucleotide_overlap_percentage' => (
    is      => 'ro',
    isa     => 'Int',
    default => 10,
);
# Manually parse the GFF file because the BioPerl module is too slow.
#
# Builder for the lazy 'contig_to_ids' attribute. Reads the tab-separated
# lines emitted by the command in $self->_gff_fh_input_string.
#
# Returns:
#   A hashref mapping the first column of each line (the contig) to an
#   arrayref of the ID= values seen on that contig, in input order.
#
# Side effects:
#   Stores in $self->_genes_annotation an arrayref holding one hashref
#   (product, id_name, database_annotation_exists, contig, start, end) for
#   every line that carries both an ID= and a product= attribute.
#
# Lines without an ID= attribute are ignored entirely.
# Dies if the input command cannot be started.
sub _build_contig_to_ids
{
    my ($self) = @_;

    my %contigs_to_ids;
    my @genes_annotation;

    open( my $fh, '-|', $self->_gff_fh_input_string )
      or die "Couldnt open GFF file: $!";

    # Read into a lexical rather than the global $_: this builder runs lazily,
    # possibly from inside a caller's for/map/grep, and "while (<$fh>)" would
    # overwrite the caller's aliased element (leaving it undef at EOF).
    while ( my $line = <$fh> )
    {
        chomp $line;

        # No ID attribute: nothing to record for this line.
        next unless $line =~ /ID=["']?([^;"']+)["']?;?/i;
        my $id_name = $1;

        my @annotation_elements = split( /\t/, $line );
        my $contig = $annotation_elements[0];

        # Map gene IDs to the contig
        push( @{ $contigs_to_ids{$contig} }, $id_name );

        # Only genes with a product attribute get an annotation record.
        next unless $line =~ /product=["']?([^;,"']+)[,"']?;?/i;
        my $product = $1;    # saved now; the matches below reset $1

        my $has_database_annotation =
          (      $line =~ /UniProtKB/
              || $line =~ /RefSeq/
              || $line =~ /protein motif/ ) ? 1 : 0;

        # NOTE(review): columns 1 and 2 are taken as start and end, which
        # assumes the input command has already reduced each GFF record to
        # contig/start/end/attributes -- confirm against _gff_fh_input_string.
        push(
            @genes_annotation,
            {
                product                    => $product,
                id_name                    => $id_name,
                database_annotation_exists => $has_database_annotation,
                contig                     => $contig,
                start                      => $annotation_elements[1],
                end                        => $annotation_elements[2],
            }
        );
    }
    close($fh);

    $self->_genes_annotation( \@genes_annotation );
    return \%contigs_to_ids;
}
sub _build_overlapping_hypothetical_protein_ids
{
my ($self) = @_;
( run in 1.255 second using v1.01-cache-2.11-cpan-98e64b0badf )