Bio-AutomatedAnnotation

 view release on metacpan or  search on metacpan

lib/Bio/AutomatedAnnotation/GeneNamesFromGFF.pm  view on Meta::CPAN

package Bio::AutomatedAnnotation::GeneNamesFromGFF;
$Bio::AutomatedAnnotation::GeneNamesFromGFF::VERSION = '1.182770';
# ABSTRACT: Parse a GFF and efficiently extract out the Gene Names


use Moose;
use Bio::Tools::GFF;

# Path of the GFF file to read (required constructor argument).
has gff_file => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Feature-type pattern for the features of interest; spliced into the awk
# pre-filter as a regular expression.
has _tags_to_filter => (
    is      => 'ro',
    isa     => 'Str',
    default => 'CDS',
);

# Feature-type pattern for features the awk pre-filter always drops.
has _tags_to_ignore => (
    is      => 'ro',
    isa     => 'Str',
    default => 'rRNA|tRNA|ncRNA|tmRNA',
);

# Bio::Tools::GFF parser, created on first use.
has _gff_parser => (
    is      => 'ro',
    isa     => 'Bio::Tools::GFF',
    lazy    => 1,
    builder => '_build__gff_parser',
);

# awk command (without the input file name) used to pre-filter the GFF.
has _awk_filter => (
    is      => 'ro',
    isa     => 'Str',
    lazy    => 1,
    builder => '_build__awk_filter',
);

# Shell command fragment that removes the trailing FASTA section.
has _remove_sequence_filter => (
    is      => 'ro',
    isa     => 'Str',
    lazy    => 1,
    builder => '_build__remove_sequence_filter',
);

# Hashref of the gene names found in the file, computed on first access.
has gene_names => (
    is      => 'ro',
    isa     => 'HashRef',
    lazy    => 1,
    builder => '_build_gene_names',
);

# Builder for the 'gene_names' attribute.
#
# Reads every feature from the GFF parser and, for each feature whose primary
# tag is 'CDS' and which carries a 'gene' tag, records the first 'gene' value
# with any double quotes removed. Empty or missing values are skipped.
#
# Returns a hashref whose keys are the distinct gene names (every value is 1).
# The parser is closed once all features have been read.
sub _build_gene_names {
    my ($self) = @_;
    my %gene_names;

    # The attribute is lazy, so this is the same parser object on every access.
    my $parser = $self->_gff_parser;

    # The loop condition already ends the scan when next_feature() returns a
    # false value, so no extra definedness check is needed in the body.
    while ( my $feature = $parser->next_feature() ) {
        next if $feature->primary_tag ne 'CDS';
        next if !$feature->has_tag('gene');

        # Only the first value of the tag is used; any others are discarded.
        my ($gene_name) = $feature->get_tag_values('gene');
        next if !defined $gene_name;

        $gene_name =~ s/"//g;
        next if $gene_name eq '';

        $gene_names{$gene_name} = 1;
    }
    $parser->close();

    return \%gene_names;
}

# Builder for the '_gff_parser' attribute.
#
# Opens a read pipe from the shell command returned by _gff_fh_input_string
# and hands the resulting filehandle to a Bio::Tools::GFF parser configured
# for GFF version 3. Dies, including the system error, if the pipe cannot be
# opened.
#
# Bio::Tools::GFF->ignore_sequence(1) doesn't work with our data: it triggers
# an infinite loop.
sub _build__gff_parser {
    my ($self) = @_;

    my $command = $self->_gff_fh_input_string;
    open( my $fh, '-|', $command ) or die "Couldnt open GFF file: $!";

    # BioPerl named arguments are conventionally dash-prefixed; spell both the
    # same way rather than relying on lenient argument normalisation.
    return Bio::Tools::GFF->new( -fh => $fh, -gff_version => 3 );
}

# Assemble the shell pipeline that feeds the GFF parser:
#   <awk filter> '<gff file>' | <remove-sequence filter>
#
# The file path is wrapped in single quotes (with embedded single quotes
# escaped) so that paths containing spaces or shell metacharacters reach awk
# as one literal argument instead of being interpreted by the shell. As a
# consequence the path is taken literally: the shell performs no expansion
# on it.
#
# Returns the command line as a string.
sub _gff_fh_input_string {
    my ($self) = @_;

    # Inside single quotes the only character needing care is the single quote
    # itself: close the quote, emit an escaped quote, reopen the quote.
    ( my $quoted_file = $self->gff_file ) =~ s/'/'\\''/g;

    return $self->_awk_filter . " '" . $quoted_file . "' | " . $self->_remove_sequence_filter;
}

# Builder for the '_awk_filter' attribute.
#
# Parsing a GFF file with Perl is slow, so the file is pre-filtered with awk.
# The generated program splits lines on tabs and then:
#   - prints lines whose 3rd column matches _tags_to_filter and whose 9th
#     column contains "gene=";
#   - drops any other line whose 3rd column matches _tags_to_filter or
#     _tags_to_ignore;
#   - prints every remaining line unchanged.
#
# Returns the awk command with a trailing space, ready for a file name to be
# appended.
sub _build__awk_filter {
    my ($self) = @_;

    my $wanted  = $self->_tags_to_filter;
    my $dropped = join '|', $wanted, $self->_tags_to_ignore;

    # q[] keeps "\t", "$3", "$9" and "$0" literal so awk, not Perl, interprets them.
    my $template =
        q[awk 'BEGIN {FS="\t"};{ if ($3 ~/%s/ && $9 ~ /gene=/) print $0;else if ($3 ~/%s/) ; else print $0;}' ];

    return sprintf( $template, $wanted, $dropped );
}

# Builder for the '_remove_sequence_filter' attribute.
#
# Returns a shell pipeline fragment that cuts the FASTA sequence off the
# bottom of the file: sed keeps the lines from "##gff-version 3" through
# "##FASTA", and grep then removes the "##FASTA" marker line itself.
sub _build__remove_sequence_filter {
    my ($self) = @_;

    my @stages = (
        q{sed -n '/##gff-version 3/,/##FASTA/p'},
        q{grep -v '##FASTA'},
    );

    return join ' | ', @stages;
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;

__END__

=pod



( run in 0.887 second using v1.01-cache-2.11-cpan-39bf76dae61 )