Bio-FeatureIO

 view release on metacpan or  search on metacpan

lib/Bio/FeatureIO/gff.pm  view on Meta::CPAN

  if($gff_string =~ /^>/){
    
    my $pos = tell($self->_fh);
    
    # TODO: SeqIO::fasta uses _fh directly, whereas _pushback uses an in-memory
    # buffer; this completely breaks with piped data, so fix for files and punt
    # on the rest for now
    
    if ($pos >= 0) {
      seek($self->_fh, -length($gff_string), 1);
    } else {
      $self->warn("In-line parsing of FASTA not yet supported");
      $self->_pushback($gff_string);
    }
    $self->fasta_mode(1);
    return;
  }

  # got a directive
  elsif($gff_string =~ /^##/){
    $self->_handle_directive($gff_string);
    return $self->_buffer_feature();
  }

  # got a feature
  else {
    return $self->_handle_feature($gff_string);
  }
}

=head2 next_feature_group

 Title   : next_feature_group
 Usage   : @feature_group = $stream->next_feature_group
 Function: Reads the next feature_group from $stream and returns it.

           Feature groups in GFF3 files are separated by '###' directives. The
           features in a group might form a hierarchical structure. The
           complete hierarchy of features is returned, i.e. the returned array
           represents only the top-level features.  Lower-level features can
           be accessed using the 'get_SeqFeatures' method recursively.

 Example : # getting the complete hierarchy of features in a GFF3 file
           my @toplevel_features;
           while (my @fg = $stream->next_feature_group) {
               push(@toplevel_features, @fg);
           }
 Returns : an array of Bio::SeqFeature::Annotated objects
 Args    : none

=cut

sub next_feature_group {
  my $self = shift;

  my $feat;
  my %seen_ids;
  my @all_feats;
  my @toplevel_feats;

  $self->{group_not_done} = 1;

  while ($self->{group_not_done} && ($feat = $self->next_feature()) && defined($feat)) {
	# we start by collecting all features in the group and
	# memorizing those which have an ID attribute
    my $anno_ID = $feat->get_Annotations('ID');
	if(ref($anno_ID)) {
      my $attr_ID = $anno_ID->value;
      $self->throw("Oops! ID $attr_ID exists more than once in your file!")
		if (exists($seen_ids{$attr_ID}));
      $seen_ids{$attr_ID} = $feat;
	}
	push(@all_feats, $feat);
  }

  # assemble the top-level features
  foreach $feat (@all_feats) {
	my @parents = $feat->get_Annotations('Parent');
	if (@parents) {
      foreach my $parent (@parents) {
		my $parent_id = $parent->value;
		$self->throw("Parent with ID $parent_id not found!") unless (exists($seen_ids{$parent_id}));
		$seen_ids{$parent_id}->add_SeqFeature($feat);
      }
	} else {
	    push(@toplevel_feats, $feat);
      }
  }

  return @toplevel_feats;
}

=head2 next_seq()

access the FASTA section (if any) at the end of the GFF stream.  note that this method
will return undef if not all features in the stream have been handled

=cut

sub next_seq() {
  my $self = shift;
  return unless $self->fasta_mode();

  #first time next_seq has been called.  initialize Bio::SeqIO instance
  if(!$self->seqio){
    $self->seqio( Bio::SeqIO->new(-format => 'fasta', -fh => $self->_fh()) );
  }
  return $self->seqio->next_seq();
}

=head2 write_feature()

 Usage   : $featureio->write_feature( Bio::SeqFeature::Annotated->new(...) );
 Function: writes a feature in GFF format.  the GFF version used is governed by the
           '-version' argument passed to Bio::FeatureIO->new(), and defaults to GFF
           version 3.
 Returns : ###FIXME
 Args    : a Bio::SeqFeature::Annotated object.

=cut

sub write_feature {
  my($self,$feature) = @_;

lib/Bio/FeatureIO/gff.pm  view on Meta::CPAN


sub _handle_directive {
  my($self,$directive_string) = @_;

  $directive_string =~ s/^##//; #remove escape
  my($directive,@arg) = split /\s+/, $directive_string;

  if($directive eq 'gff-version'){
    my $version = $arg[0];
    if($version != 3){
      $self->throw("this is not a gff version 3 document, it is version '$version'");
    }
  }

  elsif($directive eq 'sequence-region'){
    # RAE: Sequence regions are in the format sequence-region seqid start end
    # for these we want to store the seqid, start, and end. Then when we validate
    # we want to make sure that the features are within the seqid/start/end

    $self->throw('Both start and end for sequence region should be defined')
      unless $arg[1] && $arg[2];
    my $fta = Bio::Annotation::OntologyTerm->new();
    $fta->name( 'region');

    my $f = Bio::SeqFeature::Annotated->new();
    $f->seq_id( $arg[0] );
    $f->start(  $arg[1] );
    $f->end(    $arg[2] );
    $f->strand(1);

    $f->type(   $fta    );

    #cache this in sequence_region(), we may need it for validation later.
    $self->sequence_region($f->seq_id => $f);

    #NOTE: is this the right thing to do -- treat this as a feature? -allenday
    #buffer it to be returned by next_feature()
    $self->_buffer_feature($f);
  }

  elsif($directive eq 'feature-ontology'){
    $self->warn("'##$directive' directive handling not yet implemented");
  }

  elsif($directive eq 'attribute-ontology'){
    $self->warn("'##$directive' directive handling not yet implemented");
  }

  elsif($directive eq 'source-ontology'){
    $self->warn("'##$directive' directive handling not yet implemented");
  }

  elsif($directive eq 'FASTA' or $directive =~ /^>/){
    #next_seq() will take care of this.
    $self->fasta_mode(1);
    return;
  }

  elsif($directive eq '#'){
    #all forward references resolved
    $self->{group_not_done} = 0;
  }

  elsif($directive eq 'organism') {
    my $organism = $arg[0];
    $self->organism($organism);
  }

  else {
    $self->throw("don't know what do do with directive: '##".$directive."'");
  }
}

=head1 _handle_feature()

this method is called for each line not beginning with '#'.  it parses the line and returns a
Bio::SeqFeature::Annotated object.

=cut

sub _handle_feature {
  my($self,$feature_string) = @_;

  my $feat = Bio::SeqFeature::Annotated->new();

  my($seq,$source,$type,$start,$end,$score,$strand,$phase,$attribute_string) = split /\t/, $feature_string;

  $feat->seq_id($seq);
  $feat->source_tag($source);
  $feat->start($start) unless $start eq '.';
  $feat->end($end) unless $end eq '.';
  $feat->strand($strand eq '+' ? 1 : $strand eq '-' ? -1 : 0);
  $feat->score($score);
  $feat->phase($phase);

  my $fta = Bio::Annotation::OntologyTerm->new();

  if($self->validate()){
    # RAE Added a couple of validations based on the GFF3 spec at http://song.sourceforge.net/gff3.shtml
    # 1. Validate the id
    if ($seq =~ /[^a-zA-Z0-9\.\-\:\^\*\$\@\!\+\_\?]/) { # I just escaped everything.
      $self->throw("Validation Error: seqid ($seq) contains characters that are not [a-zA-Z0-9.:^*\$\@!+_?\-] and not escaped");
    }

    if ($seq =~ /\s/) {
      $self->throw("Validation Error: seqid ($seq) contains unescaped whitespace")
    }

    # NOTE i think we're handling this in as a directive, and this test may be removed -allenday
    if ($seq =~ /^>/) {
      $self->throw("Validation Error: seqid ($seq) begins with a >")
    }

    # 2. Validate the starts and stops.
    # these need to be within the region's bounds, and
    # also start <= end.  bail out if either is not true.
    if ($start > $end) {
      $self->throw("Validation Error: start ($start) must be less than or equal to end in $seq");
    }

    my $region = $self->sequence_region($seq);



( run in 0.553 second using v1.01-cache-2.11-cpan-e1769b4cff6 )