Bio-FeatureIO
view release on metacpan or search on metacpan
lib/Bio/FeatureIO/gff.pm view on Meta::CPAN
if($gff_string =~ /^>/){
my $pos = tell($self->_fh);
# TODO: SeqIO::fasta uses _fh directly, whereas _pushback uses an in-memory
# buffer; this completely breaks with piped data, so fix for files and punt
# on the rest for now
if ($pos >= 0) {
seek($self->_fh, -length($gff_string), 1);
} else {
$self->warn("In-line parsing of FASTA not yet supported");
$self->_pushback($gff_string);
}
$self->fasta_mode(1);
return;
}
# got a directive
elsif($gff_string =~ /^##/){
$self->_handle_directive($gff_string);
return $self->_buffer_feature();
}
# got a feature
else {
return $self->_handle_feature($gff_string);
}
}
=head2 next_feature_group
Title : next_feature_group
Usage : @feature_group = $stream->next_feature_group
Function: Reads the next feature_group from $stream and returns it.
Feature groups in GFF3 files are separated by '###' directives. The
features in a group might form a hierarchical structure. The
complete hierarchy of features is returned, i.e. the returned array
represents only the top-level features. Lower-level features can
be accessed using the 'get_SeqFeatures' method recursively.
Example : # getting the complete hierarchy of features in a GFF3 file
my @toplevel_features;
while (my @fg = $stream->next_feature_group) {
push(@toplevel_features, @fg);
}
Returns : an array of Bio::SeqFeature::Annotated objects
Args : none
=cut
sub next_feature_group {
my $self = shift;
my $feat;
my %seen_ids;
my @all_feats;
my @toplevel_feats;
$self->{group_not_done} = 1;
while ($self->{group_not_done} && ($feat = $self->next_feature()) && defined($feat)) {
# we start by collecting all features in the group and
# memorizing those which have an ID attribute
my $anno_ID = $feat->get_Annotations('ID');
if(ref($anno_ID)) {
my $attr_ID = $anno_ID->value;
$self->throw("Oops! ID $attr_ID exists more than once in your file!")
if (exists($seen_ids{$attr_ID}));
$seen_ids{$attr_ID} = $feat;
}
push(@all_feats, $feat);
}
# assemble the top-level features
foreach $feat (@all_feats) {
my @parents = $feat->get_Annotations('Parent');
if (@parents) {
foreach my $parent (@parents) {
my $parent_id = $parent->value;
$self->throw("Parent with ID $parent_id not found!") unless (exists($seen_ids{$parent_id}));
$seen_ids{$parent_id}->add_SeqFeature($feat);
}
} else {
push(@toplevel_feats, $feat);
}
}
return @toplevel_feats;
}
=head2 next_seq()
access the FASTA section (if any) at the end of the GFF stream. note that this method
will return undef if not all features in the stream have been handled
=cut
sub next_seq() {
my $self = shift;
return unless $self->fasta_mode();
#first time next_seq has been called. initialize Bio::SeqIO instance
if(!$self->seqio){
$self->seqio( Bio::SeqIO->new(-format => 'fasta', -fh => $self->_fh()) );
}
return $self->seqio->next_seq();
}
=head2 write_feature()
Usage : $featureio->write_feature( Bio::SeqFeature::Annotated->new(...) );
Function: writes a feature in GFF format. the GFF version used is governed by the
'-version' argument passed to Bio::FeatureIO->new(), and defaults to GFF
version 3.
Returns : ###FIXME
Args : a Bio::SeqFeature::Annotated object.
=cut
sub write_feature {
my($self,$feature) = @_;
lib/Bio/FeatureIO/gff.pm view on Meta::CPAN
sub _handle_directive {
my($self,$directive_string) = @_;
$directive_string =~ s/^##//; #remove escape
my($directive,@arg) = split /\s+/, $directive_string;
if($directive eq 'gff-version'){
my $version = $arg[0];
if($version != 3){
$self->throw("this is not a gff version 3 document, it is version '$version'");
}
}
elsif($directive eq 'sequence-region'){
# RAE: Sequence regions are in the format sequence-region seqid start end
# for these we want to store the seqid, start, and end. Then when we validate
# we want to make sure that the features are within the seqid/start/end
$self->throw('Both start and end for sequence region should be defined')
unless $arg[1] && $arg[2];
my $fta = Bio::Annotation::OntologyTerm->new();
$fta->name( 'region');
my $f = Bio::SeqFeature::Annotated->new();
$f->seq_id( $arg[0] );
$f->start( $arg[1] );
$f->end( $arg[2] );
$f->strand(1);
$f->type( $fta );
#cache this in sequence_region(), we may need it for validation later.
$self->sequence_region($f->seq_id => $f);
#NOTE: is this the right thing to do -- treat this as a feature? -allenday
#buffer it to be returned by next_feature()
$self->_buffer_feature($f);
}
elsif($directive eq 'feature-ontology'){
$self->warn("'##$directive' directive handling not yet implemented");
}
elsif($directive eq 'attribute-ontology'){
$self->warn("'##$directive' directive handling not yet implemented");
}
elsif($directive eq 'source-ontology'){
$self->warn("'##$directive' directive handling not yet implemented");
}
elsif($directive eq 'FASTA' or $directive =~ /^>/){
#next_seq() will take care of this.
$self->fasta_mode(1);
return;
}
elsif($directive eq '#'){
#all forward references resolved
$self->{group_not_done} = 0;
}
elsif($directive eq 'organism') {
my $organism = $arg[0];
$self->organism($organism);
}
else {
$self->throw("don't know what do do with directive: '##".$directive."'");
}
}
=head1 _handle_feature()
this method is called for each line not beginning with '#'. it parses the line and returns a
Bio::SeqFeature::Annotated object.
=cut
sub _handle_feature {
my($self,$feature_string) = @_;
my $feat = Bio::SeqFeature::Annotated->new();
my($seq,$source,$type,$start,$end,$score,$strand,$phase,$attribute_string) = split /\t/, $feature_string;
$feat->seq_id($seq);
$feat->source_tag($source);
$feat->start($start) unless $start eq '.';
$feat->end($end) unless $end eq '.';
$feat->strand($strand eq '+' ? 1 : $strand eq '-' ? -1 : 0);
$feat->score($score);
$feat->phase($phase);
my $fta = Bio::Annotation::OntologyTerm->new();
if($self->validate()){
# RAE Added a couple of validations based on the GFF3 spec at http://song.sourceforge.net/gff3.shtml
# 1. Validate the id
if ($seq =~ /[^a-zA-Z0-9\.\-\:\^\*\$\@\!\+\_\?]/) { # I just escaped everything.
$self->throw("Validation Error: seqid ($seq) contains characters that are not [a-zA-Z0-9.:^*\$\@!+_?\-] and not escaped");
}
if ($seq =~ /\s/) {
$self->throw("Validation Error: seqid ($seq) contains unescaped whitespace")
}
# NOTE i think we're handling this in as a directive, and this test may be removed -allenday
if ($seq =~ /^>/) {
$self->throw("Validation Error: seqid ($seq) begins with a >")
}
# 2. Validate the starts and stops.
# these need to be within the region's bounds, and
# also start <= end. bail out if either is not true.
if ($start > $end) {
$self->throw("Validation Error: start ($start) must be less than or equal to end in $seq");
}
my $region = $self->sequence_region($seq);
( run in 0.553 second using v1.01-cache-2.11-cpan-e1769b4cff6 )