FAST

 view release on metacpan or  search on metacpan

lib/FAST/Bio/SeqIO/Handler/GenericRichSeqHandler.pm  view on Meta::CPAN

 Usage   :  
 Function:  
 Returns :  
 Args    :
 Throws  :
 Note    :  

=cut

sub location_factory {
    my ($self, $factory) = @_;
    if ($factory) {
        $self->throw("Must have a FAST::Bio::Factory::LocationFactoryI when ".
                     "explicitly setting factory()") unless
             (ref($factory) && $factory->isa('FAST::Bio::Factory::LocationFactoryI'));
        $self->{'_locfactory'} = $factory;
    } elsif (!defined($self->{'_locfactory'})) {
        $self->{'_locfactory'} = FAST::Bio::Factory::FTLocationFactory->new()
    }
    return $self->{'_locfactory'};
}

=head2 annotation_collection

 Title   :  annotation_collection
 Usage   :  
 Function:  
 Returns :  
 Args    :
 Throws  :
 Note    :  

=cut

sub annotation_collection {
    my ($self, $coll) = @_;
    if ($coll) {
        $self->throw("Must have FAST::Bio::AnnotationCollectionI ".
                     "when explicitly setting collection()")
            unless (ref($coll) && $coll->isa('FAST::Bio::AnnotationCollectionI'));
        $self->{'_params'}->{'-annotation'} = $coll;
    } elsif (!exists($self->{'_params'}->{'-annotation'})) {
        $self->{'_params'}->{'-annotation'} = FAST::Bio::Annotation::Collection->new()
    }
    return $self->{'_params'}->{'-annotation'};
}

####################### SEQUENCE HANDLERS #######################

# any sequence data
sub _generic_seq {
    my ($self, $data) = @_;
    $self->{'_params'}->{'-seq'} = $data->{DATA};
}

####################### RAW DATA HANDLERS #######################

# GenBank LOCUS line
sub _genbank_locus {
    my ($self, $data) = @_;
    my (@tokens) = split m{\s+}, $data->{DATA};
    my $display_id = shift @tokens;
    $self->{'_params'}->{'-display_id'} = $display_id;
    my $seqlength = shift @tokens;
    if (exists $VALID_ALPHABET{$seqlength}) {
        # moved one token too far.  No locus name?
        $self->warn("Bad LOCUS name?  Changing [".$self->{'_params'}->{'-display_id'}.
                    "] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
        $self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
        $self->{'_params'}->{'-display_id'} = 'unknown';
        # add token back...
        unshift @tokens, $seqlength;
    } else {
    	$self->{'_params'}->{'-length'} = $seqlength;
    }
    my $alphabet = lc(shift @tokens);        
    $self->{'_params'}->{'-alphabet'} =
        (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
                           $self->warn("Unknown alphabet: $alphabet");
    if (($self->{'_params'}->{'-alphabet'} eq 'dna') || (@tokens > 2)) {
	    $self->{'_params'}->{'-molecule'} = shift(@tokens);
	    my $circ = shift(@tokens);
	    if ($circ eq 'circular') {
		$self->{'_params'}->{'-is_circular'} = 1;
		$self->{'_params'}->{'-division'} = shift(@tokens);
	    } else {
				# 'linear' or 'circular' may actually be omitted altogether
		$self->{'_params'}->{'-division'} =
		    (CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
	    }
	} else {
	    $self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
	    $self->{'_params'}->{'-division'} = shift(@tokens);
	}
    my $date = join(' ', @tokens);
    # maybe use Date::Time for dates?
    if($date && $date =~ s{\s*((\d{1,2})-(\w{3})-(\d{2,4})).*}{$1}) {
        
	    if( length($date) < 11 ) {
            # improperly formatted date
            # But we'll be nice and fix it for them
            my ($d,$m,$y) = ($2,$3,$4);
            if( length($d) == 1 ) {
                $d = "0$d";
            }
            # guess the century here
            if( length($y) == 2 ) {
                if( $y > 60 ) { # arbitrarily guess that '60' means 1960
                $y = "19$y";
                } else {
                $y = "20$y";
                }
                $self->warn("Date was malformed, guessing the century for $date to be $y\n");
            }
            $self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
	    } else {
            $self->{'_params'}->{'-dates'} = [$date];
	    }
	}
}

lib/FAST/Bio/SeqIO/Handler/GenericRichSeqHandler.pm  view on Meta::CPAN

                $section =~ s/[\(\)\.]//g;
                my @names = split(m{\s+OR\s+}, $section);
                push @genenames, ['Name' => shift @names];
                push @genenames, map {['Synonyms' => $_]} @names;
                push @stags, ['gene_name' => \@genenames]            
            }
        } #use Data::Dumper; print Dumper $gn, $genename;# exit;
        my $gn = FAST::Bio::Annotation::TagTree->new(-tagname => 'gene_name',
                                               -value => ['gene_names' => \@stags]);
        $self->annotation_collection->add_Annotation('gene_name', $gn);
    }
}

# GenBank VERSION line
# old EMBL SV line (now obsolete)
# UniProt/SwissProt?
sub _generic_version {
    my ($self, $data) = @_;
    my ($acc,$gi) = split(' ',$data->{DATA});
    if($acc =~ m{^\w+\.(\d+)}xmso) {
        $self->{'_params'}->{'-version'} = $1;
        $self->{'_params'}->{'-seq_version'} = $1;
    }
    if($gi && (index($gi,"GI:") == 0)) {
        $self->{'_params'}->{'-primary_id'} = substr($gi,3);
    }
}

# EMBL DT lines
sub _embl_date {
    my ($self, $data) = @_;
    while ($data->{DATA} =~ m{(\S+)\s\((.*?)\)}g) {
        my ($date, $version) = ($1, $2);
        $date =~ tr{,}{}d; # remove comma if new version
        if ($version =~ m{\(Rel\.\s(\d+),\sCreated\)}xmso ) {
            my $release = FAST::Bio::Annotation::SimpleValue->new(
                            -tagname    => 'creation_release',
                            -value      => $1
                            );
            $self->annotation_collection->add_Annotation($release);
        } elsif ($version =~ m{\(Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)\)}xmso ) {
            my $release = FAST::Bio::Annotation::SimpleValue->new(
                            -tagname    => 'update_release',
                            -value      => $1
                            );
            $self->annotation_collection->add_Annotation($release);
            my $update = FAST::Bio::Annotation::SimpleValue->new(
                           -tagname    => 'update_version',
                           -value      => $2
                           );
            $self->annotation_collection->add_Annotation($update);
        }
        push @{ $self->{'_params'}->{'-dates'} }, $date;
    }
}

# UniProt/SwissProt DT lines
sub _swiss_date {
    my ($self, $data) = @_;
    # swissprot
    my @dls = split m{\n}, $data->{DATA};
    for my $dl (@dls) {
        my ($date, $version) = split(' ', $dl, 2);
        $date =~ tr{,}{}d; # remove comma if new version    
        if ($version =~ m{\(Rel\. (\d+), Last sequence update\)} || # old
            $version =~ m{sequence version (\d+)\.}) { #new
        my $update = FAST::Bio::Annotation::SimpleValue->new(
                                    -tagname    => 'seq_update',
                                    -value      => $1
                                    );
        $self->annotation_collection->add_Annotation($update);
        } elsif ($version =~ m{\(Rel\. (\d+), Last annotation update\)} || #old
                 $version =~ m{entry version (\d+)\.}) { #new
            $self->{'_params'}->{'-version'} = $1;
            $self->{'_params'}->{'-seq_version'} = $1;
        }
        push @{ $self->{'_params'}->{'-dates'} }, $date;
    }
}

# GenBank KEYWORDS line
# EMBL KW line
# UniProt/SwissProt KW line
sub _generic_keywords {
    my ($self, $data) = @_;
    $data->{DATA} =~ s{\.$}{};
    my @kw = split m{\s*\;\s*}xo ,$data->{DATA};
    $self->{'_params'}->{'-keywords'} = \@kw;
}

# GenBank DEFINITION line
# EMBL DE line
# UniProt/SwissProt DE line
sub _generic_description {
    my ($self, $data) = @_;
    $self->{'_params'}->{'-desc'} = $data->{DATA};
}

# GenBank ACCESSION line
# EMBL AC line
# UniProt/SwissProt AC line
sub _generic_accession {
    my ($self, $data) = @_;
    my @accs = split m{[\s;]+}, $data->{DATA};
    $self->{'_params'}->{'-accession_number'} = shift @accs;
    $self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
}

####################### SPECIES HANDLERS #######################

# uses FAST::Bio::Species
# GenBank SOURCE, ORGANISM lines
# EMBL O* lines
# UniProt/SwissProt O* lines
sub _generic_species {
    my ($self, $data) = @_;
    
    my $seqformat = $self->format;
    # if data is coming in from GenBank parser...
    if ($seqformat eq 'genbank' &&
        $data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
        ($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
    }
    
    # SwissProt stuff...
    # hybrid names in swissprot files are no longer valid per intergration into
    # UniProt. Files containing these have been split into separate entries, so
    # it is probably a good idea to update if one has these lingering around...

    my $taxid;
    if ($seqformat eq 'swiss') {
        if ($data->{DATA} =~ m{^([^,]+)}ox) {
            $data->{DATA} = $1;
        }
        if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
            $taxid = $1;
        }
    }
    
    my ($sl, $class, $sci_name) = ($data->{DATA},
                                   $data->{CLASSIFICATION},
                                   $data->{ORGANISM} || '');
    my ($organelle,$abbr_name, $common);    
    my @class = reverse split m{\s*;\s*}, $class;
    # have to treat swiss different from everything else...
    if ($sl =~ m{^(mitochondrion|chloroplast|plastid)?   # GenBank format
                \s*(.*?)
                \s*(?: \( (.*?) \) )?\.?$ 
         }xmso ){ 
        ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
    } else {
        $abbr_name = $sl;	# nothing caught; this is a backup!
    }
    # there is no 'abbreviated name' for EMBL
    $sci_name = $abbr_name if $seqformat ne 'genbank';
    $organelle ||= '';
    $common ||= '';
    $sci_name || return;
    unshift @class, $sci_name;
    # no genus/species parsing here; moving to FAST::Bio::Taxon-based taxonomy
    my $make = FAST::Bio::Species->new();
    $make->scientific_name($sci_name);
    $make->classification(@class) if @class > 0;
    $common    && $make->common_name( $common );
    $abbr_name && $make->name('abbreviated', $abbr_name);
    $organelle && $make->organelle($organelle);
    $taxid     && $make->ncbi_taxid($taxid);
    $self->{'_params'}->{'-species'} = $make;
}

####################### ANNOTATION HANDLERS #######################

# GenBank DBSOURCE line
sub _genbank_dbsource {
    my ($self, $data) = @_;
    my $dbsource = $data->{DATA};
    my $annotation = $self->annotation_collection;
    # deal with swissprot dbsources
    # we could possibly parcel these out to subhandlers...
    if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
        $annotation->add_Annotation
            ('dblink',
             FAST::Bio::Annotation::DBLink->new
             (-primary_id => $2,
              -database => $1,
              -tagname => 'dblink'));
        if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
            $annotation->add_Annotation
            ('swissprot_dates',
             FAST::Bio::Annotation::SimpleValue->new
             (-tagname => 'date_created',
              -value => $1));
        }
        while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
            $annotation->add_Annotation
            ('swissprot_dates',
             FAST::Bio::Annotation::SimpleValue->new
             (-tagname => 'date_updated',
              -value => $1));
        }
        $dbsource =~ s/\n/ /g;
        if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
            # will use $i to determine even or odd
            # for swissprot the accessions are paired



( run in 1.029 second using v1.01-cache-2.11-cpan-71847e10f99 )