FAST
view release on metacpan or search on metacpan
lib/FAST/Bio/SeqIO/Handler/GenericRichSeqHandler.pm view on Meta::CPAN
Usage :
Function:
Returns :
Args :
Throws :
Note :
=cut
sub location_factory {
my ($self, $factory) = @_;
if ($factory) {
$self->throw("Must have a FAST::Bio::Factory::LocationFactoryI when ".
"explicitly setting factory()") unless
(ref($factory) && $factory->isa('FAST::Bio::Factory::LocationFactoryI'));
$self->{'_locfactory'} = $factory;
} elsif (!defined($self->{'_locfactory'})) {
$self->{'_locfactory'} = FAST::Bio::Factory::FTLocationFactory->new()
}
return $self->{'_locfactory'};
}
=head2 annotation_collection
Title : annotation_collection
Usage :
Function:
Returns :
Args :
Throws :
Note :
=cut
sub annotation_collection {
my ($self, $coll) = @_;
if ($coll) {
$self->throw("Must have FAST::Bio::AnnotationCollectionI ".
"when explicitly setting collection()")
unless (ref($coll) && $coll->isa('FAST::Bio::AnnotationCollectionI'));
$self->{'_params'}->{'-annotation'} = $coll;
} elsif (!exists($self->{'_params'}->{'-annotation'})) {
$self->{'_params'}->{'-annotation'} = FAST::Bio::Annotation::Collection->new()
}
return $self->{'_params'}->{'-annotation'};
}
####################### SEQUENCE HANDLERS #######################
# any sequence data
sub _generic_seq {
my ($self, $data) = @_;
$self->{'_params'}->{'-seq'} = $data->{DATA};
}
####################### RAW DATA HANDLERS #######################
# GenBank LOCUS line
sub _genbank_locus {
my ($self, $data) = @_;
my (@tokens) = split m{\s+}, $data->{DATA};
my $display_id = shift @tokens;
$self->{'_params'}->{'-display_id'} = $display_id;
my $seqlength = shift @tokens;
if (exists $VALID_ALPHABET{$seqlength}) {
# moved one token too far. No locus name?
$self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}.
"] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
$self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
$self->{'_params'}->{'-display_id'} = 'unknown';
# add token back...
unshift @tokens, $seqlength;
} else {
$self->{'_params'}->{'-length'} = $seqlength;
}
my $alphabet = lc(shift @tokens);
$self->{'_params'}->{'-alphabet'} =
(exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
$self->warn("Unknown alphabet: $alphabet");
if (($self->{'_params'}->{'-alphabet'} eq 'dna') || (@tokens > 2)) {
$self->{'_params'}->{'-molecule'} = shift(@tokens);
my $circ = shift(@tokens);
if ($circ eq 'circular') {
$self->{'_params'}->{'-is_circular'} = 1;
$self->{'_params'}->{'-division'} = shift(@tokens);
} else {
# 'linear' or 'circular' may actually be omitted altogether
$self->{'_params'}->{'-division'} =
(CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
}
} else {
$self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
$self->{'_params'}->{'-division'} = shift(@tokens);
}
my $date = join(' ', @tokens);
# maybe use Date::Time for dates?
if($date && $date =~ s{\s*((\d{1,2})-(\w{3})-(\d{2,4})).*}{$1}) {
if( length($date) < 11 ) {
# improperly formatted date
# But we'll be nice and fix it for them
my ($d,$m,$y) = ($2,$3,$4);
if( length($d) == 1 ) {
$d = "0$d";
}
# guess the century here
if( length($y) == 2 ) {
if( $y > 60 ) { # arbitrarily guess that '60' means 1960
$y = "19$y";
} else {
$y = "20$y";
}
$self->warn("Date was malformed, guessing the century for $date to be $y\n");
}
$self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
} else {
$self->{'_params'}->{'-dates'} = [$date];
}
}
}
lib/FAST/Bio/SeqIO/Handler/GenericRichSeqHandler.pm view on Meta::CPAN
$section =~ s/[\(\)\.]//g;
my @names = split(m{\s+OR\s+}, $section);
push @genenames, ['Name' => shift @names];
push @genenames, map {['Synonyms' => $_]} @names;
push @stags, ['gene_name' => \@genenames]
}
} #use Data::Dumper; print Dumper $gn, $genename;# exit;
my $gn = FAST::Bio::Annotation::TagTree->new(-tagname => 'gene_name',
-value => ['gene_names' => \@stags]);
$self->annotation_collection->add_Annotation('gene_name', $gn);
}
}
# GenBank VERSION line
# old EMBL SV line (now obsolete)
# UniProt/SwissProt?
sub _generic_version {
my ($self, $data) = @_;
my ($acc,$gi) = split(' ',$data->{DATA});
if($acc =~ m{^\w+\.(\d+)}xmso) {
$self->{'_params'}->{'-version'} = $1;
$self->{'_params'}->{'-seq_version'} = $1;
}
if($gi && (index($gi,"GI:") == 0)) {
$self->{'_params'}->{'-primary_id'} = substr($gi,3);
}
}
# EMBL DT lines
sub _embl_date {
my ($self, $data) = @_;
while ($data->{DATA} =~ m{(\S+)\s\((.*?)\)}g) {
my ($date, $version) = ($1, $2);
$date =~ tr{,}{}d; # remove comma if new version
if ($version =~ m{\(Rel\.\s(\d+),\sCreated\)}xmso ) {
my $release = FAST::Bio::Annotation::SimpleValue->new(
-tagname => 'creation_release',
-value => $1
);
$self->annotation_collection->add_Annotation($release);
} elsif ($version =~ m{\(Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)\)}xmso ) {
my $release = FAST::Bio::Annotation::SimpleValue->new(
-tagname => 'update_release',
-value => $1
);
$self->annotation_collection->add_Annotation($release);
my $update = FAST::Bio::Annotation::SimpleValue->new(
-tagname => 'update_version',
-value => $2
);
$self->annotation_collection->add_Annotation($update);
}
push @{ $self->{'_params'}->{'-dates'} }, $date;
}
}
# UniProt/SwissProt DT lines
sub _swiss_date {
my ($self, $data) = @_;
# swissprot
my @dls = split m{\n}, $data->{DATA};
for my $dl (@dls) {
my ($date, $version) = split(' ', $dl, 2);
$date =~ tr{,}{}d; # remove comma if new version
if ($version =~ m{\(Rel\. (\d+), Last sequence update\)} || # old
$version =~ m{sequence version (\d+)\.}) { #new
my $update = FAST::Bio::Annotation::SimpleValue->new(
-tagname => 'seq_update',
-value => $1
);
$self->annotation_collection->add_Annotation($update);
} elsif ($version =~ m{\(Rel\. (\d+), Last annotation update\)} || #old
$version =~ m{entry version (\d+)\.}) { #new
$self->{'_params'}->{'-version'} = $1;
$self->{'_params'}->{'-seq_version'} = $1;
}
push @{ $self->{'_params'}->{'-dates'} }, $date;
}
}
# GenBank KEYWORDS line
# EMBL KW line
# UniProt/SwissProt KW line
sub _generic_keywords {
my ($self, $data) = @_;
$data->{DATA} =~ s{\.$}{};
my @kw = split m{\s*\;\s*}xo ,$data->{DATA};
$self->{'_params'}->{'-keywords'} = \@kw;
}
# GenBank DEFINITION line
# EMBL DE line
# UniProt/SwissProt DE line
sub _generic_description {
my ($self, $data) = @_;
$self->{'_params'}->{'-desc'} = $data->{DATA};
}
# GenBank ACCESSION line
# EMBL AC line
# UniProt/SwissProt AC line
sub _generic_accession {
my ($self, $data) = @_;
my @accs = split m{[\s;]+}, $data->{DATA};
$self->{'_params'}->{'-accession_number'} = shift @accs;
$self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
}
####################### SPECIES HANDLERS #######################
# uses FAST::Bio::Species
# GenBank SOURCE, ORGANISM lines
# EMBL O* lines
# UniProt/SwissProt O* lines
sub _generic_species {
my ($self, $data) = @_;
my $seqformat = $self->format;
# if data is coming in from GenBank parser...
if ($seqformat eq 'genbank' &&
$data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
}
# SwissProt stuff...
# hybrid names in swissprot files are no longer valid per intergration into
# UniProt. Files containing these have been split into separate entries, so
# it is probably a good idea to update if one has these lingering around...
my $taxid;
if ($seqformat eq 'swiss') {
if ($data->{DATA} =~ m{^([^,]+)}ox) {
$data->{DATA} = $1;
}
if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
$taxid = $1;
}
}
my ($sl, $class, $sci_name) = ($data->{DATA},
$data->{CLASSIFICATION},
$data->{ORGANISM} || '');
my ($organelle,$abbr_name, $common);
my @class = reverse split m{\s*;\s*}, $class;
# have to treat swiss different from everything else...
if ($sl =~ m{^(mitochondrion|chloroplast|plastid)? # GenBank format
\s*(.*?)
\s*(?: \( (.*?) \) )?\.?$
}xmso ){
($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
} else {
$abbr_name = $sl; # nothing caught; this is a backup!
}
# there is no 'abbreviated name' for EMBL
$sci_name = $abbr_name if $seqformat ne 'genbank';
$organelle ||= '';
$common ||= '';
$sci_name || return;
unshift @class, $sci_name;
# no genus/species parsing here; moving to FAST::Bio::Taxon-based taxonomy
my $make = FAST::Bio::Species->new();
$make->scientific_name($sci_name);
$make->classification(@class) if @class > 0;
$common && $make->common_name( $common );
$abbr_name && $make->name('abbreviated', $abbr_name);
$organelle && $make->organelle($organelle);
$taxid && $make->ncbi_taxid($taxid);
$self->{'_params'}->{'-species'} = $make;
}
####################### ANNOTATION HANDLERS #######################
# GenBank DBSOURCE line
sub _genbank_dbsource {
my ($self, $data) = @_;
my $dbsource = $data->{DATA};
my $annotation = $self->annotation_collection;
# deal with swissprot dbsources
# we could possibly parcel these out to subhandlers...
if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
$annotation->add_Annotation
('dblink',
FAST::Bio::Annotation::DBLink->new
(-primary_id => $2,
-database => $1,
-tagname => 'dblink'));
if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
$annotation->add_Annotation
('swissprot_dates',
FAST::Bio::Annotation::SimpleValue->new
(-tagname => 'date_created',
-value => $1));
}
while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
$annotation->add_Annotation
('swissprot_dates',
FAST::Bio::Annotation::SimpleValue->new
(-tagname => 'date_updated',
-value => $1));
}
$dbsource =~ s/\n/ /g;
if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
# will use $i to determine even or odd
# for swissprot the accessions are paired
( run in 1.029 second using v1.01-cache-2.11-cpan-71847e10f99 )