BioPerl
view release on metacpan or search on metacpan
Bio/SeqIO/Handler/GenericRichSeqHandler.pm view on Meta::CPAN
# EMBL DE line
# UniProt/SwissProt DE line
sub _generic_description {
my ($self, $data) = @_;
$self->{'_params'}->{'-desc'} = $data->{DATA};
}
# GenBank ACCESSION line
# EMBL AC line
# UniProt/SwissProt AC line
sub _generic_accession {
my ($self, $data) = @_;
my @accs = split m{[\s;]+}, $data->{DATA};
$self->{'_params'}->{'-accession_number'} = shift @accs;
$self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
}
####################### SPECIES HANDLERS #######################
# uses Bio::Species
# GenBank SOURCE, ORGANISM lines
# EMBL O* lines
# UniProt/SwissProt O* lines
sub _generic_species {
my ($self, $data) = @_;
my $seqformat = $self->format;
# if data is coming in from GenBank parser...
if ($seqformat eq 'genbank' &&
$data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
}
# SwissProt stuff...
# hybrid names in swissprot files are no longer valid per intergration into
# UniProt. Files containing these have been split into separate entries, so
# it is probably a good idea to update if one has these lingering around...
my $taxid;
if ($seqformat eq 'swiss') {
if ($data->{DATA} =~ m{^([^,]+)}ox) {
$data->{DATA} = $1;
}
if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
$taxid = $1;
}
}
my ($sl, $class, $sci_name) = ($data->{DATA},
$data->{CLASSIFICATION},
$data->{ORGANISM} || '');
my ($organelle,$abbr_name, $common);
my @class = reverse split m{\s*;\s*}, $class;
# have to treat swiss different from everything else...
if ($sl =~ m{^(mitochondrion|chloroplast|plastid)? # GenBank format
\s*(.*?)
\s*(?: \( (.*?) \) )?\.?$
}xmso ){
($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
} else {
$abbr_name = $sl; # nothing caught; this is a backup!
}
# there is no 'abbreviated name' for EMBL
$sci_name = $abbr_name if $seqformat ne 'genbank';
$organelle ||= '';
$common ||= '';
$sci_name || return;
unshift @class, $sci_name;
# no genus/species parsing here; moving to Bio::Taxon-based taxonomy
my $make = Bio::Species->new();
$make->scientific_name($sci_name);
$make->classification(@class) if @class > 0;
$common && $make->common_name( $common );
$abbr_name && $make->name('abbreviated', $abbr_name);
$organelle && $make->organelle($organelle);
$taxid && $make->ncbi_taxid($taxid);
$self->{'_params'}->{'-species'} = $make;
}
####################### ANNOTATION HANDLERS #######################
# GenBank DBSOURCE line
sub _genbank_dbsource {
my ($self, $data) = @_;
my $dbsource = $data->{DATA};
my $annotation = $self->annotation_collection;
# deal with swissprot dbsources
# we could possibly parcel these out to subhandlers...
if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
$annotation->add_Annotation
('dblink',
Bio::Annotation::DBLink->new
(-primary_id => $2,
-database => $1,
-tagname => 'dblink'));
if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
$annotation->add_Annotation
('swissprot_dates',
Bio::Annotation::SimpleValue->new
(-tagname => 'date_created',
-value => $1));
}
while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
$annotation->add_Annotation
('swissprot_dates',
Bio::Annotation::SimpleValue->new
(-tagname => 'date_updated',
-value => $1));
}
$dbsource =~ s/\n/ /g;
if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
# will use $i to determine even or odd
# for swissprot the accessions are paired
my $i = 0;
for my $dbsrc ( split(/,\s+/,$1) ) {
if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) {
my ($id,$version) = ($1,$2);
$version ='' unless defined $version;
my $db;
if( $id =~ /^\d\S{3}/) {
$db = 'PDB';
( run in 0.807 second using v1.01-cache-2.11-cpan-39bf76dae61 )