BioPerl

 view release on metacpan or  search on metacpan

Bio/SeqIO/Handler/GenericRichSeqHandler.pm  view on Meta::CPAN

# EMBL DE line
# UniProt/SwissProt DE line
sub _generic_description {
    my ($self, $data) = @_;
    $self->{'_params'}->{'-desc'} = $data->{DATA};
}

# GenBank ACCESSION line
# EMBL AC line
# UniProt/SwissProt AC line
sub _generic_accession {
    my ($self, $data) = @_;
    my @accs = split m{[\s;]+}, $data->{DATA};
    $self->{'_params'}->{'-accession_number'} = shift @accs;
    $self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
}

####################### SPECIES HANDLERS #######################

# uses Bio::Species
# GenBank SOURCE, ORGANISM lines
# EMBL O* lines
# UniProt/SwissProt O* lines
sub _generic_species {
    my ($self, $data) = @_;
    
    my $seqformat = $self->format;
    # if data is coming in from GenBank parser...
    if ($seqformat eq 'genbank' &&
        $data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
        ($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
    }
    
    # SwissProt stuff...
    # hybrid names in swissprot files are no longer valid per intergration into
    # UniProt. Files containing these have been split into separate entries, so
    # it is probably a good idea to update if one has these lingering around...

    my $taxid;
    if ($seqformat eq 'swiss') {
        if ($data->{DATA} =~ m{^([^,]+)}ox) {
            $data->{DATA} = $1;
        }
        if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
            $taxid = $1;
        }
    }
    
    my ($sl, $class, $sci_name) = ($data->{DATA},
                                   $data->{CLASSIFICATION},
                                   $data->{ORGANISM} || '');
    my ($organelle,$abbr_name, $common);    
    my @class = reverse split m{\s*;\s*}, $class;
    # have to treat swiss different from everything else...
    if ($sl =~ m{^(mitochondrion|chloroplast|plastid)?   # GenBank format
                \s*(.*?)
                \s*(?: \( (.*?) \) )?\.?$ 
         }xmso ){ 
        ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
    } else {
        $abbr_name = $sl;	# nothing caught; this is a backup!
    }
    # there is no 'abbreviated name' for EMBL
    $sci_name = $abbr_name if $seqformat ne 'genbank';
    $organelle ||= '';
    $common ||= '';
    $sci_name || return;
    unshift @class, $sci_name;
    # no genus/species parsing here; moving to Bio::Taxon-based taxonomy
    my $make = Bio::Species->new();
    $make->scientific_name($sci_name);
    $make->classification(@class) if @class > 0;
    $common    && $make->common_name( $common );
    $abbr_name && $make->name('abbreviated', $abbr_name);
    $organelle && $make->organelle($organelle);
    $taxid     && $make->ncbi_taxid($taxid);
    $self->{'_params'}->{'-species'} = $make;
}

####################### ANNOTATION HANDLERS #######################

# GenBank DBSOURCE line
sub _genbank_dbsource {
    my ($self, $data) = @_;
    my $dbsource = $data->{DATA};
    my $annotation = $self->annotation_collection;
    # deal with swissprot dbsources
    # we could possibly parcel these out to subhandlers...
    if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
        $annotation->add_Annotation
            ('dblink',
             Bio::Annotation::DBLink->new
             (-primary_id => $2,
              -database => $1,
              -tagname => 'dblink'));
        if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
            $annotation->add_Annotation
            ('swissprot_dates',
             Bio::Annotation::SimpleValue->new
             (-tagname => 'date_created',
              -value => $1));
        }
        while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
            $annotation->add_Annotation
            ('swissprot_dates',
             Bio::Annotation::SimpleValue->new
             (-tagname => 'date_updated',
              -value => $1));
        }
        $dbsource =~ s/\n/ /g;
        if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
            # will use $i to determine even or odd
            # for swissprot the accessions are paired
            my $i = 0;
            for my $dbsrc ( split(/,\s+/,$1) ) {
                if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) {
                    my ($id,$version) = ($1,$2);
                    $version ='' unless defined $version;
                    my $db;
                    if( $id =~ /^\d\S{3}/) {
                        $db = 'PDB';



( run in 0.807 second using v1.01-cache-2.11-cpan-39bf76dae61 )