Bio-MUST-Core
view release on metacpan or search on metacpan
lib/Bio/MUST/Core/SeqId.pm view on Meta::CPAN
# from Bio::Phylo::PhyloRole
# =item get_nexus_name()
#
# Gets invocant's name, modified to be safely used in nexus files. This means that:
#
# =item names with spaces in them that aren't 'single quoted' have their spaces replaced
# with underscores
#
# =item names with any of the following characters in them are single quoted:
# -^*(){}[]+=;:"\<>/,
#
# =item names with single quotes inside them (i.e. not around them) are "double quoted"
#
# Type : Accessor
# Title : get_nexus_name
# Usage : my $name = $obj->get_nexus_name;
# Function: Returns the object's name.
# Returns : A string
# Args : None
#
# =cut
#
# sub get_nexus_name {
# my $self = shift;
# my $name = $self->get_internal_name;
# if ( $name =~ /\s/ && $name !~ /^'.+'$/ ) {
# $name =~ s/\s/_/g;
# }
# if ( $name =~ /(?:\-|\^|\*|\(|\)|{|}|\[|\]|\+|=|;|:|"|\\|<|>|\/|,)/
# && $name !~ /^'.+'$/ )
# {
# $name = "'${name}'";
# }
# if ( $name =~ /'/ && $name !~ /^".+"$/ && $name !~ /^'.+'$/ ) {
# $name = "\"${name}\"";
# }
# return $name;
# }
# memoized constructor derived from MooseX::Role::Flyweight
# cache for created SeqId objects (even if out of scope)
my %instance_for;
sub instance {
my ($class, %args) = @_;
# TODO: check if this optimized way always work for us
my $key = $args{full_id};
# return the existing instance
return $instance_for{$key} if defined $instance_for{$key};
# create a new instance
my $instance = $class->new(%args);
$instance_for{$key} = $instance;
# Note: do not weaken reference or this will defeat the purpose.
# However, this could lead to memory leak in some extreme cases.
# Scalar::Util::weaken $instance_for{$key};
return $instance;
}
# class methods to build modern MUST-compliant id from NCBI components
sub new_with { ## no critic (RequireArgUnpacking)
my $class = shift;
my %args = @_; # TODO: handle HashRef?
my ($org, $taxon_id, $accession, $keep_strain)
= @args{ qw(org taxon_id accession keep_strain) };
$accession //= $args{gi}; # fall back to legacy argument name
# extract components from organism name
my ($genus, $species, $strain) = $class->parse_ncbi_name($org);
# Note: genus, species and strain will have been cleaned-up at this stage
# truncate name to Genus species (or sp. if none)
# append strain (if asked to do so)
# append NCBI taxon_id or GCA/GCF as pseudo-strain
# append accession number (if provided)
my $full_id
= $genus . ' '
. ($species ? ( $species ) : 'sp.')
. ($strain ? ( $keep_strain ? ('_' . $strain ) : q{} ) : q{} )
. ($taxon_id ? ('_' . $taxon_id ) : q{} )
. ($accession ? ('@' . $accession ) : q{} )
;
return $class->new( full_id => $full_id );
}
sub parse_ncbi_name {
my $class = shift;
my $org = shift;
# clean org name
$org = $class->clean_ncbi_name($org);
# split org name into max 3 components: genus, species and strain
# strain is a greedy component for trailing information
my ($genus, $species, $strain) = split /\s+/xms, $org, 3;
# clean strain of unwanted prefices and characters (if any)
$strain = $class->clean_strain($strain) if $strain;
return ($genus, $species, $strain);
}
sub clean_ncbi_name {
my $class = shift;
my $org = shift;
# remove unwanted prefices
( run in 2.005 seconds using v1.01-cache-2.11-cpan-5a3173703d6 )