Bio-MUST-Core

 view release on metacpan or  search on metacpan

lib/Bio/MUST/Core/SeqId.pm  view on Meta::CPAN


# from Bio::Phylo::PhyloRole

# =item get_nexus_name()
#
# Gets invocant's name, modified to be safely used in nexus files. This means that:
#
# =item names with spaces in them that aren't 'single quoted' have their spaces replaced
# with underscores
#
# =item names with any of the following characters in them are single quoted:
#  -^*(){}[]+=;:"\<>/,
#
# =item names with single quotes inside them (i.e. not around them) are "double quoted"
#
#  Type    : Accessor
#  Title   : get_nexus_name
#  Usage   : my $name = $obj->get_nexus_name;
#  Function: Returns the object's name.
#  Returns : A string
#  Args    : None
#
# =cut
#
#     sub get_nexus_name {
#         my $self = shift;
#         my $name = $self->get_internal_name;
#         if ( $name =~ /\s/ && $name !~ /^'.+'$/ ) {
#             $name =~ s/\s/_/g;
#         }
#         if (   $name =~ /(?:\-|\^|\*|\(|\)|{|}|\[|\]|\+|=|;|:|"|\\|<|>|\/|,)/
#             && $name !~ /^'.+'$/ )
#         {
#             $name = "'${name}'";
#         }
#         if ( $name =~ /'/ && $name !~ /^".+"$/ && $name !~ /^'.+'$/ ) {
#             $name = "\"${name}\"";
#         }
#         return $name;
#     }


# memoized constructor derived from MooseX::Role::Flyweight


# cache for created SeqId objects (even if out of scope)
my %instance_for;

sub instance {
    my ($class, %args) = @_;

    # TODO: check if this optimized way always work for us
    my $key = $args{full_id};

    # return the existing instance
    return $instance_for{$key} if defined $instance_for{$key};

    # create a new instance
    my $instance = $class->new(%args);
    $instance_for{$key} = $instance;
    # Note: do not weaken reference or this will defeat the purpose.
    # However, this could lead to memory leak in some extreme cases.
    # Scalar::Util::weaken $instance_for{$key};

    return $instance;
}


# class methods to build modern MUST-compliant id from NCBI components


sub new_with {                              ## no critic (RequireArgUnpacking)
    my $class = shift;
    my %args  = @_;                         # TODO: handle HashRef?

    my ($org, $taxon_id, $accession, $keep_strain)
        = @args{ qw(org taxon_id accession keep_strain) };
    $accession //= $args{gi};       # fall back to legacy argument name

    # extract components from organism name
    my ($genus, $species, $strain) = $class->parse_ncbi_name($org);
    # Note: genus, species and strain will have been cleaned-up at this stage

    # truncate name to Genus species (or sp. if none)
    # append strain (if asked to do so)
    # append NCBI taxon_id or GCA/GCF as pseudo-strain
    # append accession number (if provided)
    my $full_id
        = $genus . ' '
        . ($species   ? ( $species                               ) : 'sp.')
        . ($strain    ? ( $keep_strain ? ('_' . $strain  ) : q{} ) :  q{} )
        . ($taxon_id  ? ('_' . $taxon_id                         ) :  q{} )
        . ($accession ? ('@' . $accession                        ) :  q{} )
    ;

    return $class->new( full_id => $full_id );
}



sub parse_ncbi_name {
    my $class = shift;
    my $org   = shift;

    # clean org name
    $org = $class->clean_ncbi_name($org);

    # split org name into max 3 components: genus, species and strain
    # strain is a greedy component for trailing information
    my ($genus, $species, $strain) = split /\s+/xms, $org, 3;

    # clean strain of unwanted prefices and characters (if any)
    $strain = $class->clean_strain($strain) if $strain;

    return ($genus, $species, $strain);
}


sub clean_ncbi_name {
    my $class = shift;
    my $org   = shift;

    # remove unwanted prefices



( run in 2.005 seconds using v1.01-cache-2.11-cpan-5a3173703d6 )