Bio-MUST-Core
view release on metacpan or search on metacpan
lib/Bio/MUST/Core/GeneticCode/Factory.pm view on Meta::CPAN
use Bio::MUST::Core::Types;
use aliased 'Bio::MUST::Core::GeneticCode';
# public path to NCBI Taxonomy dump directory
has 'tax_dir' => (
is => 'ro',
isa => 'Bio::MUST::Core::Types::Dir',
coerce => 1,
);
# private hash hosting NCBI codes
has '_code_for' => (
traits => ['Hash'],
is => 'ro',
isa => 'HashRef[Bio::MUST::Core::GeneticCode]',
init_arg => undef,
lazy => 1,
builder => '_build_code_for',
handles => {
code_for => 'get',
list_codes => 'keys',
},
);
## no critic (ProhibitUnusedPrivateSubroutines)
sub _build_code_for {
my $self = shift;
# split file content into code blocks
my @codes = $self->_get_gcprt_content =~ m/ \{ ( [^{}]+ ) \} /xmsgc;
croak "[BMC] Error: cannot parse 'gc.prt' file; aborting!" unless @codes;
# Genetic-code-table ::= {
# ...
# {
# name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate
# Mitochondrial; Mycoplasma; Spiroplasma" ,
# name "SGC3" ,
# id 4 ,
# ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "--MM---------------M------------MMMM---------------M------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# ...
# }
my %code_for;
for my $code (@codes) {
# get all names and id for current code
my ($id) = $code =~ m/ id \s* (\d+) /xms;
my @names = $code =~ m/ name \s* \"(.*?)\" /xmsg;
@names = map { s{\n}{}xmsgr } @names; # remove newline chars
@names = map { split m{;\s*}xms } @names; # demultiplex names
# retrieve the amino acid line
my ($aa_line) = $code =~ m/ ncbieaa \s* \"(.*?)\" /xms;
$aa_line =~ s{\*}{x}xmsg; # make STOPs MUST-compliant
# retrieve the three codon lines
my ($b1_line) = $code =~ m/ Base1 \s* ([TACG]+) /xms;
my ($b2_line) = $code =~ m/ Base2 \s* ([TACG]+) /xms;
my ($b3_line) = $code =~ m/ Base3 \s* ([TACG]+) /xms;
# split lines into aas and bases
my @aas = split //, $aa_line;
my @bases1 = split //, $b1_line;
my @bases2 = split //, $b2_line;
my @bases3 = split //, $b3_line;
# build translation table for current code
my %aa_for = map {
join( q{}, $bases1[$_], $bases2[$_], $bases3[$_] ) => $aas[$_]
} 0..$#aas;
# augment code using ambiguous nucleotides and gap codons
%aa_for = _augment_code(%aa_for);
# store translation table under its various id and names
$code_for{$_} = GeneticCode->new(
ncbi_id => $id,
_code => \%aa_for
) for ($id, @names);
}
return \%code_for;
}
const my %BASES_FOR => (
A => q{A},
C => q{C},
G => q{G},
T => q{T},
U => q{T},
M => q{[AC]},
R => q{[AG]},
W => q{[AT]},
S => q{[CG]},
Y => q{[CT]},
K => q{[GT]},
V => q{[ACG]},
H => q{[ACT]},
D => q{[AGT]},
B => q{[CGT]},
N => q{[ACGT]},
X => q{[ACGT]},
);
sub _augment_code {
my %aa_for = @_;
my %amb_aa_for;
# Note: each cannot be used here because of the nested loops
( run in 3.213 seconds using v1.01-cache-2.11-cpan-483215c6ad5 )