BioPerl
view release on metacpan or search on metacpan
Bio/CodonUsage/Table.pm view on Meta::CPAN
my $arg = shift @args;
$self->throw("need a hash reference, not a [" . ref($arg). "] reference") if ref($arg) ne 'HASH';
### flags to detect argument type, can be either to start with ##
my $is_codon_hash = 1;
my $is_Aa_hash = 1;
for my $k (keys %$arg) {
## convert to UC
$k =~ s/(\w+)/\U$1/;
if (!exists($STRICTAA{$k}) ){
$is_Aa_hash = 0;
}
elsif ($k =~ /[^ATCGatcg]/) {
$is_codon_hash = 0;
}
}
if (!$is_codon_hash && !$is_Aa_hash) {
$self->throw(" invalid key values in CUT hash - must be unique aa or nucleotide identifiers");
}
elsif ($is_Aa_hash) {
$self->_init_from_aa($arg);
}
elsif($is_codon_hash) {
$self->_init_from_cod($arg);
}
while (@args) {
my $key = shift @args;
$key =~ s/\-(\w+)/\L$1/;
$self->$key(shift @args);
}
}
return $self;
}
=head2 all_aa_frequencies
Title : all_aa_frequencies
Usage : my $freq = $cdtable->all_aa_frequencies();
Returns : a reference to a hash where each key is an amino acid
and each value is its frequency in all proteins in that
species.
Args : none
=cut
sub all_aa_frequencies {
my $self = shift;
my %aa_freqs =();
for my $aa (keys %STRICTAA) {
my $freq = $self->aa_frequency($aa);
$aa_freqs{$aa} = $freq;
}
return \%aa_freqs;
}
=head2 codon_abs_frequency
Title : codon_abs_frequency
Usage : my $freq = $cdtable->codon_abs_frequency('CTG');
Purpose : To return the frequency of that codon as a percentage
of all codons in the organism.
Returns : a percentage frequency
Args : a non-ambiguous codon string
=cut
sub codon_abs_frequency {
my ($self, $a) = @_;
my $cod = uc $a;
if ($self->_check_codon($cod)) {
my $ctable = Bio::Tools::CodonTable->new;
$ctable->id($self->genetic_code() );
my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
return $self->{'_table'}{$aa}{$cod}{'per1000'}/10 ;
}
else {return 0;}
}
=head2 codon_rel_frequency
Title : codon_rel_frequency
Usage : my $freq = $cdtable->codon_rel_frequency('CTG');
Purpose : To return the frequency of that codon as a percentage
of codons coding for the same amino acid. E.g., ATG and TGG
would return 100 as those codons are unique.
Returns : a percentage frequency
Args : a non-ambiguous codon string
=cut
sub codon_rel_frequency {
my ($self, $a) = @_;
my $cod = uc $a;
if ($self->_check_codon($cod)) {
my $ctable = Bio::Tools::CodonTable->new;
$ctable->id($self->genetic_code () );
my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
return $self->{'_table'}{$aa}{$cod}{'rel_freq'};
}
else {
return 0;
}
}
=head2 probable_codons
Title : probable_codons
Usage : my $prob_codons = $cd_table->probable_codons(10);
Purpose : to obtain a list of codons for the amino acid above a given
threshold % relative frequency
Returns : A reference to a hash where keys are 1 letter amino acid codes
and values are references to arrays of codons whose frequency
is above the threshold.
Arguments: a minimum threshold frequency
=cut
sub probable_codons {
my ($self, $threshold) = @_;
if (!$threshold || $threshold < 0 || $threshold > 100) {
$self->throw(" I need a threshold percentage ");
}
my %return_hash;
for my $a(keys %STRICTAA) {
my @common_codons;
my $aa =$Bio::SeqUtils::THREECODE{$a};
for my $codon (keys %{ $self->{'_table'}{$aa}}) {
if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $threshold/100){
push @common_codons, $codon;
}
}
$return_hash{$a} = \@common_codons;
}
## check to make sure that all codons are populated ##
if (grep{scalar @{$return_hash{$_}} == 0} keys %return_hash) {
$self->warn("Threshold is too high, some amino acids do not" .
" have any codon above the threshold!");
}
return \%return_hash;
}
=head2 most_common_codons
Title : most_common_codons
Usage : my $common_codons = $cd_table->most_common_codons();
Purpose : To obtain the most common codon for a given amino acid
Returns : A reference to a hash where keys are 1 letter amino acid codes
and the values are the single most common codons for those amino acids
Arguments: None
=cut
sub most_common_codons {
my $self = shift;
my %return_hash;
for my $a ( keys %STRICTAA ) {
my $common_codon = '';
my $rel_freq = 0;
my $aa = $Bio::SeqUtils::THREECODE{$a};
if ( ! defined $self->{'_table'}{$aa} ) {
$self->warn("Amino acid $aa ($a) does not have any codons!");
next;
}
for my $codon ( keys %{ $self->{'_table'}{$aa}} ) {
if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $rel_freq ){
$common_codon = $codon;
$rel_freq = $self->{'_table'}{$aa}{$codon}{'rel_freq'};
}
}
$return_hash{$a} = $common_codon;
}
return \%return_hash;
}
=head2 codon_count
Title : codon_count
Usage : my $count = $cdtable->codon_count('CTG');
Purpose : To obtain the absolute number of the codons in the
organism.
Returns : an integer
Args : a non-ambiguous codon string
=cut
sub codon_count {
my $self = shift;
if (@_) {
my $a = shift;
my $cod = uc $a;
if ($self->_check_codon($cod)) {
my $ctable = Bio::Tools::CodonTable->new;
$ctable->id($self->genetic_code());
my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
return $self->{'_table'}{$aa}{$cod}{'abs_count'};
}
else {return 0;}
}
else {
$self->warn(" need to give a codon sequence as a parameter ");
return 0;
}
}
=head2 get_coding_gc
Title : get_coding_gc
Usage : my $count = $cdtable->get_coding_gc(1);
Purpose : To return the percentage GC composition for the organism at
codon positions 1,2 or 3, or an average for all coding sequence
('all').
Returns : a number (%-age GC content) or 0 if these fields are undefined
Args : 1,2,3 or 'all'.
=cut
sub get_coding_gc {
my $self = shift;
if (! @_) {
$self->warn(" no parameters supplied must be a codon position (1,2,3) or 'all'");
return 0;
}
else{
my $n = shift;
##return request if valid ##
if ( exists($self->{'_coding_gc'}{$n} ) ) {
return sprintf("%.2f", $self->{'_coding_gc'}{$n});
}
##else return 'all' value if exists
elsif (exists($self->{'_coding_gc'}{'all'} )) {
$self->warn("coding gc doesn't have value for [$n], returning gc content for all CDSs");
return sprintf("%.2f", $self->{'_coding_gc'}{'all'});
}
### else return 0,
else {
$self->warn("coding gc values aren't defined, returning 0");
return 0;
}
}#end of outer else
}
=head2 set_coding_gc
Title : set_coding_gc
Usage : my $count = $cdtable->set_coding_gc(-1=>55.78);
Purpose : To set the percentage GC composition for the organism at
codon positions 1,2 or 3, or an average for all coding sequence
('all').
Returns : void
Args : a hash where the key must be 1,2,3 or 'all' and the value the %age GC
at that codon position..
=cut
sub set_coding_gc {
my ($self, $key, $value) = @_;
my @allowed = qw(1 2 3 all);
$key =~ s/\-//;
if (!grep {$key eq $_} @allowed ) {
$self->warn ("invalid key! - must be one of [ ". (join " ", @allowed) . "]");
return;
}
$self->{'_coding_gc'}{$key} = $value;
}
=head2 species
Title : species
Usage : my $sp = $cut->species();
Purpose : Get/setter for species name of codon table
Returns : Void or species name string
Args : None or species name string
=cut
sub species {
my $self = shift;
if (@_ ){
$self->{'_species'} = shift;
}
return $self->{'_species'} || "unknown";
}
=head2 genetic_code
Title : genetic_code
Usage : my $sp = $cut->genetic_code();
Purpose : Get/setter for genetic_code name of codon table
Returns : Void or genetic_code id, 1 by default
Args : None or genetic_code id, 1 by default if invalid argument.
=cut
sub genetic_code {
my $self = shift;
if (@_ ){
my $val = shift;
if ($val < 0 || $val >16 || $val =~ /[^\d]/
|| $val ==7 || $val ==8) {
$self->warn ("invalid genetic code - must be 1-16 but not 7 or 8,setting to default [1]");
$self->{'_genetic_code'} = 1;
}
else {
$self->{'_genetic_code'} = shift;
}
}
return $self->{'_genetic_code'} || 1;
}
=head2 cds_count
Title : cds_count
Usage : my $count = $cdtable->cds_count();
Purpose : To retrieve the total number of CDSs used to generate the Codon Table
for that organism.
Returns : an integer
Args : none (if retrieving the value) or an integer( if setting ).
=cut
sub cds_count {
my $self= shift;
if (@_) {
my $val = shift;
if ($val < 0) {
$self->warn("can't have negative count initializing to 1");
$self->{'_cds_count'} = 0.00;
}
else{
$self->{'_cds_count'} = $val;
}
}
$self->warn("cds_count value is undefined, returning 0")
if !exists($self->{'_cds_count'});
return $self->{'_cds_count'} || 0.00;
}
=head2 aa_frequency
Title : aa_frequency
Usage : my $freq = $cdtable->aa_frequency('Leu');
Purpose : To retrieve the frequency of an amino acid in the organism
Returns : a percentage
Args : a 1 letter or 3 letter string representing the amino acid
=cut
sub aa_frequency {
my ($self, $a) = @_;
## process args ##
## deal with cases ##
my $aa = lc $a;
$aa =~ s/^(\w)/\U$1/;
if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils::ONECODE{$aa}) ) {
$self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
return;
}
#translate to 3 letter code for Ctable #
my $aa3 = $Bio::SeqUtils::THREECODE{$aa} || $aa;
## return % of all amino acids in organism ##
my $freq = 0;
map {$freq += $self->{'_table'}{$aa3}{$_}{'per1000'} } keys %{$self->{'_table'}{$aa3}};
return sprintf("%.2f", $freq/10);
}
=head2 common_codon
Title : common_codon
Usage : my $freq = $cdtable->common_codon('Leu');
Purpose : To retrieve the frequency of the most common codon of that aa
Returns : a percentage
Args : a 1 letter or 3 letter string representing the amino acid
=cut
sub common_codon{
my ($self, $a) = @_;
my $aa = lc $a;
$aa =~ s/^(\w)/\U$1/;
if ($self->_check_aa($aa)) {
my $aa3 = $Bio::SeqUtils::THREECODE{$aa} ;
$aa3 ||= $aa;
my $max = 0;
for my $cod (keys %{$self->{'_table'}{$aa3}}) {
$max = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} > $max) ?
$self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$max;
}
return $max;
}else {return 0;}
}
=head2 rare_codon
Title : rare_codon
Usage : my $freq = $cdtable->rare_codon('Leu');
Purpose : To retrieve the frequency of the least common codon of that aa
Returns : a percentage
Args : a 1 letter or 3 letter string representing the amino acid
=cut
sub rare_codon {
my ($self, $a) = @_;
my $aa = lc $a;
$aa =~ s/^(\w)/\U$1/;
if ($self->_check_aa($aa)) {
my $aa3 = $Bio::SeqUtils::THREECODE{$aa};
$aa3 ||= $aa;
my $min = 1;
for my $cod (keys %{$self->{'_table'}{$aa3}}) {
$min = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} < $min) ?
$self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$min;
}
return $min;
}else {return 0;}
}
## internal sub that checks a codon is correct format
sub _check_aa {
my ($self, $aa ) = @_;
if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils::ONECODE{$aa}) ) {
$self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
return 0;
}else {return 1;}
}
sub _check_codon {
my ($self, $cod) = @_;
if ($cod =~ /[^ATCG]/ || $cod !~ /\w\w\w/) {
$self->warn(" impossible codon - must be 3 letters and just containing ATCG");
return 0;
}
else {return 1;}
}
sub _init_from_cod {
## make hash based on aa and then send to _init_from_aa
my ($self, $ref) = @_;
my $ct = Bio::Tools::CodonTable->new();
my %aa_hash;
for my $codon(keys %$ref ) {
my $aa = $ct->translate($codon);
$aa_hash{$aa}{$codon} = $ref->{$codon};
}
$self->_init_from_aa(\%aa_hash);
}
sub _init_from_aa {
my ($self, $ref) = @_;
## abs counts and count codons
( run in 0.777 second using v1.01-cache-2.11-cpan-39bf76dae61 )