BioPerl

 view release on metacpan or  search on metacpan

Bio/CodonUsage/Table.pm  view on Meta::CPAN

		my $arg = shift @args;
		$self->throw("need a hash reference, not a [" . ref($arg). "] reference") if ref($arg) ne 'HASH';
		### flags to detect argument type, can be either to start with  ##
		my $is_codon_hash = 1;
		my $is_Aa_hash = 1;
		for my $k (keys %$arg) {
			## convert to UC
			$k =~ s/(\w+)/\U$1/;
			if (!exists($STRICTAA{$k}) ){
				$is_Aa_hash = 0;
				}
			elsif ($k =~ /[^ATCGatcg]/) {
				$is_codon_hash = 0;
				}
		}
		if (!$is_codon_hash && !$is_Aa_hash) {
			$self->throw(" invalid key values in CUT hash - must be unique aa or nucleotide identifiers");
			}
		elsif ($is_Aa_hash) {
			$self->_init_from_aa($arg);
			}
		elsif($is_codon_hash) {
			$self->_init_from_cod($arg);
			}
		while (@args) {
			my $key = shift @args;
			$key =~ s/\-(\w+)/\L$1/;
			
			$self->$key(shift @args);
			}
	}
		
	return $self;
}

=head2 all_aa_frequencies

 Title   : all_aa_frequencies
 Usage   : my $freq = $cdtable->all_aa_frequencies();
 Returns : a reference to a hash where each key is an amino acid
           and each value is its frequency in all proteins in that
           species.
 Args    : none

=cut

sub all_aa_frequencies {
	my $self = shift;
	my %aa_freqs =();
	for my $aa (keys %STRICTAA) {
		my $freq = $self->aa_frequency($aa);
		$aa_freqs{$aa} = $freq;
		}
	return \%aa_freqs;
}

=head2 codon_abs_frequency

 Title   : codon_abs_frequency
 Usage   : my $freq = $cdtable->codon_abs_frequency('CTG');
 Purpose : To return the frequency of that codon as a percentage
           of all codons in the organism. 
 Returns : a percentage frequency
 Args    : a non-ambiguous codon string

=cut

sub codon_abs_frequency {
	my ($self, $a) = @_;
	my $cod = uc $a;
	if ($self->_check_codon($cod))  {
		my $ctable =  Bio::Tools::CodonTable->new;
		$ctable->id($self->genetic_code() );
		my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};

		return $self->{'_table'}{$aa}{$cod}{'per1000'}/10 ;
		}
	else {return 0;}
}

=head2 codon_rel_frequency

 Title   : codon_rel_frequency
 Usage   : my $freq = $cdtable->codon_rel_frequency('CTG');
 Purpose : To return the frequency of that codon as a percentage
           of codons coding for the same amino acid. E.g., ATG and TGG
           would return 100 as those codons are unique.
 Returns : a percentage frequency
 Args    : a non-ambiguous codon string

=cut


sub codon_rel_frequency {
	my ($self, $a) = @_;
	my $cod = uc $a;
	if ($self->_check_codon($cod)) {
		my $ctable =  Bio::Tools::CodonTable->new;
		$ctable->id($self->genetic_code () );
		my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
		return $self->{'_table'}{$aa}{$cod}{'rel_freq'};
	}
	else {
		return 0;
		}
}

=head2 probable_codons

 Title    : probable_codons
 Usage    : my $prob_codons = $cd_table->probable_codons(10);
 Purpose  : to obtain a list of codons for the amino acid above a given
            threshold % relative frequency
 Returns  : A reference to a hash where keys are 1 letter amino acid  codes
            and values are references to arrays of codons whose frequency
            is above the threshold.
 Arguments: a minimum threshold frequency

=cut

sub probable_codons {
	my ($self, $threshold) = @_;
	if (!$threshold || $threshold < 0 || $threshold > 100) {
		$self->throw(" I need a threshold percentage ");
		}
	my %return_hash;
	for my $a(keys %STRICTAA) {
		my @common_codons;
		my $aa =$Bio::SeqUtils::THREECODE{$a};
		for my $codon (keys %{ $self->{'_table'}{$aa}}) {
			if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $threshold/100){
				push @common_codons, $codon;
			}
		}
		$return_hash{$a} = \@common_codons;
	}
    ## check to make sure that all codons are populated ##
	if (grep{scalar @{$return_hash{$_}} == 0} keys %return_hash) {
		$self->warn("Threshold is too high, some amino acids do not" .
					" have any codon above the threshold!");
		}
    return \%return_hash;
}
		

=head2 most_common_codons

 Title    : most_common_codons
 Usage    : my $common_codons = $cd_table->most_common_codons();
 Purpose  : To obtain the most common codon for a given amino acid
 Returns  : A reference to a hash where keys are 1 letter amino acid codes
            and the values are the single most common codons for those amino acids
 Arguments: None

=cut

sub most_common_codons {
	my $self = shift;

	my %return_hash;

	for my $a ( keys %STRICTAA ) {

		my $common_codon = '';
		my $rel_freq = 0;
		my $aa = $Bio::SeqUtils::THREECODE{$a};

		if ( ! defined $self->{'_table'}{$aa} ) {
			$self->warn("Amino acid $aa ($a) does not have any codons!");
			next;
		}

		for my $codon ( keys %{ $self->{'_table'}{$aa}} ) {
			if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $rel_freq ){
				$common_codon = $codon;
				$rel_freq = $self->{'_table'}{$aa}{$codon}{'rel_freq'};
			}
		}
		$return_hash{$a} = $common_codon;
	}
   
	return \%return_hash;
}

=head2 codon_count

 Title   : codon_count
 Usage   : my $count = $cdtable->codon_count('CTG');
 Purpose : To obtain the absolute number of the codons in the
           organism. 
 Returns : an integer
 Args    : a non-ambiguous codon string

=cut

sub codon_count {
	my $self = shift;
	if (@_) {
		my $a = shift;
		my $cod = uc $a;
		if ($self->_check_codon($cod)) {
			my $ctable =  Bio::Tools::CodonTable->new;
			$ctable->id($self->genetic_code());
			my $aa =$Bio::SeqUtils::THREECODE {$ctable->translate($cod)};
			return $self->{'_table'}{$aa}{$cod}{'abs_count'};
			}
		else {return 0;}
	}
	else {
		$self->warn(" need to give a codon sequence as a parameter ");
		return 0;
		}
	
}

=head2 get_coding_gc

 Title   : get_coding_gc
 Usage   : my $count = $cdtable->get_coding_gc(1);
 Purpose : To return the percentage GC composition for the organism at
           codon positions 1,2 or 3, or an average for all coding sequence
          ('all').
 Returns : a number (%-age GC content) or 0 if these fields are undefined
 Args    : 1,2,3 or 'all'.

=cut

sub get_coding_gc {
	my $self  = shift;
	if (! @_) {
		$self->warn(" no parameters supplied must be  a codon position (1,2,3) or 'all'");
		return 0;
			}
	else{
		my $n = shift;
		##return request if valid ##
		if ( exists($self->{'_coding_gc'}{$n} ) ) {
			return sprintf("%.2f", $self->{'_coding_gc'}{$n});
			}
		##else return 'all' value if exists
		elsif (exists($self->{'_coding_gc'}{'all'} )) {
			$self->warn("coding gc doesn't have value for [$n], returning gc content for all CDSs");
			return sprintf("%.2f", $self->{'_coding_gc'}{'all'});
			}
		### else return 0, 
		else {
			$self->warn("coding gc values aren't defined, returning 0");
			return 0;
		}

	}#end of outer else
		
}

=head2 set_coding_gc

 Title   : set_coding_gc
 Usage   : my $count = $cdtable->set_coding_gc(-1=>55.78);
 Purpose : To set the percentage GC composition for the organism at
           codon positions 1,2 or 3, or an average for all coding sequence
           ('all').  
 Returns : void
 Args    : a hash where the key must be 1,2,3 or 'all' and the value the %age GC
           at that codon position..

=cut

sub set_coding_gc {
	my ($self, $key, $value) = @_;
	my @allowed = qw(1 2 3 all);
	$key =~ s/\-//;
	if (!grep {$key eq $_} @allowed ) {
		$self->warn ("invalid key! - must be one of [ ". (join " ", @allowed) . "]");
		return;
		}
	$self->{'_coding_gc'}{$key} = $value;
	

}

=head2 species

 Title     : species
 Usage     : my $sp = $cut->species();
 Purpose   : Get/setter for species name of codon table
 Returns   : Void or species name string
 Args      : None or species name string

=cut

sub species {
	my $self = shift;
	if (@_ ){
		$self->{'_species'} = shift;
		}
	return $self->{'_species'} || "unknown";
}

=head2 genetic_code

 Title     : genetic_code
 Usage     : my $sp = $cut->genetic_code();
 Purpose   : Get/setter for genetic_code name of codon table
 Returns   : Void or genetic_code id, 1 by default
 Args      : None or genetic_code id, 1 by default if invalid argument.

=cut

sub genetic_code {
	my $self = shift;
	if (@_ ){
		my $val = shift;
		if ($val < 0 || $val >16 || $val =~ /[^\d]/ 
				|| $val ==7 || $val ==8) {
			$self->warn ("invalid genetic code - must be 1-16 but not 7 or 8,setting to default [1]");
			$self->{'_genetic_code'} = 1;
			}
		else {
			$self->{'_genetic_code'} = shift;
			}
		}
	return $self->{'_genetic_code'} || 1;
}

=head2 cds_count

 Title   : cds_count
 Usage   : my $count = $cdtable->cds_count();
 Purpose : To retrieve the total number of CDSs used to generate the Codon Table
           for that organism. 
 Returns : an integer
 Args    : none (if retrieving the value) or an integer( if setting ). 

=cut

sub cds_count {
	my $self= shift;
	if (@_) {
		my $val = shift;
		if ($val < 0) {
			$self->warn("can't have negative count initializing to 1");
			$self->{'_cds_count'} = 0.00;
			}
		else{
			$self->{'_cds_count'} = $val;
		}
	}
	$self->warn("cds_count value is undefined, returning 0") 
		if !exists($self->{'_cds_count'});

	return $self->{'_cds_count'} || 0.00;
	}

=head2 aa_frequency

 Title   : aa_frequency
 Usage   : my $freq = $cdtable->aa_frequency('Leu');
 Purpose : To retrieve the frequency of an amino acid in the organism
 Returns : a percentage
 Args    : a 1 letter or 3 letter string representing the amino acid

=cut

	

sub aa_frequency {
	my ($self, $a) = @_;
	## process args ##

	## deal with cases ##
	my $aa = lc $a;	
	$aa =~ s/^(\w)/\U$1/;
	if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils::ONECODE{$aa}) ) {
		$self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
		return;
		}
	#translate to 3 letter code for Ctable #
	my $aa3 = $Bio::SeqUtils::THREECODE{$aa} || $aa;

	## return % of all amino acids in organism ## 
	my $freq = 0;
	map {$freq += $self->{'_table'}{$aa3}{$_}{'per1000'} } keys %{$self->{'_table'}{$aa3}};
	return sprintf("%.2f", $freq/10);
}

=head2 common_codon

 Title   : common_codon
 Usage   : my $freq = $cdtable->common_codon('Leu');
 Purpose : To retrieve the frequency of the most common codon of that aa
 Returns : a percentage
 Args    : a 1 letter or 3 letter string representing the amino acid

=cut

sub common_codon{

	my ($self, $a) = @_;
	my $aa = lc $a;	
	$aa =~ s/^(\w)/\U$1/;

	if ($self->_check_aa($aa))	{
		my $aa3 = $Bio::SeqUtils::THREECODE{$aa} ;
		$aa3 ||= $aa;
		my $max = 0;
		for my $cod (keys %{$self->{'_table'}{$aa3}}) {
			$max = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} > $max) ?
					$self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$max;
			}
		return $max;
		}else {return 0;}
}

=head2 rare_codon

 Title   : rare_codon
 Usage   : my $freq = $cdtable->rare_codon('Leu');
 Purpose : To retrieve the frequency of the least common codon of that aa
 Returns : a percentage
 Args    : a 1 letter or 3 letter string representing the amino acid

=cut

sub rare_codon {
my ($self, $a) = @_;
	my $aa = lc $a;	
	$aa =~ s/^(\w)/\U$1/;
	if ($self->_check_aa($aa))	{
		my $aa3 = $Bio::SeqUtils::THREECODE{$aa};
		$aa3 ||= $aa;
		my $min = 1;
		for my $cod (keys %{$self->{'_table'}{$aa3}}) {
			$min = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} < $min) ?
					$self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$min;
			}
		return $min;
		}else {return 0;}


}


## internal sub that checks a codon is correct format
sub _check_aa {
	my ($self, $aa ) = @_;
	if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils::ONECODE{$aa}) ) {
		$self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
		return 0;
		}else {return 1;}
}

	


sub _check_codon {
	my ($self, $cod) = @_;
	if ($cod =~ /[^ATCG]/  || $cod !~ /\w\w\w/) {
		$self->warn(" impossible codon - must be 3 letters and just containing ATCG");
		return 0;
	}
	else {return 1;}
}
sub _init_from_cod {

	## make hash based on aa and then send to _init_from_aa
	my ($self, $ref) = @_;
	my $ct = Bio::Tools::CodonTable->new();
	my %aa_hash;
	for my $codon(keys %$ref ) {
		my $aa = $ct->translate($codon);
		$aa_hash{$aa}{$codon} = $ref->{$codon};
		}
	$self->_init_from_aa(\%aa_hash);
}


sub _init_from_aa {
	my ($self, $ref) = @_;
		## abs counts  and count codons



( run in 0.777 second using v1.01-cache-2.11-cpan-39bf76dae61 )