Bio-EnsEMBL
view release on metacpan or search on metacpan
lib/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm view on Meta::CPAN
elsif ($start_codon ne 'ATG') {
$results->{'START_MISSING'}{'ABSENT'} = $start_codon;
}
}
return $results;
}
=head2 get_havana_seleno_comments
Args : none
Example : my $results = $support->get_havana_seleno_comments
Description: parses the HEREDOC containing Havana comments in this module
Returntype : hashref
=cut
sub get_havana_seleno_comments {
my $seen_translations;
while (<DATA>) {
next if /^\s+$/ or /#+/;
my ($obj,$comment) = split /=/;
$obj =~ s/^\s+|\s+$//g;
$comment =~ s/^\s+|\s+$//g;
# We add the origin as now "seen" can come from a number of places, and have
# a number of consequences in different cases, not just discounted Secs from this method. -- ds23
$seen_translations->{$obj} = [ $comment,"notsec-havana" ];
}
return $seen_translations;
}
sub check_for_stops {
my $support = shift;
my ($gene,$seen_transcripts,$log_object) = @_;
my $transcripts;
my $has_log_object=defined($log_object);
if($has_log_object){
my @help = $log_object->species_params->get_trans($gene->stable_id);
$transcripts=\@help;
}else{
$log_object=$support;
$transcripts=$gene->get_all_Transcripts;
}
my $gname = $gene->get_all_Attributes('name')->[0]->value;
my $gsi = $gene->stable_id;
my $scodon = 'TGA';
my $mod_date = $support->date_format( $gene->modified_date,'%d/%m/%y' );
my $hidden_remak_ttributes;
TRANS:
foreach my $trans (@$transcripts) {
my $tsi = $trans->stable_id;
my $tID = $trans->dbID;
my $tname = $trans->get_all_Attributes('name')->[0]->value;
if($has_log_object){
$hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{'hidden_remark'};
}else{
$hidden_remak_ttributes=$trans->get_all_Attributes('hidden_remark');
}
foreach my $rem (@$hidden_remak_ttributes) {
if ($rem->value =~ /not_for_Vega/) {
#$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1);
$log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Skipping transcript $tname ($tsi) since 'not_for_Vega'");
next TRANS;
}
}
# go no further if there is a ribosomal framshift attribute
foreach my $attrib (@{$trans->get_all_Attributes('_rib_frameshift')}) {
if ($attrib->value) {
$log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift attribute");
next TRANS;
}
}
foreach my $attrib (@{$trans->get_all_Attributes('hidden_remark')}) {
if ($attrib->value eq 'ribosomal_frameshift') {
$log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift hidden_remark");
next TRANS;
}
}
#$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1);
$log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Studying transcript $tsi ($tname, $tID)");
my $peptide;
# go no further if the transcript doesn't translate or if there are no stops
next TRANS unless ($peptide = $trans->translate);
my $pseq = $peptide->seq;
my $orig_seq = $pseq;
# (translate method trims stops from sequence end)
next TRANS unless ($pseq =~ /\*/);
next TRANS if ($trans->biotype eq 'polymorphic_pseudogene' or
$trans->biotype =~ /processed_pseudogene$/);
# warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3));
#$support->log_verbose("Stops found in $tsi ($tname)\n",1);
$log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Stops found in $tsi ($tname)");
# find out where and how many stops there are
my @found_stops;
my $mrna = $trans->translateable_seq;
my $offset = 0;
my $tstop;
while ($pseq =~ /^([^\*]*)\*(.*)/) {
my $pseq1_f = $1;
$pseq = $2;
my $seq_flag = 0;
$offset += length($pseq1_f) * 3;
my $stop = substr($mrna, $offset, 3);
my $aaoffset = int($offset / 3)+1;
push(@found_stops, [ $stop, $aaoffset ]);
$tstop .= "$aaoffset ";
$offset += 3;
}
# are all stops TGA...?
my $num_stops = scalar(@found_stops);
my $num_tga = 0;
my $positions;
foreach my $stop (@found_stops) {
$positions .= $stop->[0]."(".$stop->[1].") ";
if ($stop->[0] eq $scodon) {
$num_tga++;
}
}
my $source = $gene->source;
#...no - an internal stop codon error in the database...
if ($num_tga < $num_stops) {
if ($source eq 'havana') {
#$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
$log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)...
}
else {
#$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
$log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions...
}
}
#...yes - check remarks
else {
my $flag_remark = 0; # 1 if word seleno has been used
my $flag_remark2 = 0; # 1 if existing remark has correct numbering
my $alabel = 'Annotation_remark- selenocysteine ';
my $alabel2 = 'selenocysteine ';
my $annot_stops;
my $remarks;
my $att;
#get both hidden_remarks and remarks
foreach my $remark_type ('remark','hidden_remark') {
if($has_log_object){
$att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type};
}else{
$att=$trans->get_all_Attributes($remark_type)
}
foreach my $attrib ( @$att) {
push @{$remarks->{$remark_type}}, $attrib->value;
}
}
#parse remarks to check syntax for location of edits
while (my ($attrib,$remarks)= each %$remarks) {
foreach my $text (@{$remarks}) {
if ( ($attrib eq 'remark') && ($text=~/^$alabel(.*)/) ){
#$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n");
$log_object->_save_log('log_warning', '', $gsi, '', $tsi, 'VQCT_wrong_selC_coord', "seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]");
$annot_stops=$1;
}
elsif ($text =~ /^$alabel2(.*)/) {
my $maybe = $1;
if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) {
$annot_stops=$maybe;
} else {
$log_object->_save_log('log', '', $gene->stable_id, '', $tsi, '', "Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'".
" -- might need to investigate: '$alabel2$maybe' [$mod_date]");
}
}
}
}
#check the location of the annotated edits matches actual stops in the sequence
my @annotated_stops;
if ($annot_stops){
my $i = 0;
foreach my $offset (split(/\s+/, $annot_stops)) {
#OK if it matches a known stop
if (
defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) {
push @annotated_stops, $offset;
}
# catch old annotations where number was in DNA not peptide coordinates
elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) {
$log_object->_save_log('log_warning', '', $gene->stable_id, 'DNA', $tsi, 'VQCT_wrong_selC_coord', "DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]");
}
# catch old annotations where number off by one
elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) {
$log_object->_save_log('log_warning', '', $gene->stable_id, 'PEPTIDE', $tsi, 'VQCT_wrong_selC_coord', "PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]");
}
elsif (defined($offset) && ($offset=~/^\d+$/)){
if ($offset == length($orig_seq)+1) {
if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-tga-stop') {
$log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]");
} elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-terminal-sec') {
$log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]");
} else {
$log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please invest...
}
}
else {
$log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, 'VQCT_wrong_selC_coord', "Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]");
push @annotated_stops, $offset;
}
}
$i++;
}
}
#check location of found stops matches annotated ones - any new ones are reported
foreach my $stop ( @found_stops ) {
my $pos = $stop->[1];
my $seq = $stop->[0];
unless ( grep { $pos == $_} @annotated_stops) {
if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'notsec-havana') {
#$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n");
$log_object->_save_log('log_verbose', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].") [$mod_date]");
}
else {
#$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n");
$log_object->_save_log('log', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]");
}
}
}
}
}
}
sub _save_log{
my $self=shift;
my $log_type = shift;
my $chrom_name=shift || '';
my $gsi=shift || '';
my $type=shift || '';
my $tsi=shift || '';
my $tag=shift || '';
my $txt=shift || '';
$self->$log_type($txt."\n");
( run in 2.978 seconds using v1.01-cache-2.11-cpan-98e64b0badf )