Bio-EnsEMBL

 view release on metacpan or  search on metacpan

lib/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm  view on Meta::CPAN

    elsif ($start_codon ne 'ATG') {
      $results->{'START_MISSING'}{'ABSENT'} = $start_codon;
    }

  }
  return $results;
}

=head2 get_havana_seleno_comments

   Args       : none
   Example    : my $results = $support->get_havana_seleno_comments
   Description: parses the HEREDOC containing Havana comments in this module
   Returntype : hashref

=cut

sub get_havana_seleno_comments {
  my $seen_translations;
  while (<DATA>) {
    next if /^\s+$/ or /#+/;
    my ($obj,$comment) = split /=/;
    $obj =~ s/^\s+|\s+$//g;
    $comment =~ s/^\s+|\s+$//g;
    # We add the origin as now "seen" can come from a number of places, and have
    # a number of consequences in different cases, not just discounted Secs from this method. -- ds23
    $seen_translations->{$obj} = [ $comment,"notsec-havana" ];
  }
  return $seen_translations;
}

sub check_for_stops {
  my $support = shift;
  my ($gene,$seen_transcripts,$log_object) = @_;
  my $transcripts;
  my $has_log_object=defined($log_object);
  if($has_log_object){
    my @help = $log_object->species_params->get_trans($gene->stable_id);
    $transcripts=\@help;
  }else{
    $log_object=$support;
    $transcripts=$gene->get_all_Transcripts;
  }

  my $gname = $gene->get_all_Attributes('name')->[0]->value;
  my $gsi = $gene->stable_id;
  my $scodon = 'TGA';
  my $mod_date = $support->date_format( $gene->modified_date,'%d/%m/%y' );
  my $hidden_remak_ttributes;
 TRANS:
  foreach my $trans (@$transcripts) {
    my $tsi = $trans->stable_id;
    my $tID = $trans->dbID;
    my $tname = $trans->get_all_Attributes('name')->[0]->value;
    if($has_log_object){
      $hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{'hidden_remark'};
    }else{
      $hidden_remak_ttributes=$trans->get_all_Attributes('hidden_remark');
    }
    foreach my $rem (@$hidden_remak_ttributes) {
      if ($rem->value =~ /not_for_Vega/) {
        #$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1);
        $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Skipping transcript $tname ($tsi) since 'not_for_Vega'");
        next TRANS;
      }
    }

    # go no further if there is a ribosomal framshift attribute
    foreach my $attrib (@{$trans->get_all_Attributes('_rib_frameshift')}) {
      if ($attrib->value) {
        $log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift attribute");
        next TRANS;
      }
    }
    foreach my $attrib (@{$trans->get_all_Attributes('hidden_remark')}) {
      if ($attrib->value eq 'ribosomal_frameshift') {
        $log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift hidden_remark");
        next TRANS;
      }
    }

    #$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1);
    $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Studying transcript $tsi ($tname, $tID)");
    my $peptide;
		
    # go no further if the transcript doesn't translate or if there are no stops
    next TRANS unless ($peptide = $trans->translate);
    my $pseq = $peptide->seq;
    my $orig_seq = $pseq;
    # (translate method trims stops from sequence end)

    next TRANS unless ($pseq =~ /\*/);
    next TRANS if ($trans->biotype eq 'polymorphic_pseudogene' or
                   $trans->biotype =~ /processed_pseudogene$/);

    # warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3));
    #$support->log_verbose("Stops found in $tsi ($tname)\n",1);
    $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Stops found in $tsi ($tname)");

    # find out where and how many stops there are
    my @found_stops;
    my $mrna   = $trans->translateable_seq;
    my $offset = 0;
    my $tstop;
    while ($pseq =~ /^([^\*]*)\*(.*)/) {
      my $pseq1_f = $1;
      $pseq = $2;
      my $seq_flag = 0;
      $offset += length($pseq1_f) * 3;
      my $stop = substr($mrna, $offset, 3);
      my $aaoffset = int($offset / 3)+1;
      push(@found_stops, [ $stop, $aaoffset ]);
      $tstop .= "$aaoffset ";
      $offset += 3;
    }
		
    # are all stops TGA...?
    my $num_stops = scalar(@found_stops);
    my $num_tga = 0;
    my $positions;
    foreach my $stop (@found_stops) {
      $positions .= $stop->[0]."(".$stop->[1].") ";
      if ($stop->[0] eq $scodon) {
	$num_tga++;
      }
    }
    my $source = $gene->source;
    #...no - an internal stop codon error in the database...
    if ($num_tga < $num_stops) {
      if ($source eq 'havana') {
        #$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
        $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)...
      }
      else {
        #$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
        $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions...
      }
    }
    #...yes - check remarks
    else {
      my $flag_remark  = 0; # 1 if word seleno has been used
      my $flag_remark2 = 0; # 1 if existing remark has correct numbering
      my $alabel       = 'Annotation_remark- selenocysteine ';
      my $alabel2      = 'selenocysteine ';
      my $annot_stops;
      my $remarks;
      my $att;
      #get both hidden_remarks and remarks
      foreach my $remark_type ('remark','hidden_remark') {
        if($has_log_object){
          $att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type};
        }else{
          $att=$trans->get_all_Attributes($remark_type)
        }
        foreach my $attrib ( @$att) {
          push @{$remarks->{$remark_type}}, $attrib->value;
        }
      }
      #parse remarks to check syntax for location of edits
      while (my ($attrib,$remarks)= each %$remarks) {
        foreach my $text (@{$remarks}) {					
          if ( ($attrib eq 'remark') && ($text=~/^$alabel(.*)/) ){
            #$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n");
            $log_object->_save_log('log_warning', '', $gsi, '', $tsi, 'VQCT_wrong_selC_coord', "seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]");        
            $annot_stops=$1;
          }
          elsif ($text =~ /^$alabel2(.*)/) {
            my $maybe = $1;
            if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) {
              $annot_stops=$maybe;
            } else {
              $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, '', "Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'".
                                                                                " -- might need to investigate: '$alabel2$maybe' [$mod_date]");
            }
          }
        }
      }

      #check the location of the annotated edits matches actual stops in the sequence
      my @annotated_stops;
      if ($annot_stops){
        my $i = 0;
        foreach my $offset (split(/\s+/, $annot_stops)) {
          #OK if it matches a known stop
          if (
            defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) {
            push  @annotated_stops, $offset;
          }
          # catch old annotations where number was in DNA not peptide coordinates
          elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) {
            $log_object->_save_log('log_warning', '', $gene->stable_id, 'DNA', $tsi, 'VQCT_wrong_selC_coord', "DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]");
          }
          # catch old annotations where number off by one
          elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) {
            $log_object->_save_log('log_warning', '', $gene->stable_id, 'PEPTIDE', $tsi, 'VQCT_wrong_selC_coord', "PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]");
          }
          elsif (defined($offset)  && ($offset=~/^\d+$/)){
            if ($offset == length($orig_seq)+1) {
              if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-tga-stop') {
                $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]");
              } elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-terminal-sec') {
                $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]");
              } else {
                $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please invest...
              }
            }
            else {
              $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, 'VQCT_wrong_selC_coord', "Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]");
              push  @annotated_stops, $offset;
            }
          }						
          $i++;
        }
      }

      #check location of found stops matches annotated ones - any new ones are reported
      foreach my $stop ( @found_stops ) {
        my $pos = $stop->[1];
        my $seq = $stop->[0];
        unless ( grep { $pos == $_} @annotated_stops) {
          if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'notsec-havana') {
            #$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n");
            $log_object->_save_log('log_verbose', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].") [$mod_date]");
          }
          else {
            #$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n");
            $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]");
          }
        }
      }
    }
  }
}
sub _save_log{
  my $self=shift;
  my $log_type = shift;
  my $chrom_name=shift || '';
  my $gsi=shift || '';
  my $type=shift || '';  
  my $tsi=shift || '';  
  my $tag=shift || '';
  my $txt=shift || '';
  $self->$log_type($txt."\n");



( run in 2.978 seconds using v1.01-cache-2.11-cpan-98e64b0badf )