Bio-EnsEMBL

 view release on metacpan or  search on metacpan

lib/Bio/EnsEMBL/BaseAlignFeature.pm  view on Meta::CPAN

    my $mapped_length;

    # explicit if statements to avoid rounding problems
    # and make sure we have sane coordinate systems
    if( $query_unit == 1 && $hit_unit == 3 ) {
      $mapped_length = $length*3;
    } elsif( $query_unit == 3 && $hit_unit == 1 ) {
      $mapped_length = $length / 3;
    } elsif ( $query_unit == 1 && $hit_unit == 1 ) {
      $mapped_length = $length;
    } else {
      throw("Internal error $query_unit $hit_unit, currently only " .
            "allowing 1 or 3 ");
    }

    if( int($mapped_length) != $mapped_length and
        ($piece =~ /M$/ or $piece =~ /D$/)) {
      throw("Internal error with mismapped length of hit, query " .
            "$query_unit, hit $hit_unit, length $length");
    }

    if( $piece =~ /M$/ ) {
      #
      # MATCH
      #
      my ( $qstart, $qend);
      if( $strand1 == 1 ) {
        $qstart = $start1;
        $qend = $start1 + $length - 1;
        $start1 = $qend + 1;
      } else {
        $qend = $start1;
        $qstart = $start1 - $length + 1;
        $start1 = $qstart - 1;
      }

      my ($hstart, $hend);
      if( $strand2 == 1 ) {
        $hstart = $start2;
        $hend = $start2 + $mapped_length - 1;
        $start2 = $hend + 1;
      } else {
        $hend = $start2;
        $hstart = $start2 - $mapped_length + 1;
        $start2 = $hstart - 1;
      }


      push @features, Bio::EnsEMBL::FeaturePair->new
        (-SLICE      => $self->{'slice'},
         -SEQNAME   => $self->{'seqname'},
         -START      => $qstart,
         -END        => $qend,
         -STRAND     => $strand1,
         -HSLICE      => $self->{'hslice'},
         -HSEQNAME   => $self->{'hseqname'},
         -HSTART     => $hstart,
         -HEND       => $hend,
         -HSTRAND    => $strand2,
         -SCORE      => $self->{'score'},
         -PERCENT_ID => $self->{'percent_id'},
         -ANALYSIS   => $self->{'analysis'},
         -P_VALUE    => $self->{'p_value'},
         -EXTERNAL_DB_ID => $self->{'external_db_id'},
         -HCOVERAGE   => $self->{'hcoverage'},
         -GROUP_ID    => $self->{'group_id'},
         -LEVEL_ID    => $self->{'level_id'});
      

      # end M cigar bits 
    } elsif( $piece =~ /I$/ ) {
      #
      # INSERT
      #
      if( $strand1 == 1 ) {
        $start1 += $length;
      } else {
        $start1 -= $length;
      }
    } elsif( $piece =~ /D$/ ) {
      #
      # DELETION
      #
      if( $strand2 == 1 ) {
        $start2 += $mapped_length;
      } else {
        $start2 -= $mapped_length;
      }
    } else {
      throw( "Illegal cigar line $string!" );
    }
  }

  return \@features;
}


sub _parse_cigar {
  my $self = shift;

  if ($self->{'align_type'} eq 'ensembl') {
    return $self->_parse_ensembl_cigar();
  }
   else {
    throw("No parsing method implemented for " . $self->{'align_type'});
  }
}


=head2 _parse_features

  Arg  [1]   : listref Bio::EnsEMBL::FeaturePair $ungapped_features
  Description: creates internal cigar_string and start,end hstart,hend
               entries.
  Returntype : none, fills in values of self
  Exceptions : argument list undergoes many sanity checks - throws under many
               invalid conditions
  Caller     : new
  Status     : Stable

=cut

sub _parse_features {
  my ($self, $features) = @_;
  if ($self->{'align_type'} eq 'ensembl') {
    $self->_parse_ensembl_features($features);
  } else {
    throw("No _parse_features method implemented for " . $self->{'align_type'});
  }
}

my $message_only_once = 1;

sub _parse_ensembl_features {
  my ($self,$features ) = @_;

  if (ref($features) ne "ARRAY") {
    throw("features must be an array reference not a [".ref($features)."]");
  } elsif (scalar(@$features) == 0) {
    throw("features array must not be empty");
  }

  my $query_unit = $self->_query_unit();
  my $hit_unit = $self->_hit_unit();

  my $strand  = $features->[0]->strand;

  throw ('FeaturePair needs to have strand == 1 or strand == -1') if(!$strand);

  my @f;

  #
  # Sort the features on their start position
  # Ascending order on positive strand, descending on negative strand
  #
  if( $strand == 1 ) {
    @f = sort {$a->start <=> $b->start} @$features;
  } else {
    @f = sort { $b->start <=> $a->start} @$features;
  }

  my $hstrand     = $f[0]->hstrand;
  my $slice       = $f[0]->slice();
  my $hslice       = $f[0]->hslice();
  my $name        = $slice ? $slice->name() : undef;
  my $hname       = $f[0]->hseqname;
  my $score       = $f[0]->score;
  my $percent     = $f[0]->percent_id;
  my $analysis    = $f[0]->analysis;
  my $pvalue      = $f[0]->p_value();
  my $external_db_id = $f[0]->external_db_id;
  my $hcoverage   = $f[0]->hcoverage;
  my $group_id    = $f[0]->group_id;
  my $level_id    = $f[0]->level_id;

  my $seqname = $f[0]->seqname;
  # implicit strand 1 for peptide sequences
  $strand  ||= 1;
  $hstrand ||= 1;
  my $ori = $strand * $hstrand;

  throw("No features in the array to parse") if(scalar(@f) == 0);

  my $prev1; # where last feature q part ended
  my $prev2; # where last feature s part ended

  my $string;

  # Use strandedness info of query and hit to make sure both sets of 
  # start and end  coordinates are oriented the right way around.
  my $f1start;
  my $f1end;
  my $f2end;
  my $f2start;

  if ($strand == 1) {
    $f1start = $f[0]->start;
    $f1end = $f[-1]->end;
  } else {
    $f1start = $f[-1]->start;
    $f1end = $f[0]->end;
  }

  if ($hstrand == 1) {
    $f2start = $f[0]->hstart;
    $f2end = $f[-1]->hend;
  } else {
    $f2start = $f[-1]->hstart;
    $f2end = $f[0]->hend;
  }

  #
  # Loop through each portion of alignment and construct cigar string
  #

  foreach my $f (@f) {
    #
    # Sanity checks
    #

    if (!$f->isa("Bio::EnsEMBL::FeaturePair")) {
      throw("Array element [$f] is not a Bio::EnsEMBL::FeaturePair");
    }
    if( defined($f->hstrand()) && $f->hstrand() != $hstrand ) {
      throw("Inconsistent hstrands in feature array");
    }
    if( defined($f->strand()) && ($f->strand != $strand)) {
      throw("Inconsistent strands in feature array");
    }
    if ( defined($name) && $name ne $f->slice->name()) {
      throw("Inconsistent names in feature array [$name - ".
            $f->slice->name()."]");
    }
    if ( defined($hname) && $hname ne $f->hseqname) {
      throw("Inconsistent hit names in feature array [$hname - ".
            $f->hseqname . "]");
    }
    if ( defined($score) && $score ne $f->score) {
      throw("Inconsisent scores in feature array [$score - " .
            $f->score . "]");
    }
    if (defined($f->percent_id) && $percent ne $f->percent_id) {
      throw("Inconsistent pids in feature array [$percent - " .
            $f->percent_id . "]");
    }
    if(defined($pvalue) && $pvalue != $f->p_value()) {
      throw("Inconsistant p_values in feature arraw [$pvalue " .
            $f->p_value() . "]");
    }
    if($seqname && $seqname ne $f->seqname){
      throw("Inconsistent seqname in feature array [$seqname - ".
            $f->seqname . "]");
    }
    my $start1 = $f->start;      #source sequence alignment start
    my $start2 = $f->hstart();   #hit sequence alignment start

    #
    # More sanity checking
    #
    if (defined($prev1)) {
      if ( $strand == 1 ) {
        if ($f->start < $prev1) {
          throw("Inconsistent coords in feature array (forward strand).\n" .
		       "Start [".$f->start()."] in current feature should be greater " .
           "than previous feature end [$prev1].");
        }
      } else {
        if ($f->end > $prev1) {
          throw("Inconsistent coords in feature array (reverse strand).\n" .
                "End [".$f->end() ."] should be less than previous feature " .
                "start [$prev1].");
        }
      }
    }

    my $length = ($f->end - $f->start + 1);    #length of source seq alignment
    my $hlength = ($f->hend - $f->hstart + 1); #length of hit seq alignment

    # using multiplication to avoid rounding errors, hence the
    # switch from query to hit for the ratios

    #
    # Yet more sanity checking
    #
    if($query_unit > $hit_unit){
      # I am going to make the assumption here that this situation will 
      # only occur with DnaPepAlignFeatures, this may not be true
      my $query_p_length = sprintf "%.0f", ($length/$query_unit);
      my $hit_p_length = sprintf "%.0f", ($hlength * $hit_unit);
      if( $query_p_length != $hit_p_length) {
        throw( "Feature lengths not comparable Lengths:" .$length .
               " " . $hlength . " Ratios:" . $query_unit . " " .
               $hit_unit );
      }
    } else{
      my $query_d_length = sprintf "%.0f", ($length*$hit_unit);
      my $hit_d_length = sprintf "%.0f", ($hlength * $query_unit);
      if( $length * $hit_unit != $hlength * $query_unit ) {
        throw( "Feature lengths not comparable Lengths:" . $length .
               " " . $hlength . " Ratios:" . $query_unit . " " .
               $hit_unit );
      }
    }

lib/Bio/EnsEMBL/BaseAlignFeature.pm  view on Meta::CPAN


        #there is a deletion
        my $gap = $f->hstart - $prev2 - 1;
        my $gap2 = int( $gap * $hlengthfactor + 0.5 );
	
        if( $gap2 == 1 ) {
          $gap2 = "";  # no need for a number if gap length is 1
        }
        $string .= "$gap2"."D";

        #sanity check,  Should not be an insertion and deletion
        if($insertion_flag) {
          if ($message_only_once) {
            warning("Should not be an deletion and insertion on the " .
                    "same alignment region. cigar_line=$string\n");
            $message_only_once = 0;
          }
        }
      }
      #shift our position in the hit seq alignment
      $prev2 = $f->hend();

     } else {
      if( ( defined $prev2 ) && ( $f->hend() + 1 < $prev2 )) {

        #there is a deletion
        my $gap = $prev2 - $f->hend - 1;
        my $gap2 = int( $gap * $hlengthfactor + 0.5 );
	
        if( $gap2 == 1 ) {
          $gap2 = "";  # no need for a number if gap length is 1
        }
        $string .= "$gap2"."D";

        #sanity check,  Should not be an insertion and deletion
        if($insertion_flag) {
          if ($message_only_once) {
            warning("Should not be an deletion and insertion on the " .
                    "same alignment region. prev2 = $prev2; f->hend() = " .
                    $f->hend() . "; cigar_line = $string;\n");
            $message_only_once = 0;
          }
        }
      }
      #shift our position in the hit seq alignment
      $prev2 = $f->hstart();
    }

    my $matchlength = $f->end() - $f->start() + 1;
    if( $matchlength == 1 ) {
      $matchlength = "";
    }
    $string .= $matchlength."M";
  }

  $self->{'start'}      = $f1start;
  $self->{'end'}        = $f1end;
  $self->{'seqname'}    = $seqname;
  $self->{'strand'}     = $strand;
  $self->{'score'}      = $score;
  $self->{'percent_id'} = $percent;
  $self->{'analysis'}   = $analysis;
  $self->{'slice'}      = $slice;
  $self->{'hslice'}     = $hslice;
  $self->{'hstart'}     = $f2start;
  $self->{'hend'}       = $f2end;
  $self->{'hstrand'}    = $hstrand;
  $self->{'hseqname'}   = $hname;
  $self->{'cigar_string'} = $string;
  $self->{'p_value'}      = $pvalue;
  $self->{'external_db_id'} = $external_db_id; 
  $self->{'hcoverage'}    = $hcoverage; 
  $self->{'group_id'}     = $group_id;
  $self->{'level_id'}     = $level_id;
}






=head2 _hit_unit

  Args       : none
  Description: abstract method, overwrite with something that returns
               one or three
  Returntype : int 1,3
  Exceptions : none
  Caller     : internal
  Status     : Stable

=cut

sub _hit_unit {
  my $self = shift;
  throw( "Abstract method call!" );
}



=head2 _query_unit

  Args       : none
  Description: abstract method, overwrite with something that returns
               one or three
  Returntype : int 1,3
  Exceptions : none
  Caller     : internal
  Status     : Stable

=cut

sub _query_unit {
  my $self = shift;
  throw( "Abstract method call!" );
}

=head2 _mdtag_alignment_length

  Arg [1]    : None
  Description: return the alignment length (including indels) based on the mdtag (mdz) string



( run in 0.981 second using v1.01-cache-2.11-cpan-39bf76dae61 )