Bio-EnsEMBL

 view release on metacpan or  search on metacpan

lib/Bio/EnsEMBL/DBSQL/DensityFeatureAdaptor.pm  view on Meta::CPAN



=head2 new

  Arg [1]    : list of args @args
               Superclass constructor arguments
  Example    : none
  Description: Constructor which just initializes internal cache structures
  Returntype : Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor
  Exceptions : none
  Caller     : implementing subclass constructors
  Status     : Stable

=cut

sub new {
  my $caller = shift;

  my $class = ref($caller) || $caller;

  my $self = $class->SUPER::new(@_);

  #initialize an LRU cache
  my %cache;
  tie(%cache, 'Bio::EnsEMBL::Utils::Cache', $DENSITY_FEATURE_CACHE_SIZE);
  $self->{'_density_feature_cache'} = \%cache;

  return $self;
}

=head2 fetch_all_by_Slice

  Arg [1]    : Bio::EnsEMBL::Slice $slice - The slice representing the region
               to retrieve density features from.
  Arg [2]    : string $logic_name - The logic name of the density features to
               retrieve.
  Arg [3]    : int $num_blocks (optional; default = 50) - The number of
               features that are desired. The ratio between the size of these
               features and the size of the features in the database will be
               used to determine which database features will be used.
  Arg [4]    : boolean $interpolate (optional; default = 0) - A flag indicating
               whether the features in the database should be interpolated to
               fit them to the requested number of features.  If true the
               features will be interpolated to provide $num_blocks features.
               This will not guarantee that exactly $num_blocks features are
               returned due to rounding etc. but it will be close.
  Arg [5]    : float $max_ratio - The maximum ratio between the size of the
               requested features (as determined by $num_blocks) and the actual
               size of the features in the database.  If this value is exceeded
               then an empty list will be returned.  This can be used to
               prevent excessive interpolation of the database values.
  Example    : #interpolate:
               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 10, 1);
               #do not interpoloate, get what is in the database:
               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 50);
               #interpolate, but not too much
               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity',50,1,5.0);
  Description: Retrieves a set of density features which overlap the region
               of this slice. Density features are a discrete representation
               of a continuous value along a sequence, such as a density or
               percent coverage.  Density Features may be stored in chunks of
               different sizes in the database, and interpolated to fit the
               desired size for the requested region.  For example the database
               may store a single density value for each 1KB and also for each
               1MB.  When fetching for an entire chromosome the 1MB density
               chunks will be used if the requested number of blocks is not
               very high.
               Note that features which have been interpolated are not stored
               in the database and as such will have no dbID or adaptor set.
  Returntype : Bio::EnsEMBL::DensityFeature
  Exceptions : warning on invalid $num_blocks argument
               warning if no features with logic_name $logic_name exist
               warning if density_type table has invalid block_size value
  Caller     : general
  Status     : Stable

=cut

sub fetch_all_by_Slice {
  my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;

  if(defined($num_blocks) && $num_blocks < 1) {
    warning("Invalid number of density blocks [$num_blocks] requested.\n" .
	   "Returning empty list.");
    return [];
  }

  $num_blocks ||= 50;
  my $length = $slice->length();

  my $wanted_block_size = POSIX::ceil($length/$num_blocks);

  #
  # get out all of the density types and choose the one with the
  # block size closest to our desired block size
  #

  my $dta = $self->db()->get_DensityTypeAdaptor();

  my @dtypes = @{$dta->fetch_all_by_logic_name($logic_name)};
  if( ! @dtypes ){
    my @all_dtypes =  @{ $dta->fetch_all() };
    @all_dtypes or warning( "No DensityTypes in $dta" ) && return [];
    my $valid_list = join( ", ", map{$_->analysis->logic_name} @all_dtypes );
    warning( "No DensityTypes for logic name $logic_name. ".
	     "Select from $valid_list" );
    return [];
  }

  my $best_ratio   = undef;
  my $density_type = undef;
  my $best_ratio_large = undef;
  my $density_type_large = undef;
  my %dt_ratio_hash;

  foreach my $dt (@dtypes) {

    my $ratio;
    if( $dt->block_size() > 0 ) {
      $ratio = $wanted_block_size/$dt->block_size();
    } else {

lib/Bio/EnsEMBL/DBSQL/DensityFeatureAdaptor.pm  view on Meta::CPAN


    $dt_ratio_hash{ $ratio } = $dt;
  }

  $best_ratio = (sort {$a<=>$b} (keys %dt_ratio_hash))[0];
  
  #the ratio was not good enough, or this logic name was not in the
  #density type table, return an empty list
  if(!defined($best_ratio) ||
     (defined($max_ratio) && $best_ratio > $max_ratio)) {
    return [];
  }

  $density_type = $dt_ratio_hash{$best_ratio};

  my $constraint = "df.density_type_id = " . $density_type->dbID();

  my @features =
    @{$self->fetch_all_by_Slice_constraint($slice,$constraint)};

  return \@features if(!$interpolate);

  #interpolate the features into new features of a different size
  my @out;
  #sort the features on start position
  @features = sort( { $a->start() <=> $b->start() } @features );

  #resize the features that were returned
  my $start = 1;
  my $end   = $start+$wanted_block_size-1;

  my $value_type = $density_type->value_type();

  # create a new density type object for the interpolated features that
  # is not stored in the database
  $density_type = Bio::EnsEMBL::DensityType->new
    (-VALUE_TYPE => $value_type,
     -ANALYSIS   => $density_type->analysis(),
     -BLOCK_SIZE => $wanted_block_size);

  while($start < $length) {
#    $end = $length if($end > $length);

    my $density_value = 0.0;
    my ($f, $fstart, $fend, $portion);
    my @dvalues;

    #construct a new feature using all of the old density features that
    #we overlapped
    while(($f = shift(@features)) && $end > $f->{'start'}) {

      #what portion of this feature are we using to construct our new block?
      $fstart = ($f->{'start'} < $start) ? $start : $f->{'start'};
      $fend   = ($f->{'end'}   > $end  ) ? $end   : $f->{'end'};
      $fend   = $length if($fend > $length);

      if($value_type eq 'sum') {

        $portion = ($fend-$fstart+1)/$f->length();

        #take a percentage of density value, depending on how much of the
        #feature we overlapped
        $density_value += $portion * $f->{'density_value'};

      } elsif($value_type eq 'ratio') {

        #maintain a running total of the length used from each feature
        #and its value
        push(@dvalues,[$fend-$fstart+1,$f->{'density_value'}]);

      } else {
        throw("Unknown density value type [$value_type].");
      }

      #do not want to look at next feature if we only used part of this one:
      last if($fend < $f->{'end'});
    }

    #if we did not completely overlap the last feature, put it back on so
    #it can be partially used by the next block
    if(defined($f) && (!defined($fend) || $fend < $f->{'end'})) {
      unshift(@features, $f);
    }

    if($value_type eq 'ratio') {
      #take a weighted average of the all the density values of the features
      #used to construct this one
      my $total_len = $end - $start + 1;
      if($total_len > 0) {
        foreach my $pair (@dvalues) {
          my ($dlen, $dval) = @$pair;
          $density_value += $dval * ($dlen/$total_len);
        }
      }
    }

    # interpolated features are not stored in the db so do not set the dbID
    # or the adaptor
    push @out, Bio::EnsEMBL::DensityFeature->new
      (-seq_region    => $slice,
       -start         => $start,
       -end           => $end,
       -density_type  => $density_type,
       -density_value => $density_value);

    $start = $end + 1;
    $end  += $wanted_block_size;
  }

  return \@out;
}


sub fetch_all {
  my $self = shift;
  my $logic_name = shift;

  my ($sth, $seq_region_id, $start, $end, $density_value, $density_type_id);
  my ($slice, $density_type, @out);
  my $sa = $self->db()->get_SliceAdaptor();
  my $dta = $self->db()->get_DensityTypeAdaptor();



( run in 1.688 second using v1.01-cache-2.11-cpan-39bf76dae61 )