Bio-GeneDesign

 view release on metacpan or  search on metacpan

lib/Bio/GeneDesign/Basic.pm  view on Meta::CPAN

  "GT" => ([ 9.4, 25.5, 1.5]), "AC" => ([ 9.4, 25.5, 1.5])
);

=head1 Functions

=head2 _generate_dimers

=cut

sub _generate_dimers
{
  my @dimers;
  foreach my $a (@BASES)
  {
    foreach my $b (@BASES)
    {
      push @dimers, $a . $b;
    }
  }
  return @dimers;
}

=head2 _sanitize()

remove non nucleotide characters from nucleotide sequences. remove non amino
acid characters from peptide sequences. return sanitized strand and list of bad
characters.

=cut

sub _sanitize
{
  my ($strand, $swit) = @_;
  $swit = $swit || 'nuc';
  die ('bad sanitize switch') if ($swit ne 'nuc' && $swit ne 'pep');
  my %noncount = ();
  my $newstrand = q{};
  my @arr = split q{}, $strand;
  foreach my $char (@arr)
  {
    my $CH = uc $char;
    if ($swit eq 'nuc' && exists $NTIDES{$CH}
    || ($swit eq 'pep' && exists $AACIDS{$CH}))
    {
      $newstrand .= $char;
    }
    else
    {
      $noncount{$char}++;
    }
  }
  my @baddies = keys %noncount;
  return ($newstrand, @baddies);
}

=head2 _count()

takes a nucleotide sequence and returns a base count.  Looks for total length,
purines, pyrimidines, and degenerate bases. If degenerate bases are present
assumes their substitution for non degenerate bases is totally random for
percentage estimation.

  in: nucleotide sequence (string),
  out: base count (hash)

=cut

sub _count
{
  my ($strand) = @_;
  my $len = length $strand;
  my @arr = split q{}, uc $strand;
  my %C = map {$_ => 0} keys %NTIDES;

  #Count bases
  $C{$_}++ foreach @arr;

  $C{d} += $C{$_} foreach (qw(A T C G));
  $C{n} += $C{$_} foreach (qw(B D H K M N R S V W Y));
  $C{'?'} = ($C{d} + $C{n}) - $len;

  #Estimate how many of each degenerate base would be a G or C
  my $split = .5*$C{R}  + .5*$C{Y}  + .5*$C{K}  + .5*$C{M}  + .5*$C{N};
  my $trip  = (2*$C{B} / 3) + (2*$C{V} / 3) + ($C{D} / 3) + ($C{H} / 3);

  #Calculate GC/AT percentage
  my $gcc = $C{S} + $C{G} + $C{C} + $split + $trip;
  my $gcp = sprintf "%.1f", ($gcc / $len) * 100;
  $C{GCp} = $gcp;
  $C{ATp} = 100 - $gcp;
  $C{len} = $len;

  return \%C;
}

=head2 _gcwindows()

takes a nucleotide sequence, a window size, and minimum and maximum values.
returns lists of real coordinates of subsequences that violate mimimum or
maximum GC percentages.

Values are returned inside an array reference such that the first value is an
array ref of minimum violators (as array refs of left/right coordinates), and
the second value is an array ref of maximum violators.

$return_value = [
  [[left, right], [left, right]], #minimum violators
  [[left, right], [left, right]]  #maximum violators
];

=cut

sub _gcwindows
{
  my ($seq, $winsize, $min, $max) = @_;
  my @minflags = ();
  my @maxflags = ();
  my $seqlen = length $seq;
  my $lefpos = 0;
  my $rigpos = $winsize - 1;
  my $win = substr $seq, $lefpos, $winsize;
  my %hsh = (G => 0, C => 0, A => 0, T => 0);
  $hsh{$_}++ foreach (split q{}, $win);
  my $initbase = substr $seq, $lefpos, 1;
  my $lastbase = substr $seq, $rigpos, 1;
  while ($rigpos < $seqlen)
  {
    my $GCperc = 100 * (($hsh{G} + $hsh{C}) / $winsize);
    $lefpos++;
    $rigpos++;
    if ($GCperc < $min)
    {
      push @minflags, [$lefpos, $rigpos];
    }
    if ($GCperc > $max)
    {
      push @maxflags, [$lefpos, $rigpos];
    }
    $hsh{$initbase}--;
    $initbase = substr $seq, $lefpos, 1;
    $lastbase = substr $seq, $rigpos, 1;
    $hsh{$lastbase}++;
  }
  
  return [\@minflags, \@maxflags];
}

=head2 _melt()

takes a nucleotide sequence and returns a melting temperature

  in: nucleotide sequence (string)
      salt concentration (string, opt, def =.05),
      oligo concentration (string, opt, def = .0000001)
  out: temperature (float)

=cut

sub _melt
{

lib/Bio/GeneDesign/Basic.pm  view on Meta::CPAN

  my ($seq, $pattern, $min) = @_;
  my @arr = ();
  $min = $min || 5;
  if (_is_ambiguous($pattern))
  {
    die "Ambiguous pattern passed to _find_runs";
  }
  my $reg = '[' . $pattern . ']';
  while ($seq =~ m{(($pattern) {$min,})}msixg)
  {
    my $len = length $1;
    my $rig = pos $seq;
    my $lef = $rig - $len + 1;
    push @arr, [$lef, $rig];
  }
  return \@arr;
}

=head2 _is_ambiguous

=cut

sub _is_ambiguous
{
  my ($sequence) = @_;
  return 1 if ($sequence =~ $ambnt);
  return 0;
}

=head2 _compare_sequences()

takes two nucleotide sequences that are assumed to be perfectly aligned and
roughly equivalent and returns similarity metrics. should be twweeaakkeedd

  in: 2x nucleotide sequence (string)
  out: similarity metrics (hash)

=cut

sub _compare_sequences
{
  my ($topseq, $botseq) = @_;
  return if (!$botseq || length($botseq) != length($topseq));
  my ($tsit, $tver, $len) = (0, 0, length($topseq));
  while (length($topseq) > 0)
  {
    my ($topbit, $botbit) = (chop($topseq), chop ($botseq));
    if ($topbit ne $botbit)
    {
      $topbit = $topbit =~ $REGHSH{R}  ?  1  :  0;
      $botbit = $botbit =~ $REGHSH{R}  ?  1  :  0;
      $tsit++ if ($topbit == $botbit);
      $tver++ if ($topbit != $botbit);
    }
  }
  my %A;
  $A{D} = $tsit + $tver;               #changes
  $A{I} = $len - $A{D};                #identities
  $A{T} = $tsit;                       #transitions
  $A{V} = $tver;                       #transversions
  $A{P} = sprintf "%.1f", 100 - (($A{D} / $len) * 100);  #percent identity
  return \%A;
}

=head2 _amb_transcription

=cut

sub _amb_transcription
{
  my ($ntseq) = @_;
  my $prseq = uc $ntseq;
  my (@SEED, @NEW) = ((), ());
  my $offset = 0;
  my $ntlen = length $prseq;
  while ($offset < $ntlen)
  {
    my $template = substr $prseq, $offset, 1;
    my $ntides = $NTIDES{$template};
    my $possibilities = scalar @$ntides;
    if ($possibilities == 1)
    {
      @SEED = ( $template ) if ($offset == 0);
      @NEW  = ( $template ) if ($offset >  0);
    }
    else
    {
      @SEED = @$ntides if ($offset == 0);
      @NEW  = @$ntides if ($offset >  0);
    }
    unless ($offset == 0)
    {
      @SEED = _add_arr(\@SEED, \@NEW);
    }
    $offset ++;
  }
  my %hsh = map {$_ => 1 } @SEED;
  my @keys = sort keys %hsh;
  return \@keys;
}


=head2 _add_arr

Basically builds a list of tree nodes for the amb_trans* functions.
in: 2 x peptide lists (array reference) out: combined list of peptides

=cut

sub _add_arr
{
  my ($arr1ref, $arr2ref) = @_;
  my @arr1 = @$arr1ref;
  my @arr2 = @$arr2ref;
  my @arr3 = ();
  foreach my $do (@arr1)
  {
    foreach my $re (@arr2)
    {
      push @arr3, $do . $re
    }



( run in 0.793 second using v1.01-cache-2.11-cpan-39bf76dae61 )