Bio-GeneDesign
view release on metacpan or search on metacpan
lib/Bio/GeneDesign/Basic.pm view on Meta::CPAN
"GT" => ([ 9.4, 25.5, 1.5]), "AC" => ([ 9.4, 25.5, 1.5])
);
=head1 Functions
=head2 _generate_dimers
=cut
sub _generate_dimers
{
my @dimers;
foreach my $a (@BASES)
{
foreach my $b (@BASES)
{
push @dimers, $a . $b;
}
}
return @dimers;
}
=head2 _sanitize()
remove non nucleotide characters from nucleotide sequences. remove non amino
acid characters from peptide sequences. return sanitized strand and list of bad
characters.
=cut
sub _sanitize
{
my ($strand, $swit) = @_;
$swit = $swit || 'nuc';
die ('bad sanitize switch') if ($swit ne 'nuc' && $swit ne 'pep');
my %noncount = ();
my $newstrand = q{};
my @arr = split q{}, $strand;
foreach my $char (@arr)
{
my $CH = uc $char;
if ($swit eq 'nuc' && exists $NTIDES{$CH}
|| ($swit eq 'pep' && exists $AACIDS{$CH}))
{
$newstrand .= $char;
}
else
{
$noncount{$char}++;
}
}
my @baddies = keys %noncount;
return ($newstrand, @baddies);
}
=head2 _count()
takes a nucleotide sequence and returns a base count. Looks for total length,
purines, pyrimidines, and degenerate bases. If degenerate bases are present
assumes their substitution for non degenerate bases is totally random for
percentage estimation.
in: nucleotide sequence (string),
out: base count (hash)
=cut
sub _count
{
my ($strand) = @_;
my $len = length $strand;
my @arr = split q{}, uc $strand;
my %C = map {$_ => 0} keys %NTIDES;
#Count bases
$C{$_}++ foreach @arr;
$C{d} += $C{$_} foreach (qw(A T C G));
$C{n} += $C{$_} foreach (qw(B D H K M N R S V W Y));
$C{'?'} = ($C{d} + $C{n}) - $len;
#Estimate how many of each degenerate base would be a G or C
my $split = .5*$C{R} + .5*$C{Y} + .5*$C{K} + .5*$C{M} + .5*$C{N};
my $trip = (2*$C{B} / 3) + (2*$C{V} / 3) + ($C{D} / 3) + ($C{H} / 3);
#Calculate GC/AT percentage
my $gcc = $C{S} + $C{G} + $C{C} + $split + $trip;
my $gcp = sprintf "%.1f", ($gcc / $len) * 100;
$C{GCp} = $gcp;
$C{ATp} = 100 - $gcp;
$C{len} = $len;
return \%C;
}
=head2 _gcwindows()
takes a nucleotide sequence, a window size, and minimum and maximum values.
returns lists of real coordinates of subsequences that violate mimimum or
maximum GC percentages.
Values are returned inside an array reference such that the first value is an
array ref of minimum violators (as array refs of left/right coordinates), and
the second value is an array ref of maximum violators.
$return_value = [
[[left, right], [left, right]], #minimum violators
[[left, right], [left, right]] #maximum violators
];
=cut
sub _gcwindows
{
my ($seq, $winsize, $min, $max) = @_;
my @minflags = ();
my @maxflags = ();
my $seqlen = length $seq;
my $lefpos = 0;
my $rigpos = $winsize - 1;
my $win = substr $seq, $lefpos, $winsize;
my %hsh = (G => 0, C => 0, A => 0, T => 0);
$hsh{$_}++ foreach (split q{}, $win);
my $initbase = substr $seq, $lefpos, 1;
my $lastbase = substr $seq, $rigpos, 1;
while ($rigpos < $seqlen)
{
my $GCperc = 100 * (($hsh{G} + $hsh{C}) / $winsize);
$lefpos++;
$rigpos++;
if ($GCperc < $min)
{
push @minflags, [$lefpos, $rigpos];
}
if ($GCperc > $max)
{
push @maxflags, [$lefpos, $rigpos];
}
$hsh{$initbase}--;
$initbase = substr $seq, $lefpos, 1;
$lastbase = substr $seq, $rigpos, 1;
$hsh{$lastbase}++;
}
return [\@minflags, \@maxflags];
}
=head2 _melt()
takes a nucleotide sequence and returns a melting temperature
in: nucleotide sequence (string)
salt concentration (string, opt, def =.05),
oligo concentration (string, opt, def = .0000001)
out: temperature (float)
=cut
sub _melt
{
lib/Bio/GeneDesign/Basic.pm view on Meta::CPAN
my ($seq, $pattern, $min) = @_;
my @arr = ();
$min = $min || 5;
if (_is_ambiguous($pattern))
{
die "Ambiguous pattern passed to _find_runs";
}
my $reg = '[' . $pattern . ']';
while ($seq =~ m{(($pattern) {$min,})}msixg)
{
my $len = length $1;
my $rig = pos $seq;
my $lef = $rig - $len + 1;
push @arr, [$lef, $rig];
}
return \@arr;
}
=head2 _is_ambiguous
=cut
sub _is_ambiguous
{
my ($sequence) = @_;
return 1 if ($sequence =~ $ambnt);
return 0;
}
=head2 _compare_sequences()
takes two nucleotide sequences that are assumed to be perfectly aligned and
roughly equivalent and returns similarity metrics. should be twweeaakkeedd
in: 2x nucleotide sequence (string)
out: similarity metrics (hash)
=cut
sub _compare_sequences
{
my ($topseq, $botseq) = @_;
return if (!$botseq || length($botseq) != length($topseq));
my ($tsit, $tver, $len) = (0, 0, length($topseq));
while (length($topseq) > 0)
{
my ($topbit, $botbit) = (chop($topseq), chop ($botseq));
if ($topbit ne $botbit)
{
$topbit = $topbit =~ $REGHSH{R} ? 1 : 0;
$botbit = $botbit =~ $REGHSH{R} ? 1 : 0;
$tsit++ if ($topbit == $botbit);
$tver++ if ($topbit != $botbit);
}
}
my %A;
$A{D} = $tsit + $tver; #changes
$A{I} = $len - $A{D}; #identities
$A{T} = $tsit; #transitions
$A{V} = $tver; #transversions
$A{P} = sprintf "%.1f", 100 - (($A{D} / $len) * 100); #percent identity
return \%A;
}
=head2 _amb_transcription
=cut
sub _amb_transcription
{
my ($ntseq) = @_;
my $prseq = uc $ntseq;
my (@SEED, @NEW) = ((), ());
my $offset = 0;
my $ntlen = length $prseq;
while ($offset < $ntlen)
{
my $template = substr $prseq, $offset, 1;
my $ntides = $NTIDES{$template};
my $possibilities = scalar @$ntides;
if ($possibilities == 1)
{
@SEED = ( $template ) if ($offset == 0);
@NEW = ( $template ) if ($offset > 0);
}
else
{
@SEED = @$ntides if ($offset == 0);
@NEW = @$ntides if ($offset > 0);
}
unless ($offset == 0)
{
@SEED = _add_arr(\@SEED, \@NEW);
}
$offset ++;
}
my %hsh = map {$_ => 1 } @SEED;
my @keys = sort keys %hsh;
return \@keys;
}
=head2 _add_arr
Basically builds a list of tree nodes for the amb_trans* functions.
in: 2 x peptide lists (array reference) out: combined list of peptides
=cut
sub _add_arr
{
my ($arr1ref, $arr2ref) = @_;
my @arr1 = @$arr1ref;
my @arr2 = @$arr2ref;
my @arr3 = ();
foreach my $do (@arr1)
{
foreach my $re (@arr2)
{
push @arr3, $do . $re
}
( run in 0.793 second using v1.01-cache-2.11-cpan-39bf76dae61 )