Bio-BioStudio
view release on metacpan or search on metacpan
lib/Bio/BioStudio/PCRTagging.pm view on Meta::CPAN
use base qw(Exporter);
use strict;
use warnings;
our $VERSION = '2.10';
our @EXPORT_OK = qw(
serial_tagging
farm_tagging
tag_gene
);
our %EXPORT_TAGS = (BS => \@EXPORT_OK);
=head1 Functions
=head2 farm_tagging
=cut
sub farm_tagging
{
my ($newchr, $genome, $p) = @_;
my $GD = $newchr->GD;
my $chrname = $newchr->name;
my $key = Digest::MD5::md5_hex(Digest::MD5::md5_hex(time().{}.rand().$$));
my $tmp_path = Bio::BioStudio::ConfigData->config('tmp_path');
my $bdir = Bio::BioStudio::ConfigData->config('tmp_path');
my $factory = Bio::Tools::Run::StandAloneBlastPlus->new(
-db_dir => $bdir,
-db_name => $key,
-db_data => $genome,
-create => 1
);
$factory->make_db();
$factory->_register_temp_for_cleanup($key);
my $parampath = $tmp_path . $key . '.param';
store $p, $parampath;
my %report = ();
my @genes = $newchr->db->features(
-seq_id => $newchr->seq_id,
-types => 'gene',
-start => $p->{STARTPOS},
-end => $p->{STOPPOS},
-range_type => 'contains'
);
@genes = sort {$b->end - $b->start <=> $a->end - $a->start} @genes;
#Find tags, pick tags, implement tags, check for errors
my @JOBLIST;
my @OUTS;
my @CLEANUP = ($parampath);
my $exec = 'BS_auto_tag_genes.pl';
my $hookup = {};
foreach my $gene (@genes)
{
my $gid = $gene->Tag_load_id;
my $cmd = $exec . q{ -chr } . $chrname . q{ -key } . $key;
$cmd .= q{ -gid } . $gid;
my $outpath = $tmp_path . $key . q{_} . $gid . '.out';
push @JOBLIST, $cmd . q{:} . $outpath . q{:0};
push @OUTS, $outpath;
$hookup->{$gid} = $outpath;
}
my $jobs = join "\n", @JOBLIST;
my $morefiles = taskfarm($jobs, $chrname, $key);
@genes = sort {$a->start <=> $b->start} @genes;
foreach my $gene (@genes)
{
my $gid = $gene->Tag_load_id;
my $outpath = $hookup->{$gid};
if (! -e $outpath)
{
$report{$gid} = 'TAGGING FAILED!';
next;
}
my $result = retrieve($outpath);
my @tags = @{$result->[1]};
my $comment = $result->[0];
if (scalar @tags)
{
add_tags($newchr, $gene, $result->[1]);
}
$report{$gid} = $comment;
}
push @CLEANUP, @OUTS, @{$morefiles};
cleanup(\@CLEANUP);
$factory->cleanup();
return \%report;
}
=head2 serial_tagging
=cut
sub serial_tagging
{
my ($newchr, $genome, $p) = @_;
my $mask = $newchr->type_mask(['CDS', 'intron']);
my $GD = $newchr->GD;
my $bdir = Bio::BioStudio::ConfigData->config('tmp_path');
my $factory = Bio::Tools::Run::StandAloneBlastPlus->new(
-db_dir => $bdir,
-db_data => $genome,
-create => 1
);
$factory->make_db();
my %report = ();
my @genes = $newchr->db->features(
-seq_id => $newchr->seq_id,
-types => 'gene',
-start => $p->{STARTPOS},
-end => $p->{STOPPOS},
-range_type => 'contains'
);
#Find tags, pick tags, implement tags, check for errors
foreach my $gene (@genes)
{
my $gid = $gene->Tag_load_id;
my $result = tag_gene($newchr, $mask, $factory, $gid, $p);
my @tags = @{$result->[1]};
my $comment = $result->[0];
if (scalar @tags)
{
add_tags($newchr, $gene, $result->[1]);
}
$report{$gid} .= $comment;
}
$factory->cleanup();
return \%report;
}
=head2 tag_gene
=cut
sub tag_gene
{
my ($chr, $mask, $factory, $gid, $p) = @_;
my $gene = $chr->fetch_features(-name => $gid);
my $cDNA = $chr->make_cDNA($gene);
my $dir = $gene->strand;
my $return = undef;
if (length $cDNA < $p->{MINORFLEN})
{
return ['too short', []];
}
#Extract all possible tags from the gene
#
my @pretags = @{find_tags($gene, $chr, $mask, $p)};
if (scalar @pretags < 2)
{
return ['no more than one good tag preBLAST', []];
}
#BLAST all tags against the latest version of the genome
# only keep tags that hit wtseq once and newseq never
my @posttags = @{BLAST_tags(\@pretags, $factory)};
if (scalar @posttags < 2)
{
return ['no more than one good tag postBLAST', []];
}
if ($posttags[0]->start + $p->{MINAMPLEN} > $posttags[-1]->end)
{
return ['bad tag range postBLAST', []];
}
#choose tag pairs
#
my $tagtarget = length($cDNA) - $p->{MINORFLEN};
$tagtarget = $tagtarget / $p->{ORF_TAG_INC};
$tagtarget = ($tagtarget % $p->{ORF_TAG_INC}) + 1;
$p->{TARGET} = $tagtarget;
my @chosen = @{pair_tags($gene, \@posttags, $mask, $factory, $p)};
my $tagcount = scalar @chosen;
if (! $tagcount)
{
return ['no good tag pairs', []];
}
my $plural = $tagcount > 1 ? q{s} : q{};
return ["$tagcount pair$plural chosen", \@chosen];
}
=head2 find_tags
=cut
sub find_tags
{
my ($gene, $chr, $mask, $p) = @_;
my $MINTAGLEN = $p->{MINTAGLEN},
my $MAXTAGLEN = $p->{MAXTAGLEN},
my $MINTAGMELT = $p->{MINTAGMELT},
my $MAXTAGMELT = $p->{MAXTAGMELT},
my %BADAAS = %{$p->{BADAAS}},
my $MINPERDIFF = $p->{MINPERDIFF},
my $THREEPRIMEBUFFER = $p->{THREEPRIMEBUFFER},
my $FIVEPRIMEBUFFER = $p->{FIVEPRIMEBUFFER},
my $gid = $gene->Tag_load_id;
my $cDNA = $chr->make_cDNA($gene);
my $GD = $chr->GD();
my $gstart = $gene->start();
my ($rstart, $rstop) = ($gstart, $gene->end);
my $dir = $gene->strand;
if ($dir != 1)
{
$rstart += $THREEPRIMEBUFFER;
$rstop -= $FIVEPRIMEBUFFER;
}
else
{
$rstart += $FIVEPRIMEBUFFER;
$rstop -= $THREEPRIMEBUFFER;
}
my $geneseq = $gene->seq->seq;
my $aaseq = _translate($cDNA, 1, $GD->{codontable});
#Extract all possible tags from the gene
#
my $chrseq = $chr->sequence;
my @pretags = ();
my $count = $MAXTAGLEN;
while ($rstart + $count < $rstop - $MINTAGLEN)
{
#Make the wide oligo and the oligo
my $woligo = substr $chrseq, $rstart - 1, $count + 2;
my $wolen = length $woligo;
my $tstart = $dir == 1 ? $rstart + 2 : $rstart;
my $oligo = substr $chrseq, $tstart - 1, $count;
my $olen = length $oligo;
#Exclude oligos that begin or end in unswappable codons
my $peptide = _translate($woligo, $dir, $GD->{codontable});
my $firstres = substr $peptide, 0, 1;
my $lastres = substr $peptide, -1, 1;
if (exists $BADAAS{$firstres} || exists $BADAAS{$lastres})
{
($count, $rstart) = counterset($count, $rstart, $MINTAGLEN, $MAXTAGLEN);
next;
}
#Exclude oligos that don't meet melting standard
my $currTm = _ntherm($oligo);
if ($currTm < $MINTAGMELT || $currTm > $MAXTAGMELT)
{
if ($currTm < $MINTAGMELT)
{
$rstart = $rstart + 3;
$count = $MAXTAGLEN;
}
elsif ($currTm > $MAXTAGMELT)
{
($count, $rstart) = counterset($count, $rstart, $MINTAGLEN, $MAXTAGLEN);
}
next;
}
#Exclude oligos that aren't entirely in exons or lapped by other genes
my @ofeats = $mask->feature_objects_in_range($rstart, $count);
my @ifeats = grep { $_->primary_tag eq 'intron' } @ofeats;
if (scalar @ifeats || scalar @ofeats > 1)
{
($count, $rstart) = counterset($count, $rstart, $MINTAGLEN, $MAXTAGLEN);
next;
}
#Recode oligos
$woligo = _complement($woligo, 1) if ($dir == -1);
my $wmdoligo = $GD->codon_juggle(
-sequence => $woligo,
-algorithm => 'most_different_sequence'
);
#Ensure that the first two bases are the same
if ((substr $wmdoligo, 0, 2) ne (substr $woligo, 0, 2))
{
my $codon = substr $woligo, 0, 3;
my $aa = $GD->{codontable}->{$codon};
my $di = substr $codon, 0, 2;
my $possibles = $GD->{reversecodontable}->{$aa};
my @choices = grep {(substr $_, 0, 2) eq $di} @{$possibles};
@choices = grep {(substr $_, -1) ne (substr $codon, -1)} @choices;
substr $wmdoligo, 0, 3, $choices[0];
}
$woligo = _complement($woligo, 1) if ($dir == -1);
$wmdoligo = _complement($wmdoligo, 1) if ($dir == -1);
my $mdoligo = $dir == 1
? substr($wmdoligo, 2, $wolen - 2)
: substr($wmdoligo, 0, $wolen - 2);
#Exclude oligos whose recodes don't meet percent difference standard
my $comps = _compare_sequences($oligo, $mdoligo);
if ( $comps->{P} < $MINPERDIFF)
{
($count, $rstart) = counterset($count, $rstart, $MINTAGLEN, $MAXTAGLEN);
next;
}
#Exclude oligos whose recodes don't meet melting standard
my $MDTm = _ntherm($mdoligo);
if ($MDTm < $MINTAGMELT || $MDTm > $MAXTAGMELT)
{
($count, $rstart) = counterset($count, $rstart, $MINTAGLEN, $MAXTAGLEN);
next;
}
my $offset = $tstart - $gstart + 1;
my $tagid = $gid . q{_} . $offset;
my $tag = Bio::BioStudio::SeqFeature::Tag->new(
-start => $offset,
-end => $offset + $olen - 1,
-display_name => $tagid,
-ingene => $gid,
-wtseq => $oligo,
-newseq => $mdoligo,
-wtpos => $tstart,
-difference => $comps->{P},
-translation => $peptide,
);
push @pretags, $tag;
($count, $rstart) = counterset($count, $rstart, $MINTAGLEN, $MAXTAGLEN);
}
return \@pretags;
}
=head2 BLAST_tags
=cut
sub BLAST_tags
{
my ($tagarr, $factory) = @_;
my @tags = @{$tagarr};
#BLAST all tags against the latest version of the genome
my @tagobjs = ();
foreach my $tag (@tags)
{
my $id = $tag->display_name;
my $wtobj = Bio::Seq->new(-seq => $tag->wtseq(), id => 'wt_' . $id);
my $newobj = Bio::Seq->new(-seq => $tag->newseq(), id => 'md_' . $id);
push @tagobjs, $wtobj, $newobj;
}
$factory->run(
-method => 'blastn',
-query => \@tagobjs,
-method_args => [
-word_size => 17,
-perc_identity => 70
]);
$factory->rewind_results;
my %hits;
while (my $result = $factory->next_result)
{
my $name = $result->query_name();
while( my $hit = $result->next_hit())
{
while( my $hsp = $hit->next_hsp())
{
$hits{$name}++;
}
}
}
#only keep tags that hit wtseq once and newseq never
my @posttags;
foreach my $tag (sort {$a->start <=> $b->start} @tags)
{
my $id = $tag->display_name;
my $wthits = $hits{'wt_' . $id} || 0;
my $mdhits = $hits{'md_' . $id} || 0;
lib/Bio/BioStudio/PCRTagging.pm view on Meta::CPAN
}
next if (! scalar(keys %possibles));
my @downstreams = sort { $possibles{$a}->[0] <=> $possibles{$b}->[0]
|| $possibles{$a}->[1] <=> $possibles{$b}->[1] }
keys %possibles;
my @downbuds = map {$possibles{$_}->[2]} @downstreams;
my $partner = BLAST_pairs($utag, \@downbuds, $factory);
if ($partner)
{
$tagcount++;
$usedtags{$utag->display_id}++;
$usedtags{$partner->display_id}++;
$tmask->add_to_mask([$utag, $partner]);
my ($tstart, $tend) = (undef, undef);
if ($ustart < $partner->start)
{
$tstart = $ustart;
$tend = $partner->end;
push @chosen, [$utag, $partner];
}
else
{
$tstart = $partner->start;
$tend = $utag->end;
push @chosen, [$partner, $utag];
}
my $amp = Bio::SeqFeature::Generic->new(
-start => $tstart,
-end => $tend,
-display_name => 'anonamp' . $tagcount,
-primary_tag => 'amplicon',
);
$amask->add_to_mask([$amp]);
}
last if ($tagcount == $TARGET);
}
return \@chosen;
}
=head2 BLAST_pairs
=cut
sub BLAST_pairs
{
my ($utag, $dntags, $factory) = @_;
#BLAST the pair to check amplification uniqueness.
#
my %phits = ();
my %objs = ();
# The utag wt and new sequences, forwards
my $uid = $utag->display_name;
my $uwtid = 'rcwt_' . $uid;
$phits{$uwtid} = [];
$objs{$uwtid} = Bio::Seq->new(-seq => $utag->wtseq(), id => $uwtid);
my $unewid = 'rcmd_' . $uid;
$phits{$unewid} = [];
$objs{$unewid} = Bio::Seq->new(-seq => $utag->newseq(), id => $unewid);
# The dtags wt and new sequences, reverses
my @dtagids = ();
my @downbuds = @{$dntags};
my %refs;
foreach my $dtag (@downbuds)
{
my $did = $dtag->display_name;
$refs{$did} = $dtag;
push @dtagids, $did;
my $dwtid = 'rcwt_' . $did;
$phits{$dwtid} = [];
$objs{$dwtid} = Bio::Seq->new(-seq => $dtag->wtseq(), id => $dwtid);
my $dnewid = 'rcmd_' . $did;
$phits{$dnewid} = [];
$objs{$dnewid} = Bio::Seq->new(-seq => $dtag->newseq(), id => $dnewid);
}
my @pairobjs = values %objs;
$factory->run(
-method => 'blastn',
-query => \@pairobjs,
-method_args => [
-word_size => 4,
-gapextend => 2,
-gapopen => 1,
-penalty => -1,
-perc_identity => 70,
-best_hit_overhang => .25
]);
$factory->rewind_results;
while (my $result = $factory->next_result)
{
my $name = $result->query_name();
my $qlen = length $objs{$name}->seq;
while( my $hit = $result->next_hit())
{
while( my $hsp = $hit->next_hsp())
{
next if $hsp->percent_identity < 85;
next if ($hsp->length < .7 * $qlen);
push @{$phits{$name}}, [$hit, $hsp];
}
}
}
my $partner = undef;
my @Fwts = @{$phits{'rcwt_' . $uid}};
my @Fmds = @{$phits{'rcmd_' . $uid}};
foreach my $dtagid (@dtagids)
{
my ($wtnogo, $mdnogo) = (0, 0);
my @Rwts = @{$phits{'rcwt_' . $dtagid}};
foreach my $Rwpair (@Rwts)
{
my ($Rwhit, $Rwhsp) = @{$Rwpair};
foreach my $Fwpair (@Fwts)
{
my ($Fwhit, $Fwhsp) = @{$Fwpair};
#Pass if hits are on different chromosomes
next if ($Rwhit->name ne $Fwhit->name);
#Pass if hits are on the same strand
next if ($Rwhsp->strand eq $Fwhsp->strand);
$wtnogo++;
}
}
my @Rmds = @{$phits{'rcmd_' . $dtagid}};
foreach my $Rmpair (@Rmds)
{
my ($Rmhit, $Rmhsp) = @{$Rmpair};
foreach my $Fmpair (@Fmds)
{
my ($Fmhit, $Fmhsp) = @{$Fmpair};
#Pass if hits are on different chromosomes
next if ($Rmhit->name ne $Fmhit->name);
#Pass if hits are on the same strand
next if ($Rmhsp->strand eq $Fmhsp->strand);
$mdnogo++;
}
}
if ($mdnogo + $wtnogo == 0)
{
$partner = $refs{$dtagid};
last;
}
}
$factory->cleanup;
return $partner;
}
=head2 counterset
=cut
sub counterset
{
my ($count, $start, $minlen, $maxlen) = @_;
$start = $start + 3 if ($count < $minlen + 2);
$count = $count >= $minlen + 2 ? $count - 3 : $maxlen;
return ($count, $start);
}
=head2 add_tags
Add tags and amplicons to database, make sequence changes
=cut
sub add_tags
{
my ($chr, $gene, $tagarr, $report) = @_;
my @chosen = @{$tagarr};
my $tagcount = scalar @chosen;
my $gstart = $gene->start;
my $gend = $gene->end;
my $gid = $gene->Tag_load_id;
my $tcount = 1;
foreach my $tagpair (@chosen)
{
my ($gutag, $gdtag) = @{$tagpair};
my $adjust = $gstart - 1;
$gutag->start($gutag->start() + $adjust);
$gdtag->start($gdtag->start() + $adjust);
$gutag->end($gutag->end() + $adjust);
$gdtag->end($gdtag->end() + $adjust);
my $utagid = $gutag->display_name;
my $dtagid = $gdtag->display_name;
my $aid = $gid . '_amp' . $tcount;
my $comment = "PCR_product $aid annotated ";
$comment .= "(tags $utagid and $dtagid added)";
my $amp = Bio::BioStudio::SeqFeature::Amplicon->new(
-start => $gutag->start,
-end => $gdtag->end,
-ingene => $gid,
-uptag => $utagid,
-dntag => $dtagid,
-display_name => $aid,
);
$chr->add_feature(-feature => $gutag);
$chr->add_feature(-feature => $gdtag);
$chr->add_feature(-feature => $amp, -comments => [$comment]);
$tcount++;
}
return;
}
=head2 sortdiff
=cut
sub sortdiff
{
return $b->difference() <=> $a->difference() || $a->start <=> $b->start;
}
1;
__END__
=head1 COPYRIGHT AND LICENSE
Copyright (c) 2015, BioStudio developers
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
* The names of Johns Hopkins, the Joint Genome Institute, the Joint BioEnergy
Institute, the Lawrence Berkeley National Laboratory, the Department of Energy,
and the BioStudio developers may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
=cut
( run in 0.629 second using v1.01-cache-2.11-cpan-5735350b133 )