Bio-WGS2NCBI
view release on metacpan or search on metacpan
lib/Bio/WGS2NCBI.pm view on Meta::CPAN
The location of the template file produced with the form at:
L<http://www.ncbi.nlm.nih.gov/WebSub/template.cgi>
=item C<outdir>
The directory in which to write the resulting ASN.1 files.
=item C<discrep>
The location to which the discrepancy report is written.
=item C<tbl2asn>
The path to the tbl2asn executable.
=back
=cut
# Builds the tbl2asn command line from the configuration (data directory,
# submission template, output directory, discrepancy report location and the
# tbl2asn executable itself) and replaces the current process with it.
#
# The command is passed to the shell as a single string because it relies on
# shell redirection to silence tbl2asn's STDOUT and STDERR.
#
# Does not return on success. Dies if the command could not be started.
sub convert {
    my $config  = Bio::WGS2NCBI::Config->new;
    my $INDIR   = $config->datadir;
    my $TMPL    = $config->template;
    my $OUTDIR  = $config->outdir;
    my $DISCREP = $config->discrep;
    my $TBL2ASN = $config->tbl2asn;
    my $command = "$TBL2ASN -p $INDIR -t $TMPL -M n -a r10k -l paired-ends -r $OUTDIR -Z $DISCREP > /dev/null 2>&1";
    INFO "going to execute command '$command'";

    # exec only ever returns when the command could NOT be started, so a
    # return value here always signals failure; report it instead of
    # silently falling off the end of the sub.
    exec($command) or die "Unable to execute '$command': $!";
}
=head1 trim
The C<trim> action trims stretches of leading or trailing NNNs from sequence records, and
updates the coordinates in the associated feature tables accordingly. In cases where a
feature falls within a trimmed region, the feature is removed entirely.
=over
=item C<datadir>
The location of the dir where the (potentially 'chunked', see below) sequence files
and feature tables were written by L<Bio::WGS2NCBI/process>.
=back
=cut
# Trims leading and trailing missing residues from every *.fsa file in the
# configured data directory and rewrites the matching *.tbl feature table so
# that its coordinates agree with the truncated sequences.
#
# For each <stem>.fsa:
#   1. the FASTA file is renamed to <stem>.fsa.bak and re-written, truncated to
#      the span between the first and last non-missing residue;
#   2. <stem>.tbl is renamed to <stem>.tbl.bak and re-written, dropping genes
#      (and the features that follow them) that do not lie within the retained
#      span, and shifting the remaining coordinates leftward by the number of
#      leading residues that were removed.
#
# Dies if the directory cannot be read, or if a file cannot be renamed,
# opened or completely written.
sub trim {
    my $config = Bio::WGS2NCBI::Config->new;
    my $INDIR  = $config->datadir;

    # Take a snapshot of the FASTA file names BEFORE touching the directory.
    # The loop below renames and creates *.fsa files; whether readdir reports
    # entries added during a scan is unspecified, so iterating a live handle
    # could visit a freshly written file a second time and overwrite the
    # pristine backup.
    opendir my $dh, $INDIR or die "Can't open directory '$INDIR': $!";
    my @fasta_files = sort grep { /.+\.fsa$/ } readdir $dh;
    closedir $dh;

    for my $file ( @fasta_files ) {
        my ( $stem ) = $file =~ /(.+)\.fsa$/;
        my $fasta = "${INDIR}/${file}";
        my $table = "${INDIR}/${stem}.tbl";

        # make backup of FASTA file
        rename( $fasta, "${fasta}.bak" )
            or die "Can't rename '$fasta' to '${fasta}.bak': $!";

        # read file, look up non-missing residue positions, write truncated
        open my $fh,  '<', "${fasta}.bak" or die "Can't read '${fasta}.bak': $!";
        open my $out, '>', $fasta         or die "Can't write '$fasta': $!";
        my ( $pos, $seq, %coord );
        while ( not eof($fh) ) {
            ( $pos, $seq ) = Bio::WGS2NCBI::Seq->read_fasta( $fh, $pos );
            my $id = $seq->id;
            my $i1 = $seq->get_non_missing_index;
            my $i2 = $seq->get_non_missing_index(1);
            INFO "$id\t$i1 .. $i2";

            # remember the retained span per sequence for the table pass
            $coord{$id} = [ $i1, $i2 ];

            # the indices are passed to trunc with an offset of one
            $seq->trunc( $i1 + 1, $i2 + 1 )->write_fasta($out);
        }
        close $fh;

        # buffered write errors only surface when the handle is closed
        close $out or die "Can't finish writing '$fasta': $!";

        # make backup of TBL file, open handle for writing
        rename( $table, "${table}.bak" )
            or die "Can't rename '$table' to '${table}.bak': $!";
        open my $outtbl, '>', $table or die "Can't write '$table': $!";

        # The callback receives a sequence ID and writes the table header for
        # it; presumably the reader invokes it whenever it reaches a new
        # sequence in the table - verify against Bio::WGS2NCBI::TableReader.
        my $tr = Bio::WGS2NCBI::TableReader->new(
            '-file' => "${table}.bak",
            '-cb'   => sub {
                my $id = shift;
                print $outtbl '>Features ', $id, "\n";
            }
        );

        # iterate over features; $drop is decided at every gene feature and
        # stays in force for the features that follow, until the next gene
        my $drop;
        while ( my $f = $tr->next_feature ) {
            my $id = $tr->seq;
            if ( $f->isa('Bio::WGS2NCBI::GeneFeature') ) {
                $drop = $f->lies_within( @{ $coord{$id} } ) ? 0 : 1;
            }
            next if $drop;

            # shift features leftward by the number of trimmed leading residues
            if ( my $diff = $coord{$id}->[0] ) {
                for my $range ( $f->range ) {

                    # Rewrite each coordinate in place (the loop variable
                    # aliases the array element). A coordinate is an optional
                    # non-numeric prefix followed by digits; anything that
                    # does not look like that is left untouched in its own
                    # slot rather than shifting its neighbour into it.
                    for my $coord ( @{$range} ) {
                        next if not defined $coord;
                        if ( $coord =~ /^([^0-9]*)(\d+)$/ ) {
                            my $prefix = $1;
                            my $number = $2 - $diff;
                            $coord = $prefix . $number;
                        }
                    }
                }
            }
            print $outtbl $f->to_string;
        }
        close $outtbl or die "Can't finish writing '$table': $!";
    }
}
=head1 prune
The C<prune> action reads a discrepancy file as supplied by NCBI, parses out errors that
have locations in them, which are then pruned from the table files in $config->datadir.
This requires the following configuration settings:
=over
( run in 2.262 seconds using v1.01-cache-2.11-cpan-5a3173703d6 )