backup results from the CPAN

Bio-WGS2NCBI
view release on metacpan or search on metacpan
The location of the template file produced with the form at:
L<http://www.ncbi.nlm.nih.gov/WebSub/template.cgi>

=item C<outdir>

The location where the resulting ASN.1 files are written.

=item C<discrep>

The location where the discrepancy report is written.

=item C<tbl2asn>

The path to the tbl2asn executable.

=back

=cut

# Builds a tbl2asn command line from the configuration and hands the process
# over to it. Takes no arguments; every setting is read from a fresh
# Bio::WGS2NCBI::Config object. On success this sub does not return, because
# exec replaces the running perl process. Dies if the command cannot be
# started.
sub convert {
	my $config = Bio::WGS2NCBI::Config->new;
	my $INDIR   = $config->datadir;
	my $TMPL    = $config->template;
	my $OUTDIR  = $config->outdir;
	my $DISCREP = $config->discrep;
	my $TBL2ASN = $config->tbl2asn;

	# The command is a single string containing shell redirections, so perl
	# passes it to the shell; all output of the command is discarded.
	my $command = "$TBL2ASN -p $INDIR -t $TMPL -M n -a r10k -l paired-ends -r $OUTDIR -Z $DISCREP > /dev/null 2>&1";
	INFO "going to execute command '$command'";

	# exec only returns when the command could not be started. Report that
	# instead of silently falling off the end of the sub.
	exec $command or die "Unable to execute '$command': $!";
}

=head1 trim

The C<trim> action trims stretches of leading or trailing NNNs from sequence records, and
updates the coordinates in the associated feature tables accordingly. In cases where a 
feature falls within a trimmed region, the feature is removed entirely.

=over

=item C<datadir>

The location of the dir where the (potentially 'chunked', see below) sequence files
and feature tables were written by L<Bio::WGS2NCBI/process>.

=back

=cut

# Trims every *.fsa file in the configured data directory and rewrites the
# matching *.tbl feature table so that its coordinates agree with the trimmed
# sequences. Takes no arguments; the directory comes from
# Bio::WGS2NCBI::Config->datadir.
#
# For each <stem>.fsa:
#  - the FASTA file is renamed to <stem>.fsa.bak and a truncated copy is
#    written under the original name;
#  - <stem>.tbl is renamed to <stem>.tbl.bak and a filtered, shifted copy is
#    written under the original name.
#
# Dies when the directory or a file cannot be opened, renamed or (for files
# being written) closed.
sub trim {
	my $config = Bio::WGS2NCBI::Config->new;
	my $INDIR  = $config->datadir;

	# Snapshot the FASTA file names before touching the directory. The loop
	# below renames and creates files in $INDIR; whether readdir reports
	# entries added after opendir is unspecified, so iterating the live
	# handle could visit the freshly written file a second time and
	# overwrite the original backup with the trimmed copy.
	opendir my $dh, $INDIR or die "Can't open directory '$INDIR': $!";
	my @fasta_files = grep { /.+\.fsa$/ } readdir $dh;
	closedir $dh;

	for my $file ( @fasta_files ) {
		my ($stem) = $file =~ /(.+)\.fsa$/;
		my $fasta     = "${INDIR}/${file}";
		my $fasta_bak = "${INDIR}/${file}.bak";
		my $tbl       = "${INDIR}/${stem}.tbl";
		my $tbl_bak   = "${INDIR}/${stem}.tbl.bak";

		# make backup of FASTA file; stop here if that fails so that the
		# original is never opened for writing without a backup
		rename $fasta, $fasta_bak
			or die "Can't rename '$fasta' to '$fasta_bak': $!";

		# read file, look up non-missing residue positions, write truncated
		open my $fh,  '<', $fasta_bak or die "Can't read '$fasta_bak': $!";
		open my $out, '>', $fasta     or die "Can't write '$fasta': $!";
		my ( $pos, $seq, %coord );
		while( not eof($fh) ) {
			( $pos, $seq ) = Bio::WGS2NCBI::Seq->read_fasta( $fh, $pos );
			my $id = $seq->id;

			# presumably the first and the last non-missing index, both
			# 0-based (trunc is called with +1 below) - TODO confirm
			# against Bio::WGS2NCBI::Seq
			my $i1 = $seq->get_non_missing_index;
			my $i2 = $seq->get_non_missing_index(1);
			INFO "$id\t$i1 .. $i2";

			# remembered per sequence ID for the feature table pass below
			$coord{$id} = [ $i1, $i2 ];
			$seq->trunc( $i1 + 1, $i2 + 1 )->write_fasta($out);
		}
		close $fh;

		# buffered write errors only surface on close
		close $out or die "Can't close '$fasta': $!";

		# make backup of TBL file, open handle for writing
		rename $tbl, $tbl_bak or die "Can't rename '$tbl' to '$tbl_bak': $!";
		open my $outtbl, '>', $tbl or die "Can't write '$tbl': $!";

		# the callback receives a sequence ID and writes the section header
		# for that sequence to the new table
		my $tr = Bio::WGS2NCBI::TableReader->new(
			'-file' => $tbl_bak,
			'-cb'   => sub {
				my $id = shift;
				print $outtbl '>Features ', $id, "\n";
			}
		);

		# iterate over features; $drop is only updated when a gene feature
		# is seen, so the features that follow a gene share its fate until
		# the next gene comes along
		my $drop;
		while( my $f = $tr->next_feature ) {
			my $id = $tr->seq;
			if ( $f->isa('Bio::WGS2NCBI::GeneFeature') ) {
				$drop = $f->lies_within( @{ $coord{$id} } ) ? 0 : 1;
			}
			next if $drop;

			# shift features leftward by the start offset recorded for
			# this sequence; nothing to do when that offset is 0
			if ( my $diff = $coord{$id}->[0] ) {
				my @r = $f->range;
				for my $r ( @r ) {

					# Rewrite start and end in place. A coordinate is an
					# optional non-digit prefix followed by digits; one
					# that does not look like that is left untouched
					# rather than moving its neighbour into its slot.
					for my $i ( 0, 1 ) {
						next if not defined $r->[$i];
						if ( $r->[$i] =~ /^([^0-9]*)(\d+)$/ ) {
							my $prefix = $1;
							my $number = $2;
							$number -= $diff;
							$r->[$i] = $prefix . $number;
						}
					}
				}
			}
			print $outtbl $f->to_string;
		}
		close $outtbl or die "Can't close '$tbl': $!";
	}
}

=head1 prune

The C<prune> action reads a discrepancy file as supplied by NCBI, parses out errors that
have locations in them, which are then pruned from the table files in $config->datadir.

This requires the following configuration settings:

=over
( run in 2.262 seconds using v1.01-cache-2.11-cpan-5a3173703d6 )