Bio-BPWrapper

 view release on metacpan or  search on metacpan

lib/Bio/BPWrapper/SeqManipulations.pm  view on Meta::CPAN

=encoding utf8

=head1 NAME

Bio::BPWrapper::SeqManipulations - Functions for L<bioseq>

=head1 SYNOPSIS

    use Bio::BPWrapper::SeqManipulations;
    # Set options hash ...
    initialize(\%opts);
    write_out(\%opts);

=cut

package Bio::BPWrapper::SeqManipulations;

use strict;    # Still on 5.10, so need this for strict
use warnings;
use 5.010;
use Bio::Seq;
use Bio::SeqIO;
use File::Basename;
use Bio::Tools::CodonTable;
use Bio::DB::RefSeq;
use Bio::Tools::SeqStats;
use Bio::SeqUtils;
use Scalar::Util;
use Exporter ();
use Bio::CodonUsage::IO;
use Bio::Tools::pICalculator;
# use Bio::Tools::GuessSeqFormat;

if ($ENV{'DEBUG'}) { use Data::Dumper }

use vars qw(@ISA @EXPORT @EXPORT_OK);

@ISA         = qw(Exporter);

#restrict_coord restrict_digest
@EXPORT      = qw(initialize can_handle handle_opt write_out
print_composition filter_seqs retrieve_seqs remove_gaps
print_lengths print_seq_count make_revcom
print_subseq
anonymize
shred_seq count_codons print_gb_gene_feats
count_leading_gaps hydroB linearize reloop_at
remove_stop parse_orders find_by_order
pick_by_order del_by_order find_by_id
pick_by_id del_by_id find_by_re
pick_by_re del_by_re
pick_by_file del_by_file
find_by_ambig pick_by_ambig del_by_ambig find_by_length
del_by_length codon_sim codon_info);

# Package global variables
my ($in, $out, $seq, %opts, $filename, $in_format, $out_format, $guesser);
use Bio::BPWrapper;
my $VERSION = '1.0';

## For new options, just add an entry into this table with the same key as in
## the GetOpts function in the main program. Make the key be a reference to the handler subroutine (defined below), and test that it works.
my %opt_dispatch = (
    'codon-table' => \&codon_table,
#    'codon-sim' => \&codon_sim,
    'codon-info' => \&codon_info,
    'iep' => \&iso_electric_point,
    'composition' => \&print_composition,
    'mol-wt' => \&print_weight,
    'delete' => \&filter_seqs,
    'fetch' => \&retrieve_seqs,
    'no-gaps' => \&remove_gaps,
    'length' => \&print_lengths,
    'longest-orf' => \&update_longest_orf,
    'num-seq' => \&print_seq_count,
    'num-gaps-dna' => \&count_gaps_dna,
    'num-gaps-aa' => \&count_gaps_aa,
    'pick' => \&filter_seqs,
    'revcom' => \&make_revcom,
    'subseq' => \&print_subseq,
    'translate' => \&reading_frame_ops,
#    'restrict-coord' => \&restrict_coord,
#    'restrict' => \&restrict_digest,
    'anonymize' => \&anonymize,
    'break' => \&shred_seq,
    'count-codons' => \&count_codons,
    'feat2fas' => \&print_gb_gene_feats,
    'lead-gaps' => \&count_leading_gaps,
    'hydroB' => \&hydroB,
    'linearize' => \&linearize,
    'reloop' => \&reloop_at,
    'remove-stop' => \&remove_stop,
    'sort' => \&sort_by,
    'split-cdhit' => \&split_cdhit,
    'syn-code' => \&synon_codons,
#   'dotplot' => \&draw_dotplot,
#    'extract' => \&reading_frame_ops,
#	'longest-orf' => \&reading_frame_ops,
#	'prefix' => \&anonymize,
	'rename' => \&rename_id,
#	'slidingwindow' => \&sliding_window,
#	'split' => \&split_seqs,
  );

my %filter_dispatch = (
    'find_by_order'  => \&find_by_order,
    'pick_by_order'  => \&pick_by_order,
    'delete_by_order'   => \&del_by_order,
    'find_by_id'     => \&find_by_id,
    'pick_by_id'     => \&pick_by_id,
    'delete_by_id'      => \&del_by_id,
    'find_by_re'     => \&find_by_re,
    'pick_by_re'     => \&pick_by_re,
    'find_by_file'     => \&find_by_file,
    'pick_by_file'     => \&pick_by_file,
    'delete_by_file'      => \&del_by_file,
    'find_by_ambig'  => \&find_by_ambig,
    'pick_by_ambig'  => \&pick_by_ambig,
    'delete_by_ambig'   => \&del_by_ambig,
    'find_by_length' => \&find_by_length,
    'pick_by_length' => \&pick_by_length,
    'delete_by_length'  => \&del_by_length,
);

##################### initializer & option handlers ###################

=head1 SUBROUTINES

=head2 initialize()

Sets up most of the actions to be performed on sequences.

Call this right after setting up an options hash.

Sets package variables: C<$in>, C<$in_format>, C<$filename>, C<$out_format>, and C<$out>.


=cut

sub initialize {
    my $opts_ref = shift;
    Bio::BPWrapper::common_opts($opts_ref);
    %opts = %{$opts_ref};

    die "Option 'prefix' requires a value\n" if defined $opts{"prefix"} && $opts{"prefix"} =~ /^$/;

    $filename = shift @ARGV || "STDIN";    # If no more arguments were given on the command line, assume we're getting input from standard input

# guess format won't work for piped input; remove
#    if ($filename eq "STDIN") {
#	my $lines; 
#	my $line_ct = 0; 
#	while(<>) { $lines .= $_; $line_ct++; last if $line_ct >= 100 } # read the first 100 lines
#	$guesser = Bio::Tools::GuessSeqFormat->new( -text => $lines );
#    } else {
#	$guesser = Bio::Tools::GuessSeqFormat->new( -file => $filename);
#    }
#    $in_format  = $guesser->guess() unless $opts{'input'};

    $in_format = $opts{"input"} // 'fasta';

#    die "Reads only fasta, fastq, embl, genbank. Not aligment file formats like clustalw\n" unless $in_format =~ /fasta|fastq|embl|genbank/;
    $in = Bio::SeqIO->new(-format => $in_format, ($filename eq "STDIN")? (-fh => \*STDIN) : (-file => $filename));

    $out_format = $opts{"output"} // 'fasta';

# A change in SeqIO, commit 0e04486ca4cc2e61fd72, means -fh or -file is required
    $out = Bio::SeqIO->new(-format => $out_format, -fh => \*STDOUT)
}

sub can_handle {
    my $option = shift;
    return defined($opt_dispatch{$option})
}

sub handle_opt {
    my $option = shift;
    $opt_dispatch{$option}->($option)  # This passes option name to all functions
}

=head2 write_out()

Writes out the sequence file.

Call this after calling C<#initialize(\%opts)> and processing those options.

=cut

sub write_out {
    while ($seq = $in->next_seq()) { $out->write_seq($seq) }
}


################### subroutines ########################

sub codon_sim {
#    my $seq = $in->next_seq(); # only the first sequence used
#    if (&_internal_stop_or_x($seq->translate()->seq())) {
#	die "internal stop or non-standard base:\t" . $seq->id . "\texit.\n";
#    }
#    use Algorithm::Numerical::Sample  qw /sample/;
#    use Math::Random qw /random_permutation/;
########################
# Read CUTG and make a random codon set for each AA
########################
#    my $cutg_file = $opts{'codon-sim'};
#    my $io = Bio::CodonUsage::IO->new(-file => $cutg_file);
#    my $cdtable = $io->next_data();
    my @bases = qw(A T C G);
    my @codons;
    for (my $i=0; $i<=3; $i++) {
	my $first = $bases[$i];
	for (my $j=0; $j<=3; $j++) {
	    my $second = $bases[$j];
	    for (my $k=0; $k<=3; $k++) {
		my $third = $bases[$k];
		push @codons, $first . $second . $third;
	    }



( run in 0.896 second using v1.01-cache-2.11-cpan-39bf76dae61 )