Bio-BPWrapper
view release on metacpan or search on metacpan
B<bioaln> [options] <alignment file>
B<bioaln> [-h | --help | -V | --version | --man]
=head2 Alignment descriptors
bioaln -l aln_file # [l]ength of an alignment
bioaln -L aln_file # [L]ist sequence IDs
bioaln -n aln_file # [n]umber of aligned sequences
bioaln -a aln_file # [a]verage percent identity
bioaln -w '30' aln_file # average identifies for sliding [w]indows of 30
=head2 Alignment viewers
bioaln -c aln_file # [c]odon view (groups of 3 nts)
bioaln -m aln_file # [m]atch view (show variable sites)
=head2 Alignment filters (output a new alignment)
bioaln -d 'Seq1,Seq2' aln_file # [d]elete sequences
=head1 OPTIONS
=over 4
=item --aln-index, -I <seq_id,position>
Return aligned position of a residue of a sequence based on its unaligned (gap-free) position.
=item --avg-pid, -a
Return average percent identity of an alignment.
=item --binary
Transform sequences into binary (0/1) strings for e.g., PHYLIP suites (see below)
=item --bin-inform
Print only binary informative sites (e.g., for parsimony analysis). Example PHYLIP application:
bioaln --bin-inform --binary -o'phylip' foo.aln > foo.phy
bioaln --concat gene1.aln gene2.aln gene3.aln gene4.aln
or using wildcard to specify multiple files (check with "ls *.aln" first to make sure of alignment order):
bioaln --concat gene*.aln
Two outputs:
1. concated alignment (in STANDOUT)
2. "concat.log" file, which shows mapped positions for a reference seq (specified by "-r" otherwise first sequence)
=item --consensus, -C 'percent' (default 50)
Add a consensus sequence to the end of the alignment with a certain threshold percent and id Consensus_<percent>.
=item --delete, -d 'seq_id1,seq_id2,etc'
Delete sequences based on their ids. Option takes a comma-separated list of ids.
=item --dna2pep, -D
Turn an in-frame protein-coding sequence alignment to a corresponding protein alignment.
=item --gap-char '.'
=item --num-seq, -n
Print number of sequences in alignment.
=item --output, -o 'format'
Output file format. Common ones include 'clustalw' (default), 'fasta' and 'phylip'. See L<Bio::AlignIO> for supported formats. An additional format 'paml' is supported.
=item --pair-diff
Print pairwise sequence differences, including columns: seqA, seqB, num_variable_sites (no gap), num_pair_diff (no gap), total_pair_length (no gap), percent identity, fraction diff, and pair_diff/num_variable. For DNA seqs, it counts any non-ATCG's (...
=item --pair-diff-ref <id>
Print pairwise sequence differences to a specified sequence. For DNA seqs, it counts any non-ATCG's (e.g., N,n) as invalid, making it more robust than relying soly on percent_differece().
=item --pep2dna, -P 'unaligned-cds-file' <protein_alignment>
Produce an in-frame codon alignment by align CDS sequences according to their corresponding protein alignment. Throws an error if names in two files do not match exactly.
=item --permute-states, -M
Generate an alignment with randomly permuted residues at each site. This operation removes phylogenetic signal among aligned sequences, if there is any in the original alignment. This is the basis of the Permutation Trail Prob (PTP) test of the tree-...
=item --phy-nonint
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
for (my $j = 1; $j <= $pair->length; $j++) {
my $mA = $refSeq->subseq($j,$j);
my $mB = $seqB->subseq($j,$j);
next if $refSeq->alphabet eq 'dna' && $seqB->alphabet eq 'dna' && ($mA !~ /^[ATCG]$/i || $mB !~ /^[ATCG]$/i);
# next if $match_symbols[$i] eq '*';
next if $mA eq '-' || $mB eq '-';
$ct_valid++;
$ct_diff++ unless $mA eq $mB;
# next if $match_symbols[$i] eq '*';
}
my $pairdiff = $pair->percentage_identity();
print join "\t", ($refId, $idB, $ct_diff, $ct_valid, $pair->length());
printf "\t%.4f\t%.4f\n", $pairdiff, 1-$pairdiff/100;
}
exit;
}
sub pair_diff {
my $alnBack = $aln;
my $lenTotal = $alnBack->length();
# $alnBack = $alnBack->remove_gaps();
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
my $mA = $seqA->subseq($j,$j);
my $mB = $seqB->subseq($j,$j);
next if $seqA->alphabet eq 'dna' && $seqB->alphabet eq 'dna' && ($mA !~ /^[ATCG]$/i || $mB !~ /^[ATCG]$/i);
$gap_included_diff++ unless $mA eq $mB;
# next if $match_symbols[$i] eq '*';
next if $mA eq '-' || $mB eq '-';
$ct_valid++;
$ct_diff++ unless $mA eq $mB;
}
# while ($mask =~ /[^\0]/g) { $ct_diff++ }
my $pairdiff = $pair->percentage_identity();
print join "\t", ($idA, $idB, $ct_valid, $num_var, $ct_diff, $gap_included_diff, $pair->length());
printf "\t%.4f\t%.4f\t%.4f\t%.4f\n", $pairdiff, 1-$pairdiff/100, $ct_diff/$ct_valid, $gap_included_diff/$num_var;
}
}
exit;
}
sub split_cdhit {
my $cls_file = $opts{'split-cdhit'};
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
print "\n";
}
# foreach my $gap (@uniq_gaps) { say join "\t", ($file, $gap->{start}, $gap->{end}, $gap->{is_edge}, $gap->{in_frame}, $gap->{counts}, $aln->length()) }
# print Dumper(\@uniq_gaps);
exit;
}
=head2 print_avp_id
Print the average percent identity of an alignment.
Wraps
L<Bio::SimpleAlign-E<gt>average_percentage_identity()|https://metacpan.org/pod/Bio::SimpleAlign#average_percentage_identity>.
=cut
sub print_avp_id {
printf "%.4f\n", $aln->average_percentage_identity();
exit
}
=head2 boostrap()
Produce a bootstrapped
alignment. L<Bio::Align::Utilities-E<gt>bootstrap()|https://metacpan.org/pod/Bio::Align::Utilities#bootstrap_replicates>.
=cut
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
windows with fixed step of 1). The window size is set in
C<$opts{"window"}> which is set via L<C<#initilize(\%opts)>|/initialize>.
=cut
sub avg_id_by_win {
my $window_sz = $opts{"window"};
for my $i (1 .. ($aln->length() - $window_sz + 1)) {
my $slice = $aln->slice($i, $i + $window_sz - 1);
my $pi = (100 - $slice->average_percentage_identity()) / 100;
printf "%d\t%d\t%.4f\n", $i, $i + $window_sz - 1, $pi
}
exit
}
=head2 concat()
Concatenate multiple alignments sharing the same set of unique
IDs. This is normally used for concatenating individual gene
alignments of the same set of samples to a single one for making a
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => $ungapped_start, -end => $ungapped_end);
$block_aln->add_seq($loc_seq)
}
$out->write_aln($block_aln)
}
exit
}
sub get_consensus {
my $percent_threshold = $opts{"consensus"};
my $consense = Bio::LocatableSeq->new(
-seq => $aln->consensus_string($percent_threshold),
-id => "Consensus_$percent_threshold",
-start => 1,
-end => $aln->length()
);
$aln->add_seq($consense)
}
=head2 dna_to_protein()
Align CDS sequences according to their corresponding protein
alignment. Wraps
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
=over 4
=item *
Create a new method like one of the above in the previous section.
=item *
Document your method in pod using C<=head2>. For example:
=head2 print_avpid
Print the average percent identity of an alignment.
Wraps
L<Bio::SimpleAlign-E<gt>average_percentage_identity()|https://metacpan.org/pod/Bio::SimpleAlign#average_percentage_identity>.
=cut
See L<C<print_avpid()>|/print_avpid> for how this gets rendered.
=item *
Add the method to C<@EXPORT> list in C<AlnManipulations.pm>.
=item *
lib/Bio/BPWrapper/SeqManipulations.pm view on Meta::CPAN
if ($seq_id =~ /$regex/) { warn "Deleted sequence: $seq_id\n" }
else { $out->write_seq($currseq) }
}
# TODO This needs better documentation
sub find_by_ambig {
my ($action, $currseq, $cutoff) = @_;
my $string = $currseq->seq();
my $ct = ($string =~ s/([^ATCG])/$1/gi); # won't work for AA seqs
my $percent_ambig = $ct / $currseq->length();
$filter_dispatch{"$action" . "_by_ambig"}->($currseq, $cutoff, $ct, $percent_ambig)
}
# TODO Probably better to change behavior when 'picking'?
sub pick_by_ambig {
my ($currseq, $cutoff, $ct, $percent_ambig) = @_;
$out->write_seq($currseq) if $percent_ambig > $cutoff
}
sub del_by_ambig {
my ($currseq, $cutoff, $ct, $percent_ambig) = @_;
# if ($percent_ambig > $cutoff) { warn "Deleted sequence: ", $currseq->id(), " number of N: ", $ct, "\n" }
if ($ct >= $cutoff) { warn "Deleted sequence: ", $currseq->id(), " number of bad monomers: ", $ct, "\n" }
else { $out->write_seq($currseq) }
}
sub find_by_length {
my ($action, $currseq, $value) = @_;
$filter_dispatch{$action . "_by_length"}->($currseq, $value)
}
sub pick_by_length {
t/10test-bioaln.t view on Meta::CPAN
use warnings;
use Test::More;
use Path::Tiny;
use lib path($0)->absolute->parent->stringify;
use Config;
use Helper;
# option background (background needs special care)
my %notes = (
'avg-pid' => 'average percent identity',
'codon-view' => 'codon view',
'con-blocks' => 'extract conserved blocks',
'concat' => 'concatenate aln files',
'dna2pep' => 'CDS alignment to protein alignment',
'length' => 'length of an alignment',
'list-ids' => 'list all sequence IDs',
'match' => 'match view',
'no-flat' => "turns ON 'begin-end' naming",
'no-gaps' => 'remove gapped sites',
'num-seq' => 'number of aligned sequences',
( run in 0.387 second using v1.01-cache-2.11-cpan-709fd43a63f )