Bio-BPWrapper

 view release on metacpan or  search on metacpan

bin/bioaln  view on Meta::CPAN

B<bioaln> [options] <alignment file>

B<bioaln> [-h | --help | -V | --version | --man]


=head2 Alignment descriptors

 bioaln -l aln_file            # [l]ength of an alignment
 bioaln -L aln_file            # [L]ist sequence IDs
 bioaln -n aln_file            # [n]umber of aligned sequences
 bioaln -a aln_file            # [a]verage percent identity
 bioaln -w '30' aln_file       # average identifies for sliding [w]indows of 30

=head2 Alignment viewers

 bioaln -c aln_file            # [c]odon view (groups of 3 nts)
 bioaln -m aln_file            # [m]atch view (show variable sites)

=head2 Alignment filters (output a new alignment)

 bioaln -d 'Seq1,Seq2' aln_file           # [d]elete sequences

bin/bioaln  view on Meta::CPAN

=head1 OPTIONS

=over 4

=item --aln-index, -I <seq_id,position>

Return aligned position of a residue of a sequence based on its unaligned (gap-free) position.

=item --avg-pid, -a

Return average percent identity of an alignment.

=item --binary

Transform sequences into binary (0/1) strings for e.g., PHYLIP suites (see below)

=item --bin-inform

Print only binary informative sites (e.g., for parsimony analysis). Example PHYLIP application:

 bioaln --bin-inform --binary -o'phylip' foo.aln > foo.phy

bin/bioaln  view on Meta::CPAN

   bioaln --concat gene1.aln gene2.aln gene3.aln gene4.aln

or using wildcard to specify multiple files (check with "ls *.aln" first to make sure of alignment order):

   bioaln --concat gene*.aln

Two outputs:
 1. concated alignment (in STANDOUT)
 2. "concat.log" file, which shows mapped positions for a reference seq (specified by "-r" otherwise first sequence)

=item --consensus, -C 'percent' (default 50)

Add a consensus sequence to the end of the alignment with a certain threshold percent and id Consensus_<percent>.

=item --delete, -d 'seq_id1,seq_id2,etc'

Delete sequences based on their ids. Option takes a comma-separated list of ids.

=item --dna2pep, -D

Turn an in-frame protein-coding sequence alignment to a corresponding protein alignment.

=item --gap-char '.'

bin/bioaln  view on Meta::CPAN

=item --num-seq, -n

Print number of sequences in alignment.

=item --output, -o 'format'

Output file format. Common ones include 'clustalw' (default), 'fasta' and 'phylip'. See L<Bio::AlignIO> for supported formats. An additional format 'paml' is supported.

=item --pair-diff

Print pairwise sequence differences, including columns: seqA, seqB, num_variable_sites (no gap), num_pair_diff (no gap), total_pair_length (no gap), percent identity, fraction diff, and pair_diff/num_variable. For DNA seqs, it counts any non-ATCG's (...

=item --pair-diff-ref <id>

Print pairwise sequence differences to a specified sequence. For DNA seqs, it counts any non-ATCG's (e.g., N,n) as invalid, making it more robust than relying soly on percent_differece().

=item --pep2dna, -P 'unaligned-cds-file' <protein_alignment>

Produce an in-frame codon alignment by align CDS sequences according to their corresponding protein alignment. Throws an error if names in two files do not match exactly.

=item --permute-states, -M

Generate an alignment with randomly permuted residues at each site. This operation removes phylogenetic signal among aligned sequences, if there is any in the original alignment. This is the basis of the Permutation Trail Prob (PTP) test of the tree-...

=item --phy-nonint

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN

	for (my $j = 1; $j <= $pair->length; $j++) {
	    my $mA = $refSeq->subseq($j,$j);
	    my $mB = $seqB->subseq($j,$j);
	    next if $refSeq->alphabet eq 'dna' && $seqB->alphabet eq 'dna' && ($mA !~ /^[ATCG]$/i || $mB !~ /^[ATCG]$/i);
#		next if $match_symbols[$i] eq '*';
	    next if $mA eq '-' || $mB eq '-'; 
	    $ct_valid++;
	    $ct_diff++ unless $mA eq $mB;
#	    next if $match_symbols[$i] eq '*'; 
	}
	my $pairdiff = $pair->percentage_identity();
	print join "\t", ($refId, $idB, $ct_diff, $ct_valid, $pair->length());
	printf "\t%.4f\t%.4f\n", $pairdiff, 1-$pairdiff/100;
    }
    exit;
}

sub pair_diff {
    my $alnBack = $aln;
    my $lenTotal = $alnBack->length();
#    $alnBack = $alnBack->remove_gaps();

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN

		my $mA = $seqA->subseq($j,$j);
		my $mB = $seqB->subseq($j,$j);
		next if $seqA->alphabet eq 'dna' && $seqB->alphabet eq 'dna' && ($mA !~ /^[ATCG]$/i || $mB !~ /^[ATCG]$/i);
		$gap_included_diff++ unless $mA eq $mB;
#		next if $match_symbols[$i] eq '*'; 
		next if $mA eq '-' || $mB eq '-';
		$ct_valid++;
		$ct_diff++ unless $mA eq $mB;
	    }
#	    while ($mask =~ /[^\0]/g) { $ct_diff++ }
	    my $pairdiff = $pair->percentage_identity();
	    print join "\t", ($idA, $idB, $ct_valid, $num_var, $ct_diff, $gap_included_diff, $pair->length());
	    printf "\t%.4f\t%.4f\t%.4f\t%.4f\n", $pairdiff, 1-$pairdiff/100, $ct_diff/$ct_valid, $gap_included_diff/$num_var;
	}
    }
    exit;
}


sub split_cdhit {
    my $cls_file = $opts{'split-cdhit'};

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN

	print "\n";
    }

#    foreach my $gap (@uniq_gaps) { say join "\t", ($file, $gap->{start}, $gap->{end}, $gap->{is_edge}, $gap->{in_frame}, $gap->{counts}, $aln->length()) }
#    print Dumper(\@uniq_gaps);
    exit;
}

=head2 print_avp_id

Print the average percent identity of an alignment.

Wraps
L<Bio::SimpleAlign-E<gt>average_percentage_identity()|https://metacpan.org/pod/Bio::SimpleAlign#average_percentage_identity>.


=cut

sub print_avp_id {
    printf "%.4f\n", $aln->average_percentage_identity();
    exit
}

=head2 boostrap()

Produce a bootstrapped
alignment. L<Bio::Align::Utilities-E<gt>bootstrap()|https://metacpan.org/pod/Bio::Align::Utilities#bootstrap_replicates>.

=cut

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN

windows with fixed step of 1). The window size is set in
C<$opts{"window"}> which is set via L<C<#initilize(\%opts)>|/initialize>.

=cut


sub avg_id_by_win {
    my $window_sz = $opts{"window"};
    for my $i (1 .. ($aln->length() - $window_sz + 1)) {
        my $slice = $aln->slice($i, $i + $window_sz - 1);
        my $pi = (100 - $slice->average_percentage_identity()) / 100;
        printf "%d\t%d\t%.4f\n", $i, $i + $window_sz - 1, $pi
    }
    exit
}

=head2 concat()

Concatenate multiple alignments sharing the same set of unique
IDs. This is normally used for concatenating individual gene
alignments of the same set of samples to a single one for making a

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN


            my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => $ungapped_start, -end => $ungapped_end);
            $block_aln->add_seq($loc_seq)
        }
        $out->write_aln($block_aln)
    }
    exit
}

sub get_consensus {
    my $percent_threshold = $opts{"consensus"};
    my $consense = Bio::LocatableSeq->new(
        -seq   => $aln->consensus_string($percent_threshold),
        -id    => "Consensus_$percent_threshold",
        -start => 1,
        -end   => $aln->length()
   );
    $aln->add_seq($consense)
}

=head2 dna_to_protein()

Align CDS sequences according to their corresponding protein
alignment. Wraps

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN

=over 4

=item *
Create a new method like one of the above in the previous section.

=item *
Document your method in pod using C<=head2>. For example:

    =head2 print_avpid

    Print the average percent identity of an alignment.

    Wraps
    L<Bio::SimpleAlign-E<gt>average_percentage_identity()|https://metacpan.org/pod/Bio::SimpleAlign#average_percentage_identity>.

    =cut

See L<C<print_avpid()>|/print_avpid> for how this gets rendered.


=item *
Add the method to C<@EXPORT> list in C<AlnManipulations.pm>.

=item *

lib/Bio/BPWrapper/SeqManipulations.pm  view on Meta::CPAN


    if ($seq_id =~ /$regex/) { warn "Deleted sequence: $seq_id\n" }
    else { $out->write_seq($currseq) }
}

# TODO This needs better documentation
sub find_by_ambig {
    my ($action, $currseq, $cutoff) = @_;
    my $string        = $currseq->seq();
    my $ct            = ($string =~ s/([^ATCG])/$1/gi); # won't work for AA seqs
    my $percent_ambig = $ct / $currseq->length();
    $filter_dispatch{"$action" . "_by_ambig"}->($currseq, $cutoff, $ct, $percent_ambig)
}

# TODO Probably better to change behavior when 'picking'?
sub pick_by_ambig {
    my ($currseq, $cutoff, $ct, $percent_ambig) = @_;
    $out->write_seq($currseq) if $percent_ambig > $cutoff
}

sub del_by_ambig {
    my ($currseq, $cutoff, $ct, $percent_ambig) = @_;

#    if ($percent_ambig > $cutoff) { warn "Deleted sequence: ", $currseq->id(), " number of N: ", $ct, "\n" }
    if ($ct >= $cutoff) { warn "Deleted sequence: ", $currseq->id(), " number of bad monomers: ", $ct, "\n" }
    else { $out->write_seq($currseq) }
}

sub find_by_length {
    my ($action, $currseq, $value) = @_;
    $filter_dispatch{$action . "_by_length"}->($currseq, $value)
}

sub pick_by_length {

t/10test-bioaln.t  view on Meta::CPAN

use warnings;
use Test::More;
use Path::Tiny;
use lib path($0)->absolute->parent->stringify;
use Config;
use Helper;


# option background (background needs special care)
my %notes = (
    'avg-pid' => 'average percent identity',
    'codon-view' => 'codon view',
    'con-blocks' => 'extract conserved blocks',
    'concat' => 'concatenate aln files',
    'dna2pep' => 'CDS alignment to protein alignment',
    'length' => 'length of an alignment',
    'list-ids' => 'list all sequence IDs',
    'match' => 'match view',
    'no-flat' => "turns ON 'begin-end' naming",
    'no-gaps' => 'remove gapped sites',
    'num-seq' => 'number of aligned sequences',



( run in 0.387 second using v1.01-cache-2.11-cpan-709fd43a63f )