Bio-BPWrapper
view release on metacpan or search on metacpan
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
$ct++;
push @seq, $seq
}
die "No computable sequences: less than 2 seq.\n" unless $ct >= 2;
print "\t", $ct, "\t", $aln->length(), "\n";
foreach (@seq) {
printf "%-50s", $_->display_id();
print $_->seq(), "\n"
}
exit;
}
###################### subroutine ######################
sub gap_char {
my $char = $opts{'gap-char'};
die "gap-char takes a single character\n" unless length($char) == 1;
foreach my $seq ($aln->each_seq()) {
my $seq_str = $seq->seq();
$seq_str =~ s/[\.-]/$char/g;
$seq->seq($seq_str);
}
}
sub pair_diff_ref {
my $refId = $opts{'pair-diff-ref'};
my (@seqs, $refSeq);
foreach my $seq ($aln->each_seq()) {
if ($seq->id eq $refId) {
$refSeq = $seq;
} else {
push @seqs, $seq;
}
}
die "ref seq $refId not found\n" unless $refSeq;
@seqs = sort { $a->id() cmp $b->id() } @seqs;
for (my $i=0; $i < $#seqs; $i++) {
my $idB = $seqs[$i]->id();
my $seqB = $seqs[$i];
my $pair = new Bio::SimpleAlign;
$pair->add_seq($refSeq);
$pair->add_seq($seqB);
# $pair = $pair->remove_gaps(); # not relialbe, e.g., n/N for DNA seqs
my $ct_diff = 0;
my $ct_valid = 0;
# my $matchLine = $pair->match_line();
# my @match_symbols = split //, $matchLine;
for (my $j = 1; $j <= $pair->length; $j++) {
my $mA = $refSeq->subseq($j,$j);
my $mB = $seqB->subseq($j,$j);
next if $refSeq->alphabet eq 'dna' && $seqB->alphabet eq 'dna' && ($mA !~ /^[ATCG]$/i || $mB !~ /^[ATCG]$/i);
# next if $match_symbols[$i] eq '*';
next if $mA eq '-' || $mB eq '-';
$ct_valid++;
$ct_diff++ unless $mA eq $mB;
# next if $match_symbols[$i] eq '*';
}
my $pairdiff = $pair->percentage_identity();
print join "\t", ($refId, $idB, $ct_diff, $ct_valid, $pair->length());
printf "\t%.4f\t%.4f\n", $pairdiff, 1-$pairdiff/100;
}
exit;
}
sub pair_diff {
my $alnBack = $aln;
my $lenTotal = $alnBack->length();
# $alnBack = $alnBack->remove_gaps();
my $matchLineFull = $alnBack->match_line();
my @match_symbols_full = split //, $matchLineFull;
my $num_var = 0; # contain gaps
# my $num_var = 0; # de-gapped variable sites
for (my $i = 0; $i < $alnBack->length; $i++) {
next if $match_symbols_full[$i] eq '*';
$num_var++;
}
my (@seqs);
print join "\t", qw(seq_1 seq_2 len_gapless len_variable diff_gapless diff_variable len_aln identitty fac_diff frac_gapless frac_variable);
print "\n";
foreach my $seq ($aln->each_seq()) { push @seqs, $seq }
@seqs = sort { $a->id() cmp $b->id() } @seqs;
for (my $i=0; $i < $#seqs; $i++) {
my $idA = $seqs[$i]->id();
my $seqA = $seqs[$i];
for (my $j=$i+1; $j <= $#seqs; $j++) {
my $idB = $seqs[$j]->id();
my $seqB = $seqs[$j];
my $pair = new Bio::SimpleAlign;
$pair->add_seq($seqA);
$pair->add_seq($seqB);
# $pair = $pair->remove_gaps(); # unreliable: e.g., "n", "N" in DNA seqs
# my $mask = $seqA->seq ^ $seqB->seq; # (exclusive or) operator: returns "\0" if same
my $ct_diff = 0;
my $ct_valid = 0;
my $gap_included_diff = 0;
# my $matchLine = $pair->match_line();
# my @match_symbols = split //, $matchLine;
for (my $j = 1; $j <= $pair->length; $j++) {
my $mA = $seqA->subseq($j,$j);
my $mB = $seqB->subseq($j,$j);
next if $seqA->alphabet eq 'dna' && $seqB->alphabet eq 'dna' && ($mA !~ /^[ATCG]$/i || $mB !~ /^[ATCG]$/i);
$gap_included_diff++ unless $mA eq $mB;
# next if $match_symbols[$i] eq '*';
next if $mA eq '-' || $mB eq '-';
$ct_valid++;
$ct_diff++ unless $mA eq $mB;
}
# while ($mask =~ /[^\0]/g) { $ct_diff++ }
my $pairdiff = $pair->percentage_identity();
print join "\t", ($idA, $idB, $ct_valid, $num_var, $ct_diff, $gap_included_diff, $pair->length());
printf "\t%.4f\t%.4f\t%.4f\t%.4f\n", $pairdiff, 1-$pairdiff/100, $ct_diff/$ct_valid, $gap_included_diff/$num_var;
}
}
exit;
}
sub split_cdhit {
my $cls_file = $opts{'split-cdhit'};
open IN, "<" . $cls_file || die "cdhit clstr file not found: $cls_file\n";
my %clusters;
my $cl_id;
my @mem;
while (<IN>) {
my $line = $_;
chomp $line;
if ($line =~ /^>(\S+)\s+(\d+)/) {
$cl_id = $1 . "_" . $2;
my @mem = ();
$clusters{$cl_id} = \@mem;
} else {
my $ref = $clusters{$cl_id};
my @mems = @$ref;
my @els = split /\s+/, $line;
my $seq_id = $els[2];
$seq_id =~ s/>//;
$seq_id =~ s/\.\.\.$//;
push @mems, $seq_id;
$clusters{$cl_id} = \@mems;
}
}
# print Dumper(\%clusters);
my %seqs;
foreach my $seq ($aln->each_seq()) {
my $id = $seq->display_id();
$seqs{$id} = $seq;
}
foreach my $id (keys %clusters) {
my $out = Bio::AlignIO->new( -file => ">" . $file . "-". $id . ".aln", -format => 'clustalw');
my $new_aln = Bio::SimpleAlign->new();
my @seqids = @{ $clusters{$id} };
foreach my $seq_id (@seqids) {
$new_aln->add_seq($seqs{$seq_id});
}
$new_aln->set_displayname_flat();
# $new_aln = &_remove_common_gaps($new_aln);
$out->write_aln($new_aln);
}
exit;
}
#sub _remove_common_gaps {
#}
sub trim_ends {
my (@seqs, @gaps);
foreach my $seq ($aln->each_seq()) {
my $id = $seq->display_id();
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
my @nts = split //, $seq->seq();
my $gap_start = 0;
my $new;
for (my $i=0; $i< $aln->length(); $i++) {
if ($nts[$i] eq '-') {
if ($gap_start) { # gap -> gap
$new->{end}++;
} else { # nt -> gap
$gap_start = 1;
$new = { 'start' => $i+1, 'end' => $i+1, 'seq_name' => $id }
}
} else {
if ($gap_start) { # gap -> nt
$gap_start = 0;
push @gaps, $new;
} else { # nt -> nt
next;
}
}
}
push @gaps, $new if $gap_start;
}
my (%gap_freqs, @uniq_gaps, %gap_presence);
foreach my $gap (@gaps) {
my $id = $gap->{start} . "-" . $gap->{end};
$gap->{id} = $id;
$gap_freqs{$id}++;
$gap_presence{$id}->{$gap->{seq_name}} = 1;
}
foreach my $id (keys %gap_freqs) {
my ($start, $end) = split /-/, $id;
push @uniq_gaps, {
'start' => $start,
'end' => $end,
'is_edge' => ($start == 1 || $end == $aln->length) ? 1 : 0,
'in_frame' => ($end - $start + 1) % 3 ? 0 : 1,
'counts' => $gap_freqs{$id},
'id' => $id
};
}
my @gaps_sorted = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @uniq_gaps;
foreach (@gaps_sorted) { print "\t", $_->{id}}
print "\n";
foreach my $sid (sort @seq_ids) {
print $sid;
foreach my $u_gap (@gaps_sorted) {
print "\t", $gap_presence{$u_gap->{id}}->{$sid} || 0;
}
print "\n";
}
# foreach my $gap (@uniq_gaps) { say join "\t", ($file, $gap->{start}, $gap->{end}, $gap->{is_edge}, $gap->{in_frame}, $gap->{counts}, $aln->length()) }
# print Dumper(\@uniq_gaps);
exit;
}
=head2 print_avp_id
Print the average percent identity of an alignment.
Wraps
L<Bio::SimpleAlign-E<gt>average_percentage_identity()|https://metacpan.org/pod/Bio::SimpleAlign#average_percentage_identity>.
=cut
sub print_avp_id {
printf "%.4f\n", $aln->average_percentage_identity();
exit
}
=head2 boostrap()
Produce a bootstrapped
alignment. L<Bio::Align::Utilities-E<gt>bootstrap()|https://metacpan.org/pod/Bio::Align::Utilities#bootstrap_replicates>.
=cut
sub bootstrap {
my $replicates = bootstrap_replicates($aln,1);
$aln = shift @$replicates
}
=head2 draw_codon_view()
Print a CLUSTALW-like alignment, but separated by codons. Intended for
use with DNA sequences. Block-final position numbers are printed at
the end of every alignment block at the point of wrapping, and
block-initial counts appear over first nucleotide in a block.
=cut
sub draw_codon_view {
# my $aln = shift;
# Is 20 by default. Blocks are measured in CODONS, so mult by 3
my $block_length = 3 * $opts{"codon-view"};
my $aln_length = $aln->length();
my $num_seqs = $aln->num_sequences();
my $min_pad = 4; # Minimum padding between sequence and ID
my $seq_matrix;
my @seqs = ($aln->each_seq);
my @display_ids;
# Find longest id length, add id/sequence padding
my $max_id_len = _find_max_id_len(\@seqs);
# id length includes padding
$max_id_len += $min_pad;
# Extract display_ids and sequences from AlignIO object.
foreach my $seq (@seqs) {
my @seq_str = split '', $seq->seq();
push @$seq_matrix, \@seq_str;
push @display_ids, $seq->display_id;
# Pad display ids so that space between them and sequence is consistent
$display_ids[-1] = _pad_display_id($display_ids[-1], $max_id_len)
}
my $nuc_count = 0;
# Loop over each sequence.
for (my $i = 0; $i < $num_seqs; $i++) {
# Print count at end of block when we are starting out a new block
_print_positions($nuc_count, $aln_length, $max_id_len) if $i == 0;
# Loop over nucleotides
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
}
my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
my $end = $loc_seq->end;
$loc_seq->end($end);
$new_aln->add_seq($loc_seq)
}
$aln = $new_aln
}
=head2 variable_sites()
Extracts variable sites.
=cut
sub variable_sites {
$aln = $aln->remove_gaps();
my $new_aln = Bio::SimpleAlign->new();
my $len=$aln->length();
my (%seq_ids, @sites, @var_sites);
# Go through each column and save variable sites
for (my $i=1; $i<=$len; $i++) {
my ($ref_bases, $ref_ids) = &_get_a_site($i);
%seq_ids = %{$ref_ids};
my $is_constant = &_is_constant(&_paste_nt($ref_bases));
if ($is_constant < 1) { push @sites, $ref_bases; push @var_sites, $i }
}
foreach (@var_sites) { warn $_, "\n" }
# Recreate the object for output
foreach my $id (sort keys %seq_ids) {
my $seq_str;
foreach my $aln_site (@sites) {
foreach (@$aln_site) { $seq_str .= $_->{nt} if $_->{id} eq $id }
}
my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
my $end = $loc_seq->end;
$loc_seq->end($end);
$new_aln->add_seq($loc_seq)
}
$aln = $new_aln
}
=head2 avg_id_by_win()
Calculate pairwise average sequence difference by windows (overlapping
windows with fixed step of 1). The window size is set in
C<$opts{"window"}> which is set via L<C<#initilize(\%opts)>|/initialize>.
=cut
sub avg_id_by_win {
my $window_sz = $opts{"window"};
for my $i (1 .. ($aln->length() - $window_sz + 1)) {
my $slice = $aln->slice($i, $i + $window_sz - 1);
my $pi = (100 - $slice->average_percentage_identity()) / 100;
printf "%d\t%d\t%.4f\n", $i, $i + $window_sz - 1, $pi
}
exit
}
=head2 concat()
Concatenate multiple alignments sharing the same set of unique
IDs. This is normally used for concatenating individual gene
alignments of the same set of samples to a single one for making a
"supertree". Wraps
L<Bio::Align::UtilitiesE<gt>cat()|https://metacpan.org/pod/Bio::Align::Utilities#cat>.
=cut
##################################################################
# 2/26/2021: add position map (for rate4site applications)
##############################################################
sub concat {
$aln = cat(@alns);
warn "Alignment concated. Getting position maps...\n";
my $refSeq = $opts{"ref-seq"} ? $aln->get_seq_by_id($opts{"ref-seq"}) : $aln->get_seq_by_pos(1);
# my $refSeq = $aln->get_seq_by_pos(1);
my @refGenesInOrder = map { $_ -> get_seq_by_id($refSeq->id) } @alns;
# remap start & end for individual gene alignments, sync pos with concatenated aln:
my $pos = 0;
my %geneRange;
for (my $i = 0; $i <= $#refGenesInOrder; $i++) {
my $start = $pos + 1;
$pos += $refGenesInOrder[$i]->length();
my $end = $pos;
$geneRange{$i+1} = {'start' => $start,
'end' => $end,
};
}
# warn Dumper(\%geneRange);
my @locTable;
for (my $i = 1; $i <= $refSeq->length(); $i++) {
next if $refSeq->subseq($i, $i) eq '-';
my ($inGene, $posGene) = &__gene_order($i, \%geneRange);
push @locTable, {
'pos_concat' => $i,
'pos_unaligned' => $refSeq->location_from_column($i)->start(),
'gene_order' => $inGene,
'pos_gene_aln' => $posGene,
'pos_gene_unaligned' => $refGenesInOrder[$inGene-1]->location_from_column($posGene)->start()
};
}
open LOG, ">concat.log";
print LOG join "\t", ("seq_id", "pos_concat", "pos_residue", "gene", "pos_gene_aligned", "pos_gene");
print LOG "\n";
foreach (@locTable) {
print LOG join "\t", (
$refSeq->id(),
$_->{pos_concat},
$_->{pos_unaligned},
$_->{gene_order},
$_->{pos_gene_aln},
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
my $min_block_size = $opts{"con-blocks"};
my %seq_ids;
die "Alignment contains only one sequence: $file\n" if $nseq < 2;
my (@blocks, $block);
my $in_block=0;
for (my $i=1; $i<=$len; $i++) {
my ($ref_bases, $ref_ids) = &_get_a_site($i);
%seq_ids = %{$ref_ids};
my $is_constant = &_is_constant(&_paste_nt($ref_bases));
if ($in_block) { # previous site is a contant one
if ($is_constant) {
$block->{length} ++;
my @sites = @{$block->{sites}};
push @sites, $ref_bases;
$block->{sites} = \@sites;
if ($i == $len) {
warn "Leaving a constant block at the end of alignment: $i\n";
push @blocks, $block if $block->{length} >= $min_block_size
}
} else {
$in_block = 0;
push @blocks, $block if $block->{length} >= $min_block_size;
warn "Leaving a constant block at $i\n"
}
} else { # previous site not a constant one
if ($is_constant) { # entering a block
warn "Entering a constant block at site $i ...\n";
$in_block=1;
$block = {start => $i, length => 1, num_seq => $nseq, sites => [($ref_bases)]} # start a new block
}
}
}
foreach my $bl (@blocks) {
my $out = Bio::AlignIO->new(-file=> ">$file" . ".slice-". $bl->{start} . ".aln" , -format=>'clustalw');
my $block_aln = Bio::SimpleAlign->new();
foreach my $id (sort keys %seq_ids) {
my ($seq_str, $ungapped_start, $ungapped_end);
my @sites = @{$bl->{sites}};
for (my $i = 0; $i <= $#sites; $i++) {
my $ref_chars = $sites[$i];
foreach (@$ref_chars) {
next unless $_->{id} eq $id;
$ungapped_start = $_->{ungapped_pos} if $i == 0;
$ungapped_end = $_->{ungapped_pos} if $i == $#sites;
$seq_str .= $_->{nt}
}
}
my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => $ungapped_start, -end => $ungapped_end);
$block_aln->add_seq($loc_seq)
}
$out->write_aln($block_aln)
}
exit
}
sub get_consensus {
my $percent_threshold = $opts{"consensus"};
my $consense = Bio::LocatableSeq->new(
-seq => $aln->consensus_string($percent_threshold),
-id => "Consensus_$percent_threshold",
-start => 1,
-end => $aln->length()
);
$aln->add_seq($consense)
}
=head2 dna_to_protein()
Align CDS sequences according to their corresponding protein
alignment. Wraps
L<Bio::Align::Utilities-E<gt>aa_to_dna_aln()|https://metacpan.org/pod/Bio::Align::Utilities#aa_to_dna_aln>.
=cut
sub dna_to_protein {
$aln = dna_to_aa_aln($aln)
}
sub remove_gapped_cols_in_one_seq {
my $id = $opts{"rm-col"};
my $nmatch=0;
my $ref_seq;
foreach ($aln->each_seq) {
if ($_->id() =~ /$id/) { $nmatch++; $ref_seq = $_ }
}
die "Quit. No ref seq found or more than one ref seq!\n" if !$nmatch || $nmatch > 1;
my ($ct_gap, $ref) = &_get_gaps($ref_seq);
warn "Original length: " . $aln->length() . "\n";
if ($ct_gap) {
my @args;
push @args, [$_, $_] foreach @$ref;
$aln = $aln->remove_columns(@args);
warn "New length: " . $aln->length() . "\n"
} else {
warn "No gap: " . $aln->length() . "\n"
}
}
sub colnum_from_residue_pos {
my ($id, $pos) = split /\s*,\s*/, $opts{"aln-index"};
print $aln->column_from_residue_number($id, $pos), "\n";
exit
}
=head2 list_ids()
List all sequence ids.
=cut
sub list_ids {
my @ids;
foreach ($aln->each_seq) { push @ids, $_->display_id() }
say join "\n", @ids;
exit
}
sub permute_states {
my $new_aln = Bio::SimpleAlign->new();
my $len=$aln->length();
lib/Bio/BPWrapper/AlnManipulations.pm view on Meta::CPAN
$count{$_}++ foreach @array;
my @keys = keys %count;
$constant = 0 if @keys > 1;
return $constant
}
sub _column_status {
my %count;
my $ref = shift;
my @array = @$ref;
my $st = { gap => 0, informative => 1, constant => 1 };
foreach (@array) {
$count{$_}++;
$st->{gap} = 1 if $_ =~ /[\-\?]/
}
my @keys = keys %count;
foreach (values %count) {
if ($_ < 2) { $st->{informative} = 0; last }
}
$st->{constant} = 0 if @keys > 1;
return $st
}
sub _get_a_site_v2 {
my %seq_ids;
my $len = $aln->length();
foreach my $seq ($aln->each_seq) {
my $id = $seq->id();
for (my $i = 1; $i <= $len; $i++) { $seq_ids{$id}{$i} = $seq->subseq($i, $i) }
}
return \%seq_ids
}
sub _find_max_id_len {
my $seqs = shift;
my @sorted_by_length = sort {length $a->display_id <=> length $b->display_id} @$seqs;
return length $sorted_by_length[-1]->display_id
}
1;
__END__
=head1 EXTENDING THIS MODULE
We encourage BioPerl developers to add command-line interface to their BioPerl methods here.
Here is how to extend. We'll use option C<--avpid> as an example.
=over 4
=item *
Create a new method like one of the above in the previous section.
=item *
Document your method in pod using C<=head2>. For example:
=head2 print_avpid
Print the average percent identity of an alignment.
Wraps
L<Bio::SimpleAlign-E<gt>average_percentage_identity()|https://metacpan.org/pod/Bio::SimpleAlign#average_percentage_identity>.
=cut
See L<C<print_avpid()>|/print_avpid> for how this gets rendered.
=item *
Add the method to C<@EXPORT> list in C<AlnManipulations.pm>.
=item *
Add option to C<%opt_displatch> which maps the option used in C<bioaln> to the subroutine that
gets called here. For example:
"avpid" => \&print_avp_id,
=item *
Add option in to C<bioaln> script. See the code that starts:
GetOptions(
...
"avpid|a",
...
This option has a short option name C<a> and takes no additional argument
=item *
Write a test for the option. See the file C<t/10test-bioaln.t> and L<Testing|https://github.com/bioperl/p5-bpwrapper/wiki/Testing>.
=item *
Share back. Create a pull request to the github repository and contact
Weigang Qiu, City University of New York, Hunter College (L<mailto:weigang@genectr.hunter.cuny.edu>)
=back
=head1 SEE ALSO
=over 4
=item *
L<bioaln>: command-line tool for using this
=item *
L<Qiu Lab wiki page|http://diverge.hunter.cuny.edu/labwiki/Bioutils>
=item *
L<Github project wiki page|https://github.com/bioperl/p5-bpwrapper>
=back
=head1 CONTRIBUTORS
=over 4
=item *
William McCaig <wmccaig at gmail dot com>
=item *
( run in 1.622 second using v1.01-cache-2.11-cpan-39bf76dae61 )