Bio-BPWrapper

 view release on metacpan or  search on metacpan

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN

        $ct++;
        push @seq, $seq
    }

    die "No computable sequences: less than 2 seq.\n" unless $ct >= 2;
    print "\t", $ct, "\t", $aln->length(), "\n";
    foreach (@seq) {
	   printf "%-50s", $_->display_id();
	   print $_->seq(), "\n"
    }
    exit;
}

###################### subroutine ######################

sub gap_char {
    my $char = $opts{'gap-char'};
    die "gap-char takes a single character\n" unless length($char) == 1;
    foreach my $seq ($aln->each_seq()) { 
	my $seq_str = $seq->seq();
	$seq_str =~ s/[\.-]/$char/g;
	$seq->seq($seq_str); 
    }
}

sub pair_diff_ref {
    my $refId = $opts{'pair-diff-ref'};
    my (@seqs, $refSeq);
    foreach my $seq ($aln->each_seq()) { 
	if ($seq->id eq $refId) {
	    $refSeq = $seq;
	} else {
	    push @seqs, $seq;
	}
    }

    die "ref seq $refId not found\n" unless $refSeq;

    @seqs = sort { $a->id() cmp $b->id() } @seqs;
    for (my $i=0; $i < $#seqs; $i++) {
	my $idB = $seqs[$i]->id();
	my $seqB = $seqs[$i];
	my $pair = new Bio::SimpleAlign;
	$pair->add_seq($refSeq);
	$pair->add_seq($seqB);
#	$pair = $pair->remove_gaps(); # not relialbe, e.g., n/N for DNA seqs
	my $ct_diff = 0;
	my $ct_valid = 0;
#	my $matchLine = $pair->match_line();
#	my @match_symbols = split //, $matchLine;
	for (my $j = 1; $j <= $pair->length; $j++) {
	    my $mA = $refSeq->subseq($j,$j);
	    my $mB = $seqB->subseq($j,$j);
	    next if $refSeq->alphabet eq 'dna' && $seqB->alphabet eq 'dna' && ($mA !~ /^[ATCG]$/i || $mB !~ /^[ATCG]$/i);
#		next if $match_symbols[$i] eq '*';
	    next if $mA eq '-' || $mB eq '-'; 
	    $ct_valid++;
	    $ct_diff++ unless $mA eq $mB;
#	    next if $match_symbols[$i] eq '*'; 
	}
	my $pairdiff = $pair->percentage_identity();
	print join "\t", ($refId, $idB, $ct_diff, $ct_valid, $pair->length());
	printf "\t%.4f\t%.4f\n", $pairdiff, 1-$pairdiff/100;
    }
    exit;
}

sub pair_diff {
    my $alnBack = $aln;
    my $lenTotal = $alnBack->length();
#    $alnBack = $alnBack->remove_gaps();
    my $matchLineFull = $alnBack->match_line();
    my @match_symbols_full = split //, $matchLineFull;
    my $num_var = 0; # contain gaps
#    my $num_var = 0; # de-gapped variable sites
    for (my $i = 0; $i < $alnBack->length; $i++) {
	next if $match_symbols_full[$i] eq '*'; 
	$num_var++;
    }

    my (@seqs);
    print join "\t", qw(seq_1 seq_2 len_gapless len_variable diff_gapless diff_variable len_aln identitty fac_diff frac_gapless frac_variable);
    print "\n";
    foreach my $seq ($aln->each_seq()) { push @seqs, $seq }
    @seqs = sort { $a->id() cmp $b->id() } @seqs;
    for (my $i=0; $i < $#seqs; $i++) {
	my $idA = $seqs[$i]->id();
	my $seqA = $seqs[$i];
	for (my $j=$i+1; $j <= $#seqs; $j++) {
	    my $idB = $seqs[$j]->id();
	    my $seqB = $seqs[$j];
	    my $pair = new Bio::SimpleAlign;
	    $pair->add_seq($seqA);
	    $pair->add_seq($seqB);
#	    $pair = $pair->remove_gaps(); # unreliable: e.g., "n", "N" in DNA seqs
#	    my $mask = $seqA->seq ^ $seqB->seq; #  (exclusive or) operator: returns "\0" if same
	    my $ct_diff = 0;
	    my $ct_valid = 0;
	    my $gap_included_diff = 0;
#	    my $matchLine = $pair->match_line();
#	    my @match_symbols = split //, $matchLine;
	    for (my $j = 1; $j <= $pair->length; $j++) {
		my $mA = $seqA->subseq($j,$j);
		my $mB = $seqB->subseq($j,$j);
		next if $seqA->alphabet eq 'dna' && $seqB->alphabet eq 'dna' && ($mA !~ /^[ATCG]$/i || $mB !~ /^[ATCG]$/i);
		$gap_included_diff++ unless $mA eq $mB;
#		next if $match_symbols[$i] eq '*'; 
		next if $mA eq '-' || $mB eq '-';
		$ct_valid++;
		$ct_diff++ unless $mA eq $mB;
	    }
#	    while ($mask =~ /[^\0]/g) { $ct_diff++ }
	    my $pairdiff = $pair->percentage_identity();
	    print join "\t", ($idA, $idB, $ct_valid, $num_var, $ct_diff, $gap_included_diff, $pair->length());
	    printf "\t%.4f\t%.4f\t%.4f\t%.4f\n", $pairdiff, 1-$pairdiff/100, $ct_diff/$ct_valid, $gap_included_diff/$num_var;
	}
    }
    exit;
}


sub split_cdhit {
    my $cls_file = $opts{'split-cdhit'};
    open IN, "<" . $cls_file || die "cdhit clstr file not found: $cls_file\n";
    my %clusters;
    my $cl_id;
    my @mem;
    while (<IN>) {
	my $line = $_;
	chomp $line;
	if ($line =~ /^>(\S+)\s+(\d+)/) {
	    $cl_id = $1 . "_" . $2;
	    my @mem = ();
	    $clusters{$cl_id} = \@mem;
	} else {
	    my $ref = $clusters{$cl_id};
	    my @mems = @$ref;
	    my @els = split /\s+/, $line;
	    my $seq_id = $els[2];
	    $seq_id =~ s/>//;
	    $seq_id =~ s/\.\.\.$//;
	    push @mems, $seq_id;
	    $clusters{$cl_id} = \@mems;
	}
    }
#    print Dumper(\%clusters);
    my %seqs;
    foreach my $seq ($aln->each_seq()) {
        my $id = $seq->display_id();
	$seqs{$id} = $seq;
    }

    foreach my $id (keys %clusters) {
	my $out = Bio::AlignIO->new( -file => ">" . $file . "-". $id . ".aln", -format => 'clustalw');
	my $new_aln = Bio::SimpleAlign->new();
	my @seqids = @{ $clusters{$id} };
	foreach my $seq_id (@seqids) {
	    $new_aln->add_seq($seqs{$seq_id});
	}
	$new_aln->set_displayname_flat();
#	$new_aln = &_remove_common_gaps($new_aln);
	$out->write_aln($new_aln);
    }
    exit;
}

#sub _remove_common_gaps {
#}

sub trim_ends {
    my (@seqs, @gaps);
    foreach my $seq ($aln->each_seq()) {
        my $id = $seq->display_id();

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN

        my @nts = split //, $seq->seq();
	my $gap_start = 0;
	my $new;
	for (my $i=0; $i< $aln->length(); $i++) {
	    if ($nts[$i] eq '-') {
		if ($gap_start) { # gap -> gap
		    $new->{end}++;
		} else { # nt -> gap
		    $gap_start = 1;
		    $new = { 'start' => $i+1, 'end' => $i+1, 'seq_name' => $id }
		}
	    } else {
		if ($gap_start) { # gap -> nt
		    $gap_start = 0;
		    push @gaps, $new;
		} else { # nt -> nt
		    next;
		}
	    }
	}
	push @gaps, $new if $gap_start;
    }
    my (%gap_freqs, @uniq_gaps, %gap_presence);
    foreach my $gap (@gaps) {
	my $id = $gap->{start} . "-" . $gap->{end};
	$gap->{id} = $id;
	$gap_freqs{$id}++;
	$gap_presence{$id}->{$gap->{seq_name}} = 1;
    }

    foreach my $id (keys %gap_freqs) {
	my ($start, $end) = split /-/, $id;
	push @uniq_gaps, {
	    'start' => $start,
	    'end' => $end,
	    'is_edge' => ($start == 1 || $end == $aln->length) ? 1 : 0,
	    'in_frame' => ($end - $start + 1) % 3 ? 0 : 1,
	    'counts' => $gap_freqs{$id},
	    'id' => $id
	};
    }

    my @gaps_sorted = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @uniq_gaps;
    foreach (@gaps_sorted) { print "\t", $_->{id}}
    print "\n";
    foreach my $sid (sort @seq_ids) {
	print $sid;
	foreach my $u_gap (@gaps_sorted) {
	    print "\t", $gap_presence{$u_gap->{id}}->{$sid} || 0;
	}
	print "\n";
    }

#    foreach my $gap (@uniq_gaps) { say join "\t", ($file, $gap->{start}, $gap->{end}, $gap->{is_edge}, $gap->{in_frame}, $gap->{counts}, $aln->length()) }
#    print Dumper(\@uniq_gaps);
    exit;
}

=head2 print_avp_id

Print the average percent identity of an alignment.

Wraps
L<Bio::SimpleAlign-E<gt>average_percentage_identity()|https://metacpan.org/pod/Bio::SimpleAlign#average_percentage_identity>.


=cut

sub print_avp_id {
    printf "%.4f\n", $aln->average_percentage_identity();
    exit
}

=head2 boostrap()

Produce a bootstrapped
alignment. L<Bio::Align::Utilities-E<gt>bootstrap()|https://metacpan.org/pod/Bio::Align::Utilities#bootstrap_replicates>.

=cut

sub bootstrap {
    my $replicates = bootstrap_replicates($aln,1);
    $aln = shift @$replicates
}

=head2 draw_codon_view()

Print a CLUSTALW-like alignment, but separated by codons. Intended for
use with DNA sequences. Block-final position numbers are printed at
the end of every alignment block at the point of wrapping, and
block-initial counts appear over first nucleotide in a block.

=cut

sub draw_codon_view {
#    my $aln = shift;
    # Is 20 by default. Blocks are measured in CODONS, so mult by 3
    my $block_length = 3 * $opts{"codon-view"};
    my $aln_length   = $aln->length();
    my $num_seqs     = $aln->num_sequences();
    my $min_pad = 4;    # Minimum padding between sequence and ID
    my $seq_matrix;
    my @seqs = ($aln->each_seq);
    my @display_ids;

    # Find longest id length, add id/sequence padding
    my $max_id_len = _find_max_id_len(\@seqs);

    # id length includes padding
    $max_id_len += $min_pad;

    # Extract display_ids and sequences from AlignIO object.
    foreach my $seq (@seqs) {
        my @seq_str = split '', $seq->seq();
        push @$seq_matrix, \@seq_str;
        push @display_ids, $seq->display_id;

       # Pad display ids so that space between them and sequence is consistent
        $display_ids[-1] = _pad_display_id($display_ids[-1], $max_id_len)
    }

    my $nuc_count = 0;

    # Loop over each sequence.
    for (my $i = 0; $i < $num_seqs; $i++) {

        # Print count at end of block when we are starting out a new block
        _print_positions($nuc_count, $aln_length, $max_id_len) if $i == 0;

        # Loop over nucleotides

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN

        }
        my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
        my $end = $loc_seq->end;
        $loc_seq->end($end);
        $new_aln->add_seq($loc_seq)
    }
    $aln = $new_aln
}


=head2 variable_sites()

Extracts variable sites.

=cut
sub variable_sites {
    $aln = $aln->remove_gaps();
    my $new_aln = Bio::SimpleAlign->new();
    my $len=$aln->length();
    my (%seq_ids, @sites, @var_sites);

    # Go through each column and save variable sites
    for (my $i=1; $i<=$len; $i++) {
        my ($ref_bases, $ref_ids) = &_get_a_site($i);
        %seq_ids = %{$ref_ids};
        my $is_constant = &_is_constant(&_paste_nt($ref_bases));
        if ($is_constant < 1) { push @sites, $ref_bases; push @var_sites, $i }
    }

    foreach (@var_sites) { warn $_, "\n" }

    # Recreate the object for output
    foreach my $id (sort keys %seq_ids) {
        my $seq_str;
        foreach my $aln_site (@sites) {
            foreach (@$aln_site) { $seq_str .= $_->{nt} if $_->{id} eq $id }
        }

        my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
        my $end = $loc_seq->end;
        $loc_seq->end($end);
        $new_aln->add_seq($loc_seq)
    }

    $aln = $new_aln
}

=head2 avg_id_by_win()

Calculate pairwise average sequence difference by windows (overlapping
windows with fixed step of 1). The window size is set in
C<$opts{"window"}> which is set via L<C<#initilize(\%opts)>|/initialize>.

=cut


sub avg_id_by_win {
    my $window_sz = $opts{"window"};
    for my $i (1 .. ($aln->length() - $window_sz + 1)) {
        my $slice = $aln->slice($i, $i + $window_sz - 1);
        my $pi = (100 - $slice->average_percentage_identity()) / 100;
        printf "%d\t%d\t%.4f\n", $i, $i + $window_sz - 1, $pi
    }
    exit
}

=head2 concat()

Concatenate multiple alignments sharing the same set of unique
IDs. This is normally used for concatenating individual gene
alignments of the same set of samples to a single one for making a
"supertree". Wraps
L<Bio::Align::UtilitiesE<gt>cat()|https://metacpan.org/pod/Bio::Align::Utilities#cat>.

=cut

##################################################################
# 2/26/2021: add position map (for rate4site applications)
##############################################################
sub concat {
    $aln = cat(@alns);
    warn "Alignment concated. Getting position maps...\n";
    my $refSeq = $opts{"ref-seq"} ? $aln->get_seq_by_id($opts{"ref-seq"}) : $aln->get_seq_by_pos(1);
#    my $refSeq = $aln->get_seq_by_pos(1);
    my @refGenesInOrder = map { $_ -> get_seq_by_id($refSeq->id) } @alns;
    # remap start & end for individual gene alignments, sync pos with concatenated aln:
    my $pos = 0;
    my %geneRange;
    for (my $i = 0; $i <= $#refGenesInOrder; $i++) {
	my $start = $pos + 1;
	$pos += $refGenesInOrder[$i]->length();
	my $end = $pos;
	$geneRange{$i+1} = {'start' => $start, 
			    'end' => $end, 
	};
    }

#    warn Dumper(\%geneRange);

    my @locTable;
    for (my $i = 1; $i <= $refSeq->length(); $i++) {
	next if $refSeq->subseq($i, $i) eq '-';
	my ($inGene, $posGene) = &__gene_order($i, \%geneRange);
	push @locTable, {
	    'pos_concat' => $i,
	    'pos_unaligned' => $refSeq->location_from_column($i)->start(),
	    'gene_order' => $inGene,
	    'pos_gene_aln' => $posGene,
	    'pos_gene_unaligned' => $refGenesInOrder[$inGene-1]->location_from_column($posGene)->start() 
	};
    }
    open LOG, ">concat.log";
    print LOG join "\t", ("seq_id", "pos_concat", "pos_residue", "gene", "pos_gene_aligned", "pos_gene");
    print LOG "\n"; 
    foreach (@locTable) {
	print LOG join "\t", (
	    $refSeq->id(),
	    $_->{pos_concat}, 
	    $_->{pos_unaligned}, 
	    $_->{gene_order},
	    $_->{pos_gene_aln},

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN

    my $min_block_size = $opts{"con-blocks"};
    my %seq_ids;

    die "Alignment contains only one sequence: $file\n" if $nseq < 2;

    my (@blocks, $block);
    my $in_block=0;
    for (my $i=1; $i<=$len; $i++) {
        my ($ref_bases, $ref_ids) = &_get_a_site($i);
        %seq_ids = %{$ref_ids};
        my $is_constant = &_is_constant(&_paste_nt($ref_bases));
        if ($in_block) { # previous site is a contant one
            if ($is_constant) {
                $block->{length} ++;
                my @sites = @{$block->{sites}};
                push @sites, $ref_bases;
                $block->{sites} = \@sites;
                if ($i == $len) {
                    warn "Leaving a constant block at the end of alignment: $i\n";
                    push @blocks, $block if $block->{length} >= $min_block_size
                }
            } else {
                $in_block = 0;
                push @blocks, $block if $block->{length} >= $min_block_size;
                warn "Leaving a constant block at $i\n"
            }
        } else { # previous site not a constant one
            if ($is_constant) { # entering a block
                warn "Entering a constant block at site $i ...\n";
                $in_block=1;
                $block = {start => $i, length => 1, num_seq => $nseq, sites => [($ref_bases)]}  # start a new block
            }
        }
    }

    foreach my $bl (@blocks) {
        my $out = Bio::AlignIO->new(-file=> ">$file" . ".slice-". $bl->{start} . ".aln" , -format=>'clustalw');
        my $block_aln = Bio::SimpleAlign->new();
        foreach my $id (sort keys %seq_ids) {
            my ($seq_str, $ungapped_start, $ungapped_end);
            my @sites = @{$bl->{sites}};
            for (my $i = 0; $i <= $#sites; $i++) {
                my $ref_chars = $sites[$i];
                foreach (@$ref_chars) {
                    next unless $_->{id} eq $id;
                    $ungapped_start = $_->{ungapped_pos} if $i == 0;
                    $ungapped_end = $_->{ungapped_pos} if $i == $#sites;
                    $seq_str .= $_->{nt}
                }
            }

            my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => $ungapped_start, -end => $ungapped_end);
            $block_aln->add_seq($loc_seq)
        }
        $out->write_aln($block_aln)
    }
    exit
}

sub get_consensus {
    my $percent_threshold = $opts{"consensus"};
    my $consense = Bio::LocatableSeq->new(
        -seq   => $aln->consensus_string($percent_threshold),
        -id    => "Consensus_$percent_threshold",
        -start => 1,
        -end   => $aln->length()
   );
    $aln->add_seq($consense)
}

=head2 dna_to_protein()

Align CDS sequences according to their corresponding protein
alignment. Wraps
L<Bio::Align::Utilities-E<gt>aa_to_dna_aln()|https://metacpan.org/pod/Bio::Align::Utilities#aa_to_dna_aln>.

=cut

sub dna_to_protein {
    $aln = dna_to_aa_aln($aln)
}

sub remove_gapped_cols_in_one_seq {
    my $id = $opts{"rm-col"};
    my $nmatch=0;
    my $ref_seq;
    foreach ($aln->each_seq) {
        if ($_->id() =~ /$id/) { $nmatch++; $ref_seq = $_ }
    }
    die "Quit. No ref seq found or more than one ref seq!\n" if !$nmatch || $nmatch > 1;
    my ($ct_gap, $ref) = &_get_gaps($ref_seq);
    warn "Original length: " . $aln->length() . "\n";
    if ($ct_gap) {
        my @args;
        push @args, [$_, $_] foreach @$ref;
        $aln = $aln->remove_columns(@args);
        warn "New length: " . $aln->length() . "\n"
    } else {
        warn "No gap: " . $aln->length() . "\n"
    }
}

sub colnum_from_residue_pos {
    my ($id, $pos) = split /\s*,\s*/, $opts{"aln-index"};
    print $aln->column_from_residue_number($id, $pos), "\n";
    exit
}

=head2 list_ids()

List all sequence ids.

=cut

sub list_ids {
    my @ids;
    foreach ($aln->each_seq) { push @ids, $_->display_id() }
    say join "\n", @ids;
    exit
}

sub permute_states {
    my $new_aln = Bio::SimpleAlign->new();
    my $len=$aln->length();

lib/Bio/BPWrapper/AlnManipulations.pm  view on Meta::CPAN

    $count{$_}++ foreach @array;
    my @keys = keys %count;
    $constant = 0 if @keys > 1;
    return $constant
}

sub _column_status {
    my %count;
    my $ref   = shift;
    my @array = @$ref;
    my $st    = { gap => 0, informative => 1, constant => 1 };

    foreach (@array) {
        $count{$_}++;
        $st->{gap} = 1 if $_ =~ /[\-\?]/
    }

    my @keys = keys %count;
    foreach (values %count) {
        if ($_ < 2) { $st->{informative} = 0; last }
    }
    $st->{constant} = 0 if @keys > 1;
    return $st
}

sub _get_a_site_v2 {
    my %seq_ids;
    my $len = $aln->length();
    foreach my $seq ($aln->each_seq) {
        my $id = $seq->id();
        for (my $i = 1; $i <= $len; $i++) { $seq_ids{$id}{$i} = $seq->subseq($i, $i) }
    }
    return \%seq_ids
}

sub _find_max_id_len {
    my $seqs = shift;
    my @sorted_by_length = sort {length $a->display_id <=> length $b->display_id} @$seqs;
    return length $sorted_by_length[-1]->display_id
}

1;
__END__

=head1 EXTENDING THIS MODULE

We encourage BioPerl developers to add command-line interface to their BioPerl methods here.

Here is how to extend.  We'll use option C<--avpid> as an example.

=over 4

=item *
Create a new method like one of the above in the previous section.

=item *
Document your method in pod using C<=head2>. For example:

    =head2 print_avpid

    Print the average percent identity of an alignment.

    Wraps
    L<Bio::SimpleAlign-E<gt>average_percentage_identity()|https://metacpan.org/pod/Bio::SimpleAlign#average_percentage_identity>.

    =cut

See L<C<print_avpid()>|/print_avpid> for how this gets rendered.


=item *
Add the method to C<@EXPORT> list in C<AlnManipulations.pm>.

=item *
Add option to C<%opt_displatch> which maps the option used in C<bioaln> to the subroutine that
gets called here. For example:

    "avpid" => \&print_avp_id,

=item *
Add option in to C<bioaln> script. See the code that starts:

    GetOptions(
    ...
    "avpid|a",
    ...

This option has a short option name C<a> and takes no additional argument

=item *
Write a test for the option. See the file C<t/10test-bioaln.t> and L<Testing|https://github.com/bioperl/p5-bpwrapper/wiki/Testing>.

=item *
Share back. Create a pull request to the github repository and contact
Weigang Qiu, City University of New York, Hunter College (L<mailto:weigang@genectr.hunter.cuny.edu>)

=back

=head1 SEE ALSO

=over 4

=item *

L<bioaln>: command-line tool for using this

=item *

L<Qiu Lab wiki page|http://diverge.hunter.cuny.edu/labwiki/Bioutils>

=item *

L<Github project wiki page|https://github.com/bioperl/p5-bpwrapper>

=back

=head1 CONTRIBUTORS

=over 4

=item *
William McCaig <wmccaig at gmail dot com>

=item *



( run in 1.622 second using v1.01-cache-2.11-cpan-39bf76dae61 )