Bio-MUST-Core
view release on metacpan or search on metacpan
lib/Bio/MUST/Core/Ali.pm view on Meta::CPAN
$Bio::MUST::Core::Ali::VERSION = '0.252040';
use Moose;
use namespace::autoclean;
use autodie;
use feature qw(say);
# use Smart::Comments;
use Carp;
use File::Temp;
use List::AllUtils qw(uniq indexes sum0);
use Path::Class qw(file);
use POSIX qw(ceil floor);
use Statistics::Descriptive;
use Tie::IxHash;
use Bio::MUST::Core::Types;
use Bio::MUST::Core::Constants qw(:ncbi :gaps :files);
use aliased 'Bio::MUST::Core::Seq';
use aliased 'Bio::MUST::Core::SeqId';
use aliased 'Bio::MUST::Core::SeqMask';
# TODO: add information about methods available in Ali-like objects
# ATTRIBUTES
has 'seqs' => (
traits => ['Array'],
is => 'ro',
isa => 'ArrayRef[Bio::MUST::Core::Seq]',
default => sub { [] },
writer => '_set_seqs',
handles => {
add_seq => 'push',
get_seq => 'get',
set_seq => 'set',
delete_seq => 'delete',
insert_seq => 'insert',
count_seqs => 'count',
all_seqs => 'elements',
first_seq => 'first',
filter_seqs => 'grep',
},
);
has 'file' => (
is => 'ro',
isa => 'Bio::MUST::Core::Types::File',
default => 'untitled.ali',
coerce => 1,
handles => {
filename => 'stringify',
},
);
has 'guessing' => (
traits => ['Bool'],
is => 'ro',
isa => 'Bool',
default => 1,
handles => {
dont_guess => 'unset',
guess => 'set',
},
);
with 'Bio::MUST::Core::Roles::Commentable',
'Bio::MUST::Core::Roles::Listable';
with 'Bio::MUST::Core::Roles::Aliable'; ## no critic (ProhibitMultipleWiths)
# CONSTRUCTORS
sub clone {
my $self = shift;
return $self->new(
comments => [ $self->all_comments ],
seqs => [ map { $_->clone } $self->all_seqs ],
file => file( $self->filename ),
guessing => $self->guessing,
);
}
# ACCESSORS
sub get_seq_with_id {
my $self = shift;
my $id = shift;
my $seq = $self->first_seq( sub { $_->full_id eq $id } );
carp "[BMC] Warning: cannot find seq with id: $id; returning undef!"
unless $seq;
return $seq;
}
sub all_new_seqs {
return shift->filter_seqs( sub { $_->is_new } );
}
sub all_but_new_seqs {
return shift->filter_seqs( sub { not $_->is_new } );
}
sub all_seq_ids {
my $self = shift;
return map { $_->seq_id } $self->all_seqs;
}
# PROPERTIES
sub has_uniq_ids {
my $self = shift;
my @ids = map { $_->full_id } $self->all_seq_ids;
return 1 if $self->count_seqs == uniq @ids;
return 0;
}
sub is_protein {
my $self = shift;
return 1 if List::AllUtils::any { $_->is_protein } $self->all_seqs;
return 0;
}
sub is_aligned {
my $self = shift;
return 0 if not $self->guessing;
return 1 if List::AllUtils::any { $_->is_aligned } $self->all_seqs;
return 0;
}
sub width {
my $self = shift;
$self->uniformize if $self->is_aligned; # pad seqs for robustness
return $self->_max_seq_len;
}
sub _max_seq_len {
my $self = shift;
my @lengths = map { $_->seq_len } $self->all_seqs;
return (List::AllUtils::max @lengths) // 0; # to avoid warnings
}
sub seq_len_stats {
my $self = shift;
my @lengths = map { $_->nomiss_seq_len } $self->all_seqs;
my $stat = Statistics::Descriptive::Full->new;
$stat->add_data( \@lengths );
my @quantiles = map { $stat->quantile($_) } 0..4;
return @quantiles;
}
sub perc_miss {
my $self = shift;
my $n = sum0 map { $_->nomiss_seq_len } $self->all_seqs;
return 0.0 unless $n;
my $total = $self->width * $self->count_seqs;
return 100.0 * ($total - $n) / $total;
}
# MUTATORS
sub uc_seqs {
my $self = shift;
$_->uc for $self->all_seqs;
return $self;
}
sub recode_seqs { ## no critic (RequireArgUnpacking)
my $self = shift;
$_->recode(@_) for $self->all_seqs;
return $self;
}
sub degap_seqs {
lib/Bio/MUST/Core/Ali.pm view on Meta::CPAN
my $mask = SeqMask->parsimony_mask($ali);
$ali->apply_mask($mask);
# write down reduced Ali to disk
$ali->store('example-uc-genus-sp-20.ali');
$ali->store_fasta('example-uc-genus-sp-20.fasta');
# see below for additional methods
# ...
=head1 DESCRIPTION
This module implements the multiple sequence alignment (MSA) class and its
methods. An Ali is modeled as an array of L<Bio::MUST::Core::Seq> objects.
Consequently, sequence ids do not absolutely need to be unique for it to
work (though id uniqueness helps a lot for sequence access and filtering).
An Ali knows whether it contains nucleotide or protein sequences, whether
those are aligned or not, as well as its Seq count and exact width. All
these properties are introspected on the fly, which means that, while they
can be expensive to compute, they are always accurate.
This is important because an Ali provides methods for inserting, deleting
and editing its Seq objects. Further, it does its best to maintain a semantic
distinction between true gap states (encoded as '*') and missing regions of
undetermined length (encoded as pure whitespace, for example at the end of a
short sequence).
It also has methods for mapping its sequence ids (for example, before export
to the PHYLIP format), as well as methods to selectively retain sequences
and sites based on L<Bio::MUST::Core::IdList> and
L<Bio::MUST::Core::SeqMask> objects. For example, the C<idealize> method
discards shared gaps and optionally removes gap-rich sites only due to a
tiny number of sequences.
Finally, an Ali can be stored in MUST pseudo-FASTA format (which handles
meaningful whitespace and allows comment lines in the header) or be
imported/exported from/to several popular MSA formats, such as plain FASTA,
STOCKHOLM and PHYLIP.
=head1 ATTRIBUTES
=head2 seqs
ArrayRef of L<Bio::MUST::Core::Seq> objects (optional)
Most of the accessor methods described below are implemented by delegation
to this public attribute using L<Moose::Meta::Attribute::Native/Moose native
delegation>. Their documentation thus heavily borrows from the corresponding
help pages.
=head2 file
L<Path::Class::File> object (optional)
This optional attribute is initialized by class methods that C<load> an Ali
from disk. It is meant to improve the introspection capabilities of the Ali.
For now, this attribute is not used by the C<store> methods, though it might
provide them with a default value in the future.
=head2 guessing
Boolean (optional)
By default, an Ali object tries to guess whether it is aligned or not by
looking for gap-like characters in any of its Seq objects (see
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence).
When this smart behavior causes issues, one can disable it by unsetting this
boolean attribute (see C<dont_guess> and C<guess> accessor methods).
=head2 comments
ArrayRef of strings (optional)
An Ali object is commentable, which means that it supports all the methods
pertaining to comment lines described in
L<Bio::MUST::Core::Roles::Commentable> (such as C<header>).
=head1 CONSTRUCTORS
=head2 new
Default constructor (class method) returning a new Ali.
use aliased 'Bio::MUST::Core::Ali';
my $ali1 = Ali->new();
my @seqs = $ali->all_seqs;
my $ali2 = Ali->new( seqs => \@seqs );
This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>,
C<file>, C<guessing> and C<comments>.
=head2 clone
Creates a deep copy (a clone) of the Ali. Returns the copy.
use aliased 'Bio::MUST::Core::Ali';
my $ali = Ali->load('input.ali');
my $ali_copy = $ali->clone;
# you can now mess with $ali_copy without affecting $ali
This method does not accept any arguments.
=head1 ACCESSORS
=head2 add_seq
Adds one (or more) new sequence(s) at the end of the Ali. Returns the new
number of sequences of the Ali.
use aliased 'Bio::MUST::Core::Seq';
my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
$ali->add_seq($new_seq);
This method accepts any number of arguments.
=head2 get_seq
Returns a sequence of the Ali by its index. You can also use negative index
numbers, just as with Perl's core array handling. If the specified sequence
does not exist, this method will return C<undef>.
my $seq = $ali->get_seq($index);
croak "Seq $index not found in Ali!" unless defined $seq;
This method accepts just one argument (and not an array slice).
=head2 get_seq_with_id
Returns a sequence of the Ali by its id. If the specified id is not unique,
only the first matching sequence will be returned, whereas if no sequence
exists for the specified id, this method will return C<undef>.
my $id = 'Pyrus malus_3750@658052655';
my $seq = $ali->get_seq_with_id($id);
croak "Seq $id not found in Ali!" unless defined $seq;
This method accepts just one argument.
=head2 set_seq
Given an index and a sequence, sets the specified Ali element to the
sequence. This method returns the new sequence at C<$index>.
use aliased 'Bio::MUST::Core::Seq';
my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
$ali->set_seq($index, $new_seq);
This method requires two arguments.
=head2 delete_seq
lib/Bio/MUST/Core/Ali.pm view on Meta::CPAN
=head2 first_seq
Returns the first sequence of the Ali matching a given criterion, just like
L<List::Util>'s C<first> function. This method requires a subroutine
implementing the matching logic.
# emulate get_seq_with_id method
my $id2find = 'seq13';
my $seq = $ali->first_seq( sub { $_->full_id eq $id2find } );
This method requires a single argument.
=head2 filter_seqs
Returns every sequence of the Ali matching a given criterion, just like
Perl's core C<grep> function. This method requires a subroutine implementing
the matching logic.
# keep only long sequences (ignoring gaps and missing states)
my @long_seqs = $ali->filter_seqs( sub { $_->nomiss_seq_len > 500 } );
This method requires a single argument.
=head2 all_new_seqs
Returns all the sequences of the Ali tagged as #NEW# (not an array reference).
my @new_seqs = $ali->all_new_seqs;
This method does not accept any arguments.
=head2 all_but_new_seqs
Returns all the sequences of the Ali except those tagged as #NEW# (not an array
reference).
my @preexisting_seqs = $ali->all_but_new_seqs;
This method does not accept any arguments.
=head2 all_seq_ids
Returns all the sequence ids (L<Bio::MUST::Core::SeqId> objects) of the Ali
(not an array reference). This is only a convenience method.
use Test::Deeply;
my @ids1 = $ali->all_seq_ids;
my @ids2 = map { $_->seq_id } $ali->all_seqs;
is_deeply \@ids1, \@ids2, 'should be true';
my @orgs = map { $_->org } @ids1;
This method does not accept any arguments.
=head2 filename
Returns the stringified filename of the Ali.
This method does not accept any arguments.
=head2 guess
Turn on the smart detection of gaps (see C<guessing> attribute above).
This method does not accept any arguments.
=head2 dont_guess
Turn off the smart detection of gaps (see C<guessing> attribute above).
use aliased 'Bio::MUST::Core::Ali';
my $ali = Ali->load('ensembl.fasta');
$ali->dont_guess;
This method does not accept any arguments.
=head1 PROPERTIES
=head2 has_uniq_ids
Returns true if all the sequence ids are unique.
carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids;
This method does not accept any arguments.
=head2 is_protein
Returns true if any sequence of the Ali looks like a protein. See
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
say 'Your file includes nucleotide sequences' unless $ali->is_protein;
This method does not accept any arguments.
=head2 is_aligned
Returns true if any sequence of the Ali appears to be aligned. See
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
If the boolean attribute guessing is not set, always returns false.
carp 'Warning: file does not look aligned!' unless $ali->is_aligned;
This method does not accept any arguments.
=head2 count_seqs
Returns the number of sequences of the Ali. The alias method C<height> is
provided for convenience.
my $height = $ali->count_seqs;
This method does not accept any arguments.
=head2 width
Returns the width of the Ali (in characters). If the Ali is not aligned,
returns the length of the longest sequence instead.
To avoid potential bugs due to caching, this method dynamically computes the
Ali width at each call. Moreover, the Ali is always uniformized (see below)
beforehands to ensure accurate width value. Therefore, this method is
expensive and should not be called repeatedly (e.g., in a loop condition).
# you'd better looping through sites like this...
my $width = $ali->width;
for my $site (0..$width-1) {
...
}
This method does not accept any arguments.
=head2 seq_len_stats
Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps).
The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max).
This method does not accept any arguments.
=head2 perc_miss
Returns the percentage of missing (and gap-like) character states in the Ali.
As this method internally calls C<Ali::width>, the remarks above also apply.
my $miss_level = $ali->perc_miss;
This method does not accept any arguments.
=head1 MUTATORS
=head2 uc_seqs
Turn all the sequences of the Ali to uppercase and returns it.
This method does not accept any arguments.
=head2 recode_seqs
Recode all the sequences of the Ali and returns it.
( run in 2.030 seconds using v1.01-cache-2.11-cpan-75ffa21a3d4 )