view release on metacpan or search on metacpan
- Ali::Temporary: added option to control the lifespan of FASTA file
- Changes
- GeneticCode::Factory: hard-coded NCBI gc.prt file for robustness
- Fixes
- IdList: fixed regression bug preventing FortyTwo from working
- Taxonomy: improved robustness of setup-taxdir.pl-related methods
- Taxonomy: better described dependencies for more robust deployment
0.180190 2018-01-19 10:00:47+01:00 Europe/Brussels
- Changes
- Ali: improved consistency of guessing default value (always on)
- Seq: ungapped seqs ending in '*' are now trimmed and considered non-aligned
- Fixes
- Ali::Temporary: fixed regression bug preventing IdList application
- GeneticCode::Factory: tests should now pass on more platforms
- PostPred::Composition: tests should now pass on more platforms
0.180140 2018-01-14 22:36:55+01:00 Europe/Brussels
- Additions
- Ali::Temporary: new args attribute (to fine-tune FASTA file creation)
- Ali::Temporary: improved documentation
bin/ali2fasta.pl view on Meta::CPAN
use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(change_suffix);
use aliased 'Bio::MUST::Core::Ali';
# Load each input file as an Ali and store it as FASTA; the outfile name
# is derived from the infile name by change_suffix (with '.fasta').
for my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $ali = Ali->load($infile);

    # apply the optional command-line switches to the loaded Ali
    if ($ARGV_noguessing) {
        $ali->dont_guess;
    }
    if ($ARGV_degap) {
        $ali->degap_seqs;
    }

    my $fasta_file = change_suffix($infile, '.fasta');

    # store_fasta receives -1 when wrapping is disabled, undef otherwise
    my $chunk;
    $chunk = -1 if $ARGV_nowrap;

    $ali->store_fasta($fasta_file, $chunk);
}
__END__
=pod
bin/ali2fasta.pl view on Meta::CPAN
=over
=item --degap
Discard gaps when converting sequences [default: no].
=item --[no]wrap
[Don't] wrap sequences [default: yes].
=item --[no]guessing
[Don't] guess whether sequences are aligned or not [default: yes].
=item --version
=item --usage
=item --help
=item --man
Print the usual program information
bin/app-len-ids-ali.pl view on Meta::CPAN
use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(secure_outfile);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::IdMapper';
# For each input file: load the Ali, apply the mapper returned by
# len_mapper to its ids, then store it under the name returned by
# secure_outfile.
for my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $ali = Ali->load($infile);
    if ($ARGV_noguessing) {
        $ali->dont_guess;
    }

    # append seq lengths to ids
    my $len_idm = $ali->len_mapper;
    $ali->restore_ids($len_idm);

    my $target = secure_outfile($infile, $ARGV_out_suffix);
    $ali->store($target);
}
__END__
bin/app-len-ids-ali.pl view on Meta::CPAN
=over
=item --out[-suffix]=<suffix>
Suffix to append to infile basenames for deriving outfile names [default:
none]. When not specified, outfile names are taken from infiles but original
infiles are preserved with a .bak suffix appended to their names.
=for Euclid: suffix.type: string
=item --[no]guessing
[Don't] guess whether sequences are aligned or not [default: yes].
=item --version
=item --usage
=item --help
=item --man
Print the usual program information
bin/change-ids-ali.pl view on Meta::CPAN
use aliased 'Bio::MUST::Core::IdMapper';
### Mapping organisms from: $ARGV_org_mapper
my $org_mapper = IdMapper->load($ARGV_org_mapper);
for my $infile (@ARGV_infiles) {
### Processing: $infile
my $ali = Ali->load($infile);
$ali->dont_guess if $ARGV_noguessing;
# build id_mapper and change ids
my $id_mapper;
if ($ARGV_mode eq 'long2abbr') {
$id_mapper = $ali->org_mapper_from_long_ids($org_mapper);
$ali->shorten_ids($id_mapper);
}
else { # 'abbr2long'
$id_mapper = $ali->org_mapper_from_abbr_ids($org_mapper);
bin/change-ids-ali.pl view on Meta::CPAN
=over
=item --out[-suffix]=<suffix>
Suffix to append to infile basenames for deriving outfile names [default:
none]. When not specified, outfile names are taken from infiles but original
infiles are preserved with a .bak suffix appended to their names.
=for Euclid: suffix.type: string
=item --[no]guessing
[Don't] guess whether sequences are aligned or not [default: yes].
=item --store-id-mapper
Store the IDM file corresponding to each output file [default: no].
=item --version
=item --usage
=item --help
bin/classify-ali.pl view on Meta::CPAN
# create one output subdirectory (named after its label) per category
for my $cat ( $classifier->all_categories ) {
    my $cat_dir = dir( $outdir, $cat->label )->relative;
    $cat_dir->mkpath();
}
ALI:
for my $infile (@infiles) {
### Processing: $infile
my $ali = Ali->load($infile);
$ali->dont_guess;
# classify Ali
my $cat_label = $classifier->classify($ali);
### classified to: $cat_label
next ALI unless $cat_label;
# store Ali in corresponding directory
my $subdir = dir($outdir, $cat_label)->relative;
my ($filename) = fileparse($infile);
my $outfile = file($subdir, $filename);
bin/extract-ali.pl view on Meta::CPAN
# For each id list file: pull the listed seqs out of $db and store them
# in a file named after the list (suffix changed to '.ali').
for my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $list = IdList->load($infile);

    # assemble Ali (either in list order or simply filtered)
    my $ali;
    if ($ARGV_reorder) {
        $ali = $list->reordered_ali($db);
    }
    else {
        $ali = $list->filtered_ali($db);
    }

    # turn off alignment guessing before storing
    $ali->dont_guess;

    my $alifile = change_suffix($infile, '.ali');
    $ali->store($alifile);
}
__END__
=pod
=head1 NAME
bin/fasta2ali.pl view on Meta::CPAN
use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(change_suffix);
use aliased 'Bio::MUST::Core::Ali';
# Load each input file as an Ali and store it under a name derived from
# the infile name by change_suffix (with '.ali').
for my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $ali = Ali->load($infile);

    # apply the optional command-line switches to the loaded Ali
    if ($ARGV_noguessing) {
        $ali->dont_guess;
    }
    if ($ARGV_degap) {
        $ali->degap_seqs;
    }

    my $alifile = change_suffix($infile, '.ali');
    $ali->store($alifile);
}
__END__
=pod
=head1 NAME
bin/fasta2ali.pl view on Meta::CPAN
=back
=head1 OPTIONAL ARGUMENTS
=over
=item --degap
Discard gaps when converting sequences [default: no].
=item --[no]guessing
[Don't] guess whether sequences are aligned or not [default: yes].
=item --version
=item --usage
=item --help
=item --man
Print the usual program information
bin/inst-qual-filter.pl view on Meta::CPAN
### Processing: $infile
Ali->instant_store(
$outfile, { infile => $infile, coderef => $purity_filter }
);
}
# When an outfile was requested for them, wrap the seqs collected in
# @bad_seqs into a non-guessing Ali and store it as FASTA.
if ($ARGV_filter_out) {

    ### Storing filtered seqs in: $ARGV_filter_out
    my $rejected = Ali->new(
        seqs     => \@bad_seqs,
        guessing => 0,
    );
    $rejected->store_fasta($ARGV_filter_out);
}
__END__
=pod
=head1 NAME
inst-qual-filter.pl - Discard low-quality nt seqs in FASTA files (optimized)
bin/prune-ali.pl view on Meta::CPAN
for my $infile (@ARGV_infiles) {
### Processing: $infile
my $list = IdList->$method($infile);
$infile =~ s/$_//xms for @ARGV_in_strip;
my $alifile = change_suffix($infile, '.ali');
### Filtering sequences in: $alifile
my $ali = Ali->load($alifile);
$ali->dont_guess if $ARGV_noguessing;
# optionally negate list
$list = $list->negative_list($ali) if $ARGV_negate_list;
# apply list to Ali
my $pruned_ali = $ARGV_reorder ? $list->reordered_ali($ali)
: $list->filtered_ali($ali)
;
my $outfile = secure_outfile($alifile, $ARGV_out_suffix);
bin/prune-ali.pl view on Meta::CPAN
=item --out[-suffix]=<suffix>
Suffix to append to (possibly stripped) infile basenames for deriving
outfile names [default: none]. When not specified, outfile names are taken
from infiles but original infiles are preserved with a .bak suffix appended
to their names.
=for Euclid: suffix.type: string
=item --[no]guessing
[Don't] guess whether sequences are aligned or not [default: yes].
=item --from-must
Consider the input file as generated by ed/treeplot [default: no]. Currently,
this switches to the legacy .lis format (instead of the modern .idl format).
=item --negate-list
Interpret the list as a negative list instead of a positive list [default:
no]. This means that seqs corresponding to listed ids are discarded.
bin/rest-ids-ali.pl view on Meta::CPAN
use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(change_suffix secure_outfile);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::IdMapper';
# For each input file: load the Ali, load the IDM file whose name is
# derived from the (possibly stripped) infile name, restore the seq ids
# and store the result under the name returned by secure_outfile.
for my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $ali = Ali->load($infile);
    if ($ARGV_noguessing) {
        $ali->dont_guess;
    }

    # remove every user-specified pattern from the infile name; the
    # stripped name is used for both the IDM file and the outfile
    for my $strip_regex (@ARGV_in_strip) {
        $infile =~ s/$strip_regex//xms;
    }

    my $idmfile = change_suffix($infile, '.idm');
    my $idm = IdMapper->load($idmfile);

    ### Restoring seq ids from: $idmfile
    $ali->restore_ids($idm);

    my $target = secure_outfile($infile, $ARGV_out_suffix);
    $ali->store($target);
}
bin/rest-ids-ali.pl view on Meta::CPAN
repeatable
=item --out[-suffix]=<suffix>
Suffix to append to infile basenames for deriving outfile names [default:
none]. When not specified, outfile names are taken from infiles but original
infiles are preserved with a .bak suffix appended to their names.
=for Euclid: suffix.type: string
=item --[no]guessing
[Don't] guess whether sequences are aligned or not [default: yes].
=item --version
=item --usage
=item --help
=item --man
Print the usual program information
lib/Bio/MUST/Core/Ali.pm view on Meta::CPAN
is => 'ro',
isa => 'Bio::MUST::Core::Types::File',
default => 'untitled.ali',
coerce => 1,
handles => {
filename => 'stringify',
},
);
# Boolean switch (on by default) telling whether the Ali should try to
# guess if it is aligned; is_aligned always returns false when it is off.
# The two delegated methods toggle it through the native Bool trait.
has 'guessing' => (
    is      => 'ro',
    isa     => 'Bool',
    traits  => ['Bool'],
    default => 1,
    handles => {
        guess      => 'set',
        dont_guess => 'unset',
    },
);
with 'Bio::MUST::Core::Roles::Commentable',
'Bio::MUST::Core::Roles::Listable';
with 'Bio::MUST::Core::Roles::Aliable'; ## no critic (ProhibitMultipleWiths)
# CONSTRUCTORS
# Build a new object of the same class from copies of the comments, clones
# of all the seqs, a fresh file object for the same filename and the
# current guessing flag. Takes no arguments; returns the copy.
sub clone {
    my ($self) = @_;

    my @comments   = $self->all_comments;
    my @seq_clones = map { $_->clone } $self->all_seqs;

    return $self->new(
        comments => \@comments,
        seqs     => \@seq_clones,
        file     => file( $self->filename ),
        guessing => $self->guessing,
    );
}
# ACCESSORS
sub get_seq_with_id {
my $self = shift;
my $id = shift;
lib/Bio/MUST/Core/Ali.pm view on Meta::CPAN
# Return 1 if at least one Seq of the Ali reports itself as protein,
# 0 otherwise. Takes no arguments.
sub is_protein {
    my ($self) = @_;

    my $found = List::AllUtils::any { $_->is_protein } $self->all_seqs;

    return $found ? 1 : 0;
}
# Return 1 if guessing is enabled and at least one Seq of the Ali reports
# itself as aligned, 0 otherwise. Takes no arguments.
sub is_aligned {
    my ($self) = @_;

    # with guessing turned off the Ali is never considered aligned
    return 0 unless $self->guessing;

    my $found = List::AllUtils::any { $_->is_aligned } $self->all_seqs;

    return $found ? 1 : 0;
}
# Return the value of _max_seq_len, after first calling uniformize when
# the Ali is considered aligned. Takes no arguments.
sub width {
    my ($self) = @_;

    # pad seqs for robustness
    if ( $self->is_aligned ) {
        $self->uniformize;
    }

    return $self->_max_seq_len;
}
lib/Bio/MUST/Core/Ali.pm view on Meta::CPAN
=head2 file
L<Path::Class::File> object (optional)
This optional attribute is initialized by class methods that C<load> an Ali
from disk. It is meant to improve the introspection capabilities of the Ali.
For now, this attribute is not used by the C<store> methods, though it might
provide them with a default value in the future.
=head2 guessing
Boolean (optional)
By default, an Ali object tries to guess whether it is aligned or not by
looking for gap-like characters in any of its Seq objects (see
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence).
When this smart behavior causes issues, one can disable it by unsetting this
boolean attribute (see C<dont_guess> and C<guess> accessor methods).
=head2 comments
ArrayRef of strings (optional)
An Ali object is commentable, which means that it supports all the methods
pertaining to comment lines described in
L<Bio::MUST::Core::Roles::Commentable> (such as C<header>).
=head1 CONSTRUCTORS
lib/Bio/MUST/Core/Ali.pm view on Meta::CPAN
=head2 new
Default constructor (class method) returning a new Ali.
use aliased 'Bio::MUST::Core::Ali';
my $ali1 = Ali->new();
my @seqs = $ali->all_seqs;
my $ali2 = Ali->new( seqs => \@seqs );
This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>,
C<file>, C<guessing> and C<comments>.
=head2 clone
Creates a deep copy (a clone) of the Ali. Returns the copy.
use aliased 'Bio::MUST::Core::Ali';
my $ali = Ali->load('input.ali');
my $ali_copy = $ali->clone;
# you can now mess with $ali_copy without affecting $ali
lib/Bio/MUST/Core/Ali.pm view on Meta::CPAN
my @orgs = map { $_->org } @ids1;
This method does not accept any arguments.
=head2 filename
Returns the stringified filename of the Ali.
This method does not accept any arguments.
=head2 guess
Turn on the smart detection of gaps (see C<guessing> attribute above).
This method does not accept any arguments.
=head2 dont_guess
Turn off the smart detection of gaps (see C<guessing> attribute above).
use aliased 'Bio::MUST::Core::Ali';
my $ali = Ali->load('ensembl.fasta');
$ali->dont_guess;
This method does not accept any arguments.
=head1 PROPERTIES
=head2 has_uniq_ids
Returns true if all the sequence ids are unique.
carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids;
lib/Bio/MUST/Core/Ali.pm view on Meta::CPAN
say 'Your file includes nucleotide sequences' unless $ali->is_protein;
This method does not accept any arguments.
=head2 is_aligned
Returns true if any sequence of the Ali appears to be aligned. See
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
If the boolean attribute guessing is not set, always returns false.
carp 'Warning: file does not look aligned!' unless $ali->is_aligned;
This method does not accept any arguments.
=head2 count_seqs
Returns the number of sequences of the Ali. The alias method C<height> is
provided for convenience.
lib/Bio/MUST/Core/Ali/Stash.pm view on Meta::CPAN
# ATTRIBUTES
# Internal Ali object (required) to which the methods listed under
# 'handles' are delegated; the comment-related ones are needed by IdList.
has 'seqs' => (
    is       => 'ro',
    isa      => 'Bio::MUST::Core::Ali',
    required => 1,
    handles  => [
        qw(filename),
        qw(count_comments all_comments get_comment),
        qw(guessing all_seq_ids has_uniq_ids is_protein is_aligned),
        qw(get_seq first_seq all_seqs filter_seqs count_seqs),
        qw(gapmiss_regex),
    ],
);
has 'lookup' => (
is => 'ro',
isa => 'Bio::MUST::Core::IdList',
lib/Bio/MUST/Core/Ali/Stash.pm view on Meta::CPAN
# I/O methods
# Class method building a Stash from a sequence file.
#
# Arguments:
#   $infile - path handed over to Ali->load
#   $args   - optional HashRef; when its 'truncate_ids' entry is true, the
#             seq ids are shortened using a regex mapper built on $DEF_ID
#
# The loaded Ali always has its guessing turned off. Returns the new object.
sub load {
    my ($class, $infile, $args) = @_;
    $args //= {};           # HashRef (should not be empty...)

    my $ali = Ali->load($infile);
    $ali->dont_guess;

    my $truncate = $args->{truncate_ids};
    if ($truncate) {
        my $id_mapper = $ali->regex_mapper( q{}, $DEF_ID );
        $ali->shorten_ids($id_mapper);
    }

    return $class->new( seqs => $ali );
}
__PACKAGE__->meta->make_immutable;
lib/Bio/MUST/Core/Ali/Stash.pm view on Meta::CPAN
while (my $line = <$in>) {
chomp $line;
# extract member id list for current cluster
my ($cluster, @ids) = split /\s+/xms, $line;
$cluster =~ s/:\z//xms; # remove trailing colon (:)
my $list = IdList->new( ids => \@ids );
# assemble Ali and store it as FASTA file
my $ali = $list->reordered_ali($db);
$ali->dont_guess;
$ali->store( $cluster . '.fasta' );
}
=head1 DESCRIPTION
This module implements a class representing a sequence database where ids are
indexed for faster access. To this end, it combines an internal
L<Bio::MUST::Core::Ali> object and a L<Bio::MUST::Core::IdList> object.
An Ali::Stash is meant to be built from an existing ALI (or FASTA) file
lib/Bio/MUST/Core/Ali/Stash.pm view on Meta::CPAN
=head2 seqs
L<Bio::MUST::Core::Ali> object (required)
This required attribute contains the L<Bio::MUST::Core::Seq> objects that
populate the associated sequence database file. It should be initialized
through the class method C<load> (see the SYNOPSIS for an example).
For now, it provides the following methods: C<filename>, C<count_comments>,
C<all_comments>, C<get_comment>, C<guessing>, C<all_seq_ids>, C<has_uniq_ids>,
C<is_protein>, C<is_aligned>, C<get_seq>, C<get_seq_with_id> (see below),
C<first_seq>, C<all_seqs>, C<filter_seqs>, C<count_seqs> and C<gapmiss_regex>
(see L<Bio::MUST::Core::Ali>).
=head2 lookup
L<Bio::MUST::Core::IdList> object (auto)
This attribute is automatically initialized with the list indexing the
sequence ids of the internal C<Ali> object. Thus, it cannot be user-specified.
lib/Bio/MUST/Core/Ali/Temporary.pm view on Meta::CPAN
# ATTRIBUTES
# Internal Ali object (required, built by coercion when needed) to which
# the methods listed under 'handles' are delegated; the comment-related
# ones are needed by IdList.
has 'seqs' => (
    is       => 'ro',
    isa      => 'Bio::MUST::Core::Ali',
    required => 1,
    coerce   => 1,
    handles  => [
        qw(count_comments all_comments get_comment),
        qw(guessing all_seq_ids has_uniq_ids is_protein is_aligned),
        qw(get_seq get_seq_with_id first_seq),
        qw(all_seqs filter_seqs count_seqs),
        qw(gapmiss_regex),
    ],
);
has 'args' => (
is => 'ro',
isa => 'HashRef',
lib/Bio/MUST/Core/Ali/Temporary.pm view on Meta::CPAN
=head2 seqs
L<Bio::MUST::Core::Ali> object (required)
This required attribute contains the L<Bio::MUST::Core::Seq> objects that are
written in the associated temporary FASTA file. It can be specified either as
a path to an ALI/FASTA file or as an C<Ali> object or as an ArrayRef of C<Seq>
objects (see the SYNOPSIS for examples).
For now, it provides the following methods: C<count_comments>,
C<all_comments>, C<get_comment>, C<guessing>, C<all_seq_ids>, C<has_uniq_ids>,
C<is_protein>, C<is_aligned>, C<get_seq>, C<get_seq_with_id>, C<first_seq>,
C<all_seqs>, C<filter_seqs>, C<count_seqs> and C<gapmiss_regex> (see
L<Bio::MUST::Core::Ali>).
=head2 args
HashRef (optional)
When specified this optional attribute is passed to the C<temp_fasta> method
of the internal C<Ali> object. Its purpose is to allow the fine-tuning of the
format of the associated temporary FASTA file.
lib/Bio/MUST/Core/Roles/Aliable.pm view on Meta::CPAN
use Moose::Role;
use autodie;
use feature qw(say);
use Bio::MUST::Core::Types;
# methods that every class consuming this role has to provide
requires(
    qw(count_comments all_comments get_comment),
    qw(guessing all_seq_ids has_uniq_ids is_protein is_aligned),
    qw(get_seq get_seq_with_id first_seq),
    qw(all_seqs filter_seqs count_seqs),
    qw(gapmiss_regex),
);
no Moose::Role;
1;
__END__
=pod
lib/Bio/MUST/Core/Types.pm view on Meta::CPAN
class_type('Path::Class::Dir');
class_type('Path::Class::File');
class_type('File::Temp');
# auto-build Ali/Stash from various source types...
# useful in Bio::MUST::Drivers modules
# Coercions building an Ali from four source types: a Stash, an ArrayRef
# of Seq objects, a Path::Class::File or a plain string (both loaded
# through Ali->load). The first two explicitly turn guessing on.
# NOTE(review): Ali::Stash declares its 'seqs' attribute as an Ali object,
# yet it is passed here as the 'seqs' argument of Ali->new -- confirm that
# the constructor accepts an Ali there.
coerce 'Bio::MUST::Core::Ali'
=> from 'Bio::MUST::Core::Ali::Stash'
=> via { Bio::MUST::Core::Ali->new( seqs => $_->seqs, guessing => 1 ) }
=> from 'ArrayRef[Bio::MUST::Core::Seq]'
=> via { Bio::MUST::Core::Ali->new( seqs => $_, guessing => 1 ) }
=> from 'Path::Class::File'
=> via { Bio::MUST::Core::Ali->load( $_->stringify ) }
=> from 'Str'
=> via { Bio::MUST::Core::Ali->load( $_ ) }
;
coerce 'Bio::MUST::Core::Ali::Stash'
=> from 'Path::Class::File'
test => 'wrote expected unaligned Ali',
);
}
my @exp_nomiss_lens = (56, 54, 48, 54, 44, 50, 48, 50, 52, 34);
# Check that toggling the guessing flag changes what is_aligned reports
# for the same Ali loaded from test/complete.ali.
{
my $infile = file('test', 'complete.ali');
my $ali = $class->load($infile);
# with guessing turned off, the Ali must not report itself as aligned...
$ali->dont_guess;
ok !$ali->is_aligned, 'rightly overrided guessing of alignment';
# ...and its FASTA output is compared to the expected complete.fasta
cmp_store(
obj => $ali, method => 'store_fasta',
file => 'complete.fasta',
test => 'wrote expected unaltered FASTA from non-guessing Ali',
);
# with guessing turned back on, the same Ali is detected as aligned
$ali->guess;
ok $ali->is_aligned, 'rightly detected alignment';
cmp_ok $ali->width, '==', 56, 'got expected Ali width';
# per-seq lengths are compared to the expected values declared before this block
is_deeply [ map { $_->nomiss_seq_len } $ali->all_seqs ], \@exp_nomiss_lens,
'got expected seq lengths (excluding gaps and missing chars)';
}
{
my $infile = file('test', 'AhHMA4_clustalw.ali');
my $ali = $class->load($infile);