Bio-MUST-Core

 view release on metacpan or  search on metacpan

Changes  view on Meta::CPAN

      - Ali::Temporary: added option to control the lifespan of FASTA file
    - Changes
      - GeneticCode::Factory: hard-coded NCBI gc.prt file for robustness
    - Fixes
      - IdList: fixed regression bug preventing FortyTwo from working
      - Taxonomy: improved robustness of setup-taxdir.pl-related methods
      - Taxonomy: better described dependencies for more robust deployment

0.180190  2018-01-19 10:00:47+01:00 Europe/Brussels
    - Changes
      - Ali: improved consistency of guessing default value (always on)
      - Seq: ungapped seqs ending in '*' are now trimmed and considered non-aligned
    - Fixes
      - Ali::Temporary: fixed regression bug preventing IdList application
      - GeneticCode::Factory: tests should now pass on more platforms
      - PostPred::Composition: tests should now pass on more platforms

0.180140  2018-01-14 22:36:55+01:00 Europe/Brussels
    - Additions
      - Ali::Temporary: new args attribute (to fine-tune FASTA file creation)
      - Ali::Temporary: improved documentation

bin/ali2fasta.pl  view on Meta::CPAN


use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(change_suffix);
use aliased 'Bio::MUST::Core::Ali';


# Convert each infile to a FASTA file named after it.
foreach my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $ali = Ali->load($infile);

    # optional tweaks requested on the command line
    if ($ARGV_noguessing) {
        $ali->dont_guess;
    }
    if ($ARGV_degap) {
        $ali->degap_seqs;
    }

    # the outfile takes the infile name with a .fasta suffix
    my $outfile = change_suffix($infile, '.fasta');

    # a chunk of -1 is passed when wrapping is disabled, undef otherwise
    $ali->store_fasta( $outfile, $ARGV_nowrap ? -1 : undef );
}

__END__

=pod

bin/ali2fasta.pl  view on Meta::CPAN

=over

=item --degap

Discard gaps when converting sequences [default: no].

=item --[no]wrap

[Don't] wrap sequences [default: yes].

=item --[no]guessing

[Don't] guess whether sequences are aligned or not [default: yes].

=item --version

=item --usage

=item --help

=item --man

Print the usual program information

bin/app-len-ids-ali.pl  view on Meta::CPAN

use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(secure_outfile);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::IdMapper';


# Append seq lengths to the ids of each infile and store the result.
foreach my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $ali = Ali->load($infile);
    if ($ARGV_noguessing) {
        $ali->dont_guess;
    }

    # build the length-based id mapper and apply it to the Ali
    my $len_mapper = $ali->len_mapper;
    $ali->restore_ids($len_mapper);

    # outfile name is derived from the infile and the optional suffix
    my $outfile = secure_outfile($infile, $ARGV_out_suffix);
    $ali->store($outfile);
}

__END__

bin/app-len-ids-ali.pl  view on Meta::CPAN

=over

=item --out[-suffix]=<suffix>

Suffix to append to infile basenames for deriving outfile names [default:
none]. When not specified, outfile names are taken from infiles but original
infiles are preserved by having a .bak suffix appended to their names.

=for Euclid: suffix.type: string

=item --[no]guessing

[Don't] guess whether sequences are aligned or not [default: yes].

=item --version

=item --usage

=item --help

=item --man

Print the usual program information

bin/change-ids-ali.pl  view on Meta::CPAN

use aliased 'Bio::MUST::Core::IdMapper';


### Mapping organisms from: $ARGV_org_mapper
my $org_mapper = IdMapper->load($ARGV_org_mapper);

for my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $ali = Ali->load($infile);
    $ali->dont_guess if $ARGV_noguessing;

    # build id_mapper and change ids
    my $id_mapper;

    if ($ARGV_mode eq 'long2abbr') {
        $id_mapper = $ali->org_mapper_from_long_ids($org_mapper);
        $ali->shorten_ids($id_mapper);
    }
    else {          # 'abbr2long'
        $id_mapper = $ali->org_mapper_from_abbr_ids($org_mapper);

bin/change-ids-ali.pl  view on Meta::CPAN

=over

=item --out[-suffix]=<suffix>

Suffix to append to infile basenames for deriving outfile names [default:
none]. When not specified, outfile names are taken from infiles but original
infiles are preserved by having a .bak suffix appended to their names.

=for Euclid: suffix.type: string

=item --[no]guessing

[Don't] guess whether sequences are aligned or not [default: yes].

=item --store-id-mapper

Store the IDM file corresponding to each output file [default: no].

=item --version

=item --usage

=item --help

bin/classify-ali.pl  view on Meta::CPAN

    for my $cat ( $classifier->all_categories ) {
        my $subdir = dir( $outdir, $cat->label )->relative;
        $subdir->mkpath();
    }

    ALI:
    for my $infile (@infiles) {

        ### Processing: $infile
        my $ali = Ali->load($infile);
        $ali->dont_guess;

        # classify Ali
        my $cat_label = $classifier->classify($ali);
        ### classified to: $cat_label
        next ALI unless $cat_label;

        # store Ali in corresponding directory
        my $subdir = dir($outdir, $cat_label)->relative;
        my ($filename) = fileparse($infile);
        my $outfile = file($subdir, $filename);

bin/extract-ali.pl  view on Meta::CPAN


# Build one Ali per id list and store it next to the list file.
foreach my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $list = IdList->load($infile);

    # pull the listed seqs out of the database, in list order if requested
    my $ali;
    if ($ARGV_reorder) {
        $ali = $list->reordered_ali($db);
    }
    else {
        $ali = $list->filtered_ali($db);
    }

    # guessing is always turned off before writing the .ali outfile
    $ali->dont_guess;
    my $outfile = change_suffix($infile, '.ali');
    $ali->store($outfile);
}

__END__

=pod

=head1 NAME

bin/fasta2ali.pl  view on Meta::CPAN


use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(change_suffix);
use aliased 'Bio::MUST::Core::Ali';


# Convert each infile to an ALI file named after it.
foreach my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $ali = Ali->load($infile);

    # optional tweaks requested on the command line
    if ($ARGV_noguessing) {
        $ali->dont_guess;
    }
    if ($ARGV_degap) {
        $ali->degap_seqs;
    }

    # the outfile takes the infile name with a .ali suffix
    my $outfile = change_suffix($infile, '.ali');
    $ali->store($outfile);
}

__END__

=pod

=head1 NAME

bin/fasta2ali.pl  view on Meta::CPAN

=back

=head1 OPTIONAL ARGUMENTS

=over

=item --degap

Discard gaps when converting sequences [default: no].

=item --[no]guessing

[Don't] guess whether sequences are aligned or not [default: yes].

=item --version

=item --usage

=item --help

=item --man

Print the usual program information

bin/inst-qual-filter.pl  view on Meta::CPAN


    ### Processing: $infile
    Ali->instant_store(
        $outfile, { infile => $infile, coderef => $purity_filter }
    );
}

# Optionally write the collected bad seqs to a FASTA file (guessing disabled).
if ($ARGV_filter_out) {

    ### Storing filtered seqs in: $ARGV_filter_out
    my %ali_args = ( seqs => \@bad_seqs, guessing => 0 );
    Ali->new(%ali_args)->store_fasta($ARGV_filter_out);
}

__END__

=pod

=head1 NAME

inst-qual-filter.pl - Discard low-quality nt seqs in FASTA files (optimized)

bin/prune-ali.pl  view on Meta::CPAN


for my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $list = IdList->$method($infile);

    $infile =~ s/$_//xms for @ARGV_in_strip;
    my $alifile = change_suffix($infile, '.ali');
    ### Filtering sequences in: $alifile
    my $ali = Ali->load($alifile);
    $ali->dont_guess if $ARGV_noguessing;

    # optionally negate list
    $list = $list->negative_list($ali) if $ARGV_negate_list;

    # apply list to Ali
    my $pruned_ali = $ARGV_reorder ? $list->reordered_ali($ali)
                   :                 $list->filtered_ali($ali)
    ;

    my $outfile = secure_outfile($alifile, $ARGV_out_suffix);

bin/prune-ali.pl  view on Meta::CPAN


=item --out[-suffix]=<suffix>

Suffix to append to (possibly stripped) infile basenames for deriving
outfile names [default: none]. When not specified, outfile names are taken
from infiles but original infiles are preserved by having a .bak suffix
appended to their names.

=for Euclid: suffix.type: string

=item --[no]guessing

[Don't] guess whether sequences are aligned or not [default: yes].

=item --from-must

Consider the input file as generated by ed/treeplot [default: no]. Currently,
this switches to the legacy .lis format (instead of the modern .idl format).

=item --negate-list

Interpret the list as a negative list instead of a positive list [default:
no]. This means that seqs corresponding to listed ids are discarded.

bin/rest-ids-ali.pl  view on Meta::CPAN

use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(change_suffix secure_outfile);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::IdMapper';


# Restore the seq ids of each infile from its companion .idm file.
foreach my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $ali = Ali->load($infile);
    if ($ARGV_noguessing) {
        $ali->dont_guess;
    }

    # strip the requested patterns from the infile name, which is then used
    # to derive both the .idm file name and the outfile name
    for my $strip (@ARGV_in_strip) {
        $infile =~ s/$strip//xms;
    }
    my $idmfile = change_suffix($infile, '.idm');
    my $idm = IdMapper->load($idmfile);
    ### Restoring seq ids from: $idmfile
    $ali->restore_ids($idm);

    my $outfile = secure_outfile($infile, $ARGV_out_suffix);
    $ali->store($outfile);
}

bin/rest-ids-ali.pl  view on Meta::CPAN

    repeatable

=item --out[-suffix]=<suffix>

Suffix to append to infile basenames for deriving outfile names [default:
none]. When not specified, outfile names are taken from infiles but original
infiles are preserved by having a .bak suffix appended to their names.

=for Euclid: suffix.type: string

=item --[no]guessing

[Don't] guess whether sequences are aligned or not [default: yes].

=item --version

=item --usage

=item --help

=item --man

Print the usual program information

lib/Bio/MUST/Core/Ali.pm  view on Meta::CPAN

    is       => 'ro',
    isa      => 'Bio::MUST::Core::Types::File',
    default  => 'untitled.ali',
    coerce   => 1,
    handles  => {
        filename => 'stringify',
    },
);


# Boolean flag (on by default) with a read-only accessor; the native Bool
# trait supplies the delegations dont_guess (unset) and guess (set).
has 'guessing' => (
    is       => 'ro',
    isa      => 'Bool',
    traits   => ['Bool'],
    default  => 1,
    handles  => {
        guess      => 'set',
        dont_guess => 'unset',
    },
);


with 'Bio::MUST::Core::Roles::Commentable',
     'Bio::MUST::Core::Roles::Listable';
with 'Bio::MUST::Core::Roles::Aliable';     ## no critic (ProhibitMultipleWiths)

# CONSTRUCTORS



# Return a new object of the same class holding cloned seqs, a fresh list of
# the comments, a file built from the current filename and the same guessing
# setting.
sub clone {
    my ($self) = @_;

    # each seq is cloned individually; comments go into a new array
    my @comments   = $self->all_comments;
    my @seq_clones = map { $_->clone } $self->all_seqs;

    return $self->new(
        comments => \@comments,
        seqs     => \@seq_clones,
        file     => file( $self->filename ),
        guessing => $self->guessing,
    );
}

# ACCESSORS


sub get_seq_with_id {
    my $self = shift;
    my $id   = shift;

lib/Bio/MUST/Core/Ali.pm  view on Meta::CPAN


# Return 1 if at least one seq reports itself as protein, 0 otherwise.
sub is_protein {
    my ($self) = @_;

    my $has_protein = List::AllUtils::any { $_->is_protein } $self->all_seqs;
    return $has_protein ? 1 : 0;
}


# Return 1 if guessing is enabled and at least one seq reports itself as
# aligned, 0 otherwise.
sub is_aligned {
    my ($self) = @_;

    # with guessing turned off, the seqs are not even examined
    return 0 unless $self->guessing;

    my $has_aligned = List::AllUtils::any { $_->is_aligned } $self->all_seqs;
    return $has_aligned ? 1 : 0;
}


# Return the maximum seq length of the Ali (as given by _max_seq_len).
sub width {
    my ($self) = @_;

    # pad seqs for robustness, but only when the Ali is considered aligned
    if ( $self->is_aligned ) {
        $self->uniformize;
    }

    return $self->_max_seq_len;
}

lib/Bio/MUST/Core/Ali.pm  view on Meta::CPAN


=head2 file

L<Path::Class::File> object (optional)

This optional attribute is initialized by class methods that C<load> an Ali
from disk. It is meant to improve the introspection capabilities of the Ali.
For now, this attribute is not used by the C<store> methods, though it might
provide them with a default value in the future.

=head2 guessing

Boolean (optional)

By default, an Ali object tries to guess whether it is aligned or not by
looking for gap-like characters in any of its Seq objects (see
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence).

When this smart behavior causes issues, one can disable it by unsetting this
boolean attribute (see C<dont_guess> and C<guess> accessor methods).

=head2 comments

ArrayRef of strings (optional)

An Ali object is commentable, which means that it supports all the methods
pertaining to comment lines described in
L<Bio::MUST::Core::Roles::Commentable> (such as C<header>).

=head1 CONSTRUCTORS

lib/Bio/MUST/Core/Ali.pm  view on Meta::CPAN

=head2 new

Default constructor (class method) returning a new Ali.

    use aliased 'Bio::MUST::Core::Ali';
    my $ali1 = Ali->new();
    my @seqs = $ali->all_seqs;
    my $ali2 = Ali->new( seqs => \@seqs );

This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>,
C<file>, C<guessing> and C<comments>.

=head2 clone

Creates a deep copy (a clone) of the Ali. Returns the copy.

    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load('input.ali');
    my $ali_copy = $ali->clone;
    # you can now mess with $ali_copy without affecting $ali

lib/Bio/MUST/Core/Ali.pm  view on Meta::CPAN

    my @orgs = map { $_->org } @ids1;

This method does not accept any arguments.

=head2 filename

Returns the stringified filename of the Ali.

This method does not accept any arguments.

=head2 guess

Turn on the smart detection of gaps (see C<guessing> attribute above).

This method does not accept any arguments.

=head2 dont_guess

Turn off the smart detection of gaps (see C<guessing> attribute above).

    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load('ensembl.fasta');
    $ali->dont_guess;

This method does not accept any arguments.

=head1 PROPERTIES

=head2 has_uniq_ids

Returns true if all the sequence ids are unique.

    carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids;

lib/Bio/MUST/Core/Ali.pm  view on Meta::CPAN


    say 'Your file includes nucleotide sequences' unless $ali->is_protein;

This method does not accept any arguments.

=head2 is_aligned

Returns true if any sequence of the Ali appears to be aligned. See
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.

If the C<guessing> attribute is unset, this method always returns false.

    carp 'Warning: file does not look aligned!' unless $ali->is_aligned;

This method does not accept any arguments.

=head2 count_seqs

Returns the number of sequences of the Ali. The alias method C<height> is
provided for convenience.

lib/Bio/MUST/Core/Ali/Stash.pm  view on Meta::CPAN


# ATTRIBUTES


# Required, read-only Ali object to which the listed methods are delegated.
has 'seqs' => (
    isa      => 'Bio::MUST::Core::Ali',
    is       => 'ro',
    required => 1,
    handles  => [ qw(
        filename
        count_comments all_comments get_comment
        guessing
        all_seq_ids has_uniq_ids is_protein is_aligned
        get_seq first_seq all_seqs filter_seqs count_seqs
        gapmiss_regex
    ) ],    # comment-related methods needed by IdList
);


has 'lookup' => (
    is       => 'ro',
    isa      => 'Bio::MUST::Core::IdList',

lib/Bio/MUST/Core/Ali/Stash.pm  view on Meta::CPAN


# I/O methods


# Class method: load an Ali from $infile, turn off its guessing and wrap it in
# a new object of the invoking class. The optional $args HashRef may carry a
# true 'truncate_ids' value to have the ids shortened first.
sub load {
    my ($class, $infile, $args) = @_;
    $args //= {};                       # fall back to an empty HashRef

    my $ali = Ali->load($infile);
    $ali->dont_guess;

    # build an id mapper with regex_mapper and apply it through shorten_ids
    if ( $args->{truncate_ids} ) {
        my $id_mapper = $ali->regex_mapper( q{}, $DEF_ID );
        $ali->shorten_ids($id_mapper);
    }

    return $class->new( seqs => $ali );
}

__PACKAGE__->meta->make_immutable;

lib/Bio/MUST/Core/Ali/Stash.pm  view on Meta::CPAN

    while (my $line = <$in>) {
        chomp $line;

        # extract member id list for current cluster
        my ($cluster, @ids) = split /\s+/xms, $line;
        $cluster =~ s/:\z//xms;             # remove trailing colon (:)
        my $list = IdList->new( ids => \@ids );

        # assemble Ali and store it as FASTA file
        my $ali = $list->reordered_ali($db);
           $ali->dont_guess;
        $ali->store( $cluster . '.fasta' );
    }

=head1 DESCRIPTION

This module implements a class representing a sequence database where ids are
indexed for faster access. To this end, it combines an internal
L<Bio::MUST::Core::Ali> object and a L<Bio::MUST::Core::IdList> object.

An Ali::Stash is meant to be built from an existing ALI (or FASTA) file

lib/Bio/MUST/Core/Ali/Stash.pm  view on Meta::CPAN


=head2 seqs

L<Bio::MUST::Core::Ali> object (required)

This required attribute contains the L<Bio::MUST::Core::Seq> objects that
populate the associated sequence database file. It should be initialized
through the class method C<load> (see the SYNOPSIS for an example).

For now, it provides the following methods: C<filename>, C<count_comments>,
C<all_comments>, C<get_comment>, C<guessing>, C<all_seq_ids>, C<has_uniq_ids>,
C<is_protein>, C<is_aligned>, C<get_seq>, C<get_seq_with_id> (see below),
C<first_seq>, C<all_seqs>, C<filter_seqs>, C<count_seqs> and C<gapmiss_regex>
(see L<Bio::MUST::Core::Ali>).

=head2 lookup

L<Bio::MUST::Core::IdList> object (auto)

This attribute is automatically initialized with the list indexing the
sequence ids of the internal C<Ali> object. Thus, it cannot be user-specified.

lib/Bio/MUST/Core/Ali/Temporary.pm  view on Meta::CPAN

# ATTRIBUTES


# Required, read-only Ali object (coercion enabled) to which the listed
# methods are delegated.
has 'seqs' => (
    isa      => 'Bio::MUST::Core::Ali',
    is       => 'ro',
    required => 1,
    coerce   => 1,
    handles  => [ qw(
        count_comments all_comments get_comment
        guessing
        all_seq_ids has_uniq_ids is_protein is_aligned
        get_seq get_seq_with_id
        first_seq all_seqs filter_seqs count_seqs
        gapmiss_regex
    ) ],    # comment-related methods needed by IdList
);


has 'args' => (
    is       => 'ro',
    isa      => 'HashRef',

lib/Bio/MUST/Core/Ali/Temporary.pm  view on Meta::CPAN

=head2 seqs

L<Bio::MUST::Core::Ali> object (required)

This required attribute contains the L<Bio::MUST::Core::Seq> objects that are
written in the associated temporary FASTA file. It can be specified either as
a path to an ALI/FASTA file or as an C<Ali> object or as an ArrayRef of C<Seq>
objects (see the SYNOPSIS for examples).

For now, it provides the following methods: C<count_comments>,
C<all_comments>, C<get_comment>, C<guessing>, C<all_seq_ids>, C<has_uniq_ids>,
C<is_protein>, C<is_aligned>, C<get_seq>, C<get_seq_with_id>, C<first_seq>,
C<all_seqs>, C<filter_seqs>, C<count_seqs> and C<gapmiss_regex> (see
L<Bio::MUST::Core::Ali>).

=head2 args

HashRef (optional)

When specified this optional attribute is passed to the C<temp_fasta> method
of the internal C<Ali> object. Its purpose is to allow the fine-tuning of the
format of the associated temporary FASTA file.

lib/Bio/MUST/Core/Roles/Aliable.pm  view on Meta::CPAN

use Moose::Role;

use autodie;
use feature qw(say);

use Bio::MUST::Core::Types;


# Methods that every class consuming this role has to provide.
requires(
    qw(count_comments all_comments get_comment),
    qw(guessing all_seq_ids has_uniq_ids is_protein is_aligned),
    qw(get_seq get_seq_with_id first_seq all_seqs filter_seqs count_seqs),
    qw(gapmiss_regex),
);

no Moose::Role;
1;

__END__

=pod

lib/Bio/MUST/Core/Types.pm  view on Meta::CPAN


class_type('Path::Class::Dir');
class_type('Path::Class::File');
class_type('File::Temp');

# auto-build Ali/Stash from various source types...
# useful in Bio::MUST::Drivers modules

# Coercions to Ali, selected according to the type of the source value
coerce 'Bio::MUST::Core::Ali'
    # Stash: build a new Ali from its seqs attribute, with guessing enabled
    => from 'Bio::MUST::Core::Ali::Stash'
    => via { Bio::MUST::Core::Ali->new( seqs => $_->seqs, guessing => 1 ) }

    # ArrayRef of Seq objects: wrap them in a new Ali, with guessing enabled
    => from 'ArrayRef[Bio::MUST::Core::Seq]'
    => via { Bio::MUST::Core::Ali->new( seqs => $_,       guessing => 1 ) }

    # Path::Class::File: stringify it and hand the result to Ali->load
    => from 'Path::Class::File'
    => via { Bio::MUST::Core::Ali->load( $_->stringify ) }

    # plain string: hand it to Ali->load as is
    => from 'Str'
    => via { Bio::MUST::Core::Ali->load( $_ ) }
;

coerce 'Bio::MUST::Core::Ali::Stash'
    => from 'Path::Class::File'

t/ali.t  view on Meta::CPAN

        test => 'wrote expected unaligned Ali',
    );
}

# expected per-seq lengths (excluding gaps and missing chars) checked below
my @exp_nomiss_lens = (56, 54, 48, 54, 44, 50, 48, 50, 52, 34);

# Load complete.ali once, then check its behavior with guessing turned off
# and turned back on (the statements below depend on that order).
{
    my $infile = file('test', 'complete.ali');
    my $ali = $class->load($infile);

    # guessing off: the Ali must not report itself as aligned and its FASTA
    # output is compared to the stored complete.fasta file
    $ali->dont_guess;
    ok !$ali->is_aligned, 'rightly overrided guessing of alignment';
    cmp_store(
        obj => $ali, method => 'store_fasta',
        file => 'complete.fasta',
        test => 'wrote expected unaltered FASTA from non-guessing Ali',
    );

    # guessing back on: alignment is detected again, so width and per-seq
    # lengths can be checked
    $ali->guess;
    ok $ali->is_aligned, 'rightly detected alignment';
    cmp_ok $ali->width, '==', 56, 'got expected Ali width';
    is_deeply [ map { $_->nomiss_seq_len } $ali->all_seqs ], \@exp_nomiss_lens,
        'got expected seq lengths (excluding gaps and missing chars)';
}

{
    my $infile = file('test', 'AhHMA4_clustalw.ali');
    my $ali = $class->load($infile);



( run in 2.107 seconds using v1.01-cache-2.11-cpan-748bfb374f4 )