Bio-MUST-Apps-TwoScalp
view release on metacpan or search on metacpan
bin/two-scalp.pl view on Meta::CPAN
### Check if more than one in-seqs file
if ($len > 1) {
@ord_fams = (1..$len);
for my $fam (@ord_fams) {
my $file = shift @ARGV_in_seqs;
#### file: $file
my $ali = Ali->load( $file );
@{ $seqs_for{$fam} } = map { $_ } $ali->all_seqs;
}
}
# TODO: check alternative here; too complex
### Check if family asked
if (@ord_fams && @ARGV_fam) {
warn <<'EOT';
Warning: --fam option specified but there is more than one file.
--fam option will be ignored and I will follow infile order instead!"
EOT
}
elsif (@ARGV_fam) {
@ord_fams = @ARGV_fam;
my %exist_for = map { $_ => 1 } @ord_fams;
my $ali = Ali->load(@ARGV_in_seqs);
for my $seq ($ali->all_seqs) {
my $fam = $seq->family // $DEF_FAM;
# TODO: streamline this using either // or ? : within @{ ... }
if ( exists $exist_for{$fam} ) {
push @{ $seqs_for{$fam} }, $seq;
}
else {
push @{ $seqs_for{other} }, $seq;
}
}
push @ord_fams, 'other' if $seqs_for{other};
}
unless (@ord_fams) {
### One file with no family specified
my $ali = Ali->load(@ARGV_in_seqs);
# TODO: move this to Ali introspective methods
my ($unaligned_seqs, $aligned_seqs)
= part { $_->is_aligned ? 1 : 0 } $ali->all_seqs;
@ord_fams = qw(aligned unaligned);
$seqs_for{ aligned} = $aligned_seqs if $aligned_seqs;
$seqs_for{unaligned} = $unaligned_seqs if $unaligned_seqs;
}
#### fam: @ord_fams
##### %seqs_for
FAM:
for my $fam (@ord_fams) {
### Check if seqs are aligned from part: $fam
# TODO: fix this as it is very dangerous to have my depending on if
# https://metacpan.org/pod/Perl::Critic::Policy::Variables::ProhibitConditionalDeclarations
## no critic (ProhibitConditionalDeclarations)
my $ali = Ali->new( seqs => $seqs_for{$fam}, guessing => 1 )
if $seqs_for{$fam};
## use critic
unless ($ali) {
carp "Warning: no sequence found for family: $fam";
next FAM;
}
if ($fam eq 'other') {
### Degap seqs from families not specified
$ali->degap_seqs;
}
my ($unaligned_seqs, $aligned_seqs)
= part { $_->is_aligned ? 1 : 0 } $ali->all_seqs;
#= part { $_->is_aligned ? 1 : 0 } $seqs_for{$fam};
my $p2;
# TODO: check how to simplify complex alternatives here
if ($aligned_seqs) {
my $aligned = Ali->new(
seqs => $aligned_seqs,
guessing => 1,
);
if ($unaligned_seqs) {
my $unaligned = Ali->new(
seqs => $unaligned_seqs,
guessing => 1,
);
### Align non aligned seqs on aligned seqs from the same family
$p2 = align_on_profile($aligned, $unaligned);
}
$p2 = $aligned unless $p2;
}
elsif ($unaligned_seqs && $master_profile) {
### There are only unaligned seqs in this family
my $unaligned = Ali->new(
seqs => $unaligned_seqs,
guessing => 1,
);
$p2 = $unaligned;
}
if ($master_profile) {
my $new_master_profile = align_on_profile($master_profile, $p2);
$master_profile = $new_master_profile;
}
# align from scratch only if only unaligned seqs and no other family
# TODO: make this more explicit: too subtle combining ? : and postfix unless
$master_profile = $p2 ? $p2 : align_from_scratch($unaligned_seqs)
unless $master_profile;
}
if ($ARGV_p_ref) {
### prune seq from reference profile
my @ids2prune = map { $_->full_id } $ref_prefix->all_seq_ids;
my $list = IdList->new( ids => \@ids2prune );
$list = $list->negative_list($master_profile);
my $pruned_ali = $list->filtered_ali($master_profile);
$master_profile = $pruned_ali;
}
# add suffix
my $outfile = secure_outfile($basename, $ARGV_out_suffix);
### Store fasta outfile: $outfile
$master_profile->store($outfile);
sub align_on_profile {
# TODO choose better name here (reused from main code)
## no critic (ProhibitReusedNames)
my $master_profile = shift;
## use critic
my $other = shift;
##### profile sequences: $master_profile
my $new_profile = $master_profile;
unless ($new_profile->has_uniq_ids) {
### non uniq seq id in master profile
$new_profile = uniq_ids($new_profile);
}
my $toalign = $other;
##### sequence to add: $toalign
unless ($toalign->has_uniq_ids) {
### non uniq seq id p2
$toalign = uniq_ids($toalign);
}
my ($toalign_file, $toalign_mapper)
= $toalign->temp_fasta( {id_prefix => 'toalign-'} );
my $profile = $new_profile;
my ($profile_file, $profile_mapper)
= $profile->temp_fasta( {id_prefix => 'profile-'} );
my %mapper = ( profile => $profile_mapper, toalign => $toalign_mapper );
my $type = $other->is_aligned ? 'prof' : 'seqs';
### type of sequence to add: $type
# set up new (expendable) hash to change used options
tie my %reduced_opt, 'Tie::IxHash';
%reduced_opt = %opt;
if ($type eq 'prof') {
delete @reduced_opt{ qw( --fragments --long --keeplength ) };
$new_profile = Profile2Profile->new( file1 => $toalign_file,
file2 => $profile_file,
options => \%reduced_opt );
}
elsif ($type eq 'seqs') {
delete @reduced_opt{ qw( --maxiterate --localpair ) }
if $ARGV_keep_length;
$new_profile = Seqs2Profile->new( file1 => $toalign_file,
file2 => $profile_file,
options => \%reduced_opt );
}
$new_profile->restore_ids( $mapper{profile} );
$new_profile->restore_ids( $mapper{toalign} );
unless ($new_profile->has_uniq_ids) {
### non uniq ids between parts to align
$new_profile = uniq_ids($new_profile);
}
return $new_profile;
}
sub align_from_scratch {
my $seqs = shift;
my $toalign = Ali->new(
seqs => $seqs,
guessing => 1,
);
$toalign->degap_seqs;
unless ($toalign->has_uniq_ids) {
### non uniq seq id
$toalign = uniq_ids($toalign);
}
my ($toalign_file, $toalign_mapper) = $toalign->temp_fasta;
# set up new (expendable) hash to change used options
tie my %reduced_opt, 'Tie::IxHash';
%reduced_opt = %opt;
delete $reduced_opt{ '--keeplength' };
my $new_profile = AlignAll->new( file => $toalign_file,
options => \%reduced_opt );
$new_profile->restore_ids($toalign_mapper);
return $new_profile;
}
sub uniq_ids {
# TODO: modify way to check ids and seq to keep seq from master profile
# with ids slightly different if subseq
my $ali = shift;
my @ids = map { $_->full_id } $ali->all_seq_ids;
my %count_for = count_by { $_ } @ids;
my @duplicates = grep { $count_for{ $_ } > 1 } @ids;
#### non uniq ids: @duplicates
for my $dup ( uniq @duplicates ) {
my @seqs = grep { $_->full_id eq $dup } $ali->all_seqs;
my $len_seq = @seqs;
for my $i ( 1..($len_seq-1) ) {
unless ( $seqs[$i]->is_subseq_of($seqs[$i-1])
|| ( $seqs[$i-1]->is_subseq_of($seqs[$i])
&& $seqs[$i]->is_subseq_of($seqs[$i-1]) ) ) {
carp "Warning: duplicate ids but different sequences for: $dup";
}
}
}
my @uniq_ids = uniq @ids;
my @seq_uniq_ids = map { $ali->get_seq_with_id($_) } @uniq_ids;
my $ali2 = Ali->new(
seqs => \@seq_uniq_ids,
guessing => 1,
);
return $ali2;
}
# TODO: check if args are better served with 'repeatable' or '...'
__END__
=pod
=head1 NAME
two-scalp.pl - Align or re-align sequences using various strategies
=head1 VERSION
version 0.243240
=head1 USAGE
two-scalp.pl --in-seqs=<infiles>... [optional arguments]
=head1 REQUIRED ARGUMENTS
=over
=item --in-seqs=<infiles>...
Path to input ALI file(s) with sequences to align. If only one file is
specified, the script aligns all non-aligned sequences on the already aligned
sequences (but see option C<--fam> below). If several files are specified,
each file is aligned on the previous one(s) in the given order.
=for Euclid: infiles.type: readable
=back
=head1 OPTIONAL ARGUMENTS
=over
=item --p-ref=<infile>
Path to input ALI file with already aligned sequences [default: none]. When
such a master profile is specified, its sequences are only used as references
for aligning the sequences of the other input file(s). Therefore, they will
not appear in the final output.
=for Euclid: infile.type: readable
=item --fam=<family>...
Family (or families) to consider when aligning sequences (already aligned or
not) [default: none]. Families are processed in the given order, i.e., the
first specified family is the first aligned and will serve as a master profile
for the second one, etc. If sequences from additional families (or devoid of
family) are also present, these are degapped and aligned on the profile
obtained after aligning the specified families.
( run in 2.939 seconds using v1.01-cache-2.11-cpan-8f98c5d2c55 )