FAST
view release on metacpan or search on metacpan
lib/FAST/Bio/UnivAln.pm view on Meta::CPAN
be supported for the forseeable future.
Version 1.006 on 15 Dec 1997. Added function no_allgap_sites(); out_fasta()
now accepts same row/column designations like seqs(). For advanced users:
the width() of sequence bags can now be controlled by supplying the index of
the row whose width is taken -- by default the width of the first row is used.
Version 1.007 on 15 Mar 1998.
* Added functions that return indices instead of actual sequence data:
var_inds(), invar_inds(), gap_free_inds(), unknown_free_inds(),
special_free_inds() and no_allgap_inds() return the indices of
(in)variable, gap-free, unknown-free, gap+unknown-free and non-gap-only
columns; they are the counterparts of var_sites(), etc.
* Added remove_gaps() to return ungapped (original) sequence data
* Added equal_no_gaps() to check for equality ignoring gaps
* Added equalize_lengths() for padding a sequence bag with gaps
such that all rows have equal length (current procedure is slow)
* The default id is now ``_'' (was: ``No_Id_Given'')
* Internal functions _rowbounds() and _colbounds() now return indices
from the user's perspective, but WITHOUT substracting offset. Instead,
all functions subtract offset after applying _rowbounds()/_colbounds()
Version 1.008 on 13 May 1998.
* Added readseq conversion support, making it possible to read and write
the following formats: MSF, Paup, PIR/Codata, ASN.1. (ASN.1 was not
parsed successfully: readseq seems to be unable to read in its own ASN.1
output). Technically, readseq is now used to parse files that have been
processed as ``raw'' before; now ``raw'' format is recognized using the
expression /^[A-Z_0-9$_GAP_SYMBOL$_UNKN_SYMBOL\s]+$/im, i.e. the file
may only have alphanumerical characters, gap and unknown-symbol, and
whitespace. If commata, etc, are detected, readseq is used for parsing.
Readseq itself seems to be unable to detect ``raw'' format in some cases,
causing weird results.
* Added Clustal support for parsing; still to be tested and documented.
Version 1.009 on 25 May 1998.
* The module can now be ``built'' in the standard way (perl Makefile.PL, etc).
=head1 APPENDIX
Please note that functions starting with an underscore (``_'') are
intended for internal use only: Use them only if you know what you're
doing :-)
=cut
#"
use Exporter;
use vars qw( @ISA @EXPORT @EXPORT_OK );
@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw($VERSION %UnivAlnType @UnivAlnType
%UnivAlnForm @UnivAlnForm %UnivAlnAlphs @UnivAlnAlphs);
require 5.002;
use Carp;
#if ($] < 5.005) {carp "Not tested for Perl 5.002 or 5.003";}
use POSIX;
use File::Basename;
use vars qw($_NOFILE_FLAG $_GAP_SYMBOL $_UNKN_SYMBOL $_UNKN_SYMBOL2 $_NO_CONSENSUS_SYMBOL $_NaN $_MAX_SIZE_TAXANAME $_READSEQ $_CLUSTAL);
$_NOFILE_FLAG='_undef';
$_GAP_SYMBOL ='-';
$_UNKN_SYMBOL='\?'; # the backslash seems to be needed to be able to write
# ($str =~ /$_UNKN_SYMBOL/) ??!!
$_UNKN_SYMBOL2='N'; # ONLY USED IN UNDOCUMENTED FUNCTION special_free_seqs()
$_NO_CONSENSUS_SYMBOL='!';
$_NaN = POSIX::INT_MAX; #anything better ?
$_MAX_SIZE_TAXANAME=20;
$_READSEQ = "";
$_CLUSTAL = "";
$ENV{READSEQ} = "readseq";
# name of the readseq executable to be found in
# $ENV{READSEQ_DIR} or "\.". Set this to "" if readseq is
# installed, but shall be ignored
$ENV{CLUSTAL} = "clustal";
# name of the clustal executable to be found in
# $ENV{CLUSTAL_DIR} or "\.". Set this to "" if clustal is
# installed, but shall be ignored
use vars qw(%UnivAlnType %TypeUnivAln @TypeUnivAln %UnivAlnForm %FormUnivAln @FormUnivAln);
# List of recognized alignment types
$UnivAlnType{unknown} = 0;
$UnivAlnType{dna} = 1;
$UnivAlnType{rna} = 2;
$UnivAlnType{amino} = 3;
$UnivAlnType{otherseq} = 4; # type is explicitly NOT dna/rna/amino/etc, but
# nothing more is known
# Future support for the following 4 acronyms is doubtful:
$UnivAlnType{Unknown} = 0;
$UnivAlnType{Dna} = 1;
$UnivAlnType{Rna} = 2;
$UnivAlnType{Amino} = 3;
# %TypeUnivAln = reverse %UnivAlnType; # TypeUnivAln is inverse of %UnivAlnType
# i.e. keys and values are exchanged
@TypeUnivAln = ('unknown','dna','rna','amino','otherseq');
# List of recognized file formats
$UnivAlnForm{unknown} = 0;
$UnivAlnForm{raw} = 13;
$UnivAlnForm{raw2} = 14;
$UnivAlnForm{fasta} = 7;
# Future support for the following 3 acronyms is doubtful:
$UnivAlnForm{Unknown} = 0;
$UnivAlnForm{Raw} = 13;
$UnivAlnForm{Raw2} = 14;
$UnivAlnForm{Fasta} = 7;
# %FormUnivAln = reverse %UnivAlnForm; # FormUnivAln is inverse of %UnivAlnForm
# i.e. keys and values are exchanged
@FormUnivAln = ('unknown',undef,undef,undef,undef,undef,undef,
'raw','raw2',undef,undef,undef,undef,'fasta');
use vars qw(%FuncParse %FuncOut %UnivAlnAlphs);
# %FuncParse and %FuncOut are for internal use ONLY
# %FuncParse is an array of (<ffmt>,<_parse_meth>), where <ffmt>
# is the file format code (e.g., 0 for "unknown", 7 for 'fasta', etc.), and
# <_parse_meth> is the method which parses strings in that file
# format. %FuncParse{$i} is mostly set to \&_parse_bad right now,
# since we don't have many file formats supported yet.
my %FuncParse =
($UnivAlnForm{"unknown"} => \&_parse_unknown,
$UnivAlnForm{'fasta'} => \&_parse_fasta,
$UnivAlnForm{'raw'} => \&_parse_raw,
);
# Set all other formats to call &_parse_bad, so that currently unsupported
lib/FAST/Bio/UnivAln.pm view on Meta::CPAN
}
return (@return_array);
}
# helper functions used by seqs(), etc
=head2 _rowbounds()
Usage : $corrected_bounds = $aln->_rowbounds($uncorrected_bounds);
Function : create default row index list if necessary,
create row index list if specified by a hash of ids or
by a selector function that acts on rows and returns true/false,
check row index list for bounds errors,
NO LONGER DONE IN VERSION 1.007 AND HIGHER: substract offset of 1.
Returns : reference to corrected row index list
Argument : reference to uncorrected row index list
=cut
sub _rowbounds {
my($self) = shift;
my $rowindices = shift;
my $rowindices2;
my $firstindx = 1; # currently, it's assumed the first row is row no. 1,
# and other conventions aren't supported.
if ( !defined($rowindices) ) {
# create default row index list if necessary
$rowindices2 = [$firstindx .. $self->height() - 1 + $firstindx];
# cleaner??!! $rowindices2 = [];
}
if (ref($rowindices) eq 'ARRAY') {
if (scalar(@$rowindices) == 0) {
$rowindices2 = [$firstindx .. $self->height() - 1 + $firstindx];
} else {
$rowindices2 = [@$rowindices];
}
}
if (ref($rowindices) eq 'HASH') {
my @row_ids = split /\s/, $rowindices->{'ids'};
my($ii,@rows,$row_id);
$ii=0;
my %all_row_ids_hash = map {($_,$ii++)} @{$self->{'row_ids'}};
foreach $row_id (@row_ids) {
if (exists $all_row_ids_hash{$row_id}) {
push @rows, $all_row_ids_hash{$row_id} + $firstindx;
delete $all_row_ids_hash{$row_id};
} else {
carp("Requested row named $row_id not found, or duplicate");
}
}
$rowindices2 = \@rows;
}
if (ref($rowindices) eq 'CODE') {
my $ctr = $firstindx - 1;
$rowindices2 = [ grep {$_ != $_NaN}
# filter out $_NaN-value indexes, see next comment
map {($ctr++,$_) ? $ctr : $_NaN}
# convert list of function values into list of indices
# in canonical order, except that false values trigger an
# index of $_NaN, e.g. (1,$_NaN,3,$_NaN,5) is the result
# of map {($ctr++,$_) ? $ctr : $_NaN} ("A","","B",0,"C")
# note that the comma operator evaluates the first
# argument and throws the result away, and then
# evaluates the second argument; the side effect
# of the first evaluation is the increment of $ctr.
$self->map_r($rowindices) ];
}
my($ii);
for $ii (0 .. $#{$rowindices2}) {
if ($rowindices2->[$ii] < $firstindx) {
carp "Requested row $rowindices2->[$ii] of an alignment where $firstindx is the first row";
}
if (($rowindices2->[$ii] - $firstindx + 1) > scalar(@{ $self->{'seqs'} })) {
carp "Requested row $rowindices2->[$ii] of an alignment with fewer rows";
}
}
return $rowindices2;
#return $self->_rownorm($rowindices2);
}
=head2 _colbounds()
Usage : $corrected_bounds = $aln->_colbounds($uncorrected_bounds);
Function : create default column index list if necessary,
create column index list if specified by a hash of ids or
by a selector function that acts on columns and returns true/false,
check column index list for bounds errors,
NOT IN 1.007 and higher: substract offset (according to numbering scheme).
Returns : reference to corrected column index list
Argument : reference to uncorrected column index list
reference to row index list: its first row is used to
provide the width of the alignment considered, which matters
if the alignment is really a sequence bag
=cut
sub _colbounds {
my($self) = shift;
my $colindices = shift;
my $rowindices = shift || [0];
my $colindices2;
my $firstindx2 = 1; # currently, it's assumed the first row is row no. 1,
# and other conventions aren't supported.
my $width;
if (scalar(@$rowindices) != 0) {
$width = $self->width($rowindices->[0]+$firstindx2);
} else {
return [];
}
my $firstindx = $self->numbering;
if ( !defined($colindices) ) {
# create default col index list if necessary
$colindices2 = [$firstindx .. $width - 1 + $firstindx]
if (defined($width) && $width > 0);
#??!! $colindices2 = [];
}
if (ref($colindices) eq 'ARRAY') {
if (scalar(@$colindices) == 0) {
$colindices2 = [$firstindx .. $width - 1 + $firstindx]
if (defined($width) && $width > 0);
} else {
$colindices2 = [@$colindices];
}
}
if (ref($colindices) eq 'HASH') {
my @col_ids = split /\s/, $colindices->{'ids'};
my($ii,@cols);
my %col_ids_hash = map {($_,"")} @col_ids;
foreach $ii (0..$#{$self->{'col_ids'}}) {
my $col_id = $self->{'col_ids'}[$ii];
if (exists $col_ids_hash{$col_id}) {
push @cols, $ii+$firstindx;
delete $col_ids_hash{$col_id};
}
}
my(@restkeys) = keys %col_ids_hash;
if (scalar(@restkeys) > 0) {
carp("Requested columns named @restkeys not found");
}
$colindices2 = \@cols;
}
if (ref($colindices) eq 'CODE') {
my $ctr = $firstindx - 1;
$colindices2 = [ grep {$_ != $_NaN}
# see _rowbounds()
map {($ctr++,$_) ? $ctr : $_NaN}
$self->map_c($colindices) ];
}
my($ii);
for $ii (0 .. ($#{$colindices2})) {
if ($colindices2->[$ii] < $firstindx) {
carp "Requested column $colindices2->[$ii] of an alignment starting at $firstindx";
}
if (($colindices2->[$ii] - $firstindx + 1) > $width) {
carp "Requested column $colindices2->[$ii] is beyond the end of the alignment";
}
}
return $colindices2;
#return $self->_colnorm($colindices2);
}
# normalize by substracting the first row index (offset)
sub _rownorm {
my($self) = shift;
my $rowindices = [@{$_[0]}];
my $firstindx = 1; # currently, it's assumed the first row is row no. 1,
# and other conventions aren't supported.
return [ map {$_ -= $firstindx} @$rowindices ];
}
# normalize by substracting the first column index (offset)
sub _colnorm {
my($self) = shift;
my $colindices = [@{$_[0]}];
my $firstindx = $self->numbering;
return [ map {$_ -= $firstindx} @$colindices ];
}
=head2 _fixbounds()
Usage : ($corrected_rowbounds,$corrected_colbounds) =
$aln->_fixbounds($uncorrected_rowbounds,$uncorrected_colbounds);
OR
($corrected_rowbounds,$corrected_colbounds) =
$aln->_fixbounds($firstpos1,$lastpos1,$firstpos2,$lastpos2)
Function : Convert unfixed index list information into the standard internal
one, allowing as input either max. 2 references or max. 4 boundary
coordinates (2 coord. for row indices and 2 for column indices).
Call functions to create maximal default index lists if needed,
to create index lists if specified by a hash of ids or by a selector
function that acts on rows/columns and returns true/false,
to check index lists for bounds errors, and to substract offsets.
Returns : 2 references to corrected index lists
Argument : EITHER max. 2 references to uncorrected index lists,
OR max. 4 boundary coordinates (integers)
=cut
sub _fixbounds {
my($self) = shift;
( run in 2.010 seconds using v1.01-cache-2.11-cpan-0d23b851a93 )