FAST

 view release on metacpan or  search on metacpan

lib/FAST/Bio/UnivAln.pm  view on Meta::CPAN

be supported for the forseeable future.

Version 1.006 on 15 Dec 1997. Added function no_allgap_sites(); out_fasta()
now accepts same row/column designations like seqs(). For advanced users:
the width() of sequence bags can now be controlled by supplying the index of
the row whose width is taken -- by default the width of the first row is used.

Version 1.007 on 15 Mar 1998.
* Added functions that return indices instead of actual sequence data:
var_inds(), invar_inds(), gap_free_inds(), unknown_free_inds(),
special_free_inds() and no_allgap_inds() return the indices of
(in)variable, gap-free, unknown-free, gap+unknown-free and non-gap-only
columns; they are the counterparts of var_sites(), etc.
* Added remove_gaps() to return ungapped (original) sequence data
* Added equal_no_gaps() to check for equality ignoring gaps
* Added equalize_lengths() for padding a sequence bag with gaps
such that all rows have equal length (current procedure is slow)
* The default id is now ``_'' (was: ``No_Id_Given'')
* Internal functions _rowbounds() and _colbounds() now return indices
from the user's perspective, but WITHOUT substracting offset.  Instead,
all functions subtract offset after applying _rowbounds()/_colbounds()

Version 1.008 on 13 May 1998.
* Added readseq conversion support, making it possible to read and write
the following formats: MSF, Paup, PIR/Codata, ASN.1. (ASN.1 was not
parsed successfully: readseq seems to be unable to read in its own ASN.1
output). Technically, readseq is now used to parse files that have been
processed as ``raw'' before; now ``raw'' format is recognized using the
expression /^[A-Z_0-9$_GAP_SYMBOL$_UNKN_SYMBOL\s]+$/im, i.e. the file
may only have alphanumerical characters, gap and unknown-symbol, and
whitespace. If commata, etc, are detected, readseq is used for parsing.
Readseq itself seems to be unable to detect ``raw'' format in some cases,
causing weird results.
* Added Clustal support for parsing; still to be tested and documented.

Version 1.009 on 25 May 1998.
* The module can now be ``built'' in the standard way (perl Makefile.PL, etc).

=head1 APPENDIX

Please note that functions starting with an underscore (``_'') are
intended for internal use only: Use them only if you know what you're
doing :-)

=cut

#"

use Exporter;
use vars qw( @ISA  @EXPORT @EXPORT_OK );
@ISA         = qw(Exporter);
@EXPORT      = qw();
@EXPORT_OK   = qw($VERSION %UnivAlnType @UnivAlnType
                  %UnivAlnForm @UnivAlnForm %UnivAlnAlphs @UnivAlnAlphs);
require 5.002;
use Carp;
#if ($] < 5.005) {carp "Not tested for Perl 5.002 or 5.003";}
use POSIX;
use File::Basename;

use vars qw($_NOFILE_FLAG $_GAP_SYMBOL $_UNKN_SYMBOL $_UNKN_SYMBOL2 $_NO_CONSENSUS_SYMBOL $_NaN $_MAX_SIZE_TAXANAME $_READSEQ $_CLUSTAL);
$_NOFILE_FLAG='_undef';
$_GAP_SYMBOL ='-';
$_UNKN_SYMBOL='\?'; # the backslash seems to be needed to be able to write
                    # ($str =~ /$_UNKN_SYMBOL/) ??!!
$_UNKN_SYMBOL2='N'; # ONLY USED IN UNDOCUMENTED FUNCTION special_free_seqs()
$_NO_CONSENSUS_SYMBOL='!';
$_NaN = POSIX::INT_MAX;   #anything better ?
$_MAX_SIZE_TAXANAME=20;
$_READSEQ = "";
$_CLUSTAL = "";
$ENV{READSEQ} = "readseq";
 # name of the readseq executable to be found in
 # $ENV{READSEQ_DIR} or "\.". Set this to "" if readseq is
 # installed, but shall be ignored
$ENV{CLUSTAL} = "clustal";
 # name of the clustal executable to be found in
 # $ENV{CLUSTAL_DIR} or "\.". Set this to "" if clustal is
 # installed, but shall be ignored

use vars qw(%UnivAlnType %TypeUnivAln @TypeUnivAln %UnivAlnForm %FormUnivAln @FormUnivAln);

# List of recognized alignment types
$UnivAlnType{unknown}  = 0;
$UnivAlnType{dna}      = 1;
$UnivAlnType{rna}      = 2;
$UnivAlnType{amino}    = 3;
$UnivAlnType{otherseq} = 4;  # type is explicitly NOT dna/rna/amino/etc, but
                             # nothing more is known
     # Future support for the following 4 acronyms is doubtful:
$UnivAlnType{Unknown}  = 0;
$UnivAlnType{Dna}      = 1;
$UnivAlnType{Rna}      = 2;
$UnivAlnType{Amino}    = 3;
# %TypeUnivAln = reverse %UnivAlnType; # TypeUnivAln is inverse of %UnivAlnType
                             # i.e. keys and values are exchanged
@TypeUnivAln = ('unknown','dna','rna','amino','otherseq');

# List of recognized file formats
$UnivAlnForm{unknown}  = 0;
$UnivAlnForm{raw}      = 13;
$UnivAlnForm{raw2}     = 14;
$UnivAlnForm{fasta}    = 7;
     # Future support for the following 3 acronyms is doubtful:
$UnivAlnForm{Unknown}  = 0;
$UnivAlnForm{Raw}      = 13;
$UnivAlnForm{Raw2}     = 14;
$UnivAlnForm{Fasta}    = 7;
# %FormUnivAln = reverse %UnivAlnForm; # FormUnivAln is inverse of %UnivAlnForm
                             # i.e. keys and values are exchanged
@FormUnivAln = ('unknown',undef,undef,undef,undef,undef,undef,
                'raw','raw2',undef,undef,undef,undef,'fasta');

use vars qw(%FuncParse %FuncOut %UnivAlnAlphs);

# %FuncParse and %FuncOut are for internal use ONLY
# %FuncParse is an array of (<ffmt>,<_parse_meth>), where <ffmt>
# is the file format code (e.g., 0 for "unknown", 7 for 'fasta', etc.), and
# <_parse_meth> is the method which parses strings in that file
# format. %FuncParse{$i} is mostly set to \&_parse_bad right now,
# since we don't have many file formats supported yet.

my %FuncParse =
  ($UnivAlnForm{"unknown"} => \&_parse_unknown,
   $UnivAlnForm{'fasta'}   => \&_parse_fasta,
   $UnivAlnForm{'raw'}     => \&_parse_raw,
   );
# Set all other formats to call &_parse_bad, so that currently unsupported

lib/FAST/Bio/UnivAln.pm  view on Meta::CPAN

  }

  return (@return_array);
}

# helper functions used by seqs(), etc

=head2 _rowbounds()

 Usage    : $corrected_bounds = $aln->_rowbounds($uncorrected_bounds);
 Function : create default row index list if necessary,
            create row index list if specified by a hash of ids or
            by a selector function that acts on rows and returns true/false,
            check row index list for bounds errors,
            NO LONGER DONE IN VERSION 1.007 AND HIGHER: substract offset of 1.
 Returns  : reference to corrected row index list
 Argument : reference to uncorrected row index list

=cut

sub _rowbounds {
  my($self) = shift;
  my $rowindices = shift;
  my $rowindices2;

  my $firstindx = 1; # currently, it's assumed the first row is row no. 1,
                     # and other conventions aren't supported.

  if ( !defined($rowindices) ) {
      # create default row index list if necessary
    $rowindices2 = [$firstindx .. $self->height() - 1 + $firstindx];
    # cleaner??!! $rowindices2 = [];
  }

  if (ref($rowindices) eq 'ARRAY') {
    if (scalar(@$rowindices) == 0) {
      $rowindices2 = [$firstindx .. $self->height() - 1 + $firstindx];
    } else {
      $rowindices2 = [@$rowindices];
    }
  }

  if (ref($rowindices) eq 'HASH') {
    my @row_ids = split /\s/, $rowindices->{'ids'};
    my($ii,@rows,$row_id);
    $ii=0;
    my %all_row_ids_hash = map {($_,$ii++)} @{$self->{'row_ids'}};
    foreach $row_id (@row_ids) {
      if (exists $all_row_ids_hash{$row_id}) {
        push @rows, $all_row_ids_hash{$row_id} + $firstindx;
        delete $all_row_ids_hash{$row_id};
      } else {
        carp("Requested row named $row_id not found, or duplicate");
      }
    }
    $rowindices2 = \@rows;
  }

  if (ref($rowindices) eq 'CODE') {
    my $ctr = $firstindx - 1;
    $rowindices2 = [ grep {$_ != $_NaN}
                      # filter out $_NaN-value indexes, see next comment
                    map {($ctr++,$_) ? $ctr : $_NaN}
                      # convert list of function values into list of indices
                      # in canonical order, except that false values trigger an
                      # index of $_NaN, e.g. (1,$_NaN,3,$_NaN,5) is the result
                      # of map {($ctr++,$_) ? $ctr : $_NaN} ("A","","B",0,"C")
                      # note that the comma operator evaluates the first
                      # argument and throws the result away, and then
                      # evaluates the second argument; the side effect
                      # of the first evaluation is the increment of $ctr.
                    $self->map_r($rowindices) ];
  }

  my($ii);
  for $ii (0 .. $#{$rowindices2}) {
    if ($rowindices2->[$ii] < $firstindx) {
      carp "Requested row $rowindices2->[$ii] of an alignment where $firstindx is the first row";
    }
    if (($rowindices2->[$ii] - $firstindx + 1) > scalar(@{ $self->{'seqs'} })) {
      carp "Requested row $rowindices2->[$ii] of an alignment with fewer rows";
    }
  }
  return $rowindices2;
  #return $self->_rownorm($rowindices2);
}


=head2 _colbounds()

 Usage    : $corrected_bounds = $aln->_colbounds($uncorrected_bounds);
 Function : create default column index list if necessary,
            create column index list if specified by a hash of ids or
            by a selector function that acts on columns and returns true/false,
            check column index list for bounds errors,
            NOT IN 1.007 and higher: substract offset (according to numbering scheme).
 Returns  : reference to corrected column index list
 Argument : reference to uncorrected column index list
            reference to row index list: its first row is used to
            provide the width of the alignment considered, which matters
            if the alignment is really a sequence bag

=cut

sub _colbounds {
  my($self) = shift;
  my $colindices = shift;
  my $rowindices = shift || [0];
  my $colindices2;
  my $firstindx2 = 1; # currently, it's assumed the first row is row no. 1,
                    # and other conventions aren't supported.
  my $width;
  if (scalar(@$rowindices) != 0) {
    $width = $self->width($rowindices->[0]+$firstindx2);
  } else {
    return [];
  }
  my $firstindx = $self->numbering;

  if ( !defined($colindices) ) {
      # create default col index list if necessary
    $colindices2 = [$firstindx .. $width - 1 + $firstindx]
      if (defined($width) && $width > 0);
    #??!! $colindices2 = [];
  }

  if (ref($colindices) eq 'ARRAY') {
    if (scalar(@$colindices) == 0) {
      $colindices2 = [$firstindx .. $width - 1 + $firstindx]
        if (defined($width) && $width > 0);
    } else {
      $colindices2 = [@$colindices];
    }
  }

  if (ref($colindices) eq 'HASH') {
    my @col_ids = split /\s/, $colindices->{'ids'};
    my($ii,@cols);
    my %col_ids_hash = map {($_,"")} @col_ids;
    foreach $ii (0..$#{$self->{'col_ids'}}) {
      my $col_id = $self->{'col_ids'}[$ii];
      if (exists $col_ids_hash{$col_id}) {
        push @cols, $ii+$firstindx;
        delete $col_ids_hash{$col_id};
      }
    }
    my(@restkeys) = keys %col_ids_hash;
    if (scalar(@restkeys) > 0) {
      carp("Requested columns named @restkeys not found");
    }
    $colindices2 = \@cols;
  }

  if (ref($colindices) eq 'CODE') {
    my $ctr = $firstindx - 1;
    $colindices2 = [ grep {$_ != $_NaN}
                      # see _rowbounds()
                    map {($ctr++,$_) ? $ctr : $_NaN}
                    $self->map_c($colindices) ];
  }

  my($ii);
  for $ii (0 .. ($#{$colindices2})) {
    if ($colindices2->[$ii] < $firstindx) {
      carp "Requested column $colindices2->[$ii] of an alignment starting at $firstindx";
    }
    if (($colindices2->[$ii] - $firstindx + 1) > $width) {
      carp "Requested column $colindices2->[$ii] is beyond the end of the alignment";
    }
  }
  return $colindices2;
  #return $self->_colnorm($colindices2);
}


    # normalize by substracting the first row index (offset)
sub _rownorm {
  my($self) = shift;
  my $rowindices = [@{$_[0]}];

  my $firstindx = 1; # currently, it's assumed the first row is row no. 1,
                    # and other conventions aren't supported.

  return [ map {$_ -= $firstindx} @$rowindices ];
}

    # normalize by substracting the first column index (offset)
sub _colnorm {
  my($self) = shift;
  my $colindices = [@{$_[0]}];

  my $firstindx = $self->numbering;

  return [ map {$_ -= $firstindx} @$colindices ];
}

=head2 _fixbounds()

 Usage    : ($corrected_rowbounds,$corrected_colbounds) =
               $aln->_fixbounds($uncorrected_rowbounds,$uncorrected_colbounds);
                    OR
            ($corrected_rowbounds,$corrected_colbounds) =
               $aln->_fixbounds($firstpos1,$lastpos1,$firstpos2,$lastpos2)
 Function : Convert unfixed index list information into the standard internal
            one, allowing as input either max. 2 references or max. 4 boundary
            coordinates (2 coord. for row indices and 2 for column indices).
            Call functions to create maximal default index lists if needed,
            to create index lists if specified by a hash of ids or by a selector
            function that acts on rows/columns and returns true/false,
            to check index lists for bounds errors, and to substract offsets.
 Returns  : 2 references to corrected index lists
 Argument : EITHER max. 2 references to uncorrected index lists,
            OR max. 4 boundary coordinates (integers)

=cut

sub _fixbounds {
  my($self) = shift;



( run in 2.010 seconds using v1.01-cache-2.11-cpan-0d23b851a93 )