Bio-Tools-Run-Alignment-TCoffee
view release on metacpan or search on metacpan
lib/Bio/Tools/Run/Alignment/TCoffee.pm view on Meta::CPAN
}
$instring .= ' -matrix='.$self->matrix unless (($self->matrix =~ /none/i) || ($self->matrix =~ /null/i)) ;
$instring .= ' -method='.join(',',$self->methods) if ($self->methods) ;
}
}
my $param_string = shift;
# my ($paramfh,$parameterFile) = $self->io->tempfile;
# print $paramfh ( "$instring\n-output=gcg$param_string") ;
# close($paramfh);
# my $commandstring = "t_coffee -output=gcg -parameters $parameterFile" ; ##MJL
my $commandstring = $self->executable." $instring $param_string";
#$self->debug( "tcoffee command = $commandstring \n");
my $status = system($commandstring);
my $outfile = $self->outfile();
if( !-e $outfile || -z $outfile ) {
$self->warn( "TCoffee call crashed: $? [command $commandstring]\n");
return undef;
}
# retrieve alignment (Note: MSF format for AlignIO = GCG format of
# tcoffee)
my $in = Bio::AlignIO->new('-file' => $outfile, '-format' =>
$self->output);
my $aln = $in->next_aln();
# Replace file suffix with dnd to find name of dendrogram file(s) to delete
if( ! $self->keepdnd ) {
foreach my $f ( $infilename, $infile1, $infile2 ) {
next if( !defined $f || $f eq '');
$f =~ s/\.[^\.]*$// ;
# because TCoffee writes these files to the CWD
if( $Bio::Root::IO::PATHSEP ) {
my @line = split(/$Bio::Root::IO::PATHSEP/, $f);
$f = pop @line;
} else {
(undef, undef, $f) = File::Spec->splitpath($f);
}
unlink $f .'.dnd' if( $f ne '' );
}
}
return $aln;
}
sub _setinput {
my ($self,$input) = @_;
my ($infilename, $seq, $temp, $tfh);
# If $input is not a reference it better be the name of a
# file with the sequence/ alignment data...
my $type = '';
if (! ref $input) {
# check that file exists or throw
$infilename = $input;
unless (-e $input) {return 0;}
# let's peek and guess
open(my $IN,$infilename) || $self->throw("Cannot open $infilename");
my $header = <$IN>;
if( $header =~ /^\s+\d+\s+\d+/ ||
$header =~ /Pileup/i ||
$header =~ /clustal/i ) { # phylip
$type = 'A';
}
# On some systems, having filenames with / in them (ie. a file in a
# directory) causes t-coffee to completely fail. It warns on all systems.
# The -no_warning option solves this, but there is still some strange
# bug when doing certain profile-related things. This is magically solved
# by copying the profile file to a temp file in the current directory, so
# it its filename supplied to t-coffee contains no /
# (It's messy here - I just do this to /all/ input files to most easily
# catch all variants of providing a profile - it may only be the last
# form (isa("Bio::PrimarySeqI")) that causes a problem?)
my (undef, undef, $adjustedfilename) = File::Spec->splitpath($infilename);
if ($adjustedfilename ne $infilename) {
my ($fh, $tempfile) = $self->io->tempfile(-dir => cwd());
seek($IN, 0, 0);
while (<$IN>) {
print $fh $_;
}
close($fh);
(undef, undef, $tempfile) = File::Spec->splitpath($tempfile);
$infilename = $tempfile;
$type = 'S';
}
close($IN);
return ($infilename,$type);
} elsif (ref($input) =~ /ARRAY/i ) {
# $input may be an array of BioSeq objects...
# Open temporary file for both reading & writing of array
($tfh,$infilename) = $self->io->tempfile(-dir => cwd());
(undef, undef, $infilename) = File::Spec->splitpath($infilename);
if( ! ref($input->[0]) ) {
$self->warn("passed an array ref which did not contain objects to _setinput");
return undef;
} elsif( $input->[0]->isa('Bio::PrimarySeqI') ) {
$temp = Bio::SeqIO->new('-fh' => $tfh,
'-format' => 'fasta');
my $ct = 1;
foreach $seq (@$input) {
return 0 unless ( ref($seq) &&
$seq->isa("Bio::PrimarySeqI") );
if( ! defined $seq->display_id ||
$seq->display_id =~ /^\s+$/) {
$seq->display_id( "Seq".$ct++);
}
$temp->write_seq($seq);
}
$temp->close();
undef $temp;
close($tfh);
$tfh = undef;
$type = 'S';
} elsif( $input->[0]->isa('Bio::Align::AlignI' ) ) {
lib/Bio/Tools/Run/Alignment/TCoffee.pm view on Meta::CPAN
=head2 _numeric_version
Version "numbers" are not numeric values. In the case of T-Coffee,
they have an hash component at their end. This was not the case in
older releases. We use the first two numeric parts to do different
things though. See also issue #1.
=head2 _run
Title : _run
Usage : Internal function, not to be called directly
Function: makes actual system call to tcoffee program
Example :
Returns : nothing; tcoffee output is written to a
temporary file OR specified output file
Args : Name of a file containing a set of unaligned fasta sequences
and hash of parameters to be passed to tcoffee
=head2 _setinput
Title : _setinput
Usage : Internal function, not to be called directly
Function: Create input file for tcoffee program
Example :
Returns : name of file containing tcoffee data input AND
type of file (if known, S for sequence, L for sequence library,
A for sequence alignment)
Args : Seq or Align object reference or input file name
=head2 _setparams
Title : _setparams
Usage : Internal function, not to be called directly
Function: Create parameter inputs for tcoffee program
Example :
Returns : parameter string to be passed to tcoffee
during align or profile_align
Args : name of calling object
=head1 PARAMETERS FOR ALIGNMENT COMPUTATION
There are a number of possible parameters one can pass in TCoffee.
One should really read the online manual for the best explanation of
all the features. See
http://igs-server.cnrs-mrs.fr/~cnotred/Documentation/t_coffee/t_coffee_doc.html
These can be specified as parameters when instantiating a new TCoffee
object, or through get/set methods of the same name (lowercase).
=head2 IN
Title : IN
Description : (optional) input filename, this is specified when
align so should not use this directly unless one
understand TCoffee program very well.
=head2 TYPE
Title : TYPE
Args : [string] DNA, PROTEIN
Description : (optional) set the sequence type, guessed automatically
so should not use this directly
=head2 PARAMETERS
Title : PARAMETERS
Description : (optional) Indicates a file containing extra parameters
=head2 EXTEND
Title : EXTEND
Args : 0, 1, or positive value
Default : 1
Description : Flag indicating that library extension should be
carried out when performing multiple alignments, if set
to 0 then extension is not made, if set to 1 extension
is made on all pairs in the library. If extension is
set to another positive value, the extension is only
carried out on pairs having a weigth value superior to
the specified limit.
=head2 DP_NORMALISE
Title : DP_NORMALISE
Args : 0 or positive value
Default : 1000
Description : When using a value different from 0, this flag sets the
score of the highest scoring pair to 1000.
=head2 DP_MODE
Title : DP_MODE
Args : [string] gotoh_pair_wise, myers_miller_pair_wise,
fasta_pair_wise cfasta_pair_wise
Default : cfast_fair_wise
Description : Indicates the type of dynamic programming used by
the program
gotoh_pair_wise : implementation of the gotoh algorithm
(quadratic in memory and time)
myers_miller_pair_wise : implementation of the Myers and Miller
dynamic programming algorithm ( quadratic in time and linear in
space). This algorithm is recommended for very long sequences. It
is about 2 time slower than gotoh. It only accepts tg_mode=1.
fasta_pair_wise: implementation of the fasta algorithm. The
sequence is hashed, looking for ktuples words. Dynamic programming
is only carried out on the ndiag best scoring diagonals. This is
much faster but less accurate than the two previous.
cfasta_pair_wise : c stands for checked. It is the same
algorithm. The dynamic programming is made on the ndiag best
diagonals, and then on the 2*ndiags, and so on until the scores
converge. Complexity will depend on the level of divergence of the
sequences, but will usually be L*log(L), with an accuracy
comparable to the two first mode ( this was checked on BaliBase).
=head2 KTUPLE
Title : KTUPLE
( run in 0.420 second using v1.01-cache-2.11-cpan-5b529ec07f3 )