Bio-NEXUS

 view release on metacpan or  search on metacpan

exec/nexfix.pl  view on Meta::CPAN

# > OTU names that contain NEXUS punctuation (i.e., (){}/\,;:=*'"`+-<>) need   #
#   to be single-quoted.  Some taxa in our files are hyphenated.  nexfix       #
#   currently checks the following commands for OTU names: TAXLABELS, MATRIX,  #
#   TAXSET.  It does not check the ADD command in the Span Block.              #
#                                                                              #
################################################################################
# Usage: #######################################################################
#                                                                              #
# nexfix.pl [-ckP] filename [filenames]                                        #
#                                                                              #
#   -c   : clobber the old files with the new ones (don't rename the old ones) #
#   -k   : keep the old files intact, renaming the new ones (default is to     #
#          rename the old ones, and write the new files using the old name)    #
#   -P   : do not move non-compliant files to a /problematic/ directory--leave #
#          them where they are                                                 #
#   -v   : send error messages to screen instead of error.log file             #
#                                                                              #
################################################################################

my $RCSId = '$Id: nexfix.pl,v 1.11 2007/02/01 04:52:09 vivek Exp $';
my $shortname = join (" v", $RCSId =~ m/(\w+.?\w+,)v (\d+\.\d+)/);

use strict;
use Data::Dumper;
use Bio::NEXUS;
use File::Copy;
use File::Path;
use Getopt::Std;

# read in the command-line options, if any; see Usage above for details
my %flags;
getopts('ckPv', \%flags) or die "ERROR: Unknown options\n";

# get list of NEXUS files to process
my @paths = @ARGV;

my $punctuation_pattern = '[\(\)\{\}\/\\,;:=\*\'"`\+\-<>]';


# loop through them
for my $path (@paths) {

    # verify that $path is a file
    unless (-e $path) {warn "'$path' is not a valid filename: skipping\n"; next };
    
    print "Processing: <$path>\n";
    
    # split the path into directory and filename components
    my ($directory, $filename) = $path =~ /(.*?)([^\/]+)$/;

    unless ($flags{v}) { 
        open(STDERR, ">> ./$directory/error.log") || die "Couldn't open error log in $directory\n";
    }
    
    # get the time stamp, so it can be reported later
    my $time = localtime time;
    
#    print STDERR "$shortname run on $time\n";
    
    # slurp in the entire NEXUS file
    my $nexus_text = do {local(@ARGV, $/) = $path; <>};
    
    # move the old file to filename.old unless the user has specified 
    # overwriting (-c) or keeping the original file intact (-k)
    move($path, "$path.old") unless ($flags{c} || $flags{k});
    
    # use an alternate filename for the new file if the user specifies keeping the original file
    if ($flags{k}) {$path = "$path.new";}
    
    # split the file into its commands (and their arguments)
    my @commands = split(/;/, $nexus_text);
    # loop through them
    my $within_history_block = 0;
    for my $command (@commands) {
        # remove previous comments that this program inserted
        $command =~ s/\[This file was checked by nexfix\.pl, v\d+\.\d+ on \w+ \w+\s+\d{1,2}\s+\d{1,2}:\d{2}:\d{2} \d{4}\]\n//;
        # if it's the beginning of the file, insert a comment documenting this processing
        $command =~ s/(#NEXUS)/$1\n\[This file was checked by $shortname on $time\]/;

    $within_history_block = 0 if ($within_history_block and $command =~/(end|endblock)/i);

        # match command, capture the arguments
        if ($command =~ /^\s*charlabels\s+(.+?)\s*$/si) {
            $command = &charlabels($1);
        } elsif ($command =~ /^\s*taxlabels\s+(.+?)\s*$/si) {
            $command = &taxlabels($1);
        } elsif ($command =~ /^\s*matrix\s+(.+?)\s*$/si) {
        $command = &matrix($1); 
        $command =  &history_matrix($command) if ($within_history_block); 
        $command = "\n\tMATRIX\n" . $command; 
        } elsif ($command =~ /^\s*taxset\s+(.+?)\s+=(.+?)\s*$/si) {
            $command = "TAXSET $1 = " . &taxset($2) . "\n";
    } elsif ($command =~ /^\s*tree\s+(.+?)\s+=(.+?)\s*$/si) {
        $command = "\nTREE $1 = " . &tree($2) . "\n";
    } elsif ($command =~ /\s*Begin\s+History/si) {
        $within_history_block = 1;
    } elsif ($command =~ /\s*format /si and $within_history_block) {
        $command .= " statesformat=frequency" if $command !~ /statesformat/;
        } #!!!!!! TO ADD MORE ERROR-CHECKING FUNCTUNALITY, PUT AN ELSIF HERE !!!!!!

    }
    # join the commands back together
    $nexus_text = join(";", @commands);
    # open a FH, write out the altered text
    open( my $fh, ">$path" ) || die "Can't create $path $!" ;
    print $fh $nexus_text;
    close $fh;
    
    # to make sure that the new file is a well-formed NEXUS file, read it in and
    # write it back out with NEXPL.  This is done using a system call so that
    # die commands within NEXPL do not kill this process.

    if ((my $retval = system("perl -MBio::NEXUS -e 'new Bio::NEXUS(\"$path\")->write(\"$path\")'")) == 0) {
        # system command finished properly
        warn "File: <$path> has been validated and written.\n";
    } elsif ( $retval == 2 ) {
        # if the system call returned a value of 2, it's because the process was 
        # interrupted by a SIGINT (such as Ctrl-C)
        warn "Processing of File: <$path> interrupted; file written but not validated\n";
        print "\n$shortname interrupted by SIGINT\n" unless $flags{v};
        die "$shortname interrupted by SIGINT\n";



( run in 0.761 second using v1.01-cache-2.11-cpan-5a3173703d6 )