App-Egaz

 view release on metacpan or  search on metacpan

lib/App/Egaz/Command/multiz.pm  view on Meta::CPAN

package App::Egaz::Command::multiz;
use strict;
use warnings;
use autodie;

use MCE;
use MCE::Flow;

use App::Egaz -command;
use App::Egaz::Common;

# One-line summary of this subcommand; description() reuses it as its
# opening sentence.
sub abstract {
    my $summary = 'multiz step by step';
    return $summary;
}

# Command line option specification.
#
# Returns a list of five [ spec, help text, (extra settings) ] array refs
# followed by one trailing hash ref of global settings.
# NOTE(review): presumably consumed by the App::Cmd option parser -- confirm.
sub opt_spec {
    my $outdir   = [ "outdir|o=s",   "Output directory" ];
    my $tree     = [ "tree=s",       "a rooted newick tree" ];
    my $target   = [ "target=s",     "target name, this command can automatically pick one" ];
    my $keeptmp  = [ "keeptmp",      "keep intermediate files" ];
    my $parallel = [ "parallel|p=i", "number of threads", { default => 2 } ];
    my $settings = { show_defaults => 1 };

    return ( $outdir, $tree, $target, $keeptmp, $parallel, $settings );
}

# Usage line: command name, options placeholder and positional arguments.
sub usage_desc {
    my $usage = "egaz multiz [options] <maf dir> [more dirs]";
    return $usage;
}

# Long help text: the capitalised abstract() terminated by a full stop,
# followed by a fixed block of markdown notes about inputs and the
# external `multiz` program.
sub description {
    my $headline = ucfirst( abstract() ) . ".\n";

    # Single-quoted heredoc: `$PATH` and the backticks stay literal.
    my $notes = <<'MARKDOWN';

* <maf dirs> are directories containing multiple .maf or .maf.gz files
* `multiz` should be in $PATH
* Use a modified [`multiz`](https://github.com/wang-q/multiz) supports gzipped .maf files
* [Original `multiz`](https://www.bx.psu.edu/miller_lab/)

MARKDOWN

    return $headline . $notes;
}

# Sanity-check the command line before execute() runs.
#
#   $self - command object; every problem is reported via $self->usage_error
#   $opt  - parsed options; only $opt->{tree} is looked at here
#   $args - array ref of positional arguments (the input directories)
#
# Checks, in order: at least one positional argument is present, each
# positional argument passes Path::Tiny's is_dir, and --tree (when given)
# passes Path::Tiny's is_file.
sub validate_args {
    my ( $self, $opt, $args ) = @_;

    # No positional arguments at all.
    if ( scalar @{$args} == 0 ) {
        my $found = join q{}, map { sprintf " [%s]", $_ } @{$args};
        $self->usage_error(
            "This command need one or more input directories.\n\tIt found" . $found . ".\n" );
    }

    # Every positional argument has to be a directory.
    for my $dir ( @{$args} ) {
        next if Path::Tiny::path($dir)->is_dir;
        $self->usage_error("The input directory [$dir] doesn't exist.");
    }

    # --tree is optional; when supplied it has to be a file.
    if ( $opt->{tree} ) {
        my $tree_path = Path::Tiny::path( $opt->{tree} );
        $self->usage_error("The newick tree file [$opt->{tree}] doesn't exist.")
            unless $tree_path->is_file;
    }
}

sub execute {
    my ( $self, $opt, $args ) = @_;

    #----------------------------#
    # inputs
    #----------------------------#
    my $suffix = '.maf';
    my @files  = File::Find::Rule->file->name("*$suffix")->in( @{$args} );
    if ( scalar @files == 0 ) {
        $suffix = '.maf.gz';
        @files  = sort File::Find::Rule->file->name("*$suffix")->in( @{$args} );
    }
    printf STDERR "* $suffix files: [%d]\n", scalar @files;

    if ( scalar @files == 0 ) {
        Carp::croak "Can't find .maf or .maf.gz files\n";
    }

    #----------------------------#
    # Gather species list
    #----------------------------#
    #---
    #Q_aliena:
    #  NC_020152:
    #    - t/Q_rubravsQ_aliena/mafSynNet/NC_020152.synNet.maf.gz
    #Q_aquifolioides:
    #  NC_020152:
    #    - t/Q_rubravsQ_aquifolioides/mafSynNet/NC_020152.synNet.maf.gz
    my $file_of = {};    # all info here
    my %seen;            # count
    my @potential_targets;
    my @species;         # species list gathered from maf files; and then shift target out

    {
        print STDERR "Get species list\n";
        my $worker = sub {
            my ( $self, $chunk_ref, $chunk_id ) = @_;
            my $file = $chunk_ref->[0];



( run in 1.430 second using v1.01-cache-2.11-cpan-0d23b851a93 )