App-Dazz

 view release on metacpan or  search on metacpan

lib/App/Dazz/Command/group.pm  view on Meta::CPAN

package App::Dazz::Command::group;
use strict;
use warnings;
use autodie;

use App::Dazz -command;
use App::Dazz::Common;

# One-line summary of this command; description() reuses it as the first
# sentence of the long help text.
use constant {
    abstract => "group anchors by long reads",
};

# Declare the command-line options of `dazz group`.
#
# Returns a list of option specifications, each an array reference of
# [ getopt spec, help text, optional constraint/default hash ], followed by a
# final hash reference carrying the `show_defaults` setting.
sub opt_spec {
    my @options = (
        [ "dir|d=s",      "output directory", ],
        [ "range|r=s",    "ranges of anchors",            { required => 1 }, ],
        [ "coverage|c=i", "minimal coverage",             { default  => 2 }, ],
        [ "max=i",        "max distance",                 { default  => 5000 }, ],
        [ "len|l=i",      "minimal length of overlaps",   { default  => 1000 }, ],
        [ "idt|i=f",      "minimal identity of overlaps", { default  => 0.85 }, ],
        [ "keep",         "don't remove multi-matched reads", ],
        [ 'oa=s',         'overlaps between anchors', ],
        [ "parallel|p=i", "number of threads",            { default  => 4 }, ],
        [ "verbose|v",    "verbose mode", ],
        [ "png",          "write a png file via graphviz", ],
    );

    # The settings hash must stay the last element of the returned list.
    my %settings = ( show_defaults => 1, );

    return ( @options, \%settings );
}


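# A common artifact in third-generation (long) reads: within the sequencing
# output of a single ZMW, the adapter portion is sometimes read with many
# errors, so the adapter is not removed cleanly. The resulting subread then
# contains several copies of the same genomic fragment, separated by adapter
# sequence.
#
# By default `dazz group` discards such reads. The `--keep` option retains
# them, which is appropriate for already-assembled long-read sequences.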
#
# ```text
#       ===
# ------------>
#              )
#   <----------
#       ===
# ```

# Usage line for this command: options first, then two positional arguments
# (the dazz DB and the overlap file).
sub usage_desc {
    my $usage = "dazz group [options] <dazz DB> <ovlp file>";
    return $usage;
}

# Build the long help text: the abstract with its first letter upper-cased and
# a trailing period, followed by a tab-indented note about the dazz db.
sub description {
    my @parts = (
        ucfirst(abstract) . ".\n",
        "\tThis command relies on an existing dazz db.\n",
    );
    return join q{}, @parts;
}

sub validate_args {
    my ( $self, $opt, $args ) = @_;

    if ( @{$args} != 2 ) {
        my $message = "This command need one or more input files.\n\tIt found";
        $message .= sprintf " [%s]", $_ for @{$args};
        $message .= ".\n";
        $self->usage_error($message);
    }
    for ( @{$args} ) {
        if ( !Path::Tiny::path($_)->is_file ) {
            $self->usage_error("The input file [$_] doesn't exist.");
        }
    }

    if ( !AlignDB::IntSpan->valid( $opt->{range} ) ) {
        $self->usage_error("Invalid --range [$opt->{range}]\n");
    }

    if ( $opt->{oa} ) {
        if ( !Path::Tiny::path( $opt->{oa} )->is_file ) {
            $self->usage_error("The overlap file [$opt->{oa}] doesn't exist.\n");
        }
    }

    if ( !exists $opt->{dir} ) {
        $opt->{dir}
            = Path::Tiny::path( $args->[0] )->parent->child("group")->absolute->stringify;
    }



( run in 0.889 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )