App-Egaz

 view release on metacpan or  search on metacpan

lib/App/Egaz/Command/exactmatch.pm  view on Meta::CPAN

package App::Egaz::Command::exactmatch;
use strict;
use warnings;
use autodie;

use App::Egaz -command;
use App::Egaz::Common;

sub abstract {
    return 'exact matched positions in genome sequences';
}

sub opt_spec {
    return (
        [ "outfile|o=s", "Output filename. [stdout] for screen", { default => "stdout" }, ],
        [ 'length|l=i',  'match length of mummer',               { default => 100 }, ],
        [ 'discard=i',    'discard blocks with copy number larger than this', { default => 0 }, ],
        [ 'debug',        'write a file of exactmatch.json for debugging', ],
        { show_defaults => 1, }
    );
}

sub usage_desc {
    return "egaz exactmatch [options] <infile> <genome.fa>";
}

sub description {
    my $desc;
    $desc .= ucfirst(abstract) . ".\n";
    $desc .= <<'MARKDOWN';

* <infile> can't be stdin
* `mummer` or `sparsemem` should be in $PATH

MARKDOWN

    return $desc;
}

sub validate_args {
    my ( $self, $opt, $args ) = @_;

    if ( @{$args} != 2 ) {
        my $message = "This command need two input files.\n\tIt found";
        $message .= sprintf " [%s]", $_ for @{$args};
        $message .= ".\n";
        $self->usage_error($message);
    }
    for ( @{$args} ) {
        if ( !Path::Tiny::path($_)->is_file ) {
            $self->usage_error("The input file [$_] doesn't exist.");
        }
    }

}

sub execute {
    my ( $self, $opt, $args ) = @_;

    #----------------------------#
    # Run sparsemem
    #----------------------------#
    #@type Path::Tiny
    my $mem_result = Path::Tiny->tempfile;
    App::Egaz::Common::run_sparsemem( $mem_result, $args->[0], $args->[1], $opt->{length} );

    #----------------------------#
    # Parse sparsemem
    #----------------------------#
    tie my %match_of, "Tie::IxHash";
    my $fh_mem = $mem_result->openr;
    my $cur_name;
    while ( my $line = <$fh_mem> ) {
        if ( $line =~ /^\>/ ) {
            $line =~ s/^\>\s*//;
            chomp $line;
            $cur_name = $line;
        }
        elsif ( $line =~ /^\s*[\w-]+/ ) {
            chomp $line;

            #   XII    1065146         1      6369
            # ID
            # the position in the reference sequence
            # the position in the query sequence
            # the length of the match
            my @fields = grep {/\S+/} split /\s+/, $line;

            if ( @fields != 4 ) {
                warn "    Line [$line] seems wrong.\n";
                next;
            }

            if ( !exists $match_of{$cur_name} ) {
                $match_of{$cur_name} = [];
            }
            push @{ $match_of{$cur_name} }, \@fields;
        }
        else {    # Blank line, do nothing
        }



( run in 1.273 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )