App-Egaz
view release on metacpan or search on metacpan
lib/App/Egaz/Command/fas2vcf.pm view on Meta::CPAN
package App::Egaz::Command::fas2vcf;
use strict;
use warnings;
use autodie;
use App::Egaz -command;
use App::Egaz::Common;
# One-line summary of this command.
# Takes no arguments that it uses; returns a plain string.
sub abstract {
    my $summary = 'list variations in blocked fasta file';
    return $summary;
}
# Command-line option specification for this command.
# Returns a list of three option entries ([ spec, description, extras ])
# followed by one hash reference of settings.
sub opt_spec {

    # Where the result is written; defaults to the literal "stdout".
    my $outfile = [
        "outfile|o=s",
        "Output filename. [stdout] for screen",
        { default => "stdout" },
    ];

    # Optional file of names to keep.
    my $list = [ "list=s", "a list of names to keep, one per line" ];

    # Boolean flag for extra diagnostics.
    my $verbose = [ "verbose|v", "verbose mode" ];

    # Trailing settings hash: ask for defaults to be shown.
    my $settings = { show_defaults => 1 };

    return ( $outfile, $list, $verbose, $settings );
}
# Usage synopsis: the command name, an options placeholder, and the two
# positional arguments <infile> and <chr.sizes>.
sub usage_desc {
    my $synopsis = 'egaz fas2vcf [options] <infile> <chr.sizes>';
    return $synopsis;
}
# Long help text for this command.
# Returns the abstract (first letter upper-cased, followed by a full stop
# and a newline) concatenated with a fixed block of notes describing the
# stdin convention and the three processing steps.
sub description {

    # Headline derived from abstract() so the two never drift apart.
    my $headline = ucfirst( abstract() ) . ".\n";

    # Fixed notes; backticks are literal text inside this heredoc.
    my $notes = <<"MARKDOWN";
* infile == stdin means reading from STDIN
* Steps:
1. split .fas to a temp dir by `fasops split`
2. convert each fasta files to .vcf by `snp-sites`
3. concat every .vcf files by `bcftools`
MARKDOWN

    return $headline . $notes;
}
# Check the positional arguments before the command runs.
#
#   $self - command object; must provide usage_error($message)
#   $opt  - parsed options (not consulted here)
#   $args - array reference of positional arguments
#
# Reports through $self->usage_error when the number of arguments is not
# exactly two, or when an argument is neither "stdin" (compared
# case-insensitively) nor a path for which Path::Tiny's is_file is true.
sub validate_args {
    my ( $self, $opt, $args ) = @_;

    my $arg_count = scalar @{$args};
    if ( $arg_count != 2 ) {

        # Echo every argument received, each wrapped in brackets.
        my $found = join "", map { sprintf " [%s]", $_ } @{$args};
        $self->usage_error(
            "This command need two input files.\n\tIt found" . $found . ".\n" );
    }

    for my $input ( @{$args} ) {

        # "stdin" is accepted as-is and never checked on disk.
        next if lc($input) eq "stdin";
        next if Path::Tiny::path($input)->is_file;

        $self->usage_error("The input file [$input] doesn't exist.");
    }
}
sub execute {
my ( $self, $opt, $args ) = @_;
my $tempdir = Path::Tiny->tempdir( TEMPLATE => "fas2vcf_XXXXXXXX", );
my $length_of = App::RL::Common::read_sizes( $args->[1] );
{ # fasops split
my $cmd = "";
if ( $opt->{list} ) {
$cmd .= " fasops subset $args->[0] $opt->{list} --required -o stdout";
$cmd .= " |";
$cmd .= " fasops split stdin --simple -o $tempdir";
}
else {
$cmd .= "fasops split $args->[0] --simple -o $tempdir";
}
App::Egaz::Common::exec_cmd( $cmd, { verbose => $opt->{verbose}, } );
}
{ # snp-sites
my @files = $tempdir->children(qr/\.fas$/);
printf STDERR " Find %d .fas files\n", scalar @files if $opt->{verbose};
for my Path::Tiny $f (@files) {
my ( $name, $chr_name, $chr_strand, $chr_pos ) = split /\./, $f->basename(".fas");
my ( $chr_start, $chr_end ) = split /\-/, $chr_pos;
my $chr_length = $length_of->{$chr_name};
my $cmd = "snp-sites -v $f";
my @lines = split /\n/, `$cmd`;
for my $l (@lines) {
if ( $l =~ /^\#\#contig\=\<ID\=/ ) {
$l = "##contig=<ID=$chr_name,length=$chr_length>";
}
# jvarkit/biostar94573.jar uses chrUn; snp-sites uses 1
if ( $l =~ /^chrUn\t/ or $l =~ /^1\t/ ) {
my @fields = split /\t/, $l;
$fields[0] = $chr_name;
# vcf position is 1-based
$fields[1] = $chr_start + $fields[1] - 1;
( run in 1.062 second using v1.01-cache-2.11-cpan-99c4e6809bf )