App-Anchr
view release on metacpan or search on metacpan
lib/App/Anchr/Command/quorum.pm view on Meta::CPAN
NC=
if tty -s < /dev/fd/1 2> /dev/null; then
GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m' # No Color
fi
log_warn () {
echo >&2 -e "${RED}==> $@ <==${NC}"
}
log_info () {
echo >&2 -e "${GREEN}==> $@${NC}"
}
log_debug () {
echo >&2 -e " * $@"
}
#----------------------------#
# masurca
#----------------------------#
set +e
# Set some paths and prime system to save environment variables
save () {
printf ". + {%s: \"%s\"}" $1 $(eval "echo -n \"\$$1\"") > jq.filter.txt
if [ -e environment.json ]; then
cat environment.json \
| jq -f jq.filter.txt \
> environment.json.new
rm environment.json
else
jq -f jq.filter.txt -n \
> environment.json.new
fi
mv environment.json.new environment.json
rm jq.filter.txt
}
signaled () {
log_warn Interrupted
exit 1
}
trap signaled TERM QUIT INT
START_TIME=$(date +%s)
save START_TIME
NUM_THREADS=[% opt.parallel %]
save NUM_THREADS
#----------------------------#
# Renaming reads
#----------------------------#
log_info 'Processing pe and/or se library reads'
rm -rf meanAndStdevByPrefix.pe.txt
echo 'pe [% opt.size %] [% opt.std %]' >> meanAndStdevByPrefix.pe.txt
if [ ! -e pe.renamed.fastq ]; then
rename_filter_fastq \
'pe' \
<(exec expand_fastq '[% args.0 %]' ) \
<(exec expand_fastq '[% args.1 %]' ) \
> 'pe.renamed.fastq'
fi
[% IF args.2 -%]
echo 'se [% opt.size %] [% opt.std %]' >> meanAndStdevByPrefix.pe.txt
if [ ! -e se.renamed.fastq ]; then
rename_filter_fastq \
'se' \
<(exec expand_fastq '[% args.2 %]' ) \
'' \
> 'se.renamed.fastq'
fi
[% END -%]
#----------------------------#
# Stats of PE reads
#----------------------------#
head -n 80000 pe.renamed.fastq > pe_data.tmp
export PE_AVG_READ_LENGTH=$(
head -n 40000 pe_data.tmp \
| grep --text -v '^+' \
| grep --text -v '^@' \
| awk '{if(length($1)>31){n+=length($1);m++;}}END{print int(n/m)}'
)
save PE_AVG_READ_LENGTH
log_debug "Average PE read length $PE_AVG_READ_LENGTH"
KMER=$(
tail -n 40000 pe_data.tmp \
| perl -e '
my @lines;
while ( my $line = <STDIN> ) {
$line = <STDIN>;
chomp($line);
push( @lines, $line );
$line = <STDIN>;
$line = <STDIN>;
}
my @legnths;
my $min_len = 100000;
my $base_count = 0;
for my $l (@lines) {
$base_count += length($l);
push( @lengths, length($l) );
for $base ( split( "", $l ) ) {
if ( uc($base) eq "G" or uc($base) eq "C" ) { $gc_count++; }
}
}
@lengths = sort { $b <=> $a } @lengths;
$min_len = $lengths[ int( $#lengths * .75 ) ];
$gc_ratio = $gc_count / $base_count;
$kmer = 0;
if ( $gc_ratio < 0.5 ) {
$kmer = int( $min_len * .7 );
}
elsif ( $gc_ratio >= 0.5 && $gc_ratio < 0.6 ) {
$kmer = int( $min_len * .5 );
}
else {
$kmer = int( $min_len * .33 );
}
$kmer++ if ( $kmer % 2 == 0 );
$kmer = 31 if ( $kmer < 31 );
$kmer = 127 if ( $kmer > 127 );
print $kmer;
' )
save KMER
log_debug "Choosing kmer size of $KMER"
MIN_Q_CHAR=$(
head -n 40000 pe_data.tmp \
| awk 'BEGIN{flag=0}{if($0 ~ /^\+/){flag=1}else if(flag==1){print $0;flag=0}}' \
| perl -ne '
BEGIN { $q0_char = "@"; }
chomp;
for $v ( split "" ) {
if ( ord($v) < ord($q0_char) ) { $q0_char = $v; }
}
END {
$ans = ord($q0_char);
if ( $ans < 64 ) { print "33\n" }
else { print "64\n" }
}
')
save MIN_Q_CHAR
log_debug "MIN_Q_CHAR: $MIN_Q_CHAR"
#----------------------------#
# Error correct PE
#----------------------------#
JF_SIZE=$( ls -l *.fastq \
| awk '{n+=$5} END{s=int(n / 50 * 1.1); if(s>[% opt.jf %])print s;else print "[% opt.jf %]";}' )
save JF_SIZE
perl -e '
if(int('$JF_SIZE') > [% opt.jf %]) {
print "WARNING: JF_SIZE set too low, increasing JF_SIZE to at least '$JF_SIZE'.\n";
}
'
if [ ! -e quorum_mer_db.jf ]; then
log_info Creating mer database for Quorum.
quorum_create_database \
-t [% opt.parallel %] \
-s $JF_SIZE -b 7 -m 24 -q $((MIN_Q_CHAR + 5)) \
-o quorum_mer_db.jf.tmp \
pe.renamed.fastq [% IF args.2 %]se.renamed.fastq [% END %]\
&& mv quorum_mer_db.jf.tmp quorum_mer_db.jf
if [ $? -ne 0 ]; then
log_warn Increase JF_SIZE by --jf, the recommendation value is genome_size*coverage/2
exit 1
fi
fi
# -m Minimum count for a k-mer to be considered "good" (1)
# -g Number of good k-mer in a row for anchor (2)
# -a Minimum count for an anchor k-mer (3)
# -w Size of window (10)
# -e Maximum number of error in a window (3)
# As we have trimmed reads with sickle, we lower `-e` to 1 from original value of 3,
# remove `--no-discard`.
# And we only want most reliable parts of the genome other than the whole genome, so dropping rare
# k-mers is totally OK for us. Raise `-m` from 1 to 3, `-g` from 1 to 3, and `-a` from 1 to 4.
if [ ! -e pe.cor.fa ]; then
log_info Error correct PE.
quorum_error_correct_reads \
-q $((MIN_Q_CHAR + 40)) \
--contaminant=[% opt.adapter %] \
-m 3 -s 1 -g 3 -a 4 -t [% opt.parallel %] -w 10 -e 1 \
quorum_mer_db.jf \
pe.renamed.fastq [% IF args.2 %]se.renamed.fastq [% END %]\
-o pe.cor --verbose 1>quorum.err 2>&1 \
|| {
mv pe.cor.fa pe.cor.fa.failed;
log_warn Error correction of PE reads failed. Check pe.cor.log.;
exit 1;
}
log_debug "Discard any reads with subs"
mv pe.cor.fa pe.cor.sub.fa
cat pe.cor.sub.fa | grep -E '^>\w+\s*$' -A 1 | sed '/^--$/d' > pe.cor.fa
fi
SUM_IN=$( faops n50 -H -N 0 -S pe.renamed.fastq [% IF args.2 %]se.renamed.fastq [% END %])
save SUM_IN
SUM_OUT=$( faops n50 -H -N 0 -S pe.cor.fa )
save SUM_OUT
#----------------------------#
# Estimating genome size.
#----------------------------#
log_info Estimating genome size.
[% IF opt.estsize == 'auto' -%]
if [ ! -e k_u_hash_0 ]; then
jellyfish count -m 31 -t [% opt.parallel %] -C -s $JF_SIZE -o k_u_hash_0 pe.cor.fa
fi
ESTIMATED_GENOME_SIZE=$(jellyfish histo -t [% opt.parallel %] -h 1 k_u_hash_0 | tail -n 1 |awk '{print $2}')
save ESTIMATED_GENOME_SIZE
log_debug "Estimated genome size: $ESTIMATED_GENOME_SIZE"
[% ELSE -%]
ESTIMATED_GENOME_SIZE=[% opt.estsize %]
save ESTIMATED_GENOME_SIZE
log_debug "You set ESTIMATED_GENOME_SIZE of $ESTIMATED_GENOME_SIZE"
[% END -%]
#----------------------------#
# Done.
#----------------------------#
END_TIME=$(date +%s)
save END_TIME
RUNTIME=$((END_TIME-START_TIME))
save RUNTIME
log_info Done.
exit 0
EOF
my $output;
$tt->process(
\$text,
{ args => $args,
opt => $opt,
},
\$output
);
print {$out_fh} $output;
close $out_fh;
}
1;
( run in 1.627 second using v1.01-cache-2.11-cpan-5b529ec07f3 )