App-Anchr
view release on metacpan or search on metacpan
doc/e_coli.md view on Meta::CPAN
| Q30L90X160P000 | 742.66M | 160.0 | 44646 | 4.55M | 198 | 44646 | 4.53M | 177 | 975 | 20.46K | 21 | "31,41,51,61,71,81" | 0:16'17'' | 0:01'13'' |
| Q30L90X200P000 | 928.33M | 200.0 | 46294 | 4.55M | 190 | 48130 | 4.53M | 170 | 945 | 20K | 20 | "31,41,51,61,71,81" | 0:19'49'' | 0:01'17'' |
| Q30L120X40P000 | 185.67M | 40.0 | 10194 | 4.48M | 771 | 10531 | 4.37M | 650 | 853 | 108.66K | 121 | "31,41,51,61,71,81" | 0:05'50'' | 0:00'51'' |
| Q30L120X40P001 | 185.67M | 40.0 | 9937 | 4.48M | 766 | 10141 | 4.38M | 649 | 837 | 102.24K | 117 | "31,41,51,61,71,81" | 0:05'33'' | 0:00'26'' |
| Q30L120X40P002 | 185.67M | 40.0 | 21129 | 4.54M | 390 | 21129 | 4.52M | 357 | 802 | 25.63K | 33 | "31,41,51,61,71,81" | 0:05'37'' | 0:00'41'' |
| Q30L120X80P000 | 371.33M | 80.0 | 16109 | 4.53M | 498 | 16360 | 4.48M | 435 | 822 | 55.11K | 63 | "31,41,51,61,71,81" | 0:08'57'' | 0:00'36'' |
| Q30L120X120P000 | 557M | 120.0 | 35814 | 4.55M | 240 | 35814 | 4.52M | 216 | 976 | 24.08K | 24 | "31,41,51,61,71,81" | 0:12'34'' | 0:00'38'' |
| Q35L30X40P000 | 185.67M | 40.0 | 6751 | 4.51M | 1071 | 6955 | 4.36M | 865 | 762 | 152.52K | 206 | "31,41,51,61,71,81" | 0:03'02'' | 0:00'32'' |
| Q35L30X40P001 | 185.67M | 40.0 | 6571 | 4.51M | 1112 | 6753 | 4.35M | 895 | 767 | 163.46K | 217 | "31,41,51,61,71,81" | 0:02'57'' | 0:00'32'' |
| Q35L30X40P002 | 185.67M | 40.0 | 6716 | 4.53M | 1104 | 6929 | 4.34M | 867 | 783 | 188.15K | 237 | "31,41,51,61,71,81" | 0:02'50'' | 0:00'30'' |
| Q35L30X80P000 | 371.33M | 80.0 | 11480 | 4.55M | 665 | 11787 | 4.47M | 560 | 781 | 88.42K | 105 | "31,41,51,61,71,81" | 0:04'51'' | 0:00'48'' |
| Q35L30X120P000 | 557M | 120.0 | 14990 | 4.56M | 530 | 15235 | 4.49M | 449 | 844 | 74.65K | 81 | "31,41,51,61,71,81" | 0:06'10'' | 0:00'48'' |
| Q35L60X40P000 | 185.67M | 40.0 | 2132 | 3.76M | 2304 | 2583 | 2.84M | 1237 | 805 | 918.74K | 1067 | "31,41,51,61,71,81" | 0:02'27'' | 0:00'43'' |
## Merge anchors with Qxx, Lxx and QxxLxx
```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}
# merge anchors with Qxx
for Q in 20 25 30 35; do
mkdir -p mergeQ${Q}
anchr contained \
$(
parallel -k --no-run-if-empty -j 6 '
if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
fi
' ::: ${Q} ::: 30 60 90 120 ::: 40 80 120 160 200 ::: 000 001 002 003 004 005
) \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin mergeQ${Q}/anchor.contained.fasta
anchr orient mergeQ${Q}/anchor.contained.fasta --len 1000 --idt 0.98 -o mergeQ${Q}/anchor.orient.fasta
anchr merge mergeQ${Q}/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
| faops filter -a 1000 -l 0 stdin mergeQ${Q}/anchor.merge.fasta
done
# merge anchors with Lxx
for L in 30 60 90 120; do
mkdir -p mergeL${L}
anchr contained \
$(
parallel -k --no-run-if-empty -j 6 '
if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
fi
' ::: 20 25 30 35 ::: ${L} ::: 40 80 120 160 200 ::: 000 001 002 003 004 005
) \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin mergeL${L}/anchor.contained.fasta
anchr orient mergeL${L}/anchor.contained.fasta --len 1000 --idt 0.98 -o mergeL${L}/anchor.orient.fasta
anchr merge mergeL${L}/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
| faops filter -a 1000 -l 0 stdin mergeL${L}/anchor.merge.fasta
done
# quast
rm -fr 9_qa_mergeQL
quast --no-check --threads 16 \
-R 1_genome/genome.fa \
mergeQ20/anchor.merge.fasta \
mergeQ25/anchor.merge.fasta \
mergeQ30/anchor.merge.fasta \
mergeQ35/anchor.merge.fasta \
mergeL30/anchor.merge.fasta \
mergeL60/anchor.merge.fasta \
mergeL90/anchor.merge.fasta \
mergeL120/anchor.merge.fasta \
1_genome/paralogs.fas \
--label "mergeQ20,mergeQ25,mergeQ30,mergeQ35,mergeL30,mergeL60,mergeL90,mergeL120,paralogs" \
-o 9_qa_mergeQL
# merge anchors with QxxLxx
for Q in 20 25 30; do
for L in 30 60 90; do
mkdir -p mergeQ${Q}L${L}
anchr contained \
$(
parallel -k --no-run-if-empty -j 6 '
if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
fi
' ::: ${Q} ::: ${L} ::: 40 80 120 160 200 ::: 000 001 002 003 004 005
) \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin mergeQ${Q}L${L}/anchor.contained.fasta
anchr orient mergeQ${Q}L${L}/anchor.contained.fasta --len 1000 --idt 0.98 -o mergeQ${Q}L${L}/anchor.orient.fasta
anchr merge mergeQ${Q}L${L}/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
| faops filter -a 1000 -l 0 stdin mergeQ${Q}L${L}/anchor.merge.fasta
done
done
# quast
rm -fr 9_qa_mergeQxxLxx
quast --no-check --threads 16 \
-R 1_genome/genome.fa \
mergeQ20L30/anchor.merge.fasta \
mergeQ20L60/anchor.merge.fasta \
mergeQ20L90/anchor.merge.fasta \
mergeQ25L30/anchor.merge.fasta \
mergeQ25L60/anchor.merge.fasta \
mergeQ25L90/anchor.merge.fasta \
mergeQ30L30/anchor.merge.fasta \
mergeQ30L60/anchor.merge.fasta \
mergeQ30L90/anchor.merge.fasta \
1_genome/paralogs.fas \
--label "mergeQ20L30,mergeQ20L60,mergeQ20L90,mergeQ25L30,mergeQ25L60,mergeQ25L90,mergeQ30L30,mergeQ30L60,mergeQ30L90,paralogs" \
-o 9_qa_mergeQxxLxx
```
## Merge anchors
```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}
# merge anchors
mkdir -p merge
anchr contained \
$(
parallel -k --no-run-if-empty -j 6 "
if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
fi
" ::: 25 30 ::: 60 ::: 40 80 120 160 200 ::: 000 001 002 003 004 005
) \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin merge/anchor.contained.fasta
anchr orient merge/anchor.contained.fasta --len 1000 --idt 0.98 -o merge/anchor.orient.fasta
anchr merge merge/anchor.orient.fasta --len 1000 --idt 0.999 -o merge/anchor.merge0.fasta
anchr contained merge/anchor.merge0.fasta --len 1000 --idt 0.98 \
--proportion 0.99 --parallel 16 -o stdout \
| faops filter -a 1000 -l 0 stdin merge/anchor.merge1.fasta
faops order merge/anchor.merge1.fasta \
<(faops size merge/anchor.merge1.fasta | sort -n -r -k2,2 | cut -f 1) \
merge/anchor.merge.fasta
# merge others
mkdir -p merge
anchr contained \
$(
parallel -k --no-run-if-empty -j 6 "
if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.others.fa ]; then
echo Q{1}L{2}X{3}P{4}/anchor/pe.others.fa
fi
" ::: 25 30 ::: 60 ::: 40 80 120 160 200 ::: 000 001 002 003 004 005
) \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin merge/others.contained.fasta
anchr orient merge/others.contained.fasta --len 1000 --idt 0.98 -o merge/others.orient.fasta
anchr merge merge/others.orient.fasta --len 1000 --idt 0.999 -o stdout \
| faops filter -a 1000 -l 0 stdin merge/others.merge.fasta
# anchor sort on ref
bash ~/Scripts/cpan/App-Anchr/share/sort_on_ref.sh merge/anchor.merge.fasta 1_genome/genome.fa merge/anchor.sort
# mummerplot files
nucmer -l 200 1_genome/genome.fa merge/anchor.sort.fa
mummerplot out.delta --png --large -p anchor.sort
rm *.[fr]plot
rm out.delta
rm *.gp
mv anchor.sort.png merge/
# minidot
minimap merge/anchor.sort.fa 1_genome/genome.fa \
| minidot - > merge/anchor.minidot.eps
# quast
rm -fr 9_qa_merge
quast --no-check --threads 16 \
-R 1_genome/genome.fa \
8_spades/contigs.fasta \
8_spades/scaffolds.fasta \
8_spades_Q30L60/scaffolds.fasta \
8_platanus/out_contig.fa \
8_platanus/out_gapClosed.fa \
8_platanus_quorum/out_gapClosed.fa \
8_platanus_Q30L60/out_gapClosed.fa \
merge/anchor.merge.fasta \
merge/scaffold/out_scaffold.fa \
merge/scaffold/out_gapClosed.fa \
1_genome/paralogs.fas \
--label "spades.contig,spades.scaffold,spades_Q30L60,platanus.contig,platanus.scaffold,platanus_quorum,platanus_Q30L60,merge,merge.scaffold,merge.gapClosed,paralogs" \
-o 9_qa_merge
```
## Scaffolding with PE
```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}
# PE
mkdir -p merge/scaffold
cd merge/scaffold
if [ ! -e pe.fa ]; then
faops interleave \
-p pe \
../../2_illumina/Q25L60/R1.fq.gz \
../../2_illumina/Q25L60/R2.fq.gz \
> pe.fa
fi
anchr scaffold \
../anchor.merge.fasta \
pe.fa \
-p 8 \
-o scaffold.sh
bash scaffold.sh
```
## Different K values
```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}
# oriR1: 67; oriR2: 43; Q30L60: 71
parallel -j 3 "
mkdir -p Q25L60K{}
cd Q25L60K{}
anchr kunitigs \
../2_illumina/Q25L60X40P000/pe.cor.fa \
../2_illumina/Q25L60X40P000/environment.json \
-p 8 \
--kmer {} \
-o kunitigs.sh
bash kunitigs.sh
rm -fr anchor
mkdir -p anchor
cd anchor
anchr anchors \
../k_unitigs.fasta \
../pe.cor.fa \
-p 8 \
-o anchors.sh
bash anchors.sh
" ::: 21 31 41 43 51 61 67 71 81 91 101 111 121
mkdir -p Q25L60Kmerge
anchr contained \
Q25L60K21/anchor/pe.anchor.fa \
Q25L60K31/anchor/pe.anchor.fa \
Q25L60K41/anchor/pe.anchor.fa \
Q25L60K43/anchor/pe.anchor.fa \
Q25L60K51/anchor/pe.anchor.fa \
Q25L60K61/anchor/pe.anchor.fa \
Q25L60K67/anchor/pe.anchor.fa \
Q25L60K71/anchor/pe.anchor.fa \
Q25L60K81/anchor/pe.anchor.fa \
Q25L60K91/anchor/pe.anchor.fa \
Q25L60K101/anchor/pe.anchor.fa \
Q25L60K111/anchor/pe.anchor.fa \
Q25L60K121/anchor/pe.anchor.fa \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin Q25L60Kmerge/anchor.contained.fasta
anchr orient Q25L60Kmerge/anchor.contained.fasta --len 1000 --idt 0.98 -o Q25L60Kmerge/anchor.orient.fasta
anchr merge Q25L60Kmerge/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
| faops filter -a 1000 -l 0 stdin Q25L60Kmerge/anchor.merge.fasta
rm -fr 9_qa_kmer_Q25
quast --no-check --threads 16 \
-R 1_genome/genome.fa \
Q25L60K21/anchor/pe.anchor.fa \
Q25L60K31/anchor/pe.anchor.fa \
Q25L60K41/anchor/pe.anchor.fa \
Q25L60K43/anchor/pe.anchor.fa \
Q25L60K51/anchor/pe.anchor.fa \
Q25L60K61/anchor/pe.anchor.fa \
Q25L60K67/anchor/pe.anchor.fa \
Q25L60K71/anchor/pe.anchor.fa \
Q25L60K81/anchor/pe.anchor.fa \
Q25L60K91/anchor/pe.anchor.fa \
Q25L60K101/anchor/pe.anchor.fa \
Q25L60K111/anchor/pe.anchor.fa \
Q25L60K121/anchor/pe.anchor.fa \
Q25L60X40P000/anchor/pe.anchor.fa \
Q25L60Kmerge/anchor.merge.fasta \
1_genome/paralogs.fas \
--label "Q25L60K21,Q25L60K31,Q25L60K41,Q25L60K43,Q25L60K51,Q25L60K61,Q25L60K67,Q25L60K71,Q25L60K81,Q25L60K91,Q25L60K101,Q25L60K111,Q25L60K121,Q25L60X40P000,Q25L60Kmerge,paralogs" \
-o 9_qa_kmer_Q25
parallel -j 3 "
mkdir -p Q30L60K{}
cd Q30L60K{}
anchr kunitigs \
../2_illumina/Q30L60X40P000/pe.cor.fa \
../2_illumina/Q30L60X40P000/environment.json \
-p 8 \
--kmer {} \
-o kunitigs.sh
bash kunitigs.sh
rm -fr anchor
mkdir -p anchor
cd anchor
anchr anchors \
../k_unitigs.fasta \
../pe.cor.fa \
-p 8 \
-o anchors.sh
bash anchors.sh
" ::: 21 31 41 43 51 61 67 71 81 91 101 111 121
mkdir -p Q30L60Kmerge
anchr contained \
Q30L60K21/anchor/pe.anchor.fa \
Q30L60K31/anchor/pe.anchor.fa \
Q30L60K41/anchor/pe.anchor.fa \
Q30L60K43/anchor/pe.anchor.fa \
Q30L60K51/anchor/pe.anchor.fa \
Q30L60K61/anchor/pe.anchor.fa \
Q30L60K67/anchor/pe.anchor.fa \
Q30L60K71/anchor/pe.anchor.fa \
Q30L60K81/anchor/pe.anchor.fa \
Q30L60K91/anchor/pe.anchor.fa \
Q30L60K101/anchor/pe.anchor.fa \
Q30L60K111/anchor/pe.anchor.fa \
Q30L60K121/anchor/pe.anchor.fa \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin Q30L60Kmerge/anchor.contained.fasta
anchr orient Q30L60Kmerge/anchor.contained.fasta --len 1000 --idt 0.98 -o Q30L60Kmerge/anchor.orient.fasta
anchr merge Q30L60Kmerge/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
| faops filter -a 1000 -l 0 stdin Q30L60Kmerge/anchor.merge.fasta
rm -fr 9_qa_kmer_Q30
quast --no-check --threads 16 \
-R 1_genome/genome.fa \
Q30L60K21/anchor/pe.anchor.fa \
Q30L60K31/anchor/pe.anchor.fa \
Q30L60K41/anchor/pe.anchor.fa \
Q30L60K43/anchor/pe.anchor.fa \
Q30L60K51/anchor/pe.anchor.fa \
Q30L60K61/anchor/pe.anchor.fa \
Q30L60K67/anchor/pe.anchor.fa \
Q30L60K71/anchor/pe.anchor.fa \
Q30L60K81/anchor/pe.anchor.fa \
Q30L60K91/anchor/pe.anchor.fa \
Q30L60K101/anchor/pe.anchor.fa \
Q30L60K111/anchor/pe.anchor.fa \
Q30L60K121/anchor/pe.anchor.fa \
Q30L60X40P000/anchor/pe.anchor.fa \
Q30L60Kmerge/anchor.merge.fasta \
1_genome/paralogs.fas \
--label "Q30L60K21,Q30L60K31,Q30L60K41,Q30L60K43,Q30L60K51,Q30L60K61,Q30L60K67,Q30L60K71,Q30L60K81,Q30L60K91,Q30L60K101,Q30L60K111,Q30L60K121,Q30L60X40P000,Q30L60Kmerge,paralogs" \
-o 9_qa_kmer_Q30
# stat2
REAL_G=4641652
bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 header \
> statK2.md
parallel -k --no-run-if-empty -j 6 "
bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 {1}K{2} ${REAL_G}
" ::: Q25L60 Q30L60 ::: 21 31 41 43 51 61 67 71 81 91 101 111 121 \
>> statK2.md
```
| Name | SumCor | CovCor | N50SR | Sum | # | N50Anchor | Sum | # | N50Others | Sum | # | Kmer | RunTimeKU | RunTimeAN |
|:-----------|--------:|-------:|------:|------:|-----:|----------:|--------:|-----:|----------:|--------:|-----:|------:|----------:|:----------|
| Q25L60K21 | 185.67M | 40.0 | 7089 | 4.44M | 999 | 7305 | 4.31M | 827 | 759 | 127.25K | 172 | "21" | 0:00'48'' | 0:00'33'' |
| Q25L60K31 | 185.67M | 40.0 | 14206 | 4.5M | 500 | 14262 | 4.47M | 462 | 801 | 29.46K | 38 | "31" | 0:00'47'' | 0:00'33'' |
| Q25L60K41 | 185.67M | 40.0 | 18971 | 4.52M | 387 | 19351 | 4.51M | 362 | 804 | 18.97K | 25 | "41" | 0:00'47'' | 0:00'33'' |
| Q25L60K43 | 185.67M | 40.0 | 21290 | 4.53M | 366 | 21290 | 4.51M | 343 | 808 | 17.3K | 23 | "43" | 0:00'46'' | 0:00'35'' |
| Q25L60K51 | 185.67M | 40.0 | 26192 | 4.53M | 334 | 26195 | 4.51M | 309 | 720 | 17.93K | 25 | "51" | 0:00'45'' | 0:00'34'' |
| Q25L60K61 | 185.67M | 40.0 | 23820 | 4.54M | 344 | 24492 | 4.52M | 319 | 706 | 17.67K | 25 | "61" | 0:00'46'' | 0:00'33'' |
| Q25L60K67 | 185.67M | 40.0 | 21564 | 4.54M | 387 | 21613 | 4.52M | 359 | 754 | 20.67K | 28 | "67" | 0:00'42'' | 0:00'35'' |
| Q25L60K71 | 185.67M | 40.0 | 20891 | 4.55M | 403 | 20891 | 4.52M | 370 | 754 | 24.42K | 33 | "71" | 0:00'42'' | 0:00'35'' |
| Q25L60K81 | 185.67M | 40.0 | 15295 | 4.55M | 529 | 15393 | 4.51M | 476 | 757 | 39.55K | 53 | "81" | 0:00'40'' | 0:00'35'' |
| Q25L60K91 | 185.67M | 40.0 | 9179 | 4.56M | 818 | 9314 | 4.48M | 703 | 789 | 85.19K | 115 | "91" | 0:00'37'' | 0:00'36'' |
| Q25L60K101 | 185.67M | 40.0 | 5239 | 4.54M | 1393 | 5449 | 4.32M | 1081 | 764 | 227.82K | 312 | "101" | 0:00'36'' | 0:00'36'' |
| Q25L60K111 | 185.67M | 40.0 | 2453 | 4.39M | 2418 | 2852 | 3.72M | 1494 | 754 | 668.43K | 924 | "111" | 0:00'34'' | 0:00'35'' |
| Q25L60K121 | 185.67M | 40.0 | 1146 | 3.59M | 3369 | 1734 | 2.08M | 1217 | 724 | 1.51M | 2152 | "121" | 0:00'20'' | 0:00'21'' |
| Q30L60K21 | 185.67M | 40.0 | 7464 | 4.44M | 948 | 7708 | 4.32M | 788 | 760 | 119.55K | 160 | "21" | 0:00'50'' | 0:00'36'' |
| Q30L60K31 | 185.67M | 40.0 | 15791 | 4.5M | 447 | 15800 | 4.48M | 412 | 801 | 27.53K | 35 | "31" | 0:00'50'' | 0:00'36'' |
| Q30L60K41 | 185.67M | 40.0 | 21978 | 4.52M | 378 | 22693 | 4.5M | 348 | 754 | 22.49K | 30 | "41" | 0:00'49'' | 0:00'35'' |
| Q30L60K43 | 185.67M | 40.0 | 23150 | 4.53M | 371 | 23535 | 4.5M | 338 | 813 | 25.78K | 33 | "43" | 0:00'44'' | 0:00'34'' |
| Q30L60K51 | 185.67M | 40.0 | 21553 | 4.53M | 380 | 21581 | 4.51M | 346 | 782 | 26K | 34 | "51" | 0:00'44'' | 0:00'35'' |
| Q30L60K61 | 185.67M | 40.0 | 17816 | 4.54M | 477 | 17876 | 4.5M | 428 | 792 | 37.13K | 49 | "61" | 0:00'45'' | 0:00'35'' |
| Q30L60K67 | 185.67M | 40.0 | 13527 | 4.54M | 592 | 13936 | 4.49M | 530 | 785 | 46.6K | 62 | "67" | 0:00'40'' | 0:00'36'' |
| Q30L60K71 | 185.67M | 40.0 | 10956 | 4.54M | 726 | 11268 | 4.48M | 640 | 757 | 63.27K | 86 | "71" | 0:00'39'' | 0:00'36'' |
| Q30L60K81 | 185.67M | 40.0 | 6773 | 4.53M | 1094 | 6982 | 4.39M | 906 | 760 | 138.44K | 188 | "81" | 0:00'39'' | 0:00'36'' |
| Q30L60K91 | 185.67M | 40.0 | 3702 | 4.45M | 1787 | 4021 | 4.08M | 1292 | 761 | 361.39K | 495 | "91" | 0:00'35'' | 0:00'34'' |
| Q30L60K101 | 185.67M | 40.0 | 1829 | 4.13M | 2788 | 2227 | 3.23M | 1527 | 744 | 907.41K | 1261 | "101" | 0:00'33'' | 0:00'32'' |
| Q30L60K111 | 185.67M | 40.0 | 1027 | 3.11M | 3144 | 1636 | 1.6M | 965 | 710 | 1.51M | 2179 | "111" | 0:00'31'' | 0:00'29'' |
doc/e_coli.md view on Meta::CPAN
## 3GS
* Canu
```bash
BASE_NAME=e_coli
REAL_G=4641652
cd ${HOME}/data/anchr/${BASE_NAME}
canu \
-p ${BASE_NAME} -d canu-raw-20x \
gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
genomeSize=${REAL_G} \
-pacbio-raw 3_pacbio/pacbio.20x.fasta
canu \
-p ${BASE_NAME} -d canu-raw-40x \
gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
genomeSize=${REAL_G} \
-pacbio-raw 3_pacbio/pacbio.40x.fasta
canu \
-p ${BASE_NAME} -d canu-raw-80x \
gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
genomeSize=${REAL_G} \
-pacbio-raw 3_pacbio/pacbio.80x.fasta
canu \
-p ${BASE_NAME} -d canu-raw \
gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
genomeSize=${REAL_G} \
-pacbio-raw 3_pacbio/pacbio.fasta
canu \
-p ${BASE_NAME} -d canu-trim-20x \
gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
genomeSize=${REAL_G} \
-pacbio-raw 3_pacbio/pacbio.20x.trim.fasta
canu \
-p ${BASE_NAME} -d canu-trim-40x \
gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
genomeSize=${REAL_G} \
-pacbio-raw 3_pacbio/pacbio.40x.trim.fasta
canu \
-p ${BASE_NAME} -d canu-trim-80x \
gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
genomeSize=${REAL_G} \
-pacbio-raw 3_pacbio/pacbio.80x.trim.fasta
canu \
-p ${BASE_NAME} -d canu-trim \
gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
genomeSize=${REAL_G} \
-pacbio-raw 3_pacbio/pacbio.trim.fasta
# quast
rm -fr 9_qa_canu
quast --no-check --threads 16 \
--eukaryote \
-R 1_genome/genome.fa \
canu-raw-20x/${BASE_NAME}.contigs.fasta \
canu-trim-20x/${BASE_NAME}.contigs.fasta \
canu-raw-40x/${BASE_NAME}.contigs.fasta \
canu-trim-40x/${BASE_NAME}.contigs.fasta \
canu-raw-80x/${BASE_NAME}.contigs.fasta \
canu-trim-80x/${BASE_NAME}.contigs.fasta \
canu-raw/${BASE_NAME}.contigs.fasta \
canu-trim/${BASE_NAME}.contigs.fasta \
1_genome/paralogs.fas \
--label "20x,20x.trim,40x,40x.trim,80x,80x.trim,raw,trim,paralogs" \
-o 9_qa_canu
faops n50 -S -C canu-raw-20x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-trim-20x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-raw-40x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-trim-40x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-raw-80x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-trim-80x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-raw/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-trim/${BASE_NAME}.trimmedReads.fasta.gz
find . -type d -name "correction" -path "*canu-*" | xargs rm -fr
```
* miniasm
* `-S skip self and dual mappings`
* `-w INT minizer window size [{-k}*2/3]`
* `-L INT min matching length [40]`
* `-m FLOAT merge two chains if FLOAT fraction of minimizers are shared [0.50]`
* `-t INT number of threads [3]`
```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}
mkdir -p miniasm
minimap -Sw5 -L100 -m0 -t16 \
3_pacbio/pacbio.40x.fasta 3_pacbio/pacbio.40x.fasta \
> miniasm/pacbio.40x.paf
miniasm miniasm/pacbio.40x.paf > miniasm/utg.noseq.gfa
miniasm -f 3_pacbio/pacbio.40x.fasta miniasm/pacbio.40x.paf \
> miniasm/utg.gfa
awk '/^S/{print ">"$2"\n"$3}' miniasm/utg.gfa > miniasm/utg.fa
minimap 1_genome/genome.fa miniasm/utg.fa | minidot - > miniasm/utg.eps
```
```bash
#real 0m19.504s
#user 1m11.237s
#sys 0m18.500s
time anchr paf2ovlp --parallel 16 miniasm/pacbio.40x.paf -o miniasm/pacbio.40x.ovlp.tsv
#real 0m19.451s
#user 0m43.343s
#sys 0m9.734s
time anchr paf2ovlp --parallel 4 miniasm/pacbio.40x.paf -o miniasm/pacbio.40x.ovlp.tsv
#real 0m17.324s
#user 0m9.276s
#sys 1m23.833s
time jrange covered miniasm/pacbio.40x.paf --longest --paf -o miniasm/pacbio.40x.pos.txt
```
## Local corrections
```bash
BASE_NAME=e_coli
REAL_G=4641652
cd ${HOME}/data/anchr/${BASE_NAME}
rm -fr localCor
anchr overlap2 \
--parallel 16 \
merge/anchor.merge.fasta \
3_pacbio/pacbio.40x.trim.fasta \
-d localCor \
-b 10 --len 1000 --idt 0.85 --all
pushd localCor
anchr cover \
--range "1-$(faops n50 -H -N 0 -C anchor.fasta)" \
--len 1000 --idt 0.85 -c 2 \
anchorLong.ovlp.tsv \
-o anchor.cover.json
doc/e_coli.md view on Meta::CPAN
anchorLong.ovlp.tsv \
--parallel 16 \
--range $(cat environment.json | jq -r '.TRUSTED') \
--len 1000 --idt 0.85 --trim -v
faops some -i -l 0 \
long.fasta \
group/overlapped.long.txt \
independentLong.fasta
# localCor
gzip -d -c -f $(find group -type f -name "*.correctedReads.fasta.gz") \
| faops filter -l 0 stdin stdout \
| grep -E '^>long' -A 1 \
| sed '/^--$/d' \
| faops dazz -a -l 0 stdin stdout \
| pigz -c > localCor.fasta.gz
canu \
-p ${BASE_NAME} -d localCor \
gnuplotTested=true \
genomeSize=${REAL_G} \
-pacbio-corrected localCor.fasta.gz \
-pacbio-corrected anchor.fasta
canu \
-p ${BASE_NAME} -d localCorIndep \
gnuplotTested=true \
genomeSize=${REAL_G} \
-pacbio-raw localCor.fasta.gz \
-pacbio-raw anchor.fasta \
-pacbio-raw independentLong.fasta
# localTrim
gzip -d -c -f $(find group -type f -name "*.trimmedReads.fasta.gz") \
| faops filter -l 0 stdin stdout \
| grep -E '^>long' -A 1 \
| sed '/^--$/d' \
| faops dazz -a -l 0 stdin stdout \
| pigz -c > localTrim.fasta.gz
canu \
-p ${BASE_NAME} -d localTrim \
gnuplotTested=true \
genomeSize=${REAL_G} \
-pacbio-corrected localCor.fasta.gz \
-pacbio-corrected anchor.fasta
# globalTrim
canu -assemble \
-p ${BASE_NAME} -d globalTrim \
gnuplotTested=true \
genomeSize=${REAL_G} \
-pacbio-corrected ../canu-raw-40x/${BASE_NAME}.trimmedReads.fasta.gz \
-pacbio-corrected anchor.fasta
popd
# quast
rm -fr 9_qa_localCor
quast --no-check --threads 16 \
--eukaryote \
-R 1_genome/genome.fa \
localCor/anchor.fasta \
localCor/localCor/${BASE_NAME}.contigs.fasta \
localCor/localCorIndep/${BASE_NAME}.contigs.fasta \
localCor/localTrim/${BASE_NAME}.contigs.fasta \
localCor/globalTrim/${BASE_NAME}.contigs.fasta \
canu-raw-40x/${BASE_NAME}.contigs.fasta \
canu-trim-40x/${BASE_NAME}.contigs.fasta \
1_genome/paralogs.fas \
--label "anchor,localCor,localCorIndep,localTrim,globalTrim,40x,40x.trim,paralogs" \
-o 9_qa_localCor
find . -type d -name "correction" | xargs rm -fr
```
## Expand anchors
ä¸ä»£ reads éæä¸ä¸ªå¸¸è§çé误, å³åä¸ ZMW éçæµåºç»æä¸, æ¥å¤´åºåé¨åçæµåºç»æåºç°äºè¾å¤çé误,
å æ¤å¹¶æ²¡æå°æ¥å¤´åºåå»é¤å¹²å, å½¢æç subreads é嫿å¤ä»½åºå ç»ä¸åä¸ç段, å®ä»¬ä¹é´ä»¥æ¥å¤´åºå为é´é.
`anchr group` å½ä»¤é»è®¤ä¼å°è¿ç§ä¸ä»£ç reads å»é¤. `--keep` é项ä¼çä¸è¿ç§ reads, è¿éç¨äºç»è£
好çä¸ä»£åºå.
```text
===
------------>
)
<----------
===
```
* anchorLong
```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}
rm -fr anchorLong
anchr overlap2 \
--parallel 16 \
merge/anchor.merge.fasta \
3_pacbio/pacbio.40x.trim.fasta \
-d anchorLong \
-b 10 --len 1000 --idt 0.85 --all
pushd anchorLong
anchr cover \
--range "1-$(faops n50 -H -N 0 -C anchor.fasta)" \
--len 1000 --idt 0.85 -c 2 \
anchorLong.ovlp.tsv \
-o anchor.cover.json
cat anchor.cover.json | jq "." > environment.json
anchr overlap \
anchor.fasta \
--serial --len 30 --idt 0.9999 \
-o stdout \
doc/e_coli.md view on Meta::CPAN
```
## Final stats
* Stats
```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}
printf "| %s | %s | %s | %s |\n" \
"Name" "N50" "Sum" "#" \
> stat3.md
printf "|:--|--:|--:|--:|\n" >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "Genome"; faops n50 -H -S -C 1_genome/genome.fa;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "Paralogs"; faops n50 -H -S -C 1_genome/paralogs.fas;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "anchor.merge"; faops n50 -H -S -C merge/anchor.merge.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "others.merge"; faops n50 -H -S -C merge/others.merge.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "anchorLong"; faops n50 -H -S -C anchorLong/contig.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "contigTrim"; faops n50 -H -S -C contigTrim/contig.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "spades.contig"; faops n50 -H -S -C 8_spades/contigs.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "spades.scaffold"; faops n50 -H -S -C 8_spades/scaffolds.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "platanus.contig"; faops n50 -H -S -C 8_platanus/out_contig.fa;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "platanus.scaffold"; faops n50 -H -S -C 8_platanus/out_gapClosed.fa;) >> stat3.md
cat stat3.md
```
| Name | N50 | Sum | # |
|:------------------|--------:|--------:|-----:|
| Genome | 4641652 | 4641652 | 1 |
| Paralogs | 1934 | 195673 | 106 |
| anchor.merge | 73736 | 4532566 | 117 |
| others.merge | 5923 | 21847 | 6 |
| anchorLong | 80390 | 4531790 | 109 |
| contigTrim | 3790335 | 4616261 | 4 |
| spades.contig | 132662 | 4645193 | 311 |
| spades.scaffold | 133063 | 4645555 | 306 |
| platanus.contig | 15090 | 4683012 | 1069 |
| platanus.scaffold | 133014 | 4575941 | 137 |
* quast
```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}
rm -fr 9_qa_contig
quast --no-check --threads 16 \
-R 1_genome/genome.fa \
merge/anchor.merge.fasta \
anchorLong/contig.fasta \
contigTrim/contig.fasta \
canu-raw-40x/${BASE_NAME}.contigs.fasta \
8_spades/scaffolds.fasta \
8_platanus/out_gapClosed.fa \
1_genome/paralogs.fas \
--label "merge,contig,contigTrim,canu-40x,spades,platanus,paralogs" \
-o 9_qa_contig
```
* Clear QxxLxxx.
```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}
rm -fr 2_illumina/Q{20,25,30,35}L{30,60,90,120}X*
rm -fr Q{20,25,30,35}L{30,60,90,120}X*
```
( run in 0.546 second using v1.01-cache-2.11-cpan-39bf76dae61 )