App-Anchr

 view release on metacpan or  search on metacpan

doc/e_coli.md  view on Meta::CPAN

| Q30L90X160P000  | 742.66M |  160.0 | 44646 | 4.55M |  198 |     44646 | 4.53M |  177 |       975 |  20.46K |   21 | "31,41,51,61,71,81" | 0:16'17'' | 0:01'13'' |
| Q30L90X200P000  | 928.33M |  200.0 | 46294 | 4.55M |  190 |     48130 | 4.53M |  170 |       945 |     20K |   20 | "31,41,51,61,71,81" | 0:19'49'' | 0:01'17'' |
| Q30L120X40P000  | 185.67M |   40.0 | 10194 | 4.48M |  771 |     10531 | 4.37M |  650 |       853 | 108.66K |  121 | "31,41,51,61,71,81" | 0:05'50'' | 0:00'51'' |
| Q30L120X40P001  | 185.67M |   40.0 |  9937 | 4.48M |  766 |     10141 | 4.38M |  649 |       837 | 102.24K |  117 | "31,41,51,61,71,81" | 0:05'33'' | 0:00'26'' |
| Q30L120X40P002  | 185.67M |   40.0 | 21129 | 4.54M |  390 |     21129 | 4.52M |  357 |       802 |  25.63K |   33 | "31,41,51,61,71,81" | 0:05'37'' | 0:00'41'' |
| Q30L120X80P000  | 371.33M |   80.0 | 16109 | 4.53M |  498 |     16360 | 4.48M |  435 |       822 |  55.11K |   63 | "31,41,51,61,71,81" | 0:08'57'' | 0:00'36'' |
| Q30L120X120P000 |    557M |  120.0 | 35814 | 4.55M |  240 |     35814 | 4.52M |  216 |       976 |  24.08K |   24 | "31,41,51,61,71,81" | 0:12'34'' | 0:00'38'' |
| Q35L30X40P000   | 185.67M |   40.0 |  6751 | 4.51M | 1071 |      6955 | 4.36M |  865 |       762 | 152.52K |  206 | "31,41,51,61,71,81" | 0:03'02'' | 0:00'32'' |
| Q35L30X40P001   | 185.67M |   40.0 |  6571 | 4.51M | 1112 |      6753 | 4.35M |  895 |       767 | 163.46K |  217 | "31,41,51,61,71,81" | 0:02'57'' | 0:00'32'' |
| Q35L30X40P002   | 185.67M |   40.0 |  6716 | 4.53M | 1104 |      6929 | 4.34M |  867 |       783 | 188.15K |  237 | "31,41,51,61,71,81" | 0:02'50'' | 0:00'30'' |
| Q35L30X80P000   | 371.33M |   80.0 | 11480 | 4.55M |  665 |     11787 | 4.47M |  560 |       781 |  88.42K |  105 | "31,41,51,61,71,81" | 0:04'51'' | 0:00'48'' |
| Q35L30X120P000  |    557M |  120.0 | 14990 | 4.56M |  530 |     15235 | 4.49M |  449 |       844 |  74.65K |   81 | "31,41,51,61,71,81" | 0:06'10'' | 0:00'48'' |
| Q35L60X40P000   | 185.67M |   40.0 |  2132 | 3.76M | 2304 |      2583 | 2.84M | 1237 |       805 | 918.74K | 1067 | "31,41,51,61,71,81" | 0:02'27'' | 0:00'43'' |

## Merge anchors with Qxx, Lxx and QxxLxx

```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}

# merge anchors with Qxx
for Q in 20 25 30 35; do
    mkdir -p mergeQ${Q}
    anchr contained \
        $(
            parallel -k --no-run-if-empty -j 6 '
                if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
                    echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
                fi
                ' ::: ${Q} ::: 30 60 90 120 ::: 40 80 120 160 200 ::: 000 001 002 003 004 005
        ) \
        --len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
        -o stdout \
        | faops filter -a 1000 -l 0 stdin mergeQ${Q}/anchor.contained.fasta
    anchr orient mergeQ${Q}/anchor.contained.fasta --len 1000 --idt 0.98 -o mergeQ${Q}/anchor.orient.fasta
    anchr merge mergeQ${Q}/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
        | faops filter -a 1000 -l 0 stdin mergeQ${Q}/anchor.merge.fasta
done

# merge anchors with Lxx
for L in 30 60 90 120; do
    mkdir -p mergeL${L}
    anchr contained \
        $(
            parallel -k --no-run-if-empty -j 6 '
                if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
                    echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
                fi
                ' ::: 20 25 30 35 ::: ${L} ::: 40 80 120 160 200 ::: 000 001 002 003 004 005
        ) \
        --len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
        -o stdout \
        | faops filter -a 1000 -l 0 stdin mergeL${L}/anchor.contained.fasta
    anchr orient mergeL${L}/anchor.contained.fasta --len 1000 --idt 0.98 -o mergeL${L}/anchor.orient.fasta
    anchr merge mergeL${L}/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
        | faops filter -a 1000 -l 0 stdin mergeL${L}/anchor.merge.fasta
done

# quast
rm -fr 9_qa_mergeQL
quast --no-check --threads 16 \
    -R 1_genome/genome.fa \
    mergeQ20/anchor.merge.fasta \
    mergeQ25/anchor.merge.fasta \
    mergeQ30/anchor.merge.fasta \
    mergeQ35/anchor.merge.fasta \
    mergeL30/anchor.merge.fasta \
    mergeL60/anchor.merge.fasta \
    mergeL90/anchor.merge.fasta \
    mergeL120/anchor.merge.fasta \
    1_genome/paralogs.fas \
    --label "mergeQ20,mergeQ25,mergeQ30,mergeQ35,mergeL30,mergeL60,mergeL90,mergeL120,paralogs" \
    -o 9_qa_mergeQL

# merge anchors with QxxLxx
for Q in 20 25 30; do
    for L in 30 60 90; do
        mkdir -p mergeQ${Q}L${L}
        anchr contained \
            $(
                parallel -k --no-run-if-empty -j 6 '
                    if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
                        echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
                    fi
                    ' ::: ${Q} ::: ${L} ::: 40 80 120 160 200 ::: 000 001 002 003 004 005
            ) \
            --len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
            -o stdout \
            | faops filter -a 1000 -l 0 stdin mergeQ${Q}L${L}/anchor.contained.fasta
        anchr orient mergeQ${Q}L${L}/anchor.contained.fasta --len 1000 --idt 0.98 -o mergeQ${Q}L${L}/anchor.orient.fasta
        anchr merge mergeQ${Q}L${L}/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
            | faops filter -a 1000 -l 0 stdin mergeQ${Q}L${L}/anchor.merge.fasta
    done
done

# quast
rm -fr 9_qa_mergeQxxLxx
quast --no-check --threads 16 \
    -R 1_genome/genome.fa \
    mergeQ20L30/anchor.merge.fasta \
    mergeQ20L60/anchor.merge.fasta \
    mergeQ20L90/anchor.merge.fasta \
    mergeQ25L30/anchor.merge.fasta \
    mergeQ25L60/anchor.merge.fasta \
    mergeQ25L90/anchor.merge.fasta \
    mergeQ30L30/anchor.merge.fasta \
    mergeQ30L60/anchor.merge.fasta \
    mergeQ30L90/anchor.merge.fasta \
    1_genome/paralogs.fas \
    --label "mergeQ20L30,mergeQ20L60,mergeQ20L90,mergeQ25L30,mergeQ25L60,mergeQ25L90,mergeQ30L30,mergeQ30L60,mergeQ30L90,paralogs" \
    -o 9_qa_mergeQxxLxx

```

## Merge anchors

```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}

# merge anchors
mkdir -p merge
anchr contained \
    $(
        parallel -k --no-run-if-empty -j 6 "
            if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
                echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
            fi
            " ::: 25 30 ::: 60 ::: 40 80 120 160 200 ::: 000 001 002 003 004 005
    ) \
    --len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
    -o stdout \
    | faops filter -a 1000 -l 0 stdin merge/anchor.contained.fasta
anchr orient merge/anchor.contained.fasta --len 1000 --idt 0.98 -o merge/anchor.orient.fasta
anchr merge merge/anchor.orient.fasta --len 1000 --idt 0.999 -o merge/anchor.merge0.fasta
anchr contained merge/anchor.merge0.fasta --len 1000 --idt 0.98 \
    --proportion 0.99 --parallel 16 -o stdout \
    | faops filter -a 1000 -l 0 stdin merge/anchor.merge1.fasta
faops order merge/anchor.merge1.fasta \
    <(faops size merge/anchor.merge1.fasta | sort -n -r -k2,2 | cut -f 1) \
    merge/anchor.merge.fasta

# merge others
mkdir -p merge
anchr contained \
    $(
        parallel -k --no-run-if-empty -j 6 "
            if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.others.fa ]; then
                echo Q{1}L{2}X{3}P{4}/anchor/pe.others.fa
            fi
            " ::: 25 30 ::: 60 ::: 40 80 120 160 200 ::: 000 001 002 003 004 005
    ) \
    --len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
    -o stdout \
    | faops filter -a 1000 -l 0 stdin merge/others.contained.fasta
anchr orient merge/others.contained.fasta --len 1000 --idt 0.98 -o merge/others.orient.fasta
anchr merge merge/others.orient.fasta --len 1000 --idt 0.999 -o stdout \
    | faops filter -a 1000 -l 0 stdin merge/others.merge.fasta

# anchor sort on ref
bash ~/Scripts/cpan/App-Anchr/share/sort_on_ref.sh merge/anchor.merge.fasta 1_genome/genome.fa merge/anchor.sort

# mummerplot files
nucmer -l 200 1_genome/genome.fa merge/anchor.sort.fa
mummerplot out.delta --png --large -p anchor.sort
rm *.[fr]plot
rm out.delta
rm *.gp
mv anchor.sort.png merge/

# minidot
minimap merge/anchor.sort.fa 1_genome/genome.fa \
    | minidot - > merge/anchor.minidot.eps

# quast
rm -fr 9_qa_merge
quast --no-check --threads 16 \
    -R 1_genome/genome.fa \
    8_spades/contigs.fasta \
    8_spades/scaffolds.fasta \
    8_spades_Q30L60/scaffolds.fasta \
    8_platanus/out_contig.fa \
    8_platanus/out_gapClosed.fa \
    8_platanus_quorum/out_gapClosed.fa \
    8_platanus_Q30L60/out_gapClosed.fa \
    merge/anchor.merge.fasta \
    merge/scaffold/out_scaffold.fa \
    merge/scaffold/out_gapClosed.fa \
    1_genome/paralogs.fas \
    --label "spades.contig,spades.scaffold,spades_Q30L60,platanus.contig,platanus.scaffold,platanus_quorum,platanus_Q30L60,merge,merge.scaffold,merge.gapClosed,paralogs" \
    -o 9_qa_merge
```

## Scaffolding with PE

```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}

# PE
mkdir -p merge/scaffold
cd merge/scaffold

if [ ! -e pe.fa ]; then
    faops interleave \
        -p pe \
        ../../2_illumina/Q25L60/R1.fq.gz \
        ../../2_illumina/Q25L60/R2.fq.gz \
        > pe.fa
fi

anchr scaffold \
    ../anchor.merge.fasta \
    pe.fa \
    -p 8 \
    -o scaffold.sh
bash scaffold.sh

```

## Different K values

```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}

# oriR1: 67; oriR2: 43; Q30L60: 71

parallel -j 3 "
    mkdir -p Q25L60K{}
    cd Q25L60K{}

    anchr kunitigs \
        ../2_illumina/Q25L60X40P000/pe.cor.fa \
        ../2_illumina/Q25L60X40P000/environment.json \
        -p 8 \
        --kmer {} \
        -o kunitigs.sh
    bash kunitigs.sh

    rm -fr anchor
    mkdir -p anchor
    cd anchor
    anchr anchors \
        ../k_unitigs.fasta \
        ../pe.cor.fa \
        -p 8 \
        -o anchors.sh
    bash anchors.sh
    " ::: 21 31 41 43 51 61 67 71 81 91 101 111 121

mkdir -p Q25L60Kmerge
anchr contained \
    Q25L60K21/anchor/pe.anchor.fa \
    Q25L60K31/anchor/pe.anchor.fa \
    Q25L60K41/anchor/pe.anchor.fa \
    Q25L60K43/anchor/pe.anchor.fa \
    Q25L60K51/anchor/pe.anchor.fa \
    Q25L60K61/anchor/pe.anchor.fa \
    Q25L60K67/anchor/pe.anchor.fa \
    Q25L60K71/anchor/pe.anchor.fa \
    Q25L60K81/anchor/pe.anchor.fa \
    Q25L60K91/anchor/pe.anchor.fa \
    Q25L60K101/anchor/pe.anchor.fa \
    Q25L60K111/anchor/pe.anchor.fa \
    Q25L60K121/anchor/pe.anchor.fa \
    --len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
    -o stdout \
    | faops filter -a 1000 -l 0 stdin Q25L60Kmerge/anchor.contained.fasta
anchr orient Q25L60Kmerge/anchor.contained.fasta --len 1000 --idt 0.98 -o Q25L60Kmerge/anchor.orient.fasta
anchr merge Q25L60Kmerge/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
    | faops filter -a 1000 -l 0 stdin Q25L60Kmerge/anchor.merge.fasta

rm -fr 9_qa_kmer_Q25
quast --no-check --threads 16 \
    -R 1_genome/genome.fa \
    Q25L60K21/anchor/pe.anchor.fa \
    Q25L60K31/anchor/pe.anchor.fa \
    Q25L60K41/anchor/pe.anchor.fa \
    Q25L60K43/anchor/pe.anchor.fa \
    Q25L60K51/anchor/pe.anchor.fa \
    Q25L60K61/anchor/pe.anchor.fa \
    Q25L60K67/anchor/pe.anchor.fa \
    Q25L60K71/anchor/pe.anchor.fa \
    Q25L60K81/anchor/pe.anchor.fa \
    Q25L60K91/anchor/pe.anchor.fa \
    Q25L60K101/anchor/pe.anchor.fa \
    Q25L60K111/anchor/pe.anchor.fa \
    Q25L60K121/anchor/pe.anchor.fa \
    Q25L60X40P000/anchor/pe.anchor.fa \
    Q25L60Kmerge/anchor.merge.fasta \
    1_genome/paralogs.fas \
    --label "Q25L60K21,Q25L60K31,Q25L60K41,Q25L60K43,Q25L60K51,Q25L60K61,Q25L60K67,Q25L60K71,Q25L60K81,Q25L60K91,Q25L60K101,Q25L60K111,Q25L60K121,Q25L60X40P000,Q25L60Kmerge,paralogs" \
    -o 9_qa_kmer_Q25

parallel -j 3 "
    mkdir -p Q30L60K{}
    cd Q30L60K{}

    anchr kunitigs \
        ../2_illumina/Q30L60X40P000/pe.cor.fa \
        ../2_illumina/Q30L60X40P000/environment.json \
        -p 8 \
        --kmer {} \
        -o kunitigs.sh
    bash kunitigs.sh

    rm -fr anchor
    mkdir -p anchor
    cd anchor
    anchr anchors \
        ../k_unitigs.fasta \
        ../pe.cor.fa \
        -p 8 \
        -o anchors.sh
    bash anchors.sh
    " ::: 21 31 41 43 51 61 67 71 81 91 101 111 121

mkdir -p Q30L60Kmerge
anchr contained \
    Q30L60K21/anchor/pe.anchor.fa \
    Q30L60K31/anchor/pe.anchor.fa \
    Q30L60K41/anchor/pe.anchor.fa \
    Q30L60K43/anchor/pe.anchor.fa \
    Q30L60K51/anchor/pe.anchor.fa \
    Q30L60K61/anchor/pe.anchor.fa \
    Q30L60K67/anchor/pe.anchor.fa \
    Q30L60K71/anchor/pe.anchor.fa \
    Q30L60K81/anchor/pe.anchor.fa \
    Q30L60K91/anchor/pe.anchor.fa \
    Q30L60K101/anchor/pe.anchor.fa \
    Q30L60K111/anchor/pe.anchor.fa \
    Q30L60K121/anchor/pe.anchor.fa \
    --len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
    -o stdout \
    | faops filter -a 1000 -l 0 stdin Q30L60Kmerge/anchor.contained.fasta
anchr orient Q30L60Kmerge/anchor.contained.fasta --len 1000 --idt 0.98 -o Q30L60Kmerge/anchor.orient.fasta
anchr merge Q30L60Kmerge/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
    | faops filter -a 1000 -l 0 stdin Q30L60Kmerge/anchor.merge.fasta

rm -fr 9_qa_kmer_Q30
quast --no-check --threads 16 \
    -R 1_genome/genome.fa \
    Q30L60K21/anchor/pe.anchor.fa \
    Q30L60K31/anchor/pe.anchor.fa \
    Q30L60K41/anchor/pe.anchor.fa \
    Q30L60K43/anchor/pe.anchor.fa \
    Q30L60K51/anchor/pe.anchor.fa \
    Q30L60K61/anchor/pe.anchor.fa \
    Q30L60K67/anchor/pe.anchor.fa \
    Q30L60K71/anchor/pe.anchor.fa \
    Q30L60K81/anchor/pe.anchor.fa \
    Q30L60K91/anchor/pe.anchor.fa \
    Q30L60K101/anchor/pe.anchor.fa \
    Q30L60K111/anchor/pe.anchor.fa \
    Q30L60K121/anchor/pe.anchor.fa \
    Q30L60X40P000/anchor/pe.anchor.fa \
    Q30L60Kmerge/anchor.merge.fasta \
    1_genome/paralogs.fas \
    --label "Q30L60K21,Q30L60K31,Q30L60K41,Q30L60K43,Q30L60K51,Q30L60K61,Q30L60K67,Q30L60K71,Q30L60K81,Q30L60K91,Q30L60K101,Q30L60K111,Q30L60K121,Q30L60X40P000,Q30L60Kmerge,paralogs" \
    -o 9_qa_kmer_Q30

# stat2
REAL_G=4641652

bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 header \
    > statK2.md

parallel -k --no-run-if-empty -j 6 "
    bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 {1}K{2} ${REAL_G}
    " ::: Q25L60 Q30L60 ::: 21 31 41 43 51 61 67 71 81 91 101 111 121 \
    >> statK2.md

```

| Name       |  SumCor | CovCor | N50SR |   Sum |    # | N50Anchor |     Sum |    # | N50Others |     Sum |    # |  Kmer | RunTimeKU | RunTimeAN |
|:-----------|--------:|-------:|------:|------:|-----:|----------:|--------:|-----:|----------:|--------:|-----:|------:|----------:|:----------|
| Q25L60K21  | 185.67M |   40.0 |  7089 | 4.44M |  999 |      7305 |   4.31M |  827 |       759 | 127.25K |  172 |  "21" | 0:00'48'' | 0:00'33'' |
| Q25L60K31  | 185.67M |   40.0 | 14206 |  4.5M |  500 |     14262 |   4.47M |  462 |       801 |  29.46K |   38 |  "31" | 0:00'47'' | 0:00'33'' |
| Q25L60K41  | 185.67M |   40.0 | 18971 | 4.52M |  387 |     19351 |   4.51M |  362 |       804 |  18.97K |   25 |  "41" | 0:00'47'' | 0:00'33'' |
| Q25L60K43  | 185.67M |   40.0 | 21290 | 4.53M |  366 |     21290 |   4.51M |  343 |       808 |   17.3K |   23 |  "43" | 0:00'46'' | 0:00'35'' |
| Q25L60K51  | 185.67M |   40.0 | 26192 | 4.53M |  334 |     26195 |   4.51M |  309 |       720 |  17.93K |   25 |  "51" | 0:00'45'' | 0:00'34'' |
| Q25L60K61  | 185.67M |   40.0 | 23820 | 4.54M |  344 |     24492 |   4.52M |  319 |       706 |  17.67K |   25 |  "61" | 0:00'46'' | 0:00'33'' |
| Q25L60K67  | 185.67M |   40.0 | 21564 | 4.54M |  387 |     21613 |   4.52M |  359 |       754 |  20.67K |   28 |  "67" | 0:00'42'' | 0:00'35'' |
| Q25L60K71  | 185.67M |   40.0 | 20891 | 4.55M |  403 |     20891 |   4.52M |  370 |       754 |  24.42K |   33 |  "71" | 0:00'42'' | 0:00'35'' |
| Q25L60K81  | 185.67M |   40.0 | 15295 | 4.55M |  529 |     15393 |   4.51M |  476 |       757 |  39.55K |   53 |  "81" | 0:00'40'' | 0:00'35'' |
| Q25L60K91  | 185.67M |   40.0 |  9179 | 4.56M |  818 |      9314 |   4.48M |  703 |       789 |  85.19K |  115 |  "91" | 0:00'37'' | 0:00'36'' |
| Q25L60K101 | 185.67M |   40.0 |  5239 | 4.54M | 1393 |      5449 |   4.32M | 1081 |       764 | 227.82K |  312 | "101" | 0:00'36'' | 0:00'36'' |
| Q25L60K111 | 185.67M |   40.0 |  2453 | 4.39M | 2418 |      2852 |   3.72M | 1494 |       754 | 668.43K |  924 | "111" | 0:00'34'' | 0:00'35'' |
| Q25L60K121 | 185.67M |   40.0 |  1146 | 3.59M | 3369 |      1734 |   2.08M | 1217 |       724 |   1.51M | 2152 | "121" | 0:00'20'' | 0:00'21'' |
| Q30L60K21  | 185.67M |   40.0 |  7464 | 4.44M |  948 |      7708 |   4.32M |  788 |       760 | 119.55K |  160 |  "21" | 0:00'50'' | 0:00'36'' |
| Q30L60K31  | 185.67M |   40.0 | 15791 |  4.5M |  447 |     15800 |   4.48M |  412 |       801 |  27.53K |   35 |  "31" | 0:00'50'' | 0:00'36'' |
| Q30L60K41  | 185.67M |   40.0 | 21978 | 4.52M |  378 |     22693 |    4.5M |  348 |       754 |  22.49K |   30 |  "41" | 0:00'49'' | 0:00'35'' |
| Q30L60K43  | 185.67M |   40.0 | 23150 | 4.53M |  371 |     23535 |    4.5M |  338 |       813 |  25.78K |   33 |  "43" | 0:00'44'' | 0:00'34'' |
| Q30L60K51  | 185.67M |   40.0 | 21553 | 4.53M |  380 |     21581 |   4.51M |  346 |       782 |     26K |   34 |  "51" | 0:00'44'' | 0:00'35'' |
| Q30L60K61  | 185.67M |   40.0 | 17816 | 4.54M |  477 |     17876 |    4.5M |  428 |       792 |  37.13K |   49 |  "61" | 0:00'45'' | 0:00'35'' |
| Q30L60K67  | 185.67M |   40.0 | 13527 | 4.54M |  592 |     13936 |   4.49M |  530 |       785 |   46.6K |   62 |  "67" | 0:00'40'' | 0:00'36'' |
| Q30L60K71  | 185.67M |   40.0 | 10956 | 4.54M |  726 |     11268 |   4.48M |  640 |       757 |  63.27K |   86 |  "71" | 0:00'39'' | 0:00'36'' |
| Q30L60K81  | 185.67M |   40.0 |  6773 | 4.53M | 1094 |      6982 |   4.39M |  906 |       760 | 138.44K |  188 |  "81" | 0:00'39'' | 0:00'36'' |
| Q30L60K91  | 185.67M |   40.0 |  3702 | 4.45M | 1787 |      4021 |   4.08M | 1292 |       761 | 361.39K |  495 |  "91" | 0:00'35'' | 0:00'34'' |
| Q30L60K101 | 185.67M |   40.0 |  1829 | 4.13M | 2788 |      2227 |   3.23M | 1527 |       744 | 907.41K | 1261 | "101" | 0:00'33'' | 0:00'32'' |
| Q30L60K111 | 185.67M |   40.0 |  1027 | 3.11M | 3144 |      1636 |    1.6M |  965 |       710 |   1.51M | 2179 | "111" | 0:00'31'' | 0:00'29'' |

doc/e_coli.md  view on Meta::CPAN


## 3GS

* Canu

```bash
BASE_NAME=e_coli
REAL_G=4641652
cd ${HOME}/data/anchr/${BASE_NAME}

canu \
    -p ${BASE_NAME} -d canu-raw-20x \
    gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
    genomeSize=${REAL_G} \
    -pacbio-raw 3_pacbio/pacbio.20x.fasta

canu \
    -p ${BASE_NAME} -d canu-raw-40x \
    gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
    genomeSize=${REAL_G} \
    -pacbio-raw 3_pacbio/pacbio.40x.fasta

canu \
    -p ${BASE_NAME} -d canu-raw-80x \
    gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
    genomeSize=${REAL_G} \
    -pacbio-raw 3_pacbio/pacbio.80x.fasta

canu \
    -p ${BASE_NAME} -d canu-raw \
    gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
    genomeSize=${REAL_G} \
    -pacbio-raw 3_pacbio/pacbio.fasta

canu \
    -p ${BASE_NAME} -d canu-trim-20x \
    gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
    genomeSize=${REAL_G} \
    -pacbio-raw 3_pacbio/pacbio.20x.trim.fasta

canu \
    -p ${BASE_NAME} -d canu-trim-40x \
    gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
    genomeSize=${REAL_G} \
    -pacbio-raw 3_pacbio/pacbio.40x.trim.fasta

canu \
    -p ${BASE_NAME} -d canu-trim-80x \
    gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
    genomeSize=${REAL_G} \
    -pacbio-raw 3_pacbio/pacbio.80x.trim.fasta

canu \
    -p ${BASE_NAME} -d canu-trim \
    gnuplot=$(brew --prefix)/Cellar/$(brew list --versions gnuplot | sed 's/ /\//')/bin/gnuplot \
    genomeSize=${REAL_G} \
    -pacbio-raw 3_pacbio/pacbio.trim.fasta

# quast
rm -fr 9_qa_canu
quast --no-check --threads 16 \
    --eukaryote \
    -R 1_genome/genome.fa \
    canu-raw-20x/${BASE_NAME}.contigs.fasta \
    canu-trim-20x/${BASE_NAME}.contigs.fasta \
    canu-raw-40x/${BASE_NAME}.contigs.fasta \
    canu-trim-40x/${BASE_NAME}.contigs.fasta \
    canu-raw-80x/${BASE_NAME}.contigs.fasta \
    canu-trim-80x/${BASE_NAME}.contigs.fasta \
    canu-raw/${BASE_NAME}.contigs.fasta \
    canu-trim/${BASE_NAME}.contigs.fasta \
    1_genome/paralogs.fas \
    --label "20x,20x.trim,40x,40x.trim,80x,80x.trim,raw,trim,paralogs" \
    -o 9_qa_canu

faops n50 -S -C canu-raw-20x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-trim-20x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-raw-40x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-trim-40x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-raw-80x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-trim-80x/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-raw/${BASE_NAME}.trimmedReads.fasta.gz
faops n50 -S -C canu-trim/${BASE_NAME}.trimmedReads.fasta.gz

find . -type d -name "correction" -path "*canu-*" | xargs rm -fr

```

* miniasm

    * `-S         skip self and dual mappings`
    * `-w INT     minizer window size [{-k}*2/3]`
    * `-L INT     min matching length [40]`
    * `-m FLOAT   merge two chains if FLOAT fraction of minimizers are shared [0.50]`
    * `-t INT     number of threads [3]`

```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}

mkdir -p miniasm

minimap -Sw5 -L100 -m0 -t16 \
    3_pacbio/pacbio.40x.fasta 3_pacbio/pacbio.40x.fasta \
    > miniasm/pacbio.40x.paf

miniasm miniasm/pacbio.40x.paf > miniasm/utg.noseq.gfa

miniasm -f 3_pacbio/pacbio.40x.fasta miniasm/pacbio.40x.paf \
    > miniasm/utg.gfa

awk '/^S/{print ">"$2"\n"$3}' miniasm/utg.gfa > miniasm/utg.fa

minimap 1_genome/genome.fa miniasm/utg.fa | minidot - > miniasm/utg.eps
```

```bash
#real    0m19.504s
#user    1m11.237s
#sys     0m18.500s
time anchr paf2ovlp --parallel 16 miniasm/pacbio.40x.paf -o miniasm/pacbio.40x.ovlp.tsv

#real    0m19.451s
#user    0m43.343s
#sys     0m9.734s
time anchr paf2ovlp --parallel 4 miniasm/pacbio.40x.paf -o miniasm/pacbio.40x.ovlp.tsv

#real    0m17.324s
#user    0m9.276s
#sys     1m23.833s
time jrange covered miniasm/pacbio.40x.paf --longest --paf -o miniasm/pacbio.40x.pos.txt
```

## Local corrections

```bash
BASE_NAME=e_coli
REAL_G=4641652
cd ${HOME}/data/anchr/${BASE_NAME}

rm -fr localCor
anchr overlap2 \
    --parallel 16 \
    merge/anchor.merge.fasta \
    3_pacbio/pacbio.40x.trim.fasta \
    -d localCor \
    -b 10 --len 1000 --idt 0.85 --all

pushd localCor

anchr cover \
    --range "1-$(faops n50 -H -N 0 -C anchor.fasta)" \
    --len 1000 --idt 0.85 -c 2 \
    anchorLong.ovlp.tsv \
    -o anchor.cover.json

doc/e_coli.md  view on Meta::CPAN

    anchorLong.ovlp.tsv \
    --parallel 16 \
    --range $(cat environment.json | jq -r '.TRUSTED') \
    --len 1000 --idt 0.85 --trim -v

faops some -i -l 0 \
    long.fasta \
    group/overlapped.long.txt \
    independentLong.fasta

# localCor
gzip -d -c -f $(find group -type f -name "*.correctedReads.fasta.gz") \
    | faops filter -l 0 stdin stdout \
    | grep -E '^>long' -A 1 \
    | sed '/^--$/d' \
    | faops dazz -a -l 0 stdin stdout \
    | pigz -c > localCor.fasta.gz

canu \
    -p ${BASE_NAME} -d localCor \
    gnuplotTested=true \
    genomeSize=${REAL_G} \
    -pacbio-corrected localCor.fasta.gz \
    -pacbio-corrected anchor.fasta

canu \
    -p ${BASE_NAME} -d localCorIndep \
    gnuplotTested=true \
    genomeSize=${REAL_G} \
    -pacbio-raw localCor.fasta.gz \
    -pacbio-raw anchor.fasta \
    -pacbio-raw independentLong.fasta

# localTrim
gzip -d -c -f $(find group -type f -name "*.trimmedReads.fasta.gz") \
    | faops filter -l 0 stdin stdout \
    | grep -E '^>long' -A 1 \
    | sed '/^--$/d' \
    | faops dazz -a -l 0 stdin stdout \
    | pigz -c > localTrim.fasta.gz

canu \
    -p ${BASE_NAME} -d localTrim \
    gnuplotTested=true \
    genomeSize=${REAL_G} \
    -pacbio-corrected localCor.fasta.gz \
    -pacbio-corrected anchor.fasta

# globalTrim
canu -assemble \
    -p ${BASE_NAME} -d globalTrim \
    gnuplotTested=true \
    genomeSize=${REAL_G} \
    -pacbio-corrected ../canu-raw-40x/${BASE_NAME}.trimmedReads.fasta.gz \
    -pacbio-corrected anchor.fasta

popd

# quast
rm -fr 9_qa_localCor
quast --no-check --threads 16 \
    --eukaryote \
    -R 1_genome/genome.fa \
    localCor/anchor.fasta \
    localCor/localCor/${BASE_NAME}.contigs.fasta \
    localCor/localCorIndep/${BASE_NAME}.contigs.fasta \
    localCor/localTrim/${BASE_NAME}.contigs.fasta \
    localCor/globalTrim/${BASE_NAME}.contigs.fasta \
    canu-raw-40x/${BASE_NAME}.contigs.fasta \
    canu-trim-40x/${BASE_NAME}.contigs.fasta \
    1_genome/paralogs.fas \
    --label "anchor,localCor,localCorIndep,localTrim,globalTrim,40x,40x.trim,paralogs" \
    -o 9_qa_localCor

find . -type d -name "correction" | xargs rm -fr

```

## Expand anchors

三代 reads 里有一个常见的错误, 即单一 ZMW 里的测序结果中, 接头序列部分的测序结果出现了较多的错误,
因此并没有将接头序列去除干净, 形成的 subreads 里含有多份基因组上同一片段, 它们之间以接头序列为间隔.

`anchr group` 命令默认会将这种三代的 reads 去除. `--keep` 选项会留下这种 reads, 这适用于组装好的三代序列.

```text
      ===
------------>
             )
  <----------
      ===
```

* anchorLong

```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}

rm -fr anchorLong
anchr overlap2 \
    --parallel 16 \
    merge/anchor.merge.fasta \
    3_pacbio/pacbio.40x.trim.fasta \
    -d anchorLong \
    -b 10 --len 1000 --idt 0.85 --all

pushd anchorLong

anchr cover \
    --range "1-$(faops n50 -H -N 0 -C anchor.fasta)" \
    --len 1000 --idt 0.85 -c 2 \
    anchorLong.ovlp.tsv \
    -o anchor.cover.json

cat anchor.cover.json | jq "." > environment.json

anchr overlap \
    anchor.fasta \
    --serial --len 30 --idt 0.9999 \
    -o stdout \

doc/e_coli.md  view on Meta::CPAN


```

## Final stats

* Stats

```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}

printf "| %s | %s | %s | %s |\n" \
    "Name" "N50" "Sum" "#" \
    > stat3.md
printf "|:--|--:|--:|--:|\n" >> stat3.md

printf "| %s | %s | %s | %s |\n" \
    $(echo "Genome";   faops n50 -H -S -C 1_genome/genome.fa;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
    $(echo "Paralogs";   faops n50 -H -S -C 1_genome/paralogs.fas;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
    $(echo "anchor.merge"; faops n50 -H -S -C merge/anchor.merge.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
    $(echo "others.merge"; faops n50 -H -S -C merge/others.merge.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
    $(echo "anchorLong"; faops n50 -H -S -C anchorLong/contig.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
    $(echo "contigTrim"; faops n50 -H -S -C contigTrim/contig.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
    $(echo "spades.contig"; faops n50 -H -S -C 8_spades/contigs.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
    $(echo "spades.scaffold"; faops n50 -H -S -C 8_spades/scaffolds.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
    $(echo "platanus.contig"; faops n50 -H -S -C 8_platanus/out_contig.fa;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
    $(echo "platanus.scaffold"; faops n50 -H -S -C 8_platanus/out_gapClosed.fa;) >> stat3.md

cat stat3.md
```

| Name              |     N50 |     Sum |    # |
|:------------------|--------:|--------:|-----:|
| Genome            | 4641652 | 4641652 |    1 |
| Paralogs          |    1934 |  195673 |  106 |
| anchor.merge      |   73736 | 4532566 |  117 |
| others.merge      |    5923 |   21847 |    6 |
| anchorLong        |   80390 | 4531790 |  109 |
| contigTrim        | 3790335 | 4616261 |    4 |
| spades.contig     |  132662 | 4645193 |  311 |
| spades.scaffold   |  133063 | 4645555 |  306 |
| platanus.contig   |   15090 | 4683012 | 1069 |
| platanus.scaffold |  133014 | 4575941 |  137 |

* quast

```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}

rm -fr 9_qa_contig
quast --no-check --threads 16 \
    -R 1_genome/genome.fa \
    merge/anchor.merge.fasta \
    anchorLong/contig.fasta \
    contigTrim/contig.fasta \
    canu-raw-40x/${BASE_NAME}.contigs.fasta \
    8_spades/scaffolds.fasta \
    8_platanus/out_gapClosed.fa \
    1_genome/paralogs.fas \
    --label "merge,contig,contigTrim,canu-40x,spades,platanus,paralogs" \
    -o 9_qa_contig

```

* Clear QxxLxxx.

```bash
BASE_NAME=e_coli
cd ${HOME}/data/anchr/${BASE_NAME}

rm -fr 2_illumina/Q{20,25,30,35}L{30,60,90,120}X*
rm -fr Q{20,25,30,35}L{30,60,90,120}X*
```



( run in 0.546 second using v1.01-cache-2.11-cpan-39bf76dae61 )