App-Anchr

 view release on metacpan or  search on metacpan

doc/bacteria_2_3.md  view on Meta::CPAN

Project
[SRP055199](https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP055199)

## lambda: download

* Reference genome

    * Strain: Escherichia virus Lambda (viruses)
    * Taxid: [10710](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=10710&lvl=3&lin=f&keep=1&srchmode=1&unlock)
    * RefSeq assembly accession:
      [GCF_000840245.1](ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/840/245/GCF_000840245.1_ViralProj14204/GCF_000840245.1_ViralProj14204_assembly_report.txt)
    * Proportion of paralogs (> 1000 bp): 0.0

```bash
# Fetch the lambda reference assembly into its own directory and write
# genome.fa from it with `faops replace`.
GENOME_DIR=~/data/anchr/lambda/1_genome
mkdir -p "${GENOME_DIR}"
cd "${GENOME_DIR}"

aria2c \
    -x 9 -s 3 -c \
    ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/840/245/GCF_000840245.1_ViralProj14204/GCF_000840245.1_ViralProj14204_genomic.fna.gz

# replace.tsv is a single tab-separated line: NC_001416.1<TAB>1
# NOTE(review): presumably "old name -> new name" for faops replace - confirm.
TAB=$'\t'
printf '%s\n' "NC_001416.1${TAB}1" > replace.tsv

faops replace \
    GCF_000840245.1_ViralProj14204_genomic.fna.gz \
    replace.tsv \
    genome.fa

#cp ~/data/anchr/paralogs/otherbac/Results/lambda/lambda.multi.fas paralogs.fas

doc/bacteria_2_3.md  view on Meta::CPAN

        fi

        bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 ${BASE_DIR}/{}
    " >> ${BASE_DIR}/stat2.md

cat stat2.md
```

| Name           |   SumFq | CovFq | AvgRead |       Kmer |   SumFa | Discard% | RealG |  EstG | Est/Real | SumKU | SumSR |   RunTime |
|:---------------|--------:|------:|--------:|-----------:|--------:|---------:|------:|------:|---------:|------:|------:|----------:|
| Q20L60_1000000 | 200.24M |  58.9 |     100 | "41,61,81" | 181.17M |   9.524% |  3.4M |  3.4M |     1.00 | 3.42M |     0 | 0:05'07'' |
| Q20L60_2000000 | 400.49M | 117.9 |     100 | "41,61,81" | 362.45M |   9.500% |  3.4M | 3.41M |     1.00 | 3.43M |     0 | 0:08'38'' |
| Q20L60_3000000 | 600.74M | 176.8 |      99 | "41,61,81" | 543.81M |   9.476% |  3.4M | 3.43M |     1.01 | 3.43M |     0 | 0:12'31'' |
| Q20L60_4000000 |    801M | 235.7 |      99 | "41,61,81" |  725.8M |   9.388% |  3.4M | 3.47M |     1.02 | 3.43M |     0 | 0:15'59'' |
| Q25L60_1000000 | 199.18M |  58.6 |      99 | "41,61,81" | 183.62M |   7.812% |  3.4M |  3.4M |     1.00 | 3.48M |     0 | 0:05'12'' |
| Q25L60_2000000 | 398.32M | 117.2 |      99 | "41,61,81" | 367.28M |   7.792% |  3.4M | 3.41M |     1.00 | 3.43M |     0 | 0:08'33'' |
| Q25L60_3000000 | 597.52M | 175.9 |      99 | "41,61,81" |  551.1M |   7.769% |  3.4M | 3.41M |     1.00 | 3.42M |     0 | 0:12'32'' |
| Q25L60_4000000 | 796.67M | 234.5 |      99 | "41,61,81" |  734.9M |   7.753% |  3.4M | 3.42M |     1.01 | 3.43M |     0 | 0:14'31'' |
| Q30L60_1000000 | 195.64M |  57.6 |      98 | "41,61,81" |  183.5M |   6.208% |  3.4M |  3.4M |     1.00 | 3.41M |     0 | 0:05'13'' |
| Q30L60_2000000 | 391.27M | 115.2 |      98 | "41,61,81" | 367.04M |   6.193% |  3.4M |  3.4M |     1.00 | 3.42M |     0 | 0:07'40'' |
| Q30L60_3000000 | 586.89M | 172.7 |      97 | "41,61,81" |  550.6M |   6.183% |  3.4M | 3.41M |     1.00 | 3.42M |     0 | 0:10'04'' |

doc/bacteria_2_3.md  view on Meta::CPAN


| Name     |     N50 |        Sum |        # |
|:---------|--------:|-----------:|---------:|
| Genome   | 2153922 |    2153922 |        1 |
| Paralogs |    4318 |     142093 |       53 |
| Illumina |     101 | 1491583958 | 14768158 |
| PacBio   |   11808 | 1187845820 |   137516 |
| uniq     |     101 | 1485449016 | 14707416 |
| scythe   |     101 | 1460356291 | 14707416 |
| Q20L60   |     101 | 1239834586 | 12544518 |
| Q25L60   |     101 | 1062429395 | 10873960 |
| Q30L60   |     101 |  734805677 |  7775198 |

## Ngon: down sampling

```bash
BASE_DIR=$HOME/data/anchr/Ngon
cd ${BASE_DIR}

ARRAY=(
    "2_illumina/Q20L60:Q20L60:4000000"

doc/bacteria_2_3.md  view on Meta::CPAN

REAL_G=2488635

for QxxLxx in $( parallel "echo 'Q{1}L{2}'" ::: 25 30 ::: 60 ); do
    echo "==> ${QxxLxx}"

    if [ ! -e 2_illumina/${QxxLxx}/pe.cor.fa ]; then
        echo "2_illumina/${QxxLxx}/pe.cor.fa not exists"
        continue;
    fi

    for X in 40 80 120 160 240; do
        printf "==> Coverage: %s\n" ${X}
        
        rm -fr 2_illumina/${QxxLxx}X${X}*
    
        faops split-about -l 0 \
            2_illumina/${QxxLxx}/pe.cor.fa \
            $(( ${REAL_G} * ${X} )) \
            "2_illumina/${QxxLxx}X${X}"
        
        MAX_SERIAL=$(

doc/bacteria_2_3.md  view on Meta::CPAN


    anchr kunitigs \
        ../2_illumina/Q{1}L{2}X{3}P{4}/pe.cor.fa \
        ../2_illumina/Q{1}L{2}X{3}P{4}/environment.json \
        -p 8 \
        --kmer 31,41,51,61,71,81 \
        -o kunitigs.sh
    bash kunitigs.sh

    echo >&2
    " ::: 25 30 ::: 60 ::: 40 80 120 160 240 ::: 000 001 002 003 004 005 006

# anchors (sampled)
# Run share/anchor.sh once per Q{1}L{2}X{3}P{4} directory; the four ::: lists
# supply {1}..{4} and every combination is tried, at most 3 jobs at a time.
# A directory that already has anchor/pe.anchor.fa is skipped, so the step can
# be re-run; otherwise any leftover anchor/ directory is removed first.
# NOTE(review): the trailing "8 false" are anchor.sh arguments - presumably a
# thread count and an on/off switch; confirm against anchor.sh.
parallel --no-run-if-empty -j 3 "
    echo >&2 '==> Group Q{1}L{2}X{3}P{4}'

    if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
        exit;
    fi

    rm -fr Q{1}L{2}X{3}P{4}/anchor
    bash ~/Scripts/cpan/App-Anchr/share/anchor.sh Q{1}L{2}X{3}P{4} 8 false
    
    echo >&2
    " ::: 25 30 ::: 60 ::: 40 80 120 160 240 ::: 000 001 002 003 004 005 006

# Stats of anchors
# REAL_G is handed to sr_stat.sh as its third argument for every sample.
# NOTE(review): presumably the reference genome size in bp - confirm.
REAL_G=2488635

# Start stat2.md afresh with whatever sr_stat.sh prints in "header" mode.
bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 header \
    > stat2.md

# Append one sr_stat.sh result per sample that has anchor/pe.anchor.fa.
# -k keeps the output in input order even though up to 6 jobs run at once.
# The template is double-quoted, so ${REAL_G} is expanded by this shell before
# parallel receives the command.
parallel -k --no-run-if-empty -j 6 "
    if [ ! -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
        exit;
    fi

    bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 Q{1}L{2}X{3}P{4} ${REAL_G}
    " ::: 25 30 ::: 60 ::: 40 80 120 160 240 ::: 000 001 002 003 004 005 006 \
    >> stat2.md

# Show the assembled table.
cat stat2.md
```

| Name           | SumCor  | CovCor | N50SR |   Sum |   # | N50Anchor |   Sum |   # | N50Others |    Sum |   # |                Kmer | RunTimeKU | RunTimeAN |
|:---------------|:--------|-------:|------:|------:|----:|----------:|------:|----:|----------:|-------:|----:|--------------------:|----------:|----------:|
| Q25L60X40P000  | 99.55M  |   40.0 | 34190 | 2.46M | 140 |     34190 | 2.45M | 131 |       844 |  7.28K |   9 | "31,41,51,61,71,81" | 0:02'48'' | 0:01'25'' |
| Q25L60X40P001  | 99.55M  |   40.0 | 30045 | 2.46M | 148 |     30045 | 2.45M | 132 |       844 | 13.48K |  16 | "31,41,51,61,71,81" | 0:02'49'' | 0:01'35'' |
| Q25L60X40P002  | 99.55M  |   40.0 | 27638 | 2.47M | 162 |     27680 | 2.45M | 145 |       742 | 13.08K |  17 | "31,41,51,61,71,81" | 0:02'46'' | 0:01'35'' |
| Q25L60X40P003  | 99.55M  |   40.0 | 33236 | 2.46M | 131 |     33236 | 2.45M | 117 |       684 |  9.59K |  14 | "31,41,51,61,71,81" | 0:02'53'' | 0:01'23'' |
| Q25L60X40P004  | 99.55M  |   40.0 | 49674 | 2.45M |  99 |     49674 | 2.45M |  91 |       748 |  6.37K |   8 | "31,41,51,61,71,81" | 0:02'58'' | 0:01'31'' |
| Q25L60X40P005  | 99.55M  |   40.0 | 46364 | 2.46M | 108 |     46364 | 2.45M |  97 |       727 |  7.86K |  11 | "31,41,51,61,71,81" | 0:02'55'' | 0:01'33'' |
| Q25L60X40P006  | 99.55M  |   40.0 | 47421 | 2.46M | 117 |     47421 | 2.45M | 105 |       783 |  9.12K |  12 | "31,41,51,61,71,81" | 0:02'45'' | 0:01'35'' |
| Q25L60X80P000  | 199.09M |   80.0 | 19434 | 2.46M | 238 |     19434 | 2.44M | 213 |       822 | 19.67K |  25 | "31,41,51,61,71,81" | 0:03'58'' | 0:02'06'' |
| Q25L60X80P001  | 199.09M |   80.0 | 15365 | 2.46M | 246 |     15447 | 2.45M | 227 |       727 | 13.98K |  19 | "31,41,51,61,71,81" | 0:03'53'' | 0:02'11'' |
| Q25L60X80P002  | 199.09M |   80.0 | 27534 | 2.46M | 163 |     27534 | 2.45M | 151 |       707 |  8.48K |  12 | "31,41,51,61,71,81" | 0:03'57'' | 0:01'56'' |
| Q25L60X120P000 | 298.64M |  120.0 |  9278 | 2.47M | 391 |      9498 | 2.44M | 351 |       770 | 29.14K |  40 | "31,41,51,61,71,81" | 0:05'22'' | 0:02'46'' |
| Q25L60X120P001 | 298.64M |  120.0 | 13839 | 2.46M | 290 |     13936 | 2.44M | 261 |       727 | 20.61K |  29 | "31,41,51,61,71,81" | 0:05'23'' | 0:02'34'' |
| Q25L60X160P000 | 398.18M |  160.0 |  6698 | 2.47M | 550 |      6848 | 2.42M | 479 |       727 | 50.73K |  71 | "31,41,51,61,71,81" | 0:07'01'' | 0:03'12'' |
| Q25L60X240P000 | 597.27M |  240.0 |  4746 | 2.47M | 759 |      4908 | 2.38M | 627 |       778 |  95.9K | 132 | "31,41,51,61,71,81" | 0:09'39'' | 0:03'43'' |
| Q30L60X40P000  | 99.55M  |   40.0 | 55218 | 2.46M |  91 |     55218 | 2.44M |  81 |     10398 | 17.13K |  10 | "31,41,51,61,71,81" | 0:03'05'' | 0:01'35'' |
| Q30L60X40P001  | 99.55M  |   40.0 | 55749 | 2.45M |  93 |     55749 | 2.45M |  85 |       844 |  6.42K |   8 | "31,41,51,61,71,81" | 0:02'52'' | 0:01'35'' |
| Q30L60X40P002  | 99.55M  |   40.0 | 65454 | 2.46M |  75 |     65454 | 2.44M |  62 |      1126 | 13.91K |  13 | "31,41,51,61,71,81" | 0:03'01'' | 0:01'37'' |
| Q30L60X40P003  | 99.55M  |   40.0 | 97954 | 2.45M |  68 |     97954 | 2.45M |  62 |       834 |  4.77K |   6 | "31,41,51,61,71,81" | 0:02'53'' | 0:01'25'' |
| Q30L60X40P004  | 99.55M  |   40.0 | 71924 | 2.45M |  76 |     71924 | 2.45M |  67 |       727 |  6.18K |   9 | "31,41,51,61,71,81" | 0:02'54'' | 0:01'26'' |
| Q30L60X40P005  | 99.55M  |   40.0 | 63766 | 2.45M |  88 |     63766 | 2.44M |  76 |       727 |  8.73K |  12 | "31,41,51,61,71,81" | 0:02'37'' | 0:01'26'' |
| Q30L60X80P000  | 199.09M |   80.0 | 60425 | 2.45M |  76 |     60425 | 2.45M |  70 |       753 |  4.59K |   6 | "31,41,51,61,71,81" | 0:04'00'' | 0:02'03'' |
| Q30L60X80P001  | 199.09M |   80.0 | 68973 | 2.45M |  64 |     68973 | 2.45M |  57 |       844 |  5.27K |   7 | "31,41,51,61,71,81" | 0:04'02'' | 0:02'00'' |
| Q30L60X80P002  | 199.09M |   80.0 | 89791 | 2.45M |  65 |     89791 | 2.45M |  58 |       809 |  5.53K |   7 | "31,41,51,61,71,81" | 0:04'01'' | 0:02'08'' |
| Q30L60X120P000 | 298.64M |  120.0 | 60425 | 2.45M |  74 |     60427 | 2.45M |  67 |       727 |  5.37K |   7 | "31,41,51,61,71,81" | 0:05'23'' | 0:02'27'' |
| Q30L60X120P001 | 298.64M |  120.0 | 71924 | 2.45M |  62 |     71924 | 2.44M |  56 |       844 |   4.9K |   6 | "31,41,51,61,71,81" | 0:05'18'' | 0:02'36'' |
| Q30L60X160P000 | 398.18M |  160.0 | 60427 | 2.45M |  75 |     60427 | 2.45M |  68 |       727 |  5.37K |   7 | "31,41,51,61,71,81" | 0:06'26'' | 0:03'04'' |
| Q30L60X240P000 | 597.27M |  240.0 | 57594 | 2.45M |  86 |     59198 | 2.44M |  74 |       844 |  9.05K |  12 | "31,41,51,61,71,81" | 0:06'47'' | 0:03'28'' |

## Cdip: merge anchors

```bash
# All relative paths below are resolved inside the per-strain working directory.
BASE_NAME=Cdip
cd ${HOME}/data/anchr/${BASE_NAME}

# merge anchors
# The command substitution prints the path of every existing
# Q*L*X*P*/anchor/pe.anchor.fa (in input order, because of -k); those paths
# become the input files of `anchr contained`. Its stdout is piped through
# `faops filter` and written to merge/anchor.contained.fasta, which is then
# passed through `anchr orient` and `anchr merge` in turn.
# NOTE(review): --len/--idt look like minimum overlap length and identity
# thresholds, and `faops filter -a 1000 -l 0` like "keep sequences >= 1000 bp,
# unwrapped lines" - confirm against the anchr and faops help text.
mkdir -p merge
anchr contained \
    $(
        parallel -k --no-run-if-empty -j 6 "
            if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
                echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
            fi
            " ::: 25 30 ::: 60 ::: 40 80 120 160 240 ::: 000 001 002 003 004 005 006
    ) \
    --len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
    -o stdout \
    | faops filter -a 1000 -l 0 stdin merge/anchor.contained.fasta
anchr orient merge/anchor.contained.fasta --len 1000 --idt 0.98 -o merge/anchor.orient.fasta
anchr merge merge/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
    | faops filter -a 1000 -l 0 stdin merge/anchor.merge.fasta

# merge others
# Same three-step pipeline, with the same options, applied to the pe.others.fa
# files; results go to merge/others.*.fasta. The repeated `mkdir -p merge` is a
# no-op when the directory exists and lets this part be run on its own.
mkdir -p merge
anchr contained \
    $(
        parallel -k --no-run-if-empty -j 6 "
            if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.others.fa ]; then
                echo Q{1}L{2}X{3}P{4}/anchor/pe.others.fa
            fi
            " ::: 25 30 ::: 60 ::: 40 80 120 160 240 ::: 000 001 002 003 004 005 006
    ) \
    --len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
    -o stdout \
    | faops filter -a 1000 -l 0 stdin merge/others.contained.fasta
anchr orient merge/others.contained.fasta --len 1000 --idt 0.98 -o merge/others.orient.fasta
anchr merge merge/others.orient.fasta --len 1000 --idt 0.999 -o stdout \
    | faops filter -a 1000 -l 0 stdin merge/others.merge.fasta

# anchors sorted on ref
# sort_on_ref.sh receives: merged anchors, the reference genome, and the
# output name merge/anchor.sort.
bash ~/Scripts/cpan/App-Anchr/share/sort_on_ref.sh merge/anchor.merge.fasta 1_genome/genome.fa merge/anchor.sort

doc/bacteria_2_3.md  view on Meta::CPAN

* BioSample: [SAMN04875536](https://www.ncbi.nlm.nih.gov/biosample/SAMN04875536)

## Hinf: download

* Reference genome

    * Strain: Haemophilus influenzae Rd KW20 (g-proteobacteria)
    * Taxid: [71421](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=71421)
    * RefSeq assembly accession:
      [GCF_000027305.1](ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/027/305/GCF_000027305.1_ASM2730v1/GCF_000027305.1_ASM2730v1_assembly_report.txt)
    * Proportion of paralogs (> 1000 bp): 0.0324

```bash
mkdir -p ~/data/anchr/Hinf/1_genome
cd ~/data/anchr/Hinf/1_genome

aria2c -x 9 -s 3 -c ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/027/305/GCF_000027305.1_ASM2730v1/GCF_000027305.1_ASM2730v1_genomic.fna.gz

TAB=$'\t'
cat <<EOF > replace.tsv
NC_000907.1${TAB}1

doc/e_coli.md  view on Meta::CPAN

| Q20L60X40P005   | 185.67M |   40.0 |  5322 | 4.59M | 1291 |      5531 | 4.41M | 1050 |       805 | 182.01K |  241 | "31,41,51,61,71,81" | 0:07'43'' | 0:00'33'' |
| Q20L60X80P000   | 371.33M |   80.0 |  2129 | 4.51M | 2683 |      2448 |  3.8M | 1737 |       783 |  702.3K |  946 | "31,41,51,61,71,81" | 0:13'15'' | 0:00'36'' |
| Q20L60X80P001   | 371.33M |   80.0 |  2153 | 4.51M | 2634 |      2477 | 3.83M | 1719 |       782 | 679.02K |  915 | "31,41,51,61,71,81" | 0:12'05'' | 0:00'47'' |
| Q20L60X80P002   | 371.33M |   80.0 |  2186 |  4.5M | 2653 |      2525 |  3.8M | 1702 |       772 | 698.13K |  951 | "31,41,51,61,71,81" | 0:12'02'' | 0:00'46'' |
| Q20L60X120P000  |    557M |  120.0 |  1468 | 4.28M | 3375 |      1822 | 3.05M | 1698 |       772 |   1.23M | 1677 | "31,41,51,61,71,81" | 0:17'10'' | 0:00'46'' |
| Q20L60X120P001  |    557M |  120.0 |  1461 | 4.28M | 3372 |      1846 | 3.09M | 1727 |       750 |   1.19M | 1645 | "31,41,51,61,71,81" | 0:16'54'' | 0:00'46'' |
| Q20L60X160P000  | 742.66M |  160.0 |  1207 | 4.05M | 3683 |      1644 | 2.49M | 1527 |       756 |   1.56M | 2156 | "31,41,51,61,71,81" | 0:22'14'' | 0:01'06'' |
| Q20L60X200P000  | 928.33M |  200.0 |  1089 | 3.87M | 3791 |      1557 | 2.13M | 1361 |       740 |   1.74M | 2430 | "31,41,51,61,71,81" | 0:28'42'' | 0:01'10'' |
| Q20L90X40P000   | 185.67M |   40.0 |  6570 | 4.61M | 1008 |      6725 | 4.49M |  879 |       856 | 117.92K |  129 | "31,41,51,61,71,81" | 0:08'32'' | 0:00'31'' |
| Q20L90X40P001   | 185.67M |   40.0 |  7208 | 4.59M | 1012 |      7356 |  4.5M |  890 |       795 |  93.56K |  122 | "31,41,51,61,71,81" | 0:09'17'' | 0:00'23'' |
| Q20L90X40P002   | 185.67M |   40.0 |  6970 | 4.59M | 1005 |      7253 |  4.5M |  890 |       769 |  84.54K |  115 | "31,41,51,61,71,81" | 0:08'24'' | 0:00'24'' |
| Q20L90X40P003   | 185.67M |   40.0 |  7017 | 4.59M | 1014 |      7125 | 4.49M |  886 |       832 |  97.74K |  128 | "31,41,51,61,71,81" | 0:08'37'' | 0:00'27'' |
| Q20L90X40P004   | 185.67M |   40.0 |  6957 | 4.59M | 1005 |      7184 | 4.49M |  888 |       800 |  92.41K |  117 | "31,41,51,61,71,81" | 0:07'13'' | 0:00'23'' |
| Q20L90X40P005   | 185.67M |   40.0 |  6736 | 4.59M | 1010 |      6980 | 4.49M |  876 |       811 | 101.03K |  134 | "31,41,51,61,71,81" | 0:06'35'' | 0:00'25'' |
| Q20L90X80P000   | 371.33M |   80.0 |  3098 | 4.59M | 2005 |      3287 | 4.24M | 1525 |       777 | 358.32K |  480 | "31,41,51,61,71,81" | 0:12'00'' | 0:00'27'' |
| Q20L90X80P001   | 371.33M |   80.0 |  3045 | 4.59M | 2019 |      3280 | 4.24M | 1540 |       783 | 355.31K |  479 | "31,41,51,61,71,81" | 0:13'23'' | 0:00'25'' |
| Q20L90X80P002   | 371.33M |   80.0 |  3172 | 4.58M | 1969 |      3342 | 4.23M | 1492 |       770 | 350.42K |  477 | "31,41,51,61,71,81" | 0:11'49'' | 0:00'27'' |
| Q20L90X120P000  |    557M |  120.0 |  2165 | 4.55M | 2630 |      2473 | 3.91M | 1768 |       773 | 639.84K |  862 | "31,41,51,61,71,81" | 0:14'55'' | 0:00'41'' |
| Q20L90X120P001  |    557M |  120.0 |  2232 | 4.53M | 2568 |      2519 |  3.9M | 1716 |       774 | 630.62K |  852 | "31,41,51,61,71,81" | 0:14'05'' | 0:01'03'' |
| Q20L90X160P000  | 742.66M |  160.0 |  1852 | 4.48M | 2899 |      2174 | 3.68M | 1811 |       772 | 804.27K | 1088 | "31,41,51,61,71,81" | 0:18'06'' | 0:01'07'' |
| Q20L90X200P000  | 928.33M |  200.0 |  1732 | 4.45M | 3035 |      2069 | 3.56M | 1826 |       763 | 886.28K | 1209 | "31,41,51,61,71,81" | 0:24'00'' | 0:00'51'' |
| Q20L120X40P000  | 185.67M |   40.0 |  8832 |  4.6M |  838 |      8954 | 4.49M |  740 |       891 | 101.73K |   98 | "31,41,51,61,71,81" | 0:06'15'' | 0:00'24'' |
| Q20L120X40P001  | 185.67M |   40.0 |  8577 | 4.58M |  836 |      8892 | 4.51M |  742 |       788 |  69.07K |   94 | "31,41,51,61,71,81" | 0:06'47'' | 0:00'26'' |
| Q20L120X40P002  | 185.67M |   40.0 |  8147 | 4.58M |  859 |      8263 |  4.5M |  756 |       795 |  75.92K |  103 | "31,41,51,61,71,81" | 0:06'57'' | 0:00'24'' |
| Q20L120X40P003  | 185.67M |   40.0 |  8864 | 4.57M |  819 |      8970 |  4.5M |  727 |       860 |  75.55K |   92 | "31,41,51,61,71,81" | 0:06'51'' | 0:00'24'' |
| Q20L120X40P004  | 185.67M |   40.0 |  8495 | 4.58M |  856 |      8659 |  4.5M |  755 |       810 |  75.85K |  101 | "31,41,51,61,71,81" | 0:06'23'' | 0:00'27'' |
| Q20L120X80P000  | 371.33M |   80.0 |  4501 | 4.59M | 1519 |      4648 | 4.38M | 1231 |       809 | 217.48K |  288 | "31,41,51,61,71,81" | 0:10'09'' | 0:00'33'' |
| Q20L120X80P001  | 371.33M |   80.0 |  4373 |  4.6M | 1530 |      4690 | 4.38M | 1240 |       804 | 219.06K |  290 | "31,41,51,61,71,81" | 0:10'30'' | 0:00'35'' |
| Q20L120X120P000 |    557M |  120.0 |  3256 | 4.58M | 1931 |      3501 | 4.24M | 1474 |       795 | 344.09K |  457 | "31,41,51,61,71,81" | 0:14'16'' | 0:00'43'' |
| Q20L120X160P000 | 742.66M |  160.0 |  2868 | 4.58M | 2131 |      3138 | 4.16M | 1576 |       795 | 417.96K |  555 | "31,41,51,61,71,81" | 0:20'19'' | 0:00'56'' |
| Q20L120X200P000 | 928.33M |  200.0 |  2730 | 4.57M | 2203 |      2999 | 4.12M | 1604 |       801 | 451.19K |  599 | "31,41,51,61,71,81" | 0:23'31'' | 0:00'49'' |
| Q25L30X40P000   | 185.67M |   40.0 | 50567 | 4.55M |  196 |     50567 | 4.53M |  179 |       754 |     13K |   17 | "31,41,51,61,71,81" | 0:07'33'' | 0:00'45'' |
| Q25L30X40P001   | 185.67M |   40.0 | 38554 | 4.55M |  215 |     40089 | 4.53M |  196 |       754 |  14.03K |   19 | "31,41,51,61,71,81" | 0:06'56'' | 0:00'29'' |
| Q25L30X40P002   | 185.67M |   40.0 | 41181 | 4.55M |  203 |     41181 | 4.53M |  184 |       812 |  14.21K |   19 | "31,41,51,61,71,81" | 0:05'51'' | 0:00'29'' |
| Q25L30X40P003   | 185.67M |   40.0 | 39467 | 4.55M |  210 |     39467 | 4.53M |  193 |       812 |  12.79K |   17 | "31,41,51,61,71,81" | 0:06'00'' | 0:00'25'' |

doc/model_organisms.md  view on Meta::CPAN


```

## s288c: expand anchors

在酿酒酵母中, 有下列几组完全相同的序列, 它们都是新近发生的片段重复:

* I:216563-218385, VIII:537165-538987
* I:223713-224783, VIII:550350-551420
* IV:528442-530427, IV:532327-534312, IV:536212-538197
* IV:530324-531519, IV:534209-535404
* IV:5645-7725, X:738076-740156
* IV:7810-9432, X:736368-737990
* IX:9683-11043, X:9666-11026
* IV:1244112-1245373, XV:575980-577241
* VIII:212266-214124, VIII:214264-216122
* IX:11366-14953, X:11349-14936
* XII:468935-470576, XII:472587-474228, XII:482167-483808, XII:485819-487460
* XII:483798-485798, XII:487450-489450

* anchorLong

doc/model_organisms.md  view on Meta::CPAN

tar xvfz Dro5_29NOV2013_402.tgz --directory untar
tar xvfz Dro6_1DEC2013_403.tgz --directory untar

find . -type f -name "*.ba?.h5" | parallel -j 1 "mv {} untar" 

# convert .bax.h5 to .subreads.bam
mkdir -p ~/data/anchr/iso_1/3_pacbio/bam
cd ~/data/anchr/iso_1/3_pacbio/bam

source ~/share/pitchfork/deployment/setup-env.sh
for movie in m131124_190051 m131124_221952 m131125_013854 m131125_045830 m131130_054035 m131130_091217 m131130_124231 m131130_161213 m131130_194336 m131130_231441 m131201_024805 m131201_061903 m131201_223357 m131202_020424 m131202_053545 m131202_0905...
do 
    if [ -e ~/data/anchr/iso_1/3_pacbio/bam/${movie}*.subreads.bam ]; then
        continue
    fi
    bax2bam ~/data/anchr/iso_1/3_pacbio/untar/${movie}*.bax.h5
done

# convert .subreads.bam to fasta
mkdir -p ~/data/anchr/iso_1/3_pacbio/fasta
for movie in m131124_190051 m131124_221952 m131125_013854 m131125_045830 m131130_054035 m131130_091217 m131130_124231 m131130_161213 m131130_194336 m131130_231441 m131201_024805 m131201_061903 m131201_223357 m131202_020424 m131202_053545 m131202_0905...
do
    if [ ! -e ~/data/anchr/iso_1/3_pacbio/bam/${movie}*.subreads.bam ]; then
        continue
    fi

    samtools fasta \
        ~/data/anchr/iso_1/3_pacbio/bam/${movie}*.subreads.bam \
        > ~/data/anchr/iso_1/3_pacbio/fasta/${movie}.fasta
done

doc/model_organisms.md  view on Meta::CPAN


cat stat2.md
```

| Name          | SumCor | CovCor | N50SR |    Sum |     # | N50Anchor |    Sum |     # | N50Others |    Sum |    # |                Kmer | RunTimeKU | RunTimeAN |
|:--------------|-------:|-------:|------:|-------:|------:|----------:|-------:|------:|----------:|-------:|-----:|--------------------:|----------:|:----------|
| Q25L60X30P000 |  3.01G |   30.0 | 10992 | 98.21M | 22237 |     11789 | 85.96M | 13658 |      2311 | 12.25M | 8579 | "31,41,51,61,71,81" | 1:14'53'' | 0:07'05'' |
| Q25L60X30P001 |  3.01G |   30.0 | 10433 | 97.91M | 23035 |     11351 | 85.44M | 13871 |      1562 | 12.47M | 9164 | "31,41,51,61,71,81" | 1:11'02'' | 0:07'05'' |
| Q25L60X60P000 |  6.02G |   60.0 | 11787 | 99.24M | 19543 |     12467 | 88.53M | 12969 |      4629 | 10.71M | 6574 | "31,41,51,61,71,81" | 1:25'03'' | 0:08'23'' |
| Q30L60X30P000 |  3.01G |   30.0 | 10914 | 97.81M | 22843 |     11752 | 85.35M | 13764 |      1510 | 12.46M | 9079 | "31,41,51,61,71,81" | 0:59'47'' | 0:07'06'' |
| Q30L60X30P001 |  3.01G |   30.0 | 10160 | 97.39M | 24052 |     10924 | 84.51M | 14218 |      1255 | 12.87M | 9834 | "31,41,51,61,71,81" | 0:45'09'' | 0:07'05'' |
| Q30L60X60P000 |  6.02G |   60.0 | 12462 |  99.3M | 19363 |     12943 | 88.29M | 12773 |      5901 | 11.01M | 6590 | "31,41,51,61,71,81" | 0:59'46'' | 0:08'02'' |

## n2: merge anchors

```bash
BASE_NAME=n2
cd ${HOME}/data/anchr/${BASE_NAME}

# merge anchors
mkdir -p merge

doc/model_organisms.md  view on Meta::CPAN

# Download the soft-masked TAIR10 toplevel FASTA and write genome.fa from it
# with `faops order`, using the name list 1 2 3 4 5 Mt Pt.
GENOME_DIR=~/data/anchr/col_0/1_genome
mkdir -p "${GENOME_DIR}"
cd "${GENOME_DIR}"

wget -N \
    ftp://ftp.ensemblgenomes.org/pub/release-29/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.29.dna_sm.toplevel.fa.gz

# The process substitution supplies one sequence name per line.
faops order \
    Arabidopsis_thaliana.TAIR10.29.dna_sm.toplevel.fa.gz \
    <(printf '%s\n' 1 2 3 4 5 Mt Pt) \
    genome.fa
```

* Illumina HiSeq (100 bp)

    [SRX202246](https://www.ncbi.nlm.nih.gov/sra/SRX202246[accn])

```bash
# Downloading from ena with aria2
mkdir -p ~/data/anchr/col_0/2_illumina
cd ~/data/anchr/col_0/2_illumina

cat << EOF > sra_ftp.txt
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR611/SRR611086
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR616/SRR616966
EOF

doc/model_organisms.md  view on Meta::CPAN


https://www.ncbi.nlm.nih.gov/biosample/4539665

[SRX1715692](https://www.ncbi.nlm.nih.gov/sra/SRX1715692[accn])

```bash
mkdir -p ~/data/anchr/col_0/3_pacbio
cd ~/data/anchr/col_0/3_pacbio

cat <<EOF > sra_ftp.txt
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/002/SRR3405242
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/003/SRR3405243
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/004/SRR3405244
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/006/SRR3405246
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/008/SRR3405248
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/000/SRR3405250
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/002/SRR3405252
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/003/SRR3405253
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/004/SRR3405254
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/005/SRR3405255
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/006/SRR3405256
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/007/SRR3405257
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/008/SRR3405258
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/009/SRR3405259
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/005/SRR3405245
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/007/SRR3405247
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/009/SRR3405249
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/001/SRR3405251
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/000/SRR3405260
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/003/SRR3405263
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/005/SRR3405265
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/007/SRR3405267
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/009/SRR3405269
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/001/SRR3405271
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/004/SRR3405274
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/005/SRR3405275
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/006/SRR3405276

doc/model_organisms.md  view on Meta::CPAN

ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/006/SRR3405266
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/008/SRR3405268
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/000/SRR3405270
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/002/SRR3405272
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/003/SRR3405273
EOF

aria2c -x 6 -s 3 -c -i sra_ftp.txt

cat << EOF > sra_md5.txt
be9c803f847ff1c81d153110cc699390        SRR3405242
c68a2c3b62245a697722fd3f8fda7a2d        SRR3405243
7116e8a0de87b1acd016d9b284e4795c        SRR3405244
51f8e5ee4565aace4e5a5cba73e3e597        SRR3405246
f339f580e86aad3a5487b5cec8ae80d4        SRR3405248
1a8246ed1f7c38801cfc603e088abb70        SRR3405250
a0ce8435a7fa2e7ddbd6ac181902f751        SRR3405252
8754f69a1c8c1f00b58b48454c1c01ad        SRR3405253
367508500303325e855666133505a5af        SRR3405254
d250f69fcf2975c89ceab5a4f9425b36        SRR3405255
badd9b2d23f94d1c98263d2e786742ae        SRR3405256
6c5cbd3bce9459283a415d8a5c05c86e        SRR3405257
32da7a364c8cbda5cf76b87f7c51b475        SRR3405258
eb3819adf483451ac670f89d1ea6b76e        SRR3405259
5337862eeb0945f932de74e8f7b9ec4f        SRR3405245
4545ce4666878fcbcda1e7737be1896b        SRR3405247
71d61bc64e3ca9b91f08b1c6b1389f16        SRR3405249
b9a911b8eb4fbfe29dff8cf920429f18        SRR3405251
99bae070fa90d53c8f15b9cf42c634f6        SRR3405260
830e02f1f3cb66b9e085803a21ad8040        SRR3405263
86d28c63f00095ae0ff1151e7e0bf7b4        SRR3405265
3e048ad8dbb526d4a533ee1d5ec10a43        SRR3405267
1b73ed3a1124f5f025c511672c1e18d3        SRR3405269
fa07c85b9e6258abcef8bdb730ab812f        SRR3405271
aeb6ab7edfa42e5e27704b7625c659c1        SRR3405274
0eb24fcc9b40f6fe0f013fe79dd7edf7        SRR3405275
f051e0065602477e0a1d13a6d0a42d3d        SRR3405276

doc/model_organisms.md  view on Meta::CPAN

| Name            |      N50 |         Sum |        # |
|:----------------|---------:|------------:|---------:|
| Genome          | 23459830 |   119667750 |        7 |
| Paralogs        |     2007 |    16447809 |     8055 |
| Illumina        |      301 | 15529845059 | 53786130 |
| uniq            |      301 | 15528150050 | 53779068 |
| Q20L60          |      301 | 13359936477 | 52318516 |
| Q25L60          |      301 | 11821537855 | 49650904 |
| Q30L60          |      301 | 10366980114 | 48122656 |
| PacBio          |     6754 | 18768526777 |  5721958 |
| PacBio.40x      |     7830 |  4906030224 |  1300000 |
| PacBio.40x.trim |     6904 |  2032710549 |   381134 |
| PacBio.80x      |     7448 |  9473394614 |  2600000 |
| PacBio.80x.trim |     6975 |  3942522483 |   729527 |

## col_0: spades

```bash
BASE_NAME=col_0
cd ${HOME}/data/anchr/${BASE_NAME}

doc/pacbio_consensus.md  view on Meta::CPAN


```bash
# Convert the two Sequel demo subreads BAMs to FASTA in a dedicated directory.
mkdir -p $HOME/data/pacbio/rawdata/ler0_test/fasta
cd $HOME/data/pacbio/rawdata/ler0_test/fasta

SEQ_DATA=~/data/pacbio/rawdata/public/SequelData/ArabidopsisDemoData/SequenceData
for CELL_MOVIE in \
    1_A01_customer/m54113_160913_184949 \
    3_C01_customer/m54113_160914_092411
do
    # Each FASTA is named after the movie alone (the text after the last "/").
    samtools fasta "${SEQ_DATA}/${CELL_MOVIE}.subreads.bam" \
        > "${CELL_MOVIE##*/}.fasta"
done

# The three commented lines appear to be the recorded output of the command
# that follows them.
#N50     70763
#S       10753458447
#C       1135065
faops n50 -C -S *.fasta
```

## 其它模式生物

用这篇文章里提供的样例, doi:10.1038/sdata.2014.45.

t/24_4.ovlp.tsv  view on Meta::CPAN

anchor/282/0_2680	long/5011/0_28061	2680	0.905	0	0	2680	2680	0	5864	8699	28061	contained
anchor/282/0_2680	long/4614/0_23784	2680	0.904	0	0	2680	2680	0	18900	21712	23784	contained
anchor/282/0_2680	long/14357/0_23576	1372	0.914	0	0	1372	2680	0	22147	23576	23576	overlap
anchor/282/0_2680	long/7809/0_20200	2680	0.909	0	0	2680	2680	0	9909	12648	20200	contained
anchor/282/0_2680	long/6479/0_19161	2680	0.872	0	0	2680	2680	0	12964	15616	19161	contained
anchor/282/0_2680	long/14135/0_18323	2680	0.901	0	0	2680	2680	0	10784	13531	18323	contained
anchor/282/0_2680	long/9192/0_18207	2680	0.897	0	0	2680	2680	0	3663	6411	18207	contained
anchor/282/0_2680	long/9672/0_17039	2680	0.902	0	0	2680	2680	0	271	3121	17039	contained
anchor/282/0_2680	long/6096/0_9538	2680	0.886	0	0	2680	2680	0	6157	8862	9538	contained
anchor/282/0_2680	long/6019/0_9345	2680	0.874	0	0	2680	2680	0	4944	7805	9345	contained
anchor/282/0_2680	long/6005/0_8970	2680	0.878	0	0	2680	2680	0	2443	5282	8970	contained
anchor/282/0_2680	long/2618/0_8586	1146	0.865	0	1534	2680	2680	0	0	1221	8586	overlap
anchor/306/0_2073	long/5011/0_28061	2073	0.901	0	0	2073	2073	0	8657	10858	28061	contained
anchor/306/0_2073	long/14475/0_26732	2073	0.888	0	0	2073	2073	0	160	2354	26732	contained
anchor/306/0_2073	long/4614/0_23784	2010	0.908	0	0	2010	2073	0	21673	23784	23784	overlap
anchor/306/0_2073	long/5288/0_22087	2073	0.909	0	0	2073	2073	0	1530	3659	22087	contained
anchor/306/0_2073	long/7809/0_20200	2073	0.922	0	0	2073	2073	0	12613	14731	20200	contained
anchor/306/0_2073	long/6479/0_19161	2073	0.862	0	0	2073	2073	0	15580	17617	19161	contained
anchor/306/0_2073	long/14135/0_18323	2073	0.905	0	0	2073	2073	0	13496	15644	18323	contained
anchor/306/0_2073	long/9192/0_18207	2073	0.912	0	0	2073	2073	0	6375	8501	18207	contained
anchor/306/0_2073	long/4795/0_17190	1361	0.909	0	712	2073	2073	0	0	1392	17190	overlap

t/24_4.ovlp.tsv  view on Meta::CPAN

long/4795/0_17190	anchor/306/0_2073	1392	0.909	0	0	1392	17190	0	712	2073	2073	overlap
long/4795/0_17190	anchor/311/0_1888	1944	0.917	0	1358	3302	17190	0	0	1888	1888	contains
long/9672/0_17039	anchor/282/0_2680	2850	0.902	0	271	3121	17039	0	0	2680	2680	contains
long/9672/0_17039	anchor/306/0_2073	2198	0.904	0	3082	5280	17039	0	0	2073	2073	contains
long/9672/0_17039	anchor/311/0_1888	1991	0.914	0	5244	7235	17039	0	0	1888	1888	contains
long/6096/0_9538	anchor/201/0_4965	4986	0.888	0	1186	6172	9538	0	0	4965	4965	contains
long/6096/0_9538	anchor/282/0_2680	2705	0.886	0	6157	8862	9538	0	0	2680	2680	contains
long/6019/0_9345	anchor/201/0_4965	4960	0.885	0	0	4960	9345	0	209	4965	4965	overlap
long/6019/0_9345	anchor/282/0_2680	2861	0.874	0	4944	7805	9345	0	0	2680	2680	contains
long/6019/0_9345	anchor/306/0_2073	1581	0.857	0	7764	9345	9345	0	0	1507	2073	overlap
long/6005/0_8970	anchor/282/0_2680	2839	0.878	0	2443	5282	8970	0	0	2680	2680	contains
long/6005/0_8970	anchor/306/0_2073	2204	0.855	0	5245	7449	8970	0	0	2073	2073	contains
long/2618/0_8586	anchor/282/0_2680	1221	0.865	0	0	1221	8586	0	1534	2680	2680	overlap
long/2618/0_8586	anchor/306/0_2073	2188	0.910	0	1185	3373	8586	0	0	2073	2073	contains
long/2618/0_8586	anchor/311/0_1888	1962	0.911	0	3344	5306	8586	0	0	1888	1888	contains
long/2002/0_8381	anchor/306/0_2073	1834	0.912	0	0	1834	8381	0	313	2073	2073	overlap
long/2002/0_8381	anchor/311/0_1888	1991	0.899	0	1802	3793	8381	0	0	1888	1888	contains
long/10968/0_7486	anchor/306/0_2073	2154	0.899	0	1474	3628	7486	0	0	2073	2073	contains
long/10968/0_7486	anchor/311/0_1888	1948	0.906	0	3596	5544	7486	0	0	1888	1888	contains
long/4796/0_6610	anchor/306/0_2073	1399	0.883	0	0	1399	6610	0	713	2073	2073	overlap
long/4796/0_6610	anchor/311/0_1888	1903	0.911	0	1367	3270	6610	0	0	1888	1888	contains



( run in 0.641 second using v1.01-cache-2.11-cpan-7add2cbd662 )