App-Anchr
view release on metacpan or search on metacpan
doc/bacteria_2_3.md view on Meta::CPAN
cd 2_illumina/Q{1}L{2}
if [ -e R1.fq.gz ]; then
echo ' R1.fq.gz already presents'
exit;
fi
anchr trim \
--noscythe \
-q {1} -l {2} \
../R1.scythe.fq.gz ../R2.scythe.fq.gz \
-o stdout \
| bash
" ::: 20 25 30 ::: 60
```
* Stats
```bash
BASE_DIR=$HOME/data/anchr/Vpar
cd ${BASE_DIR}
printf "| %s | %s | %s | %s |\n" \
"Name" "N50" "Sum" "#" \
> stat.md
printf "|:--|--:|--:|--:|\n" >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "Genome"; faops n50 -H -S -C 1_genome/genome.fa;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "Paralogs"; faops n50 -H -S -C 1_genome/paralogs.fas;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "Illumina"; faops n50 -H -S -C 2_illumina/R1.fq.gz 2_illumina/R2.fq.gz;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "PacBio"; faops n50 -H -S -C 3_pacbio/pacbio.fasta;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "uniq"; faops n50 -H -S -C 2_illumina/R1.uniq.fq.gz 2_illumina/R2.uniq.fq.gz;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "scythe"; faops n50 -H -S -C 2_illumina/R1.scythe.fq.gz 2_illumina/R2.scythe.fq.gz;) >> stat.md
for qual in 20 25 30; do
for len in 60; do
DIR_COUNT="${BASE_DIR}/2_illumina/Q${qual}L${len}"
printf "| %s | %s | %s | %s |\n" \
$(echo "Q${qual}L${len}"; faops n50 -H -S -C ${DIR_COUNT}/R1.fq.gz ${DIR_COUNT}/R2.fq.gz;) \
>> stat.md
done
done
cat stat.md
```
| Name | N50 | Sum | # |
|:---------|--------:|-----------:|---------:|
| Genome | 3288558 | 5165770 | 2 |
| Paralogs | 3333 | 155714 | 62 |
| Illumina | 101 | 1368727962 | 13551762 |
| PacBio | 11771 | 1228497092 | 143537 |
| uniq | 101 | 1361783404 | 13483004 |
| scythe | 101 | 1346787728 | 13483004 |
| Q20L60 | 101 | 1264469138 | 12611522 |
| Q25L60 | 101 | 1200269501 | 12011552 |
| Q30L60 | 101 | 1080002384 | 10917028 |
## Vpar: down sampling
```bash
BASE_DIR=$HOME/data/anchr/Vpar
cd ${BASE_DIR}
ARRAY=(
"2_illumina/Q20L60:Q20L60:5000000"
"2_illumina/Q25L60:Q25L60:5000000"
"2_illumina/Q30L60:Q30L60:5000000"
)
for group in "${ARRAY[@]}" ; do
GROUP_DIR=$(group=${group} perl -e '@p = split q{:}, $ENV{group}; print $p[0];')
GROUP_ID=$( group=${group} perl -e '@p = split q{:}, $ENV{group}; print $p[1];')
GROUP_MAX=$(group=${group} perl -e '@p = split q{:}, $ENV{group}; print $p[2];')
printf "==> %s \t %s \t %s\n" "$GROUP_DIR" "$GROUP_ID" "$GROUP_MAX"
perl -e 'print 1000000 * $_, qq{\n} for 1 .. 5' \
| parallel --no-run-if-empty -j 4 "
if [[ {} -gt '$GROUP_MAX' ]]; then
exit;
fi
echo ' ${GROUP_ID}_{}'
mkdir -p ${BASE_DIR}/${GROUP_ID}_{}
if [ -e ${BASE_DIR}/${GROUP_ID}_{}/R1.fq.gz ]; then
exit;
fi
seqtk sample -s{} \
${BASE_DIR}/${GROUP_DIR}/R1.fq.gz {} \
| pigz -p 4 -c > ${BASE_DIR}/${GROUP_ID}_{}/R1.fq.gz
seqtk sample -s{} \
${BASE_DIR}/${GROUP_DIR}/R2.fq.gz {} \
| pigz -p 4 -c > ${BASE_DIR}/${GROUP_ID}_{}/R2.fq.gz
"
done
```
```bash
BASE_DIR=$HOME/data/anchr/Vpar
cd ${BASE_DIR}
head -n 50000 3_pacbio/pacbio.fasta > 3_pacbio/pacbio.40x.fasta
faops n50 -S -C 3_pacbio/pacbio.40x.fasta
head -n 100000 3_pacbio/pacbio.fasta > 3_pacbio/pacbio.80x.fasta
faops n50 -S -C 3_pacbio/pacbio.80x.fasta
```
doc/bacteria_2_3.md view on Meta::CPAN
if [ -e Q{1}L{2}X{3}P{4}/k_unitigs.fasta ]; then
echo >&2 ' k_unitigs.fasta already presents'
exit;
fi
mkdir -p Q{1}L{2}X{3}P{4}
cd Q{1}L{2}X{3}P{4}
anchr kunitigs \
../2_illumina/Q{1}L{2}X{3}P{4}/pe.cor.fa \
../2_illumina/Q{1}L{2}X{3}P{4}/environment.json \
-p 8 \
--kmer 31,41,51,61,71,81 \
-o kunitigs.sh
bash kunitigs.sh
echo >&2
" ::: 25 30 ::: 60 ::: 40 80 120 160 ::: 000 001 002 003 004 005 006
# anchors (sampled)
parallel --no-run-if-empty -j 3 "
echo >&2 '==> Group Q{1}L{2}X{3}P{4}'
if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
exit;
fi
rm -fr Q{1}L{2}X{3}P{4}/anchor
bash ~/Scripts/cpan/App-Anchr/share/anchor.sh Q{1}L{2}X{3}P{4} 8 false
echo >&2
" ::: 25 30 ::: 60 ::: 40 80 120 160 ::: 000 001 002 003 004 005 006
# Stats of anchors
REAL_G=1892775
bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 header \
> stat2.md
parallel -k --no-run-if-empty -j 6 "
if [ ! -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
exit;
fi
bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 Q{1}L{2}X{3}P{4} ${REAL_G}
" ::: 25 30 ::: 60 ::: 40 80 120 160 ::: 000 001 002 003 004 005 006 \
>> stat2.md
cat stat2.md
```
| Name | SumCor | CovCor | N50SR | Sum | # | N50Anchor | Sum | # | N50Others | Sum | # | Kmer | RunTimeKU | RunTimeAN |
|:---------------|--------:|-------:|------:|------:|---:|----------:|------:|---:|----------:|-------:|--:|--------------------:|----------:|:----------|
| Q25L60X40P000 | 75.71M | 40.0 | 35248 | 1.8M | 72 | 35248 | 1.8M | 71 | 865 | 865 | 1 | "31,41,51,61,71,81" | 0:01'13'' | 0:00'52'' |
| Q25L60X40P001 | 75.71M | 40.0 | 32751 | 1.8M | 75 | 32751 | 1.79M | 72 | 4293 | 9.43K | 3 | "31,41,51,61,71,81" | 0:01'15'' | 0:00'52'' |
| Q25L60X40P002 | 75.71M | 40.0 | 32751 | 1.8M | 76 | 32751 | 1.79M | 73 | 4293 | 9.45K | 3 | "31,41,51,61,71,81" | 0:01'13'' | 0:00'54'' |
| Q25L60X40P003 | 75.71M | 40.0 | 32751 | 1.82M | 75 | 32803 | 1.77M | 72 | 23232 | 47.32K | 3 | "31,41,51,61,71,81" | 0:01'14'' | 0:00'47'' |
| Q25L60X80P000 | 151.42M | 80.0 | 32751 | 1.8M | 78 | 32751 | 1.8M | 74 | 645 | 2.58K | 4 | "31,41,51,61,71,81" | 0:01'48'' | 0:01'09'' |
| Q25L60X80P001 | 151.42M | 80.0 | 31667 | 1.8M | 79 | 31667 | 1.8M | 77 | 865 | 1.44K | 2 | "31,41,51,61,71,81" | 0:01'49'' | 0:01'12'' |
| Q25L60X120P000 | 227.13M | 120.0 | 32404 | 1.8M | 83 | 32404 | 1.8M | 78 | 650 | 3.55K | 5 | "31,41,51,61,71,81" | 0:02'27'' | 0:01'21'' |
| Q25L60X160P000 | 302.84M | 160.0 | 31667 | 1.8M | 84 | 31667 | 1.8M | 83 | 865 | 865 | 1 | "31,41,51,61,71,81" | 0:03'05'' | 0:01'42'' |
| Q30L60X40P000 | 75.71M | 40.0 | 35248 | 1.8M | 72 | 35248 | 1.8M | 71 | 855 | 855 | 1 | "31,41,51,61,71,81" | 0:01'14'' | 0:00'56'' |
| Q30L60X40P001 | 75.71M | 40.0 | 32751 | 1.84M | 76 | 32813 | 1.76M | 71 | 32374 | 74.19K | 5 | "31,41,51,61,71,81" | 0:01'13'' | 0:00'45'' |
| Q30L60X40P002 | 75.71M | 40.0 | 32751 | 1.8M | 73 | 32751 | 1.8M | 72 | 855 | 855 | 1 | "31,41,51,61,71,81" | 0:01'13'' | 0:00'44'' |
| Q30L60X40P003 | 75.71M | 40.0 | 32741 | 1.8M | 75 | 32741 | 1.8M | 74 | 865 | 865 | 1 | "31,41,51,61,71,81" | 0:01'13'' | 0:00'45'' |
| Q30L60X80P000 | 151.42M | 80.0 | 32751 | 1.8M | 74 | 32751 | 1.8M | 73 | 865 | 865 | 1 | "31,41,51,61,71,81" | 0:01'49'' | 0:01'08'' |
| Q30L60X80P001 | 151.42M | 80.0 | 32751 | 1.8M | 74 | 32751 | 1.8M | 73 | 865 | 865 | 1 | "31,41,51,61,71,81" | 0:01'50'' | 0:01'12'' |
| Q30L60X120P000 | 227.13M | 120.0 | 32751 | 1.8M | 77 | 32751 | 1.8M | 75 | 865 | 1.49K | 2 | "31,41,51,61,71,81" | 0:02'26'' | 0:01'32'' |
| Q30L60X160P000 | 302.84M | 160.0 | 32404 | 1.8M | 79 | 32404 | 1.8M | 77 | 865 | 1.49K | 2 | "31,41,51,61,71,81" | 0:03'00'' | 0:01'37'' |
## Ftul: merge anchors
```bash
BASE_NAME=Ftul
cd ${HOME}/data/anchr/${BASE_NAME}
# merge anchors
mkdir -p merge
anchr contained \
$(
parallel -k --no-run-if-empty -j 6 "
if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
fi
" ::: 25 30 ::: 60 ::: 40 80 120 160 ::: 000 001 002 003 004 005 006
) \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin merge/anchor.contained.fasta
anchr orient merge/anchor.contained.fasta --len 1000 --idt 0.98 -o merge/anchor.orient.fasta
anchr merge merge/anchor.orient.fasta --len 1000 --idt 0.999 -o stdout \
| faops filter -a 1000 -l 0 stdin merge/anchor.merge.fasta
# merge others
mkdir -p merge
anchr contained \
$(
parallel -k --no-run-if-empty -j 6 "
if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.others.fa ]; then
echo Q{1}L{2}X{3}P{4}/anchor/pe.others.fa
fi
" ::: 25 30 ::: 60 ::: 40 80 120 160 ::: 000 001 002 003 004 005 006
) \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin merge/others.contained.fasta
anchr orient merge/others.contained.fasta --len 1000 --idt 0.98 -o merge/others.orient.fasta
anchr merge merge/others.orient.fasta --len 1000 --idt 0.999 -o stdout \
| faops filter -a 1000 -l 0 stdin merge/others.merge.fasta
# anchors sorted on ref
bash ~/Scripts/cpan/App-Anchr/share/sort_on_ref.sh merge/anchor.merge.fasta 1_genome/genome.fa merge/anchor.sort
nucmer -l 200 1_genome/genome.fa merge/anchor.sort.fa
mummerplot -png out.delta -p anchor.sort --large
# mummerplot files
rm *.[fr]plot
rm out.delta
rm *.gp
mv anchor.sort.png merge/
# quast
rm -fr 9_qa
quast --no-check --threads 16 \
-R 1_genome/genome.fa \
merge/anchor.merge.fasta \
merge/others.merge.fasta \
1_genome/paralogs.fas \
--label "merge,others,paralogs" \
doc/bacteria_2_3.md view on Meta::CPAN
contigTrim/group/*.contig.fasta \
> contigTrim/contig.fasta
```
* quast
```bash
BASE_NAME=Ftul
cd ${HOME}/data/anchr/${BASE_NAME}
rm -fr 9_qa_contig
quast --no-check --threads 16 \
-R 1_genome/genome.fa \
merge/anchor.merge.fasta \
merge/anchor.cover.fasta \
anchorLong/contig.fasta \
contigTrim/contig.fasta \
canu-raw-40x/Ftul.contigs.fasta \
canu-raw-80x/Ftul.contigs.fasta \
1_genome/paralogs.fas \
--label "merge,cover,contig,contigTrim,canu-40x,canu-80x,paralogs" \
-o 9_qa_contig
```
* Stats
```bash
BASE_NAME=Ftul
cd ${HOME}/data/anchr/${BASE_NAME}
printf "| %s | %s | %s | %s |\n" \
"Name" "N50" "Sum" "#" \
> stat3.md
printf "|:--|--:|--:|--:|\n" >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "Genome"; faops n50 -H -S -C 1_genome/genome.fa;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "Paralogs"; faops n50 -H -S -C 1_genome/paralogs.fas;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "anchor.merge"; faops n50 -H -S -C merge/anchor.merge.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "others.merge"; faops n50 -H -S -C merge/others.merge.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "anchor.cover"; faops n50 -H -S -C merge/anchor.cover.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "anchorLong"; faops n50 -H -S -C anchorLong/contig.fasta;) >> stat3.md
printf "| %s | %s | %s | %s |\n" \
$(echo "contigTrim"; faops n50 -H -S -C contigTrim/contig.fasta;) >> stat3.md
cat stat3.md
```
| Name | N50 | Sum | # |
|:-------------|--------:|--------:|---:|
| Genome | 1892775 | 1892775 | 1 |
| Paralogs | 33912 | 93531 | 10 |
| anchor.merge | 32813 | 1801122 | 73 |
| others.merge | 32404 | 64274 | 3 |
| anchor.cover | 32813 | 1796007 | 71 |
| anchorLong | 35248 | 1795927 | 70 |
| contigTrim | 1027458 | 1856949 | 4 |
* Clear QxxLxxXxx.
```bash
BASE_NAME=Ftul
cd ${HOME}/data/anchr/${BASE_NAME}
rm -fr 2_illumina/Q{20,25,30}L{1,60,90,120}X*
rm -fr Q{20,25,30}L{1,60,90,120}X*
```
# Haemophilus influenzae FDAARGOS_199, æµæåè¡æè
* Project
[SRP040661](https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP040661)
* Other name: ATCC 51907D; Rd KW20
* BioSample: [SAMN04875536](https://www.ncbi.nlm.nih.gov/biosample/SAMN04875536)
## Hinf: download
* Reference genome
* Strain: Haemophilus influenzae Rd KW20 (g-proteobacteria)
* Taxid: [71421](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=71421)
* RefSeq assembly accession:
[GCF_000027305.1](ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/027/305/GCF_000027305.1_ASM2730v1/GCF_000027305.1_ASM2730v1_assembly_report.txt)
* Proportion of paralogs (> 1000 bp): 0.0324
```bash
mkdir -p ~/data/anchr/Hinf/1_genome
cd ~/data/anchr/Hinf/1_genome
aria2c -x 9 -s 3 -c ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/027/305/GCF_000027305.1_ASM2730v1/GCF_000027305.1_ASM2730v1_genomic.fna.gz
TAB=$'\t'
cat <<EOF > replace.tsv
NC_000907.1${TAB}1
EOF
faops replace GCF_000027305.1_ASM2730v1_genomic.fna.gz replace.tsv genome.fa
cp ~/data/anchr/paralogs/otherbac/Results/Hinf/Hinf.multi.fas paralogs.fas
```
* Illumina
* [SRX2104758](https://www.ncbi.nlm.nih.gov/sra/SRX2104758) SRR4123928
```bash
mkdir -p ~/data/anchr/Hinf/2_illumina
cd ~/data/anchr/Hinf/2_illumina
cat << EOF > sra_ftp.txt
EOF
( run in 1.074 second using v1.01-cache-2.11-cpan-39bf76dae61 )