App-Anchr
view release on metacpan or search on metacpan
doc/model_organisms.md view on Meta::CPAN
independentLong.fasta
find . -type d -name "correction" | xargs rm -fr
# localCor
gzip -d -c -f $(find group -type f -name "*.correctedReads.fasta.gz") \
| faops filter -l 0 stdin stdout \
| grep -E '^>long' -A 1 \
| sed '/^--$/d' \
| faops dazz -a -l 0 stdin stdout \
| pigz -c > localCor.fasta.gz
canu \
-p ${BASE_NAME} -d localCor \
gnuplotTested=true \
genomeSize=${REAL_G} \
-pacbio-corrected localCor.fasta.gz \
-pacbio-corrected anchor.fasta
canu \
-p ${BASE_NAME} -d localCorRaw \
gnuplotTested=true \
genomeSize=${REAL_G} \
-pacbio-raw localCor.fasta.gz \
-pacbio-raw anchor.fasta
canu \
-p ${BASE_NAME} -d localCorIndep \
gnuplotTested=true \
genomeSize=${REAL_G} \
-pacbio-raw localCor.fasta.gz \
-pacbio-raw anchor.fasta \
-pacbio-raw independentLong.fasta
popd
# quast
rm -fr 9_qa_localCor
quast --no-check --threads 16 \
--eukaryote \
-R 1_genome/genome.fa \
localCor/anchor.fasta \
localCor/localCor/${BASE_NAME}.contigs.fasta \
localCor/localCorRaw/${BASE_NAME}.contigs.fasta \
localCor/localCorIndep/${BASE_NAME}.contigs.fasta \
1_genome/paralogs.fas \
--label "anchor,localCor,localCorRaw,localCorIndep,paralogs" \
-o 9_qa_localCor
find . -type d -name "correction" | xargs rm -fr
```
## s288c: expand anchors
在酿酒酵母中, 有下列几组完全相同的序列, 它们都是新近发生的片段重复:
* I:216563-218385, VIII:537165-538987
* I:223713-224783, VIII:550350-551420
* IV:528442-530427, IV:532327-534312, IV:536212-538197
* IV:530324-531519, IV:534209-535404
* IV:5645-7725, X:738076-740156
* IV:7810-9432, X:736368-737990
* IX:9683-11043, X:9666-11026
* IV:1244112-1245373, XV:575980-577241
* VIII:212266-214124, VIII:214264-216122
* IX:11366-14953, X:11349-14936
* XII:468935-470576, XII:472587-474228, XII:482167-483808, XII:485819-487460,
* XII:483798-485798, XII:487450-489450
* anchorLong
```bash
BASE_NAME=s288c
cd ${HOME}/data/anchr/${BASE_NAME}
rm -fr anchorLong
anchr overlap2 \
--parallel 16 \
merge/anchor.merge.fasta \
3_pacbio/pacbio.40x.trim.fasta \
-d anchorLong \
-b 20 --len 1000 --idt 0.85 --all
pushd anchorLong
anchr cover \
--range "1-$(faops n50 -H -N 0 -C anchor.fasta)" \
--len 1000 --idt 0.85 -c 2 \
anchorLong.ovlp.tsv \
-o anchor.cover.json
cat anchor.cover.json | jq "." > environment.json
anchr overlap \
anchor.fasta \
--serial --len 20 --idt 0.9999 \
-o stdout \
| perl -nla -e '
BEGIN {
our %seen;
our %count_of;
}
@F == 13 or next;
$F[3] > 0.9999 or next;
my $pair = join( "-", sort { $a <=> $b } ( $F[0], $F[1], ) );
next if $seen{$pair};
$seen{$pair} = $_;
$count_of{ $F[0] }++;
$count_of{ $F[1] }++;
END {
for my $pair ( keys %seen ) {
my ($f_id, $g_id) = split "-", $pair;
next if $count_of{$f_id} > 2;
next if $count_of{$g_id} > 2;
print $seen{$pair};
}
}
doc/model_organisms.md view on Meta::CPAN
cat << EOF > sra_md5.txt
c0c877f8ba0bba7e26597e415d7591e1 ERR701706
8737074782482ced94418a579bc0e8db ERR701707
e638730be88ee74102511c5091850359 ERR701708
d2bf01cb606e5d2ccad76bd1380e17a3 ERR701709
a51e6c1c09f225f1b6628b614c046ed0 ERR701710
dab2d1f14eff875f456045941a955b51 ERR701711
EOF
md5sum --check sra_md5.txt
for sra in ERR7017{06,07,08,09,10,11}; do
echo ${sra}
fastq-dump --split-files ./${sra}
done
cat ERR7017{06,07,08,09,10,11}_1.fastq > R1.fq
cat ERR7017{06,07,08,09,10,11}_2.fastq > R2.fq
find . -name "*.fq" | parallel -j 2 pigz -p 8
rm *.fastq
```
* PacBio
PacBio provides a dataset of *D. melanogaster* strain
[ISO1](https://github.com/PacificBiosciences/DevNet/wiki/Drosophila-sequence-and-assembly), the
same stock used in the official BDGP reference assemblies. The reads were generated on the PacBio RS II with P5C3 chemistry.
```bash
# Download the PacBio tarballs listed in tgz.txt into 3_pacbio.
mkdir -p ~/data/anchr/iso_1/3_pacbio
cd ~/data/anchr/iso_1/3_pacbio

# One URL per line; aria2c reads the list through -i.
cat > tgz.txt <<EOF
https://s3.amazonaws.com/datasets.pacb.com/2014/Drosophila/raw/Dro1_24NOV2013_398.tgz
https://s3.amazonaws.com/datasets.pacb.com/2014/Drosophila/raw/Dro2_25NOV2013_399.tgz
https://s3.amazonaws.com/datasets.pacb.com/2014/Drosophila/raw/Dro3_26NOV2013_400.tgz
https://s3.amazonaws.com/datasets.pacb.com/2014/Drosophila/raw/Dro4_28NOV2013_401.tgz
https://s3.amazonaws.com/datasets.pacb.com/2014/Drosophila/raw/Dro5_29NOV2013_402.tgz
https://s3.amazonaws.com/datasets.pacb.com/2014/Drosophila/raw/Dro6_1DEC2013_403.tgz
EOF

aria2c -x 9 -s 3 -c -i tgz.txt

# untar
mkdir -p ~/data/anchr/iso_1/3_pacbio/untar
cd ~/data/anchr/iso_1/3_pacbio

# Only Dro1, Dro5 and Dro6 are unpacked. The remaining tarballs are skipped:
#   Dro2_25NOV2013_399.tgz  Dro3_26NOV2013_400.tgz  Dro4_28NOV2013_401.tgz
for run in Dro1_24NOV2013_398 Dro5_29NOV2013_402 Dro6_1DEC2013_403; do
    tar xvfz ${run}.tgz --directory untar
done

# Move every file matching *.ba?.h5 below the current directory into untar/,
# one mv at a time.
find . -type f -name "*.ba?.h5" | parallel -j 1 "mv {} untar"
# convert .bax.h5 to .subreads.bam
# Runs bax2bam once per movie inside 3_pacbio/bam, skipping movies that
# already have a .subreads.bam there.
mkdir -p ~/data/anchr/iso_1/3_pacbio/bam
cd ~/data/anchr/iso_1/3_pacbio/bam
# NOTE(review): presumably this environment script is what puts bax2bam on PATH -- confirm.
source ~/share/pitchfork/deployment/setup-env.sh
for movie in m131124_190051 m131124_221952 m131125_013854 m131125_045830 m131130_054035 m131130_091217 m131130_124231 m131130_161213 m131130_194336 m131130_231441 m131201_024805 m131201_061903 m131201_223357 m131202_020424 m131202_053545 m131202_0905...
do
    # Expand the glob into an array before testing it: `[ -e <glob> ]` aborts
    # with a syntax error as soon as the pattern matches more than one file.
    # When nothing matches, the pattern stays literal and the -e test on the
    # first element is simply false.
    existing=( "${HOME}"/data/anchr/iso_1/3_pacbio/bam/${movie}*.subreads.bam )
    if [ -e "${existing[0]}" ]; then
        continue
    fi
    bax2bam ~/data/anchr/iso_1/3_pacbio/untar/${movie}*.bax.h5
done
# convert .subreads.bam to fasta
# Writes one 3_pacbio/fasta/<movie>.fasta per movie that has a .subreads.bam,
# then concatenates them into 3_pacbio/pacbio.fasta.
mkdir -p ~/data/anchr/iso_1/3_pacbio/fasta
for movie in m131124_190051 m131124_221952 m131125_013854 m131125_045830 m131130_054035 m131130_091217 m131130_124231 m131130_161213 m131130_194336 m131130_231441 m131201_024805 m131201_061903 m131201_223357 m131202_020424 m131202_053545 m131202_0905...
do
    # Expand the glob once. `[ ! -e <glob> ]` breaks when several files match,
    # and samtools fasta would then be handed several inputs at once.
    # An unmatched pattern stays literal, so the -e test below is false.
    bams=( "${HOME}"/data/anchr/iso_1/3_pacbio/bam/${movie}*.subreads.bam )
    if [ ! -e "${bams[0]}" ]; then
        continue
    fi
    # Convert each matching BAM on its own and concatenate into one fasta
    # per movie (identical to the original when exactly one file matches).
    for bam in "${bams[@]}"; do
        samtools fasta "${bam}"
    done \
        > ~/data/anchr/iso_1/3_pacbio/fasta/${movie}.fasta
done

# Merge all per-movie fasta files into a single read set.
cd ~/data/anchr/iso_1
cat 3_pacbio/fasta/*.fasta > 3_pacbio/pacbio.fasta

# pacbio.40x.fasta is only a symlink to the full read set here.
cd 3_pacbio/
ln -s pacbio.fasta pacbio.40x.fasta
```
* FastQC
```bash
# FastQC on 2_illumina/R1.fq.gz and R2.fq.gz; reports go to 2_illumina/fastqc.
BASE_NAME=iso_1
cd ${HOME}/data/anchr/${BASE_NAME}

mkdir -p 2_illumina/fastqc
cd 2_illumina/fastqc

# The reads live one level up from the report directory.
fastqc -t 16 -o . ../R1.fq.gz ../R2.fq.gz
```
## iso_1: preprocess Illumina reads
* qual: 20, 25, and 30
* len: 60
```bash
BASE_NAME=iso_1
cd ${HOME}/data/anchr/${BASE_NAME}
if [ ! -e 2_illumina/R1.uniq.fq.gz ]; then
tally \
--pair-by-offset --with-quality --nozip --unsorted \
-i 2_illumina/R1.fq.gz \
-j 2_illumina/R2.fq.gz \
-o 2_illumina/R1.uniq.fq \
-p 2_illumina/R2.uniq.fq
parallel --no-run-if-empty -j 2 "
pigz -p 4 2_illumina/{}.uniq.fq
" ::: R1 R2
fi
parallel --no-run-if-empty -j 3 "
mkdir -p 2_illumina/Q{1}L{2}
cd 2_illumina/Q{1}L{2}
if [ -e R1.fq.gz ]; then
doc/model_organisms.md view on Meta::CPAN
anchr kunitigs \
../2_illumina/Q{1}L{2}X{3}P{4}/pe.cor.fa \
../2_illumina/Q{1}L{2}X{3}P{4}/environment.json \
-p 16 \
--kmer 31,41,51,61,71,81 \
-o kunitigs.sh
bash kunitigs.sh
echo >&2
" ::: 25 30 ::: 60 ::: 30 60 ::: 000 001 002 003 004 005 006
# anchors (sampled)
# For each Q{1}L{2}X{3}P{4} group (one value from every ::: list), run
# `anchr anchors` on the group's k_unitigs.fasta and pe.cor.fa to generate
# anchors.sh, then execute that script inside <group>/anchor.
# A group is skipped when pe.cor.fa is missing or when anchor/pe.anchor.fa
# already exists; otherwise its anchor directory is rebuilt from scratch.
# The "already presents" message now names pe.anchor.fa, the file that is
# actually tested (it used to say k_unitigs.fasta).
parallel --no-run-if-empty -j 3 "
    echo >&2 '==> Group Q{1}L{2}X{3}P{4}'

    if [ ! -e Q{1}L{2}X{3}P{4}/pe.cor.fa ]; then
        echo >&2 ' pe.cor.fa not exists'
        exit;
    fi

    if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
        echo >&2 ' pe.anchor.fa already presents'
        exit;
    fi

    rm -fr Q{1}L{2}X{3}P{4}/anchor
    mkdir -p Q{1}L{2}X{3}P{4}/anchor
    cd Q{1}L{2}X{3}P{4}/anchor
    anchr anchors \
        ../k_unitigs.fasta \
        ../pe.cor.fa \
        -p 8 \
        -o anchors.sh
    bash anchors.sh

    echo >&2
    " ::: 25 30 ::: 60 ::: 30 60 ::: 000 001 002 003 004 005 006
# Stats of anchors
bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 header \
> stat2.md
parallel -k --no-run-if-empty -j 6 "
if [ ! -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
exit;
fi
bash ~/Scripts/cpan/App-Anchr/share/sr_stat.sh 2 Q{1}L{2}X{3}P{4} ${REAL_G}
" ::: 25 30 ::: 60 ::: 30 60 ::: 000 001 002 003 004 005 006 \
>> stat2.md
cat stat2.md
```
| Name | SumCor | CovCor | N50SR | Sum | # | N50Anchor | Sum | # | N50Others | Sum | # | Kmer | RunTimeKU | RunTimeAN |
|:--------------|-------:|-------:|------:|-------:|------:|----------:|-------:|------:|----------:|-------:|-----:|--------------------:|----------:|:----------|
| Q25L60X30P000 | 3.01G | 30.0 | 10992 | 98.21M | 22237 | 11789 | 85.96M | 13658 | 2311 | 12.25M | 8579 | "31,41,51,61,71,81" | 1:14'53'' | 0:07'05'' |
| Q25L60X30P001 | 3.01G | 30.0 | 10433 | 97.91M | 23035 | 11351 | 85.44M | 13871 | 1562 | 12.47M | 9164 | "31,41,51,61,71,81" | 1:11'02'' | 0:07'05'' |
| Q25L60X60P000 | 6.02G | 60.0 | 11787 | 99.24M | 19543 | 12467 | 88.53M | 12969 | 4629 | 10.71M | 6574 | "31,41,51,61,71,81" | 1:25'03'' | 0:08'23'' |
| Q30L60X30P000 | 3.01G | 30.0 | 10914 | 97.81M | 22843 | 11752 | 85.35M | 13764 | 1510 | 12.46M | 9079 | "31,41,51,61,71,81" | 0:59'47'' | 0:07'06'' |
| Q30L60X30P001 | 3.01G | 30.0 | 10160 | 97.39M | 24052 | 10924 | 84.51M | 14218 | 1255 | 12.87M | 9834 | "31,41,51,61,71,81" | 0:45'09'' | 0:07'05'' |
| Q30L60X60P000 | 6.02G | 60.0 | 12462 | 99.3M | 19363 | 12943 | 88.29M | 12773 | 5901 | 11.01M | 6590 | "31,41,51,61,71,81" | 0:59'46'' | 0:08'02'' |
## n2: merge anchors
```bash
BASE_NAME=n2
cd ${HOME}/data/anchr/${BASE_NAME}
# merge anchors
mkdir -p merge
anchr contained \
$(
parallel -k --no-run-if-empty -j 6 "
if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa ]; then
echo Q{1}L{2}X{3}P{4}/anchor/pe.anchor.fa
fi
" ::: 25 30 ::: 60 ::: 30 60 ::: 000 001 002 003 004 005
) \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin merge/anchor.contained.fasta
anchr orient merge/anchor.contained.fasta --len 1000 --idt 0.98 -o merge/anchor.orient.fasta
anchr merge merge/anchor.orient.fasta --len 1000 --idt 0.999 -o merge/anchor.merge0.fasta
anchr contained merge/anchor.merge0.fasta --len 1000 --idt 0.98 \
--proportion 0.99 --parallel 16 -o stdout \
| faops filter -a 1000 -l 0 stdin merge/anchor.merge.fasta
# merge others
mkdir -p merge
anchr contained \
$(
parallel -k --no-run-if-empty -j 6 "
if [ -e Q{1}L{2}X{3}P{4}/anchor/pe.others.fa ]; then
echo Q{1}L{2}X{3}P{4}/anchor/pe.others.fa
fi
" ::: 25 30 ::: 60 ::: 30 60 ::: 000 001 002 003 004 005
) \
--len 1000 --idt 0.98 --proportion 0.99999 --parallel 16 \
-o stdout \
| faops filter -a 1000 -l 0 stdin merge/others.contained.fasta
anchr orient merge/others.contained.fasta --len 1000 --idt 0.98 -o merge/others.orient.fasta
anchr merge merge/others.orient.fasta --len 1000 --idt 0.999 -o stdout \
| faops filter -a 1000 -l 0 stdin merge/others.merge.fasta
# anchor sort on ref
bash ~/Scripts/cpan/App-Anchr/share/sort_on_ref.sh merge/anchor.merge.fasta 1_genome/genome.fa merge/anchor.sort
nucmer -l 200 1_genome/genome.fa merge/anchor.sort.fa
mummerplot -png out.delta -p anchor.sort --large
# mummerplot files
rm *.[fr]plot
rm out.delta
rm *.gp
mv anchor.sort.png merge/
# quast
rm -fr 9_qa
quast --no-check --threads 16 \
--eukaryote \
--no-icarus \
doc/model_organisms.md view on Meta::CPAN
| contigTrim | 335060 | 95965683 | 607 |
| spades.contig | 34792 | 105621949 | 39732 |
| spades.scaffold | 39185 | 105667774 | 39154 |
| platanus.contig | 9540 | 108908253 | 143264 |
| platanus.scaffold | 28158 | 99589056 | 35182 |
* quast
```bash
# Compare the n2 assemblies against the reference genome with quast.
BASE_NAME=n2
cd ${HOME}/data/anchr/${BASE_NAME}

# Start from a clean report directory.
rm -fr 9_qa_contig

# Assemblies and their report labels, position by position.
qa_inputs=(
    merge/anchor.merge.fasta
    merge/anchor.cover.fasta
    anchorLong/contig.fasta
    contigTrim/contig.fasta
    canu-raw-40x/${BASE_NAME}.contigs.fasta
    8_spades/scaffolds.fasta
    8_platanus/out_gapClosed.fa
    1_genome/paralogs.fas
)
qa_labels=(merge cover contig contigTrim canu-40x spades platanus paralogs)

# --label takes a single comma-separated string, hence the IFS=, join.
quast --no-check --threads 16 \
    --eukaryote \
    --no-icarus \
    -R 1_genome/genome.fa \
    "${qa_inputs[@]}" \
    --label "$(IFS=,; echo "${qa_labels[*]}")" \
    -o 9_qa_contig
```
* Clear QxxLxxx.
```bash
# Remove the Q..L..X* working directories of n2, both under 2_illumina/ and
# directly under BASE_DIR.
BASE_DIR=$HOME/data/anchr/n2

# The rm -fr patterns are relative paths, so they may only run once the shell
# really is inside BASE_DIR. After a failed cd they would delete matching
# directories in whatever directory the shell happened to be in.
if cd "${BASE_DIR}"; then
    rm -fr 2_illumina/Q{20,25,30,35}L{30,60,90,120}X*
    rm -fr Q{20,25,30,35}L{30,60,90,120}X*
fi
```
# *Arabidopsis thaliana* Col-0
* Genome: [Ensembl Genomes](http://plants.ensembl.org/Arabidopsis_thaliana/Info/Index)
* Proportion of paralogs (> 1000 bp): 0.1158
## col_0: download
* Reference genome
```bash
# Download the TAIR10 soft-masked toplevel assembly and write genome.fa.
mkdir -p ~/data/anchr/col_0/1_genome
cd ~/data/anchr/col_0/1_genome

toplevel=Arabidopsis_thaliana.TAIR10.29.dna_sm.toplevel.fa.gz
wget -N ftp://ftp.ensemblgenomes.org/pub/release-29/plants/fasta/arabidopsis_thaliana/dna/${toplevel}

# faops order takes the wanted sequence names from a list file, one per line;
# here the list is 1-5, Mt and Pt, supplied through process substitution.
faops order ${toplevel} \
    <(printf '%s\n' {1..5} Mt Pt) \
    genome.fa
```
* Illumina HiSeq (100 bp)
[SRX202246](https://www.ncbi.nlm.nih.gov/sra/SRX202246[accn])
```bash
# Downloading from ena with aria2
mkdir -p ~/data/anchr/col_0/2_illumina
cd ~/data/anchr/col_0/2_illumina

# URL list consumed by aria2c -i.
cat > sra_ftp.txt << EOF
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR611/SRR611086
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR616/SRR616966
EOF

aria2c -x 9 -s 3 -c -i sra_ftp.txt

# Expected checksums of the downloaded files.
cat > sra_md5.txt << EOF
b884e83b47c485c9a07f732b3805e7cf SRR611086
102db119d1040c3bf85af5e4da6e456d SRR616966
EOF

md5sum --check sra_md5.txt

# The two runs, in the order their reads are concatenated below.
runs=(SRR611086 SRR616966)

for sra in "${runs[@]}"; do
    echo ${sra}
    fastq-dump --split-files ./${sra}
done

# Append the _1/_2 suffix to every accession and merge per mate.
cat "${runs[@]/%/_1.fastq}" > R1.fq
cat "${runs[@]/%/_2.fastq}" > R2.fq

# Compress the merged files, then drop the per-run fastq files.
find . -name "*.fq" | parallel -j 2 pigz -p 8
rm *.fastq
```
* Illumina MiSeq
[SRX2527206](https://www.ncbi.nlm.nih.gov/sra/SRX2527206[accn]) SRR5216995
```bash
# Download the MiSeq pair SRR5216995 and expose it as R1.fq.gz / R2.fq.gz.
BASE_NAME=col_0
cd ${HOME}/data/anchr/${BASE_NAME}

mkdir -p 2_illumina
cd 2_illumina

# URL list consumed by aria2c -i.
cat > sra_ftp.txt << EOF
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR521/005/SRR5216995/SRR5216995_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR521/005/SRR5216995/SRR5216995_2.fastq.gz
EOF

aria2c -x 9 -s 3 -c -i sra_ftp.txt

# Expected checksums of the two downloaded files.
cat > sra_md5.txt << EOF
ce4a92a9364a6773633223ff7a807810 SRR5216995_1.fastq.gz
5c6672124a628ea0020c88e74eff53a3 SRR5216995_2.fastq.gz
EOF

md5sum --check sra_md5.txt

# Symlink each mate to the short name used elsewhere in this section.
for mate in 1 2; do
    ln -s SRR5216995_${mate}.fastq.gz R${mate}.fq.gz
done
```
* PacBio
Chin, C.-S. *et al.* Phased diploid genome assembly with single-molecule real-time sequencing.
*Nature Methods* (2016). doi:10.1038/nmeth.4035
P4C2 chemistry is not supported by newer versions of SMRT Analysis.
https://www.ncbi.nlm.nih.gov/biosample/4539665
[SRX1715692](https://www.ncbi.nlm.nih.gov/sra/SRX1715692[accn])
```bash
mkdir -p ~/data/anchr/col_0/3_pacbio
cd ~/data/anchr/col_0/3_pacbio
cat <<EOF > sra_ftp.txt
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/002/SRR3405242
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/003/SRR3405243
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/004/SRR3405244
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/006/SRR3405246
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/008/SRR3405248
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/000/SRR3405250
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/002/SRR3405252
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/003/SRR3405253
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/004/SRR3405254
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/005/SRR3405255
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/006/SRR3405256
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/007/SRR3405257
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/008/SRR3405258
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/009/SRR3405259
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/005/SRR3405245
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/007/SRR3405247
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/009/SRR3405249
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/001/SRR3405251
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/000/SRR3405260
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/003/SRR3405263
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/005/SRR3405265
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/007/SRR3405267
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/009/SRR3405269
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/001/SRR3405271
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/004/SRR3405274
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/005/SRR3405275
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/006/SRR3405276
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/007/SRR3405277
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/008/SRR3405278
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/009/SRR3405279
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/000/SRR3405280
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/001/SRR3405281
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/002/SRR3405282
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/003/SRR3405283
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/004/SRR3405284
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/005/SRR3405285
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/006/SRR3405286
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/007/SRR3405287
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/008/SRR3405288
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/009/SRR3405289
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/000/SRR3405290
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/001/SRR3405261
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/002/SRR3405262
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/004/SRR3405264
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/006/SRR3405266
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/008/SRR3405268
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/000/SRR3405270
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/002/SRR3405272
ftp://ftp.sra.ebi.ac.uk/vol1/srr/SRR340/003/SRR3405273
EOF
aria2c -x 6 -s 3 -c -i sra_ftp.txt
cat << EOF > sra_md5.txt
be9c803f847ff1c81d153110cc699390 SRR3405242
c68a2c3b62245a697722fd3f8fda7a2d SRR3405243
7116e8a0de87b1acd016d9b284e4795c SRR3405244
51f8e5ee4565aace4e5a5cba73e3e597 SRR3405246
f339f580e86aad3a5487b5cec8ae80d4 SRR3405248
1a8246ed1f7c38801cfc603e088abb70 SRR3405250
a0ce8435a7fa2e7ddbd6ac181902f751 SRR3405252
8754f69a1c8c1f00b58b48454c1c01ad SRR3405253
367508500303325e855666133505a5af SRR3405254
d250f69fcf2975c89ceab5a4f9425b36 SRR3405255
badd9b2d23f94d1c98263d2e786742ae SRR3405256
6c5cbd3bce9459283a415d8a5c05c86e SRR3405257
32da7a364c8cbda5cf76b87f7c51b475 SRR3405258
eb3819adf483451ac670f89d1ea6b76e SRR3405259
5337862eeb0945f932de74e8f7b9ec4f SRR3405245
4545ce4666878fcbcda1e7737be1896b SRR3405247
71d61bc64e3ca9b91f08b1c6b1389f16 SRR3405249
b9a911b8eb4fbfe29dff8cf920429f18 SRR3405251
99bae070fa90d53c8f15b9cf42c634f6 SRR3405260
830e02f1f3cb66b9e085803a21ad8040 SRR3405263
86d28c63f00095ae0ff1151e7e0bf7b4 SRR3405265
3e048ad8dbb526d4a533ee1d5ec10a43 SRR3405267
1b73ed3a1124f5f025c511672c1e18d3 SRR3405269
fa07c85b9e6258abcef8bdb730ab812f SRR3405271
aeb6ab7edfa42e5e27704b7625c659c1 SRR3405274
0eb24fcc9b40f6fe0f013fe79dd7edf7 SRR3405275
f051e0065602477e0a1d13a6d0a42d3d SRR3405276
178540e33e9f4f76adc8509b147d7ff6 SRR3405277
6fdfa97e2eacf0ac186b5333e97c334b SRR3405278
a6bb6b57db82eb6e4161847f9d35a608 SRR3405279
8399b8e8e4d48c7374a414a9585efa5b SRR3405280
e725278a3837775e214b39093a900927 SRR3405281
fab9120bfa1130b300f7e82b74d23173 SRR3405282
33929263f09811d7f7360a9675e82cdd SRR3405283
7f9e58c6fa43e8f2f3fa2496e149d2cb SRR3405284
b9a469affbff1bdcb1b299c106c2c1b9 SRR3405285
688ab23dbfe7977f9de780486a8d5c6b SRR3405286
fadc273d324413017e45570e3bf0ee6e SRR3405287
6f4b0eb22cb523ddecb842042d500ceb SRR3405288
03a4581c1b951dba3bb9e295e9113bf3 SRR3405289
51fa78f451a33bd44f985ac220e17efe SRR3405290
fac8c4c2a862a4d572d77d0deb4b0abc SRR3405261
3fd1a3d8140cfa96a0287e9e2b6055c4 SRR3405262
f908e6194fb3a0026b5263acadbd2600 SRR3405264
e04a7d96ba91ebb11772c019981ea9eb SRR3405266
784e28febf413c6dfa842802aa106a55 SRR3405268
05b91a051fc52417858e93ce3b22fe2e SRR3405270
07bca433005313a4a2c8050e32952f58 SRR3405272
a9bbee29c3d507760c4c33fbbe436fa6 SRR3405273
EOF
# Verify the downloads against sra_md5.txt.
md5sum --check sra_md5.txt

# The 49 run accessions, written once, in the order they are concatenated.
runs=(SRR34052{42,43,44,46,48,50,52,53,54,55,56,57,58,59,45,47,49,51,60,63,65,67,69,71,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,61,62,64,66,68,70,72,73})

for sra in "${runs[@]}"; do
    echo ${sra}
    fastq-dump ./${sra}
done

# Append .fastq to every accession and merge everything into one file.
cat "${runs[@]/%/.fastq}" \
    > pacbio.fq

# Compress the merged reads, then drop the per-run fastq files.
find . -name "*.fq" | parallel -j 2 pigz -p 8
rm *.fastq

# Convert the compressed fastq to fasta, with no minimum-length filter.
faops filter -l 0 pacbio.fq.gz pacbio.fasta
```
* FastQC
```bash
BASE_NAME=col_0
cd ${HOME}/data/anchr/${BASE_NAME}
mkdir -p 2_illumina/fastqc
cd 2_illumina/fastqc
doc/model_organisms.md view on Meta::CPAN
BASE_NAME=col_0
cd ${HOME}/data/anchr/${BASE_NAME}
printf "| %s | %s | %s | %s |\n" \
"Name" "N50" "Sum" "#" \
> stat.md
printf "|:--|--:|--:|--:|\n" >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "Genome"; faops n50 -H -S -C 1_genome/genome.fa;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "Paralogs"; faops n50 -H -S -C 1_genome/paralogs.fas;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "Illumina"; faops n50 -H -S -C 2_illumina/R1.fq.gz 2_illumina/R2.fq.gz;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "uniq"; faops n50 -H -S -C 2_illumina/R1.uniq.fq.gz 2_illumina/R2.uniq.fq.gz;) >> stat.md
parallel -k --no-run-if-empty -j 3 "
printf \"| %s | %s | %s | %s |\n\" \
\$(
echo Q{1}L{2};
if [[ {1} -ge '30' ]]; then
faops n50 -H -S -C \
2_illumina/Q{1}L{2}/R1.fq.gz \
2_illumina/Q{1}L{2}/R2.fq.gz \
2_illumina/Q{1}L{2}/Rs.fq.gz;
else
faops n50 -H -S -C \
2_illumina/Q{1}L{2}/R1.fq.gz \
2_illumina/Q{1}L{2}/R2.fq.gz;
fi
)
" ::: 20 25 30 ::: 60 \
>> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "PacBio"; faops n50 -H -S -C 3_pacbio/pacbio.fasta;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "PacBio.40x"; faops n50 -H -S -C 3_pacbio/pacbio.40x.fasta;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "PacBio.40x.trim"; faops n50 -H -S -C 3_pacbio/pacbio.40x.trim.fasta;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "PacBio.80x"; faops n50 -H -S -C 3_pacbio/pacbio.80x.fasta;) >> stat.md
printf "| %s | %s | %s | %s |\n" \
$(echo "PacBio.80x.trim"; faops n50 -H -S -C 3_pacbio/pacbio.80x.trim.fasta;) >> stat.md
cat stat.md
```
| Name | N50 | Sum | # |
|:----------------|---------:|------------:|---------:|
| Genome | 23459830 | 119667750 | 7 |
| Paralogs | 2007 | 16447809 | 8055 |
| Illumina | 301 | 15529845059 | 53786130 |
| uniq | 301 | 15528150050 | 53779068 |
| Q20L60 | 301 | 13359936477 | 52318516 |
| Q25L60 | 301 | 11821537855 | 49650904 |
| Q30L60 | 301 | 10366980114 | 48122656 |
| PacBio | 6754 | 18768526777 | 5721958 |
| PacBio.40x | 7830 | 4906030224 | 1300000 |
| PacBio.40x.trim | 6904 | 2032710549 | 381134 |
| PacBio.80x | 7448 | 9473394614 | 2600000 |
| PacBio.80x.trim | 6975 | 3942522483 | 729527 |
## col_0: spades
```bash
# Assemble the Q25L60 reads with SPAdes into 8_spades.
BASE_NAME=col_0
cd ${HOME}/data/anchr/${BASE_NAME}

# R1/R2 are passed as the pair (-1/-2) and Rs as unpaired reads (-s);
# all three come from the same Q25L60 directory.
reads=2_illumina/Q25L60

spades.py -t 16 -k 21,33,55,77 \
    -1 ${reads}/R1.fq.gz \
    -2 ${reads}/R2.fq.gz \
    -s ${reads}/Rs.fq.gz \
    -o 8_spades
```
## col_0: platanus
```bash
# Assemble, scaffold and gap-close the Q25L60 reads with platanus in 8_platanus.
BASE_NAME=col_0
cd ${HOME}/data/anchr/${BASE_NAME}
mkdir -p 8_platanus
cd 8_platanus

# Build the interleaved inputs for platanus.
# Each file is checked on its own and must be non-empty (-s). It is written
# under a temporary name and renamed only when faops succeeds. Previously a
# failed or interrupted run left an empty or partial pe.fa behind, which made
# the single `[ ! -e pe.fa ]` guard skip regeneration of both files for good.
if [ ! -s pe.fa ]; then
    faops interleave \
        -p pe \
        ../2_illumina/Q25L60/R1.fq.gz \
        ../2_illumina/Q25L60/R2.fq.gz \
        > pe.fa.tmp \
        && mv pe.fa.tmp pe.fa
fi
if [ ! -s se.fa ]; then
    faops interleave \
        -p se \
        ../2_illumina/Q25L60/Rs.fq.gz \
        > se.fa.tmp \
        && mv se.fa.tmp se.fa
fi

# Each stage logs stdout and stderr to its own file while still printing them.
platanus assemble -t 16 -m 100 \
    -f pe.fa se.fa \
    2>&1 | tee ass_log.txt

platanus scaffold -t 16 \
    -c out_contig.fa -b out_contigBubble.fa \
    -ip1 pe.fa \
    2>&1 | tee sca_log.txt

platanus gap_close -t 16 \
    -c out_scaffold.fa \
    -ip1 pe.fa \
    2>&1 | tee gap_log.txt
```
## col_0: quorum
( run in 3.864 seconds using v1.01-cache-2.11-cpan-75ffa21a3d4 )