App-Anchr
view release on metacpan or search on metacpan
doc/model_organisms.md view on Meta::CPAN
independentLong.fasta
# Delete every directory named "correction" below the current directory.
find . -type d -name "correction" | xargs rm -fr
# localCor
# Build localCor.fasta.gz: decompress every *.correctedReads.fasta.gz found
# under group/, keep only the records whose header starts with ">long", and
# recompress the result with pigz.
#   - `grep -A 1` keeps each matching header plus the single line after it;
#     this presumably relies on `faops filter -l 0` writing every sequence on
#     one line -- TODO confirm against the faops documentation.
#   - `sed '/^--$/d'` removes the "--" group separators that `grep -A` inserts
#     between non-adjacent matches.
#   - NOTE(review): `faops dazz -a -l 0` looks like a read-renaming step for
#     downstream tools -- confirm its exact effect.
gzip -d -c -f $(find group -type f -name "*.correctedReads.fasta.gz") \
| faops filter -l 0 stdin stdout \
| grep -E '^>long' -A 1 \
| sed '/^--$/d' \
| faops dazz -a -l 0 stdin stdout \
| pigz -c > localCor.fasta.gz
# Assembly 1 (output directory localCor): both the extracted reads and the
# anchors are handed to canu as -pacbio-corrected input.
canu \
-p ${BASE_NAME} -d localCor \
gnuplotTested=true \
genomeSize=${REAL_G} \
-pacbio-corrected localCor.fasta.gz \
-pacbio-corrected anchor.fasta
# Assembly 2 (output directory localCorRaw): the same two inputs, but declared
# as -pacbio-raw.
canu \
-p ${BASE_NAME} -d localCorRaw \
gnuplotTested=true \
genomeSize=${REAL_G} \
-pacbio-raw localCor.fasta.gz \
-pacbio-raw anchor.fasta
# Assembly 3 (output directory localCorIndep): as assembly 2, with
# independentLong.fasta added as a third -pacbio-raw input.
canu \
-p ${BASE_NAME} -d localCorIndep \
gnuplotTested=true \
genomeSize=${REAL_G} \
-pacbio-raw localCor.fasta.gz \
-pacbio-raw anchor.fasta \
-pacbio-raw independentLong.fasta
# Return to the directory saved by the matching pushd (earlier in this snippet,
# outside this excerpt).
popd
# quast
# Remove any previous report directory before quast writes to it again.
rm -fr 9_qa_localCor
# Evaluate the anchors, the three canu assemblies and the paralog set against
# the reference genome. The comma-separated label list is positional: it must
# stay in the same order as the FASTA arguments above it.
quast --no-check --threads 16 \
    --eukaryote \
    -R 1_genome/genome.fa \
    localCor/anchor.fasta \
    "localCor/localCor/${BASE_NAME}.contigs.fasta" \
    "localCor/localCorRaw/${BASE_NAME}.contigs.fasta" \
    "localCor/localCorIndep/${BASE_NAME}.contigs.fasta" \
    1_genome/paralogs.fas \
    --label "anchor,localCor,localCorRaw,localCorIndep,paralogs" \
    -o 9_qa_localCor
# Delete every directory named "correction" below the current directory.
# -prune keeps find from descending into a directory that is about to be
# removed, and -exec ... {} + hands the paths to rm directly, so paths that
# contain whitespace are not split the way `find | xargs` would split them.
find . -type d -name "correction" -prune -exec rm -fr {} +
```
## s288c: expand anchors
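在酿酒酵母中, 有下列几组完全相同的序列, 它们都是新近发生的片段重复
(In *Saccharomyces cerevisiae*, the following groups of sequences are completely identical; they are all recently arisen segmental duplications):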
* I:216563-218385, VIII:537165-538987
* I:223713-224783, VIII:550350-551420
* IV:528442-530427, IV:532327-534312, IV:536212-538197
* IV:530324-531519, IV:534209-535404
* IV:5645-7725, X:738076-740156
* IV:7810-9432, X:736368-737990
* IX:9683-11043, X:9666-11026
* IV:1244112-1245373, XV:575980-577241
* VIII:212266-214124, VIII:214264-216122
* IX:11366-14953, X:11349-14936
* XII:468935-470576, XII:472587-474228, XII:482167-483808, XII:485819-487460
* XII:483798-485798, XII:487450-489450
* anchorLong
```bash
BASE_NAME=s288c
cd ${HOME}/data/anchr/${BASE_NAME}
rm -fr anchorLong
anchr overlap2 \
--parallel 16 \
merge/anchor.merge.fasta \
3_pacbio/pacbio.40x.trim.fasta \
-d anchorLong \
-b 20 --len 1000 --idt 0.85 --all
pushd anchorLong
anchr cover \
--range "1-$(faops n50 -H -N 0 -C anchor.fasta)" \
--len 1000 --idt 0.85 -c 2 \
anchorLong.ovlp.tsv \
-o anchor.cover.json
cat anchor.cover.json | jq "." > environment.json
anchr overlap \
anchor.fasta \
--serial --len 20 --idt 0.9999 \
-o stdout \
| perl -nla -e '
BEGIN {
our %seen;
our %count_of;
}
@F == 13 or next;
$F[3] > 0.9999 or next;
my $pair = join( "-", sort { $a <=> $b } ( $F[0], $F[1], ) );
next if $seen{$pair};
$seen{$pair} = $_;
$count_of{ $F[0] }++;
$count_of{ $F[1] }++;
END {
for my $pair ( keys %seen ) {
my ($f_id, $g_id) = split "-", $pair;
next if $count_of{$f_id} > 2;
next if $count_of{$g_id} > 2;
print $seen{$pair};
}
}
doc/model_organisms.md view on Meta::CPAN
-o group/{}.contig.fasta
'
popd
cat \
contigTrim/group/non_grouped.fasta \
contigTrim/group/*.contig.fasta \
> contigTrim/contig.fasta
```
## iso_1: final stats
* Stats
```bash
# Write stat3.md: a Markdown table with one row (Name, N50, Sum, #) per FASTA
# file of the iso_1 project, then print it.
BASE_NAME=iso_1
# Everything below uses paths relative to the project directory, so do nothing
# at all if that directory cannot be entered (otherwise stat3.md would be
# created in whatever directory the shell happens to be in).
if cd "${HOME}/data/anchr/${BASE_NAME}"; then
    # Table header and column-alignment row.
    printf "| %s | %s | %s | %s |\n" \
        "Name" "N50" "Sum" "#" \
        > stat3.md
    printf "|:--|--:|--:|--:|\n" >> stat3.md

    # "label:path" pairs, listed in the order the rows appear in the table.
    # NOTE(review): the published result table also contains an "anchor.cover"
    # row that none of these entries produces -- confirm the intended input.
    for entry in \
        "Genome:1_genome/genome.fa" \
        "Paralogs:1_genome/paralogs.fas" \
        "anchor.merge:merge/anchor.merge.fasta" \
        "others.merge:merge/others.merge.fasta" \
        "anchorLong:anchorLong/contig.fasta" \
        "contigTrim:contigTrim/contig.fasta" \
        "spades.contig:8_spades/contigs.fasta" \
        "spades.scaffold:8_spades/scaffolds.fasta" \
        "platanus.contig:8_platanus/out_contig.fa" \
        "platanus.scaffold:8_platanus/out_gapClosed.fa"
    do
        # The command substitution is left unquoted on purpose: the
        # whitespace-separated values printed by faops must be split into
        # separate printf arguments, one per table cell.
        # shellcheck disable=SC2046
        printf "| %s | %s | %s | %s |\n" \
            "${entry%%:*}" $(faops n50 -H -S -C "${entry#*:}") >> stat3.md
    done

    cat stat3.md
fi
```
| Name | N50 | Sum | # |
|:------------------|---------:|----------:|-------:|
| Genome | 25286936 | 137567477 | 8 |
| Paralogs | 4031 | 13665900 | 4492 |
| anchor.merge | 26860 | 117041459 | 9566 |
| others.merge | 8732 | 3092289 | 1004 |
| anchor.cover | 26199 | 116199529 | 9576 |
| anchorLong | 69814 | 115806088 | 4924 |
| contigTrim | 1238480 | 123572499 | 603 |
| spades.contig | 108756 | 132705321 | 61620 |
| spades.scaffold | 142273 | 132725706 | 61182 |
| platanus.contig | 11503 | 156820565 | 359399 |
| platanus.scaffold | 146404 | 129134232 | 71416 |
* quast
```bash
# Run quast on the iso_1 assemblies against the reference genome and write the
# report to 9_qa_contig.
BASE_NAME=iso_1
# The rm below uses a relative path, so only proceed when the project
# directory was actually entered.
if cd "${HOME}/data/anchr/${BASE_NAME}"; then
    # Remove any previous report directory before quast writes to it again.
    rm -fr 9_qa_contig
    # The comma-separated label list is positional: it must stay in the same
    # order as the eight FASTA arguments above it.
    quast --no-check --threads 16 \
        --eukaryote \
        --no-icarus \
        -R 1_genome/genome.fa \
        merge/anchor.merge.fasta \
        anchorLong/contig.fasta \
        contigTrim/contig.fasta \
        "canu-raw-40x/${BASE_NAME}.contigs.fasta" \
        "canu-trim-40x/${BASE_NAME}.contigs.fasta" \
        8_spades/scaffolds.fasta \
        8_platanus/out_gapClosed.fa \
        1_genome/paralogs.fas \
        --label "merge,contig,contigTrim,canu-40x,canu-40x.trim,spades,platanus,paralogs" \
        -o 9_qa_contig
fi
```
* Clear QxxLxxXxx.
```bash
# Remove the per-parameter QxxLxxXxx working directories of the iso_1 project.
BASE_NAME=iso_1
# Delete only after the project directory has been entered successfully;
# otherwise the relative globs below would be expanded, and removed, in
# whatever directory the shell happens to be in.
if cd "${HOME}/data/anchr/${BASE_NAME}"; then
    rm -fr 2_illumina/Q{20,25,30,35}L{30,60,90,120}X*
    rm -fr Q{20,25,30,35}L{30,60,90,120}X*
fi
```
# *Caenorhabditis elegans* N2
* Genome: [Ensembl 82](http://sep2015.archive.ensembl.org/Caenorhabditis_elegans/Info/Index)
* Proportion of paralogs (> 1000 bp): 0.0472
## n2: download
* Reference genome
```bash
# Download the C. elegans reference, write the selected chromosomes to
# genome.fa in the listed order, and place the paralog set next to it.
mkdir -p ~/data/anchr/n2/1_genome
cd ~/data/anchr/n2/1_genome
# -N: only re-download when the remote file is newer than the local copy.
wget -N ftp://ftp.ensembl.org/pub/release-82/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna_sm.toplevel.fa.gz
# The process substitution supplies the sequence names, one per line.
faops order Caenorhabditis_elegans.WBcel235.dna_sm.toplevel.fa.gz \
    <(printf '%s\n' I II III IV V X MtDNA) \
    genome.fa
# The working directory is already 1_genome, so the destination is plain
# paralogs.fas; a "1_genome/" prefix here would point at a non-existent
# 1_genome/1_genome/ directory and make cp fail.
cp ~/data/anchr/paralogs/model/Results/n2/n2.multi.fas paralogs.fas
```
* Illumina
* Other SRA
* SRX770040 - [insert size](https://www.ncbi.nlm.nih.gov/sra/SRX770040[accn]) is 500-600 bp
( run in 1.159 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )