App-Egaz
view release on metacpan or search on metacpan
doc/Scer-self.md view on Meta::CPAN
## minimap2
```shell
cd ~/data/egaz
# https://github.com/lh3/minimap2/blob/master/cookbook.md#constructing-self-homology-map
minimap2 -DP -k19 -w19 -m200 S288c/chr.fasta S288c/chr.fasta > mm.paf
# https://github.com/lh3/miniasm/blob/master/PAF.md
# 10 int Number of residue matches
cat mm.paf |
tsv-filter --ge "10:1000" |
tsv-sort -k1,1 -k6,6 -k3,3n -k8,8n \
> mm.length.paf
# convert to rg
cat mm.length.paf |
perl -nla -e '
my $f_id = $F[0];
my $f_begin = $F[2] + 1;
my $f_end = $F[3];
my $g_id = $F[5];
my $g_begin = $F[7] + 1;
my $g_end = $F[8];
print "$f_id:$f_begin-$f_end";
print "$g_id:$g_begin-$g_end";
' |
tsv-uniq |
rgr sort stdin \
> mm.rg
# remove repeats
rgr prop S288c/repeat.json mm.rg |
tsv-filter --le "2:0.3" |
tsv-select -f 1 \
> mm.filter.rg
faops region S288c/chr.fasta mm.filter.rg mm.gl.fasta
# stats
spanr cover mm.rg |
spanr stat S288c/chr.sizes stdin --all
spanr cover mm.filter.rg |
spanr stat S288c/chr.sizes stdin --all
spanr stat S288c/chr.sizes S288c/repeat.json --all
# draw the dotplot
cat mm.length.paf |
tsv-select -f 1-12 |
rgr field stdin --chr 1 --start 3 --end 4 -a |
rgr runlist --op overlap -f 13 \
<(spanr cover mm.filter.rg) \
stdin |
tsv-select -f 1-12 \
> mm.filter.paf
paf2dotplot png medium mm.filter.paf
```
## `wfmash -X`
```shell
cd ~/data/egaz
wfmash -X -p 70 S288c/chr.fasta S288c/chr.fasta > self.paf
paf2dotplot png medium self.paf
```
## blast
```bash
cd ~/data/egaz
mkdir -p S288c_proc
mkdir -p S288c_result
cd ~/data/egaz/S288c_proc
# Get exact copies in the genome
fasr axt2fas \
../S288c/chr.sizes ../S288cvsSelf/axtNet/*.net.axt.gz |
fasr filter --ge 1000 stdin -o axt.fas
# links by lastz-chain
fasr link axt.fas |
perl -nl -e 's/(target|query)\.//g; print;' \
> links.lastz.tsv
# remove species names
# remove duplicated sequences
# remove sequences with more than 250 Ns
fasr separate axt.fas --rc |
perl -nl -e '/^>/ and s/^>(target|query)\./\>/; print;' |
faops filter -u -d stdin stdout |
faops filter -n 250 stdin stdout \
> axt.gl.fasta
# Get more paralogs
egaz blastn axt.gl.fasta genome.fa -o axt.bg.blast
egaz blastmatch axt.bg.blast -c 0.95 -o axt.bg.region
samtools faidx genome.fa -r axt.bg.region --continue |
perl -p -e '/^>/ and s/:/(+):/' \
> axt.bg.fasta
cat axt.gl.fasta axt.bg.fasta |
faops filter -u stdin stdout |
faops filter -n 250 stdin stdout \
> axt.all.fasta
# link paralogs
echo "* Link paralogs"
egaz blastn axt.all.fasta axt.all.fasta -o axt.all.blast
egaz blastlink axt.all.blast -c 0.95 -o links.blast.tsv
```
## merge
```bash
cd ~/data/egaz/S288c_proc
# merge
linkr sort -o links.sort.tsv \
links.lastz.tsv links.blast.tsv
( run in 1.395 second using v1.01-cache-2.11-cpan-ecdf5575e8d )