App-Egaz

 view release on metacpan or  search on metacpan

doc/Scer-self.md  view on Meta::CPAN

## minimap2

```shell
cd ~/data/egaz

# https://github.com/lh3/minimap2/blob/master/cookbook.md#constructing-self-homology-map
minimap2 -DP -k19 -w19 -m200 S288c/chr.fasta S288c/chr.fasta > mm.paf

# https://github.com/lh3/miniasm/blob/master/PAF.md
# 10	int	Number of residue matches
cat mm.paf |
    tsv-filter --ge "10:1000" |
    tsv-sort -k1,1 -k6,6 -k3,3n -k8,8n \
    > mm.length.paf

# convert to rg
cat mm.length.paf |
     perl -nla -e '
        my $f_id = $F[0];
        my $f_begin = $F[2] + 1;
        my $f_end = $F[3];

        my $g_id = $F[5];
        my $g_begin = $F[7] + 1;
        my $g_end = $F[8];

        print "$f_id:$f_begin-$f_end";
        print "$g_id:$g_begin-$g_end";
    ' |
    tsv-uniq |
    rgr sort stdin \
    > mm.rg

# remove repeats
rgr prop S288c/repeat.json mm.rg |
    tsv-filter --le "2:0.3" |
    tsv-select -f 1 \
    > mm.filter.rg

faops region S288c/chr.fasta mm.filter.rg mm.gl.fasta

# stats
spanr cover mm.rg |
    spanr stat S288c/chr.sizes stdin --all

spanr cover mm.filter.rg |
    spanr stat S288c/chr.sizes stdin --all

spanr stat S288c/chr.sizes S288c/repeat.json --all

# draw the dotplot
cat mm.length.paf |
    tsv-select -f 1-12 |
    rgr field stdin --chr 1 --start 3 --end 4 -a |
    rgr runlist --op overlap -f 13 \
        <(spanr cover mm.filter.rg) \
        stdin |
    tsv-select -f 1-12 \
    > mm.filter.paf

paf2dotplot png medium mm.filter.paf

```

## `wfmash -X`

```shell
cd ~/data/egaz

wfmash -X -p 70 S288c/chr.fasta S288c/chr.fasta > self.paf

paf2dotplot png medium self.paf

```

## blast

```bash
cd ~/data/egaz

mkdir -p S288c_proc
mkdir -p S288c_result

cd ~/data/egaz/S288c_proc

# Get exact copies in the genome
fasr axt2fas \
    ../S288c/chr.sizes ../S288cvsSelf/axtNet/*.net.axt.gz |
    fasr filter --ge 1000 stdin -o axt.fas

# links by lastz-chain
fasr link axt.fas |
    perl -nl -e 's/(target|query)\.//g; print;' \
    > links.lastz.tsv

# remove species names
# remove duplicated sequences
# remove sequences with more than 250 Ns
fasr separate axt.fas --rc |
    perl -nl -e '/^>/ and s/^>(target|query)\./\>/; print;' |
    faops filter -u -d stdin stdout |
    faops filter -n 250 stdin stdout \
    > axt.gl.fasta

# Get more paralogs
egaz blastn axt.gl.fasta genome.fa -o axt.bg.blast
egaz blastmatch axt.bg.blast -c 0.95 -o axt.bg.region
samtools faidx genome.fa -r axt.bg.region --continue |
    perl -p -e '/^>/ and s/:/(+):/' \
    > axt.bg.fasta

cat axt.gl.fasta axt.bg.fasta |
    faops filter -u stdin stdout |
    faops filter -n 250 stdin stdout \
    > axt.all.fasta

# link paralogs
echo "* Link paralogs"
egaz blastn axt.all.fasta axt.all.fasta -o axt.all.blast
egaz blastlink axt.all.blast -c 0.95 -o links.blast.tsv

```

## merge

```bash
cd ~/data/egaz/S288c_proc

# merge
linkr sort -o links.sort.tsv \
    links.lastz.tsv links.blast.tsv



( run in 1.395 second using v1.01-cache-2.11-cpan-ecdf5575e8d )