App-Egaz
view release on metacpan or search on metacpan
share/2_mash.tt2.sh view on Meta::CPAN
[% INCLUDE header.tt2 %]
#----------------------------#
# [% sh %]
#----------------------------#
log_warn [% sh %]
mkdir -p mash
cd mash
log_info mash sketch
[% FOREACH item IN opt.data -%]
if [[ ! -e [% item.name %].msh ]]; then
cat [% item.dir %]/chr.fasta |
mash sketch -k 21 -s 100000 -p [% opt.parallel %] - -I "[% item.name %]" -o [% item.name %]
fi
[% END -%]
log_info mash triangle
mash triangle -E -p [% opt.parallel %] -l <(
cat ../genome.lst | parallel echo "{}.msh"
) \
> dist.tsv
log_info fill matrix with lower triangle
tsv-select -f 1-3 dist.tsv |
(tsv-select -f 2,1,3 dist.tsv && cat) |
(
cut -f 1 dist.tsv |
tsv-uniq |
parallel -j 1 --keep-order 'echo -e "{}\t{}\t0"' &&
cat
) \
> dist_full.tsv
log_info Raw phylogenetic tree by MinHash
cat dist_full.tsv |
Rscript -e '
library(readr);
library(tidyr);
library(ape);
pair_dist <- read_tsv(file("stdin"), col_names=F, show_col_types = FALSE);
tmp <- pair_dist %>%
pivot_wider( names_from = X2, values_from = X3, values_fill = list(X3 = 1.0) )
tmp <- as.matrix(tmp)
mat <- tmp[,-1]
rownames(mat) <- tmp[,1]
dist_mat <- as.dist(mat)
clusters <- hclust(dist_mat, method = "ward.D2")
tree <- as.phylo(clusters)
write.tree(phy=tree, file="tree.nwk")
group <- cutree(clusters, h=0.4) # k=5
groups <- as.data.frame(group)
groups$ids <- rownames(groups)
rownames(groups) <- NULL
groups <- groups[order(groups$group), ]
write_tsv(groups, "groups.tsv")
'
log_info newick-utils
cat tree.nwk |
[% IF opt.outgroup -%]
nw_reroot - [% opt.outgroup %] |
[% END -%]
nw_order - -c n \
> ../Results/[% opt.multiname %].mash.raw.nwk
plotr tree ../Results/[% opt.multiname %].mash.raw.nwk
( run in 1.968 second using v1.01-cache-2.11-cpan-e1769b4cff6 )