From 6c37a56b0226734a1ed5bfac765741048359e5f4 Mon Sep 17 00:00:00 2001
From: DOH-JDJ0303 <jared.johnson@doh.wa.gov>
Date: Fri, 29 Mar 2024 15:49:13 -0700
Subject: [PATCH] fixed bugs and added scale to tree

---
 bin/cluster.R              | 38 +++++++++++++++++++++++++++++++++-----
 modules/local/cluster.nf   | 21 +++++++++++++++++----
 modules/local/consensus.nf |  9 +++++----
 modules/local/input-qc.nf  |  6 ++----
 modules/local/summary.nf   | 12 +++++++-----
 nextflow.config            |  4 ++--
 workflows/epitome.nf       |  4 ++--
 7 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/bin/cluster.R b/bin/cluster.R
index fb6f9b8..f3d0e96 100755
--- a/bin/cluster.R
+++ b/bin/cluster.R
@@ -1,17 +1,33 @@
 #!/usr/bin/env Rscript
 
-library(tidyverse)
-library(ggtree)
+#!/usr/bin/env Rscript
+version <- "1.0"
+
+# cluster.R
+# Author: Jared Johnson, jared.johnson@doh.wa.gov
 
+#---- ARGUMENTS ----#
 args <- commandArgs(trailingOnly = T)
 dist_path <- args[1]
 taxa_name <- args[2]
 segment_name <- args[3]
 threshold <- args[4]
 
-file.name <- paste(taxa_name,segment_name,sep="-")
+#---- VERSION ----#
+if(dist_path == "version"){
+  cat(version, sep = "\n")
+  quit(status=0)
+}
+
+#---- LIBRARY ----#
+library(tidyverse)
+library(ggtree)
+library(ape)
 
+# set output file name
+file.name <- paste(taxa_name,segment_name,sep="-")
 
+#---- LOAD PAIRWISE DISTANCES ----#
 dist.mat <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST","PVAL","HASHES")) %>%
   select(ID1, ID2, DIST) %>%
   pivot_wider(names_from="ID2", values_from="DIST") %>%
@@ -19,8 +35,10 @@ dist.mat <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST","PVAL","HASHES"
   as.matrix() %>%
   as.dist()
 
+#---- HIERARCHICAL CLUSTER ----#
 tree <- hclust(dist.mat, method = "complete")
 
+#---- CUT DENDROGRAM AT DISTANCE THRESHOLD ----#
 clusters <- cutree(tree, h = as.numeric(threshold)) %>%
   data.frame() %>%
   rownames_to_column(var = "seq") %>%
@@ -29,14 +47,23 @@ clusters <- cutree(tree, h = as.numeric(threshold)) %>%
          segment = segment_name) %>%
   select(seq, taxa, segment, cluster)
 
-write.csv(clusters, file = paste0(file.name,'.csv'), quote = F, row.names = F)
+# adjust height to percentage for more intuitive interpretation
+tree$height <- 100*tree$height
 
+# plot tree
 p <- ggtree(tree)%<+%clusters+
   geom_tippoint(aes(color = as.character(cluster)))+
-  labs(color = "Cluster")
+  geom_vline(xintercept = -100*as.numeric(threshold), linetype = "dashed", color = "#E35335")+
+  theme_tree2()+
+  labs(color = "Cluster", x = "Approx. Nucleotide Difference (%)")
+
+#---- SAVE OUTPUTS ----#
+# cluster info
+write.csv(clusters, file = paste0(file.name,'.csv'), quote = F, row.names = F)
 n_iso <- p$data %>%
   drop_na() %>%
   nrow()
+
 # set image dimensions
 wdth <- n_iso/100
 if(wdth < 200){
@@ -46,4 +73,5 @@ hght=n_iso/100
 if(hght < 200){
   hght <- 10
 }
+# plot
 ggsave(plot = p, filename = paste0(file.name,'.jpg'), dpi = 300, width = wdth, height = hght, limitsize=F)
\ No newline at end of file
diff --git a/modules/local/cluster.nf b/modules/local/cluster.nf
index 8e5e356..0311244 100644
--- a/modules/local/cluster.nf
+++ b/modules/local/cluster.nf
@@ -8,8 +8,10 @@ process CLUSTER {
     tuple val(taxa), val(segment), path(dist), val(seq_count)
 
     output:
-    path "*.csv", emit: results
-    path "*.jpg", emit: plot
+    path "*.csv",        emit: results
+    path "*.jpg",        emit: plot
+    path "versions.yml", emit: versions
+
 
     when:
     task.ext.when == null || task.ext.when
@@ -20,6 +22,11 @@ process CLUSTER {
     gzip -d ${dist}
     # run script
     cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold}
+
+    cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            cluster: \$(cluster.R version)
+        END_VERSIONS
     """
 }
 
@@ -32,8 +39,9 @@ process CLUSTER_LARGE {
     tuple val(taxa), val(segment), path(dist), val(seq_count)
 
     output:
-    path "*.csv", emit: results
-    path "*.jpg", emit: plot
+    path "*.csv",        emit: results
+    path "*.jpg",        emit: plot
+    path "versions.yml", emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -44,5 +52,10 @@ process CLUSTER_LARGE {
     gzip -d ${dist}
     # run script
     cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold}
+    
+    cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            cluster: \$(cluster.R version)
+        END_VERSIONS
     """
 }
diff --git a/modules/local/consensus.nf b/modules/local/consensus.nf
index 62086e4..a1023c0 100644
--- a/modules/local/consensus.nf
+++ b/modules/local/consensus.nf
@@ -8,7 +8,8 @@ process CONSENSUS {
 
     output:
     tuple val(taxa), val(segment), val(cluster), path("${prefix}.fa"), env(length), emit: fa
-    path "${prefix}_length.csv",                                                   emit: len
+    path "${prefix}_length.csv",                                                    emit: len
+    path "versions.yml",                                                            emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -24,8 +25,8 @@ process CONSENSUS {
     echo "${prefix},\${length}" > ${prefix}_length.csv
 
     cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        consensus: \$(consensus.sh version)
-    END_VERSIONS
+        "${task.process}":
+            consensus: \$(consensus.sh version)
+        END_VERSIONS
     """
 }
diff --git a/modules/local/input-qc.nf b/modules/local/input-qc.nf
index 29c3df5..a38ecd1 100644
--- a/modules/local/input-qc.nf
+++ b/modules/local/input-qc.nf
@@ -29,9 +29,7 @@ process INPUT_QC {
     # set sequence count
     seq_count=\$(cat ${prefix}-qc-summary.csv | cut -f 5 -d ',' | grep -v 'filter4' | tr -d '\t\r\n ')
 
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        input-qc: \$(input-qc.sh version)
-    END_VERSIONS
+    # something about the normal way of getting version info messes with the creations of .command.env
+    echo "${task.process}:\n\tinput-qc: \$(input-qc.sh version | tr -d '\t\r\')" > versions.yml
     """
 }
diff --git a/modules/local/summary.nf b/modules/local/summary.nf
index 137112c..0a53fab 100755
--- a/modules/local/summary.nf
+++ b/modules/local/summary.nf
@@ -11,8 +11,10 @@ process SUMMARY {
 
 
     output:
-    path "*.csv", emit: summary
-    path "*.jpg", emit: plots
+    path "*.csv",        emit: summary
+    path "*.jpg",        emit: plots
+    path "versions.yml", emit: versions
+
 
     when:
     task.ext.when == null || task.ext.when
@@ -25,8 +27,8 @@ process SUMMARY {
     summary.R clusters-no-header.csv ${lengths} ${ani_ava} ${ani_seeds} ${seeds}
 
     cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        summary: \$(summary.R version)
-    END_VERSIONS
+        "${task.process}":
+            summary: \$(summary.R version)
+        END_VERSIONS
     """
 }
diff --git a/nextflow.config b/nextflow.config
index 4893bf7..ea51537 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -13,8 +13,8 @@ params {
     // Input options
     input                      = null
     seeds                      = null
-    dist_threshold             = 0.2
-    len_threshold              = 0.2
+    dist_threshold             = 0.15
+    len_threshold              = 0.15
     
     // References
     genome                     = null
diff --git a/workflows/epitome.nf b/workflows/epitome.nf
index a67e023..3053a0d 100644
--- a/workflows/epitome.nf
+++ b/workflows/epitome.nf
@@ -110,13 +110,13 @@ workflow EPITOME {
     CLUSTER (
         MASH.out.dist.filter{ taxa, segment, dist, count -> count.toInteger() <= 2000 }
     )
-    ch_versions = ch_versions.mix(CLUSTERS.out.versions.first())
+    ch_versions = ch_versions.mix(CLUSTER.out.versions.first())
 
     // Large datasets - requires much more memory!
     CLUSTER_LARGE (
         MASH.out.dist.filter{ taxa, segment, dist, count -> count.toInteger() > 2000 }
     )
-    ch_versions = ch_versions.mix(CLUSTERS_LARGE.out.versions.first())
+    ch_versions = ch_versions.mix(CLUSTER_LARGE.out.versions.first())
 
     // Combine small and large dataset cluster results and add clean sequence paths
     CLUSTER