From 6c37a56b0226734a1ed5bfac765741048359e5f4 Mon Sep 17 00:00:00 2001 From: DOH-JDJ0303 Date: Fri, 29 Mar 2024 15:49:13 -0700 Subject: [PATCH] fixed bugs and added scale to tree --- bin/cluster.R | 38 +++++++++++++++++++++++++++++++++----- modules/local/cluster.nf | 21 +++++++++++++++++---- modules/local/consensus.nf | 9 +++++---- modules/local/input-qc.nf | 6 ++---- modules/local/summary.nf | 12 +++++++----- nextflow.config | 4 ++-- workflows/epitome.nf | 4 ++-- 7 files changed, 68 insertions(+), 26 deletions(-) diff --git a/bin/cluster.R b/bin/cluster.R index fb6f9b8..f3d0e96 100755 --- a/bin/cluster.R +++ b/bin/cluster.R @@ -1,17 +1,33 @@ #!/usr/bin/env Rscript -library(tidyverse) -library(ggtree) +#!/usr/bin/env Rscript +version <- "1.0" + +# cluster.R +# Author: Jared Johnson, jared.johnson@doh.wa.gov +#---- ARGUMENTS ----# args <- commandArgs(trailingOnly = T) dist_path <- args[1] taxa_name <- args[2] segment_name <- args[3] threshold <- args[4] -file.name <- paste(taxa_name,segment_name,sep="-") +#---- VERSION ----# +if(dist_path == "version"){ + cat(version, sep = "\n") + quit(status=0) +} + +#---- LIBRARY ----# +library(tidyverse) +library(ggtree) +library(ape) +# set output file name +file.name <- paste(taxa_name,segment_name,sep="-") +#---- LOAD PAIRWISE DISTANCES ----# dist.mat <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST","PVAL","HASHES")) %>% select(ID1, ID2, DIST) %>% pivot_wider(names_from="ID2", values_from="DIST") %>% @@ -19,8 +35,10 @@ dist.mat <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST","PVAL","HASHES" as.matrix() %>% as.dist() +#---- HIERARCHICAL CLUSTER ----# tree <- hclust(dist.mat, method = "complete") +#---- CUT DENDROGRAM AT DISTANCE THRESHOLD ----# clusters <- cutree(tree, h = as.numeric(threshold)) %>% data.frame() %>% rownames_to_column(var = "seq") %>% @@ -29,14 +47,23 @@ clusters <- cutree(tree, h = as.numeric(threshold)) %>% segment = segment_name) %>% select(seq, taxa, segment, cluster) -write.csv(clusters, file = paste0(file.name,'.csv'), quote = F, row.names = F) +# adjust height to percentage for more intuitive interpretation +tree$height <- 100*tree$height +# plot tree p <- ggtree(tree)%<+%clusters+ geom_tippoint(aes(color = as.character(cluster)))+ - labs(color = "Cluster") + geom_vline(xintercept = -100*as.numeric(threshold), linetype = "dashed", color = "#E35335")+ + theme_tree2()+ + labs(color = "Cluster", x = "Approx. Nucleotide Difference (%)") + +#---- SAVE OUTPUTS ----# +# cluster info +write.csv(clusters, file = paste0(file.name,'.csv'), quote = F, row.names = F) n_iso <- p$data %>% drop_na() %>% nrow() + # set image dimensions wdth <- n_iso/100 if(wdth < 200){ @@ -46,4 +73,5 @@ hght=n_iso/100 if(hght < 200){ hght <- 10 } +# plot ggsave(plot = p, filename = paste0(file.name,'.jpg'), dpi = 300, width = wdth, height = hght, limitsize=F) \ No newline at end of file diff --git a/modules/local/cluster.nf b/modules/local/cluster.nf index 8e5e356..0311244 100644 --- a/modules/local/cluster.nf +++ b/modules/local/cluster.nf @@ -8,8 +8,10 @@ process CLUSTER { tuple val(taxa), val(segment), path(dist), val(seq_count) output: - path "*.csv", emit: results - path "*.jpg", emit: plot + path "*.csv", emit: results + path "*.jpg", emit: plot + path "versions.yml", emit: versions + when: task.ext.when == null || task.ext.when @@ -20,6 +22,11 @@ process CLUSTER { gzip -d ${dist} # run script cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cluster: \$(cluster.R version) + END_VERSIONS """ } @@ -32,8 +39,9 @@ process CLUSTER_LARGE { tuple val(taxa), val(segment), path(dist), val(seq_count) output: - path "*.csv", emit: results - path "*.jpg", emit: plot + path "*.csv", emit: results + path "*.jpg", emit: plot + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when @@ -44,5 +52,10 @@ process CLUSTER_LARGE { gzip -d ${dist} # run script cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cluster: \$(cluster.R version) + END_VERSIONS """ } diff --git a/modules/local/consensus.nf b/modules/local/consensus.nf index 62086e4..a1023c0 100644 --- a/modules/local/consensus.nf +++ b/modules/local/consensus.nf @@ -8,7 +8,8 @@ process CONSENSUS { output: tuple val(taxa), val(segment), val(cluster), path("${prefix}.fa"), env(length), emit: fa - path "${prefix}_length.csv", emit: len + path "${prefix}_length.csv", emit: len + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when @@ -24,8 +25,8 @@ process CONSENSUS { echo "${prefix},\${length}" > ${prefix}_length.csv cat <<-END_VERSIONS > versions.yml - "${task.process}": - consensus: \$(consensus.sh version) - END_VERSIONS + "${task.process}": + consensus: \$(consensus.sh version) + END_VERSIONS """ } diff --git a/modules/local/input-qc.nf b/modules/local/input-qc.nf index 29c3df5..a38ecd1 100644 --- a/modules/local/input-qc.nf +++ b/modules/local/input-qc.nf @@ -29,9 +29,7 @@ process INPUT_QC { # set sequence count seq_count=\$(cat ${prefix}-qc-summary.csv | cut -f 5 -d ',' | grep -v 'filter4' | tr -d '\t\r\n ') - cat <<-END_VERSIONS > versions.yml - "${task.process}": - input-qc: \$(input-qc.sh version) - END_VERSIONS + # something about the normal way of getting version info messes with the creations of .command.env + echo "${task.process}:\n\tinput-qc: \$(input-qc.sh version | tr -d '\t\r\')" > versions.yml """ } diff --git a/modules/local/summary.nf b/modules/local/summary.nf index 137112c..0a53fab 100755 --- a/modules/local/summary.nf +++ b/modules/local/summary.nf @@ -11,8 +11,10 @@ process SUMMARY { output: - path "*.csv", emit: summary - path "*.jpg", emit: plots + path "*.csv", emit: summary + path "*.jpg", emit: plots + path "versions.yml", emit: versions + when: task.ext.when == null || task.ext.when @@ -25,8 +27,8 @@ process SUMMARY { summary.R clusters-no-header.csv ${lengths} ${ani_ava} ${ani_seeds} ${seeds} cat <<-END_VERSIONS > versions.yml - "${task.process}": - summary: \$(summary.R version) - END_VERSIONS + "${task.process}": + summary: \$(summary.R version) + END_VERSIONS """ } diff --git a/nextflow.config b/nextflow.config index 4893bf7..ea51537 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,8 +13,8 @@ params { // Input options input = null seeds = null - dist_threshold = 0.2 - len_threshold = 0.2 + dist_threshold = 0.15 + len_threshold = 0.15 // References genome = null diff --git a/workflows/epitome.nf b/workflows/epitome.nf index a67e023..3053a0d 100644 --- a/workflows/epitome.nf +++ b/workflows/epitome.nf @@ -110,13 +110,13 @@ workflow EPITOME { CLUSTER ( MASH.out.dist.filter{ taxa, segment, dist, count -> count.toInteger() <= 2000 } ) - ch_versions = ch_versions.mix(CLUSTERS.out.versions.first()) + ch_versions = ch_versions.mix(CLUSTER.out.versions.first()) // Large datasets - requires much more memory! CLUSTER_LARGE ( MASH.out.dist.filter{ taxa, segment, dist, count -> count.toInteger() > 2000 } ) - ch_versions = ch_versions.mix(CLUSTERS_LARGE.out.versions.first()) + ch_versions = ch_versions.mix(CLUSTER_LARGE.out.versions.first()) // Combine small and large dataset cluster results and add clean sequence paths CLUSTER