From ab8581e5e8a4cc817729e66e77af81590a1d3e2b Mon Sep 17 00:00:00 2001
From: DOH-JDJ0303 <jared.johnson@doh.wa.gov>
Date: Fri, 26 Apr 2024 15:42:27 -0700
Subject: [PATCH] reducing R memory usage and increasing resources

---
 bin/cluster.R            | 7 +++++--
 bin/input-qc.sh          | 2 +-
 conf/base.config         | 5 +++--
 modules/local/cluster.nf | 8 ++++----
 modules/local/mash.nf    | 2 +-
 5 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/bin/cluster.R b/bin/cluster.R
index d3b289a..c5282c1 100755
--- a/bin/cluster.R
+++ b/bin/cluster.R
@@ -23,12 +23,14 @@ if(dist_path == "version"){
 library(tidyverse)
 library(ggtree)
 library(ape)
+if (!requireNamespace("bigmemory", quietly = TRUE)) install.packages("bigmemory", repos = "https://cloud.r-project.org") # install only when missing; explicit mirror because non-interactive R cannot prompt for one
+library(bigmemory)
 
 # set output file name
 file.name <- paste(taxa_name,segment_name,sep="-")
 
 #---- LOAD PAIRWISE DISTANCES ----#
-dist.df <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST","PVAL","HASHES")) %>%
+dist.df <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST")) %>%
   select(ID1, ID2, DIST)
 dist.mat <- dist.df %>%
   pivot_wider(names_from="ID2", values_from="DIST") %>%
@@ -55,7 +57,8 @@ clusters <- cutree(as.hclust(tree), h = as.numeric(threshold)) %>%
   data.frame() %>%
   rownames_to_column(var = "seq") %>%
   rename(cluster = 2) %>%
-  mutate(taxa = taxa_name,
+  mutate(seq = as.numeric(seq),
+         taxa = taxa_name,
          segment = segment_name) %>%
   select(seq, taxa, segment, cluster)
 
diff --git a/bin/input-qc.sh b/bin/input-qc.sh
index d5b5ecc..886a3cc 100755
--- a/bin/input-qc.sh
+++ b/bin/input-qc.sh
@@ -69,5 +69,5 @@ fi
 echo "total,filter1,filter2,filter3,filter4" > ${prefix}-qc-summary.csv
 echo "$(cat seqs | wc -l),$(cat f1 | wc -l),$(cat f2 | wc -l),$(cat f3 | wc -l),$(cat f4 | wc -l)" >> ${prefix}-qc-summary.csv
 # output cleaned sequences & clean up
-cat f3 | awk -v OFS='\n' -v prefix=${prefix} '{print ">"prefix"-"NR, $1}' > ${prefix}.clean.fa
+awk -v OFS='\n' '{print ">"NR, $1}' f3 > "${prefix}.clean.fa"
 rm seqs f1 f2 f3 f4
\ No newline at end of file
diff --git a/conf/base.config b/conf/base.config
index b9a50c2..54112cc 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -43,14 +43,15 @@ process {
     }
     withLabel:process_high {
         cpus   = { check_max( 10    * task.attempt, 'cpus'    ) }
-        memory = { check_max( 14.GB * task.attempt, 'memory'  ) }
+        memory = { check_max( 16.GB * task.attempt, 'memory'  ) }
         time   = { check_max( 16.h  * task.attempt, 'time'    ) }
     }
     withLabel:process_long {
         time   = { check_max( 20.h  * task.attempt, 'time'    ) }
     }
     withLabel:process_high_memory {
-        memory = { check_max( 32.GB * task.attempt, 'memory' ) }
+        memory = { check_max( 64.GB * task.attempt, 'memory' ) }
+        cpus   = { check_max( 8                   , 'cpus'   ) }
     }
     withLabel:error_ignore {
         errorStrategy = 'ignore'
diff --git a/modules/local/cluster.nf b/modules/local/cluster.nf
index d6d36f6..4c9f173 100644
--- a/modules/local/cluster.nf
+++ b/modules/local/cluster.nf
@@ -19,9 +19,9 @@ process CLUSTER {
     script:
     prefix = "${taxa}-${segment}"
     """
-    gzip -d ${dist}
+    zcat ${dist} | cut -f 1,2,3 > dists.txt
     # run script
-    cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold}
+    cluster.R dists.txt "${taxa}" "${segment}" ${params.dist_threshold}
 
     cat <<-END_VERSIONS > versions.yml
         "${task.process}":
@@ -49,9 +49,9 @@ process CLUSTER_LARGE {
     script:
     prefix = "${taxa}-${segment}"
     """
-    gzip -d ${dist}
+    zcat ${dist} | cut -f 1,2,3 > dists.txt
     # run script
-    cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold}
+    cluster.R dists.txt "${taxa}" "${segment}" ${params.dist_threshold}
     
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/mash.nf b/modules/local/mash.nf
index 70afd70..8d54d31 100644
--- a/modules/local/mash.nf
+++ b/modules/local/mash.nf
@@ -1,6 +1,6 @@
 process MASH {
     tag "${taxa}-${segment}"
-    label 'process_medium'
+    label 'process_high'
     conda "bioconda::mash=2.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/mash:2.3--he348c14_1' :