Merge pull request #7 from DOH-JDJ0303/image_fix

Reduce R memory usage and increase process resource allocations (memory and CPUs)
DOH-JDJ0303 · Apr 26, 2024 · d19a4df
2 parents e36cb4c + ab8581e
commit d19a4df
Show file tree

Hide file tree

Showing 5 changed files with 14 additions and 10 deletions.
diff --git a/bin/cluster.R b/bin/cluster.R
@@ -23,12 +23,14 @@ if(dist_path == "version"){
 library(tidyverse)
 library(ggtree)
 library(ape)
+install.packages("bigmemory")
+library(bigmemory)
 
 # set output file name
 file.name <- paste(taxa_name,segment_name,sep="-")
 
 #---- LOAD PAIRWISE DISTANCES ----#
-dist.df <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST","PVAL","HASHES")) %>%
+dist.df <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST")) %>%
   select(ID1, ID2, DIST)
 dist.mat <- dist.df %>%
   pivot_wider(names_from="ID2", values_from="DIST") %>%
@@ -55,7 +57,8 @@ clusters <- cutree(as.hclust(tree), h = as.numeric(threshold)) %>%
   data.frame() %>%
   rownames_to_column(var = "seq") %>%
   rename(cluster = 2) %>%
-  mutate(taxa = taxa_name,
+  mutate(seq = as.numeric(seq),
+         taxa = taxa_name,
          segment = segment_name) %>%
   select(seq, taxa, segment, cluster)
 

diff --git a/bin/input-qc.sh b/bin/input-qc.sh
@@ -69,5 +69,5 @@ fi
 echo "total,filter1,filter2,filter3,filter4" > ${prefix}-qc-summary.csv
 echo "$(cat seqs | wc -l),$(cat f1 | wc -l),$(cat f2 | wc -l),$(cat f3 | wc -l),$(cat f4 | wc -l)" >> ${prefix}-qc-summary.csv
 # output cleaned sequences & clean up
-cat f3 | awk -v OFS='\n' -v prefix=${prefix} '{print ">"prefix"-"NR, $1}' > ${prefix}.clean.fa
+cat f3 | awk -v OFS='\n' -v prefix=${prefix} '{print ">"NR, $1}' > ${prefix}.clean.fa
 rm seqs f1 f2 f3 f4
diff --git a/conf/base.config b/conf/base.config
@@ -43,14 +43,15 @@ process {
     }
     withLabel:process_high {
         cpus   = { check_max( 10    * task.attempt, 'cpus'    ) }
-        memory = { check_max( 14.GB * task.attempt, 'memory'  ) }
+        memory = { check_max( 16.GB * task.attempt, 'memory'  ) }
         time   = { check_max( 16.h  * task.attempt, 'time'    ) }
     }
     withLabel:process_long {
         time   = { check_max( 20.h  * task.attempt, 'time'    ) }
     }
     withLabel:process_high_memory {
-        memory = { check_max( 32.GB * task.attempt, 'memory' ) }
+        memory = { check_max( 64.GB * task.attempt, 'memory' ) }
+        cpus   = { check_max( 8                   , 'cpus'   ) }
     }
     withLabel:error_ignore {
         errorStrategy = 'ignore'

diff --git a/modules/local/cluster.nf b/modules/local/cluster.nf
@@ -19,9 +19,9 @@ process CLUSTER {
     script:
     prefix = "${taxa}-${segment}"
     """
-    gzip -d ${dist}
+    zcat ${dist} | cut -f 1,2,3 > dists.txt
     # run script
-    cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold}
+    cluster.R dists.txt "${taxa}" "${segment}" ${params.dist_threshold}
 
     cat <<-END_VERSIONS > versions.yml
         "${task.process}":
@@ -51,9 +51,9 @@ process CLUSTER_LARGE {
     script:
     prefix = "${taxa}-${segment}"
     """
-    gzip -d ${dist}
+    zcat ${dist} | cut -f 1,2,3 > dists.txt
     # run script
-    cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold}
+    cluster.R dists.txt "${taxa}" "${segment}" ${params.dist_threshold}
     
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/local/mash.nf b/modules/local/mash.nf
@@ -1,6 +1,6 @@
 process MASH {
     tag "${taxa}-${segment}"
-    label 'process_medium'
+    label 'process_high'
     conda "bioconda::mash=2.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/mash:2.3--he348c14_1' :