From ab8581e5e8a4cc817729e66e77af81590a1d3e2b Mon Sep 17 00:00:00 2001 From: DOH-JDJ0303 Date: Fri, 26 Apr 2024 15:42:27 -0700 Subject: [PATCH] reducing R memory usage and increasing resources --- bin/cluster.R | 7 +++++-- bin/input-qc.sh | 2 +- conf/base.config | 5 +++-- modules/local/cluster.nf | 8 ++++---- modules/local/mash.nf | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/bin/cluster.R b/bin/cluster.R index d3b289a..c5282c1 100755 --- a/bin/cluster.R +++ b/bin/cluster.R @@ -23,12 +23,14 @@ if(dist_path == "version"){ library(tidyverse) library(ggtree) library(ape) +install.packages("bigmemory") +library(bigmemory) # set output file name file.name <- paste(taxa_name,segment_name,sep="-") #---- LOAD PAIRWISE DISTANCES ----# -dist.df <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST","PVAL","HASHES")) %>% +dist.df <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST")) %>% select(ID1, ID2, DIST) dist.mat <- dist.df %>% pivot_wider(names_from="ID2", values_from="DIST") %>% @@ -55,7 +57,8 @@ clusters <- cutree(as.hclust(tree), h = as.numeric(threshold)) %>% data.frame() %>% rownames_to_column(var = "seq") %>% rename(cluster = 2) %>% - mutate(taxa = taxa_name, + mutate(seq = as.numeric(seq), + taxa = taxa_name, segment = segment_name) %>% select(seq, taxa, segment, cluster) diff --git a/bin/input-qc.sh b/bin/input-qc.sh index d5b5ecc..886a3cc 100755 --- a/bin/input-qc.sh +++ b/bin/input-qc.sh @@ -69,5 +69,5 @@ fi echo "total,filter1,filter2,filter3,filter4" > ${prefix}-qc-summary.csv echo "$(cat seqs | wc -l),$(cat f1 | wc -l),$(cat f2 | wc -l),$(cat f3 | wc -l),$(cat f4 | wc -l)" >> ${prefix}-qc-summary.csv # output cleaned sequences & clean up -cat f3 | awk -v OFS='\n' -v prefix=${prefix} '{print ">"prefix"-"NR, $1}' > ${prefix}.clean.fa +cat f3 | awk -v OFS='\n' -v prefix=${prefix} '{print ">"NR, $1}' > ${prefix}.clean.fa rm seqs f1 f2 f3 f4 \ No newline at end of file diff --git a/conf/base.config b/conf/base.config index b9a50c2..54112cc 100644 --- a/conf/base.config +++ b/conf/base.config @@ -43,14 +43,15 @@ process { } withLabel:process_high { cpus = { check_max( 10 * task.attempt, 'cpus' ) } - memory = { check_max( 14.GB * task.attempt, 'memory' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } time = { check_max( 16.h * task.attempt, 'time' ) } } withLabel:process_long { time = { check_max( 20.h * task.attempt, 'time' ) } } withLabel:process_high_memory { - memory = { check_max( 32.GB * task.attempt, 'memory' ) } + memory = { check_max( 64.GB * task.attempt, 'memory' ) } + cpus = { check_max( 8 , 'cpus' ) } } withLabel:error_ignore { errorStrategy = 'ignore' diff --git a/modules/local/cluster.nf b/modules/local/cluster.nf index d6d36f6..4c9f173 100644 --- a/modules/local/cluster.nf +++ b/modules/local/cluster.nf @@ -19,9 +19,9 @@ process CLUSTER { script: prefix = "${taxa}-${segment}" """ - gzip -d ${dist} + zcat ${dist} | cut -f 1,2,3 > dists.txt # run script - cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold} + cluster.R dists.txt "${taxa}" "${segment}" ${params.dist_threshold} cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -49,9 +49,9 @@ process CLUSTER_LARGE { script: prefix = "${taxa}-${segment}" """ - gzip -d ${dist} + zcat ${dist} | cut -f 1,2,3 > dists.txt # run script - cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold} + cluster.R dists.txt "${taxa}" "${segment}" ${params.dist_threshold} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/mash.nf b/modules/local/mash.nf index 70afd70..8d54d31 100644 --- a/modules/local/mash.nf +++ b/modules/local/mash.nf @@ -1,6 +1,6 @@ process MASH { tag "${taxa}-${segment}" - label 'process_medium' + label 'process_high' conda "bioconda::mash=2.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mash:2.3--he348c14_1' :