Commit

Merge branch 'dev'

tanubrata committed Mar 4, 2024
2 parents f3eafbe + 349dc77 commit f9703e3
Showing 35 changed files with 1,460 additions and 118 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,3 +7,5 @@ testing/
testing*
*.pyc
.nf-test/
+tests/test_data/
+.nf-test.log
14 changes: 12 additions & 2 deletions .nf-test.log
@@ -1,2 +1,12 @@
-Nov-20 17:46:16.940 [main] INFO com.askimed.nf.test.App - nf-test 0.8.1
-Nov-20 17:46:16.960 [main] INFO com.askimed.nf.test.App - Arguments: [init]
+Jan-16 10:10:49.572 [main] INFO com.askimed.nf.test.App - nf-test 0.8.2
+Jan-16 10:10:49.580 [main] INFO com.askimed.nf.test.App - Arguments: [test, tests/modules/local/gridss/gridss/main.nf.test]
+Jan-16 10:10:49.580 [main] INFO com.askimed.nf.test.commands.RunTestsCommand - Load config from file /Users/diders01/projects/jabba_prod/nf-jabba/nf-test.config...
+Jan-16 10:10:49.758 [main] INFO com.askimed.nf.test.commands.RunTestsCommand - Detected 1 test files.
+Jan-16 10:10:49.827 [main] INFO com.askimed.nf.test.config.FileStaging - Create symlink '/Users/diders01/projects/jabba_prod/nf-jabba/.nf-test/tests/1a4fc5dc640a75b58bcf88f15a9b2cb6/meta/bin' --> '/Users/diders01/projects/jabba_prod/nf-jabba/bin'
+Jan-16 10:10:49.828 [main] INFO com.askimed.nf.test.config.FileStaging - Create symlink '/Users/diders01/projects/jabba_prod/nf-jabba/.nf-test/tests/1a4fc5dc640a75b58bcf88f15a9b2cb6/meta/lib' --> '/Users/diders01/projects/jabba_prod/nf-jabba/lib'
+Jan-16 10:10:49.828 [main] INFO com.askimed.nf.test.config.FileStaging - Create symlink '/Users/diders01/projects/jabba_prod/nf-jabba/.nf-test/tests/1a4fc5dc640a75b58bcf88f15a9b2cb6/meta/assets' --> '/Users/diders01/projects/jabba_prod/nf-jabba/assets'
+Jan-16 10:10:49.828 [main] DEBUG com.askimed.nf.test.core.AbstractTest - Stage 1 user provided files...
+Jan-16 10:10:49.828 [main] INFO com.askimed.nf.test.config.FileStaging - Create symlink '/Users/diders01/projects/jabba_prod/nf-jabba/.nf-test/tests/1a4fc5dc640a75b58bcf88f15a9b2cb6/meta/tests/test_data/' --> '/Users/diders01/projects/jabba_prod/nf-jabba/tests/test_data'
+Jan-16 10:10:49.830 [main] INFO com.askimed.nf.test.core.TestExecutionEngine - Started test plan
+Jan-16 10:10:49.830 [main] INFO com.askimed.nf.test.core.TestExecutionEngine - Running testsuite 'Test Process GRIDSS_GRIDSS' from file '/Users/diders01/projects/jabba_prod/nf-jabba/tests/modules/local/gridss/gridss/main.nf.test'.
+Jan-16 10:10:49.830 [main] INFO com.askimed.nf.test.core.TestExecutionEngine - Run test '1a4fc5dc: Should run GRIDSS without failures'. type: com.askimed.nf.test.lang.process.ProcessTest
37 changes: 37 additions & 0 deletions .spec.md
@@ -0,0 +1,37 @@
# Refactor
## Questions:
- Do we expect to have more than one tool per step (e.g. gridss and svaba)? No.
- Is there redundancy/dependency in the package parameters? Can parameters be
  organized into a dependency tree or otherwise hardcoded because they never
  change? Yes; this needs investigation.
- Do we care about alignment? Should the pipeline be able to start at
alignment? No.

## To Do
- Refactor output channel attribute extraction (21)
- Switch from step conditionals to cases (8)
- Switch from tools to nodes (13)
- Remove alignment step (3)
- Refactor repetitive declarations (1)

## Outline
- Switch from the "step"-based conditionals to something more declarative
  (e.g. cases; see the sketch after this outline). Having the program run a
  block of code by checking whether the starting step is included in a list
  of steps is enormously cumbersome; cases should make things more
  parsimonious and readable.
- Tools-in-step paradigm could be replaced with ungrouped nodes.
- Use functions for repetitive declarations/imports and hold their required
variables in arrays/maps.
- Clean up output channel attribute extraction. Currently very repetitive,
could be replaced with functions.
- Cut down on the number of default parameters in the config (possibly at the
  package level). Packages/processes/workflows shouldn't have so many
  parameters (see: JaBbA as the worst offender); it indicates either
  overparameterization or a dependency tree in the parameter space.
- Add *_create_csv methods for the remaining tools to generate the
  samplesheets needed to start from those tools
- Move parameter specification from nextflow.config to module-specific
  configs, then import them into nextflow.config. This keeps each process and
  its parameter configuration tightly coupled, but loses a central interface
  for modifying all the defaults in one place, which is useful when changing
  one default would affect defaults for a different parameter.
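
For illustration, the "cases" item above might look like the following Nextflow/Groovy sketch. This is hypothetical, not code from this commit; `params.step`, the tool names, and the step lists are stand-ins for whatever the refactor settles on.

```groovy
// Before: one guarded block per tool, each re-checking step membership.
if (params.step in ['sv_calling', 'coverage', 'jabba']) {
    // run GRIDSS ...
}
if (params.step in ['coverage', 'jabba']) {
    // run fragCounter ...
}

// After: a single declarative map from each tool to the entry steps that
// need it; one lookup replaces the scattered conditionals.
def steps_by_tool = [
    gridss     : ['sv_calling', 'coverage', 'jabba'],
    fragcounter: ['coverage', 'jabba'],
]
def tools_to_run = steps_by_tool.findAll { tool, steps -> params.step in steps }.keySet()
```

The last item likely maps onto Nextflow's `includeConfig` mechanism (e.g. `includeConfig 'conf/modules/fragcounter.config'` in nextflow.config), which keeps per-module defaults next to the module while still being loaded from the top-level config.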
46 changes: 46 additions & 0 deletions bigpurple.config
@@ -0,0 +1,46 @@

params {
config_profile_name = 'mskilab-org NYU BigPurple Cluster Profile'
config_profile_description = """
mskilab-org NYU School of Medicine BigPurple cluster profile to run nf-JaBbA.
!!Make sure to load both singularity/3.1 and squashfs-tools/4.3 before running nf-JaBbA with this profile!!
It is best to put the work folder on scratch, as the pipeline generates a whole lot of temporary files.
Make sure to submit the run as an SBATCH job since we don't own our own node at NYU yet!!
""".stripIndent()
config_profile_contact = "Tanubrata Dey ([email protected])"
config_profile_url = "https://www.mskilab.org/"

// Resources
max_memory = 700.GB
max_cpus = 256
max_time = 10.d
}
process {

// default SLURM node config
beforeScript = """
module load singularity/3.9.8
module load squashfs-tools/4.3
"""
.stripIndent()

executor='slurm'

// memory errors which should be retried; otherwise error out
errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
maxRetries = 3
maxErrors = '-1'

}
executor {
name = 'slurm'
queueSize = 500
submitRateLimit = '10 sec'
}
singularity {
enabled = true
autoMounts = true
cacheDir = "/gpfs/data/imielinskilab/singularity_files/nextflow_singularity_cache"
}
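
As a usage sketch (not part of this commit), a launch on BigPurple following the profile's own notes might look like this; the repository handle and scratch path are illustrative assumptions, and the module versions are taken from the profile's beforeScript:

```bash
# Hypothetical launch: load the modules the profile expects, keep the work
# directory on scratch, and submit the head job via sbatch rather than
# running it on a login node.
module load singularity/3.9.8 squashfs-tools/4.3
sbatch --wrap "nextflow run mskilab-org/nf-jabba -c bigpurple.config \
    -w /gpfs/scratch/$USER/nfjabba_work --outdir results"
```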

9 changes: 6 additions & 3 deletions bin/ascat_seg.R
@@ -625,6 +625,7 @@

if (grepl(pattern = "txt$", x = opt$variants)) {
variants.dt = fread(opt$variants)
+variants.dt[[1]] <- gsub("chr","",variants.dt[[1]])
variants.dt[, ":="(alt.count.n = as.numeric(as.character(alt.count.n)),
ref.count.n = as.numeric(as.character(ref.count.n)),
alt.count.t = as.numeric(as.character(alt.count.t)),
@@ -677,11 +678,13 @@
## ## transfer ratio
message("Transferring ratio")
## gc correct tumor and normal
+## Edit by Tanubrata: adds a fix to the column names used for GC correction when passing CBS coverage; otherwise
+## ASCAT breaks when passing raw drycleaned coverage without GC correction
if (opt$gc) {
-if ("tumor" %in% names(values(cov.gr)) & "normal" %in% names(values(cov.gr))) {
+if ("tum.counts" %in% names(values(cov.gr)) & "norm.counts" %in% names(values(cov.gr))) {
message("Applying GC correction")
-tum.gr = khtools::.gc(cov.gr, "tumor")
-norm.gr = khtools::.gc(cov.gr, "normal")
+tum.gr = khtools::.gc(cov.gr, "tum.counts")
+norm.gr = khtools::.gc(cov.gr, "norm.counts")
ratio.gr = khtools::.gc(cov.gr, "ratio")
values(cov.gr)[, opt$field] = values(ratio.gr)[, "ratio"]
} else {
4 changes: 2 additions & 2 deletions conf/base.config
@@ -102,12 +102,12 @@ process {
time = { check_max( 24.h * task.attempt, 'time' ) }
}
withName: 'SVABA' {
-cpus = 8
+cpus = 4
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = 84.h
}
withName: 'GRIDSS' {
-cpus = { check_max( 8 * task.attempt, 'cpus' ) }
+cpus = { check_max( 4 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = 84.h
}
2 changes: 1 addition & 1 deletion conf/igenomes.config
@@ -88,7 +88,7 @@ params {
build_dryclean = 'hg38'
hapmap_sites = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/hapmap_3.3.hg38.vcf.gz"
pon_dryclean = "${params.mski_base}/dryclean/pon/hg38/detergent.rds"
-blacklist_coverage_jabba = "${params.mski_base}/JaBbA/blacklist_coverage/hg38/hg38.coverage.mask.rds"
+blacklist_coverage_jabba = "${params.mski_base}/JaBbA/blacklist_coverage/hg38/hg38.coverage.mask.nochr.rds"
}

'GRCh37' {
18 changes: 18 additions & 0 deletions conf/modules/fragcounter.config
@@ -33,4 +33,22 @@ process {
pattern: "*{.rds,.bw,cov*,.command.*}"
]
}

withName: 'MSKILABORG_NFJABBA:NFJABBA:NORMAL_FRAGCOUNTER:REBIN_RAW_FRAGCOUNTER' {
ext.when = { params.tools && params.tools.split(',').contains('fragcounter') }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/Coverages/fragCounter_normal/${meta.id}/" },
pattern: "*{.rds,1kb_*,.command.*}"
]
}

withName: 'MSKILABORG_NFJABBA:NFJABBA:TUMOR_FRAGCOUNTER:REBIN_RAW_FRAGCOUNTER' {
ext.when = { params.tools && params.tools.split(',').contains('fragcounter') }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/Coverages/fragCounter_tumor/${meta.id}/" },
pattern: "*{.rds,1kb_*,.command.*}"
]
}
}
2 changes: 1 addition & 1 deletion modules/local/ascat/main.nf
@@ -18,7 +18,7 @@ process ASCAT_SEG {
val(from_maf) // channel: whether to start from MAF, default=FALSE

output:
tuple val(meta), path("*ascat_pp.rds") , emit: purityploidy, optional:true
tuple val(meta), path("*ascat_pp.rds") , emit: purityploidy
tuple val(meta), path("*ascat_seg.rds") , emit: segments, optional:true
path "versions.yml" , emit: versions

2 changes: 1 addition & 1 deletion modules/local/cbs/main.nf
@@ -15,7 +15,7 @@ process CBS {
val(name)

output:
tuple val(meta), path("*cov.rds") , emit: cbs_cov_rds
tuple val(meta), path("cov.rds") , emit: cbs_cov_rds
tuple val(meta), path("seg.rds") , emit: cbs_seg_rds
tuple val(meta), path("nseg.rds") , emit: cbs_nseg_rds
path "versions.yml" , emit: versions
14 changes: 6 additions & 8 deletions modules/local/dryclean/main.nf
@@ -3,27 +3,26 @@ process DRYCLEAN {
label 'process_medium'

container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-'docker://mskilab/dryclean:0.0.2':
-'mskilab/dryclean:0.0.2' }"
+'docker://mskilab/dryclean:0.0.3':
+'mskilab/dryclean:0.0.3' }"


input:
tuple val(meta), path(input)
path(pon)
-val(centered)
+val(center)
val(cbs)
val(cnsignif)
val(wholeGenome)
val(blacklist)
val(blacklist_path)
val(germline_filter)
val(germline_file)
val(human)
val(field)
val(build)

output:
tuple val(meta), path("*cov.rds") , emit: decomposed_cov, optional: true
tuple val(meta), path("*cov.rds") , emit: decomposed_cov
//tuple val(meta), path("*.dryclean.object.rds") , emit: dryclean_object, optional: true
path "versions.yml" , emit: versions

@@ -33,7 +32,7 @@ process DRYCLEAN {
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
-def VERSION = '0.0.2'
+def VERSION = '0.0.3'
"""
#!/bin/bash
set -o allexport
@@ -70,7 +69,7 @@ process DRYCLEAN {
CMD="Rscript \$drycln \\
--input ${input} \\
--pon ${pon} \\
---centered ${centered} \\
+--center ${center} \\
--cbs ${cbs} \\
--cnsignif ${cnsignif} \\
--cores ${task.cpus} \\
@@ -79,7 +78,6 @@
--blacklist_path ${blacklist_path} \\
--germline.filter ${germline_filter} \\
--germline.file ${germline_file} \\
---human ${human} \\
--field ${field} \\
--build ${build} \\
"
57 changes: 54 additions & 3 deletions modules/local/fragcounter/main.nf
@@ -5,8 +5,8 @@ process FRAGCOUNTER {

// TODO add fragcounter container
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-'docker://mskilab/fragcounter:latest':
-'mskilab/fragcounter:latest' }"
+'docker://mskilab/fragcounter:0.1':
+'mskilab/fragcounter:0.1' }"

input:
tuple val(meta), path(bam), path(bai) // Mandatory: Format should be [meta, bam, bai] : can also provide cram & crai
@@ -21,6 +21,7 @@


output:
tuple val(meta), path("*cov.raw.rds") , emit: fragcounter_raw_cov, optional: true
tuple val(meta), path("*cov.rds") , emit: fragcounter_cov, optional: true
tuple val(meta), path("*cov.corrected.bw") , emit: corrected_bw, optional: true
path "versions.yml" , emit: versions
@@ -72,4 +73,54 @@
END_VERSIONS
"""

-}
+}

process REBIN_RAW_FRAGCOUNTER {

tag "$meta.id"
label 'process_low'

// TODO add fragcounter container
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'docker://mskilab/fragcounter:0.1':
'mskilab/fragcounter:0.1' }"

input:
tuple val(meta), path(cov_raw)
val(field)
val(windowsize)

output:
tuple val(meta), path("1kb_*"), emit: raw_fragcounter_cov_1kb, optional:true

script:

"""
#!/usr/bin/env Rscript
library(skitools)
filename = "${cov_raw}"
outputfn = "1kb_${cov_raw.name}"
raw_cov = readRDS(filename)
# Collapse a coverage GRanges into coarser fixed-width bins, taking the median
# of the finite signal values that fall into each bin.
collapse.cov <- function(cov.gr, bin.size = 1e3, field = "reads.corrected") {
BINSIZE.ROUGH = bin.size
cov.gr = cov.gr[, field]
cov.gr = gr2dt(cov.gr)
setnames(cov.gr, field, "signal")
# drop infinite values, then take the per-bin median; bin starts come from
# floor(start / bin.size) * bin.size + 1
cov.gr = cov.gr[!is.infinite(signal), .(signal = median(signal, na.rm = TRUE)),
by = .(seqnames, start = floor(start/BINSIZE.ROUGH)*BINSIZE.ROUGH+1)]
cov.gr[, end := (start + BINSIZE.ROUGH) - 1]
setnames(cov.gr, "signal", field)
cov.gr = dt2gr(cov.gr)
return(cov.gr)
}
rebinned_cov = collapse.cov(raw_cov, bin.size = ${windowsize}, field = "${field}")
##rebinned_cov = rebinned_cov %Q% (!seqnames=="Y")
# keep only chromosomes 1-22 and X
rebinned_cov = rebinned_cov %Q% (seqnames %in% c(1:22, "X"))
saveRDS(rebinned_cov, outputfn)
"""

}
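
As a quick standalone check of the binning arithmetic used in collapse.cov above (hypothetical positions, plain R):

```r
# Bin starts come from floor(start / bin.size) * bin.size + 1, so with 1 kb
# bins, 1-based positions 1..999 land in the bin starting at 1 and
# positions 1000..1999 in the bin starting at 1001.
starts <- c(1, 999, 1000, 1999, 2000)
floor(starts / 1000) * 1000 + 1
#> [1]    1    1 1001 1001 2001
```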
4 changes: 2 additions & 2 deletions modules/local/gridss/gridss/main.nf
@@ -46,8 +46,8 @@ process GRIDSS_GRIDSS {
$assembly_bam \\
$blacklist \\
--picardoptions VALIDATION_STRINGENCY=LENIENT \\
---jvmheap 31g \\
---otherjvmheap 31g \\
+--jvmheap 2g \\
+--otherjvmheap 1g \\
${normalbam} \\
${tumorbam}
2 changes: 1 addition & 1 deletion modules/local/hetpileups/main.nf
@@ -14,7 +14,7 @@ process HETPILEUPS {
path(hapmap_sites)

output:
tuple val(meta), path("*sites.txt") , emit: het_pileups_wgs
tuple val(meta), path("*sites.txt") , emit: het_pileups_wgs, optional: true
path "versions.yml" , emit: versions

when:
45 changes: 45 additions & 0 deletions modules/local/jabba/main.nf
@@ -170,3 +170,48 @@ process JABBA {
END_VERSIONS
"""
}

process COERCE_SEQNAMES {

tag "$meta.id"
label 'process_low'

container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'docker://mskilab/jabba:latest':
'mskilab/jabba:latest' }"

input:
tuple val(meta), path(file)

output:
tuple val(meta), path("coerced_chr*"), emit: file, optional: true

script:
"""
#!/usr/bin/env Rscript
# Strip the "chr" prefix from seqnames so that UCSC-style inputs can be used
# with NCBI-style references. Handles GRanges (.rds), VCF, and plain-text tables.
fn <- "${file}"
outputfn <- "coerced_chr_${file.name}"
if (grepl('.rds', "${file.name}")) {
library(GenomicRanges)
data <- readRDS(fn)
seqlevels(data, pruning.mode = "coarse") <- gsub("chr", "", seqlevels(data))
saveRDS(data, file = outputfn)
} else if (grepl('.vcf|.vcf.gz|.vcf.bgz', "${file.name}")) {
library(VariantAnnotation)
data <- readVcf(fn)
##seqlevelsStyle(data) <- 'NCBI'
# rename the seqlevels, the contig rownames in the header, and any ALT
# breakend strings that embed chromosome names
seqlevels(data) <- sub("^chr", "", seqlevels(data))
header = header(data)
rownames(header@header\$contig) = sub("^chr", "", rownames(header@header\$contig))
header(data) <- header
data@fixed\$ALT <- lapply(data@fixed\$ALT, function(x) gsub("chr", "", x))
writeVcf(data, file = outputfn)
} else {
# fallback: assume a delimited table whose first column holds seqnames
data <- read.table(fn, header = T)
data[[1]] <- gsub("chr", "", data[[1]])
write.table(data, file = outputfn, sep = "\\t", row.names = F, quote = F)
}
"""
}
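
For reference, a minimal GenomicRanges example (toy data, not pipeline output) of the seqlevels renaming COERCE_SEQNAMES applies to .rds inputs:

```r
library(GenomicRanges)

# toy GRanges with UCSC-style seqnames
gr <- GRanges(c("chr1", "chrX"), IRanges(start = c(100, 200), width = 50))

# the same operation as the process: drop the "chr" prefix; pruning.mode =
# "coarse" would drop ranges on any seqlevels removed by the replacement
seqlevels(gr, pruning.mode = "coarse") <- gsub("chr", "", seqlevels(gr))
seqlevels(gr)
#> [1] "1" "X"
```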