From ec26a90a095c27ccc6a7a578d44091f27eb4b53f Mon Sep 17 00:00:00 2001
From: Shihab Dider <shihabdider@gmail.com>
Date: Wed, 18 Oct 2023 21:53:41 -0400
Subject: [PATCH] feat: Wires up dryclean

Did not add schema definition since those can be generated automatically
via the nf-core tool. Also did not fully incorporate FragCounter into
the new "coverage" step in anticipation of merge conflicts.
---
 modules/local/dryclean/main.nf                | 30 +++++++++--
 modules/local/gridss/gridss/main.nf           |  2 +-
 nextflow.config                               | 33 ++++++------
 nextflow_schema.json                          |  9 ++--
 subworkflows/local/bam_Dryclean/main.nf       | 30 -----------
 .../local/bam_svcalling_gridss/main.nf        |  4 +-
 subworkflows/local/dryclean/main.nf           | 44 ++++++++++++++++
 workflows/heisenbio.nf                        | 51 +++++++++++++++----
 8 files changed, 135 insertions(+), 68 deletions(-)
 delete mode 100644 subworkflows/local/bam_Dryclean/main.nf
 create mode 100644 subworkflows/local/dryclean/main.nf

diff --git a/modules/local/dryclean/main.nf b/modules/local/dryclean/main.nf
index 997ea7a..c2cd0e4 100644
--- a/modules/local/dryclean/main.nf
+++ b/modules/local/dryclean/main.nf
@@ -8,11 +8,11 @@ process DRYCLEAN {
         'mskilab/dryclean:latest' }"
 
     input:
-    tuple val(meta), path(pon) path(input)
+    tuple val(meta), path(input)
+    path pon
     val centered
     val cbs
     val cnsignif
-    val cores
     val wholeGenome
     val blacklist
     path blacklist_path
@@ -25,7 +25,7 @@ process DRYCLEAN {
     output:
     tuple val(meta), path("*.drycleaned.cov.rds")     , emit: decomposed_cov, optional: true
     tuple val(meta), path("*.dryclean.object.rds")    , emit: dryclean_object, optional: true
-    //path "versions.yml"           , emit: versions
+    path "versions.yml"           , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -33,6 +33,7 @@ process DRYCLEAN {
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '0.0.2'
     """
     #!/bin/bash
     set -o allexport
@@ -65,7 +66,23 @@ process DRYCLEAN {
     export drycln=$drycleanPath/extdata/drcln
     echo $drycln
     set +x
-    CMD="Rscript $drycln \$@"
+
+    CMD="Rscript $drycln \\
+        --input             ${input} \\
+        --pon               ${pon} \\
+        --centered          ${centered} \\
+        --cbs               ${cbs} \\
+        --cnsignif          ${cnsignif} \\
+        --cores             ${task.cpus} \\
+        --wholeGenome       ${wholeGenome} \\
+        --blacklist         ${blacklist} \\
+        --blacklist_path    ${blacklist_path} \\
+        --germline.filter   ${germline_filter} \\
+        --germline.file     ${germline_file} \\
+        --human             ${human} \\
+        --field             ${field} \\
+        --build             ${build} \\
+    "
 
     if [ ! -s ./drycleaned.cov.rds ]; then
 	if ! { echo "Running:" && echo "${CMD}" && eval ${CMD}; }; then
@@ -75,6 +92,11 @@ process DRYCLEAN {
 	echo "If you wish to rerun Dryclean - please purge directory first"
     fi
     exit 0
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        dryclean: ${VERSION}
+    END_VERSIONS
     """
 
     stub:
diff --git a/modules/local/gridss/gridss/main.nf b/modules/local/gridss/gridss/main.nf
index 110f74a..5531478 100644
--- a/modules/local/gridss/gridss/main.nf
+++ b/modules/local/gridss/gridss/main.nf
@@ -15,7 +15,7 @@ process GRIDSS_GRIDSS {
     path(fasta_fai)
     path(bwa_index)                                                                           // required: bwa index folder
     path(blacklist_gridss)                                                                    // optional: gridss blacklist bed file based on genome
-    
+
 
     output:
     tuple val(meta), path("*.vcf.gz")                                  , emit: vcf,       optional:true
diff --git a/nextflow.config b/nextflow.config
index a751ed7..31b3bfa 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -55,26 +55,12 @@ params {
 	//indel_mask                 = null                            // Must provide blacklist bed file for indels based on genome to run Svaba
 
 	// fragCounter options
-	midpoint_frag              = "TRUE"                           // If TRUE only count midpoint if FALSE then count bin footprint of every fragment interval: Default=TRUE  
+	midpoint_frag              = "TRUE"                           // If TRUE only count midpoint if FALSE then count bin footprint of every fragment interval: Default=TRUE
 	windowsize_frag            = 200                            // Window / bin size : Default=200
 	minmapq_frag               = 1                              // Minimal map quality : Default = 1
 	paired_frag                = "TRUE"                           // Is the dataset paired : Default = TRUE
 	exome_frag                 = "FALSE"	                        // Use exons as bins instead of fixed window : Default = FALSE
 
-    // Dryclean
-    centered                    = true
-    cbs                         = false
-    cnsignif                    = 0.00001
-    cores                       = 1
-    wholeGenome                 = true
-    blacklist                   = false
-    blacklist_path              = null
-    germline_filter             = false
-    germline_file               = null
-    human                       = true
-    field                       = "reads.corrected"
-    build                       = "hg19"
-
     // Variant Calling
     only_paired_variant_calling   = false // if true, skips germline variant calling for normal-paired samples
     ascat_ploidy                  = null  // default value for ASCAT
@@ -126,6 +112,19 @@ params {
 	max_multiqc_email_size     = '25.MB'
 	multiqc_methods_description = null
 
+    // Dryclean
+    centered                    = true
+    cbs                         = false
+    cnsignif                    = 0.00001
+    wholeGenome                 = true
+    blacklist                   = false
+    blacklist_path              = null
+    germline_filter             = false
+    germline_file               = null
+    human                       = true
+    field                       = "reads.corrected"
+    build                       = "hg19"
+
     // JaBbA options
     field = "ratio"
     junctionUnfiltered = file("/dev/null")
@@ -134,8 +133,8 @@ params {
     cbs_seg_rds = file("/dev/null")
     slack = 100
     het_pileups_wgs = file("/dev/null")
-    purity = "NA"
-    ploidy = "NA"
+    purity = null
+    ploidy = null
     tilim = 6000
     epgap = 1e-8
     pp_method = "ppgrid"
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 4de1562..aabe8d0 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -47,7 +47,8 @@
                         "recalibrate",
                         "sv_calling",
                         "variant_calling",
-                        "annotate"
+                        "annotate",
+                        "coverage"
                     ]
                 },
                 "outdir": {
@@ -290,7 +291,7 @@
                     "fa_icon": "fas fa-forward",
                     "description": "Option to mention whether the dataset is paired or single ended for fragCounter",
                     "hidden": true,
-                    "help_text": "If TRUE, will consider the dataset is paired ened, else single ended"
+                    "help_text": "If TRUE, will consider the dataset is paired ended, else single ended"
                 },
                 "exome_frag": {
                     "type": "string",
@@ -303,7 +304,7 @@
                     "type": "number",
                     "default": 200,
                     "fa_icon": "fas fa-wrench",
-                    "description": "Default bin size for gragCounter",
+                    "description": "Default bin size for fragCounter",
                     "hidden": true,
                     "help_text": "By default the bin size is 200, adjust as necessary."
                 },
@@ -322,7 +323,7 @@
                     "hidden": true,
                     "help_text": "Provide the directory containing .rds file for gc and mappability bias based on genome version"
                 }
-            } 
+            }
         },
         "variant_calling": {
             "title": "Variant Calling",
diff --git a/subworkflows/local/bam_Dryclean/main.nf b/subworkflows/local/bam_Dryclean/main.nf
deleted file mode 100644
index 4dbce38..0000000
--- a/subworkflows/local/bam_Dryclean/main.nf
+++ /dev/null
@@ -1,30 +0,0 @@
-//
-// BAM DRYCLEAN
-//
-
-include { DRYCLEAN } from '../../../modules/local/dryclean/main.nf'
-
-workflow BAM_DRYCLEAN {
-
-    take:
-    tuple val(meta), path(cov)
-    path(pon_dryclean)
-    val(germline_filter_drycln)
-    path(blacklist_drycln)
-    path(germline_file_drycln)
-    val(collapse_drycln)
-    val(field_drycln)
-    tuple val(meta), path(bam), path(bai)
-
-    main:
-    versions          = Channel.empty()
-    dryclean_cov      = Channel.empty()
-
-    DRYCLEAN(cov, pon_dryclean, germline_filter_drycln, blacklist_drycln, germline_file_drycln, collapse_drycln, field_drycln,)
-
-
-
-
-
-}
-
diff --git a/subworkflows/local/bam_svcalling_gridss/main.nf b/subworkflows/local/bam_svcalling_gridss/main.nf
index 41466f9..283d397 100644
--- a/subworkflows/local/bam_svcalling_gridss/main.nf
+++ b/subworkflows/local/bam_svcalling_gridss/main.nf
@@ -9,7 +9,7 @@ include { GRIDSS_SOMATIC  } from '../../../modules/local/gridss/somaticFilter/ma
 
 workflow BAM_SVCALLING_GRIDSS {
     take:
-    cram                                              // channel: [mandatory] [ meta, normalcram, normalcrai, tumorcram, tumorcrai ] 
+    cram                                              // channel: [mandatory] [ meta, normalcram, normalcrai, tumorcram, tumorcrai ]
     fasta                                             // channel: [mandatory] reference fasta
     fasta_fai                                         // channel: [mandatory] reference fasta index
     bwa_index                                         // channel: [mandatory] bwa index path
@@ -65,7 +65,7 @@ workflow BAM_SVCALLING_GRIDSS_SOMATIC {
     somatic_all
     somatic_high_confidence
     all_vcf
-    
+
     versions
 
 }
diff --git a/subworkflows/local/dryclean/main.nf b/subworkflows/local/dryclean/main.nf
new file mode 100644
index 0000000..ab3a429
--- /dev/null
+++ b/subworkflows/local/dryclean/main.nf
@@ -0,0 +1,44 @@
+//
+// DRYCLEAN
+//
+
+include { DRYCLEAN } from '../../../modules/local/dryclean/main.nf'
+
+workflow COV_DRYCLEAN {
+
+    take:
+    input_dryclean   // channel: [mandatory] [ meta, input ]
+    pon_dryclean
+    centered_dryclean
+    cbs_dryclean
+    cnsignif_dryclean
+    wholeGenome_dryclean
+    blacklist_dryclean
+    blacklist_path_dryclean
+    germline_filter_dryclean
+    germline_file_dryclean
+    human_dryclean
+    field_dryclean
+    build_dryclean
+
+    main:
+    versions          = Channel.empty()
+    dryclean_cov      = Channel.empty()
+    dryclean_obj      = Channel.empty()
+
+    DRYCLEAN(input_dryclean, pon_dryclean, centered_dryclean, cbs_dryclean,
+    cnsignif_dryclean, wholeGenome_dryclean, blacklist_dryclean,
+    blacklist_path_dryclean, germline_filter_dryclean, germline_file_dryclean,
+    human_dryclean, field_dryclean, build_dryclean)
+
+    dryclean_cov      = DRYCLEAN.out.decomposed_cov
+    dryclean_obj      = DRYCLEAN.out.dryclean_object
+
+    versions          = DRYCLEAN.out.versions
+
+    emit:
+    dryclean_cov    // only need to emit the coverage for JaBbA
+
+    versions
+}
+
diff --git a/workflows/heisenbio.nf b/workflows/heisenbio.nf
index 2f5030c..8cebbef 100644
--- a/workflows/heisenbio.nf
+++ b/workflows/heisenbio.nf
@@ -47,7 +47,8 @@ def checkPathParamList = [
     params.mappability,
     params.multiqc_config,
     params.pon,
-    params.pon_tbi
+    params.pon_tbi,
+    params.pon_dryclean
 ]
 // only check if we are using the tools
 if (params.tools && params.tools.contains("snpeff")) checkPathParamList.add(params.snpeff_cache)
@@ -176,12 +177,12 @@ input_sample = ch_from_samplesheet
                 }
             } else if (cov) {
                 meta = meta + [id: meta.sample, data_type: 'cov']
-                
+
                 if (params.step == 'coverage') return [ meta - meta.subMap('lane'), cov ]
                 else {
                     error("Samplesheet contains cov rds files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.")
                 }
-                
+
             } else {
                 error("Missing or unknown field in csv file header. Please check your samplesheet")
             }
@@ -228,6 +229,9 @@ simple_seq_db      = params.simple_seq_db      ? Channel.fromPath(params.simple_
 blacklist_gridss   = params.blacklist_gridss   ? Channel.fromPath(params.blacklist_gridss).collect()  : Channel.empty()   // This is the mask for gridss SV calls
 pon_gridss         = params.pon_gridss         ? Channel.fromPath(params.pon_gridss).collect()        : Channel.empty()   //This is the pon directory for GRIDSS SOMATIC. (MUST CONTAIN .bed and .bedpe files)
 gcmapdir_frag      = params.gcmapdir_frag      ? Channel.fromPath(params.gcmapdir_frag).collect()     : Channel.empty()   // This is the GC/Mappability directory for fragCounter. (Must contain gc* & map* .rds files)
+pon_dryclean      = params.pon_dryclean      ? Channel.fromPath(params.pon_dryclean).collect()     : Channel.empty()   // This is the path to the PON for Dryclean.
+blacklist_path_dryclean      = params.blacklist_path_dryclean      ? Channel.fromPath(params.blacklist_path_dryclean).collect()     : Channel.empty()   // This is the path to the blacklist for Dryclean (optional).
+germline_file_dryclean      = params.germline_file_dryclean      ? Channel.fromPath(params.germline_file_dryclean).collect()     : Channel.empty()   // This is the path to the germline mask for dryclean (optional).
 // Initialize value channels based on params, defined in the params.genomes[params.genome] scope
 ascat_genome       = params.ascat_genome       ?: Channel.empty()
 dbsnp_vqsr         = params.dbsnp_vqsr         ? Channel.value(params.dbsnp_vqsr) : Channel.empty()
@@ -244,6 +248,18 @@ midpoint_frag      = params.midpoint_frag      ?: Channel.empty()
 paired_frag        = params.paired_frag        ?: Channel.empty()                                                         // For fragCounter
 exome_frag         = params.exome_frag         ?: Channel.empty()                                                         // For fragCounter
 
+// Dryclean
+centered_dryclean           = params.centered_dryclean          ?: Channel.empty()
+cbs_dryclean                = params.cbs_dryclean               ?: Channel.empty()
+cnsiginif_dryclean          = params.cnsiginif_dryclean         ?: Channel.empty()
+wholeGenome_dryclean        = params.wholeGenome_dryclean       ?: Channel.empty()
+blacklist_dryclean          = params.blacklist_dryclean         ?: Channel.empty()
+germline_filter_dryclean    = params.germline_filter_dryclean   ?: Channel.empty()
+human_dryclean              = params.human_dryclean             ?: Channel.empty()
+field_dryclean              = params.field_dryclean             ?: Channel.empty()
+build_dryclean              = params.build_dryclean             ?: Channel.empty()
+
+
 // Initialize files channels based on params, not defined within the params.genomes[params.genome] scope
 if (params.snpeff_cache && params.tools && params.tools.contains("snpeff")) {
     def snpeff_annotation_cache_key = params.use_annotation_cache_keys ? "${params.snpeff_genome}.${params.snpeff_db}/" : ""
@@ -346,6 +362,8 @@ include { BAM_SVCALLING_GRIDSS_SOMATIC                } from '../subworkflows/lo
 include { BAM_FRAGCOUNTER as TUMOR_FRAGCOUNTER         } from '../subworkflows/local/bam_fragCounter/main'
 include { BAM_FRAGCOUNTER as NORMAL_FRAGCOUNTER        } from '../subworkflows/local/bam_fragCounter/main'
 
+include { COV_DRYCLEAN as DRYCLEAN         } from '../subworkflows/local/dryclean/main'
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -402,11 +420,11 @@ workflow HEISENBIO {
                                     : PREPARE_GENOME.out.bwa
     bwamem2    = params.bwamem2     ? Channel.fromPath(params.bwamem2).collect()
                                     : PREPARE_GENOME.out.bwamem2
-    
+
     // Gather index for mapping given the chosen aligner
     index_alignement = (params.aligner == "bwa-mem") ? bwa :
         params.aligner == "bwa-mem2" ? bwamem2 : null
-    
+
     // TODO: add a params for msisensorpro_scan
     msisensorpro_scan      = PREPARE_GENOME.out.msisensorpro_scan
 
@@ -457,7 +475,7 @@ workflow HEISENBIO {
         if ( num_intervals < 1 ) [ [], [], num_intervals ]
         else [ intervals[0], intervals[1], num_intervals ]
     }
-    
+
     // Gather used softwares versions
     versions = versions.mix(PREPARE_GENOME.out.versions)
     versions = versions.mix(PREPARE_INTERVALS.out.versions)
@@ -846,14 +864,14 @@ workflow HEISENBIO {
 
         //cram_sv_calling_pair.view()
         if (params.tools && params.tools.split(',').contains('svaba')) {
-            BAM_SVCALLING_SVABA(cram_sv_calling_pair, fasta, fasta_fai, bwa, dbsnp, dbsnp_tbi, indel_mask, germ_sv_db, simple_seq_db, error_rate) 
+            BAM_SVCALLING_SVABA(cram_sv_calling_pair, fasta, fasta_fai, bwa, dbsnp, dbsnp_tbi, indel_mask, germ_sv_db, simple_seq_db, error_rate)
 
             versions = versions.mix(BAM_SVCALLING_SVABA.out.versions)
 
             vcf_from_sv_calling = Channel.empty()
             vcf_from_sv_calling = vcf_from_sv_calling.mix(BAM_SVCALLING_SVABA.out.all_output)                                                      //This one contains multiple files of vcf, to get individual files, call individual output
 
-        }  
+        }
 
         if (params.tools && params.tools.split(',').contains('gridss')) {
             BAM_SVCALLING_GRIDSS(cram_sv_calling_pair, fasta, fasta_fai, bwa, blacklist_gridss)     // running GRIDSS
@@ -892,7 +910,7 @@ workflow HEISENBIO {
             versions = versions.mix(NORMAL_FRAGCOUNTER.out.versions)
             normal_frag_cov = Channel.empty().mix(NORMAL_FRAGCOUNTER.out.fragcounter_cov)
             //normal_frag_cov.view()
-            NORMAL_FRAGCOUNTER.out.corrected_bw.view() 
+            NORMAL_FRAGCOUNTER.out.corrected_bw.view()
 
             TUMOR_FRAGCOUNTER(cram_sv_calling_status.tumor, midpoint_frag, windowsize_frag, gcmapdir_frag, minmapq_frag, fasta, fasta_fai, paired_frag, exome_frag)
 
@@ -902,10 +920,23 @@ workflow HEISENBIO {
         }
 
         // TODO: Add a workflow to write the output file paths into a csv
-        
+
 
     }
 
+    if (params.step == 'coverage') {
+        if (params.tools && params.tools.split(',').contains('dryclean')) {
+
+            DRYCLEAN(tumor_frag_cov, pon_dryclean, centered_dryclean,
+            cbs_dryclean, cnsignif_dryclean, wholeGenome_dryclean,
+            blacklist_dryclean, blacklist_path_dryclean,
+            germline_filter_dryclean, germline_file_dryclean, human_dryclean,
+            field_dryclean, build_dryclean)
+
+            versions = versions.mix(NORMAL_FRAGCOUNTER.out.versions)
+            dryclean_cov = Channel.empty().mix(DRYCLEAN.out.dryclean_cov)
+        }
+    }
 }