From ec26a90a095c27ccc6a7a578d44091f27eb4b53f Mon Sep 17 00:00:00 2001 From: Shihab Dider Date: Wed, 18 Oct 2023 21:53:41 -0400 Subject: [PATCH] feat: Wires up dryclean Did not add schema definition since those can be generated automatically via the nf-core tool. Also did not fully incorporate FragCounter into the new "coverage" step in anticipation of merge conflicts. --- modules/local/dryclean/main.nf | 30 +++++++++-- modules/local/gridss/gridss/main.nf | 2 +- nextflow.config | 33 ++++++------ nextflow_schema.json | 9 ++-- subworkflows/local/bam_Dryclean/main.nf | 30 ----------- .../local/bam_svcalling_gridss/main.nf | 4 +- subworkflows/local/dryclean/main.nf | 44 ++++++++++++++++ workflows/heisenbio.nf | 51 +++++++++++++++---- 8 files changed, 135 insertions(+), 68 deletions(-) delete mode 100644 subworkflows/local/bam_Dryclean/main.nf create mode 100644 subworkflows/local/dryclean/main.nf diff --git a/modules/local/dryclean/main.nf b/modules/local/dryclean/main.nf index 997ea7a..c2cd0e4 100644 --- a/modules/local/dryclean/main.nf +++ b/modules/local/dryclean/main.nf @@ -8,11 +8,11 @@ process DRYCLEAN { 'mskilab/dryclean:latest' }" input: - tuple val(meta), path(pon) path(input) + tuple val(meta), path(input) + path pon val centered val cbs val cnsignif - val cores val wholeGenome val blacklist path blacklist_path @@ -25,7 +25,7 @@ process DRYCLEAN { output: tuple val(meta), path("*.drycleaned.cov.rds") , emit: decomposed_cov, optional: true tuple val(meta), path("*.dryclean.object.rds") , emit: dryclean_object, optional: true - //path "versions.yml" , emit: versions + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -33,6 +33,7 @@ process DRYCLEAN { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '0.0.2' """ #!/bin/bash set -o allexport @@ -65,7 +66,23 @@ process DRYCLEAN { export drycln=$drycleanPath/extdata/drcln echo $drycln set +x - CMD="Rscript $drycln \$@" + + CMD="Rscript $drycln \\ + --input ${input} \\ + --pon ${pon} \\ + --centered ${centered} \\ + --cbs ${cbs} \\ + --cnsignif ${cnsignif} \\ + --cores ${task.cpus} \\ + --wholeGenome ${wholeGenome} \\ + --blacklist ${blacklist} \\ + --blacklist_path ${blacklist_path} \\ + --germline.filter ${germline_filter} \\ + --germline.file ${germline_file} \\ + --human ${human} \\ + --field ${field} \\ + --build ${build} \\ + " if [ ! -s ./drycleaned.cov.rds ]; then if ! { echo "Running:" && echo "${CMD}" && eval ${CMD}; }; then @@ -75,6 +92,11 @@ process DRYCLEAN { echo "If you wish to rerun Dryclean - please purge directory first" fi exit 0 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dryclean: ${VERSION} + END_VERSIONS """ stub: diff --git a/modules/local/gridss/gridss/main.nf b/modules/local/gridss/gridss/main.nf index 110f74a..5531478 100644 --- a/modules/local/gridss/gridss/main.nf +++ b/modules/local/gridss/gridss/main.nf @@ -15,7 +15,7 @@ process GRIDSS_GRIDSS { path(fasta_fai) path(bwa_index) // required: bwa index folder path(blacklist_gridss) // optional: gridss blacklist bed file based on genome - + output: tuple val(meta), path("*.vcf.gz") , emit: vcf, optional:true diff --git a/nextflow.config b/nextflow.config index a751ed7..31b3bfa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -55,26 +55,12 @@ params { //indel_mask = null // Must provide blacklist bed file for indels based on genome to run Svaba // fragCounter options - midpoint_frag = "TRUE" // If TRUE only count midpoint if FALSE then count bin footprint of every fragment interval: Default=TRUE + midpoint_frag = "TRUE" // If TRUE only count midpoint if FALSE then count bin footprint of every fragment interval: Default=TRUE windowsize_frag = 200 // Window / bin size : Default=200 minmapq_frag = 1 // Minimal map quality : Default = 1 paired_frag = "TRUE" // Is the dataset paired : Default = TRUE exome_frag = "FALSE" // Use exons as bins instead of fixed window : Default = FALSE - // Dryclean - centered = true - cbs = false - cnsignif = 0.00001 - cores = 1 - wholeGenome = true - blacklist = false - blacklist_path = null - germline_filter = false - germline_file = null - human = true - field = "reads.corrected" - build = "hg19" - // Variant Calling only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired samples ascat_ploidy = null // default value for ASCAT @@ -126,6 +112,19 @@ params { max_multiqc_email_size = '25.MB' multiqc_methods_description = null + // Dryclean + centered = true + cbs = false + cnsignif = 0.00001 + wholeGenome = true + blacklist = false + blacklist_path = null + germline_filter = false + germline_file = null + human = true + field = "reads.corrected" + build = "hg19" + // JaBbA options field = "ratio" junctionUnfiltered = file("/dev/null") @@ -134,8 +133,8 @@ params { cbs_seg_rds = file("/dev/null") slack = 100 het_pileups_wgs = file("/dev/null") - purity = "NA" - ploidy = "NA" + purity = null + ploidy = null tilim = 6000 epgap = 1e-8 pp_method = "ppgrid" diff --git a/nextflow_schema.json b/nextflow_schema.json index 4de1562..aabe8d0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -47,7 +47,8 @@ "recalibrate", "sv_calling", "variant_calling", - "annotate" + "annotate", + "coverage" ] }, "outdir": { @@ -290,7 +291,7 @@ "fa_icon": "fas fa-forward", "description": "Option to mention whether the dataset is paired or single ended for fragCounter", "hidden": true, - "help_text": "If TRUE, will consider the dataset is paired ened, else single ended" + "help_text": "If TRUE, will consider the dataset is paired ended, else single ended" }, "exome_frag": { "type": "string", @@ -303,7 +304,7 @@ "type": "number", "default": 200, "fa_icon": "fas fa-wrench", - "description": "Default bin size for gragCounter", + "description": "Default bin size for fragCounter", "hidden": true, "help_text": "By default the bin size is 200, adjust as necessary." }, @@ -322,7 +323,7 @@ "hidden": true, "help_text": "Provide the directory containing .rds file for gc and mappability bias based on genome version" } - } + } }, "variant_calling": { "title": "Variant Calling", diff --git a/subworkflows/local/bam_Dryclean/main.nf b/subworkflows/local/bam_Dryclean/main.nf deleted file mode 100644 index 4dbce38..0000000 --- a/subworkflows/local/bam_Dryclean/main.nf +++ /dev/null @@ -1,30 +0,0 @@ -// -// BAM DRYCLEAN -// - -include { DRYCLEAN } from '../../../modules/local/dryclean/main.nf' - -workflow BAM_DRYCLEAN { - - take: - tuple val(meta), path(cov) - path(pon_dryclean) - val(germline_filter_drycln) - path(blacklist_drycln) - path(germline_file_drycln) - val(collapse_drycln) - val(field_drycln) - tuple val(meta), path(bam), path(bai) - - main: - versions = Channel.empty() - dryclean_cov = Channel.empty() - - DRYCLEAN(cov, pon_dryclean, germline_filter_drycln, blacklist_drycln, germline_file_drycln, collapse_drycln, field_drycln,) - - - - - -} - diff --git a/subworkflows/local/bam_svcalling_gridss/main.nf b/subworkflows/local/bam_svcalling_gridss/main.nf index 41466f9..283d397 100644 --- a/subworkflows/local/bam_svcalling_gridss/main.nf +++ b/subworkflows/local/bam_svcalling_gridss/main.nf @@ -9,7 +9,7 @@ include { GRIDSS_SOMATIC } from '../../../modules/local/gridss/somaticFilter/ma workflow BAM_SVCALLING_GRIDSS { take: - cram // channel: [mandatory] [ meta, normalcram, normalcrai, tumorcram, tumorcrai ] + cram // channel: [mandatory] [ meta, normalcram, normalcrai, tumorcram, tumorcrai ] fasta // channel: [mandatory] reference fasta fasta_fai // channel: [mandatory] reference fasta index bwa_index // channel: [mandatory] bwa index path @@ -65,7 +65,7 @@ workflow BAM_SVCALLING_GRIDSS_SOMATIC { somatic_all somatic_high_confidence all_vcf - + versions } diff --git a/subworkflows/local/dryclean/main.nf b/subworkflows/local/dryclean/main.nf new file mode 100644 index 0000000..ab3a429 --- /dev/null +++ b/subworkflows/local/dryclean/main.nf @@ -0,0 +1,44 @@ +// +// DRYCLEAN +// + +include { DRYCLEAN } from '../../../modules/local/dryclean/main.nf' + +workflow COV_DRYCLEAN { + + take: + input_dryclean // channel: [mandatory] [ meta, input ] + pon_dryclean + centered_dryclean + cbs_dryclean + cnsignif_dryclean + wholeGenome_dryclean + blacklist_dryclean + blacklist_path_dryclean + germline_filter_dryclean + germline_file_dryclean + human_dryclean + field_dryclean + build_dryclean + + main: + versions = Channel.empty() + dryclean_cov = Channel.empty() + dryclean_obj = Channel.empty() + + DRYCLEAN(input_dryclean, pon_dryclean, centered_dryclean, cbs_dryclean, + cnsignif_dryclean, wholeGenome_dryclean, blacklist_dryclean, + blacklist_path_dryclean, germline_filter_dryclean, germline_file_dryclean, + human_dryclean, field_dryclean, build_dryclean) + + dryclean_cov = DRYCLEAN.out.decomposed_cov + dryclean_obj = DRYCLEAN.out.dryclean_object + + versions = DRYCLEAN.out.versions + + emit: + dryclean_cov // only need to emit the coverage for JaBbA + + versions +} + diff --git a/workflows/heisenbio.nf b/workflows/heisenbio.nf index 2f5030c..8cebbef 100644 --- a/workflows/heisenbio.nf +++ b/workflows/heisenbio.nf @@ -47,7 +47,8 @@ def checkPathParamList = [ params.mappability, params.multiqc_config, params.pon, - params.pon_tbi + params.pon_tbi, + params.pon_dryclean ] // only check if we are using the tools if (params.tools && params.tools.contains("snpeff")) checkPathParamList.add(params.snpeff_cache) @@ -176,12 +177,12 @@ input_sample = ch_from_samplesheet } } else if (cov) { meta = meta + [id: meta.sample, data_type: 'cov'] - + if (params.step == 'coverage') return [ meta - meta.subMap('lane'), cov ] else { error("Samplesheet contains cov rds files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.") } - + } else { error("Missing or unknown field in csv file header. Please check your samplesheet") } @@ -228,6 +229,9 @@ simple_seq_db = params.simple_seq_db ? Channel.fromPath(params.simple_ blacklist_gridss = params.blacklist_gridss ? Channel.fromPath(params.blacklist_gridss).collect() : Channel.empty() // This is the mask for gridss SV calls pon_gridss = params.pon_gridss ? Channel.fromPath(params.pon_gridss).collect() : Channel.empty() //This is the pon directory for GRIDSS SOMATIC. (MUST CONTAIN .bed and .bedpe files) gcmapdir_frag = params.gcmapdir_frag ? Channel.fromPath(params.gcmapdir_frag).collect() : Channel.empty() // This is the GC/Mappability directory for fragCounter. (Must contain gc* & map* .rds files) +pon_dryclean = params.pon_dryclean ? Channel.fromPath(params.pon_dryclean).collect() : Channel.empty() // This is the path to the PON for Dryclean. +blacklist_path_dryclean = params.blacklist_path_dryclean ? Channel.fromPath(params.blacklist_path_dryclean).collect() : Channel.empty() // This is the path to the blacklist for Dryclean (optional). +germline_file_dryclean = params.germline_file_dryclean ? Channel.fromPath(params.germline_file_dryclean).collect() : Channel.empty() // This is the path to the germline mask for dryclean (optional). // Initialize value channels based on params, defined in the params.genomes[params.genome] scope ascat_genome = params.ascat_genome ?: Channel.empty() dbsnp_vqsr = params.dbsnp_vqsr ? Channel.value(params.dbsnp_vqsr) : Channel.empty() @@ -244,6 +248,18 @@ midpoint_frag = params.midpoint_frag ?: Channel.empty() paired_frag = params.paired_frag ?: Channel.empty() // For fragCounter exome_frag = params.exome_frag ?: Channel.empty() // For fragCounter +// Dryclean +centered_dryclean = params.centered_dryclean ?: Channel.empty() +cbs_dryclean = params.cbs_dryclean ?: Channel.empty() +cnsiginif_dryclean = params.cnsiginif_dryclean ?: Channel.empty() +wholeGenome_dryclean = params.wholeGenome_dryclean ?: Channel.empty() +blacklist_dryclean = params.blacklist_dryclean ?: Channel.empty() +germline_filter_dryclean = params.germline_filter_dryclean ?: Channel.empty() +human_dryclean = params.human_dryclean ?: Channel.empty() +field_dryclean = params.field_dryclean ?: Channel.empty() +build_dryclean = params.build_dryclean ?: Channel.empty() + + // Initialize files channels based on params, not defined within the params.genomes[params.genome] scope if (params.snpeff_cache && params.tools && params.tools.contains("snpeff")) { def snpeff_annotation_cache_key = params.use_annotation_cache_keys ? "${params.snpeff_genome}.${params.snpeff_db}/" : "" @@ -346,6 +362,8 @@ include { BAM_SVCALLING_GRIDSS_SOMATIC } from '../subworkflows/lo include { BAM_FRAGCOUNTER as TUMOR_FRAGCOUNTER } from '../subworkflows/local/bam_fragCounter/main' include { BAM_FRAGCOUNTER as NORMAL_FRAGCOUNTER } from '../subworkflows/local/bam_fragCounter/main' +include { COV_DRYCLEAN as DRYCLEAN } from '../subworkflows/local/dryclean/main' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -402,11 +420,11 @@ workflow HEISENBIO { : PREPARE_GENOME.out.bwa bwamem2 = params.bwamem2 ? Channel.fromPath(params.bwamem2).collect() : PREPARE_GENOME.out.bwamem2 - + // Gather index for mapping given the chosen aligner index_alignement = (params.aligner == "bwa-mem") ? bwa : params.aligner == "bwa-mem2" ? bwamem2 : null - + // TODO: add a params for msisensorpro_scan msisensorpro_scan = PREPARE_GENOME.out.msisensorpro_scan @@ -457,7 +475,7 @@ workflow HEISENBIO { if ( num_intervals < 1 ) [ [], [], num_intervals ] else [ intervals[0], intervals[1], num_intervals ] } - + // Gather used softwares versions versions = versions.mix(PREPARE_GENOME.out.versions) versions = versions.mix(PREPARE_INTERVALS.out.versions) @@ -846,14 +864,14 @@ workflow HEISENBIO { //cram_sv_calling_pair.view() if (params.tools && params.tools.split(',').contains('svaba')) { - BAM_SVCALLING_SVABA(cram_sv_calling_pair, fasta, fasta_fai, bwa, dbsnp, dbsnp_tbi, indel_mask, germ_sv_db, simple_seq_db, error_rate) + BAM_SVCALLING_SVABA(cram_sv_calling_pair, fasta, fasta_fai, bwa, dbsnp, dbsnp_tbi, indel_mask, germ_sv_db, simple_seq_db, error_rate) versions = versions.mix(BAM_SVCALLING_SVABA.out.versions) vcf_from_sv_calling = Channel.empty() vcf_from_sv_calling = vcf_from_sv_calling.mix(BAM_SVCALLING_SVABA.out.all_output) //This one contains multiple files of vcf, to get individual files, call individual output - } + } if (params.tools && params.tools.split(',').contains('gridss')) { BAM_SVCALLING_GRIDSS(cram_sv_calling_pair, fasta, fasta_fai, bwa, blacklist_gridss) // running GRIDSS @@ -892,7 +910,7 @@ workflow HEISENBIO { versions = versions.mix(NORMAL_FRAGCOUNTER.out.versions) normal_frag_cov = Channel.empty().mix(NORMAL_FRAGCOUNTER.out.fragcounter_cov) //normal_frag_cov.view() - NORMAL_FRAGCOUNTER.out.corrected_bw.view() + NORMAL_FRAGCOUNTER.out.corrected_bw.view() TUMOR_FRAGCOUNTER(cram_sv_calling_status.tumor, midpoint_frag, windowsize_frag, gcmapdir_frag, minmapq_frag, fasta, fasta_fai, paired_frag, exome_frag) @@ -902,10 +920,23 @@ workflow HEISENBIO { } // TODO: Add a workflow to write the output file paths into a csv - + } + if (params.step == 'coverage') { + if (params.tools && params.tools.split(',').contains('dryclean')) { + + DRYCLEAN(tumor_frag_cov, pon_dryclean, centered_dryclean, + cbs_dryclean, cnsignif_dryclean, wholeGenome_dryclean, + blacklist_dryclean, blacklist_path_dryclean, + germline_filter_dryclean, germline_file_dryclean, human_dryclean, + field_dryclean, build_dryclean) + + versions = versions.mix(NORMAL_FRAGCOUNTER.out.versions) + dryclean_cov = Channel.empty().mix(DRYCLEAN.out.dryclean_cov) + } + } }