From 97edbe3595a225b8f62610a649c665cde044c9ce Mon Sep 17 00:00:00 2001 From: Tanubrata Dey Date: Tue, 7 Nov 2023 13:29:58 -0500 Subject: [PATCH 1/7] Adds jabba config file to base.config --- conf/base.config | 21 +++++++++++++-------- nextflow.config | 3 ++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/conf/base.config b/conf/base.config index f275642..6243ac1 100644 --- a/conf/base.config +++ b/conf/base.config @@ -98,12 +98,12 @@ process { } withName: 'SVABA' { cpus = { check_max( 16 * task.attempt, 'cpus' ) } - memory = { check_max( 32.GB * task.attempt, 'memory' ) } + memory = { check_max( 64.GB * task.attempt, 'memory' ) } time = { check_max( 72.h * task.attempt, 'time' ) } } withName: 'GRIDSS' { cpus = { check_max( 16 * task.attempt, 'cpus' ) } - memory = { check_max( 32.GB * task.attempt, 'memory' ) } + memory = { check_max( 64.GB * task.attempt, 'memory' ) } time = { check_max( 72.h * task.attempt, 'time' ) } } withName: 'GRIDSS_SOMATIC' { @@ -112,24 +112,29 @@ process { time = { check_max( 24.h * task.attempt, 'time' ) } } withName: 'FRAGCOUNTER' { - cpus = { check_max( 10 * task.attempt, 'cpus' ) } - memory = { check_max( 24.GB * task.attempt, 'memory' ) } + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } time = { check_max( 48.h * task.attempt, 'time' ) } } withName: 'DRYCLEAN' { - cpus = { check_max( 10 * task.attempt, 'cpus' ) } + cpus = { check_max( 8 * task.attempt, 'cpus' ) } memory = { check_max( 24.GB * task.attempt, 'memory' ) } time = { check_max( 24.h * task.attempt, 'time' ) } } withName: 'ASCAT_SEG' { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } + cpus = { check_max( 10 * task.attempt, 'cpus' ) } memory = { check_max( 24.GB * task.attempt, 'memory' ) } - time = { check_max( 24.h * task.attempt, 'time' ) } + time = { check_max( 10.h * task.attempt, 'time' ) } } withName: 'CBS' { cpus = { check_max( 8 * task.attempt, 'cpus' ) } memory = { check_max( 16.GB * 
task.attempt, 'memory' ) } - time = { check_max( 24.h * task.attempt, 'time' ) } + time = { check_max( 10.h * task.attempt, 'time' ) } + } + withName: 'JABBA' { + cpus = { check_max( 16 * task.attempt, 'cpus' ) } + memory = { check_max( 48.GB * task.attempt, 'memory' ) } + time = { check_max( 10.h * task.attempt, 'time' ) } } withLabel:error_ignore { errorStrategy = 'ignore' diff --git a/nextflow.config b/nextflow.config index 8477a51..f94add6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -419,7 +419,8 @@ includeConfig 'conf/modules/cbs.config' // hetpileups configurations includeConfig 'conf/modules/hetpileups.config' - +// JaBbA configurations +includeConfig 'conf/modules/jabba.config' From 388cc09658f53e0c94e5632cc24ef9ce7268b2a7 Mon Sep 17 00:00:00 2001 From: Tanubrata Dey Date: Tue, 7 Nov 2023 15:06:06 -0500 Subject: [PATCH 2/7] Updates the README.md file --- README.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 38df467..0f5f235 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,22 @@ +# NF-JaBbA (Nextflow - Junction Balance Analysis Pipeline) +``` + +▐▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▌ +▐ ▌ +▐ ██████ █████ ███████████ █████ █████████ ▌ +▐ ███░░███ ░░███ ░░███░░░░░███░░███ ███░░░░░███ ▌ +▐ ████████ ░███ ░░░ ░███ ██████ ░███ ░███ ░███████ ░███ ░███ ▌ +▐ ░░███░░███ ███████ ██████████ ░███ ░░░░░███ ░██████████ ░███░░███ ░███████████ ▌ +▐ ░███ ░███ ░░░███░ ░░░░░░░░░░ ░███ ███████ ░███░░░░░███ ░███ ░███ ░███░░░░░███ ▌ +▐ ░███ ░███ ░███ ███ ░███ ███░░███ ░███ ░███ ░███ ░███ ░███ ░███ ▌ +▐ ████ █████ █████ ░░████████ ░░████████ ███████████ ████████ █████ █████ ▌ +▐ ░░░░ ░░░░░ ░░░░░ ░░░░░░░░ ░░░░░░░░ ░░░░░░░░░░░ ░░░░░░░░ ░░░░░ ░░░░░ ▌ +▐ ▌ +▐▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▌ + + +``` + [![Cite with 
Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) @@ -7,7 +26,11 @@ ## Introduction -**mskilab-org/nf-jabba** is a part of the core mski-lab pipeline workflows that is intended to run JaBbA, an MIP based joint inference of copy number and rearrangement state in cancer whole genome sequence data. This workflow runs all the prerequisite steps/processes necessary to run JaBbA. +**mskilab-org/nf-JaBbA** is a new state-of-art bioinformatics pipeline from [`mskilab-org`](https://www.mskilab.org/) that is intended to run [`JaBbA`](https://github.com/mskilab-org/JaBbA/tree/master), an MIP based joint inference of copy number and rearrangement state in cancer whole genome sequence data. It runs all the pre-requisite modules necessary to run JaBbA and as followed in `mskilab-org`. This pipeline is built to handle only tumor-normal pairs as input (as of now) and is designed and tested to run on Human samples. + +This pipeline is built after being influenced by `nf-core/Sarek`, a workflow designed to detect variants on whole genome or targeted sequencing data. It is built using [`Nextflow`](https://www.nextflow.io/) and is implemented using `Nextflow DSL2`. All the modules uses [`Docker`](https://www.docker.com/) and [`Singularity`](https://sylabs.io/docs/) containers which makes the pipeline easily reproducible and maintain its dependencies. Some of the modules/processes are used from [`nf-core/modules`](https://github.com/nf-core/modules) that are available for the Nextflow Community. + +This pipeline has been designed to start from scratch using **fastq** files or start using **BAM** files and should be supplied in a **csv** file as input (*please refer the documentation below for the input format of the .csv file*). 
- - - +We drew our inspiration and ideas from [`nf-core/Sarek`](https://github.com/nf-core/sarek), a workflow designed to detect variants on whole genome or targeted sequencing data. It is built using [`Nextflow`](https://www.nextflow.io/) and is implemented using `Nextflow DSL2`. All the modules uses [`Docker`](https://www.docker.com/) and [`Singularity`](https://sylabs.io/docs/) containers which makes the pipeline easily reproducible and maintain its dependencies. Some of the modules/processes are used from [`nf-core/modules`](https://github.com/nf-core/modules) that are available for the Nextflow Community. + +This pipeline has been designed to start from scratch using **FASTQ** files or start directly from **BAM** files as input and should be supplied in a **CSV** file (*please refer to the documentation below for the input format of the .csv file*). We incorporated a modified version of the `Alignment` step of `nf-JaBbA` pipeline from `nf-core/Sarek`, many thanks to the Sarek community. + +## Workflow Summary: +1. Alignment to Reference Genome (currently supports `BWA-MEM` & `BWA-MEM2`) +2. Quality Control (using [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +3. Perform trimming (must turn on using `--trim_fastq`) (using `fastp`) +4. Marking Duplicates (using `GATK MarkDuplicates`) +5. Perform base recalibration (using `GATK BaseRecalibrator`) +6. Apply BQSR (using `GATK ApplyBQSR`) +7. Perform Structural Variants Calling (using [`SVABA`](https://github.com/walaj/svaba) and/or [`GRIDSS`](https://github.com/PapenfussLab/gridss); must mention using `--tools`) +8. Perform Pileups (using mskilab's custom `HetPileups`; must mention using `--tools`) +9. Generate raw coverages and correct for GC & Mappability bias (using [`fragCounter`](https://github.com/mskilab-org/fragCounter); must mention using `--tools`) +10. Remove biological and technical noise from coverage data. 
(using [`Dryclean`](https://github.com/mskilab-org/dryclean); must mention using `--tools`) +11. Perform Segmentation by using tumor/normal ratios of corrected read counts, (using `CBS` circular binary segmentation algorithm; must mention using `--tools`) +12. Get Purity & Ploidy separately to supply to JaBbA (currently support [`ASCAT`](https://www.crick.ac.uk/research/labs/peter-van-loo/software) to pass ploidy values to JaBbA; must mention using `--tools`) +13. Execute JaBbA (using inputs from `Dryclean`, `CBS`, `HetPileups` and/or `ASCAT`; must mention using `--tools`) -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Usage From 448563fd3fabef08d53d3cd48696fb38dd0e8b8a Mon Sep 17 00:00:00 2001 From: Tanubrata Dey Date: Wed, 8 Nov 2023 10:17:59 -0500 Subject: [PATCH 4/7] Updates README and address bugs for JaBbA --- README.md | 15 +++++++-- conf/base.config | 2 +- modules/local/jabba/main.nf | 61 +++++++++++++++++++------------------ nextflow.config | 4 +-- 4 files changed, 47 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 4a1b991..d9dd9d9 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,15 @@ This pipeline has been designed to start from scratch using **FASTQ** files or s > to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) > with `-profile test` before running the workflow on actual data. +### Setting up the ***samplesheet.csv*** file for input: + +You need to create a samplesheet with information regarding the samples that you want to run the pipeline on. You need to specify the path of your **samplesheet** using the `--input` flag to specify the location. Make sure the input file is a *comma-separated* file and must contain headers with info discussed below. 
*It is highly recommended to provide the **absolute path** for inputs inside the samplesheet rather than relative paths.* + +To mention a sample as paired tumor-normal, it has to be specified with the same `patient` ID, a different `sample`, and their respective `status`. For instance, a `tumor` sample should be mentioned **1** in `status` field for a sample, if it is normal mention **0**. If there are multiple `sample` IDs, `nf-jabba` will consider them as separate samples and output the results on separate folders based on `patient`, rest assured all the runs will be separate based on `patient`, so no need to be concerned with getting the outputs mixed. + +You need to specify the desired output directory path using `--outdir` flag when you start a run so that the outputs get stored on your designated folder and separated by `tool` and `sample` names in folders. + + diff --git a/conf/base.config b/conf/base.config index 6243ac1..21fd912 100644 --- a/conf/base.config +++ b/conf/base.config @@ -133,7 +133,7 @@ process { } withName: 'JABBA' { cpus = { check_max( 16 * task.attempt, 'cpus' ) } - memory = { check_max( 48.GB * task.attempt, 'memory' ) } + memory = { check_max( 64.GB * task.attempt, 'memory' ) } time = { check_max( 10.h * task.attempt, 'time' ) } } withLabel:error_ignore { diff --git a/modules/local/jabba/main.nf b/modules/local/jabba/main.nf index 951fec6..c649d5f 100644 --- a/modules/local/jabba/main.nf +++ b/modules/local/jabba/main.nf @@ -100,36 +100,37 @@ process JABBA { echo \$jba set +x - export cmd="Rscript \$jba $junction $cov_rds \\ - --j.supp $j_supp \\ - --indel $indel \\ - --tfield $tfield \\ - --iterate $iter \\ - --rescue.window $rescue_window \\ - --rescue.all $rescue_all \\ - --nudgebalanced $nudgebalanced \\ - --edgenudge $edgenudge \\ - --field $field \\ - --seg $cbs_seg_rds \\ - --maxna $maxna \\ - --blacklist.coverage $blacklist_coverage \\ - --nseg $cbs_nseg_rds \\ - --hets $het_pileups_wgs \\ - --ploidy $ploidy \\ - --purity $purity 
\\ - --ppmethod $pp_method \\ - --cnsignif $cnsignif \\ - --slack $slack \\ + export cmd="Rscript \$jba ${junction} ${cov_rds} \\ + --j.supp ${j_supp} \\ + --indel ${indel} \\ + --tfield ${tfield} \\ + --iterate ${iter} \\ + --rescue.window ${rescue_window} \\ + --rescue.all ${rescue_all} \\ + --nudgebalanced ${nudgebalanced} \\ + --edgenudge ${edgenudge} \\ + --field ${field} \\ + --seg ${cbs_seg_rds} \\ + --maxna ${maxna} \\ + --blacklist.coverage ${blacklist_coverage} \\ + --nseg ${cbs_nseg_rds} \\ + --hets ${het_pileups_wgs} \\ + --ploidy ${ploidy} \\ + --purity ${purity} \\ + --ppmethod ${pp_method} \\ + --cnsignif ${cnsignif} \\ + --slack ${slack} \\ --linear \\ - --tilim $tilim \\ - --epgap $epgap \\ - --name $name \\ - --cores $task.cpus \\ - --fix.thres $fix_thres \\ - --lp $lp \\ - --ism $ism \\ - --filter_loose $filter_loose \\ - --gurobi $gurobi \\ + --tilim ${tilim} \\ + --epgap ${epgap} \\ + --name ${meta.id} \\ + --cores 12 \\ + --mem 16 \\ + --fix.thres ${fix_thres} \\ + --lp ${lp} \\ + --ism ${ism} \\ + --filter_loose ${filter_loose} \\ + --gurobi ${gurobi} \\ " cat <<-END_VERSIONS > versions.yml @@ -146,7 +147,7 @@ process JABBA { exit \$cmdsig fi - exit 0 + ## exit 0 """ stub: diff --git a/nextflow.config b/nextflow.config index f94add6..5318dcc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -111,8 +111,8 @@ params { cnsignif_jabba = 0.00001 slack_jabba = 100 linear_jabba = "TRUE" - tilim_jabba = 7200 - epgap_jabba = 0.00000001 + tilim_jabba = 5000 + epgap_jabba = 0.000001 fix_thres_jabba = -1 lp_jabba = "TRUE" ism_jabba = "TRUE" From aae072f9e40ae15a14c310fba32a08c4610b23d3 Mon Sep 17 00:00:00 2001 From: Tanubrata Dey Date: Wed, 8 Nov 2023 10:28:40 -0500 Subject: [PATCH 5/7] Updates the config files to catch command files for each run --- conf/modules/ascat.config | 2 +- conf/modules/cbs.config | 4 ++-- conf/modules/dryclean.config | 4 ++-- conf/modules/fragcounter.config | 4 ++-- conf/modules/hetpileups.config | 2 +- 
conf/modules/jabba.config | 4 ++-- conf/modules/structural_variants.config | 6 +++--- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/conf/modules/ascat.config b/conf/modules/ascat.config index ae372b8..3929870 100644 --- a/conf/modules/ascat.config +++ b/conf/modules/ascat.config @@ -22,7 +22,7 @@ process { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/ASCAT/${meta.id}/" }, - pattern: "*{rds,png,cnvs.txt,metrics.txt,purityploidy.txt,segments.txt,LogR.txt,BAF.txt}" + pattern: "*{rds,png,cnvs.txt,metrics.txt,purityploidy.txt,segments.txt,LogR.txt,BAF.txt,.command.*}" ] } } \ No newline at end of file diff --git a/conf/modules/cbs.config b/conf/modules/cbs.config index 45a4b76..ecafc5e 100644 --- a/conf/modules/cbs.config +++ b/conf/modules/cbs.config @@ -18,8 +18,8 @@ process { ext.when = { params.tools && params.tools.split(',').contains('cbs') } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/cbs/${meta.id}/" }, - pattern: "*{.rds}" + path: { "${params.outdir}/CBS/${meta.id}/" }, + pattern: "*{.rds,.command.*}" ] } } diff --git a/conf/modules/dryclean.config b/conf/modules/dryclean.config index e4b8bbe..3179ed3 100644 --- a/conf/modules/dryclean.config +++ b/conf/modules/dryclean.config @@ -22,7 +22,7 @@ process { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/Coverages/Dryclean_tumor/${meta.id}/" }, - pattern: "*cov.rds" + pattern: "*{cov.rds,.command.*}" ] } @@ -32,7 +32,7 @@ process { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/Coverages/Dryclean_normal/${meta.id}/" }, - pattern: "*cov.rds" + pattern: "*{cov.rds,.command.*}" ] } } \ No newline at end of file diff --git a/conf/modules/fragcounter.config b/conf/modules/fragcounter.config index f252841..618f65d 100644 --- a/conf/modules/fragcounter.config +++ b/conf/modules/fragcounter.config @@ -21,7 +21,7 @@ process { publishDir = [ mode: params.publish_dir_mode, path: { 
"${params.outdir}/Coverages/fragCounter_tumor/${meta.id}/" }, - pattern: "*{.rds,.bw,cov*}" + pattern: "*{.rds,.bw,cov*,.command.*}" ] } @@ -30,7 +30,7 @@ process { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/Coverages/fragCounter_normal/${meta.id}/" }, - pattern: "*{.rds,.bw,cov*}" + pattern: "*{.rds,.bw,cov*,.command.*}" ] } } diff --git a/conf/modules/hetpileups.config b/conf/modules/hetpileups.config index 390d4e6..c315323 100644 --- a/conf/modules/hetpileups.config +++ b/conf/modules/hetpileups.config @@ -19,7 +19,7 @@ process { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/Hetpileups/${meta.id}/" }, - pattern: "*{.txt*}" + pattern: "*{.txt*,.command.*}" ] } } diff --git a/conf/modules/jabba.config b/conf/modules/jabba.config index 2a5c4a9..21cc7f2 100644 --- a/conf/modules/jabba.config +++ b/conf/modules/jabba.config @@ -18,8 +18,8 @@ process { ext.when = { params.tools && params.tools.split(',').contains('jabba') } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/jabba/${meta.id}/" }, - pattern: "*{.rds,.vcf,.seg}" + path: { "${params.outdir}/JaBbA/${meta.id}/" }, + pattern: "*{.rds*,.vcf,.seg,.png,.txt,.command.*}" ] } } diff --git a/conf/modules/structural_variants.config b/conf/modules/structural_variants.config index 9c31425..242567f 100644 --- a/conf/modules/structural_variants.config +++ b/conf/modules/structural_variants.config @@ -21,7 +21,7 @@ process { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/SV_calling/SVABA/${meta.id}/" }, - pattern: "*{vcf.gz,txt.gz,vcf*,bam}" + pattern: "*{vcf.gz,txt.gz,vcf*,bam,.command.*}" ] } @@ -31,7 +31,7 @@ process { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/SV_calling/GRIDSS/${meta.id}/" }, - pattern: "*{vcf.gz,txt.gz,vcf*,bam}" + pattern: "*{vcf.gz,txt.gz,vcf*,bam,.command.*}" ] } @@ -40,7 +40,7 @@ process { publishDir = [ mode: params.publish_dir_mode, path: { 
"${params.outdir}/SV_calling/GRIDSS_SOMATIC/${meta.id}/" }, - pattern: "*{vcf.bgz,vcf.bgz.tbi}" + pattern: "*{vcf.bgz,vcf.bgz.tbi,.command.*}" ] } } From ddb9ede899fa2ddace137d725f64d02258ba5252 Mon Sep 17 00:00:00 2001 From: Tanubrata Dey Date: Wed, 8 Nov 2023 18:26:08 -0500 Subject: [PATCH 6/7] Updates README.md with detailed discussion of inputs and steps of pipeline --- README.md | 199 +++++++++++++++++++++++++++++++++++++++++------- nextflow.config | 14 ++-- 2 files changed, 180 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index d9dd9d9..588a802 100644 --- a/README.md +++ b/README.md @@ -60,39 +60,192 @@ To mention a sample as paired tumor-normal, it has to be specified with the same You need to specify the desired output directory path using `--outdir` flag when you start a run so that the outputs get stored on your designated folder and separated by `tool` and `sample` names in folders. - - - -Now, you can run the pipeline using: - - +Each row represents a pair of fastq files (paired end) for each Sample. +After the input file is ready, you can run the pipeline using: ```bash nextflow run mskilab-org/nf-jabba \ -profile \ --input samplesheet.csv \ - --outdir + --outdir \ + --tools \ + --genome ``` - > **Warning:** -> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those +> Please provide pipeline parameters via the CLI or Nextflow [`-params-file`](https://www.nextflow.io/blog/2020/cli-docs-release.html) option. Custom config files including those > provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; > see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). 
+ +### Discussion of expected fields in input file and expected inputs for each `--step` + +A typical sample sheet should populate with the column names as hown below: + +|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| +| Column Name | Description | +|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| +| patient | Patient or Sample ID. This should differentiate each patient/sample. *Note*: Each patient can have multiple sample names. | +| sample | Sample ID for each Patient. Should differentiate between tumor and normal. Sample IDs should be unique to Patient IDs | +| lane | If starting with FASTQ files, if there are multiple lanes for each sample for each patient, mention lane name. **Required for `--step alignment`. | +| sex | If known, please provide the sex for the patient. For instance if **Male** type XY, else if **Female** type XX, else others and unknown should be NA. | +| status | This should tell if your sample is **tumor** or **normal**. For **normal**, write 0, and for **tumor**, write 1. | +| fastq_1 | Full Path to FASTQ file read 1. The extension should be `.fastq.gz` or `.fq.gz`. **Required** for `--step alignment`. | +| fastq_2 | Full Path to FASTQ file read 2. The extension should be `.fastq.gz` or `.fq.gz`. **Required** for `--step alignment`. | +| bam | Full Path to BAM file. The extension should be `.bam`. **Required** for `--step sv_calling`. | +| bai | Full Path to BAM index file. The extension should be `.bam.bai`. **Required** for `--step sv_calling`. | +| cram | Full Path to CRAM file. The extension should be `.cram`. **Required** for `--step sv_calling` if file is of type `CRAM`. | +| crai | Full Path to CRAM index file. The extension should be `.cram.crai`. 
**Required** for `--step sv_calling` if file is of type `CRAM`. | +| table | Full path to Recalibration table file. **Required** for `--step recalibrate`. | +| vcf | Full path to VCF file. **Required** for `--step jabba`. | +| hets | Full path to HetPileups .txt file. **Required** for `--step jabba`. | +|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| + +There are multiple `--steps` for `nf-jabba`. The main idea behind this was to make each of the tool separate so that one can only run their only desired tool rather than running the whole pipeline when it is provided with the required outputs. There are 2 primary `--steps` in the pipeline which can lead to `JaBbA` and run the module if mentioned with all the tools using a list of comma-separated names in `--tools`. A full input `csv` files for these 2 steps are shown below: +- **`--step alignment`** + +``` +patient,sex,status,sample,lane,fastq_1,fastq_2 +TCXX49,XX,0,TCXX49_N,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz +TCXX49,XX,0,TCXX49_N,lane_2,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz +TCXX49,XX,1,TCXX49_T,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz +TCXX49,XX,1,TCXX49_T,lane_2,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz +TCXX52,NA,0,TCXX52_N,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz +TCXX52,NA,1,TCXX52_T,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz +``` +- **`--step sv_calling`** + +``` +patient,sex,status,sample,bam,bai +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +``` +> **Note** +> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers +> and pass the *cram* and *crai* paths 
there + + +There are also many secondary `--steps` for this pipeline that are designed to only run a specific `--tools` for the module but is not adequate to run **`JaBbA`**. You can also run only **`JaBbA`** if you have all the inputs available and can be provided in the `--input` `csv` file. Below, we provide the other secondary steps and their desired input `csv` files for each step. + +- **`--step markduplicates`** + +``` +patient,sex,status,sample,bam,bai +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +``` +> **Note** +> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers +> and pass the *cram* and *crai* paths there + + +- **`--step prepare_recalibration`** + +``` +patient,sex,status,sample,bam,bai +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +``` +> **Note** +> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers +> and pass the *cram* and *crai* paths there + + +- **`--step recalibrate`** + +``` +patient,sex,status,sample,bam,bai,table +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX49_N.table +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX49_T.table +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX52_N.table +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX52_T.table +``` + +- **`--step fragcounter`** + +``` +patient,sex,status,sample,bam,bai 
+TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +``` +> **Note** +> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers +> and pass the *cram* and *crai* paths there + + +- **`--step dryclean`** (**Note**: you should also mention `--tools dryclean` to use Dryclean. This step also has `CBS`, if you want to perform both Dryclean and CBS, use `--tools dryclean,cbs`) + +``` +patient,sex,status,sample,cov +TCXX49,XX,0,TCXX49_N,/path/to/coverage.rds +TCXX49,XX,1,TCXX49_T,/path/to/coverage.rds +TCXX52,NA,0,TCXX52_N,/path/to/coverage.rds +TCXX52,NA,1,TCXX52_T,/path/to/coverage.rds +``` + +- **`--step hetpileups`** (**Note**: you should also mention `--tools hetpileups` to use HetPileups.) + +``` +patient,sex,status,sample,bam,bai +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +``` +> **Note** +> HetPileups does not support for *`cram`* files, you must use *`bam`* files for this step. + + +- **`--step ascat`** (**Note**: you should also mention `--tools ascat` to use ASCAT.) + +``` +patient,sex,status,sample,cov,hets +TCXX49,XX,1,TCXX49_T,/path/to/coverage.rds,/path/to/hetpileups/sites.txt +TCXX52,NA,1,TCXX52_T,/path/to/coverage.rds,/path/to/hetpileups/sites.txt +``` + + +- **`--step jabba`** (**Note**: you should also mention `--tools jabba` to use JaBbA.) 
+ +``` +patient,sex,status,sample,cov,vcf +TCXX49,XX,1,TCXX49_T,/path/to/tumor/coverage.rds,/path/to/sv_caller/tumor/somatic.vcf +TCXX52,NA,1,TCXX52_T,/path/to/tumor/coverage.rds,/path/to/sv_caller/tumor/somatic.vcf +``` + +### Helpful Core Nextflow Commands: + +#### `-resume` +This is a life saving command which is part of Nextflow. If a Process of the pipeline fails at some point, Nextflow has the ability to start from that step where the job failed rather than starting all the way from the beginning. You must specify this in the `-CLI` or on the `command-line` when restarting a pipeline. You can also supply a run name to resume a specific run using: `-resume` [run-name]. Use the `nextflow log` command to show previous run names. + +#### `-profile` +Use this parameter for choosing a configuration profile. Profiles can give configuration presets for different computing environments. + +Several generic profiles have been provided with the pipeline which instruct the pipeline to use software packaged using different methods. You need to use this option to mention when using containers (singularity/Docker) which is highly recommended for running the pipeline. + +#### `-c` +You can mention custom configuration scripts to run the pipeline with using `-c` flag and providing the path to the `.config` file. This is advised when you want to submit processes into an executor like `slurm/LSF/..`. + +#### `-bg` +The Nextflow `-bg` flag helps launch the Nextflow pipeline in the background, detached from your terminal, so that the current run does not stop if you log out of your session; the logs of the run are saved inside a file. Alternative ways include using `screen` or `tmux` sessions which you can easily detach and log back in at a later time. + +## Debugging any step/process: + +To debug any step or process that failed, please check your current `execution_trace*.txt` file inside the `/pipeline_info/` folder and gather the `hash` number for that process. 
Then go inside the `work` folder and paste that `hash` number to locate the working directory for that process. There should be multiple `.command.*` files inside that folder which correspond to your run. This includes log, sh, trace, error files. One good thing is you can run the `.command.sh` script locally to check where it is exactly breaking and replicate the issue (though you might need to edit the command a bit to run it successfully locally). + ## Credits `nf-jabba` was originally written by [`Tanubrata Dey`](https://github.com/tanubrata) and [`Shihab Dider`](https://github.com/shihabdider) at the Perlmutter Cancer Center and the New York Genome Center. @@ -101,7 +254,6 @@ We thank the following people for their extensive guidance in the development of - [Marcin Imielinski](https://github.com/imielinski) - [Joel Rosiene](https://github.com/jrosiene) - ## Contributions and Support @@ -109,11 +261,6 @@ If you would like to contribute to this pipeline, please see the [contributing g ## Citations - - - - - An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). 
diff --git a/nextflow.config b/nextflow.config index 5318dcc..e5fc2fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -188,8 +188,8 @@ params { // Config options config_profile_name = null config_profile_description = null - custom_config_version = 'master' - custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + //custom_config_version = 'master' + //custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null @@ -212,11 +212,11 @@ params { includeConfig 'conf/base.config' // Load nf-core custom profiles from different Institutions -try { - includeConfig "${params.custom_config_base}/nfcore_custom.config" -} catch (Exception e) { - System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") -} +//try { +// includeConfig "${params.custom_config_base}/nfcore_custom.config" +//} catch (Exception e) { +// System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") +//} // Load nf-core/heisenbio custom profiles from different institutions. // Warning: Uncomment only if a pipeline-specific instititutional config already exists on nf-core/configs! 
From 5480070be7f3ec4266c5534de1745de487d0d6ee Mon Sep 17 00:00:00 2001 From: Tanubrata Dey Date: Wed, 8 Nov 2023 22:31:43 -0500 Subject: [PATCH 7/7] Updates the Documentation and upgrades resources for each process --- README.md | 133 ++---------------------- conf/base.config | 14 +-- docs/README.md | 6 +- docs/output.md | 222 +++++++++++++++++++++++++++++++++++----- docs/usage.md | 259 ++++++++++++++++++++++++++++++++--------------- 5 files changed, 392 insertions(+), 242 deletions(-) diff --git a/README.md b/README.md index 588a802..659d6ce 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ **mskilab-org/nf-JaBbA** is a new state-of-art bioinformatics pipeline from [`mskilab-org`](https://www.mskilab.org/) that is intended to run [`JaBbA`](https://github.com/mskilab-org/JaBbA/tree/master), an MIP based joint inference of copy number and rearrangement state in cancer whole genome sequence data. It runs all the pre-requisite modules necessary to run JaBbA and as followed in `mskilab-org`. This pipeline is built to handle only tumor-normal pairs as input (as of now) and is designed and tested to run on Human samples. -We drew our inspiration and ideas from [`nf-core/Sarek`](https://github.com/nf-core/sarek), a workflow designed to detect variants on whole genome or targeted sequencing data. It is built using [`Nextflow`](https://www.nextflow.io/) and is implemented using `Nextflow DSL2`. All the modules uses [`Docker`](https://www.docker.com/) and [`Singularity`](https://sylabs.io/docs/) containers which makes the pipeline easily reproducible and maintain its dependencies. Some of the modules/processes are used from [`nf-core/modules`](https://github.com/nf-core/modules) that are available for the Nextflow Community. +We drew our inspiration and ideas from [`nf-core/Sarek`](https://github.com/nf-core/sarek), a workflow designed to detect variants on whole genome or targeted sequencing data. 
**`nf-jabba`** is built using [`Nextflow`](https://www.nextflow.io/) and is implemented using `Nextflow DSL2`. All the modules uses [`Docker`](https://www.docker.com/) and [`Singularity`](https://sylabs.io/docs/) containers which makes the pipeline easily reproducible and maintain its dependencies. Some of the modules/processes are used from [`nf-core/modules`](https://github.com/nf-core/modules) that are available for the Nextflow Community. This pipeline has been designed to start from scratch using **FASTQ** files or start directly from **BAM** files as input and should be supplied in a **CSV** file (*please refer to the documentation below for the input format of the .csv file*). We incorporated a modified version of the `Alignment` step of `nf-JaBbA` pipeline from `nf-core/Sarek`, many thanks to the Sarek community. @@ -85,9 +85,8 @@ nextflow run mskilab-org/nf-jabba \ ### Discussion of expected fields in input file and expected inputs for each `--step` -A typical sample sheet should populate with the column names as hown below: +A typical sample sheet should populate with the column names as shown below: -|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| | Column Name | Description | |-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| | patient | Patient or Sample ID. This should differentiate each patient/sample. *Note*: Each patient can have multiple sample names. | @@ -104,147 +103,29 @@ A typical sample sheet should populate with the column names as hown below: | table | Full path to Recalibration table file. **Required** for `--step recalibrate`. | | vcf | Full path to VCF file. **Required** for `--step jabba`. | | hets | Full path to HetPileups .txt file. **Required** for `--step jabba`. 
| -|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| - -There are multiple `--steps` for `nf-jabba`. The main idea behind this was to make each of the tool separate so that one can only run their only desired tool rather than running the whole pipeline when it is provided with the required outputs. There are 2 primary `--steps` in the pipeline which can lead to `JaBbA` and run the module if mentioned with all the tools using a list of comma-separated names in `--tools`. A full input `csv` files for these 2 steps are shown below: -- **`--step alignment`** - -``` -patient,sex,status,sample,lane,fastq_1,fastq_2 -TCXX49,XX,0,TCXX49_N,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz -TCXX49,XX,0,TCXX49_N,lane_2,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz -TCXX49,XX,1,TCXX49_T,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz -TCXX49,XX,1,TCXX49_T,lane_2,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz -TCXX52,NA,0,TCXX52_N,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz -TCXX52,NA,1,TCXX52_T,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz -``` -- **`--step sv_calling`** - -``` -patient,sex,status,sample,bam,bai -TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai -``` -> **Note** -> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers -> and pass the *cram* and *crai* paths there - - -There are also many secondary `--steps` for this pipeline that are designed to only run a specific `--tools` for the module but is not adequate to run **`JaBbA`**. You can also run only **`JaBbA`** if you have all the inputs available and can be provided in the `--input` `csv` file. 
Below, we provide the other secondary steps and their desired input `csv` files for each step. -- **`--step markduplicates`** -``` -patient,sex,status,sample,bam,bai -TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai -``` -> **Note** -> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers -> and pass the *cram* and *crai* paths there - - -- **`--step prepare_recalibration`** - -``` -patient,sex,status,sample,bam,bai -TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai -``` -> **Note** -> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers -> and pass the *cram* and *crai* paths there - - -- **`--step recalibrate`** - -``` -patient,sex,status,sample,bam,bai,table -TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX49_N.table -TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX49_T.table -TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX52_N.table -TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX52_T.table -``` - -- **`--step fragcounter`** - -``` -patient,sex,status,sample,bam,bai -TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai -``` -> **Note** -> If you are using cram files, just 
replace *bam* and *bai* headers with *cram* and *crai* headers -> and pass the *cram* and *crai* paths there - - -- **`--step dryclean`** (**Note**: you should also mention `--tools dryclean` to use Dryclean. This step also has `CBS`, if you want to perform both Dryclean and CBS, use `--tools dryclean,cbs`) - -``` -patient,sex,status,sample,cov -TCXX49,XX,0,TCXX49_N,/path/to/coverage.rds -TCXX49,XX,1,TCXX49_T,/path/to/coverage.rds -TCXX52,NA,0,TCXX52_N,/path/to/coverage.rds -TCXX52,NA,1,TCXX52_T,/path/to/coverage.rds -``` - -- **`--step hetpileups`** (**Note**: you should also mention `--tools hetpileups` to use HetPileups.) - -``` -patient,sex,status,sample,bam,bai -TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai -TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai -``` -> **Note** -> HetPileups does not support for *`cram`* files, you must use *`bam`* files for this step. - - -- **`--step ascat`** (**Note**: you should also mention `--tools ascat` to use ASCAT.) - -``` -patient,sex,status,sample,cov,hets -TCXX49,XX,1,TCXX49_T,/path/to/coverage.rds,/path/to/hetpileups/sites.txt -TCXX52,NA,1,TCXX52_T,/path/to/coverage.rds,/path/to/hetpileups/sites.txt -``` - - -- **`--step jabba`** (**Note**: you should also mention `--tools jabba` to use JaBbA.) - -``` -patient,sex,status,sample,cov,vcf -TCXX49,XX,1,TCXX49_T,/path/to/tumor/coverage.rds,/path/to/sv_caller/tumor/somatic.vcf -TCXX52,NA,1,TCXX52_T,/path/to/tumor/coverage.rds,/path/to/sv_caller/tumor/somatic.vcf -``` +For more information and further functionality regarding the pipeline usage and the inputs necessary for each step, please follow the [Usage](docs/usage.md) documentation as suggested here. ### Helpful Core Nextflow Commands: #### `-resume` This is a life saving command which is part of Nextflow.
If a Process of the pipeline fails at some point, Nextflow has the ability to start from that step where the job failed rather than starting all the way from the beginning. You must specify this in the `-CLI` or on the `command-line` when restarting a pipeline. You can also supply a run name to resume a specific run using: `-resume` [run-name]. Use the `nextflow log` command to show previous run names. -### `-profile` +#### `-profile` Use this parameter for choosing a configuration profile. Profiles can give configuration presets for different computing environments. Several generic profiles have been provided with the pipeline which instruct the pipeline to use software packaged using different methods. You need to use this option to mention when using containers (singularity/Docker) which is highly recommended for running the pipeline. -### `-c` +#### `-c` You can mention custom configuration scripts to run the pipeline with using `-c` flag and providing the path to the `.config` file. This is advised when you want to submit processes into an executor like `slurm/LSF/..`. -### `-bg` +#### `-bg` The Nextflow `-bg` flag helps launching Nextflow pipeline in the background, and being detached from your terminal so that the curren run does not stop if you log out of your session and the log of the run are saved inside a file. Alternative ways include using `screen` or `tmux` sessions which you can easily detach and log back in at a later time. ## Debugging any step/process: -To debug any step or process that failed, please check your current `execution_trace*.txt` file inside the `/pipeline_info/` folder and gather the `hash` number for that process. Then go inside the `work` folder and paste that `hash` number to locate thw working directory for that process. There should be multiple `.command.*` files inside that folder which corresponds to your run. This includes log, sh, trace, error files. 
One good thing is you can run ``.command.sh` script locally to check where it is exactly breaking and replicat the issue (though you might need to edit the command a bit to run it successfully locally). +To debug any step or process that failed, please check your current `execution_trace*.txt` file inside the `/pipeline_info/` folder and gather the `hash` number for that process. Then go inside the `work` folder and paste that `hash` number to locate the working directory for that process. There should be multiple `.command.*` files inside that folder which correspond to your run. This includes log, sh, trace, error files. One good thing is you can run `.command.sh` script locally to check where it is exactly breaking and replicate the issue (though you might need to edit the command a bit to run it successfully locally). ## Credits diff --git a/conf/base.config b/conf/base.config index 21fd912..dea1a87 100644 --- a/conf/base.config +++ b/conf/base.config @@ -113,28 +113,28 @@ process { } withName: 'FRAGCOUNTER' { cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } + memory = { check_max( 48.GB * task.attempt, 'memory' ) } time = { check_max( 48.h * task.attempt, 'time' ) } } withName: 'DRYCLEAN' { cpus = { check_max( 8 * task.attempt, 'cpus' ) } - memory = { check_max( 24.GB * task.attempt, 'memory' ) } - time = { check_max( 24.h * task.attempt, 'time' ) } + memory = { check_max( 48.GB * task.attempt, 'memory' ) } + time = { check_max( 36.h * task.attempt, 'time' ) } } withName: 'ASCAT_SEG' { cpus = { check_max( 10 * task.attempt, 'cpus' ) } - memory = { check_max( 24.GB * task.attempt, 'memory' ) } + memory = { check_max( 30.GB * task.attempt, 'memory' ) } time = { check_max( 10.h * task.attempt, 'time' ) } } withName: 'CBS' { cpus = { check_max( 8 * task.attempt, 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } - time = { check_max( 10.h * task.attempt, 'time' ) } + memory = {
check_max( 32.GB * task.attempt, 'memory' ) } + time = { check_max( 24.h * task.attempt, 'time' ) } } withName: 'JABBA' { cpus = { check_max( 16 * task.attempt, 'cpus' ) } memory = { check_max( 64.GB * task.attempt, 'memory' ) } - time = { check_max( 10.h * task.attempt, 'time' ) } + time = { check_max( 12.h * task.attempt, 'time' ) } } withLabel:error_ignore { errorStrategy = 'ignore' diff --git a/docs/README.md b/docs/README.md index 008f782..5d0e8d0 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,10 +1,10 @@ -# nf-core/heisenbio: Documentation +# mskilab-org/nf-jabba: Documentation -The nf-core/heisenbio documentation is split into the following pages: +The mskilab-org/nf-jabba documentation is split into the following pages: - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. - [Output](output.md) - An overview of the different results produced by the pipeline and how to interpret them. -You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re) +You can find a lot more documentation about installing, configuring and running pipelines on the website: [https://nf-co.re](https://nf-co.re) diff --git a/docs/output.md b/docs/output.md index 92c3ad9..be92c1e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,57 +1,231 @@ -# nf-core/heisenbio: Output +# mskilab-org/nf-jabba: Output ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document describes the output produced by the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
- - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- **Directory Structure** +- **Alignment** +- **SV Calling** +- **Coverages** +- **CBS** +- **ASCAT** +- **JaBbA** +- **Pipeline information** + +## Directory Structure + +``` +{outdir} +├── csv +├── CBS +├── pipeline_info +├── Alignment +│ ├── markduplicates +│ └── +│ ├── recal_table +│ └── +│ └── recalibrated +│ └── +├── Reference +├── SV_calling +│ ├── SVABA +│ └── +│ ├── GRIDSS +│ └── +├── HetPileups +├── JaBbA +├── Coverages +│ ├── fragCounter_normal +│ └── +│ ├── fragCounter_tumor +│ └── +│ └── Dryclean_normal +│ └── +│ └── Dryclean_tumor +│ └── +└── Reports + ├── + └── +work/ +.nextflow.log +``` + +## Alignment +`nf-jabba` pre-processes raw FastQ files or unmapped BAM files, based on [GATK best practices](https://gatk.broadinstitute.org/hc/en-us/sections/360007226651-Best-Practices-Workflows). + +### Preparation of input files (FastQ or (u)BAM) + +[FastP](https://github.com/OpenGene/fastp) is a tool designed to provide all-in-one preprocessing for FastQ files and as such is used for trimming and splitting. By default, these files are not published. However, if publishing is enabled, please be aware that these files are only published once, meaning if trimming and splitting is enabled, then the resulting files will be sharded FastQ files with trimmed reads. If only one of them is enabled then the files contain either trimmed or split reads, respectively. + +#### Trim adapters + +[FastP](https://github.com/OpenGene/fastp) supports global trimming, which means it trims all reads in the front or the tail. This function is useful since sometimes you want to drop some cycles of a sequencing run. 
In the current implementation in Sarek +`--detect_adapter_for_pe` is set by default which enables auto-detection of adapter sequences. For more information on how to fine-tune adapter trimming, take a look into the parameter docs. + +The resulting files are intermediate and by default not kept in the final files delivered to users. Set `--save_trimmed` to enable publishing of the files in: -### FastQC +
+Output files for all samples + +**Output directory: `{outdir}/Alignment/fastp/`** + +- `__{1,2}.fastp.fastq.gz>` + - Bgzipped FastQ file + +
+ +#### Split FastQ files + +[FastP](https://github.com/OpenGene/fastp) supports splitting of one FastQ file into multiple files allowing parallel alignment of sharded FastQ file. To enable splitting, the number of reads per output can be specified. For more information, take a look into the parameter `--split_fastq` in the parameter docs. + +These files are intermediate and by default are neither placed in the output folder nor kept in the final files delivered to users. Set `--save_split` to enable publishing of these files to:
-Output files +Output files for all samples -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +**Output directory: `{outdir}/Alignment/fastp//`** + +- `` + - Bgzipped FastQ file
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +### Mapping to the Reference Genome -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +#### BWA -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +[BWA](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome. The aligned reads are then coordinate-sorted (or name-sorted if [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark) is used for duplicate marking) with [samtools](https://www.htslib.org/doc/samtools.html). -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +#### BWA-mem2 + +[BWA-mem2](https://github.com/bwa-mem2/bwa-mem2) is a software package for mapping low-divergent sequences against a large reference genome. The aligned reads are then coordinate-sorted (or name-sorted if [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark) is used for duplicate marking) with [samtools](https://www.htslib.org/doc/samtools.html). + +
+Output files for all mappers and samples + +The alignment files (BAM or CRAM) produced by the chosen aligner are not published by default. CRAM output files will not be saved in the output-folder (`outdir`), unless the flag `--save_mapped` is used. BAM output can be selected by setting the flag `--save_output_as_bam`. + +**Output directory: `{outdir}/Alignment/mapped//`** + +- if `--save_mapped`: `.sorted.cram` and `.sorted.cram.crai` + + - CRAM file and index + +- if `--save_mapped --save_output_as_bam`: `.sorted.bam` and `.sorted.bam.bai` + - BAM file and index +
+ +### Mark Duplicates + +During duplicate marking, read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, causing the marked pairs to be ignored by default during the variant discovery process. + +For further reading and documentation see the [data pre-processing for variant discovery from the GATK best practices](https://gatk.broadinstitute.org/hc/en-us/articles/360035535912-Data-pre-processing-for-variant-discovery). + +The resulting CRAM files are delivered to the users. + +
+Output files for all samples + +**Output directory: `{outdir}/Alignment/markduplicates//`** + +- `.md.cram` and `.md.cram.crai` + - CRAM file and index +- if `--save_output_as_bam`: + - `.md.bam` and `.md.bam.bai` + +
-> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +### Base Quality Score Recalibration -### MultiQC +During Base Quality Score Recalibration, systematic errors in the base quality scores are corrected by applying machine learning to detect and correct for them. This is important for evaluating the correct call of a variant during the variant discovery process. However, this is not needed for all combinations of tools in Sarek. Notably, this should be turned off when having UMI tagged reads or using DragMap (see [here](https://gatk.broadinstitute.org/hc/en-us/articles/4407897446939--How-to-Run-germline-single-sample-short-variant-discovery-in-DRAGEN-mode)) as mapper. + +For further reading and documentation see the [technical documentation by GATK](https://gatk.broadinstitute.org/hc/en-us/articles/360035890531-Base-Quality-Score-Recalibration-BQSR-). + +
+Output files for all samples + +**Output directory: `{outdir}/Alignment/recal_table//`** + +- `.recal.table` + - Recalibration table associated to the duplicates-marked CRAM file. + +
+ +### GATK ApplyBQSR + +[GATK ApplyBQSR](https://gatk.broadinstitute.org/hc/en-us/articles/5358826654875-ApplyBQSR) recalibrates the base qualities of the input reads based on the recalibration table produced by the [GATK BaseRecalibrator](#gatk-baserecalibrator) tool. + +The resulting recalibrated CRAM files are delivered to the user. Recalibrated CRAM files are usually 2-3 times larger than the duplicate-marked CRAM files. + +
+Output files for all samples + +**Output directory: `{outdir}/Alignment/recalibrated//`** + +- `.recal.cram` and `.recal.cram.crai` + - CRAM file and index +- if `--save_output_as_bam`: + - `.recal.bam` and `.recal.bam.bai` - BAM file and index +
+ +## SV_calling + +The results regarding structural variant calling are collected in {outdir}/SV_calling/. If some results from a variant caller do not appear here, please check out the `--tools` section to check if an SV caller was mentioned. + +Base Recalibrated CRAM files can be used as an input to start the structural variant calling. + +### SvABA +SvABA is a method for detecting structural variants in sequencing data using genome-wide local assembly. For reference, check [info](https://github.com/walaj/svaba) + +### GRIDSS +GRIDSS is a module software suite containing tools useful for the detection of genomic rearrangements. It can also detect purity and ploidy. For reference, check [info](https://github.com/PapenfussLab/gridss) + +## fragCounter +The goal of fragCounter is to correct Whole genome or targeted sequencing data for GC and mappability bias. +The GC bias curve is determined by loess regression of read count by GC and mappability scores. For reference, check [info](https://github.com/mskilab-org/fragCounter) + +## Dryclean +Dryclean is a robust principal component analysis (rPCA) based method. It uses a panel of normal (PON) samples to learn the landscape of both biological and technical noise in read depth data. Dryclean then uses this landscape to significantly reduce noise and artifacts in the signal for tumor samples. The input to the algorithm is a GenomicRanges object containing read depth. +For reference, check [info](https://github.com/mskilab-org/dryclean) + +## CBS +Segmentation is done by circular binary segmentation (CBS) algorithm after getting tumor/normal ratios of corrected read counts. We use a custom module script, check [here](../bin/cbsFH.R) + +## HetPileups +Pileup mutational calls are done using a custom module script called HetPileups.
We use a custom module script, check [here](../bin/Pileups.R) + +## ASCAT + ASCAT(allele-specific copy number analysis of tumors) is used to accurately dissect the allele-specific copy number of solid tumors, simultaneously estimating and adjusting for both tumor ploidy and nonaberrant cell admixture. We use ASCAT ploidy to supply for JaBbA. For more info regarding ASCAT, check [here](https://github.com/VanLoo-lab/ascat) + +## JaBbA +JaBbA builds a genome graph based on junctions and read depth from whole genome sequencing, inferring optimal copy numbers for both vertices (DNA segments) and edges (bonds between segments). It can be used for discovering various patterns of structural variations. For more info regarding JaBbA, check [here](https://github.com/mskilab-org/JaBbA) + +### FastQC
Output files -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. +- `fastqc/` + - `*_fastqc.html`: FastQC report containing quality metrics. + - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). + +![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) + +![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) + +![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . 
### Pipeline information diff --git a/docs/usage.md b/docs/usage.md index 79d6ba2..0e1fb78 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,68 +1,28 @@ -# nf-core/heisenbio: Usage +# mskilab-org/nf-jabba: Usage -## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/heisenbio/usage](https://nf-co.re/heisenbio/usage) +# Introduction: -> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ +**mskilab-org/nf-JaBbA** is a new state-of-art bioinformatics pipeline from [`mskilab-org`](https://www.mskilab.org/) that is intended to run [`JaBbA`](https://github.com/mskilab-org/JaBbA/tree/master), an MIP based joint inference of copy number and rearrangement state in cancer whole genome sequence data. It runs all the pre-requisite modules necessary to run JaBbA and as followed in `mskilab-org`. This pipeline is built to handle only tumor-normal pairs as input (as of now) and is designed and tested to run on Human samples. -## Introduction +We drew our inspiration and ideas from [`nf-core/Sarek`](https://github.com/nf-core/sarek), a workflow designed to detect variants on whole genome or targeted sequencing data. **`nf-jabba`** is built using [`Nextflow`](https://www.nextflow.io/) and is implemented using `Nextflow DSL2`. All the modules uses [`Docker`](https://www.docker.com/) and [`Singularity`](https://sylabs.io/docs/) containers which makes the pipeline easily reproducible and maintain its dependencies. Some of the modules/processes are used from [`nf-core/modules`](https://github.com/nf-core/modules) that are available for the Nextflow Community. - +This pipeline has been designed to start from scratch using **FASTQ** files or start directly from **BAM** files as input and should be supplied in a **CSV** file (*please refer to the documentation below for the input format of the .csv file*). 
We incorporated a modified version of the `Alignment` step of `nf-JaBbA` pipeline from `nf-core/Sarek`, many thanks to the Sarek community. -## Samplesheet input +# Setting up a run: -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +To run the pipeline from the beginning, first create an `--input` `samplesheet.csv` file with your file paths. A typical input should look like this: -```bash ---input '[path to samplesheet file]' -``` - -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: - -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +```csv +patient,sex,status,sample,lane,fastq_1,fastq_2 +TCXX49,XX,0,TCXX49_N,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.gz ``` - -### Full samplesheet - -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. 
- -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, -``` - -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | - -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. - -## Running the pipeline - -The typical command for running the pipeline is as follows: +Each row represents a pair of fastq files (paired end) for each Sample. +A typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/heisenbio --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run mskilab-org/nf-jabba --input ./samplesheet.csv --outdir ./results --genome GATK.GRCh37 --tools -profile singularity ``` - -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. 
- -Note that the pipeline will create the following files in your working directory: +This will launch the pipeline and run all the processes for the tools specified in `--tools`, through to `JaBbA`. It will create the following files in the working directory from which the command is run: ```bash work # Directory containing the nextflow working files @@ -80,7 +40,7 @@ Pipeline settings can be provided in a `yaml` or `json` file via `-params-file < The above pipeline run specified with a params file in yaml format: ```bash -nextflow run nf-core/heisenbio -profile docker -params-file params.yaml +nextflow run mskilab-org/nf-jabba -profile singularity -params-file params.yaml ``` with `params.yaml` containing: @@ -88,31 +48,170 @@ with `params.yaml` containing: ```yaml input: './samplesheet.csv' outdir: './results/' -genome: 'GRCh37' +genome: 'GATK.GRCh37' +tools: 'svaba,hetpileups,...,jabba' <...> ``` You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). -### Updating the pipeline +## Samplesheet input configurations (along with `--step`) -When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: +You need to create a samplesheet with information regarding the samples that you want to run the pipeline on. Specify the path to your **samplesheet** with the `--input` flag. Make sure the input file is a *comma-separated* file with a header row containing the columns discussed below.
*It is highly recommended to provide the **absolute path** for inputs inside the samplesheet rather than relative paths.* + +To mention a sample as paired tumor-normal, it has to be specified with the same `patient` ID, a different `sample`, and their respective `status`. For instance, a `tumor` sample should have **1** in its `status` field, and a normal sample should have **0**. If there are multiple `sample` IDs, `nf-jabba` will consider them as separate samples and output the results in separate folders based on `patient`. Rest assured, all the runs will be kept separate based on `patient`, so there is no need to be concerned about the outputs getting mixed. ```bash -nextflow pull nf-core/heisenbio +--input '[path to samplesheet file]' ``` -### Reproducibility +A typical samplesheet should be populated with the columns shown below: + +| Column Name | Description | +|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| +| patient | Patient or Sample ID. This should differentiate each patient/sample. *Note*: Each patient can have multiple sample names. | +| sample | Sample ID for each Patient. Should differentiate between tumor and normal. Sample IDs should be unique to Patient IDs | +| lane | If starting with FASTQ files, if there are multiple lanes for each sample for each patient, mention lane name. **Required** for `--step alignment`. | +| sex | If known, please provide the sex for the patient. For instance if **Male** type XY, else if **Female** type XX, else others and unknown should be NA. | +| status | This should tell if your sample is **tumor** or **normal**. For **normal**, write 0, and for **tumor**, write 1. | +| fastq_1 | Full Path to FASTQ file read 1. The extension should be `.fastq.gz` or `.fq.gz`. **Required** for `--step alignment`. | +| fastq_2 | Full Path to FASTQ file read 2. The extension should be `.fastq.gz` or `.fq.gz`.
**Required** for `--step alignment`. | +| bam | Full Path to BAM file. The extension should be `.bam`. **Required** for `--step sv_calling`. | +| bai | Full Path to BAM index file. The extension should be `.bam.bai`. **Required** for `--step sv_calling`. | +| cram | Full Path to CRAM file. The extension should be `.cram`. **Required** for `--step sv_calling` if file is of type `CRAM`. | +| crai | Full Path to CRAM index file. The extension should be `.cram.crai`. **Required** for `--step sv_calling` if file is of type `CRAM`. | +| table | Full path to Recalibration table file. **Required** for `--step recalibrate`. | +| vcf | Full path to VCF file. **Required** for `--step jabba`. | +| hets | Full path to HetPileups .txt file. **Required** for `--step jabba`. | + + +There are multiple `--step` values for `nf-jabba`. The main idea behind this is to keep each tool separate, so that you can run only the tool you need, rather than the whole pipeline, when you already have its required inputs. There are 2 primary `--step` values from which the pipeline can run all the way through to `JaBbA`, provided all the required tools are listed as comma-separated names in `--tools`. Full input `csv` files for these 2 steps are shown below: +- **`--step alignment`** -It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since.
+``` +patient,sex,status,sample,lane,fastq_1,fastq_2 +TCXX49,XX,0,TCXX49_N,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.fq.gz +TCXX49,XX,0,TCXX49_N,lane_2,/path/to/fastq_1.fq.gz,/path/to/fastq_2.fq.gz +TCXX49,XX,1,TCXX49_T,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.fq.gz +TCXX49,XX,1,TCXX49_T,lane_2,/path/to/fastq_1.fq.gz,/path/to/fastq_2.fq.gz +TCXX52,NA,0,TCXX52_N,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.fq.gz +TCXX52,NA,1,TCXX52_T,lane_1,/path/to/fastq_1.fq.gz,/path/to/fastq_2.fq.gz +``` +- **`--step sv_calling`** + +``` +patient,sex,status,sample,bam,bai +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +``` +> **Note** +> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers +> and pass the *cram* and *crai* paths there + + +There are also many secondary `--step` values for this pipeline that are designed to run only a specific tool from `--tools`, but are not sufficient on their own to run **`JaBbA`**. You can also run only **`JaBbA`** if you have all the required inputs available and provide them in the `--input` `csv` file. Below, we list the secondary steps and the input `csv` file required for each step.
+ +- **`--step markduplicates`** + +``` +patient,sex,status,sample,bam,bai +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +``` +> **Note** +> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers +> and pass the *cram* and *crai* paths there + + +- **`--step prepare_recalibration`** + +``` +patient,sex,status,sample,bam,bai +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +``` +> **Note** +> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers +> and pass the *cram* and *crai* paths there -First, go to the [nf-core/heisenbio releases page](https://github.com/nf-core/heisenbio/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. -This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. +- **`--step recalibrate`** -To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. 
+``` +patient,sex,status,sample,bam,bai,table +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX49_N.table +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX49_T.table +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX52_N.table +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai,TCXX52_T.table +``` -> 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +- **`--step fragcounter`** + +``` +patient,sex,status,sample,bam,bai +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +``` +> **Note** +> If you are using cram files, just replace *bam* and *bai* headers with *cram* and *crai* headers +> and pass the *cram* and *crai* paths there + + +- **`--step dryclean`** (**Note**: you should also mention `--tools dryclean` to use Dryclean. This step also has `CBS`, if you want to perform both Dryclean and CBS, use `--tools dryclean,cbs`) + +``` +patient,sex,status,sample,cov +TCXX49,XX,0,TCXX49_N,/path/to/coverage.rds +TCXX49,XX,1,TCXX49_T,/path/to/coverage.rds +TCXX52,NA,0,TCXX52_N,/path/to/coverage.rds +TCXX52,NA,1,TCXX52_T,/path/to/coverage.rds +``` + +- **`--step hetpileups`** (**Note**: you should also mention `--tools hetpileups` to use HetPileups.) 
+ +``` +patient,sex,status,sample,bam,bai +TCXX49,XX,0,TCXX49_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX49,XX,1,TCXX49_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,0,TCXX52_N,/path/to/alignment.bam,/path/to/alignment.bam.bai +TCXX52,NA,1,TCXX52_T,/path/to/alignment.bam,/path/to/alignment.bam.bai +``` +> **Note** +> HetPileups does not support *`cram`* files; you must use *`bam`* files for this step. + + +- **`--step ascat`** (**Note**: you should also mention `--tools ascat` to use ASCAT.) + +``` +patient,sex,status,sample,cov,hets +TCXX49,XX,1,TCXX49_T,/path/to/coverage.rds,/path/to/hetpileups/sites.txt +TCXX52,NA,1,TCXX52_T,/path/to/coverage.rds,/path/to/hetpileups/sites.txt +``` + + +- **`--step jabba`** (**Note**: you should also mention `--tools jabba` to use JaBbA.) + +``` +patient,sex,status,sample,cov,vcf +TCXX49,XX,1,TCXX49_T,/path/to/tumor/coverage.rds,/path/to/sv_caller/tumor/somatic.vcf +TCXX52,NA,1,TCXX52_T,/path/to/tumor/coverage.rds,/path/to/sv_caller/tumor/somatic.vcf +``` + +### Updating the pipeline + +When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: + +```bash +nextflow pull mskilab-org/nf-jabba +``` ## Core Nextflow arguments @@ -161,6 +260,19 @@ You can also supply a run name to resume a specific run: `-resume [run-name]`. U Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. +#### `-bg` +Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished.
+ +The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file. + +Alternatively, you can use `screen` / `tmux` or similar tool to create a detached session which you can log back into at a later time. +Some HPC setups also allow you to run nextflow within a cluster job submitted by your job scheduler (from where it submits more jobs). + + +## Debugging any step/process: + +To debug any step or process that failed, please check your current `execution_trace*.txt` file inside the `/pipeline_info/` folder and gather the `hash` number for that process. Then go inside the `work` folder and paste that `hash` number to locate the working directory for that process. There should be multiple `.command.*` files inside that folder which correspond to your run. This includes log, sh, trace, error files. One good thing is you can run `.command.sh` script locally to check where it is exactly breaking and replicate the issue (though you might need to edit the command a bit to run it successfully locally). + ## Custom configuration ### Resource requests @@ -189,23 +301,6 @@ See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). -## Azure Resource Requests - -To be used with the `azurebatch` profile by specifying the `-profile azurebatch`. -We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by default but these options can be changed if required. - -Note that the choice of VM size depends on your quota and the overall workload during the analysis. -For a thorough list, please refer the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes).
- -## Running in the background - -Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. - -The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file. - -Alternatively, you can use `screen` / `tmux` or similar tool to create a detached session which you can log back into at a later time. -Some HPC setups also allow you to run nextflow within a cluster job submitted your job scheduler (from where it submits more jobs). - ## Nextflow memory requirements In some cases, the Nextflow Java virtual machines can start to request a large amount of memory.