diff --git a/conf/test.config b/conf/test.config
index 4e457585..f40fb04b 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -48,6 +48,10 @@ params {
     kraken2_save_reads               = true
     centrifuge_save_reads            = true
     run_profile_standardisation     = true
+
+    // Generate downstream samplesheets
+    generate_downstream_samplesheets = true
+    generate_pipeline_samplesheets   = "differentialabundance,mag"
 }
 
 process {
diff --git a/conf/test_nothing.config b/conf/test_nothing.config
index e8b87bc7..7619f73c 100644
--- a/conf/test_nothing.config
+++ b/conf/test_nothing.config
@@ -41,6 +41,10 @@ params {
     run_motus                        = false
     run_kmcp                         = false
     run_ganon                        = false
+
+    // Generate downstream samplesheets
+    generate_downstream_samplesheets = false
+    generate_pipeline_samplesheets   = "differentialabundance,mag"
 }
 
 process {
diff --git a/docs/output.md b/docs/output.md
index 7cf96395..20eddac6 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -42,6 +42,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
+The pipeline can also generate downstream pipeline input samplesheets.
+These are stored in `<outdir>/downstream_samplesheets`.
+
 ![](images/taxprofiler_tube.png)
 
 ### untar
@@ -130,7 +133,7 @@ You can change the default value for low complexity filtering by using the argum
 
 By default nf-core/taxprofiler will only provide the `.settings` file if AdapterRemoval is selected.
 
-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`.
 
 :::warning
 The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as complexity filtering, host removal, run merging etc..
@@ -174,7 +177,7 @@ The `.npo` files can be used for re-generating and customising the plots using t
 
 The output logs are saved in the output folder and are part of MultiQC report.You do not normally need to check these manually.
 
-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`.
 
 :::warning
 We do **not** recommend using Porechop if you are already trimming the adapters with ONT's basecaller Guppy.
@@ -195,7 +198,7 @@ We do **not** recommend using Porechop if you are already trimming the adapters
 
 The output logs are saved in the output folder and are part of MultiQC report.You do not normally need to check these manually.
 
-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`.
 
 ### BBDuk
 
@@ -212,7 +215,7 @@ It is used in nf-core/taxprofiler for complexity filtering using different algor
 
-By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`.
 
 :::warning
 The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc..
@@ -233,7 +236,7 @@ It is used in nf-core/taxprofiler for complexity filtering using different algor
 
-By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`.
 
 :::warning
 The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc..
@@ -252,7 +255,7 @@ The resulting `.fastq` files may _not_ always be the 'final' reads that go into
 
-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`.
 
 :::warning
 We do _not_ recommend using Filtlong if you are performing filtering of low quality reads with ONT's basecaller Guppy.
@@ -271,7 +274,7 @@ We do _not_ recommend using Filtlong if you are performing filtering of low qual
 
-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`.
 
 ### Bowtie2
 
@@ -292,7 +295,7 @@ It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/
 
-By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`.
 
 :::info
 Unmapped reads in FASTQ are only found in this directory for short-reads, for long-reads see [`samtools/fastq/`](#samtools-fastq).
 :::
@@ -345,7 +348,7 @@ Unlike Bowtie2, minimap2 does not produce an unmapped FASTQ file by itself. See
 
-This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`.
 
 :::info
 For short-read unmapped reads, see [bowtie2](#bowtie2).
 :::
@@ -354,7 +357,7 @@
 ### Analysis Ready Reads
 
 :::info
-This optional results directory will only be present in the pipeline results when supplying `--save_analysis_ready_reads`.
+This optional results directory will only be present in the pipeline results when supplying `--save_analysis_ready_fastqs`.
 :::
@@ -401,7 +404,7 @@ This is the last possible preprocessing step, so if you have multiple runs or li
 
 Note that you will only find samples that went through the run merging step in this directory. For samples that had a single run or library will not go through this step of the pipeline and thus will not be present in this directory.
 
-This directory and its FASTQ files will only be present if you supply `--save_runmerged_reads`.Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+This directory and its FASTQ files will only be present if you supply `--save_runmerged_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`.
 
 ### Bracken
 
@@ -744,3 +747,24 @@ For example, DIAMOND output does not have a dedicated section in the MultiQC HTM
 [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
+
+### Downstream samplesheets
+
+The pipeline can also generate input files for the following downstream pipelines:
+
+- [nf-core/mag](https://nf-co.re/mag)
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `downstream_samplesheets/`
+  - `mag-{pe,se}.csv`: input sheets for paired-end and single-end reads, containing paths to the preprocessed short-read FASTQs (corresponding to what is saved with `--save_analysis_ready_fastqs`) that can be used to skip the read preprocessing steps in nf-core/mag.
+    - Note: if you supplied `--shortread_qc_mergepairs`, all files will be listed in `mag-se.csv` as single-end and no `mag-pe.csv` will be generated.
+    - Note: the nf-core/mag mandatory `group` column is filled with a dummy ID (`0`); you may wish to change this depending on your nf-core/mag settings.
+
+</details>
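+
+A minimal invocation that would write an nf-core/mag samplesheet alongside the analysis-ready reads could look something like the following (the input and database sheet names are placeholders):
+
+```bash
+nextflow run nf-core/taxprofiler \
+    -profile docker \
+    --input samplesheet.csv \
+    --databases databases.csv \
+    --outdir ./results \
+    --perform_shortread_qc \
+    --run_kraken2 \
+    --save_analysis_ready_fastqs \
+    --generate_downstream_samplesheets \
+    --generate_pipeline_samplesheets mag
+```
+
+With run merging enabled, `mag-pe.csv` contains the columns `sample`, `group`, `short_reads_1`, `short_reads_2` and `long_reads` (otherwise an additional `run` column is included); an illustrative row might look like:
+
+```csv
+sample,group,short_reads_1,short_reads_2,long_reads
+SAMPLE1,0,/path/to/results/analysis_ready_fastqs/SAMPLE1_1.fastq.gz,/path/to/results/analysis_ready_fastqs/SAMPLE1_2.fastq.gz,
+```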
+
+:::warning
+Any generated downstream samplesheets are provided as 'best effort' and are not guaranteed to work straight out of the box!
+They may not be complete (e.g. some columns may need to be manually filled in).
+:::
diff --git a/nextflow.config b/nextflow.config
index e8dd7723..0f96ad18 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -196,6 +196,10 @@ params {
     taxpasta_add_ranklineage             = false
     taxpasta_ignore_errors               = false
     standardisation_motus_generatebiom   = false
+
+    // Generate downstream samplesheets
+    generate_downstream_samplesheets     = false
+    generate_pipeline_samplesheets       = null
 }
 
 // Load base.config by default for all pipelines
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 3ada1a56..7012ec1f 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -1,5 +1,5 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema",
+    "$schema": "https://json-schema.org/draft-07/schema",
     "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/nextflow_schema.json",
     "title": "nf-core/taxprofiler pipeline parameters",
     "description": "Taxonomic classification and profiling of shotgun short- and long-read metagenomic data",
@@ -712,6 +712,24 @@
             },
             "fa_icon": "fas fa-chart-line"
         },
+        "generate_samplesheet_options": {
+            "title": "Downstream pipeline samplesheet generation options",
+            "type": "object",
+            "fa_icon": "fas fa-university",
+            "description": "Options for generating input samplesheets for complementary downstream pipelines.",
+            "properties": {
+                "generate_pipeline_samplesheets": {
+                    "type": "string",
+                    "description": "Specify a comma-separated string (in quotes) of the pipeline(s) to generate samplesheets for.",
+                    "pattern": "^(differentialabundance|mag)(?:,(differentialabundance|mag)){0,1}"
+                },
+                "generate_downstream_samplesheets": {
+                    "type": "boolean",
+                    "description": "Turn on generation of samplesheets for downstream pipelines.",
+                    "fa_icon": "fas fa-toggle-on"
+                }
+            }
+        },
         "institutional_config_options": {
             "title": "Institutional config options",
             "type": "object",
@@ -945,6 +963,9 @@
         }
     },
     "allOf": [
+        {
+            "$ref": "#/definitions/generate_samplesheet_options"
+        },
         {
             "$ref": "#/definitions/input_output_options"
         },
diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf
new file mode 100644
index 00000000..c2f70f8d
--- /dev/null
+++ b/subworkflows/local/generate_downstream_samplesheets/main.nf
@@ -0,0 +1,79 @@
+//
+// Subworkflow with functionality specific to the nf-core/mag pipeline
+//
+
+workflow SAMPLESHEET_MAG {
+    take:
+    ch_processed_reads
+
+    main:
+    format = 'csv'
+
+    ch_list_for_samplesheet = ch_processed_reads
+        .filter { meta, reads ->
+            if (meta.instrument_platform != 'ILLUMINA') {
+                log.warn("[nf-core/taxprofiler] WARNING: Only Illumina short-reads are supported by the nf-core/mag pipeline. The following sample will not be present in `mag-*.csv`: ${meta.id}")
+            }
+            meta.instrument_platform == 'ILLUMINA'
+        }
+        .map { meta, reads ->
+            def sample = meta.id
+            def run = params.perform_runmerging ? '' : meta.run_accession
+            def group = "0"
+            // this should be optional
+            def short_reads_1 = meta.is_fasta ? "" : file(params.outdir).toString() + '/analysis_ready_fastqs/' + reads[0].getName()
+            def short_reads_2 = meta.is_fasta || meta.single_end ? "" : file(params.outdir).toString() + '/analysis_ready_fastqs/' + reads[1].getName()
+            def long_reads = meta.is_fasta ? file(params.outdir).toString() + '/analysis_ready_fastqs/' + reads[0].getName() : ""
+
+            if (params.perform_runmerging) {
+                [sample: sample, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads]
+            }
+            else {
+                [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads]
+            }
+        }
+        .tap { ch_list_for_samplesheet_all }
+        .filter { it.short_reads_1 != "" }
+        .branch {
+            se: it.short_reads_2 == ""
+            pe: it.short_reads_2 != ""
+            unknown: true
+        }
+
+    // Warn that standalone long reads are not yet supported by nf-core/mag
+    ch_list_for_samplesheet_all
+        .filter { it.long_reads != "" && it.short_reads_1 == "" }
+        .collect { log.warn("[nf-core/taxprofiler] WARNING: Standalone long reads are not yet supported by the nf-core/mag pipeline and will not be present in `mag-*.csv`. Sample: ${it.sample}") }
+
+    channelToSamplesheet(ch_list_for_samplesheet.pe, "${params.outdir}/downstream_samplesheets/mag-pe", format)
+    channelToSamplesheet(ch_list_for_samplesheet.se, "${params.outdir}/downstream_samplesheets/mag-se", format)
+}
+
+workflow GENERATE_DOWNSTREAM_SAMPLESHEETS {
+    take:
+    ch_processed_reads
+
+    main:
+    def downstreampipeline_names = params.generate_pipeline_samplesheets.split(",")
+
+    if (downstreampipeline_names.contains('mag') && params.save_analysis_ready_fastqs) {
+        SAMPLESHEET_MAG(ch_processed_reads)
+    }
+}
+
+// Constructs the header string and then the strings of each row, and writes them to a samplesheet file
+def channelToSamplesheet(ch_list_for_samplesheet, path, format) {
+    def format_sep = [csv: ",", tsv: "\t", txt: "\t"][format]
+
+    def ch_header = ch_list_for_samplesheet
+
+    ch_header
+        .first()
+        .map { it.keySet().join(format_sep) }
+        .concat(ch_list_for_samplesheet.map { it.values().join(format_sep) })
+        .collectFile(
+            name: "${path}.${format}",
+            newLine: true,
+            sort: false
+        )
+}
diff --git a/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf b/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf
index 9b4f6df5..e93e9942 100644
--- a/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf
@@ -149,7 +149,17 @@ workflow PIPELINE_COMPLETION {
 //
 def validateInputParameters() {
     genomeExistsError()
-}//
+
+    if (params.generate_downstream_samplesheets && !params.generate_pipeline_samplesheets) {
+        error('[nf-core/taxprofiler] ERROR: If supplying `--generate_downstream_samplesheets`, you must also specify which pipeline(s) to generate samplesheets for with `--generate_pipeline_samplesheets`! Check input.')
+    }
+
+    if ( params.generate_downstream_samplesheets && params.generate_pipeline_samplesheets.split(",").contains('mag') && !params.save_analysis_ready_fastqs ) {
+        error("[nf-core/taxprofiler] ERROR: To generate downstream samplesheets for nf-core/mag, you must also specify `--save_analysis_ready_fastqs`")
+    }
+}
+
+//
 // Validate channels from input samplesheet
 //
 def validateInputSamplesheet(input) {
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 93eb55dd..619d3916 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -72,6 +72,7 @@ include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_
 include { PROFILING                     } from '../subworkflows/local/profiling'
 include { VISUALIZATION_KRONA           } from '../subworkflows/local/visualization_krona'
 include { STANDARDISATION_PROFILES      } from '../subworkflows/local/standardisation_profiles'
+include { GENERATE_DOWNSTREAM_SAMPLESHEETS } from '../subworkflows/local/generate_downstream_samplesheets/main.nf'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -328,6 +329,14 @@
         MODULE: MultiQC
     */
 
+
+    //
+    // Samplesheet generation
+    //
+    if ( params.generate_downstream_samplesheets ) {
+        GENERATE_DOWNSTREAM_SAMPLESHEETS ( ch_reads_runmerged )
+    }
+
     //
     // Collate and save software versions
     //