From 6b04d7efbafebd95cc46b46d6e35ef2cee5dc1ba Mon Sep 17 00:00:00 2001 From: Zhenfeng Liu Date: Thu, 31 Aug 2023 16:55:01 -0700 Subject: [PATCH 1/2] add downsample step --- main.nf | 13 ++++++++----- nextflow.config | 5 ++++- nextflow_schema.json | 6 ++++++ processes/downsample.nf | 29 +++++++++++++++++++++++++++++ processes/miqscore16s.nf | 5 ++--- 5 files changed, 49 insertions(+), 9 deletions(-) create mode 100644 processes/downsample.nf diff --git a/main.nf b/main.nf index 033d231..461b7ae 100644 --- a/main.nf +++ b/main.nf @@ -44,6 +44,9 @@ Channel.from( summary.collect{ [it.key, it.value] } ) * PROCESS DEFINITION */ include { check_design } from "./processes/check_design" +include { downsample } from "./processes/downsample" addParams( + downsample_num: params.downsample_num +) include { miqscore16s } from "./processes/miqscore16s" addParams( publish_dir: "${outdir}/miqscore16s", forward_primer_length: params.forward_primer_length, @@ -62,13 +65,13 @@ workflow { check_design.out.checked_design .splitCsv( header: true ) .first() // Only support 1 pair of FASTQ for now - .multiMap { - read_1 : file(it['read_1']) - read_2 : file(it['read_2']) - sample_name: it['sample'] + .map { + [ it['sample'], [ file(it['read_1']), file(it['read_2']) ] ] } .set { input } - miqscore16s(input.sample_name, input.read_1, input.read_2) + downsample(input) + miqscore_input = params.downsample_num ? downsample.out.reads : input + miqscore16s(miqscore_input) miqscore16s.out.report.map { "${outdir}/miqscore16s/" + it.getName() } .collectFile(name: "${outdir}/download_data/file_locations.txt", newLine: true) .set { output_locations } diff --git a/nextflow.config b/nextflow.config index b9b9876..15908fc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,7 +19,10 @@ params { max_memory = 60.GB max_cpus = 8 max_time = 2.h - + + // Downsample + downsample_num = 100000 + // Other Defaults outdir = './results' name = false diff --git a/nextflow_schema.json b/nextflow_schema.json index bf35978..ce58a7f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -86,6 +86,12 @@ "type": "object", "description": "Less common options for the pipeline, typically set in a config file", "properties": { + "downsample_num": { + "type": "integer", + "description": "Number of read pairs to downsample to. If input has fewer reads than this, downsampling will not happen. Set to 0 to turn this off.", + "minimum": 0, + "default": 100000 + }, "name": { "type": "string", "description": "Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic", diff --git a/processes/downsample.nf b/processes/downsample.nf new file mode 100644 index 0000000..f85a7a8 --- /dev/null +++ b/processes/downsample.nf @@ -0,0 +1,29 @@ +// Downsample FASTQ files + +process downsample { + container 'quay.io/biocontainers/seqtk:1.4--he4a0461_1' + + input: + tuple val(name), path(read_1), path(read_2) + + when: + params.downsample_num + + output: + tuple val(name), path("${name}_downsample_R1.fastq"), path("${name}_downsample_R2.fastq"), emit: reads + + script: + """ + readnum=\$((\$(zcat $reads | wc -l) / 4)) + if ((\$readnum > $params.downsample_num)) + then + seqtk sample -s1000 $read_1 $params.downsample_num > ${name}_downsample_R1.fastq + gzip ${name}_downsample_R1.fastq + seqtk sample -s1000 $read_2 $params.downsample_num > ${name}_downsample_R2.fastq + gzip ${name}_downsample_R2.fastq + else + [ ! -f ${name}_downsample_R1.fastq ] && ln -s $read_1 ${name}_downsample_R1.fastq + [ ! -f ${name}_downsample_R2.fastq ] && ln -s $read_2 ${name}_downsample_R2.fastq + fi + """ +} \ No newline at end of file diff --git a/processes/miqscore16s.nf b/processes/miqscore16s.nf index a74c7cd..806c153 100644 --- a/processes/miqscore16s.nf +++ b/processes/miqscore16s.nf @@ -9,9 +9,7 @@ process miqscore16s { publishDir "${params.publish_dir}", mode: 'copy' input: - env SAMPLENAME - path read_1 - path read_2 + tuple val(name), path(read_1), path(read_2) output: path '*.html', emit: report @@ -22,6 +20,7 @@ process miqscore16s { mv $read_1 /data/input/sequence/standard_submitted_R1.fastq mv $read_2 /data/input/sequence/standard_submitted_R2.fastq mkdir -p /data/output + export SAMPLENAME=${name} export FORWARDPRIMERLENGTH=${params.forward_primer_length} export REVERSEPRIMERLENGTH=${params.reverse_primer_length} export AMPLICONLENGTH=${params.amplicon_length} From cbf7ef4ad3cdb3658dd54832fde1d19244226aa9 Mon Sep 17 00:00:00 2001 From: Zhenfeng Liu Date: Fri, 1 Sep 2023 11:51:35 -0700 Subject: [PATCH 2/2] fix small bugs --- main.nf | 2 +- processes/downsample.nf | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 461b7ae..3b066e0 100644 --- a/main.nf +++ b/main.nf @@ -66,7 +66,7 @@ workflow { .splitCsv( header: true ) .first() // Only support 1 pair of FASTQ for now .map { - [ it['sample'], [ file(it['read_1']), file(it['read_2']) ] ] + [ it['sample'], file(it['read_1']), file(it['read_2']) ] } .set { input } downsample(input) diff --git a/processes/downsample.nf b/processes/downsample.nf index f85a7a8..ec7ef16 100644 --- a/processes/downsample.nf +++ b/processes/downsample.nf @@ -10,11 +10,11 @@ process downsample { params.downsample_num output: - tuple val(name), path("${name}_downsample_R1.fastq"), path("${name}_downsample_R2.fastq"), emit: reads + tuple val(name), path("${name}_downsample_R1.fastq.gz"), path("${name}_downsample_R2.fastq.gz"), emit: reads script: """ - readnum=\$((\$(zcat $reads | wc -l) / 4)) + readnum=\$((\$(zcat $read_1 | wc -l) / 4)) if ((\$readnum > $params.downsample_num)) then seqtk sample -s1000 $read_1 $params.downsample_num > ${name}_downsample_R1.fastq @@ -22,8 +22,8 @@ process downsample { seqtk sample -s1000 $read_2 $params.downsample_num > ${name}_downsample_R2.fastq gzip ${name}_downsample_R2.fastq else - [ ! -f ${name}_downsample_R1.fastq ] && ln -s $read_1 ${name}_downsample_R1.fastq - [ ! -f ${name}_downsample_R2.fastq ] && ln -s $read_2 ${name}_downsample_R2.fastq + [ ! -f ${name}_downsample_R1.fastq.gz ] && ln -s $read_1 ${name}_downsample_R1.fastq.gz + [ ! -f ${name}_downsample_R2.fastq.gz ] && ln -s $read_2 ${name}_downsample_R2.fastq.gz fi """ } \ No newline at end of file