diff --git a/.travis.yml b/.travis.yml
index 459e925cb..e165c7afd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,7 +13,7 @@ before_install:
   # Pull the docker image first so the test doesn't wait for this
   - docker pull nfcore/eager:dev
   # Fake the tag locally so that the pipeline runs properly
-  - docker tag nfcore/eager:dev nfcore/eager:2.0.5
+  - docker tag nfcore/eager:dev nfcore/eager:2.0.6

install:
  # Install Nextflow
@@ -40,6 +40,12 @@ script:
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --saveReference
   # Run the basic pipeline with single end data (pretending it's single end actually)
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --singleEnd --bwa_index results/reference_genome/bwa_index/bwa_index/
+  # Run the basic pipeline with paired end data without collapsing
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_collapse --saveReference
+  # Run the basic pipeline with paired end data without trimming
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_trim --saveReference
+  # Run the basic pipeline with paired end data without AdapterRemoval
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_adapterremoval --saveReference
   # Run the same pipeline testing optional step: fastp, complexity
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/bwa_index/
   # Test BAM Trimming
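For anyone wanting to reproduce the new CI checks outside Travis, the local equivalent from a clone of the repository would be roughly the following (a sketch assuming Docker and Nextflow are installed and you are in the repository root, where `.` replaces `${TRAVIS_BUILD_DIR}`):

```bash
# Local equivalents of the new CI checks for the skip flags
nextflow run . -profile test,docker --pairedEnd --skip_collapse --saveReference
nextflow run . -profile test,docker --pairedEnd --skip_trim --saveReference
nextflow run . -profile test,docker --pairedEnd --skip_adapterremoval --saveReference
```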
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 047f5b207..345f5ada3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,24 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

-## [Unpublished / Dev Branch]
+## [2.0.6] - 2019-03-05
+
+### `Added`
+
+* [#152](https://github.com/nf-core/eager/pull/152) - Clarified `--complexity_filter` flag to be specifically for poly-G trimming.
+* [#155](https://github.com/nf-core/eager/pull/155) - Added [Dedup log to output folders](https://github.com/nf-core/eager/issues/154)
+* [#159](https://github.com/nf-core/eager/pull/159) - Added the possibility to skip AdapterRemoval, read merging, or trimming, fixing [#64](https://github.com/nf-core/eager/issues/64) and [#137](https://github.com/nf-core/eager/issues/137) - thanks to @maxibor, @jfy133

+### `Fixed`
+
+* [#151](https://github.com/nf-core/eager/pull/151) - Fixed [post-deduplication step errors](https://github.com/nf-core/eager/issues/128)
+* [#147](https://github.com/nf-core/eager/pull/147) - Fixed samtools index for [large references](https://github.com/nf-core/eager/issues/146)
+* [#145](https://github.com/nf-core/eager/pull/145) - Added Picard memory handling [fix](https://github.com/nf-core/eager/issues/144)
+
+### `Dependencies`
+* Picard Tools 2.18.23 -> 2.18.27
+* GATK 4.0.12.0 -> 4.1.0.0
+* FastP 0.19.6 -> 0.19.7

 ## [2.0.5] - 2019-01-28

@@ -18,8 +35,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### `Dependencies`
 * Picard Tools 2.18.21 -> 2.18.23
-* R-Markdown 1.10 -> 1.11
-* FastP 0.19.5 -> 0.19.6
+* R-Markdown 1.10 -> 1.11
+* FastP 0.19.5 -> 0.19.6

 ## [2.0.4] - 2019-01-09

diff --git a/Dockerfile b/Dockerfile
index 396593bd5..5a3050fff 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,4 +3,4 @@ FROM nfcore/base
 LABEL description="Docker image containing all requirements for nf-core/eager pipeline"
 COPY environment.yml /
 RUN conda env create -f /environment.yml && conda clean -a
-ENV PATH /opt/conda/envs/nf-core-eager-2.0.5/bin:$PATH
+ENV PATH /opt/conda/envs/nf-core-eager-2.0.6/bin:$PATH

diff --git a/README.md b/README.md
index d602fd450..df99db945 100644
--- a/README.md
+++ b/README.md
@@ -45,20 +45,28 @@ Additional functionality contained by the pipeline currently includes:
 ## Quick Start

 1. Install [`nextflow`](docs/installation.md)
+
 2. Install one of [`docker`](https://docs.docker.com/engine/installation/), [`singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`conda`](https://conda.io/miniconda.html)
+
 3. Download the EAGER pipeline

```bash
nextflow pull nf-core/eager
```

-4. Set up your job with default parameters
+4. Test the pipeline using the provided test data

```bash
-nextflow run nf-core -profile <docker/singularity/conda> --reads'*_R{1,2}.fastq.gz' --fasta '<your_reference>.fasta'
+nextflow run nf-core/eager -profile <docker/singularity/conda>,test --pairedEnd
```

-5. See the overview of the run with MultiQC under `<OUTPUT_DIR>/MultiQC/multiqc_report.html`
+5. Start running your own ancient DNA analysis!
+
+```bash
+nextflow run nf-core/eager -profile <docker/singularity/conda> --reads '*_R{1,2}.fastq.gz' --fasta '<your_reference>.fasta'
+```
+
+NB. You can see an overview of the run in the MultiQC report located at `<OUTPUT_DIR>/MultiQC/multiqc_report.html`

 Modifications to the default pipeline are easily made using various options as described in the documentation.

@@ -84,6 +92,19 @@ James Fellows Yates, Raphael Eisenhofer and Judith Neukamm.

 If you want to contribute, please open an issue and ask to be added to the project - happy to do so and everyone is welcome to contribute here!

+## Contributors
+
+- [James A. Fellows-Yates](https://github.com/jfy133)
+- [Stephen Clayton](https://github.com/sc13-bioinf)
+- [Maxime Borry](https://github.com/maxibor)
+- [Judith Neukamm](https://github.com/JudithNeukamm)
+- [Raphael Eisenhofer](https://github.com/EisenRa)
+- [Maxime Garcia](https://github.com/MaxUlysse)
+- [Luc Venturini](https://github.com/lucventurini)
+- [Hester van Schalkwyk](https://github.com/hesterjvs)
+
+If you've contributed and you're missing in here, please let me know and I'll add you in.
+
 ## Tool References

 * **EAGER v1, CircularMapper, DeDup** Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. [https://doi.org/10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z) Download: [https://github.com/apeltzer/EAGER-GUI](https://github.com/apeltzer/EAGER-GUI) and [https://github.com/apeltzer/EAGER-CLI](https://github.com/apeltzer/EAGER-CLI)

diff --git a/Singularity b/Singularity
index eb302a82d..43d148bb9 100644
--- a/Singularity
+++ b/Singularity
@@ -4,10 +4,10 @@ Bootstrap:docker
 %labels
     MAINTAINER Alexander Peltzer
     DESCRIPTION Container image containing all requirements for the nf-core/eager pipeline
-    VERSION 2.0.5
+    VERSION 2.0.6

 %environment
-    PATH=/opt/conda/envs/nf-core-eager-2.0.5/bin:$PATH
+    PATH=/opt/conda/envs/nf-core-eager-2.0.6/bin:$PATH
     export PATH

 %files

diff --git a/conf/base.config b/conf/base.config
index c65d74400..8c2868214 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -31,7 +31,10 @@ process {
   withName:convertBam {
         cpus = { check_max(8 * task.attempt, 'cpus') }
   }
-
+  withName:makeSeqDict {
+        memory = { check_max( 16.GB * task.attempt, 'memory' ) }
+  }
+
   withName:bwa {
         memory = { check_max( 16.GB * task.attempt, 'memory' ) }
         cpus = { check_max(8 * task.attempt, 'cpus') }

diff --git a/conf/multiqc_config.yaml b/conf/multiqc_config.yaml
index 8e561e902..1b6bdaa63 100644
--- a/conf/multiqc_config.yaml
+++ b/conf/multiqc_config.yaml
@@ -9,6 +9,7 @@ top_modules:
       - '*_fastqc.zip'
       path_filters_exclude:
       - '*.combined.prefixed_fastqc.zip'
+      - 'fastp'
  - 'adapterRemoval'
  - 'fastqc':
      name: 'FastQC (post-AdapterRemoval)'

diff --git a/docs/usage.md b/docs/usage.md
index 6f0838d7d..5a848760b 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -170,6 +170,10 @@ If you prefer, you can specify the full path to your reference genome when you r
 ```

 > If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note, that saving these for later has to be turned on using `--saveReference`. You may also specify the path to a gzipped (`*.gz` file extension) FastA as reference genome - this will be uncompressed by the pipeline automatically for you. Note that other file extensions such as `.fna`, `.fa` are also supported but will be renamed to `.fasta` automatically by the pipeline.

+### `--large_ref`
+
+This parameter is required for large reference genomes. If your reference genome is larger than 3.5GB, the `samtools index` calls in the pipeline need to generate `CSI` indices instead of `BAI` indices to compensate for the size of the reference genome. This parameter is not required for smaller references (including the human `hg19` or `grch37`/`grch38` references), but `>4GB` genomes have been shown to need `CSI` indices.
+
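For context on why the index type matters: `BAI` indices cannot address coordinates beyond 2^29-1 (~537 Mbp) within a single sequence, whereas `CSI` indices can. A minimal sketch of what the pipeline's index calls boil down to, with placeholder filenames:

```bash
# Default: BAI index (fails on very large reference sequences)
samtools index sample.sorted.bam       # writes sample.sorted.bam.bai

# With --large_ref the pipeline adds -c to request a CSI index instead
samtools index -c sample.sorted.bam    # writes sample.sorted.bam.csi
```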
 ### `--genome` (using iGenomes)

 The pipeline config files come bundled with paths to the illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource.

@@ -237,7 +241,7 @@ Use to set a top-limit for the default time requirement for each process.
 Should be a string in the format integer-unit. e.g. `--max_time '2.h'`. If not specified, will be taken from the configuration in the `-profile` flag.

 ### `--max_cpus`
-Use to set a top-limit for the default CPU requirement for each process.
+Use to set a top-limit for the default CPU requirement for each **process**. This is not the maximum number of CPUs that can be used for the whole pipeline, but the maximum number of CPUs each program can use within a single process submission.
+
+Do not set this higher than what your workstation or computing node can provide. If you're unsure, ask your local IT administrator for details on compute node capabilities!
 Should be a string in the format integer-unit. e.g. `--max_cpus 1`. If not specified, will be taken from the configuration in the `-profile` flag.

 ### `--email`

@@ -279,12 +283,16 @@ This part of the documentation contains a list of user-adjustable parameters in

 ## Step skipping parameters

-Some of the steps in the pipeline can be executed optionally. If you specify specific steps to be skipped, there won't be any output related to these modules.
+Some of the steps in the pipeline can be executed optionally. If you specify specific steps to be skipped, there won't be any output related to these modules.

 ### `--skip_preseq`

 Turns off the computation of library complexity estimation.

+### `--skip_adapterremoval`
+
+Turns off adaptor trimming and paired-end read merging. Equivalent to setting both `--skip_collapse` and `--skip_trim`.
+
 ### `--skip_damage_calculation`

 Turns off the DamageProfiler module to compute DNA damage profiles.

@@ -299,7 +307,7 @@ Turns off duplicate removal methods DeDup and MarkDuplicates respectively. No du

 ## Complexity Filtering Options

-### `--complexity_filter`
+### `--complexity_filter_poly_g`

 Performs a poly-G tail removal step at the beginning of the pipeline, if turned on. This can be useful for trimming poly-G tails from short fragments sequenced on two-colour Illumina chemistry such as the NextSeq (where no fluorescence is read as a G), which can inflate reported GC content values.

@@ -329,6 +337,24 @@ Defines the minimum read quality per base that is required for a base to be kept

 ### `--clip_min_adap_overlap` 1

 Sets the minimum overlap between two reads when read merging is performed. Default is set to `1` base overlap.

+### `--skip_collapse`
+
+Turns off the paired-end read merging.
+
+For example:
+
+```bash
+--pairedEnd --skip_collapse --reads '*.fastq'
+```
+
+### `--skip_trim`
+
+Turns off adaptor and quality trimming.
+
+For example:
+
+```bash
+--pairedEnd --skip_trim --reads '*.fastq'
+```
+
 ## Read Mapping Parameters

 ## BWA (default)
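To make the effect of these two flags concrete, here is a rough sketch of the AdapterRemoval invocations they toggle (filenames are placeholders and only the core flags are shown; the full flag set follows the pipeline defaults documented above):

```bash
# Default PE behaviour: adapter/quality trimming plus merging of overlapping mates
AdapterRemoval --file1 sample_R1.fastq.gz --file2 sample_R2.fastq.gz \
    --basename sample --gzip --trimns --trimqualities --collapse

# --skip_collapse: trimming still happens, but mates stay separate
# (downstream mapping then uses sample.pair1.truncated.gz / sample.pair2.truncated.gz)
AdapterRemoval --file1 sample_R1.fastq.gz --file2 sample_R2.fastq.gz \
    --basename sample --gzip --trimns --trimqualities

# --skip_trim: merging still happens, but no trimming flags are passed
AdapterRemoval --file1 sample_R1.fastq.gz --file2 sample_R2.fastq.gz \
    --basename sample --gzip --collapse
```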
diff --git a/environment.yml b/environment.yml
index 9a191fefe..c8c0338c9 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: nf-core-eager-2.0.5
+name: nf-core-eager-2.0.6
 channels:
   - defaults
   - bioconda
@@ -9,12 +9,12 @@ dependencies:
   - bioconda::adapterremoval=2.2.2
   - bioconda::adapterremovalfixprefix=0.0.4
   - bioconda::bwa=0.7.17
-  - bioconda::picard=2.18.23
+  - bioconda::picard=2.18.27
   - bioconda::samtools=1.9
   - bioconda::dedup=0.12.3
   - bioconda::angsd=0.923
   - bioconda::circularmapper=1.93.4
-  - bioconda::gatk4=4.0.12.0
+  - bioconda::gatk4=4.1.0.0
   - bioconda::qualimap=2.2.2b
   - bioconda::vcf2genome=0.91
   - bioconda::damageprofiler=0.4.4
@@ -25,6 +25,6 @@ dependencies:
   - conda-forge::pigz=2.3.4
   - bioconda::sequencetools=1.2.2
   - bioconda::preseq=2.0.3
-  - bioconda::fastp=0.19.6
+  - bioconda::fastp=0.19.7
   - bioconda::bamutil=1.0.14
 #Missing Schmutzi,snpAD

diff --git a/main.nf b/main.nf
index f25ad3e79..5b807c8ef 100644
--- a/main.nf
+++ b/main.nf
@@ -26,9 +26,9 @@ def helpMessage() {

    Mandatory arguments:
      --reads                       Path to input data (must be surrounded with quotes)
-      -profile                      Hardware config to use (e.g. standard, docker, singularity, conda, aws). Ask your system admin if unsure, or check documentatoin.
+      -profile                      Institution or personal hardware config to use (e.g. standard, docker, singularity, conda, aws). Ask your system admin if unsure, or check documentation.
      --singleEnd                   Specifies that the input is single end reads (required if not pairedEnd)
-      --pairedEnd                   Specifies that the input is paired end reads (required if not singleend)
+      --pairedEnd                   Specifies that the input is paired end reads (required if not singleEnd)
      --bam                         Specifies that the input is in BAM format
      --fasta                       Path to Fasta reference (required if not iGenome reference)
      --genome                      Name of iGenomes reference (required if not fasta reference)
@@ -36,22 +36,23 @@ def helpMessage() {

    Input Data Additional Options:
      --snpcapture                  Runs in SNPCapture mode (specify a BED file if you do this!)

-    References                      If not specified in the configuration file or you wish to overwrite any of the references.
-      --bwa_index                   Path to BWA index
+    References                      If not specified in the configuration file, or you wish to overwrite any of the references.
+      --bwa_index                   Path to directory containing BWA index files
      --bedfile                     Path to BED file for SNPCapture methods
-      --seq_dict                    Path to sequence dictionary file
-      --fasta_index                 Path to FastA index
+      --seq_dict                    Path to picard sequence dictionary file (typically ending in '.dict')
+      --fasta_index                 Path to samtools FASTA index (typically ending in '.fai')
      --saveReference               Saves reference genome indices for later reuse

    Skipping                        Skip any of the mentioned steps
+      --skip_adapterremoval
      --skip_preseq
      --skip_damage_calculation
      --skip_qualimap
      --skip_deduplication

    Complexity Filtering
-      --complexity_filtering        Run complexity filtering on FastQ files
-      --complexity_filter_poly_g_min  Specify poly-g min filter (default: 10) for filtering
+      --complexity_filter_poly_g    Run poly-G removal on FASTQ files
+      --complexity_filter_poly_g_min  Specify length of poly-g min for clipping to be performed (default: 10)

    Clipping / Merging
      --clip_forward_adaptor        Specify adapter sequence to be clipped off (forward)
@@ -59,25 +60,27 @@ def helpMessage() {
      --clip_readlength             Specify read minimum length to be kept for downstream analysis
      --clip_min_read_quality       Specify minimum base quality for not trimming off bases
      --min_adap_overlap            Specify minimum adapter overlap
+      --skip_collapse               Skip merging forward and reverse reads together. (Only for PE samples)
+      --skip_trim                   Skip adaptor and quality trimming

    BWA Mapping
-      --bwaalnn                     Specify the -n parameter for BWA aln
+      --bwaalnn                     Specify the -n parameter for BWA aln.
      --bwaalnk                     Specify the -k parameter for BWA aln
      --bwaalnl                     Specify the -l parameter for BWA aln

    CircularMapper
      --circularmapper              Turn on CircularMapper (CM)
-      --circularextension           Specify the number of bases to extend
+      --circularextension           Specify the number of bases to extend the reference by
      --circulartarget              Specify the target chromosome for CM
      --circularfilter              Specify to filter off-target reads

    BWA Mem Mapping
-      --bwamem                      Turn on BWA Mem instead of CM/BWA aln for mapping
+      --bwamem                      Turn on BWA Mem instead of BWA aln for mapping

    BAM Filtering
-      --bam_discard_unmapped        Discards unmapped reads in either FASTQ or BAM format, depending on choice.
-      --bam_unmapped_type           Defines whether to discard all unmapped reads, keep only bam and/or keep only fastq format (options: discard, bam, fastq, both).
      --bam_mapping_quality_threshold  Minimum mapping quality for reads filter, default 0.
+      --bam_discard_unmapped        Discards unmapped reads in either FASTQ or BAM format, depending on choice (see --bam_unmapped_type).
+      --bam_unmapped_type           Defines whether to discard all unmapped reads, keep only bam and/or keep only fastq format (options: discard, bam, fastq, both).

    DeDuplication
      --dedupper                    Deduplication method to use (options: dedup, markduplicates). Default: dedup
@@ -101,10 +104,6 @@ def helpMessage() {
      --bamutils_clip_left / --bamutils_clip_right  Specify the number of bases to clip off reads
      --bamutils_softclip           Use softclip instead of hard masking

-
-    For a full list and more information of available parameters, consider the documentation.
-
-
    Other options:
      --outdir                      The output directory where the results will be saved
      --email                       Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
      --max_memory                  Memory limit for each step of the pipeline. Should be in form e.g. --max_memory '8.GB'
      --max_time                    Time limit for each step of the pipeline. Should be in form e.g. --max_time '2.h'
      --max_cpus                    Maximum number of CPUs to use for each step of the pipeline. Should be in form e.g. --max_cpus 1

+    For a full list of available parameters and more information, consult the documentation (https://github.com/nf-core/eager/).
+
+
    """.stripIndent()
}
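Putting the new flags together, a hypothetical invocation (profile and all paths are placeholders) might look like:

```bash
nextflow run nf-core/eager -profile docker \
    --pairedEnd --reads '/data/*_R{1,2}.fastq.gz' \
    --fasta '/refs/genome.fasta' \
    --skip_collapse \
    --complexity_filter_poly_g \
    --large_ref
```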
/*
@@ -146,13 +148,14 @@
 params.email = false
 params.plaintext_email = false

 // Skipping parts of the pipeline for impatient users
+params.skip_adapterremoval = false
 params.skip_preseq = false
 params.skip_damage_calculation = false
 params.skip_qualimap = false
 params.skip_deduplication = false

 //Complexity filtering reads
-params.complexity_filter = false
+params.complexity_filter_poly_g = false
 params.complexity_filter_poly_g_min = 10

 //Read clipping and merging parameters
@@ -161,6 +164,8 @@ params.clip_reverse_adaptor = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA"
 params.clip_readlength = 30
 params.clip_min_read_quality = 20
 params.min_adap_overlap = 1
+params.skip_collapse = false
+params.skip_trim = false

 //Read mapping parameters (default = BWA aln default)
 params.bwaalnn = 0.04
@@ -241,12 +246,6 @@ if("${params.fasta}".endsWith(".gz")){
        .ifEmpty { exit 1, "No genome specified! Please specify one with --fasta"}
        .into {ch_fasta_for_bwa_indexing;ch_fasta_for_faidx_indexing;ch_fasta_for_dict_indexing; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_circularmapper_index}
 }
-
-
-
-
-
-
 //Index files provided? Then check whether they are correct and complete
 if (params.aligner != 'bwa' && !params.circularmapper && !params.bwamem){
@@ -265,6 +264,10 @@ if( params.singleEnd || params.pairedEnd || params.bam){
    exit 1, "Please specify either --singleEnd, --pairedEnd to execute the pipeline on FastQ files and --bam for previously processed BAM files!"
 }

+//Validate that skip_collapse is only set for pairedEnd reads!
+if (params.skip_collapse && params.singleEnd){
+    exit 1, "--skip_collapse can only be set for pairedEnd samples!"
+}

 //AWSBatch sanity checking
 if(workflow.profile == 'awsbatch'){
@@ -292,14 +295,14 @@ if( params.readPaths ){
            .from( params.readPaths )
            .map { row -> [ row[0], [ file( row[1][0] ) ] ] }
            .ifEmpty { exit 1, "params.readPaths or params.bams was empty - no input files supplied!" }
-            .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filtering }
+            .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g }
        ch_bam_to_fastq_convert = Channel.empty()
    } else if (!params.bam){
        Channel
            .from( params.readPaths )
            .map { row -> [ row[0], [ file( row[1][0] ), file( row[1][1] ) ] ] }
            .ifEmpty { exit 1, "params.readPaths or params.bams was empty - no input files supplied!" }
-            .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filtering }
+            .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g }
        ch_bam_to_fastq_convert = Channel.empty()
    } else {
        Channel
@@ -311,7 +314,7 @@ if( params.readPaths ){

        //Set up clean channels
        ch_read_files_fastqc = Channel.empty()
-        ch_read_files_complexity_filtering = Channel.empty()
+        ch_read_files_complexity_filter_poly_g = Channel.empty()
        ch_read_files_clip = Channel.empty()
    }
} else if (!params.bam){
    Channel
@@ -319,7 +322,7 @@
        .fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 )
        .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs" +
            "to be enclosed in quotes!\nNB: Path requires at least one * wildcard!\nIf this is single-end data, please specify --singleEnd on the command line." }
-        .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filtering }
+        .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g }
    ch_bam_to_fastq_convert = Channel.empty()
} else {
    Channel
@@ -332,7 +335,7 @@

    //Set up clean channels
    ch_read_files_fastqc = Channel.empty()
-    ch_read_files_complexity_filtering = Channel.empty()
+    ch_read_files_complexity_filter_poly_g = Channel.empty()
    ch_read_files_clip = Channel.empty()
}

@@ -347,8 +350,11 @@ summary['Pipeline Version'] = workflow.manifest.version
 summary['Run Name']     = custom_runName ?: workflow.runName
 summary['Reads']        = params.reads
 summary['Fasta Ref']    = params.fasta
+summary['BAM Index Type'] = params.large_ref ? 'CSI' : 'BAI'
 if(params.bwa_index) summary['BWA Index'] = params.bwa_index
 summary['Data Type']    = params.singleEnd ? 'Single-End' : 'Paired-End'
+summary['Skip Collapsing'] = params.skip_collapse ? 'Yes' : 'No'
+summary['Skip Trimming'] = params.skip_trim ? 'Yes' : 'No'
 summary['Max Memory']   = params.max_memory
 summary['Max CPUs']     = params.max_cpus
 summary['Max Time']     = params.max_time
@@ -368,7 +374,7 @@ if(workflow.profile == 'awsbatch'){
   summary['AWS Queue'] = params.awsqueue
 }
 if(params.email) summary['E-mail Address'] = params.email
-log.info summary.collect { k,v -> "${k.padRight(15)}: $v" }.join("\n")
+log.info summary.collect { k,v -> "${k.padRight(35)}: $v" }.join("\n")
 log.info "========================================="

@@ -489,7 +495,7 @@ process makeSeqDict {
    mkdir -p seq_dict
    mv $fasta "seq_dict/${base}"
    cd seq_dict
-    picard CreateSequenceDictionary R=$base O="${fasta.baseName}.dict"
+    picard -Xmx${task.memory.toMega()}M -Xms${task.memory.toMega()}M CreateSequenceDictionary R=$base O="${fasta.baseName}.dict"
    """
}
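Background on this change (and the matching MarkDuplicates one further down): by default the JVM sizes its heap from the host's physical RAM rather than from the memory the scheduler actually granted the task, so Picard jobs could exceed their allocation and be killed. Pinning `-Xmx`/`-Xms` to the Nextflow task allocation keeps the JVM inside its limit. For a 16 GB task the rendered command is roughly:

```bash
# Sketch of the rendered command when task.memory is 16.GB (16384 MB)
picard -Xmx16384M -Xms16384M CreateSequenceDictionary R=genome.fasta O=genome.dict
```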
@@ -507,8 +513,7 @@ process convertBam {
    file bam from ch_bam_to_fastq_convert

    output:
-    set val("${base}"), file("*.fastq.gz") into (ch_read_files_converted_fastqc, ch_read_files_converted_fastp)
-    file("*.fastq.gz") into (ch_read_files_converted_mapping_bwa, ch_read_files_converted_mapping_cm, ch_read_files_converted_mapping_bwamem)
+    set val("${base}"), file("*.fastq.gz") into (ch_read_files_converted_fastqc, ch_read_files_converted_fastp, ch_read_files_converted_mapping_bwa, ch_read_files_converted_mapping_cm, ch_read_files_converted_mapping_bwamem)

    script:
    base = "${bam.baseName}"

@@ -549,13 +554,13 @@ process fastp {
    tag "$name"
    publishDir "${params.outdir}/FastP", mode: 'copy'

-    when: params.complexity_filter
+    when: params.complexity_filter_poly_g

    input:
-    set val(name), file(reads) from ch_read_files_complexity_filtering.mix(ch_read_files_converted_fastp)
+    set val(name), file(reads) from ch_read_files_complexity_filter_poly_g.mix(ch_read_files_converted_fastp)

    output:
-    set val(name), file("*pG.fq.gz") into ch_clipped_reads_complexity_filtered
+    set val(name), file("*pG.fq.gz") into ch_clipped_reads_complexity_filtered_poly_g
    file("*.json") into ch_fastp_for_multiqc

    script:
@@ -574,60 +579,87 @@
/*
 * STEP 2 - Adapter Clipping / Read Merging
 */
-
-
+//Initialize an empty channel if we skip adapterremoval entirely
+if(params.skip_adapterremoval) {
+    //No logs if no AR is run
+    ch_adapterremoval_logs = Channel.empty()
+    //Take the reads either from complexity filtering (if enabled) or from the raw reads normally sent to clipping, and push them straight through to the downstream channels
+    ch_clipped_reads_complexity_filtered_poly_g.mix(ch_read_files_clip).into { ch_clipped_reads;ch_clipped_reads_for_fastqc;ch_clipped_reads_circularmapper;ch_clipped_reads_bwamem }
+} else {
process adapter_removal {
    tag "$name"
    publishDir "${params.outdir}/read_merging", mode: 'copy'

-    when: !params.bam
+    when: !params.bam && !params.skip_adapterremoval

    input:
-    set val(name), file(reads) from ( params.complexity_filter ? ch_clipped_reads_complexity_filtered : ch_read_files_clip )
+    set val(name), file(reads) from ( params.complexity_filter_poly_g ? ch_clipped_reads_complexity_filtered_poly_g : ch_read_files_clip )

    output:
-    file "*.combined*.gz" into (ch_clipped_reads, ch_clipped_reads_for_fastqc,ch_clipped_reads_circularmapper,ch_clipped_reads_bwamem)
-    file "*.settings" into ch_adapterremoval_logs
+    set val(base), file("output/*.gz") into (ch_clipped_reads,ch_clipped_reads_for_fastqc,ch_clipped_reads_circularmapper,ch_clipped_reads_bwamem)
+    file("*.settings") into ch_adapterremoval_logs

    script:
-    prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/
-    //Readprefixing only required for PE data with merging
-    fixprefix = (params.singleEnd) ? "" : "AdapterRemovalFixPrefix ${prefix}.combined.fq.gz ${prefix}.combined.prefixed.fq.gz"
+    base = reads[0].baseName
+    //Defines the trimming options, unless trimming is skipped
+    trim_me = params.skip_trim ? '' : "--trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}"
+    collapse_me = params.skip_collapse ? '' : '--collapse'

-    if( !params.singleEnd ){
+    //PE mode: depending on trim_me and collapse_me, the respective procedure is run or skipped
+    if (!params.singleEnd && !params.skip_collapse && !params.skip_trim){
    """
-    AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${prefix} --gzip --threads ${task.cpus} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap} --collapse
+    mkdir -p output
+    AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${base} ${trim_me} --gzip --threads ${task.cpus} ${collapse_me}
    #Combine files
-    zcat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz | gzip > ${prefix}.combined.fq.gz
-    ${fixprefix}
-    rm ${prefix}.combined.fq.gz
+    zcat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz | gzip > output/${base}.combined.fq.gz
+    """
+    //PE, don't collapse, but trim reads
+    } else if (!params.singleEnd && params.skip_collapse && !params.skip_trim) {
+    """
+    mkdir -p output
+    AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${base} --gzip --threads ${task.cpus} ${trim_me} ${collapse_me}
+    mv ${base}.pair*.truncated.gz output/
+    """
+    //PE, collapse, but don't trim reads
+    } else if (!params.singleEnd && !params.skip_collapse && params.skip_trim) {
+    """
+    mkdir -p output
+    AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${base} --gzip --threads ${task.cpus} ${collapse_me} ${trim_me}
+
+    mv ${base}.pair*.truncated.gz output/
    """
    } else {
+    //SE mode: collapsing is not possible, so only trimming (unless skipped) is applied
    """
-    AdapterRemoval --file1 ${reads[0]} --basename ${prefix} --gzip --threads ${task.cpus} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality}
-    # Pseudo-Combine
-    mv *.truncated.gz ${prefix}.combined.fq.gz
+    mkdir -p output
+    AdapterRemoval --file1 ${reads[0]} --basename ${base} --gzip --threads ${task.cpus} ${trim_me}
+
+    mv *.truncated.gz output/
    """
    }
}
+}
+
+
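One knock-on effect of `--skip_collapse` worth spelling out: collapsed (or single-end) reads are aligned as single-end data, while uncollapsed pairs go through BWA's paired-end path, as the rewritten mapping processes below show. A rough sketch of the two flows, with placeholder filenames:

```bash
# Collapsed/SE reads: single-end alignment via bwa samse
bwa aln ref.fasta sample.combined.fq.gz > sample.sai
bwa samse ref.fasta sample.sai sample.combined.fq.gz \
    | samtools sort -O bam -o sample.sorted.bam

# Uncollapsed pairs: paired-end alignment via bwa sampe
bwa aln ref.fasta sample.pair1.truncated.gz > sample.r1.sai
bwa aln ref.fasta sample.pair2.truncated.gz > sample.r2.sai
bwa sampe ref.fasta sample.r1.sai sample.r2.sai \
    sample.pair1.truncated.gz sample.pair2.truncated.gz \
    | samtools sort -O bam -o sample.sorted.bam
```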
/*
* STEP 2.1 - FastQC after clipping/merging (if applied!)
+*/
process fastqc_after_clipping {
-    tag "${reads[0].baseName}"
+    tag "${prefix}"
    publishDir "${params.outdir}/FastQC/after_clipping", mode: 'copy',
        saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"}

-    when: !params.bam
+    when: !params.bam && !params.skip_adapterremoval

    input:
-    file(reads) from ch_clipped_reads_for_fastqc
+    set val(name), file(reads) from ch_clipped_reads_for_fastqc

    output:
    file "*_fastqc.{zip,html}" optional true into ch_fastqc_after_clipping

    script:
+    prefix = reads[0].toString().tokenize('.')[0]
    """
    fastqc -q $reads
    """

@@ -644,23 +676,39 @@ process bwa {

    when: !params.circularmapper && !params.bwamem

    input:
-    file(reads) from ch_clipped_reads.mix(ch_read_files_converted_mapping_bwa)
+    set val(name), file(reads) from ch_clipped_reads.mix(ch_read_files_converted_mapping_bwa)
+    file index from ch_bwa_index.first()

    output:
    file "*.sorted.bam" into ch_mapped_reads_idxstats,ch_mapped_reads_filter,ch_mapped_reads_preseq, ch_mapped_reads_damageprofiler
-    file "*.bai" into ch_bam_index_for_damageprofiler
+    file "*.{bai,csi}" into ch_bam_index_for_damageprofiler

    script:
-    prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/
-    fasta = "${index}/*.fasta"
+    fasta = "${index}/*.fasta"
+    size = params.large_ref ? '-c' : ''
+
+    //PE data without merging, PE data without any AR applied
+    if (!params.singleEnd && (params.skip_collapse || params.skip_adapterremoval)){
+    prefix = reads[0].toString().tokenize('.')[0]
+    """
+    bwa aln -t ${task.cpus} $fasta ${reads[0]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r1.sai
+    bwa aln -t ${task.cpus} $fasta ${reads[1]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r2.sai
+    bwa sampe -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.r1.sai ${prefix}.r2.sai ${reads[0]} ${reads[1]} | samtools sort -@ ${task.cpus} -O bam - > ${prefix}.sorted.bam
+    samtools index ${size} "${prefix}".sorted.bam
+    """
+    } else {
+    //PE collapsed, or SE data
+    prefix = reads[0].toString().tokenize('.')[0]
    """
-    bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f "${reads.baseName}.sai"
-    bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta "${reads.baseName}".sai $reads | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam
-    samtools index "${prefix}".sorted.bam
+    bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.sai
+    bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.sai $reads | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam
+    samtools index ${size} "${prefix}".sorted.bam
    """
+    }
}

process circulargenerator{

@@ -699,25 +747,40 @@ process circularmapper{

    when: params.circularmapper

    input:
-    file reads from ch_clipped_reads_circularmapper.mix(ch_read_files_converted_mapping_cm)
+    set val(name), file(reads) from ch_clipped_reads_circularmapper.mix(ch_read_files_converted_mapping_cm)
    file index from ch_circularmapper_indices.first()

    output:
    file "*.sorted.bam" into ch_mapped_reads_idxstats_cm,ch_mapped_reads_filter_cm,ch_mapped_reads_preseq_cm, ch_mapped_reads_damageprofiler_cm
-    file "*.bai"
+    file "*.{bai,csi}"

    script:
    filter = "${params.circularfilter}" ? '' : '-f true -x false'
-    prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/
+    fasta = "${index}/*_*.fasta"
+    size = params.large_ref ? '-c' : ''

+    if (!params.singleEnd && params.skip_collapse ){
+    prefix = reads[0].toString().tokenize('.')[0]
    """
-    bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f "${reads.baseName}.sai"
-    bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta "${reads.baseName}".sai $reads > tmp.out
+    bwa aln -t ${task.cpus} $fasta ${reads[0]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r1.sai
+    bwa aln -t ${task.cpus} $fasta ${reads[1]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r2.sai
+    bwa sampe -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.r1.sai ${prefix}.r2.sai ${reads[0]} ${reads[1]} > tmp.out
+    realignsamfile -e ${params.circularextension} -i tmp.out -r $fasta $filter
+    samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > ${prefix}.sorted.bam
+    samtools index ${size} ${prefix}.sorted.bam
+    """
+    } else {
+    prefix = reads[0].toString().tokenize('.')[0]
+    """
+    bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.sai
+    bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.sai $reads > tmp.out
    realignsamfile -e ${params.circularextension} -i tmp.out -r $fasta $filter
    samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > "${prefix}".sorted.bam
-    samtools index "${prefix}".sorted.bam
+    samtools index ${size} "${prefix}".sorted.bam
    """
+    }
}

process bwamem {
@@ -727,21 +790,31 @@ process bwamem {

    when: params.bwamem && !params.circularmapper

    input:
-    file(reads) from ch_clipped_reads_bwamem.mix(ch_read_files_converted_mapping_bwamem)
+    set val(name), file(reads) from ch_clipped_reads_bwamem.mix(ch_read_files_converted_mapping_bwamem)
    file index from ch_bwa_index_bwamem.first()

    output:
    file "*.sorted.bam" into ch_bwamem_mapped_reads_idxstats,ch_bwamem_mapped_reads_filter,ch_bwamem_mapped_reads_preseq, ch_bwamem_mapped_reads_damageprofiler
-    file "*.bai"
+    file "*.{bai,csi}"

    script:
    prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/
    fasta = "${index}/*.fasta"
+    size = params.large_ref ? '-c' : ''
+
+    if (!params.singleEnd && params.skip_collapse){
+    """
+    bwa mem -t ${task.cpus} $fasta ${reads[0]} ${reads[1]} -R "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam
+    samtools index ${size} -@ ${task.cpus} "${prefix}".sorted.bam
+    """
+    } else {
    """
    bwa mem -t ${task.cpus} $fasta $reads -R "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam
-    samtools index -@ ${task.cpus} "${prefix}".sorted.bam
+    samtools index ${size} -@ ${task.cpus} "${prefix}".sorted.bam
    """
+    }
}

/*
@@ -787,38 +860,39 @@ process samtools_filter {
    file "*filtered.bam" into ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk
    file "*.fastq.gz" optional true
    file "*.unmapped.bam" optional true
-    file "*.bai"
+    file "*.{bai,csi}"

    script:
    prefix="$bam" - ~/(\.bam)?/
+    size = params.large_ref ? '-c' : ''

    if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "discard"){
    """
    samtools view -h -b $bam -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam
-    samtools index ${prefix}.filtered.bam
+    samtools index ${size} ${prefix}.filtered.bam
    """
    } else if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "bam"){
    """
    samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam)
-    samtools index ${prefix}.filtered.bam
+    samtools index ${size} ${prefix}.filtered.bam
    """
    } else if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "fastq"){
    """
    samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam)
-    samtools index ${prefix}.filtered.bam
+    samtools index ${size} ${prefix}.filtered.bam
    samtools fastq -tn ${prefix}.unmapped.bam | pigz -p ${task.cpus} > ${prefix}.unmapped.fastq.gz
    rm ${prefix}.unmapped.bam
    """
    } else if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "both"){
    """
    samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam)
-    samtools index ${prefix}.filtered.bam
+    samtools index ${size} ${prefix}.filtered.bam
    samtools fastq -tn ${prefix}.unmapped.bam | pigz -p ${task.cpus} > ${prefix}.unmapped.fastq.gz
    """
    } else { //Only apply quality filtering, default
    """
    samtools view -h -b $bam -@ ${task.cpus} -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam
-    samtools index ${prefix}.filtered.bam
+    samtools index ${size} ${prefix}.filtered.bam
    """
    }
}
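The filtering branches above rely on `tee` plus bash process substitution, so the BAM only has to be read once while mapped and unmapped records are written to separate files. Stripped of the pipeline variables, the pattern is roughly (filenames and the MAPQ cutoff are placeholders):

```bash
# Single pass over a BAM: unmapped reads (-f4) to one file,
# mapped reads above a MAPQ cutoff (-F4 -q 30) to another
samtools view -h input.bam \
    | tee >(samtools view -b - -f4 -o unmapped.bam) \
          >(samtools view -b - -F4 -q 30 -o filtered.bam) \
    > /dev/null
```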
@@ -830,7 +904,9 @@ Step 6: DeDup / MarkDuplicates

process dedup{
    tag "${bam.baseName}"
-    publishDir "${params.outdir}/deduplication/dedup"
+    publishDir "${params.outdir}/deduplication/dedup", mode: 'copy',
+        saveAs: {filename -> (filename.endsWith(".hist") || filename.endsWith(".log")) ? "${prefix}/$filename" : "$filename"}
+

    when: !params.skip_deduplication && params.dedupper == 'dedup'

@@ -842,25 +918,26 @@ process dedup{
    file "*.hist" into ch_hist_for_preseq
    file "*.log" into ch_dedup_results_for_multiqc
    file "${prefix}.sorted.bam" into ch_dedup_bam
-    file "*.bai"
+    file "*.{bai,csi}"

    script:
    prefix="${bam.baseName}"
    treat_merged="${params.dedup_all_merged}" ? '-m' : ''
-
+    size = params.large_ref ? '-c' : ''
+
    if(params.singleEnd) {
    """
    dedup -i $bam $treat_merged -o . -u
    mv *.log dedup.log
    samtools sort -@ ${task.cpus} "$prefix"_rmdup.bam -o "$prefix".sorted.bam
-    samtools index "$prefix".sorted.bam
+    samtools index ${size} "$prefix".sorted.bam
    """
    } else {
    """
    dedup -i $bam $treat_merged -o . -u
    mv *.log dedup.log
    samtools sort -@ ${task.cpus} "$prefix"_rmdup.bam -o "$prefix".sorted.bam
-    samtools index "$prefix".sorted.bam
+    samtools index ${size} "$prefix".sorted.bam
    """
    }
}

@@ -908,7 +985,7 @@ process damageprofiler {

    input:
    file bam from ch_mapped_reads_damageprofiler.mix(ch_mapped_reads_damageprofiler_cm,ch_bwamem_mapped_reads_damageprofiler)
-    file fasta from ch_fasta_for_damageprofiler
+    file fasta from ch_fasta_for_damageprofiler.first()
    file bai from ch_bam_index_for_damageprofiler

@@ -935,7 +1012,7 @@ process qualimap {

    input:
    file bam from ch_bam_filtered_qualimap
-    file fasta from ch_fasta_for_qualimap
+    file fasta from ch_fasta_for_qualimap.first()

    output:
    file "*" into ch_qualimap_results

@@ -971,7 +1048,7 @@ process markDup{
    script:
    prefix = "${bam.baseName}"
    """
-    picard MarkDuplicates INPUT=$bam OUTPUT=${prefix}.markDup.bam REMOVE_DUPLICATES=TRUE AS=TRUE METRICS_FILE="${prefix}.markdup.metrics" VALIDATION_STRINGENCY=SILENT
+    picard -Xmx${task.memory.toMega()}M -Xms${task.memory.toMega()}M MarkDuplicates INPUT=$bam OUTPUT=${prefix}.markDup.bam REMOVE_DUPLICATES=TRUE AS=TRUE METRICS_FILE="${prefix}.markdup.metrics" VALIDATION_STRINGENCY=SILENT
    """
}

@@ -1038,15 +1115,16 @@ process bam_trim {

    output:
    file "*.trimmed.bam" into ch_trimmed_bam_for_genotyping
-    file "*.bai"
+    file "*.{bai,csi}"

    script:
    prefix="${bam.baseName}"
    softclip = "${params.bamutils_softclip}" ? '-c' : ''
+    size = params.large_ref ? '-c' : ''
    """
    bam trimBam $bam tmp.bam -L ${params.bamutils_clip_left} -R ${params.bamutils_clip_right} ${softclip}
    samtools sort -@ ${task.cpus} tmp.bam -o ${prefix}.trimmed.bam
-    samtools index ${prefix}.trimmed.bam
+    samtools index ${size} ${prefix}.trimmed.bam
    """
}

@@ -1140,12 +1218,12 @@ process multiqc {
    file multiqc_config from ch_multiqc_config.collect().ifEmpty([])
    file ('fastqc_raw/*') from ch_fastqc_results.collect().ifEmpty([])
    file('fastqc/*') from ch_fastqc_after_clipping.collect().ifEmpty([])
-    file ('software_versions/*') from software_versions_yaml.collect().ifEmpty([])
+    file ('software_versions/software_versions_mqc*') from software_versions_yaml.collect().ifEmpty([])
    file ('adapter_removal/*') from ch_adapterremoval_logs.collect().ifEmpty([])
    file ('idxstats/*') from ch_idxstats_for_multiqc.collect().ifEmpty([])
    file ('preseq/*') from ch_preseq_results.collect().ifEmpty([])
-    file ('damageprofiler/*') from ch_damageprofiler_results.collect().ifEmpty([])
-    file ('qualimap/*') from ch_qualimap_results.collect().ifEmpty([])
+    file ('damageprofiler/dmgprof*/*') from ch_damageprofiler_results.collect().ifEmpty([])
+    file ('qualimap/qualimap*/*') from ch_qualimap_results.collect().ifEmpty([])
    file ('markdup/*') from ch_markdup_results_for_multiqc.collect().ifEmpty([])
    file ('dedup*/*') from ch_dedup_results_for_multiqc.collect().ifEmpty([])
    file ('fastp/*') from ch_fastp_for_multiqc.collect().ifEmpty([])

diff --git a/nextflow.config b/nextflow.config
index f4ac236ad..539785beb 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -10,10 +10,11 @@
// Global default params, used in configs
params {

-  container = 'nfcore/eager:2.0.5'
+  container = 'nfcore/eager:2.0.6'

  //Pipeline options
  aligner = 'bwa'
+  saveReference = false
  saveTrimmed = true
  saveAlignedIntermediates = false
@@ -23,25 +24,37 @@ params {
  tracedir = "${params.outdir}/pipeline_info"
  readPaths = false
  bam = false
-
+  large_ref = false
+
  //More defaults
  complexity_filter = false
  complexity_filter_poly_g_min = 10
  trim_bam = false

+  //Skipping adapterremoval, trimming or collapsing defaults
+  skip_collapse = false
+  skip_trim = false
+  skip_adapterremoval = false
+
  // AWS Batch
  awsqueue = false
  awsregion = 'eu-west-1'

  igenomesIgnore = false
  custom_config_version = 'master'
+  custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}"
}

+
// Load base.config by default for all pipelines
includeConfig 'conf/base.config'

// Load nf-core custom profiles from different Institutions
-includeConfig "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}/nfcore_custom.config"
+try {
+  includeConfig "${params.custom_config_base}/nfcore_custom.config"
+} catch (Exception e) {
+  System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config")
+}

profiles {
  awsbatch { includeConfig 'conf/awsbatch.config' }
@@ -87,7 +100,7 @@ manifest {
  name = 'nf-core/eager'
  author = 'Alexander Peltzer, Stephen Clayton, James A Fellows-Yates'
  homePage = 'https://github.com/nf-core/eager'
-  version = '2.0.5'
+  version = '2.0.6'
  description = 'A fully reproducible and modern ancient DNA pipeline in Nextflow and with cloud support.'
  mainScript = 'main.nf'
  nextflowVersion = '>=0.32.0'
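A side benefit of the new `params.custom_config_base` hook (wrapped in `try`/`catch` above so offline runs no longer die on the GitHub fetch): the institutional profiles can be loaded from a local clone of nf-core/configs instead of raw.githubusercontent.com. A hypothetical offline setup, with placeholder paths:

```bash
# Clone the shared nf-core institutional configs somewhere reachable offline
git clone https://github.com/nf-core/configs.git /data/nf-core-configs

# Point the pipeline at the local copy
nextflow run nf-core/eager -profile conda --pairedEnd \
    --reads '/data/*_R{1,2}.fastq.gz' --fasta '/refs/genome.fasta' \
    --custom_config_base /data/nf-core-configs
```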