diff --git a/assets/schema_input.json b/assets/schema_input.json index f7d24f9..3b3b2bc 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,18 +7,73 @@ "items": { "type": "object", "properties": { + "patient": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Patient ID must be provided and cannot contain spaces", + "meta": ["patient"] + }, "sample": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" + "errorMessage": "Sample ID must be provided and cannot contain spaces", + "meta": ["sample"] }, - "fastq_1": { + "sex": { + "errorMessage": "Sex cannot contain spaces", + "meta": ["sex"], + "default": "NA", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "status": { + "type": "integer", + "errorMessage": "Status can only be 0 (normal) or 1 (tumor). Defaults to 0, if none is supplied.", + "meta": ["status"], + "default": "0", + "minimum": 0, + "maximum": 1 + }, + "lane": { "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+$", + "unique": ["patient", "sample"], + "anyOf": [ + { + "dependentRequired": ["fastq_1"] + }, + { + "dependentRequired": ["bam"] + } + ], + "meta": ["lane"] + }, + "fastq_1": { + "errorMessage": "FastQ file for reads 1 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.f(ast)?q\\.gz$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true }, "fastq_2": { "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "dependentRequired": ["fastq_1"], "anyOf": [ { "type": "string", @@ -28,9 +83,104 @@ "type": "string", "maxLength": 0 } - ] + ], + "format": "file-path", + "exists": true + }, + "table": { + "errorMessage": "Recalibration table cannot contain spaces and must have extension '.table'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.table$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "cram": { + "errorMessage": "CRAM file cannot contain spaces and must have extension '.cram'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.cram$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "crai": { + "errorMessage": "CRAM index file cannot contain spaces and must have extension '.crai'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.crai$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "bam": { + "errorMessage": "BAM file cannot contain spaces and must have extension '.bam'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.bam$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "bai": { + "errorMessage": "BAM index file cannot contain spaces and must have extension '.bai'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.bai$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "vcf": { + "errorMessage": "VCF file for reads 1 cannot contain spaces and must have extension '.vcf' or '.vcf.gz'", + "anyOf": [ + { + "type": "string", + "pattern": 
"^\\S+\\.vcf(\\.gz)?$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "variantcaller": { + "type": "string" } }, - "required": ["sample", "fastq_1"] + "required": ["patient", "sample"] } } diff --git a/conf/base.config b/conf/base.config index 53fb752..abff8ff 100644 --- a/conf/base.config +++ b/conf/base.config @@ -52,6 +52,54 @@ process { withLabel:process_high_memory { memory = { check_max( 200.GB * task.attempt, 'memory' ) } } + + withName: 'UNZIP.*|UNTAR.*|TABIX.*|BUILD_INTERVALS|CREATE_INTERVALS_BED|CUSTOM_DUMPSOFTWAREVERSIONS|VCFTOOLS|BCFTOOLS.*|SAMTOOLS_INDEX' { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 1.GB * task.attempt, 'memory' ) } + } + withName: 'FASTQC'{ + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName: 'FASTP'{ + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName: 'BWAMEM1_MEM|BWAMEM2_MEM' { + cpus = { check_max( 24 * task.attempt, 'cpus' ) } + memory = { check_max( 30.GB * task.attempt, 'memory' ) } + } + withName: 'GATK4_MARKDUPLICATES|GATK4_MARKDUPLICATESSPARK' { + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 30.GB * task.attempt, 'memory' ) } + } + withName:'GATK4_APPLYBQSR|GATK4_APPLYBQSR_SPARK|GATK4_BASERECALIBRATOR|GATK4_BASERECALIBRATOR_SPARK|GATK4_GATHERBQSRREPORTS'{ + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName:'MOSDEPTH'{ + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName:'STRELKA.*|MANTA.*' { + cpus = { check_max( 10 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + } + withName:'SAMTOOLS_CONVERT'{ + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName:'GATK4_MERGEVCFS'{ + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName: 'MULTIQC' { + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + } + withName: 'SVABA' { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } + } withLabel:error_ignore { errorStrategy = 'ignore' } diff --git a/conf/igenomes.config b/conf/igenomes.config index 3f11437..3733047 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -11,6 +11,76 @@ params { // illumina iGenomes reference file paths genomes { + 'GATK.GRCh37' { + fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" + fasta_fai = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai" + chr_dir = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/Chromosomes" + dict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict" + bwa = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/BWAIndex/" + dbsnp = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/dbsnp_138.b37.vcf.gz" + dbsnp_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/dbsnp_138.b37.vcf.gz.tbi" + dbsnp_vqsr = '--resource:dbsnp,known=false,training=true,truth=false,prior=2.0 dbsnp_138.b37.vcf.gz' + known_snps = 
"${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/1000G_phase1.snps.high_confidence.b37.vcf.gz" + known_snps_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/1000G_phase1.snps.high_confidence.b37.vcf.gz.tbi" + known_indels = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz" + known_indels_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz.tbi" + germline_resource = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/af-only-gnomad.raw.sites.vcf.gz" + germline_resource_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/af-only-gnomad.raw.sites.vcf.gz.tbi" + intervals = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/intervals/wgs_calling_regions_Sarek.list" + mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/Control-FREEC/out100m2_hg19.gem" + ascat_alleles = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/ASCAT/G1000_alleles_hg19.zip" + ascat_genome = 'hg19' + ascat_loci = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/ASCAT/G1000_loci_hg19.zip" + ascat_loci_gc = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/ASCAT/GC_G1000_hg19.zip" + ascat_loci_rt = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/ASCAT/RT_G1000_hg19.zip" + snpeff_db = 87 + snpeff_genome = 'GRCh37' + vep_cache_version = 110 + vep_genome = 'GRCh37' + vep_species = 'homo_sapiens' + indel_mask = "${projectDir}/data/snowman_blacklist.bed" + germ_sv_db = "${projectDir}/data/snowman_germline_mini_160413.bed" + simple_seq_db = "${projectDir}/data/repeat_masker_hg19_Simple.bed" + blacklist_gridss = "${projectDir}/data/ENCFF001TDO_hg19_nochr.bed" + pon_gridss = "${projectDir}/data/GRIDSS/pon/hg19/" + } + 'GATK.GRCh38' { + fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" + fasta_fai = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai" + chr_dir = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/Chromosomes" + dict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict" + bwa = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/BWAIndex/" + bwamem2 = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/BWAmem2Index/" + cf_chrom_len = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/Length/Homo_sapiens_assembly38.len" + dbsnp = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz" + dbsnp_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz.tbi" + known_snps = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000G_omni2.5.hg38.vcf.gz" + known_snps_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000G_omni2.5.hg38.vcf.gz.tbi" + known_indels = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" + known_indels_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" + germline_resource = 
"${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/af-only-gnomad.hg38.vcf.gz" + germline_resource_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/af-only-gnomad.hg38.vcf.gz.tbi" + intervals = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/intervals/wgs_calling_regions_noseconds.hg38.bed" + mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/Control-FREEC/out100m2_hg38.gem" + pon = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz" + pon_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz.tbi" + ascat_alleles = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/G1000_alleles_hg38.zip" + ascat_genome = 'hg38' + ascat_loci = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/G1000_loci_hg38.zip" + ascat_loci_gc = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/GC_G1000_hg38.zip" + ascat_loci_rt = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/RT_G1000_hg38.zip" + snpeff_db = 105 + snpeff_genome = 'GRCh38' + vep_cache_version = 110 + vep_genome = 'GRCh38' + vep_species = 'homo_sapiens' + indel_mask = "${projectDir}/data/snowman_blacklist.hg38.bed" + germ_sv_db = "${projectDir}/data/snowman_germline_mini_hg38.bed" + simple_seq_db = "${projectDir}/data/repeat_masker_hg38_simple.bed" + blacklist_gridss = "${projectDir}/data/ENCFF356LFX_hg38.bed" + pon_gridss = "${projectDir}/data/GRIDSS/pon/hg38/" + } + 'GRCh37' { fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" diff --git a/conf/modules.config b/conf/modules.config index da58a5d..75f68c8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,6 +28,58 @@ process { withName: FASTQC { ext.args = '--quiet' + publishDir = [ + [ + path: { "${params.outdir}/QCreports/FastQC/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*{html,zip}" + ] + ] + } + + withName: 'NFCORE_HEISENBIO:HEISENBIO:CRAM_QC_NO_MD:SAMTOOLS_STATS' { + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('samtools')) } + ext.prefix = { "${meta.id}.sorted.cram" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/QCreports/samtools/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'MOSDEPTH' { + ext.args = { !params.wes ? "-n --fast-mode --by 500" : ""} + ext.prefix = { + if (params.tools && params.tools.split(',').contains('sentieon_dedup')) { + "${meta.id}.dedup" + } else if (params.skip_tools && params.skip_tools.split(',').contains('markduplicates')) { + "${meta.id}.sorted" + } else { + "${meta.id}.md" + } + } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('mosdepth')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/QCreports/mosdepth/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + + if ((params.step == 'alignment' || params.step == 'markduplicates'|| params.step == 'prepare_recalibration'|| params.step == 'recalibrate') && (!(params.skip_tools && params.skip_tools.split(',').contains('baserecalibrator')))) { + withName: 'NFCORE_HEISENBIO:HEISENBIO:CRAM_QC_RECAL:MOSDEPTH' { + ext.prefix = { "${meta.id}.recal" } + } + + withName: 'NFCORE_HEISENBIO:HEISENBIO:CRAM_QC_RECAL:SAMTOOLS_STATS' { + ext.prefix = { "${meta.id}.recal.cram" } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('samtools')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/QCreports/samtools/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/conf/modules/aligner.config b/conf/modules/aligner.config new file mode 100644 index 0000000..b045e1e --- /dev/null +++ b/conf/modules/aligner.config @@ -0,0 +1,85 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// MAPPING + +process { + + if (params.step == 'alignment') { + withName: "BWAMEM1_MEM" { + ext.when = { params.aligner == "bwa-mem" } + } + + withName: "BWAMEM2_MEM" { + ext.when = { params.aligner == "bwa-mem2" } + } + + withName: "DRAGMAP_ALIGN" { + ext.when = { params.aligner == "dragmap" } + ext.args = { "--RGSM ${meta.patient}_${meta.sample} --RGID ${meta.read_group}" } + } + + withName: "SENTIEON_BWAMEM" { + ext.when = { params.aligner == "sentieon-bwamem" } + } + + withName: "(BWAMEM.*_MEM|DRAGMAP_ALIGN|SENTIEON_BWAMEM)" { + ext.prefix = { params.split_fastq > 1 ? "${meta.id}".concat('.').concat(reads.get(0).name.tokenize('.')[0]) : "${meta.id}.sorted" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/" }, + pattern: "*bam", + // Only save if save_output_as_bam AND + // (save_mapped OR no_markduplicates OR sentieon_dedup) AND + // only a single BAM file per sample + saveAs: { + if (params.save_output_as_bam && + ( + params.save_mapped || + (params.skip_tools && params.skip_tools.split(',').contains('markduplicates')) && + !(params.tools && params.tools.split(',').contains('sentieon_dedup')) + ) && (meta.size * meta.num_lanes == 1) + ) { "Mapped/${meta.id}/${it}" } + else { null } + } + ] + } + + withName: "(BWAMEM.*_MEM|DRAGMAP_ALIGN)" { + // Markduplicates Spark NEEDS name-sorted reads or runtime goes through the roof + // However if it's skipped, reads need to be coordinate-sorted + // Only name sort if Spark for Markduplicates + duplicate marking is not skipped + // Currently SENTIEON_BWAMEM only supports coordinate sorting the reads. + ext.args2 = { params.use_gatk_spark && params.use_gatk_spark.contains('markduplicates') && (!params.skip_tools || (params.skip_tools && !params.skip_tools.split(',').contains('markduplicates'))) ? 
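// '-n' is passed to the samtools sort step of these alignment modules and requests queryname (name) sorting; the empty string keeps the default coordinate sorting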
'-n' : '' } + } + + withName: "BWAMEM.*_MEM|SENTIEON_BWAMEM" { + // Using -B 3 for tumor samples + ext.args = { meta.status == 1 ? "-K 100000000 -Y -B 3 -R ${meta.read_group}" : "-K 100000000 -Y -R ${meta.read_group}" } + } + } + + withName: 'MERGE_BAM|INDEX_MERGE_BAM' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/" }, + pattern: "*{bam,bai}", + // Only save if (save_output_as_bam AND (no_markduplicates OR save_mapped )) + saveAs: { (params.save_output_as_bam && (params.save_mapped || params.skip_tools && params.skip_tools.split(',').contains('markduplicates'))) ? "Mapped/${meta.id}/${it}" : null } + ] + } + + withName: 'MERGE_BAM' { + ext.prefix = { "${meta.id}.sorted" } + } +} \ No newline at end of file diff --git a/conf/modules/alignment_to_fastq.config b/conf/modules/alignment_to_fastq.config new file mode 100644 index 0000000..d8eda93 --- /dev/null +++ b/conf/modules/alignment_to_fastq.config @@ -0,0 +1,85 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// BAM TO FASTQ + +process { + + withName: 'COLLATE_FASTQ_MAP' { + ext.args2 = '-N' + ext.prefix = {"${meta.id}.mapped"} + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'COLLATE_FASTQ_UNMAP' { + ext.args2 = '-N' + ext.prefix = {"${meta.id}.unmapped"} + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'SAMTOOLS_VIEW_MAP_MAP' { + ext.args = '-b -f1 -F12' + ext.prefix = {"${meta.id}.map_map"} + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'SAMTOOLS_VIEW_MAP_UNMAP' { + ext.args = '-b -f8 -F260' + ext.prefix = {"${meta.id}.map_unmap"} + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'SAMTOOLS_VIEW_UNMAP_MAP' { + ext.args = '-b -f4 -F264' + ext.prefix = {"${meta.id}.unmap_map"} + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'SAMTOOLS_VIEW_UNMAP_UNMAP' { + ext.args = '-b -f12 -F256' + ext.prefix = {"${meta.id}.unmap_unmap"} + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'SAMTOOLS_MERGE_UNMAP' { + ext.prefix = {"${meta.id}.merged_unmap"} + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + withName: 'CAT_FASTQ' { + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } +} diff --git a/conf/modules/markduplicates.config b/conf/modules/markduplicates.config new file mode 100644 index 0000000..fb2ab2d --- /dev/null +++ b/conf/modules/markduplicates.config @@ -0,0 +1,133 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file 
for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// MARKDUPLICATES + +process { + + withName: 'CRAM_TO_BAM' { + ext.args = "-b" + } + + withName: 'BAM_TO_CRAM' { + // BAM provided for step Markduplicates either run through MD or Convert -> then saved as sorted.cram (convert) or md.cram (md directly) + // BAM files provided for step prepare_recal are converted and run through BQSR -> then saved as md.cram + // BAM files provided for step recal are converted and run through BQSR II -> then saved as md.cram + ext.args = "-C" + ext.prefix = { "${meta.id}.converted" } + publishDir = [ + enabled: !params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/converted/${meta.id}" }, + pattern: "*{cram,crai}" + ] + } + + withName: 'NFCORE_HEISENBIO:HEISENBIO:(BAM_MARKDUPLICATES|BAM_MARKDUPLICATES_SPARK):CRAM_QC_MOSDEPTH_SAMTOOLS:SAMTOOLS_STATS' { + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('samtools')) } + ext.prefix = { "${meta.id}.md.cram" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reports/samtools/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'BAM_TO_CRAM_MAPPING' { + // Run only when mapping should be saved as CRAM or when no MD is done + ext.when = (params.save_mapped && !params.save_output_as_bam) || + ( + (params.skip_tools && params.skip_tools.split(',').contains('markduplicates')) && + !(params.tools && params.tools.split(',').contains('sentieon_dedup')) + ) + ext.prefix = { "${meta.id}.sorted" } + publishDir = [ + // Never publish if BAM only should be published + enabled: !params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/Mapped/${meta.id}/" }, + pattern: "*{cram,crai}" + ] + } + + withName: 'GATK4_ESTIMATELIBRARYCOMPLEXITY' { + ext.prefix = { "${meta.id}.md.cram" } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('markduplicates_report')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reports/markduplicates/${meta.id}" }, + pattern: "*metrics" + ] + } + + withName: 'GATK4_MARKDUPLICATES' { + ext.args = '-REMOVE_DUPLICATES false -VALIDATION_STRINGENCY LENIENT' + ext.prefix = { "${meta.id}.md.cram" } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('markduplicates')) } + publishDir = [ + [ + enabled: !params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/markduplicates/${meta.id}/" }, + pattern: "*{cram,crai}" + ], + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reports/" }, + pattern: "*metrics", + saveAs: { !(params.skip_tools && params.skip_tools.split(',').contains('markduplicates_report')) ? 
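// publish duplicate metrics under Reports/markduplicates/<sample id>/ unless 'markduplicates_report' is listed in --skip_tools (null skips publishing)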
"markduplicates/${meta.id}/${it}" : null} + ] + ] + } + + withName: 'GATK4_MARKDUPLICATES_SPARK' { + ext.args = '--remove-sequencing-duplicates false -VS LENIENT' + ext.prefix = { "${meta.id}.md.cram" } + publishDir = [ + enabled: !params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/markduplicates/${meta.id}/" }, + pattern: "*{cram,crai}" + ] + } + + withName: 'INDEX_MARKDUPLICATES' { + publishDir = [ + enabled: !params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/markduplicates/${meta.id}/" }, + pattern: "*{cram,crai}" + ] + } + + withName: 'NFCORE_HEISENBIO:HEISENBIO:CRAM_TO_BAM' { + ext.when = { params.save_output_as_bam } + if (params.tools && params.tools.split(',').contains('sentieon_dedup')) { + ext.prefix = { "${meta.id}.dedup" } + publishDir = [ + enabled: params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/sentieon_dedup/${meta.id}/" }, + pattern: "*{dedup.bam,dedup.bam.bai}" + ] + } else { + ext.prefix = { "${meta.id}.md" } + publishDir = [ + enabled: params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/markduplicates/${meta.id}/" }, + pattern: "*{md.bam,md.bam.bai}" + ] + } + } +} diff --git a/conf/modules/prepare_cache.config b/conf/modules/prepare_cache.config new file mode 100644 index 0000000..2f55495 --- /dev/null +++ b/conf/modules/prepare_cache.config @@ -0,0 +1,39 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + + +// PREPARE_CACHE + +process { + + // SNPEFF + withName: 'SNPEFF_DOWNLOAD' { + ext.when = { params.tools && (params.tools.split(',').contains('snpeff') || params.tools.split(',').contains('merge')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { params.outdir_cache ? "${params.outdir_cache}/": "${params.outdir}/cache/snpeff/" } + ] + } + + // VEP + withName: 'ENSEMBLVEP_DOWNLOAD' { + ext.when = { params.tools && (params.tools.split(',').contains('vep') || params.tools.split(',').contains('merge')) } + ext.args = '--AUTO c --CONVERT --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE' + publishDir = [ + mode: params.publish_dir_mode, + path: { params.outdir_cache ? "${params.outdir_cache}/": "${params.outdir}/cache/vep/" } + ] + } +} + + diff --git a/conf/modules/prepare_genome.config b/conf/modules/prepare_genome.config new file mode 100644 index 0000000..0ba9269 --- /dev/null +++ b/conf/modules/prepare_genome.config @@ -0,0 +1,126 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. 
+ ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// PREPARE_GENOME + +process { + + withName: 'BWAMEM1_INDEX' { + ext.when = { !params.bwa && params.step == "alignment" && (params.aligner == "bwa-mem")} + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference" }, + pattern: "bwa" + ] + } + + withName: 'BWAMEM2_INDEX' { + ext.when = { !params.bwamem2 && params.step == "alignment" && params.aligner == "bwa-mem2" } + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference" }, + pattern: "bwamem2" + ] + } + + withName: 'GATK4_CREATESEQUENCEDICTIONARY' { + ext.when = { !params.dict && params.step != "annotate" && params.step != "controlfreec" } + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference/dict" }, + pattern: "*dict" + ] + } + + withName: 'SAMTOOLS_FAIDX' { + ext.when = { !params.fasta_fai && params.step != "annotate" } + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference/fai" }, + pattern: "*fai" + ] + } + + + withName: 'TABIX_DBSNP' { + ext.when = { !params.dbsnp_tbi && params.dbsnp && ((params.step == "alignment" || params.step == "markduplicates" || params.step == "prepare_recalibration") || params.tools && (params.tools.split(',').contains('controlfreec') || params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('mutect2'))) } + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference/dbsnp" }, + pattern: "*vcf.gz.tbi" + ] + } + + withName: 'TABIX_GERMLINE_RESOURCE' { + ext.when = { !params.germline_resource_tbi && params.germline_resource && params.tools && params.tools.split(',').contains('mutect2') } + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference/germline_resource" }, + pattern: "*vcf.gz.tbi" + ] + } + + withName: 'TABIX_KNOWN_INDELS' { + ext.when = { !params.known_indels_tbi && params.known_indels && (params.step == 'alignment' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller'))) ) } + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference/known_indels" }, + pattern: "*vcf.gz.tbi" + ] + } + + withName: 'TABIX_KNOWN_SNPS' { + ext.when = { !params.known_snps_tbi && params.known_snps && (params.step == 'alignment' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller'))) ) } + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference/known_snps" }, + pattern: "*vcf.gz.tbi" + ] + } + + withName: 
'TABIX_PON' { + ext.when = { !params.pon_tbi && params.pon && params.tools && params.tools.split(',').contains('mutect2') } + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference/pon" }, + pattern: "*vcf.gz.tbi" + ] + } + + withName: 'UNZIP_ALLELES|UNZIP_LOCI|UNZIP_GC|UNZIP_RT' { + ext.when = { params.tools && params.tools.split(',').contains('ascat')} + publishDir = [ + enabled: false + ] + } + + withName: 'UNTAR_CHR_DIR' { + ext.when = { params.tools && params.tools.split(',').contains('controlfreec')} + } +} + + + + + + + diff --git a/conf/modules/prepare_intervals.config b/conf/modules/prepare_intervals.config new file mode 100644 index 0000000..330db31 --- /dev/null +++ b/conf/modules/prepare_intervals.config @@ -0,0 +1,45 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// PREPARE INTERVALS + +process { + + withName: 'CREATE_INTERVALS_BED' { + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference/intervals" }, + pattern: "*bed" + ] + } + + withName: 'GATK4_INTERVALLISTTOBED' { + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference/intervals" }, + pattern: "*bed" + ] + } + + withName: 'TABIX_BGZIPTABIX_INTERVAL_SPLIT' { + ext.prefix = {"${meta.id}"} + publishDir = [ + enabled: (params.save_reference || params.build_only_index), + mode: params.publish_dir_mode, + path: { "${params.outdir}/Reference/intervals" }, + pattern: "*bed.gz" + ] + } +} diff --git a/conf/modules/prepare_recalibration.config b/conf/modules/prepare_recalibration.config new file mode 100644 index 0000000..04590b3 --- /dev/null +++ b/conf/modules/prepare_recalibration.config @@ -0,0 +1,39 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// PREPARE_RECALIBRATION + +process { + + withName: 'GATK4_BASERECALIBRATOR|GATK4_BASERECALIBRATOR_SPARK' { + ext.prefix = { meta.num_intervals <= 1 ? 
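// one recalibration table per sample when there is a single interval; otherwise one table per interval, gathered afterwards by GATK4_GATHERBQSRREPORTS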
"${meta.id}.recal" : "${meta.id}_${intervals.simpleName}.recal" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/" }, + pattern: "*table", + saveAs: { meta.num_intervals > 1 ? null : "recal_table/${meta.id}/${it}" } + ] + } + + withName: 'GATK4_GATHERBQSRREPORTS' { + ext.prefix = {"${meta.id}.recal"} + ext.when = { meta.num_intervals > 1 } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/recal_table/${meta.id}/" }, + pattern: "*table", + ] + } +} + diff --git a/conf/modules/recalibrate.config b/conf/modules/recalibrate.config new file mode 100644 index 0000000..9ea9f94 --- /dev/null +++ b/conf/modules/recalibrate.config @@ -0,0 +1,61 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// RECALIBRATE + +process { + + withName: 'GATK4_APPLYBQSR|GATK4_APPLYBQSR_SPARK' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.recal" : "${meta.id}_${intervals.simpleName}.recal" } + publishDir = [ + enabled: !params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/" }, + pattern: "*cram", + saveAs: { meta.num_intervals > 1 ? 
null : "recalibrated/${meta.id}/${it}" } + ] + } + + if ((params.step == 'alignment' || params.step == 'markduplicates'|| params.step == 'prepare_recalibration'|| params.step == 'recalibrate') && (!(params.skip_tools && params.skip_tools.split(',').contains('baserecalibrator')))) { + withName: 'NFCORE_HEISENBIO:HEISENBIO:(BAM_APPLYBQSR|BAM_APPLYBQSR_SPARK):CRAM_MERGE_INDEX_SAMTOOLS:MERGE_CRAM' { + ext.prefix = { "${meta.id}.recal" } + ext.when = { meta.num_intervals > 1 } + publishDir = [ + enabled: !params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/recalibrated/${meta.id}/" }, + pattern: "*cram" + ] + } + + withName: 'NFCORE_HEISENBIO:HEISENBIO:(BAM_APPLYBQSR|BAM_APPLYBQSR_SPARK):CRAM_MERGE_INDEX_SAMTOOLS:INDEX_CRAM' { + publishDir = [ + enabled: !params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/recalibrated/${meta.id}/" }, + pattern: "*{recal.cram,recal.cram.crai}" + ] + } + } + + withName: 'CRAM_TO_BAM_RECAL' { + ext.prefix = { "${meta.id}.recal" } + ext.when = { params.save_output_as_bam} + publishDir = [ + enabled: params.save_output_as_bam, + mode: params.publish_dir_mode, + path: { "${params.outdir}/Alignment/recalibrated/${meta.id}/" }, + pattern: "*{recal.bam,recal.bam.bai}" + ] + } +} diff --git a/conf/modules/structural_variants.config b/conf/modules/structural_variants.config new file mode 100644 index 0000000..4fc1727 --- /dev/null +++ b/conf/modules/structural_variants.config @@ -0,0 +1,44 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. 
+---------------------------------------------------------------------------------------- +*/ + +// SV calling + +process { + + withName: 'NFCORE_HEISENBIO:HEISENBIO:BAM_SVCALLING_SVABA:SVABA' { + ext.prefix = { "${meta.id}.svaba" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/SV_calling/SVABA/${meta.id}/" }, + pattern: "*{vcf.gz,txt.gz,vcf*,bam}" + ] + } + + withName: 'NFCORE_HEISENBIO:HEISENBIO:BAM_SVCALLING_GRIDSS:GRIDSS_GRIDSS' { + ext.prefix = { "${meta.id}.gridss" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/SV_calling/GRIDSS/${meta.id}/" }, + pattern: "*{vcf.gz,txt.gz,vcf*,bam}" + ] + } + + withName: 'NFCORE_HEISENBIO:HEISENBIO:BAM_SVCALLING_GRIDSS_SOMATIC:GRIDSS_SOMATIC' { + ext.prefix = { "${meta.id}.high_confidence_somatic" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/SV_calling/GRIDSS/${meta.id}/" }, + pattern: "*{vcf.bgz, vcf.bgz.tbi}" + ] + } +} diff --git a/conf/modules/trimming.config b/conf/modules/trimming.config new file mode 100644 index 0000000..1e54501 --- /dev/null +++ b/conf/modules/trimming.config @@ -0,0 +1,42 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// TRIMMING + +process { + + withName: 'FASTP' { + ext.args = [ "", + !params.trim_fastq ? "--disable_adapter_trimming" : "", // Disable adapter trimming + params.clip_r1 > 0 ? "--trim_front1 ${params.clip_r1}" : "", // Remove bp from the 5' end of read 1 + params.clip_r2 > 0 ? "--trim_front2 ${params.clip_r2}" : "", // Remove bp from the 5' end of read 2 + params.three_prime_clip_r1 > 0 ? "--trim_tail1 ${params.three_prime_clip_r1}" : "", // Remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed + params.three_prime_clip_r2 > 0 ? "--trim_tail2 ${params.three_prime_clip_r2}" : "", // Remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed + params.trim_nextseq ? "--trim_poly_g" : "", // Apply the --nextseq=X option, to trim based on quality after removing poly-G tails + params.split_fastq > 0 ? 
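// fastp's --split_by_lines expects a line count, so the requested number of reads per chunk is multiplied by 4 (one FASTQ record = 4 lines)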
"--split_by_lines ${params.split_fastq * 4}" : "" + ].join(" ").trim() + publishDir = [ + [ + path: { "${params.outdir}/QCreports/fastp/${meta.sample}" }, + mode: params.publish_dir_mode, + pattern: "*.{html,json,log}" + ], + [ + enabled: params.save_trimmed || params.save_split_fastqs, + path: { "${params.outdir}/Alignment/fastp/${meta.sample}/" }, + mode: params.publish_dir_mode, + pattern: "*.fastp.fastq.gz" + ] + ] + } +} diff --git a/lib/WorkflowHeisenbio.groovy b/lib/WorkflowHeisenbio.groovy index ac97d55..69975eb 100755 --- a/lib/WorkflowHeisenbio.groovy +++ b/lib/WorkflowHeisenbio.groovy @@ -119,4 +119,34 @@ class WorkflowHeisenbio { Nextflow.error(error_string) } } + + public static String retrieveInput(params, log){ + def input = null + if (!params.input && !params.build_only_index) { + switch (params.step) { + case 'alignment': Nextflow.error("Can't start with step $params.step without samplesheet") + break + case 'markduplicates': log.warn("Using file ${params.outdir}/csv/mapped.csv"); + input = params.outdir + "/csv/mapped.csv" + break + case 'prepare_recalibration': log.warn("Using file ${params.outdir}/csv/markduplicates_no_table.csv"); + input = params.outdir + "/csv/markduplicates_no_table.csv" + break + case 'recalibrate': log.warn("Using file ${params.outdir}/csv/markduplicates.csv"); + input = params.outdir + "/csv/markduplicates.csv" + break + case 'variant_calling': log.warn("Using file ${params.outdir}/csv/recalibrated.csv"); + input = params.outdir + "/csv/recalibrated.csv" + break + // case 'controlfreec': csv_file = file("${params.outdir}/variant_calling/csv/control-freec_mpileup.csv", checkIfExists: true); break + case 'annotate': log.warn("Using file ${params.outdir}/csv/variantcalled.csv"); + input = params.outdir + "/csv/variantcalled.csv" + break + default: log.warn("Please provide an input samplesheet to the pipeline e.g. 
'--input samplesheet.csv'") + Nextflow.error("Unknown step $params.step") + } + } + return input + + } } diff --git a/main.nf b/main.nf index 541c54e..eaaa7d4 100644 --- a/main.nf +++ b/main.nf @@ -3,7 +3,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nf-core/heisenbio ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Github : https://github.com/nf-core/heisenbio + Github : https://tanubrata/nf-core/heisenbio Website: https://nf-co.re/heisenbio Slack : https://nfcore.slack.com/channels/heisenbio ---------------------------------------------------------------------------------------- @@ -17,7 +17,52 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +//params.ascat_alleles = WorkflowMain.getGenomeAttribute(params, 'ascat_alleles') +//params.ascat_genome = WorkflowMain.getGenomeAttribute(params, 'ascat_genome') +//params.ascat_loci = WorkflowMain.getGenomeAttribute(params, 'ascat_loci') +//params.ascat_loci_gc = WorkflowMain.getGenomeAttribute(params, 'ascat_loci_gc') +//params.ascat_loci_rt = WorkflowMain.getGenomeAttribute(params, 'ascat_loci_rt') +params.bwa = WorkflowMain.getGenomeAttribute(params, 'bwa') +params.bwamem2 = WorkflowMain.getGenomeAttribute(params, 'bwamem2') +params.cf_chrom_len = WorkflowMain.getGenomeAttribute(params, 'cf_chrom_len') +params.chr_dir = WorkflowMain.getGenomeAttribute(params, 'chr_dir') +params.dbsnp = WorkflowMain.getGenomeAttribute(params, 'dbsnp') +params.dbsnp_tbi = WorkflowMain.getGenomeAttribute(params, 'dbsnp_tbi') +params.dbsnp_vqsr = WorkflowMain.getGenomeAttribute(params, 'dbsnp_vqsr') +params.dict = WorkflowMain.getGenomeAttribute(params, 'dict') +//params.dragmap = WorkflowMain.getGenomeAttribute(params, 'dragmap') +params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +params.fasta_fai = WorkflowMain.getGenomeAttribute(params, 'fasta_fai') +//params.germline_resource = WorkflowMain.getGenomeAttribute(params, 'germline_resource') +//params.germline_resource_tbi = WorkflowMain.getGenomeAttribute(params, 'germline_resource_tbi') +params.intervals = WorkflowMain.getGenomeAttribute(params, 'intervals') +params.known_snps = WorkflowMain.getGenomeAttribute(params, 'known_snps') +params.known_snps_tbi = WorkflowMain.getGenomeAttribute(params, 'known_snps_tbi') +params.known_snps_vqsr = WorkflowMain.getGenomeAttribute(params, 'known_snps_vqsr') +params.known_indels = WorkflowMain.getGenomeAttribute(params, 'known_indels') +params.known_indels_tbi = WorkflowMain.getGenomeAttribute(params, 'known_indels_tbi') +params.known_indels_vqsr = WorkflowMain.getGenomeAttribute(params, 'known_indels_vqsr') +//params.mappability = WorkflowMain.getGenomeAttribute(params, 'mappability') +//params.pon = WorkflowMain.getGenomeAttribute(params, 'pon') +//params.pon_tbi = WorkflowMain.getGenomeAttribute(params, 'pon_tbi') +//params.snpeff_db = WorkflowMain.getGenomeAttribute(params, 'snpeff_db') +//params.snpeff_genome = WorkflowMain.getGenomeAttribute(params, 'snpeff_genome') +//params.vep_cache_version = WorkflowMain.getGenomeAttribute(params, 'vep_cache_version') +//params.vep_genome = WorkflowMain.getGenomeAttribute(params, 'vep_genome') +//params.vep_species = WorkflowMain.getGenomeAttribute(params, 'vep_species') +params.indel_mask = WorkflowMain.getGenomeAttribute(params, 'indel_mask') +params.germ_sv_db = 
WorkflowMain.getGenomeAttribute(params, 'germ_sv_db') +params.simple_seq_db = WorkflowMain.getGenomeAttribute(params, 'simple_seq_db') +params.blacklist_gridss = WorkflowMain.getGenomeAttribute(params, 'blacklist_gridss') +params.pon_gridss = WorkflowMain.getGenomeAttribute(params, 'pon_gridss') + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ALTERNATIVE INPUT FILE ON RESTART +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params.input_restart = WorkflowHeisenbio.retrieveInput(params, log) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules.json b/modules.json index 4af784b..9405ccf 100644 --- a/modules.json +++ b/modules.json @@ -5,23 +5,246 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bwa/index": { + "branch": "master", + "git_sha": "bfed129da5134b4439b1821c917972570d44d39c", + "installed_by": [ + "modules" + ] + }, + "bwa/mem": { + "branch": "master", + "git_sha": "3dc300ddcaa563c1e3503477557c0e0def6df2ce", + "installed_by": [ + "modules" + ] + }, + "bwamem2/index": { + "branch": "master", + "git_sha": "bfed129da5134b4439b1821c917972570d44d39c", + "installed_by": [ + "modules" + ] + }, + "bwamem2/mem": { + "branch": "master", + "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", + "installed_by": [ + "modules" + ] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "installed_by": [ + "modules" + ] + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "dragmap/align": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": [ + "modules" + ] + }, + "ensemblvep/download": { + "branch": "master", + "git_sha": "9f9e1fc31cb35876922070c0e601ae05abae5cae", + "installed_by": [ + "modules" + ] + }, + "fastp": { + "branch": "master", + "git_sha": "d497a4868ace3302016ea8ed4b395072d5e833cd", + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "gatk4/applybqsr": { + "branch": "master", + "git_sha": "cf8f9ace77aac01caa5c7cb92af5bbda7adb77bd", + "installed_by": [ + "modules" + ] + }, + "gatk4/baserecalibrator": { + "branch": "master", + "git_sha": "cf8f9ace77aac01caa5c7cb92af5bbda7adb77bd", + "installed_by": [ + "modules" + ] + }, + "gatk4/createsequencedictionary": { + "branch": "master", + "git_sha": "cf8f9ace77aac01caa5c7cb92af5bbda7adb77bd", + "installed_by": [ + "modules" + ] + }, + "gatk4/gatherbqsrreports": { + "branch": "master", + "git_sha": "cf8f9ace77aac01caa5c7cb92af5bbda7adb77bd", + "installed_by": [ + "modules" + ] + }, + "gatk4/intervallisttobed": { + "branch": "master", + "git_sha": "cf8f9ace77aac01caa5c7cb92af5bbda7adb77bd", + "installed_by": [ + "modules" + ] + }, + "gatk4/markduplicates": { + "branch": "master", + "git_sha": "cf8f9ace77aac01caa5c7cb92af5bbda7adb77bd", + "installed_by": [ + "modules" + ] + }, + "gridss/gridss": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": [ + "modules" + ] + }, + "mosdepth": { + "branch": "master", + "git_sha": "ebb27711cd5f4de921244bfa81c676504072d31c", + "installed_by": [ + "modules" + ] + }, + "msisensorpro/scan": { + "branch": "master", + 
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "samtools/collatefastq": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": [ + "modules" + ] + }, + "samtools/convert": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": [ + "modules" + ] + }, + "samtools/faidx": { + "branch": "master", + "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", + "installed_by": [ + "modules" + ] + }, + "samtools/index": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": [ + "modules" + ] + }, + "samtools/merge": { + "branch": "master", + "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", + "installed_by": [ + "modules" + ] + }, + "samtools/stats": { + "branch": "master", + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": [ + "modules" + ] + }, + "samtools/view": { + "branch": "master", + "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", + "installed_by": [ + "modules" + ] + }, + "sentieon/bwamem": { + "branch": "master", + "git_sha": "149b4746c6e16ef84f64db5bb245d5b9495fdc3f", + "installed_by": [ + "modules" + ] + }, + "snpeff/download": { + "branch": "master", + "git_sha": "4d584d5cf6ed5f7363a51cdb4b3eb25398e9e537", + "installed_by": [ + "modules" + ] + }, + "svaba": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": [ + "modules" + ] + }, + "tabix/bgziptabix": { + "branch": "master", + "git_sha": "591b71642820933dcb3c954c934b397bd00d8e5e", + "installed_by": [ + "modules" + ] + }, + "tabix/tabix": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": [ + "modules" + ] + }, + "untar": { + "branch": "master", + "git_sha": "d0b4fc03af52a1cc8c6fb4493b921b57352b1dd8", + "installed_by": [ + "modules" + ] + }, + "unzip": { + "branch": "master", + "git_sha": "cf67a6d7d043e2bd6a3099be84c72046fc71508f", + "installed_by": [ + "modules" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/local/add_info_to_vcf/main.nf b/modules/local/add_info_to_vcf/main.nf new file mode 100644 index 0000000..deb413e --- /dev/null +++ b/modules/local/add_info_to_vcf/main.nf @@ -0,0 +1,43 @@ + +process ADD_INFO_TO_VCF { + tag "$meta.id" + + conda "anaconda::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(vcf_gz) + + output: + tuple val(meta), path("*.added_info.vcf"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + input="input.vcf" + output="${vcf_gz.baseName.minus(".vcf")}.added_info.vcf" + zcat $vcf_gz > \$input + ## Add info header lines + grep -E "^##" \$input > \$output + ## Add description of new INFO value + echo '##INFO=' >> \$output + ## Add column header + grep -E "^#CHROM" \$input >> \$output + ## Add SOURCE value to INFO column of variant calls + if grep -Ev "^#" \$input; then + grep -Ev "^#" \$input | awk 'BEGIN{FS=OFS="\t"} { \$8=="." ? 
\$8="SOURCE=$vcf_gz" : \$8=\$8";SOURCE=$vcf_gz"; print }' >> \$output + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ +} + + diff --git a/modules/local/build_intervals/main.nf b/modules/local/build_intervals/main.nf new file mode 100644 index 0000000..29a6b5a --- /dev/null +++ b/modules/local/build_intervals/main.nf @@ -0,0 +1,28 @@ +process BUILD_INTERVALS { + tag "$meta.id" + + conda "anaconda::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(fasta_fai) + + output: + tuple val(meta), path("${fasta_fai.baseName}.bed") , emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fasta_fai} > ${fasta_fai.baseName}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/local/create_intervals_bed/main.nf b/modules/local/create_intervals_bed/main.nf new file mode 100644 index 0000000..269f897 --- /dev/null +++ b/modules/local/create_intervals_bed/main.nf @@ -0,0 +1,76 @@ + +process CREATE_INTERVALS_BED { + tag "$intervals" + + conda "anaconda::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + path(intervals) + + output: + path("*.bed") , emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // If intervals file is in BED format, + // Fifth column is interpreted to contain runtime estimates + // Which is then used to combine short-running jobs + if (intervals.toString().toLowerCase().endsWith("bed")) { + """ + awk -vFS="\t" '{ + t = \$5 # runtime estimate + if (t == "") { + # no runtime estimate in this row, assume default value + t = (\$3 - \$2) / ${params.nucleotides_per_second} + } + if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) { + # start a new chunk + name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3) + chunk = 0 + longest = 0 + } + if (t > longest) + longest = t + chunk += t + print \$0 > name + }' ${intervals} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + } else if (intervals.toString().toLowerCase().endsWith("interval_list")) { + """ + grep -v '^@' ${intervals} | awk -vFS="\t" '{ + name = sprintf("%s_%d-%d", \$1, \$2, \$3); + printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" + }' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + } else { + """ + awk -vFS="[:-]" '{ + name = sprintf("%s_%d-%d", \$1, \$2, \$3); + printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" + }' ${intervals} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + } +} + + diff --git a/modules/local/gridss/gridss/main.nf b/modules/local/gridss/gridss/main.nf new file mode 100644 index 0000000..110f74a --- /dev/null +++ b/modules/local/gridss/gridss/main.nf @@ -0,0 +1,83 @@ 
+
+process GRIDSS_GRIDSS {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::gridss=2.13.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gridss:2.13.2--h270b39a_0':
+        'biocontainers/gridss:2.13.2--h270b39a_0' }"
+
+    input:
+    tuple val(meta), path(normalbam), path(normalbai), path(tumorbam), path(tumorbai) // required: [meta, normal_cram, normal_crai, tumor_cram, tumor_crai]
+    path(fasta)             // required: reference fasta
+    path(fasta_fai)
+    path(bwa_index)         // required: bwa index folder
+    path(blacklist_gridss)  // optional: gridss blacklist bed file based on genome
+
+    output:
+    tuple val(meta), path("*.vcf.gz")       , emit: vcf, optional:true
+    tuple val(meta), path("*.vcf.gz.tbi")   , emit: vcf_index, optional:true
+    tuple val(meta), path("*.assembly.bam") , emit: assembly, optional:true
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '2.13.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    def assembly_bam = "--assembly ${meta.id}.assembly.bam"
+    def bwa = bwa_index ? "cp -s ${bwa_index}/* ." : ""
+    def blacklist = blacklist_gridss ? "--blacklist ${blacklist_gridss}" : ""
+
+    """
+    ${bwa}
+
+    gridss \\
+        --output ${prefix}.vcf.gz \\
+        --reference ${fasta} \\
+        --threads ${task.cpus} \\
+        $assembly_bam \\
+        $blacklist \\
+        --picardoptions VALIDATION_STRINGENCY=LENIENT \\
+        --jvmheap ${task.memory.toGiga() - 1}g \\
+        --otherjvmheap ${task.memory.toGiga() - 1}g \\
+        ${normalbam} \\
+        ${tumorbam}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gridss: ${VERSION}
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '2.13.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+
+    def steps = args.contains("-s ") ? args.split('-s ')[-1].split(" ")[0] :
+        args.contains("--steps ") ? args.split('--steps ')[-1].split(" ")[0] :
+        "all"
+    // touch files that match the declared vcf / vcf_index output globs
+    def vcf = steps.contains("call") || steps.contains("all") ? "touch ${prefix}.vcf.gz ${prefix}.vcf.gz.tbi" : ""
+    def assembly_bam = steps.contains("assembly") || steps.contains("all") ? "touch ${meta.id}.assembly.bam" : ""
+    """
+    ${vcf}
+    ${assembly_bam}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gridss: ${VERSION}
+    END_VERSIONS
+    """
+}
+
+
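The local GRIDSS module above expects matched normal and tumour alignments (plus their indices) in a single tuple, with the reference, BWA index and an optional blacklist supplied as separate inputs. A minimal sketch of how it might be wired into a subworkflow follows; the include path, channel names and the empty-list placeholder for the optional blacklist are assumptions for illustration rather than code from this pipeline:

    include { GRIDSS_GRIDSS } from '../../modules/local/gridss/gridss/main'

    workflow BAM_SV_CALLING_GRIDSS {
        take:
        cram_pair  // channel: [ meta, normal_cram, normal_crai, tumor_cram, tumor_crai ]
        fasta      // path: reference fasta
        fasta_fai  // path: reference fasta index
        bwa_index  // path: directory containing the BWA index

        main:
        GRIDSS_GRIDSS(cram_pair, fasta, fasta_fai, bwa_index, []) // [] stands in for "no blacklist"

        emit:
        vcf      = GRIDSS_GRIDSS.out.vcf
        versions = GRIDSS_GRIDSS.out.versions
    }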
diff --git a/modules/local/gridss/somaticFilter/main.nf b/modules/local/gridss/somaticFilter/main.nf
new file mode 100644
index 0000000..bd1a434
--- /dev/null
+++ b/modules/local/gridss/somaticFilter/main.nf
@@ -0,0 +1,62 @@
+process GRIDSS_SOMATIC {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::gridss=2.13.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gridss:2.13.2--h270b39a_0':
+        'biocontainers/gridss:2.13.2--h270b39a_0' }"
+
+    input:
+    tuple val(meta), path(gridss_output)
+    path(pondir_gridss)
+
+    output:
+    tuple val(meta), path("*high_confidence_somatic.vcf.gz")         , emit: somatic_high_vcf, optional:true
+    tuple val(meta), path("*high_and_low_confidence_somatic.vcf.gz") , emit: somatic_all_vcf, optional:true
+    path "versions.yml"                                              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '2.13.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    def output1 = "${meta.id}.high_confidence_somatic.vcf.gz"
+    def output2 = "${meta.id}.high_and_low_confidence_somatic.vcf.gz"
+    //def pon = pondir_gridss ? "cp -s ${pondir_gridss}/* ." : ""
+    //def scriptDir = "dirname \$(which gridss_somatic_filter)".execute().text.trim()
+    """
+    gridss_somatic_filter \\
+        --pondir ${pondir_gridss} \\
+        --input ${gridss_output} \\
+        --output $output1 \\
+        --fulloutput $output2 \\
+        -n 1 \\
+        -t 2
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gridss: ${VERSION}
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '2.13.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    """
+    touch ${prefix}.high_confidence_somatic.vcf.gz
+    touch ${prefix}.high_and_low_confidence_somatic.vcf.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gridss: ${VERSION}
+    END_VERSIONS
+    """
+}
+
diff --git a/modules/local/svaba/main.nf b/modules/local/svaba/main.nf
new file mode 100644
index 0000000..cba2c4f
--- /dev/null
+++ b/modules/local/svaba/main.nf
@@ -0,0 +1,89 @@
+
+process SVABA {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::svaba=1.1.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/svaba:1.1.0--h7d7f7ad_2': + 'biocontainers/svaba:1.1.0--h7d7f7ad_2' }" + + + input: + tuple val(meta), path(tumorbam), path(tumorbai), path(normalbam), path(normalbai) + path fasta + path fasta_fai + path bwa_index + path dbsnp + path dbsnp_tbi + path indel_mask + path germ_sv_db + path simple_seq_db + val error_rate + + output: + tuple val(meta), path("*.svaba.sv.vcf.gz") , emit: sv, optional: true + tuple val(meta), path("*.svaba.indel.vcf.gz") , emit: indel, optional: true + tuple val(meta), path("*.svaba.germline.indel.vcf.gz") , emit: germ_indel, optional: true + tuple val(meta), path("*.svaba.germline.sv.vcf.gz") , emit: germ_sv, optional: true + tuple val(meta), path("*.svaba.somatic.indel.vcf.gz") , emit: som_indel, optional: true + tuple val(meta), path("*.svaba.somatic.sv.vcf.gz") , emit: som_sv, optional: true + tuple val(meta), path("*.svaba.unfiltered.sv.vcf.gz") , emit: unfiltered_sv, optional: true + tuple val(meta), path("*.svaba.unfiltered.indel.vcf.gz") , emit: unfiltered_indel, optional: true + tuple val(meta), path("*.svaba.unfiltered.germline.indel.vcf.gz") , emit: unfiltered_germ_indel, optional: true + tuple val(meta), path("*.svaba.unfiltered.germline.sv.vcf.gz") , emit: unfiltered_germ_sv, optional: true + tuple val(meta), path("*.svaba.unfiltered.somatic.indel.vcf.gz") , emit: unfiltered_som_indel, optional: true + tuple val(meta), path("*.svaba.unfiltered.somatic.sv.vcf.gz") , emit: unfiltered_som_sv, optional: true + tuple val(meta), path("*.bps.txt.gz") , emit: raw_calls + tuple val(meta), path("*.alignments.txt.gz") , emit: ascii_alignments, optional:true + tuple val(meta), path("*.discordants.txt.gz") , emit: discordants, optional: true + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bamlist = normalbam ? "-t ${tumorbam} -n ${normalbam}" : "-t ${tumorbam}" + def dbsnp = dbsnp ? "--dbsnp-vcf ${dbsnp}" : "" + def bwa = bwa_index ? "cp -s ${bwa_index}/* ." : "" + def indel_mask = indel_mask ? "--blacklist ${indel_mask}" : "" + def flags = germ_sv_db ? "--germline-sv-database ${germ_sv_db} --simple-seq-database ${simple_seq_db}" : "" + def error_rate = error_rate ? "--error-rate ${error_rate}" : "" + + """ + ${bwa} + + svaba \\ + run \\ + $bamlist \\ + --threads $task.cpus \\ + $dbsnp \\ + $indel_mask \\ + $error_rate \\ + $flags \\ + --id-string $meta.id \\ + --reference-genome $fasta \\ + --g-zip \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + svaba: \$(echo \$(svaba --version 2>&1) | sed 's/[^0-9.]*\\([0-9.]*\\).*/\\1/' ) + END_VERSIONS + """ + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bps.txt.gz + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + svaba: \$(echo \$(svaba --version 2>&1) | sed 's/[^0-9.]*\\([0-9.]*\\).*/\\1/' ) + END_VERSIONS + """ + +} \ No newline at end of file diff --git a/modules/nf-core/bwa/index/main.nf b/modules/nf-core/bwa/index/main.nf new file mode 100644 index 0000000..c30d194 --- /dev/null +++ b/modules/nf-core/bwa/index/main.nf @@ -0,0 +1,53 @@ +process BWA_INDEX { + tag "$fasta" + label 'process_single' + + conda "bioconda::bwa=0.7.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7' : + 'biocontainers/bwa:0.7.17--hed695b0_7' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path(bwa) , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${fasta.baseName}" + def args = task.ext.args ?: '' + """ + mkdir bwa + bwa \\ + index \\ + $args \\ + -p bwa/${prefix} \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${fasta.baseName}" + """ + mkdir bwa + + touch bwa/${prefix}.amb + touch bwa/${prefix}.ann + touch bwa/${prefix}.bwt + touch bwa/${prefix}.pac + touch bwa/${prefix}.sa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwa/index/meta.yml b/modules/nf-core/bwa/index/meta.yml new file mode 100644 index 0000000..2c6cfcd --- /dev/null +++ b/modules/nf-core/bwa/index/meta.yml @@ -0,0 +1,42 @@ +name: bwa_index +description: Create BWA index for reference genome +keywords: + - index + - fasta + - genome + - reference +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: http://www.htslib.org/doc/samtools.html + arxiv: arXiv:1303.3997 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input genome fasta file +output: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - index: + type: file + description: BWA genome index files + pattern: "*.{amb,ann,bwt,pac,sa}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@maxulysse" diff --git a/modules/nf-core/bwa/mem/main.nf b/modules/nf-core/bwa/mem/main.nf new file mode 100644 index 0000000..8ba99df --- /dev/null +++ b/modules/nf-core/bwa/mem/main.nf @@ -0,0 +1,55 @@ +process BWA_MEM { + tag "$meta.id" + label 'process_high' + + conda "bioconda::bwa=0.7.17 bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' : + 'biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + val sort_bam + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def samtools_command = sort_bam ? 
'sort' : 'view' + """ + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` + + bwa mem \\ + $args \\ + -t $task.cpus \\ + \$INDEX \\ + $reads \\ + | samtools $samtools_command $args2 --threads $task.cpus -o ${prefix}.bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwa/mem/meta.yml b/modules/nf-core/bwa/mem/meta.yml new file mode 100644 index 0000000..62357bf --- /dev/null +++ b/modules/nf-core/bwa/mem/meta.yml @@ -0,0 +1,55 @@ +name: bwa_mem +description: Performs fastq alignment to a fasta reference using BWA +keywords: + - mem + - bwa + - alignment + - map + - fastq + - bam + - sam +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: http://www.htslib.org/doc/samtools.html + arxiv: arXiv:1303.3997 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - index: + type: file + description: BWA genome index files + pattern: "Directory containing BWA index *.{amb,ann,bwt,pac,sa}" + - sort_bam: + type: boolean + description: use samtools sort (true) or samtools view (false) + pattern: "true or false" +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@jeremy1805" diff --git a/modules/nf-core/bwamem2/index/main.nf b/modules/nf-core/bwamem2/index/main.nf new file mode 100644 index 0000000..244e100 --- /dev/null +++ b/modules/nf-core/bwamem2/index/main.nf @@ -0,0 +1,52 @@ +process BWAMEM2_INDEX { + tag "$fasta" + label 'process_single' + + conda "bioconda::bwa-mem2=2.2.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bwa-mem2:2.2.1--he513fc3_0' : + 'biocontainers/bwa-mem2:2.2.1--he513fc3_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("bwamem2"), emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${fasta}" + def args = task.ext.args ?: '' + """ + mkdir bwamem2 + bwa-mem2 \\ + index \\ + $args \\ + $fasta -p bwamem2/${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${fasta}" + + """ + mkdir bwamem2 + touch bwamem2/${prefix}.0123 + touch bwamem2/${prefix}.ann + touch bwamem2/${prefix}.pac + touch bwamem2/${prefix}.amb + touch bwamem2/${prefix}.bwt.2bit.64 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwamem2/index/meta.yml b/modules/nf-core/bwamem2/index/meta.yml new file mode 100644 index 0000000..40c26c3 --- /dev/null +++ b/modules/nf-core/bwamem2/index/meta.yml @@ -0,0 +1,40 @@ +name: bwamem2_index +description: Create BWA-mem2 index for reference genome +keywords: + - index + - fasta + - genome + - reference +tools: + - bwamem2: + description: | + BWA-mem2 is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: https://github.com/bwa-mem2/bwa-mem2 + documentation: https://github.com/bwa-mem2/bwa-mem2#usage + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input genome fasta file +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - index: + type: file + description: BWA genome index files + pattern: "*.{0123,amb,ann,bwt.2bit.64,pac}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" diff --git a/modules/nf-core/bwamem2/mem/main.nf b/modules/nf-core/bwamem2/mem/main.nf new file mode 100644 index 0000000..d427dea --- /dev/null +++ b/modules/nf-core/bwamem2/mem/main.nf @@ -0,0 +1,55 @@ +process BWAMEM2_MEM { + tag "$meta.id" + label 'process_high' + + conda "bioconda::bwa-mem2=2.2.1 bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0' : + 'biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + val sort_bam + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def samtools_command = sort_bam ? 
'sort' : 'view' + """ + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` + + bwa-mem2 \\ + mem \\ + $args \\ + -t $task.cpus \\ + \$INDEX \\ + $reads \\ + | samtools $samtools_command $args2 -@ $task.cpus -o ${prefix}.bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwamem2/mem/meta.yml b/modules/nf-core/bwamem2/mem/meta.yml new file mode 100644 index 0000000..bc3dfcd --- /dev/null +++ b/modules/nf-core/bwamem2/mem/meta.yml @@ -0,0 +1,59 @@ +name: bwamem2_mem +description: Performs fastq alignment to a fasta reference using BWA +keywords: + - mem + - bwa + - alignment + - map + - fastq + - bam + - sam +tools: + - bwa: + description: | + BWA-mem2 is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: https://github.com/bwa-mem2/bwa-mem2 + documentation: http://www.htslib.org/doc/samtools.html + arxiv: arXiv:1303.3997 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - meta2: + type: map + description: | + Groovy Map containing reference/index information + e.g. [ id:'test' ] + - index: + type: file + description: BWA genome index files + pattern: "Directory containing BWA index *.{0132,amb,ann,bwt.2bit.64,pac}" + - sort_bam: + type: boolean + description: use samtools sort (true) or samtools view (false) + pattern: "true or false" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf new file mode 100644 index 0000000..5021e6f --- /dev/null +++ b/modules/nf-core/cat/fastq/main.nf @@ -0,0 +1,80 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? 
reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size > 1) { + """ + touch ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + """ + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + +} diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml new file mode 100644 index 0000000..8a39e30 --- /dev/null +++ b/modules/nf-core/cat/fastq/meta.yml @@ -0,0 +1,40 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - cat + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/dragmap/align/main.nf b/modules/nf-core/dragmap/align/main.nf new file mode 100644 index 0000000..6221fde --- /dev/null +++ b/modules/nf-core/dragmap/align/main.nf @@ -0,0 +1,46 @@ +process DRAGMAP_ALIGN { + tag "$meta.id" + label 'process_high' + + conda "bioconda::dragmap=1.2.1 bioconda::samtools=1.15.1 conda-forge::pigz=2.3.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-580d344d9d4a496cd403932da8765f9e0187774d:5ebebbc128cd624282eaa37d2c7fe01505a91a69-0': + 'biocontainers/mulled-v2-580d344d9d4a496cd403932da8765f9e0187774d:5ebebbc128cd624282eaa37d2c7fe01505a91a69-0' }" + + input: + tuple val(meta) , path(reads) + tuple val(meta2), path(hashmap) + val sort_bam + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path('*.log'), emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reads_command = meta.single_end ? "-1 $reads" : "-1 ${reads[0]} -2 ${reads[1]}" + def samtools_command = sort_bam ? 'sort' : 'view' + + """ + dragen-os \\ + -r $hashmap \\ + $args \\ + --num-threads $task.cpus \\ + $reads_command \\ + 2> ${prefix}.dragmap.log \\ + | samtools $samtools_command $args2 --threads $task.cpus -o ${prefix}.bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dragmap: \$(echo \$(dragen-os --version 2>&1)) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/dragmap/align/meta.yml b/modules/nf-core/dragmap/align/meta.yml new file mode 100644 index 0000000..763e005 --- /dev/null +++ b/modules/nf-core/dragmap/align/meta.yml @@ -0,0 +1,47 @@ +name: dragmap_align +description: Performs fastq alignment to a reference using DRAGMAP +keywords: + - alignment + - map + - fastq + - bam + - sam +tools: + - dragmap: + description: Dragmap is the Dragen mapper/aligner Open Source Software. + homepage: https://github.com/Illumina/dragmap + documentation: https://github.com/Illumina/dragmap + tool_dev_url: https://github.com/Illumina/dragmap#basic-command-line-usage + + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test', single_end:false ] + - hashmap: + type: file + description: DRAGMAP hash table + pattern: "Directory containing DRAGMAP hash table *.{cmp,.bin,.txt}" +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Emiller88" diff --git a/modules/nf-core/ensemblvep/download/main.nf b/modules/nf-core/ensemblvep/download/main.nf new file mode 100644 index 0000000..4873b91 --- /dev/null +++ b/modules/nf-core/ensemblvep/download/main.nf @@ -0,0 +1,45 @@ +process ENSEMBLVEP_DOWNLOAD { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::ensembl-vep=110.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ensembl-vep:110.0--pl5321h2a3209d_0' : + 'biocontainers/ensembl-vep:110.0--pl5321h2a3209d_0' }" + + input: + tuple val(meta), val(assembly), val(species), val(cache_version) + + output: + tuple val(meta), path("vep_cache"), emit: cache + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + vep_install \\ + --CACHEDIR vep_cache \\ + --SPECIES $species \\ + --ASSEMBLY $assembly \\ + --CACHE_VERSION $cache_version \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ + + stub: + """ + mkdir vep_cache + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/ensemblvep/download/meta.yml b/modules/nf-core/ensemblvep/download/meta.yml new file mode 100644 index 0000000..acb337c --- /dev/null +++ b/modules/nf-core/ensemblvep/download/meta.yml @@ -0,0 +1,43 @@ +name: ENSEMBLVEP_DOWNLOAD +description: Ensembl Variant Effect Predictor (VEP). The cache downloading options are controlled through `task.ext.args`. +keywords: + - annotation + - cache + - download +tools: + - ensemblvep: + description: | + VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs + or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. + homepage: https://www.ensembl.org/info/docs/tools/vep/index.html + documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - assembly: + type: string + description: | + Genome assembly + - species: + type: string + description: | + Specie + - cache_version: + type: string + description: | + cache version +output: + - cache: + type: file + description: cache + pattern: "*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf new file mode 100644 index 0000000..831b7f1 --- /dev/null +++ b/modules/nf-core/fastp/main.nf @@ -0,0 +1,102 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::fastp=0.23.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : + 'biocontainers/fastp:0.23.4--h5f740d0_0' }" + + input: + tuple val(meta), path(reads) + path adapter_fasta + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def adapter_list = adapter_fasta ? 
"--adapter_fasta ${adapter_fasta}" : "" + def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + // Added soft-links to original fastqs for consistent naming in MultiQC + // Use single ended for interleaved. Add --interleaved_in in config. + if ( task.ext.args?.contains('--interleaved_in') ) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --stdout \\ + --in1 ${prefix}.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log \\ + | gzip -c > ${prefix}.fastp.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else if (meta.single_end) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.fastp.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.fastp.fastq.gz \\ + --out2 ${prefix}_2.fastp.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml new file mode 100644 index 0000000..197ea7c --- /dev/null +++ b/modules/nf-core/fastp/meta.yml @@ -0,0 +1,73 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: 10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. If you wish to run interleaved paired-end data, supply as single-end data + but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module. + - adapter_fasta: + type: file + description: File in FASTA format containing possible adapters to remove. 
+ pattern: "*.{fasta,fna,fas,fa}" + - save_trimmed_fail: + type: boolean + description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` + - save_merged: + type: boolean + description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified/unmerged fastq reads + pattern: "*fastp.fastq.gz" + - json: + type: file + description: Results in JSON format + pattern: "*.json" + - html: + type: file + description: Results in HTML format + pattern: "*.html" + - log: + type: file + description: fastq log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads_fail: + type: file + description: Reads the failed the preprocessing + pattern: "*fail.fastq.gz" + - reads_merged: + type: file + description: Reads that were successfully merged + pattern: "*.{merged.fastq.gz}" +authors: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/gatk4/applybqsr/main.nf b/modules/nf-core/gatk4/applybqsr/main.nf new file mode 100644 index 0000000..e5e6bf9 --- /dev/null +++ b/modules/nf-core/gatk4/applybqsr/main.nf @@ -0,0 +1,51 @@ +process GATK4_APPLYBQSR { + tag "$meta.id" + label 'process_low' + + conda "bioconda::gatk4=4.4.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(input), path(input_index), path(bqsr_table), path(intervals) + path fasta + path fai + path dict + + output: + tuple val(meta), path("*.bam") , emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def interval_command = intervals ? "--intervals $intervals" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK ApplyBQSR] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + ApplyBQSR \\ + --input $input \\ + --output ${prefix}.${input.getExtension()} \\ + --reference $fasta \\ + --bqsr-recal-file $bqsr_table \\ + $interval_command \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/applybqsr/meta.yml b/modules/nf-core/gatk4/applybqsr/meta.yml new file mode 100644 index 0000000..2085fa9 --- /dev/null +++ b/modules/nf-core/gatk4/applybqsr/meta.yml @@ -0,0 +1,74 @@ +name: gatk4_applybqsr +description: Apply base quality score recalibration (BQSR) to a bam file +keywords: + - bam + - base quality score recalibration + - bqsr + - cram + - gatk4 +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. 
Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - bqsr_table: + type: file + description: Recalibration table from gatk4_baserecalibrator + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Recalibrated BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Recalibrated CRAM file + pattern: "*.{cram}" + +authors: + - "@yocra3" + - "@FriederikeHanssen" diff --git a/modules/nf-core/gatk4/baserecalibrator/main.nf b/modules/nf-core/gatk4/baserecalibrator/main.nf new file mode 100644 index 0000000..5375289 --- /dev/null +++ b/modules/nf-core/gatk4/baserecalibrator/main.nf @@ -0,0 +1,53 @@ +process GATK4_BASERECALIBRATOR { + tag "$meta.id" + label 'process_low' + + conda "bioconda::gatk4=4.4.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(input), path(input_index), path(intervals) + path fasta + path fai + path dict + path known_sites + path known_sites_tbi + + output: + tuple val(meta), path("*.table"), emit: table + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def interval_command = intervals ? "--intervals $intervals" : "" + def sites_command = known_sites.collect{"--known-sites $it"}.join(' ') + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK BaseRecalibrator] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + BaseRecalibrator \\ + --input $input \\ + --output ${prefix}.table \\ + --reference $fasta \\ + $interval_command \\ + $sites_command \\ + --tmp-dir . 
\\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/baserecalibrator/meta.yml b/modules/nf-core/gatk4/baserecalibrator/meta.yml new file mode 100644 index 0000000..db4fecf --- /dev/null +++ b/modules/nf-core/gatk4/baserecalibrator/meta.yml @@ -0,0 +1,76 @@ +name: gatk4_baserecalibrator +description: Generate recalibration table for Base Quality Score Recalibration (BQSR) +keywords: + - base quality score recalibration + - table + - bqsr + - gatk4 + - sort +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - known_sites: + type: file + description: VCF files with known sites for indels / snps (optional) + pattern: "*.vcf.gz" + - known_sites_tbi: + type: file + description: Tabix index of the known_sites (optional) + pattern: "*.vcf.gz.tbi" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - table: + type: file + description: Recalibration table from BaseRecalibrator + pattern: "*.{table}" + +authors: + - "@yocra3" + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/gatk4/createsequencedictionary/main.nf b/modules/nf-core/gatk4/createsequencedictionary/main.nf new file mode 100644 index 0000000..3e4efdd --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/main.nf @@ -0,0 +1,52 @@ +process GATK4_CREATESEQUENCEDICTIONARY { + tag "$fasta" + label 'process_medium' + + conda "bioconda::gatk4=4.4.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path('*.dict') , emit: dict + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + def avail_mem = 6144 + if (!task.memory) { + log.info '[GATK CreateSequenceDictionary] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' 
+ } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + CreateSequenceDictionary \\ + --REFERENCE $fasta \\ + --URI $fasta \\ + --TMP_DIR . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${fasta.baseName}.dict + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/createsequencedictionary/meta.yml b/modules/nf-core/gatk4/createsequencedictionary/meta.yml new file mode 100644 index 0000000..9b8b8c8 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/meta.yml @@ -0,0 +1,40 @@ +name: gatk4_createsequencedictionary +description: Creates a sequence dictionary for a reference sequence +keywords: + - createsequencedictionary + - dictionary + - fasta + - gatk4 +tools: + - gatk: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Input fasta file + pattern: "*.{fasta,fa}" +output: + - dict: + type: file + description: gatk dictionary file + pattern: "*.{dict}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@ramprasadn" diff --git a/modules/nf-core/gatk4/gatherbqsrreports/main.nf b/modules/nf-core/gatk4/gatherbqsrreports/main.nf new file mode 100644 index 0000000..3eeca5a --- /dev/null +++ b/modules/nf-core/gatk4/gatherbqsrreports/main.nf @@ -0,0 +1,44 @@ +process GATK4_GATHERBQSRREPORTS { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::gatk4=4.4.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(table) + + output: + tuple val(meta), path("*.table"), emit: table + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_list = table.collect{"--input $it"}.join(' ') + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK GatherBQSRReports] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + GatherBQSRReports \\ + $input_list \\ + --output ${prefix}.table \\ + --tmp-dir . 
\\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/gatherbqsrreports/meta.yml b/modules/nf-core/gatk4/gatherbqsrreports/meta.yml new file mode 100644 index 0000000..d9faf09 --- /dev/null +++ b/modules/nf-core/gatk4/gatherbqsrreports/meta.yml @@ -0,0 +1,44 @@ +name: gatk4_gatherbqsrreports +description: Gathers scattered BQSR recalibration reports into a single file +keywords: + - base quality score recalibration + - bqsr + - gatherbqsrreports + - gatk4 +tools: + - gatk4: + description: Genome Analysis Toolkit (GATK4) + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us + tool_dev_url: https://github.com/broadinstitute/gatk + doi: "10.1158/1538-7445.AM2017-3590" + licence: ["BSD-3-clause"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - table: + type: file + description: File(s) containing BQSR table(s) + pattern: "*.table" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - table: + type: file + description: File containing joined BQSR table + pattern: "*.table" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@FriederikeHanssen" diff --git a/modules/nf-core/gatk4/intervallisttobed/main.nf b/modules/nf-core/gatk4/intervallisttobed/main.nf new file mode 100644 index 0000000..2537f0a --- /dev/null +++ b/modules/nf-core/gatk4/intervallisttobed/main.nf @@ -0,0 +1,43 @@ +process GATK4_INTERVALLISTTOBED { + tag "$meta.id" + label 'process_low' + + conda "bioconda::gatk4=4.4.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(intervals) + + output: + tuple val(meta), path("*.bed"), emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK IntervalListToBed] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + IntervalListToBed \\ + --INPUT $intervals \\ + --OUTPUT ${prefix}.bed \\ + --TMP_DIR . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/intervallisttobed/meta.yml b/modules/nf-core/gatk4/intervallisttobed/meta.yml new file mode 100644 index 0000000..df3705a --- /dev/null +++ b/modules/nf-core/gatk4/intervallisttobed/meta.yml @@ -0,0 +1,44 @@ +name: gatk4_intervallisttobed +description: Converts an Picard IntervalList file to a BED file. 
+keywords: + - bed + - conversion + - gatk4 + - interval +tools: + - gatk4: + description: Genome Analysis Toolkit (GATK4) + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + tool_dev_url: https://github.com/broadinstitute/gatk + doi: "10.1158/1538-7445.AM2017-3590" + licence: ["BSD-3-clause"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - interval: + type: file + description: Interval list + pattern: "*.{interval,interval_list}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bed: + type: file + description: BED file + pattern: "*.bed" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@FriederikeHanssen" diff --git a/modules/nf-core/gatk4/markduplicates/main.nf b/modules/nf-core/gatk4/markduplicates/main.nf new file mode 100644 index 0000000..e4c01f9 --- /dev/null +++ b/modules/nf-core/gatk4/markduplicates/main.nf @@ -0,0 +1,68 @@ +process GATK4_MARKDUPLICATES { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::gatk4=4.4.0.0 bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-d9e7bad0f7fbc8f4458d5c3ab7ffaaf0235b59fb:f857e2d6cc88d35580d01cf39e0959a68b83c1d9-0': + 'biocontainers/mulled-v2-d9e7bad0f7fbc8f4458d5c3ab7ffaaf0235b59fb:f857e2d6cc88d35580d01cf39e0959a68b83c1d9-0' }" + + input: + tuple val(meta), path(bam) + path fasta + path fasta_fai + + output: + tuple val(meta), path("*cram"), emit: cram, optional: true + tuple val(meta), path("*bam"), emit: bam, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.metrics"), emit: metrics + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}.bam" + + // If the extension is CRAM, then change it to BAM + prefix_bam = prefix.tokenize('.')[-1] == 'cram' ? "${prefix.substring(0, prefix.lastIndexOf('.'))}.bam" : prefix + + def input_list = bam.collect{"--INPUT $it"}.join(' ') + def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + + // Using samtools and not Markduplicates to compress to CRAM speeds up computation: + // https://medium.com/@acarroll.dna/looking-at-trade-offs-in-compression-levels-for-genomics-tools-eec2834e8b94 + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + MarkDuplicates \\ + $input_list \\ + --OUTPUT ${prefix_bam} \\ + --METRICS_FILE ${prefix}.metrics \\ + --TMP_DIR . 
\\ + ${reference} \\ + $args + + # If cram files are wished as output, the run samtools for conversion + if [[ ${prefix} == *.cram ]]; then + samtools view -Ch -T ${fasta} -o ${prefix} ${prefix_bam} + rm ${prefix_bam} + samtools index ${prefix} + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/markduplicates/meta.yml b/modules/nf-core/gatk4/markduplicates/meta.yml new file mode 100644 index 0000000..d3e7550 --- /dev/null +++ b/modules/nf-core/gatk4/markduplicates/meta.yml @@ -0,0 +1,73 @@ +name: gatk4_markduplicates +description: This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. +keywords: + - bam + - gatk4 + - markduplicates + - sort +tools: + - gatk4: + description: + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard- + tool_dev_url: https://github.com/broadinstitute/gatk + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM file + pattern: "*.{bam}" + - fasta: + type: file + description: Fasta file + pattern: "*.{fasta}" + - fasta_fai: + type: file + description: Fasta index file + pattern: "*.{fai}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Marked duplicates BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Marked duplicates CRAM file + pattern: "*.{cram}" + - bai: + type: file + description: BAM index file + pattern: "*.{bam.bai}" + - crai: + type: file + description: CRAM index file + pattern: "*.{cram.crai}" + - metrics: + type: file + description: Duplicate metrics file generated by GATK + pattern: "*.{metrics.txt}" + +authors: + - "@ajodeh-juma" + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/gridss/gridss/main.nf b/modules/nf-core/gridss/gridss/main.nf new file mode 100644 index 0000000..f642b83 --- /dev/null +++ b/modules/nf-core/gridss/gridss/main.nf @@ -0,0 +1,69 @@ +process GRIDSS_GRIDSS { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::gridss=2.13.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/gridss:2.13.2--h270b39a_0':
+        'biocontainers/gridss:2.13.2--h270b39a_0' }"
+
+    input:
+    tuple val(meta), path(inputs), path(assembly)
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(fasta_fai)
+    tuple val(meta4), path(bwa_index)
+
+    output:
+    tuple val(meta), path("*.vcf.gz")       , emit: vcf, optional:true
+    tuple val(meta), path("*.assembly.bam") , emit: assembly, optional:true
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '2.13.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+
+    def assembly_bam = assembly ? "--assembly ${assembly}" : ""
+    def bwa = bwa_index ? "cp -s ${bwa_index}/* ." : ""
+
+    """
+    ${bwa}
+
+    gridss \\
+        --output ${prefix}.vcf.gz \\
+        --reference ${fasta} \\
+        --threads ${task.cpus} \\
+        ${assembly_bam} \\
+        --jvmheap ${task.memory.toGiga() - 1}g \\
+        --otherjvmheap ${task.memory.toGiga() - 1}g \\
+        ${inputs}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gridss: ${VERSION}
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '2.13.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+
+    def steps = args.contains("-s ") ? args.split('-s ')[-1].split(" ")[0] :
+                args.contains("--steps ") ? args.split('--steps ')[-1].split(" ")[0] :
+                "all"
+    def vcf = steps.contains("call") || steps.contains("all") ? "touch ${prefix}.vcf.gz" : ""
+    def assembly_bam = steps.contains("assembly") || steps.contains("all") ? "touch ${prefix}.assembly.bam" : ""
+    """
+    ${vcf}
+    ${assembly_bam}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gridss: ${VERSION}
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gridss/gridss/meta.yml b/modules/nf-core/gridss/gridss/meta.yml
new file mode 100644
index 0000000..3ac3842
--- /dev/null
+++ b/modules/nf-core/gridss/gridss/meta.yml
@@ -0,0 +1,64 @@
+name: "gridss_gridss"
+description: GRIDSS is a modular software suite containing tools useful for the detection of genomic rearrangements.
+keywords:
+  - gridss
+  - structural variants
+  - bam
+  - cram
+  - vcf
+tools:
+  - "gridss":
+      description: "GRIDSS: the Genomic Rearrangement IDentification Software Suite"
+
+      documentation: "https://github.com/PapenfussLab/gridss/wiki/GRIDSS-Documentation"
+      tool_dev_url: "https://github.com/PapenfussLab/gridss"
+      doi: "10.1186/s13059-021-02423-x"
+      licence: "['GPL v3']"
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - inputs:
+      type: file(s)
+      description: One or more input BAM/CRAM file(s)
+      pattern: "*.{bam,cram}"
+  - assembly:
+      type: file
+      description: OPTIONAL - An assembly BAM file created by the assembly step of Gridss
+      pattern: "*.bam"
+  - fasta:
+      type: file
+      description: The reference fasta
+      pattern: "*.{fa,fna,fasta}"
+  - fasta_fai:
+      type: file
+      description: The index of the reference fasta
+      pattern: "*.fai"
+  - bwa_index:
+      type: directory
+      description: OPTIONAL - The BWA index created from the reference fasta, will be generated by Gridss in the setupreference step
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: The called VCF file created by Gridss' call step + pattern: "*.vcf.gz" + - assembly: + type: file + description: The assembly BAM file created by Gridss' assembly step + pattern: "*.assembly.bam" + +authors: + - "@nvnieuwk" diff --git a/modules/nf-core/mosdepth/main.nf b/modules/nf-core/mosdepth/main.nf new file mode 100644 index 0000000..74db3a2 --- /dev/null +++ b/modules/nf-core/mosdepth/main.nf @@ -0,0 +1,80 @@ +process MOSDEPTH { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::mosdepth=0.3.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mosdepth:0.3.3--hdfd78af_1' : + 'biocontainers/mosdepth:0.3.3--hdfd78af_1'}" + + input: + tuple val(meta), path(bam), path(bai), path(bed) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path('*.global.dist.txt') , emit: global_txt + tuple val(meta), path('*.summary.txt') , emit: summary_txt + tuple val(meta), path('*.region.dist.txt') , optional:true, emit: regions_txt + tuple val(meta), path('*.per-base.d4') , optional:true, emit: per_base_d4 + tuple val(meta), path('*.per-base.bed.gz') , optional:true, emit: per_base_bed + tuple val(meta), path('*.per-base.bed.gz.csi') , optional:true, emit: per_base_csi + tuple val(meta), path('*.regions.bed.gz') , optional:true, emit: regions_bed + tuple val(meta), path('*.regions.bed.gz.csi') , optional:true, emit: regions_csi + tuple val(meta), path('*.quantized.bed.gz') , optional:true, emit: quantized_bed + tuple val(meta), path('*.quantized.bed.gz.csi') , optional:true, emit: quantized_csi + tuple val(meta), path('*.thresholds.bed.gz') , optional:true, emit: thresholds_bed + tuple val(meta), path('*.thresholds.bed.gz.csi'), optional:true, emit: thresholds_csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--fasta ${fasta}" : "" + def interval = bed ? "--by ${bed}" : "" + if (bed && args.contains("--by")) { + error "'--by' can only be specified once when running mosdepth! 
Either remove input BED file definition or remove '--by' from 'ext.args' definition" + } + if (!bed && args.contains("--thresholds")) { + error "'--thresholds' can only be specified in conjunction with '--by'" + } + + """ + mosdepth \\ + --threads $task.cpus \\ + $interval \\ + $reference \\ + $args \\ + $prefix \\ + $bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mosdepth: \$(mosdepth --version 2>&1 | sed 's/^.*mosdepth //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.global.dist.txt + touch ${prefix}.region.dist.txt + touch ${prefix}.summary.txt + touch ${prefix}.per-base.d4 + touch ${prefix}.per-base.bed.gz + touch ${prefix}.per-base.bed.gz.csi + touch ${prefix}.regions.bed.gz + touch ${prefix}.regions.bed.gz.csi + touch ${prefix}.quantized.bed.gz + touch ${prefix}.quantized.bed.gz.csi + touch ${prefix}.thresholds.bed.gz + touch ${prefix}.thresholds.bed.gz.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mosdepth: \$(mosdepth --version 2>&1 | sed 's/^.*mosdepth //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mosdepth/meta.yml b/modules/nf-core/mosdepth/meta.yml new file mode 100644 index 0000000..adf3893 --- /dev/null +++ b/modules/nf-core/mosdepth/meta.yml @@ -0,0 +1,109 @@ +name: mosdepth +description: Calculates genome-wide sequencing coverage. +keywords: + - mosdepth + - bam + - cram + - coverage +tools: + - mosdepth: + description: | + Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing. + documentation: https://github.com/brentp/mosdepth + doi: 10.1093/bioinformatics/btx699 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Input BAM/CRAM file + pattern: "*.{bam,cram}" + - bai: + type: file + description: Index for BAM/CRAM file + pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing bed information + e.g. [ id:'test' ] + - bed: + type: file + description: BED file with intersected intervals + pattern: "*.{bed}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: Reference genome FASTA file + pattern: "*.{fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - global_txt: + type: file + description: Text file with global cumulative coverage distribution + pattern: "*.{global.dist.txt}" + - regions_txt: + type: file + description: Text file with region cumulative coverage distribution + pattern: "*.{region.dist.txt}" + - summary_txt: + type: file + description: Text file with summary mean depths per chromosome and regions + pattern: "*.{summary.txt}" + - per_base_bed: + type: file + description: BED file with per-base coverage + pattern: "*.{per-base.bed.gz}" + - per_base_csi: + type: file + description: Index file for BED file with per-base coverage + pattern: "*.{per-base.bed.gz.csi}" + - per_base_d4: + type: file + description: D4 file with per-base coverage + pattern: "*.{per-base.d4}" + - regions_bed: + type: file + description: BED file with per-region coverage + pattern: "*.{regions.bed.gz}" + - regions_csi: + type: file + description: Index file for BED file with per-region coverage + pattern: "*.{regions.bed.gz.csi}" + - quantized_bed: + type: file + description: BED file with binned coverage + pattern: "*.{quantized.bed.gz}" + - quantized_csi: + type: file + description: Index file for BED file with binned coverage + pattern: "*.{quantized.bed.gz.csi}" + - thresholds_bed: + type: file + description: BED file with the number of bases in each region that are covered at or above each threshold + pattern: "*.{thresholds.bed.gz}" + - thresholds_csi: + type: file + description: Index file for BED file with threshold coverage + pattern: "*.{thresholds.bed.gz.csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@ramprasadn" + - "@matthdsm" diff --git a/modules/nf-core/msisensorpro/scan/main.nf b/modules/nf-core/msisensorpro/scan/main.nf new file mode 100644 index 0000000..760d51a --- /dev/null +++ b/modules/nf-core/msisensorpro/scan/main.nf @@ -0,0 +1,35 @@ +process MSISENSORPRO_SCAN { + tag "$meta.id" + label 'process_low' + + conda "bioconda::msisensor-pro=1.2.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/msisensor-pro:1.2.0--hfc31af2_0' : + 'biocontainers/msisensor-pro:1.2.0--hfc31af2_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.list"), emit: list + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + msisensor-pro \\ + scan \\ + -d $fasta \\ + -o ${prefix}.msisensor_scan.list \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + msisensor-pro: \$(msisensor-pro 2>&1 | sed -nE 's/Version:\\sv([0-9]\\.[0-9])/\\1/ p') + END_VERSIONS + """ +} diff --git a/modules/nf-core/msisensorpro/scan/meta.yml b/modules/nf-core/msisensorpro/scan/meta.yml new file mode 100644 index 0000000..47f4d35 --- /dev/null +++ b/modules/nf-core/msisensorpro/scan/meta.yml @@ -0,0 +1,43 @@ +name: msisensorpro_scan +description: MSIsensor-pro evaluates Microsatellite Instability (MSI) for cancer patients with next generation sequencing data. 
It accepts the whole genome sequencing, whole exome sequencing and target region (panel) sequencing data as input +keywords: + - micro-satellite-scan + - msisensor-pro + - scan +tools: + - msisensorpro: + description: Microsatellite Instability (MSI) detection using high-throughput sequencing data. + homepage: https://github.com/xjtu-omics/msisensor-pro + documentation: https://github.com/xjtu-omics/msisensor-pro/wiki + tool_dev_url: https://github.com/xjtu-omics/msisensor-pro + doi: "10.1016/j.gpb.2020.02.001" + licence: ["Custom Licence"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Reference genome + pattern: "*.{fasta}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - list: + type: file + description: File containing microsatellite list + pattern: "*.{list}" + +authors: + - "@FriederikeHanssen" diff --git a/modules/nf-core/samtools/collatefastq/main.nf b/modules/nf-core/samtools/collatefastq/main.nf new file mode 100644 index 0000000..4469faf --- /dev/null +++ b/modules/nf-core/samtools/collatefastq/main.nf @@ -0,0 +1,55 @@ +process SAMTOOLS_COLLATEFASTQ { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(fasta) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fq.gz") , optional:true, emit: fastq + tuple val(meta), path("*_interleaved.fq.gz") , optional:true, emit: fastq_interleaved + tuple val(meta), path("*_other.fq.gz") , emit: fastq_other + tuple val(meta), path("*_singleton.fq.gz") , optional:true, emit: fastq_singleton + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def output = (interleave && ! meta.single_end) ? "> ${prefix}_interleaved.fq.gz" : + meta.single_end ? "-1 ${prefix}_1.fq.gz -s ${prefix}_singleton.fq.gz" : + "-1 ${prefix}_1.fq.gz -2 ${prefix}_2.fq.gz -s ${prefix}_singleton.fq.gz" + + """ + samtools collate \\ + $args \\ + --threads $task.cpus \\ + ${reference} \\ + -O \\ + $input \\ + . 
| + + samtools fastq \\ + $args2 \\ + --threads $task.cpus \\ + ${reference} \\ + -0 ${prefix}_other.fq.gz \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/collatefastq/meta.yml b/modules/nf-core/samtools/collatefastq/meta.yml new file mode 100644 index 0000000..b647cba --- /dev/null +++ b/modules/nf-core/samtools/collatefastq/meta.yml @@ -0,0 +1,76 @@ +name: samtools_collatefastq +description: | + The module uses collate and then fastq methods from samtools to + convert a SAM, BAM or CRAM file to FASTQ format +keywords: + - bam2fq + - samtools + - fastq +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + + documentation: http://www.htslib.org/doc/1.1/samtools.html + + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" + - interleave: + type: boolean + description: | + If true, the output is a single interleaved paired-end FASTQ + If false, the output split paired-end FASTQ + default: false +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: | + R1 and R2 FASTQ files + pattern: "*_{1,2}.fq.gz" + - fastq_interleaved: + type: file + description: | + Interleaved paired end FASTQ files + pattern: "*_interleaved.fq.gz" + - fastq_other: + type: file + description: | + FASTQ files with reads where the READ1 and READ2 FLAG bits set are either both set or both unset. + pattern: "*_other.fq.gz" + - fastq_singleton: + type: file + description: | + FASTQ files with singleton reads. + pattern: "*_singleton.fq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@lescai" + - "@maxulysse" + - "@matthdsm" diff --git a/modules/nf-core/samtools/convert/main.nf b/modules/nf-core/samtools/convert/main.nf new file mode 100644 index 0000000..29722ba --- /dev/null +++ b/modules/nf-core/samtools/convert/main.nf @@ -0,0 +1,42 @@ +process SAMTOOLS_CONVERT { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(index) + path fasta + path fai + + output: + tuple val(meta), path("*.{cram,bam}"), path("*.{crai,bai}") , emit: alignment_index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output_extension = input.getExtension() == "bam" ? 
"cram" : "bam" + + """ + samtools view \\ + --threads ${task.cpus} \\ + --reference ${fasta} \\ + $args \\ + $input \\ + -o ${prefix}.${output_extension} + + samtools index -@${task.cpus} ${prefix}.${output_extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/convert/meta.yml b/modules/nf-core/samtools/convert/meta.yml new file mode 100644 index 0000000..866c228 --- /dev/null +++ b/modules/nf-core/samtools/convert/meta.yml @@ -0,0 +1,52 @@ +name: samtools_convert +description: convert and then index CRAM -> BAM or BAM -> CRAM file +keywords: + - view + - index + - bam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file + pattern: "*.{bam,cram}" + - index: + type: file + description: BAM/CRAM index file + pattern: "*.{bai,crai}" + - fasta: + type: file + description: Reference file to create the CRAM file + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - alignment_index: + type: file + description: filtered/converted BAM/CRAM file + index + pattern: "*{.bam/cram,.bai/crai}" + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" +authors: + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf new file mode 100644 index 0000000..59ed308 --- /dev/null +++ b/modules/nf-core/samtools/faidx/main.nf @@ -0,0 +1,50 @@ +process SAMTOOLS_FAIDX { + tag "$fasta" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta2), path(fai) + + output: + tuple val(meta), path ("*.{fa,fasta}") , emit: fa , optional: true + tuple val(meta), path ("*.fai") , emit: fai, optional: true + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + faidx \\ + $fasta \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? 
"touch ${match[0][1]}" : '' + """ + ${fastacmd} + touch ${fasta}.fai + + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml new file mode 100644 index 0000000..957b25e --- /dev/null +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -0,0 +1,57 @@ +name: samtools_faidx +description: Index FASTA file +keywords: + - index + - fasta + - faidx +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf new file mode 100644 index 0000000..0b20aa4 --- /dev/null +++ b/modules/nf-core/samtools/index/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 0000000..8bd2fa6 --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,53 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf new file mode 100644 index 0000000..b73b7cb --- /dev/null +++ b/modules/nf-core/samtools/merge/main.nf @@ -0,0 +1,56 @@ +process SAMTOOLS_MERGE { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input_files, stageAs: "?/*") + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam + tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram + tuple val(meta), path("*.csi") , optional:true, emit: csi + path "versions.yml" , emit: versions + + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + merge \\ + --threads ${task.cpus-1} \\ + $args \\ + ${reference} \\ + ${prefix}.${file_type} \\ + $input_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}" + def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() + """ + touch ${prefix}.${file_type} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml new file mode 100644 index 0000000..3a815f7 --- /dev/null +++ b/modules/nf-core/samtools/merge/meta.yml @@ -0,0 +1,73 @@ +name: samtools_merge +description: Merge BAM or CRAM file +keywords: + - merge + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_files: + type: file + description: BAM/CRAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Index of the reference file the CRAM was created with (optional) + pattern: "*.fai" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam}" + - cram: + type: file + description: CRAM file + pattern: "*.{cram}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" +authors: + - "@drpatelh" + - "@yuukiiwa " + - "@maxulysse" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf new file mode 100644 index 0000000..4a2607d --- /dev/null +++ b/modules/nf-core/samtools/stats/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_STATS { + tag "$meta.id" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + stats \\ + --threads ${task.cpus} \\ + ${reference} \\ + ${input} \\ + > ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml new file mode 100644 index 0000000..90e6345 --- /dev/null +++ b/modules/nf-core/samtools/stats/meta.yml @@ -0,0 +1,59 @@ +name: samtools_stats +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf new file mode 100644 index 0000000..cb91fac --- /dev/null +++ b/modules/nf-core/samtools/view/main.nf @@ -0,0 +1,66 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(index) + tuple val(meta2), path(fasta) + path qname + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.sam"), emit: sam, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def readnames = qname ? "--qname-file ${qname}": "" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools \\ + view \\ + --threads ${task.cpus-1} \\ + ${reference} \\ + ${readnames} \\ + $args \\ + -o ${prefix}.${file_type} \\ + $input \\ + $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml new file mode 100644 index 0000000..3b05450 --- /dev/null +++ b/modules/nf-core/samtools/view/meta.yml @@ -0,0 +1,84 @@ +name: samtools_view +description: filter/convert SAM/BAM/CRAM file +keywords: + - view + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - index: + type: file + description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional) + pattern: "*.{.bai,.csi,.crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" + - qname: + type: file + description: Optional file with read names to output only select alignments + pattern: "*.{txt,list}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: optional filtered/converted BAM file + pattern: "*.{bam}" + - cram: + type: file + description: optional filtered/converted CRAM file + pattern: "*.{cram}" + - sam: + type: file + description: optional filtered/converted SAM file + pattern: "*.{sam}" + # bai, csi, and crai are created with `--write-index` + - bai: + type: file + description: optional BAM file index + pattern: "*.{bai}" + - csi: + type: file + description: optional tabix BAM file index + pattern: "*.{csi}" + - crai: + type: file + description: optional CRAM file index + pattern: "*.{crai}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/sentieon/bwamem/main.nf b/modules/nf-core/sentieon/bwamem/main.nf new file mode 100644 index 0000000..b58835f --- /dev/null +++ b/modules/nf-core/sentieon/bwamem/main.nf @@ -0,0 +1,78 @@ +process SENTIEON_BWAMEM { + tag "$meta.id" + label 'process_high' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + container 'nf-core/sentieon:202112.06' + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + tuple val(meta3), path(fasta) + tuple val(meta4), path(fasta_fai) + + output: + tuple val(meta), path("*.bam"), path("*.bai"), emit: bam_and_bai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' + def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' + + """ + if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url. + export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d) + else # Localhost license file + # The license file is stored as a nextflow variable like, for instance, this: + # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat | base64 -w 0) + export SENTIEON_LICENSE=\$(mktemp) + echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE + fi + + if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then + # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is mostly likely being run with some test-license. 
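+        # Note: both values are read from task.ext as base64-encoded strings; they are decoded here and exported so the Sentieon driver can pick them up.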
+ export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d) + export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d) + echo "Decoded and exported Sentieon test-license system environment variables" + fi + + INDEX=`find -L ./ -name "*.amb" | sed 's/.amb//'` + + sentieon bwa mem \\ + $args \\ + -t $task.cpus \\ + \$INDEX \\ + $reads \\ + | sentieon util sort -r $fasta -t $task.cpus -o ${prefix}.bam --sam2bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + bwa: \$(echo \$(sentieon bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.bam.bai + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + bwa: \$(echo \$(sentieon bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/sentieon/bwamem/meta.yml b/modules/nf-core/sentieon/bwamem/meta.yml new file mode 100644 index 0000000..9987e8c --- /dev/null +++ b/modules/nf-core/sentieon/bwamem/meta.yml @@ -0,0 +1,73 @@ +name: sentieon_bwamem +description: Performs fastq alignment to a fasta reference using Sentieon's BWA MEM +keywords: + - mem + - bwa + - alignment + - map + - fastq + - bam + - sentieon +tools: + - sentieon: + description: | + Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads. + Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system. + homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ +input: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Genome fastq files (single-end or paired-end) + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - index: + type: file + description: BWA genome index files + pattern: "*.{amb,ann,bwt,pac,sa}" + - meta3: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fa,fasta}" + - meta4: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - fasta_fai: + type: file + description: The index of the FASTA reference. + pattern: "*.fai" +output: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file. 
+ pattern: "*.bam" + - bai: + type: file + description: BAI file + pattern: "*.bai" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@asp8200" diff --git a/modules/nf-core/snpeff/download/main.nf b/modules/nf-core/snpeff/download/main.nf new file mode 100644 index 0000000..9a3a0d3 --- /dev/null +++ b/modules/nf-core/snpeff/download/main.nf @@ -0,0 +1,51 @@ +process SNPEFF_DOWNLOAD { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::snpeff=5.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/snpeff:5.1--hdfd78af_2' : + 'biocontainers/snpeff:5.1--hdfd78af_2' }" + + input: + tuple val(meta), val(genome), val(cache_version) + + output: + tuple val(meta), path('snpeff_cache'), emit: cache + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def avail_mem = 6144 + if (!task.memory) { + log.info '[snpEff] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + snpEff \\ + -Xmx${avail_mem}M \\ + download ${genome}.${cache_version} \\ + -dataDir \${PWD}/snpeff_cache \\ + ${args} + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + snpeff: \$(echo \$(snpEff -version 2>&1) | cut -f 2 -d ' ') + END_VERSIONS + """ + + stub: + """ + mkdir ${genome}.${cache_version} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + snpeff: \$(echo \$(snpEff -version 2>&1) | cut -f 2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/snpeff/download/meta.yml b/modules/nf-core/snpeff/download/meta.yml new file mode 100644 index 0000000..3c03c2f --- /dev/null +++ b/modules/nf-core/snpeff/download/meta.yml @@ -0,0 +1,41 @@ +name: SNPEFF_DOWNLOAD +description: Genetic variant annotation and functional effect prediction toolbox +keywords: + - annotation + - effect prediction + - snpeff + - variant + - vcf +tools: + - snpeff: + description: | + SnpEff is a variant annotation and effect prediction tool. + It annotates and predicts the effects of genetic variants on genes and proteins (such as amino acid changes). + homepage: https://pcingola.github.io/SnpEff/ + documentation: https://pcingola.github.io/SnpEff/se_introduction/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: | + vcf to annotate + - db: + type: string + description: | + which db to annotate with +output: + - cache: + type: file + description: | + snpEff cache + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" diff --git a/modules/nf-core/svaba/main.nf b/modules/nf-core/svaba/main.nf new file mode 100644 index 0000000..6bd44b6 --- /dev/null +++ b/modules/nf-core/svaba/main.nf @@ -0,0 +1,79 @@ + +process SVABA { + tag "$meta.id" + label 'process_single' + + conda "bioconda::svaba=1.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/svaba:1.1.0--h7d7f7ad_2': + 'biocontainers/svaba:1.1.0--h7d7f7ad_2' }" + + input: + tuple val(meta), path(tumorbam), path(tumorbai), path(normalbam), path(normalbai) + tuple val(meta2), path(fasta) + tuple val(meta2), path(fasta_fai) + tuple val(meta3), path(bwa_index) + tuple val(meta4), path(dbsnp) + tuple val(meta4), path(dbsnp_tbi) + tuple val(meta5), path(regions) + + output: + tuple val(meta), path("*.svaba.sv.vcf.gz") , emit: sv, optional: true + tuple val(meta), path("*.svaba.indel.vcf.gz") , emit: indel, optional: true + tuple val(meta), path("*.svaba.germline.indel.vcf.gz") , emit: germ_indel, optional: true + tuple val(meta), path("*.svaba.germline.sv.vcf.gz") , emit: germ_sv, optional: true + tuple val(meta), path("*.svaba.somatic.indel.vcf.gz") , emit: som_indel, optional: true + tuple val(meta), path("*.svaba.somatic.sv.vcf.gz") , emit: som_sv, optional: true + tuple val(meta), path("*.svaba.unfiltered.sv.vcf.gz") , emit: unfiltered_sv, optional: true + tuple val(meta), path("*.svaba.unfiltered.indel.vcf.gz") , emit: unfiltered_indel, optional: true + tuple val(meta), path("*.svaba.unfiltered.germline.indel.vcf.gz") , emit: unfiltered_germ_indel, optional: true + tuple val(meta), path("*.svaba.unfiltered.germline.sv.vcf.gz") , emit: unfiltered_germ_sv, optional: true + tuple val(meta), path("*.svaba.unfiltered.somatic.indel.vcf.gz") , emit: unfiltered_som_indel, optional: true + tuple val(meta), path("*.svaba.unfiltered.somatic.sv.vcf.gz") , emit: unfiltered_som_sv, optional: true + tuple val(meta), path("*.bps.txt.gz") , emit: raw_calls + tuple val(meta), path("*.discordants.txt.gz") , emit: discordants, optional: true + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bamlist = normalbam ? "-t ${tumorbam} -n ${normalbam}" : "-t ${tumorbam}" + def dbsnp = dbsnp ? "--dbsnp-vcf ${dbsnp}" : "" + def regions = regions ? "--region ${regions}" : "" + def bwa = bwa_index ? "cp -s ${bwa_index}/* ." 
: "" + + """ + ${bwa} + + svaba \\ + run \\ + $bamlist \\ + --threads $task.cpus \\ + $dbsnp \\ + --id-string $meta.id \\ + --reference-genome $fasta \\ + --g-zip \\ + $regions \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + svaba: \$(echo \$(svaba --version 2>&1) | sed 's/[^0-9.]*\\([0-9.]*\\).*/\\1/' ) + END_VERSIONS + """ + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bps.txt.gz + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + svaba: \$(echo \$(svaba --version 2>&1) | sed 's/[^0-9.]*\\([0-9.]*\\).*/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/svaba/meta.yml b/modules/nf-core/svaba/meta.yml new file mode 100644 index 0000000..0137c5c --- /dev/null +++ b/modules/nf-core/svaba/meta.yml @@ -0,0 +1,160 @@ +name: "svaba" +description: SvABA is an efficient and accurate method for detecting SVs from short-read sequencing data using genome-wide local assembly with low memory and computing requirements +keywords: + - sv + - structural variants + - detecting svs + - short-read sequencing +tools: + - "svaba": + description: "Structural variation and indel detection by local assembly" + homepage: "https://github.com/walaj/svaba" + documentation: "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5880247/" + tool_dev_url: "https://github.com/walaj/svaba" + doi: "10.1101/gr.221028.117" + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + id: should be the identification number or sample name. If there is normal file meta should be common + e.g. [ id:'test' ] + - meta2: + type: map + description: | + Groovy Map containing FASTA information + id: should be the identification number for alignment file and should be the same used to create BWA index files + e.g. [ id:'fasta' ] + - meta3: + type: map + description: | + Groovy Map containing BWA information + id: should be the identification number same as fasta file + e.g. [ id:'bwa' ] + - meta4: + type: map + description: | + Groovy Map containing dbSNP information + id: should be the identification number for dbSNP files + e.g. [ id:'test' ] + - meta5: + type: map + description: | + Groovy Map containing regions information + id: should be the identification number for regions + e.g. [ id:'test' ] + - tumorbam: + type: file + description: Tumor or metastatic sample, BAM, SAM or CRAM file + pattern: "*.{bam,cram,sam}" + - tummorbai: + type: file + description: Index + pattern: "*.{bai,crai,sai}" + - normalbam: + type: file + description: Control (or normal) of matching tumor/metastatic sample, BAM, SAM or CRAM file + pattern: "*.{bam,cram,sam}" + - normalbai: + type: file + description: Index + pattern: "*.{bai,crai,sai}" + - bwa_index: + type: file + description: BWA genome index files + pattern: "Directory containing BWA index *.{amb,ann,bwt,pac,sa}" + - fasta: + type: file + description: FASTA file + pattern: "*.{fasta|fa}" + - fasta_fai: + type: file + description: Index of FASTA file + pattern: "*.{fai}" + - dbsnp: + type: file + description: VCF file including dbSNP variants + pattern: "*.vcf.gz" + - dbsnp_tbi: + type: file + description: Index of VCF file including dbSNP variants + pattern: "*.vcf.gz.tbi" + - regions: + type: file + description: Targeted intervals. Accepts BED file or Samtools-style string + pattern: "*.bed|*.txt|*.tab" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - sv: + type: file + description: Filtered SVs for tumor only cases + pattern: "*.vcf.gz" + - indel: + type: file + description: Filtered Indels for tumor only cases + pattern: "*.vcf.gz" + - som_sv: + type: file + description: Somatic filtered SVs for tumor/normal paired samples + pattern: "*.vcf.gz" + - som_indel: + type: file + description: Somatic filtered Indels for tumor/normal paired samples + pattern: "*.vcf.gz" + - germ_sv: + type: file + description: Germline filtered SVs for tumor/normal paired samples + pattern: "*.vcf.gz" + - germ_indel: + type: file + description: Germline filtered Indels for tumor/normal paired samples + pattern: "*.vcf.gz" + - unfiltered_sv: + type: file + description: Unfiltered SVs for tumor only cases + pattern: "*.vcf.gz" + - unfiltered_indel: + type: file + description: Unfiltered Indels for tumor only cases + pattern: "*.vcf.gz" + - unfiltered_som_sv: + type: file + description: Unfiltered somatic SVs for tumor/normal paired samples + pattern: "*.vcf.gz" + - unfiltered_som_indel: + type: file + description: Unfiltered somatic Indels for tumor/normal paired samples + pattern: "*.vcf.gz" + - unfiltered_germ_sv: + type: file + description: Unfiltered germline SVs for tumor/normal paired samples + pattern: "*.vcf.gz" + - unfiltered_germ_indel: + type: file + description: Unfiltered germline Indels for tumor/normal paired samples + pattern: "*.vcf.gz" + - raw_calls: + type: file + description: Raw, unfiltered variants + pattern: "*.txt.gz" + - discordants: + type: file + description: Information on all clusters of discordant reads identified with 2+ reads + pattern: "*.txt.gz" + - log: + type: file + description: Log file + pattern: "*.txt.gz" +authors: + - "@kubranarci" diff --git a/modules/nf-core/tabix/bgziptabix/main.nf b/modules/nf-core/tabix/bgziptabix/main.nf new file mode 100644 index 0000000..d6c5a76 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/main.nf @@ -0,0 +1,47 @@ +process TABIX_BGZIPTABIX { + tag "$meta.id" + label 'process_single' + + conda "bioconda::tabix=1.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : + 'biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.gz"), path("*.tbi"), optional: true, emit: gz_tbi + tuple val(meta), path("*.gz"), path("*.csi"), optional: true, emit: gz_csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + bgzip --threads ${task.cpus} -c $args $input > ${prefix}.${input.getExtension()}.gz + tabix $args2 ${prefix}.${input.getExtension()}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.${input.getExtension()}.gz + touch ${prefix}.${input.getExtension()}.gz.tbi + touch ${prefix}.${input.getExtension()}.gz.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/tabix/bgziptabix/meta.yml b/modules/nf-core/tabix/bgziptabix/meta.yml new file mode 100644 index 0000000..2761e27 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/meta.yml @@ -0,0 +1,50 @@ +name: tabix_bgziptabix +description: bgzip a sorted tab-delimited genome file and then create tabix index +keywords: + - bgzip + - compress + - index + - tabix + - vcf +tools: + - tabix: + description: Generic indexer for TAB-delimited genome position files. + homepage: https://www.htslib.org/doc/tabix.html + documentation: https://www.htslib.org/doc/tabix.1.html + doi: 10.1093/bioinformatics/btq671 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tab: + type: file + description: TAB-delimited genome position file + pattern: "*.{bed,gff,sam,vcf}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gz: + type: file + description: Output compressed file + pattern: "*.{gz}" + - tbi: + type: file + description: tabix index file + pattern: "*.{gz.tbi}" + - csi: + type: file + description: tabix alternate index file + pattern: "*.{gz.csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@DLBPointon" diff --git a/modules/nf-core/tabix/tabix/main.nf b/modules/nf-core/tabix/tabix/main.nf new file mode 100644 index 0000000..5bf332e --- /dev/null +++ b/modules/nf-core/tabix/tabix/main.nf @@ -0,0 +1,42 @@ +process TABIX_TABIX { + tag "$meta.id" + label 'process_single' + + conda "bioconda::tabix=1.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : + 'biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta), path(tab) + + output: + tuple val(meta), path("*.tbi"), optional:true, emit: tbi + tuple val(meta), path("*.csi"), optional:true, emit: csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + tabix $args $tab + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${tab}.tbi + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/tabix/tabix/meta.yml b/modules/nf-core/tabix/tabix/meta.yml new file mode 100644 index 0000000..fcc6e52 --- /dev/null +++ b/modules/nf-core/tabix/tabix/meta.yml @@ -0,0 +1,45 @@ +name: tabix_tabix +description: create tabix index from a sorted bgzip tab-delimited genome file +keywords: + - index + - tabix + - vcf +tools: + - tabix: + description: Generic indexer for TAB-delimited genome position files. + homepage: https://www.htslib.org/doc/tabix.html + documentation: https://www.htslib.org/doc/tabix.1.html + doi: 10.1093/bioinformatics/btq671 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tab: + type: file + description: TAB-delimited genome position file compressed with bgzip + pattern: "*.{bed.gz,gff.gz,sam.gz,vcf.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tbi: + type: file + description: tabix index file + pattern: "*.{tbi}" + - csi: + type: file + description: coordinate sorted index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 0000000..61461c3 --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,63 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda "conda-forge::sed=4.7 conda-forge::grep=3.11 conda-forge::tar=1.34" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: ( meta.id ? 
"${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir $prefix + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C $prefix --strip-components 1 \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C $prefix \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir $prefix + touch ${prefix}/file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 0000000..db241a6 --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,41 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. + documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/unzip/main.nf b/modules/nf-core/unzip/main.nf new file mode 100644 index 0000000..cf977f1 --- /dev/null +++ b/modules/nf-core/unzip/main.nf @@ -0,0 +1,37 @@ +process UNZIP { + tag "$archive" + label 'process_single' + + conda "conda-forge::p7zip=16.02" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/p7zip:16.02' : + 'biocontainers/p7zip:16.02' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("${prefix}/"), emit: unzipped_archive + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + if ( archive instanceof List && archive.name.size > 1 ) { error "[UNZIP] error: 7za only accepts a single archive as input. Please check module input." } + + prefix = task.ext.prefix ?: ( meta.id ? 
"${meta.id}" : archive.baseName) + """ + 7za \\ + x \\ + -o"${prefix}"/ \\ + $args \\ + $archive + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + 7za: \$(echo \$(7za --help) | sed 's/.*p7zip Version //; s/(.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/unzip/meta.yml b/modules/nf-core/unzip/meta.yml new file mode 100644 index 0000000..2910e0f --- /dev/null +++ b/modules/nf-core/unzip/meta.yml @@ -0,0 +1,43 @@ +name: unzip +description: Unzip ZIP archive files +keywords: + - unzip + - decompression + - zip + - archiving +tools: + - unzip: + description: p7zip is a quick port of 7z.exe and 7za.exe (command line version of 7zip, see www.7-zip.org) for Unix. + homepage: https://sourceforge.net/projects/p7zip/ + documentation: https://sourceforge.net/projects/p7zip/ + tool_dev_url: https://sourceforge.net/projects/p7zip" + licence: ["LGPL-2.1-or-later"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: ZIP file + pattern: "*.zip" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - unzipped_archive: + type: directory + description: Directory contents of the unzipped archive + pattern: "${archive.baseName}/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index da7e815..22a905c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -8,55 +8,133 @@ // Global default params, used in configs params { + - // TODO nf-core: Specify your pipeline's command line flags - // Input options - input = null - // References - genome = null - igenomes_base = 's3://ngi-igenomes/igenomes' - igenomes_ignore = false - - - // MultiQC options - multiqc_config = null - multiqc_title = null - multiqc_logo = null - max_multiqc_email_size = '25.MB' - multiqc_methods_description = null - - // Boilerplate options - outdir = null - publish_dir_mode = 'copy' - email = null - email_on_fail = null - plaintext_email = false - monochrome_logs = false - hook_url = null - help = false - version = false - - // Config options - config_profile_name = null - config_profile_description = null - custom_config_version = 'master' - custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_contact = null - config_profile_url = null - - - // Max resource options - // Defaults only, expecting to be overwritten - max_memory = '128.GB' - max_cpus = 16 - max_time = '240.h' - - // Schema validation default options - validationFailUnrecognisedParams = false - validationLenientMode = false - validationSchemaIgnoreParams = 'genomes' - validationShowHiddenParams = false - validate_params = true + // TODO nf-core: Specify your pipeline's command line flags + // Input options (Mandatory!) 
+ input = null + step = 'alignment' + + // References + genome = 'GATK.GRCh38' + igenomes_base = 's3://ngi-igenomes/igenomes' + igenomes_ignore = false + save_reference = false + build_only_index = false // Only build the reference indexes + download_cache = false // Do not download annotation cache + + // Options to consider + // Main options + no_intervals = false // Intervals will be built from the fasta file + nucleotides_per_second = 200000 // Default interval size + tools = null // No default Variant_Calling or Annotation tools + skip_tools = null // All tools (markduplicates + baserecalibrator + QC) are used by default + split_fastq = 50000000 // FASTQ files will not be split by default by FASTP, sarek = 50000000 + + // Modify FASTQ files (trim/split) with FASTP + trim_fastq = false // No trimming by default + clip_r1 = 0 + clip_r2 = 0 + three_prime_clip_r1 = 0 + three_prime_clip_r2 = 0 + trim_nextseq = 0 + save_trimmed = false + save_split_fastqs = false + + // Alignment + aligner = 'bwa-mem' // Default is bwa-mem, bwa-mem2 and dragmap can be used too + save_mapped = true // Mapped BAMs are saved + save_output_as_bam = true // Output files from alignment are saved as bam by default and not as cram files + seq_center = null // No sequencing center to be written in read group CN field by aligner + seq_platform = null // Default platform written in read group PL field by aligner, null by default. + + // Structural Variant Calling + error_rate = 0.01 // Default error_rate for Svaba + //indel_mask = null // Must provide blacklist bed file for indels based on genome to run Svaba + + + // Variant Calling + only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired samples + ascat_ploidy = null // default value for ASCAT + ascat_min_base_qual = 20 // default value for ASCAT + ascat_min_counts = 10 // default value for ASCAT + ascat_min_map_qual = 35 // default value for ASCAT + ascat_purity = null // default value for ASCAT + cf_ploidy = "2" // default value for Control-FREEC + cf_coeff = 0.05 // default value for Control-FREEC + cf_contamination = 0 // default value for Control-FREEC + cf_contamination_adjustment = false // by default we are not using this in Control-FREEC + cf_mincov = 0 // ControlFreec default values + cf_minqual = 0 // ControlFreec default values + cf_window = null // by default we are not using this in Control-FREEC + cnvkit_reference = null // by default the reference is build from the fasta file + concatenate_vcfs = false // by default we don't concatenate the germline-vcf-files + ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 + wes = false // Set to true, if data is exome/targeted sequencing data. 
Used to use correct models in various variant callers + joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected + joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling + sentieon_haplotyper_emit_mode = "variant" // default value for Sentieon haplotyper + + // Annotation + dbnsfp = null // No dbnsfp processed file + dbnsfp_consequence = null // No default consequence for dbnsfp plugin + dbnsfp_fields = "rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF" // Default fields for dbnsfp plugin + dbnsfp_tbi = null // No dbnsfp processed file index + outdir_cache = null // No default outdir cache + snpeff_cache = 's3://annotation-cache/snpeff_cache/' + spliceai_indel = null // No spliceai_indel file + spliceai_indel_tbi = null // No spliceai_indel file index + spliceai_snv = null // No spliceai_snv file + spliceai_snv_tbi = null // No spliceai_snv file index + use_annotation_cache_keys = false + vep_cache = 's3://annotation-cache/vep_cache/' + vep_custom_args = "--everything --filter_common --per_gene --total_length --offline --format vcf" // Default arguments for VEP + vep_dbnsfp = null // dbnsfp plugin disabled within VEP + vep_include_fasta = false // Don't use fasta file for annotation with VEP + vep_loftee = null // loftee plugin disabled within VEP + vep_out_format = "vcf" + vep_spliceai = null // spliceai plugin disabled within VEP + vep_spliceregion = null // spliceregion plugin disabled within VEP + + + // MultiQC options + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' + multiqc_methods_description = null + + // Boilerplate options + outdir = null + publish_dir_mode = 'copy' + email = null + email_on_fail = null + plaintext_email = false + monochrome_logs = false + hook_url = null + help = false + version = false + + // Config options + config_profile_name = null + config_profile_description = null + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + config_profile_contact = null + config_profile_url = null + + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '256.GB' + max_cpus = 16 + max_time = '360.h' + + // Schema validation default options + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes, cf_ploidy' + validationShowHiddenParams = false + validate_params = false } @@ -65,9 +143,9 @@ includeConfig 'conf/base.config' // Load nf-core custom profiles from different Institutions try { - includeConfig "${params.custom_config_base}/nfcore_custom.config" + includeConfig "${params.custom_config_base}/nfcore_custom.config" } catch (Exception e) { - System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") + System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") } // Load nf-core/heisenbio custom profiles from different institutions. 
@@ -78,96 +156,99 @@ try { // System.err.println("WARNING: Could not load nf-core/config/heisenbio profiles: ${params.custom_config_base}/pipeline/heisenbio.config") // } profiles { - debug { - dumpHashes = true - process.beforeScript = 'echo $HOSTNAME' - cleanup = false - } - conda { - conda.enabled = true - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - } - mamba { - conda.enabled = true - conda.useMamba = true - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - } - docker { - docker.enabled = true - docker.userEmulation = true - conda.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - } - arm { - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' - } - singularity { - singularity.enabled = true - singularity.autoMounts = true - conda.enabled = false - docker.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - } - podman { - podman.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - } - shifter { - shifter.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - podman.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - } - charliecloud { - charliecloud.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - apptainer.enabled = false - } - apptainer { - apptainer.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - } - gitpod { - executor.name = 'local' - executor.cpus = 16 - executor.memory = 60.GB - } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false + } + conda { + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + mamba { + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + docker { + docker.enabled = true + docker.userEmulation = true + conda.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + arm { + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + conda.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + podman { + podman.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + shifter { 
+ shifter.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + charliecloud { + charliecloud.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + apptainer.enabled = false + } + apptainer { + apptainer.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + gitpod { + executor.name = 'local' + executor.cpus = 16 + executor.memory = 60.GB + } + + //basic test config files + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile @@ -180,24 +261,24 @@ singularity.registry = 'quay.io' // Nextflow plugins plugins { - id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet } // Load igenomes.config if required if (!params.igenomes_ignore) { - includeConfig 'conf/igenomes.config' + includeConfig 'conf/igenomes.config' } else { - params.genomes = [:] + params.genomes = [:] } // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. 
env { - PYTHONNOUSERSITE = 1 - R_PROFILE_USER = "/.Rprofile" - R_ENVIRON_USER = "/.Renviron" - JULIA_DEPOT_PATH = "/usr/local/share/julia" + PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" + JULIA_DEPOT_PATH = "/usr/local/share/julia" } // Capture exit codes from upstream processes when piping @@ -205,36 +286,64 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { - enabled = true - file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" + enabled = true + file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" } report { - enabled = true - file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" + enabled = true + file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" } trace { - enabled = true - file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" + enabled = true + file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" } dag { - enabled = true - file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" + enabled = true + file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" } manifest { - name = 'nf-core/heisenbio' - author = """Tanubrata Dey, Shihab Dider, Joel Rosiene""" - homePage = 'https://github.com/nf-core/heisenbio' - description = """Clinical Pipeline for MSkiLab""" - mainScript = 'main.nf' - nextflowVersion = '!>=23.04.0' - version = '1.0dev' - doi = '' + name = 'nf-core/heisenbio' + author = """Tanubrata Dey, Shihab Dider, Joel Rosiene""" + homePage = 'https://github.com/tanubrata/heisenbio' + description = """Clinical Core Pipeline for MSkiLab""" + mainScript = 'main.nf' + nextflowVersion = '!>=23.04.0' + version = '1.0dev' + doi = '' } // Load modules.config for DSL2 module specific options includeConfig 'conf/modules.config' + +// prepare reference, pre-alignment steps +includeConfig 'conf/modules/prepare_cache.config' +includeConfig 'conf/modules/prepare_genome.config' +includeConfig 'conf/modules/prepare_intervals.config' + +// Alignment configurations +includeConfig 'conf/modules/aligner.config' +includeConfig 'conf/modules/alignment_to_fastq.config' +includeConfig 'conf/modules/markduplicates.config' +includeConfig 'conf/modules/prepare_recalibration.config' +includeConfig 'conf/modules/recalibrate.config' +includeConfig 'conf/modules/trimming.config' + +// SV configurations +includeConfig 'conf/modules/structural_variants.config' + + + + + + + + + + + + // Function to ensure that resource requirements don't go beyond // a maximum limit def check_max(obj, type) { diff --git a/nextflow_schema.json b/nextflow_schema.json index 7a2efb8..8868edb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,8 +1,8 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/heisenbio/master/nextflow_schema.json", - "title": "nf-core/heisenbio pipeline parameters", - "description": "Clinical Pipeline for MSkiLab", + "$id": "https://raw.githubusercontent.com/tanubrata/heisenbio/master/nextflow_schema.json", + "title": "tanubrata/heisenbio pipeline parameters", + "description": "An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing", "type": "object", "definitions": { "input_output_options": { @@ -10,35 +10,587 @@ "type": "object", "fa_icon": "fas 
fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "help_text": "Specify input samplesheet, step and output folder.", + "required": ["step", "outdir"], "properties": { "input": { + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "A design file with information about the samples in your experiment. Use this parameter to specify the location of the input files. It has to be a comma-separated file with a header row. See [usage docs](https://nf-co.re/sarek/usage#input).\n\nIf no input file is specified, sarek will attempt to locate one in the `{outdir}` directory. If no input should be supplied, i.e. when --step is supplied or --build_from_index, then set --input false", + "fa_icon": "fas fa-file-csv", + "schema": "assets/schema_input.json", + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$" + }, + "input_restart": { "type": "string", + "description": "Automatic retrieval for restart", "format": "file-path", "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/heisenbio/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "hidden": true, + "schema": "assets/schema_input.json" + }, + "step": { + "type": "string", + "default": "alignment", + "fa_icon": "fas fa-play", + "description": "Starting step", + "help_text": "The pipeline starts from this step and then runs through the possible subsequent steps.", + "enum": [ + "alignment", + "markduplicates", + "prepare_recalibration", + "recalibrate", + "sv_calling", + "variant_calling", + "annotate" + ] }, "outdir": { "type": "string", "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" + } + } + }, + "main_options": { + "title": "Main options", + "type": "object", + "description": "Most common options used for the pipeline", + "default": "", + "properties": { + "split_fastq": { + "type": "integer", + "default": 50000000, + "fa_icon": "fas fa-clock", + "description": "Specify how many reads each split of a FastQ file contains. Set 0 to turn off splitting at all.", + "help_text": "Use the the tool FastP to split FASTQ file by number of reads. This parallelizes across fastq file shards speeding up mapping. " }, - "email": { + "wes": { + "type": "boolean", + "fa_icon": "fas fa-dna", + "description": "Enable when exome or panel data is provided.", + "help_text": "With this parameter flags in various tools are set for targeted sequencing data. It is recommended to enable for whole-exome and panel data analysis." + }, + "intervals": { "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. 
If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + "fa_icon": "fas fa-file-alt", + "help_text": "To speed up preprocessing and variant calling processes, the execution is parallelized across a reference chopped into smaller pieces.\n\nParts of preprocessing and variant calling are done by these intervals, the different resulting files are then merged.\nThis can parallelize processes, and push down wall clock time significantly.\n\nWe are aligning to the whole genome, and then run Base Quality Score Recalibration and Variant Calling on the supplied regions.\n\n**Whole Genome Sequencing:**\n\nThe (provided) intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs.\n\nWe are ignoring the `hs37d5` contig that contains concatenated decoy sequences.\n\nThe calling intervals can be defined using a .list or a BED file.\nA .list file contains one interval per line in the format `chromosome:start-end` (1-based coordinates).\nA BED file must be a tab-separated text file with one interval per line.\nThere must be at least three columns: chromosome, start, and end (0-based coordinates).\nAdditionally, the score column of the BED file can be used to provide an estimate of how many seconds it will take to call variants on that interval.\nThe fourth column remains unused.\n\n```\n|chr1|10000|207666|NA|47.3|\n```\nThis indicates that variant calling on the interval chr1:10001-207666 takes approximately 47.3 seconds.\n\nThe runtime estimate is used in two different ways.\nFirst, when there are multiple consecutive intervals in the file that take little time to compute, they are processed as a single job, thus reducing the number of processes that needs to be spawned.\nSecond, the jobs with largest processing time are started first, which reduces wall-clock time.\nIf no runtime is given, a time of 200000 nucleotides per second is assumed. See `--nucleotides_per_second` on how to customize this.\nActual figures vary from 2 nucleotides/second to 30000 nucleotides/second.\nIf you prefer, you can specify the full path to your reference genome when you run the pipeline:\n\n> **NB** If none provided, will be generated automatically from the FASTA reference\n> **NB** Use --no_intervals to disable automatic generation.\n\n**Targeted Sequencing:**\n\nThe recommended flow for targeted sequencing data is to use the workflow as it is, but also provide a `BED` file containing targets for all steps using the `--intervals` option. In addition, the parameter `--wes` should be set.\nIt is advised to pad the variant calling regions (exons or target) to some extent before submitting to the workflow.\n\nThe procedure is similar to whole genome sequencing, except that only BED file are accepted. See above for formatting description.\nAdding every exon as an interval in case of `WES` can generate >200K processes or jobs, much more forks, and similar number of directories in the Nextflow work directory. These are appropriately grouped together to reduce number of processes run in parallel (see above and `--nucleotides_per_second` for details). 
\nFurthermore, primers and/or baits are not 100% specific, (certainly not for MHC and KIR, etc.), quite likely there going to be reads mapping to multiple locations.\nIf you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is an overkill, it is actually better to change the reference itself.", + "description": "Path to target bed file in case of whole exome or targeted sequencing or intervals file." }, - "multiqc_title": { + "nucleotides_per_second": { + "type": "number", + "fa_icon": "fas fa-clock", + "description": "Estimate interval size.", + "help_text": "Intervals are parts of the chopped up genome used to speed up preprocessing and variant calling. See `--intervals` for more info. \n\nChanging this parameter, changes the number of intervals that are grouped and processed together. Bed files from target sequencing can contain thousands or small intervals. Spinning up a new process for each can be quite resource intensive. Instead it can be desired to process small intervals together on larger nodes. \nIn order to make use of this parameter, no runtime estimate can be present in the bed file (column 5). ", + "default": 200000 + }, + "no_intervals": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "Disable usage of intervals.", + "help_text": "Intervals are parts of the chopped up genome used to speed up preprocessing and variant calling. See `--intervals` for more info. \n\nIf `--no_intervals` is set no intervals will be taken into account for speed up or data processing." + }, + "tools": { "type": "string", - "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" + "fa_icon": "fas fa-toolbox", + "description": "Tools to use for duplicate marking, variant calling and/or for annotation.", + "help_text": "Multiple tools separated with commas.\n\n**Variant Calling:**\n\nGermline variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: DeepVariant, FreeBayes, GATK HaplotypeCaller, mpileup, Sentieon Haplotyper, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit\n\nTumor-only somatic variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, mpileup, Mutect2, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit, ControlFREEC\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, Mutect2, Strelka2\n- Structural variants: Manta, TIDDIT\n- Copy-Number: ASCAT, CNVKit, Control-FREEC\n- Microsatellite Instability: MSIsensorpro\n\n> **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.", + "pattern": "^((ascat|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|manta|merge|mpileup|msisensorpro|mutect2|snpeff|strelka|tiddit|vep|svaba|gridss)?,?)*(? 
**NB** `--skip_tools baserecalibrator_report` is actually just not saving the reports.\n> **NB** `--skip_tools markduplicates_report` does not skip `MarkDuplicates` but prevent the collection of duplicate metrics that slows down performance.", + "pattern": "^((baserecalibrator|baserecalibrator_report|bcftools|documentation|fastqc|haplotypecaller_filter|haplotyper_filter|markduplicates|markduplicates_report|mosdepth|multiqc|samtools|vcftools|versions)?,?)*(? The GATK4 Base Quality Score recalibration tools `Baserecalibrator` and `ApplyBQSR` are currently available as Beta release. Use with caution!", + "pattern": "^((baserecalibrator|markduplicates)?,?)*(? **NB** PON file should be bgzipped.", + "hidden": true + }, + "pon_tbi": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Index of PON panel-of-normals VCF.", + "help_text": "If none provided, will be generated automatically from the PON bgzipped VCF file.", + "hidden": true + }, + "ignore_soft_clipped_bases": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "Do not analyze soft clipped bases in the reads for GATK Mutect2.", + "help_text": "use the `--dont-use-soft-clipped-bases` params with GATK Mutect2.", + "hidden": true + }, + "sentieon_haplotyper_emit_mode": { + "type": "string", + "default": "variant", + "fa_icon": "fas fa-toolbox", + "description": "Option for selecting output and emit-mode of Sentieon's Haplotyper.", + "help_text": "The option `--sentieon_haplotyper_emit_mode` can be set to the same string values as the Haplotyper's `--emit_mode`. To output both a vcf and a gvcf, specify both a vcf-option (currently, `all`, `confident` and `variant`) and `gvcf`. For example, to obtain a vcf and gvcf one could set `--sentieon_haplotyper_emit_mode` to `variant, gvcf`.", + "pattern": "^(all|confident|gvcf|variant|gvcf,all|gvcf,confident|gvcf,variant|all,gvcf|confident,gvcf|variant,gvcf)(? **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "bwamem2": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to bwa-mem2 mem indices.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nIf you wish to recompute indices available on igenomes, set `--bwamem2 false`.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference, if `--aligner bwa-mem2` is specified. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "chr_dir": { + "type": "string", + "fa_icon": "fas fa-folder-open", + "description": "Path to chromosomes folder used with ControLFREEC.", + "hidden": true, + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." + }, + "dbsnp": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to dbsnp file.", + "hidden": true, + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." + }, + "dbsnp_tbi": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to dbsnp index.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the dbsnp file. 
Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "dbsnp_vqsr": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "label string for VariantRecalibration (haplotypecaller joint variant calling)" + }, + "dict": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to FASTA dictionary file.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "dragmap": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to dragmap indices.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nIf you wish to recompute indices available on igenomes, set `--dragmap false`.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference, if `--aligner dragmap` is specified. Combine with `--save_reference` to save for future runs.", + "hidden": true }, "fasta": { "type": "string", @@ -61,25 +704,139 @@ "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nThis parameter is *mandatory* if `--genome` is not specified.", "fa_icon": "far fa-file-code" }, + "fasta_fai": { + "type": "string", + "fa_icon": "fas fa-file", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", + "description": "Path to FASTA reference index." + }, + "germline_resource": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to GATK Mutect2 Germline Resource File.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nThe germline resource VCF file (bgzipped and tabixed) needed by GATK4 Mutect2 is a collection of calls that are likely present in the sample, with allele frequencies.\nThe AF info field must be present.\nYou can find a smaller, stripped gnomAD VCF file (most of the annotation is removed and only calls signed by PASS are stored) in the AWS iGenomes Annotation/GermlineResource folder.", + "hidden": true + }, + "germline_resource_tbi": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to GATK Mutect2 Germline Resource Index.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the Germline Resource file, if provided. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "known_indels": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to known indels file.", + "hidden": true, + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." 
+ }, + "known_indels_tbi": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to known indels file index.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the known index file, if provided. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "known_indels_vqsr": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n1st label string for VariantRecalibration (haplotypecaller joint variant calling)" + }, + "known_snps": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nPath to known snps file." + }, + "known_snps_tbi": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to known snps file snps.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the known index file, if provided. Combine with `--save_reference` to save for future runs." + }, + "known_snps_vqsr": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nlabel string for VariantRecalibration (haplotypecaller joint variant calling)" + }, + "mappability": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to Control-FREEC mappability file.", + "hidden": true, + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." + }, + "snpeff_db": { + "type": "string", + "fa_icon": "fas fa-database", + "description": "snpEff DB version.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the database to be use to annotate with.\nAlternatively databases' names can be listed with the `snpEff databases`.", + "hidden": true + }, + "snpeff_genome": { + "type": "string", + "fa_icon": "fas fa-microscope", + "description": "snpEff genome.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache.", + "hidden": true + }, + "vep_genome": { + "type": "string", + "fa_icon": "fas fa-microscope", + "description": "VEP genome.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache.", + "hidden": true + }, + "vep_species": { + "type": "string", + "fa_icon": "fas fa-microscope", + "description": "VEP species.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively species listed in Ensembl Genomes caches can be used.", + "hidden": true + }, + "vep_cache_version": { + "type": "number", + "fa_icon": "fas fa-tag", + "description": "VEP cache version.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively cache version can be use to specify the correct Ensembl Genomes version number as these differ from the concurrent Ensembl/VEP version numbers", + "hidden": true + }, + "save_reference": { + "type": "boolean", + "fa_icon": "fas fa-download", + "description": "Save built references.", + "help_text": "Set this parameter, if you wish to save all 
computed reference files. This is useful to avoid re-computation on future runs." + }, + "build_only_index": { + "type": "boolean", + "fa_icon": "fas fa-download", + "description": "Only built references.", + "help_text": "Set this parameter, if you wish to compute and save all computed reference files. No alignment or any other downstream steps will be performed." + }, + "download_cache": { + "type": "boolean", + "fa_icon": "fas fa-download", + "description": "Download annotation cache.", + "help_text": "Set this parameter, if you wish to download annotation cache." + }, "igenomes_base": { "type": "string", "format": "directory-path", "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true + "default": "s3://ngi-igenomes/igenomes/", + "fa_icon": "fas fa-cloud-download-alt" }, "igenomes_ignore": { "type": "boolean", "description": "Do not load the iGenomes reference config.", "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + "help_text": "Do not load `igenomes.config` when running the pipeline.\nYou may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`.\n\n> **NB** You can then run `Sarek` by specifying at least a FASTA genome file." } - } + }, + "help_text": "The pipeline config files come bundled with paths to the Illumina iGenomes reference index files.\nThe configuration is set up to use the AWS-iGenomes resource\ncf https://ewels.github.io/AWS-iGenomes/." }, "institutional_config_options": { "title": "Institutional config options", @@ -126,6 +883,27 @@ "description": "Institutional config URL link.", "hidden": true, "fa_icon": "fas fa-users-cog" + }, + "test_data_base": { + "type": "string", + "default": "https://raw.githubusercontent.com/nf-core/test-datasets/sarek3", + "description": "Base path / URL for data used in the test profiles", + "help_text": "Warning: The `-profile test` samplesheet file itself contains remote paths. Setting this parameter does not alter the contents of that file.", + "hidden": true + }, + "seq_center": { + "type": "string", + "fa_icon": "fas fa-university", + "description": "Sequencing center information to be added to read group (CN field).", + "hidden": true + }, + "seq_platform": { + "type": "string", + "fa_icon": "fas fa-university", + "default": "ILLUMINA", + "description": "Sequencing platform information to be added to read group (PL field).", + "help_text": "Default: ILLUMINA. Will be used to create a proper header for further GATK4 downstream analysis.", + "hidden": true } } }, @@ -142,7 +920,7 @@ "default": 16, "fa_icon": "fas fa-microchip", "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`." }, "max_memory": { "type": "string", @@ -151,7 +929,7 @@ "fa_icon": "fas fa-memory", "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + "help_text": "Use to set an upper-limit for the memory requirement for each process. 
Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`." }, "max_time": { "type": "string", @@ -160,7 +938,7 @@ "fa_icon": "far fa-clock", "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`." } } }, @@ -175,14 +953,12 @@ "type": "boolean", "description": "Display help text.", "fa_icon": "fas fa-question-circle", - "default": false, "hidden": true }, "version": { "type": "boolean", "description": "Display version and exit.", "fa_icon": "fas fa-question-circle", - "default": false, "hidden": true }, "publish_dir_mode": { @@ -194,6 +970,13 @@ "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, "email_on_fail": { "type": "string", "description": "Email address for completion summary, only when pipeline fails.", @@ -206,7 +989,6 @@ "type": "boolean", "description": "Send plain-text email instead of HTML.", "fa_icon": "fas fa-remove-format", - "default": false, "hidden": true }, "max_multiqc_email_size": { @@ -221,15 +1003,12 @@ "type": "boolean", "description": "Do not use coloured log outputs.", "fa_icon": "fas fa-palette", - "default": false, "hidden": true }, - "hook_url": { + "multiqc_title": { "type": "string", - "description": "Incoming hook URL for messaging service", - "fa_icon": "fas fa-people-group", - "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", - "hidden": true + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" }, "multiqc_config": { "type": "string", @@ -260,7 +1039,6 @@ "type": "boolean", "fa_icon": "far fa-eye-slash", "description": "Show all params when using `--help`", - "default": false, "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." }, @@ -268,7 +1046,6 @@ "type": "boolean", "fa_icon": "far fa-check-circle", "description": "Validation of parameters fails when an unrecognised parameter is found.", - "default": false, "hidden": true, "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." }, @@ -276,9 +1053,15 @@ "type": "boolean", "fa_icon": "far fa-check-circle", "description": "Validation of parameters in lenient more.", - "default": false, "hidden": true, "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." 
+ }, + "hook_url": { + "type": "string", + "description": "Incoming hook URL for messaging service", + "fa_icon": "fas fa-people-group", + "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", + "hidden": true } } } @@ -287,6 +1070,21 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/main_options" + }, + { + "$ref": "#/definitions/fastq_preprocessing" + }, + { + "$ref": "#/definitions/preprocessing" + }, + { + "$ref": "#/definitions/variant_calling" + }, + { + "$ref": "#/definitions/annotation" + }, { "$ref": "#/definitions/reference_genome_options" }, diff --git a/subworkflows/local/bam_applybqsr/main.nf b/subworkflows/local/bam_applybqsr/main.nf new file mode 100644 index 0000000..667b349 --- /dev/null +++ b/subworkflows/local/bam_applybqsr/main.nf @@ -0,0 +1,47 @@ +// +// RECALIBRATE +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_APPLYBQSR } from '../../../modules/nf-core/gatk4/applybqsr/main' +include { CRAM_MERGE_INDEX_SAMTOOLS } from '../cram_merge_index_samtools/main' + +workflow BAM_APPLYBQSR { + take: + cram // channel: [mandatory] [ meta, cram, crai, recal ] + dict // channel: [mandatory] [ dict ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + + main: + versions = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map + .map{ meta, cram, crai, recal, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, recal, intervals ] } + + // RUN APPLYBQSR + GATK4_APPLYBQSR(cram_intervals, fasta, fasta_fai, dict.map{ meta, it -> [ it ] }) + + // Gather the recalibrated cram files + cram_to_merge = GATK4_APPLYBQSR.out.cram.map{ meta, cram -> [ groupKey(meta, meta.num_intervals), cram ] }.groupTuple() + + // Merge and index the recalibrated cram files + CRAM_MERGE_INDEX_SAMTOOLS(cram_to_merge, fasta, fasta_fai) + + cram_recal = CRAM_MERGE_INDEX_SAMTOOLS.out.cram_crai + // Remove no longer necessary field: num_intervals + .map{ meta, cram, crai -> [ meta - meta.subMap('num_intervals'), cram, crai ] } + + // Gather versions of all tools used + versions = versions.mix(GATK4_APPLYBQSR.out.versions) + versions = versions.mix(CRAM_MERGE_INDEX_SAMTOOLS.out.versions) + + emit: + cram = cram_recal // channel: [ meta, cram, crai ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_baserecalibrator/main.nf b/subworkflows/local/bam_baserecalibrator/main.nf new file mode 100644 index 0000000..198b96e --- /dev/null +++ b/subworkflows/local/bam_baserecalibrator/main.nf @@ -0,0 +1,54 @@ +// +// PREPARE RECALIBRATION +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_BASERECALIBRATOR } from '../../../modules/nf-core/gatk4/baserecalibrator/main' +include { GATK4_GATHERBQSRREPORTS } from '../../../modules/nf-core/gatk4/gatherbqsrreports/main' + +workflow BAM_BASERECALIBRATOR { + take: + cram // channel: [mandatory] [ meta, cram_markduplicates, crai ] + dict // channel: [mandatory] [ dict ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals // channel: [mandatory] [ 
intervals, num_intervals ] (or [ [], 0 ] if no intervals) + known_sites // channel: [optional] [ known_sites ] + known_sites_tbi // channel: [optional] [ known_sites_tbi ] + + main: + versions = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map + .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, intervals ] } + + // RUN BASERECALIBRATOR + GATK4_BASERECALIBRATOR(cram_intervals, fasta, fasta_fai, dict.map{ meta, it -> [ it ] }, known_sites, known_sites_tbi) + + // Figuring out if there is one or more table(s) from the same sample + table_to_merge = GATK4_BASERECALIBRATOR.out.table.map{ meta, table -> [ groupKey(meta, meta.num_intervals), table ] }.groupTuple().branch{ + // Use meta.num_intervals to asses number of intervals + single: it[0].num_intervals <= 1 + multiple: it[0].num_intervals > 1 + } + + // Only when using intervals + GATK4_GATHERBQSRREPORTS(table_to_merge.multiple) + + // Mix intervals and no_intervals channels together + table_bqsr = GATK4_GATHERBQSRREPORTS.out.table.mix(table_to_merge.single.map{ meta, table -> [ meta, table[0] ] }) + // Remove no longer necessary field: num_intervals + .map{ meta, table -> [ meta - meta.subMap('num_intervals'), table ] } + + // Gather versions of all tools used + versions = versions.mix(GATK4_BASERECALIBRATOR.out.versions) + versions = versions.mix(GATK4_GATHERBQSRREPORTS.out.versions) + + emit: + table_bqsr // channel: [ meta, table ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_convert_samtools/main.nf b/subworkflows/local/bam_convert_samtools/main.nf new file mode 100644 index 0000000..5b057e4 --- /dev/null +++ b/subworkflows/local/bam_convert_samtools/main.nf @@ -0,0 +1,76 @@ +// +// BAM/CRAM to FASTQ conversion, paired end only +// + +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_MAP_MAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_UNMAP_UNMAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_UNMAP_MAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_MAP_UNMAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_UNMAP } from '../../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_COLLATEFASTQ as COLLATE_FASTQ_UNMAP } from '../../../modules/nf-core/samtools/collatefastq/main' +include { SAMTOOLS_COLLATEFASTQ as COLLATE_FASTQ_MAP } from '../../../modules/nf-core/samtools/collatefastq/main' +include { CAT_FASTQ } from '../../../modules/nf-core/cat/fastq/main' + +workflow BAM_CONVERT_SAMTOOLS { + take: + input // channel: [meta, alignment (BAM or CRAM), index (optional)] + fasta // optional: reference file if CRAM format and reference not in header + fasta_fai + interleaved // value: true/false + + main: + versions = Channel.empty() + + // Index File if not PROVIDED -> this also requires updates to samtools view possibly URGH + + // MAP - MAP + SAMTOOLS_VIEW_MAP_MAP(input, fasta, []) + + // UNMAP - UNMAP + SAMTOOLS_VIEW_UNMAP_UNMAP(input, fasta, []) + + // UNMAP - MAP + SAMTOOLS_VIEW_UNMAP_MAP(input, fasta, []) + + // MAP - UNMAP + SAMTOOLS_VIEW_MAP_UNMAP(input, fasta, []) + + // Merge UNMAP + all_unmapped_bam = SAMTOOLS_VIEW_UNMAP_UNMAP.out.bam + .join(SAMTOOLS_VIEW_UNMAP_MAP.out.bam, failOnDuplicate: true, remainder: true) + 
.join(SAMTOOLS_VIEW_MAP_UNMAP.out.bam, failOnDuplicate: true, remainder: true) + .map{ meta, unmap_unmap, unmap_map, map_unmap -> [ meta, [ unmap_unmap, unmap_map, map_unmap ] ] } + + SAMTOOLS_MERGE_UNMAP(all_unmapped_bam, fasta, fasta_fai) + + // Collate & convert unmapped + COLLATE_FASTQ_UNMAP(SAMTOOLS_MERGE_UNMAP.out.bam, fasta, interleaved) + + // Collate & convert mapped + COLLATE_FASTQ_MAP(SAMTOOLS_VIEW_MAP_MAP.out.bam, fasta, interleaved) + + // join Mapped & unmapped fastq + + reads_to_concat = COLLATE_FASTQ_MAP.out.fastq + .join(COLLATE_FASTQ_UNMAP.out.fastq, failOnDuplicate: true, failOnMismatch: true) + .map{ meta, mapped_reads, unmapped_reads -> [ meta, [ mapped_reads[0], mapped_reads[1], unmapped_reads[0], unmapped_reads[1] ] ] } + + // Concatenate Mapped_R1 with Unmapped_R1 and Mapped_R2 with Unmapped_R2 + CAT_FASTQ(reads_to_concat) + reads = CAT_FASTQ.out.reads + + // Gather versions of all tools used + versions = versions.mix(CAT_FASTQ.out.versions) + versions = versions.mix(COLLATE_FASTQ_MAP.out.versions) + versions = versions.mix(COLLATE_FASTQ_UNMAP.out.versions) + versions = versions.mix(SAMTOOLS_MERGE_UNMAP.out.versions) + versions = versions.mix(SAMTOOLS_VIEW_MAP_MAP.out.versions) + versions = versions.mix(SAMTOOLS_VIEW_MAP_UNMAP.out.versions) + versions = versions.mix(SAMTOOLS_VIEW_UNMAP_MAP.out.versions) + versions = versions.mix(SAMTOOLS_VIEW_UNMAP_UNMAP.out.versions) + + emit: + reads + + versions +} diff --git a/subworkflows/local/bam_markduplicates/main.nf b/subworkflows/local/bam_markduplicates/main.nf new file mode 100644 index 0000000..b1084f9 --- /dev/null +++ b/subworkflows/local/bam_markduplicates/main.nf @@ -0,0 +1,43 @@ +// +// MARKDUPLICATES AND QC after mapping +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { CRAM_QC_MOSDEPTH_SAMTOOLS } from '../cram_qc_mosdepth_samtools/main' +include { GATK4_MARKDUPLICATES } from '../../../modules/nf-core/gatk4/markduplicates/main' + +workflow BAM_MARKDUPLICATES { + take: + bam // channel: [mandatory] [ meta, bam ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals_bed_combined // channel: [optional] [ intervals_bed ] + + main: + versions = Channel.empty() + reports = Channel.empty() + + // RUN MARKUPDUPLICATES + GATK4_MARKDUPLICATES(bam, fasta, fasta_fai) + + // Join with the crai file + cram = GATK4_MARKDUPLICATES.out.cram.join(GATK4_MARKDUPLICATES.out.crai, failOnDuplicate: true, failOnMismatch: true) + + // QC on CRAM + CRAM_QC_MOSDEPTH_SAMTOOLS(cram, fasta, intervals_bed_combined) + + // Gather all reports generated + reports = reports.mix(GATK4_MARKDUPLICATES.out.metrics) + reports = reports.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.reports) + + // Gather versions of all tools used + versions = versions.mix(GATK4_MARKDUPLICATES.out.versions) + versions = versions.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.versions) + + emit: + cram + reports + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_merge_index_samtools/main.nf b/subworkflows/local/bam_merge_index_samtools/main.nf new file mode 100644 index 0000000..f615b1c --- /dev/null +++ b/subworkflows/local/bam_merge_index_samtools/main.nf @@ -0,0 +1,45 @@ +// +// MERGE INDEX BAM +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { SAMTOOLS_INDEX as INDEX_MERGE_BAM } from 
'../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_MERGE as MERGE_BAM } from '../../../modules/nf-core/samtools/merge/main' + +workflow BAM_MERGE_INDEX_SAMTOOLS { + take: + bam // channel: [mandatory] meta, bam + + main: + versions = Channel.empty() + + // Figuring out if there is one or more bam(s) from the same sample + bam_to_merge = bam.branch{ meta, bam -> + // bam is a list, so use bam.size() to asses number of intervals + single: bam.size() <= 1 + return [ meta, bam[0] ] + multiple: bam.size() > 1 + } + + // Only when using intervals + MERGE_BAM(bam_to_merge.multiple, [ [ id:'null' ], []], [ [ id:'null' ], []]) + + // Mix intervals and no_intervals channels together + bam_all = MERGE_BAM.out.bam.mix(bam_to_merge.single) + + // Index bam + INDEX_MERGE_BAM(bam_all) + + // Join with the bai file + bam_bai = bam_all.join(INDEX_MERGE_BAM.out.bai, failOnDuplicate: true, failOnMismatch: true) + + // Gather versions of all tools used + versions = versions.mix(INDEX_MERGE_BAM.out.versions) + versions = versions.mix(MERGE_BAM.out.versions) + + emit: + bam_bai + + versions +} diff --git a/subworkflows/local/bam_svcalling_gridss/main.nf b/subworkflows/local/bam_svcalling_gridss/main.nf new file mode 100644 index 0000000..41466f9 --- /dev/null +++ b/subworkflows/local/bam_svcalling_gridss/main.nf @@ -0,0 +1,80 @@ +// +// GRIDSS SV CALLING +// +// +// + +include { GRIDSS_GRIDSS } from '../../../modules/local/gridss/gridss/main.nf' +include { GRIDSS_SOMATIC } from '../../../modules/local/gridss/somaticFilter/main.nf' + +workflow BAM_SVCALLING_GRIDSS { + take: + cram // channel: [mandatory] [ meta, normalcram, normalcrai, tumorcram, tumorcrai ] + fasta // channel: [mandatory] reference fasta + fasta_fai // channel: [mandatory] reference fasta index + bwa_index // channel: [mandatory] bwa index path + blacklist_gridss // optional: blacklist bed file for gridss + + + main: + versions = Channel.empty() + vcf = Channel.empty() + vcf_index = Channel.empty() + assembly_bam = Channel.empty() + + GRIDSS_GRIDSS(cram, fasta, fasta_fai, bwa_index, blacklist_gridss) + + vcf = GRIDSS_GRIDSS.out.vcf + vcf_index = GRIDSS_GRIDSS.out.vcf_index + assembly_bam = GRIDSS_GRIDSS.out.assembly + + + versions = versions.mix(GRIDSS_GRIDSS.out.versions) + + emit: + vcf + vcf_index + assembly_bam + + + versions + +} + + +workflow BAM_SVCALLING_GRIDSS_SOMATIC { + take: + vcf + pondir_gridss + + main: + versions = Channel.empty() + somatic_all = Channel.empty() + somatic_high_confidence = Channel.empty() + + GRIDSS_SOMATIC(vcf, pondir_gridss) + + somatic_all = GRIDSS_SOMATIC.out.somatic_all_vcf + somatic_high_confidence = GRIDSS_SOMATIC.out.somatic_high_vcf + + versions = GRIDSS_SOMATIC.out.versions + + all_vcf = Channel.empty().mix(somatic_all, somatic_high_confidence) + + emit: + somatic_all + somatic_high_confidence + all_vcf + + versions + +} + + + + + + + + + diff --git a/subworkflows/local/bam_svcalling_svaba/main.nf b/subworkflows/local/bam_svcalling_svaba/main.nf new file mode 100644 index 0000000..d38851f --- /dev/null +++ b/subworkflows/local/bam_svcalling_svaba/main.nf @@ -0,0 +1,82 @@ +// +// SVABA SV CALLING +// +// +// + +include { SVABA } from '../../../modules/local/svaba/main.nf' + +workflow BAM_SVCALLING_SVABA { + take: + cram // channel: [mandatory] [ meta, tumorcram, tumorcrai, normalcram, normalcrai ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta index ] + bwa_index + dbsnp + dbsnp_tbi + indel_mask + germ_sv_db + simple_seq_db + error_rate + + main: + 
versions = Channel.empty() + som_sv = Channel.empty() + som_indel = Channel.empty() + germ_sv = Channel.empty() + germ_indel = Channel.empty() + raw_calls = Channel.empty() + discordants = Channel.empty() + ascii_alignments = Channel.empty() + unfiltered_germ_sv = Channel.empty() + unfiltered_germ_indel = Channel.empty() + unfiltered_som_sv = Channel.empty() + unfiltered_som_indel = Channel.empty() + + //Remapping the input based on Svaba module + cram_svaba = cram.map { meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> + [meta, tumor_cram, tumor_crai, normal_cram, normal_crai] + } + // Calling Svaba module for run + SVABA(cram_svaba, fasta, fasta_fai, bwa_index, dbsnp, dbsnp_tbi, indel_mask, germ_sv_db, simple_seq_db, error_rate) + + // getting the outputs in a channel from SVABA + som_sv = SVABA.out.som_sv + som_indel = SVABA.out.som_indel + germ_sv = SVABA.out.germ_sv + germ_indel = SVABA.out.germ_indel + raw_calls = SVABA.out.raw_calls + discordants = SVABA.out.discordants + ascii_alignments = SVABA.out.ascii_alignments + unfiltered_germ_sv = SVABA.out.unfiltered_germ_sv + unfiltered_germ_indel = SVABA.out.unfiltered_germ_indel + unfiltered_som_sv = SVABA.out.unfiltered_som_sv + unfiltered_som_indel = SVABA.out.unfiltered_som_indel + + versions = versions.mix(SVABA.out.versions) + all_output = Channel.empty().mix( + som_sv, + som_indel, + germ_sv, + germ_indel, + unfiltered_som_sv, + unfiltered_som_indel,unfiltered_germ_sv, + unfiltered_germ_indel + ) + + emit: + som_sv + som_indel + germ_sv + germ_indel + raw_calls + discordants + ascii_alignments + unfiltered_germ_sv + unfiltered_germ_indel + unfiltered_som_sv + unfiltered_som_indel + all_output + + versions +} \ No newline at end of file diff --git a/subworkflows/local/channel_align_create_csv/main.nf b/subworkflows/local/channel_align_create_csv/main.nf new file mode 100644 index 0000000..c77a06c --- /dev/null +++ b/subworkflows/local/channel_align_create_csv/main.nf @@ -0,0 +1,24 @@ +// +// CHANNEL_ALIGN_CREATE_CSV +// + +workflow CHANNEL_ALIGN_CREATE_CSV { + take: + bam_indexed // channel: [mandatory] meta, bam, bai + + main: + // Creating csv files to restart from this step + bam_indexed.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${params.outdir}/csv") { meta, bam, bai -> + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + bam = "${params.outdir}/Alignment/Mapped/${sample}/${bam.name}" + bai = "${params.outdir}/Alignment/Mapped/${sample}/${bai.name}" + + type = params.save_output_as_bam ? "bam" : "cram" + type_index = params.save_output_as_bam ? 
"bai" : "crai" + + ["alignment.csv", "patient,sex,status,sample,${type},${type_index}\n${patient},${sex},${status},${sample},${bam},${bai}\n"] + } +} diff --git a/subworkflows/local/channel_applybqsr_create_csv/main.nf b/subworkflows/local/channel_applybqsr_create_csv/main.nf new file mode 100644 index 0000000..e7ac698 --- /dev/null +++ b/subworkflows/local/channel_applybqsr_create_csv/main.nf @@ -0,0 +1,24 @@ +// +// CHANNEL_APPLYBQSR_CREATE_CSV +// + +workflow CHANNEL_APPLYBQSR_CREATE_CSV { + take: + cram_recalibrated_index // channel: [mandatory] meta, cram, crai + + main: + // Creating csv files to restart from this step + cram_recalibrated_index.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${params.outdir}/csv") { meta, file, index -> + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + file = "${params.outdir}/Alignment/recalibrated/${sample}/${file.name}" + index = "${params.outdir}/Alignment/recalibrated/${sample}/${index.name}" + + type = params.save_output_as_bam ? "bam" : "cram" + type_index = params.save_output_as_bam ? "bai" : "crai" + + ["recalibrated.csv", "patient,sex,status,sample,${type},${type_index}\n${patient},${sex},${status},${sample},${file},${index}\n"] + } +} diff --git a/subworkflows/local/channel_baserecalibrator_create_csv/main.nf b/subworkflows/local/channel_baserecalibrator_create_csv/main.nf new file mode 100644 index 0000000..c21cda2 --- /dev/null +++ b/subworkflows/local/channel_baserecalibrator_create_csv/main.nf @@ -0,0 +1,51 @@ +// +// CHANNEL_BASERECALIBRATOR_CREATE_CSV +// + +workflow CHANNEL_BASERECALIBRATOR_CREATE_CSV { + take: + cram_table_bqsr // channel: [mandatory] meta, cram, crai, table + tools + skip_tools + save_output_as_bam + outdir + + main: + // Creating csv files to restart from this step + if (!(skip_tools && (skip_tools.split(',').contains('markduplicates')))) { + cram_table_bqsr.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${outdir}/csv") { meta, cram, crai, table -> + + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + suffix_aligned = save_output_as_bam ? "bam" : "cram" + suffix_index = save_output_as_bam ? "bam.bai" : "cram.crai" + cram = "${outdir}/Alignment/markduplicates/${sample}/${cram.baseName}.${suffix_aligned}" + crai = "${outdir}/Alignment/markduplicates/${sample}/${crai.baseName.minus(".cram")}.${suffix_index}" + table = "${outdir}/Alignment/recal_table/${sample}/${sample}.recal.table" + + type = save_output_as_bam ? "bam" : "cram" + type_index = save_output_as_bam ? "bai" : "crai" + + ["markduplicates.csv", "patient,sex,status,sample,${type},${type_index},table\n${patient},${sex},${status},${sample},${cram},${crai},${table}\n"] + } + } else { + cram_table_bqsr.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${outdir}/csv") { meta, cram, crai, table -> + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + suffix_aligned = save_output_as_bam ? "bam" : "cram" + suffix_index = save_output_as_bam ? "bam.bai" : "cram.crai" + cram = "${outdir}/Alignment/${sample}/Mapped/${cram.baseName}.${suffix_aligned}" + crai = "${outdir}/Alignment/${sample}/Mapped/${crai.baseName.minus(".cram")}.${suffix_index}" + table = "${outdir}/Alignment/${sample}/recal_table/${sample}.recal.table" + + type = save_output_as_bam ? "bam" : "cram" + type_index = save_output_as_bam ? 
"bai" : "crai" + + ["sorted.csv", "patient,sex,status,sample,${type},${type_index},table\n${patient},${sex},${status},${sample},${cram},${crai},${table}\n"] + } + } +} diff --git a/subworkflows/local/channel_markduplicates_create_csv/main.nf b/subworkflows/local/channel_markduplicates_create_csv/main.nf new file mode 100644 index 0000000..b98a0ce --- /dev/null +++ b/subworkflows/local/channel_markduplicates_create_csv/main.nf @@ -0,0 +1,29 @@ +// +// CHANNEL_MARKDUPLICATES_CREATE_CSV +// + +workflow CHANNEL_MARKDUPLICATES_CREATE_CSV { + take: + cram_markduplicates // channel: [mandatory] meta, cram, crai + csv_subfolder + outdir + save_output_as_bam + + main: + // Creating csv files to restart from this step + cram_markduplicates.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${outdir}/csv") { meta, file, index -> + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + suffix_aligned = save_output_as_bam ? "bam" : "cram" + suffix_index = save_output_as_bam ? "bam.bai" : "cram.crai" + file = "${outdir}/Alignment/${csv_subfolder}/${sample}/${file.baseName}.${suffix_aligned}" + index = "${outdir}/Alignment/${csv_subfolder}/${sample}/${index.baseName.minus(".cram")}.${suffix_index}" + + type = save_output_as_bam ? "bam" : "cram" + type_index = save_output_as_bam ? "bai" : "crai" + + ["markduplicates_no_table.csv", "patient,sex,status,sample,${type},${type_index}\n${patient},${sex},${status},${sample},${file},${index}\n"] + } +} diff --git a/subworkflows/local/channel_svcalling_create_csv/main.nf b/subworkflows/local/channel_svcalling_create_csv/main.nf new file mode 100644 index 0000000..3f958b3 --- /dev/null +++ b/subworkflows/local/channel_svcalling_create_csv/main.nf @@ -0,0 +1,35 @@ +// +// CHANNEL_SV_CALLING_CREATE_CSV +// + +workflow CHANNEL_SVCALLING_CREATE_CSV { + take: + vcf_from_sv_calling // channel: [mandatory] meta, vcf + tools + outdir + + main: + if (tools && (tools.split(',').contains('svaba'))) { + + // Creating csv files to restart from this step + vcf_from_sv_calling.collectFile(keepHeader: true, skip: 1,sort: true, storeDir: "${params.outdir}/csv"){ meta, vcf -> + patient = meta.patient + sample = meta.id + sv_caller = "Svaba" + vcf = "${params.outdir}/SV_calling/SVABA/${meta.id}/${vcf.getName()}" + ["sv_calling.csv", "patient,sample,sv_caller,vcf\n${patient},${sample},${sv_caller},${vcf}\n"] + + } + } else if (tools && (tools.split(',').contains('gridss'))) { + + // Creating csv files to restart from this step + vcf_from_sv_calling.collectFile(keepHeader: true, skip: 1,sort: true, storeDir: "${params.outdir}/csv"){ meta, vcf -> + patient = meta.patient + sample = meta.id + sv_caller = "GRIDSS" + vcf = "${params.outdir}/SV_calling/GRIDSS/${meta.id}/${vcf.baseName}.gz" + ["sv_calling.csv", "patient,sample,sv_caller,vcf\n${patient},${sample},${sv_caller},${vcf}\n"] + + } + } +} \ No newline at end of file diff --git a/subworkflows/local/cram_merge_index_samtools/main.nf b/subworkflows/local/cram_merge_index_samtools/main.nf new file mode 100644 index 0000000..b808c8e --- /dev/null +++ b/subworkflows/local/cram_merge_index_samtools/main.nf @@ -0,0 +1,47 @@ +// +// MERGE INDEX CRAM +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { SAMTOOLS_INDEX as INDEX_CRAM } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_MERGE as MERGE_CRAM } from '../../../modules/nf-core/samtools/merge/main' + +workflow 
CRAM_MERGE_INDEX_SAMTOOLS { + take: + cram // channel: [mandatory] meta, cram + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fai for fasta + + main: + versions = Channel.empty() + + // Figuring out if there is one or more cram(s) from the same sample + cram_to_merge = cram.branch{ meta, cram -> + // cram is a list, so use cram.size() to asses number of intervals + single: cram.size() <= 1 + return [ meta, cram[0] ] + multiple: cram.size() > 1 + } + + // Only when using intervals + MERGE_CRAM(cram_to_merge.multiple, fasta.map{ it -> [ [ id:'fasta' ], it ] }, fasta_fai.map{ it -> [ [ id:'fasta_fai' ], it ] }) + + // Mix intervals and no_intervals channels together + cram_all = MERGE_CRAM.out.cram.mix(cram_to_merge.single) + + // Index cram + INDEX_CRAM(cram_all) + + // Join with the crai file + cram_crai = cram_all.join(INDEX_CRAM.out.crai, failOnDuplicate: true, failOnMismatch: true) + + // Gather versions of all tools used + versions = versions.mix(INDEX_CRAM.out.versions.first()) + versions = versions.mix(MERGE_CRAM.out.versions.first()) + + emit: + cram_crai + + versions +} diff --git a/subworkflows/local/cram_qc_mosdepth_samtools/main.nf b/subworkflows/local/cram_qc_mosdepth_samtools/main.nf new file mode 100644 index 0000000..fd070a6 --- /dev/null +++ b/subworkflows/local/cram_qc_mosdepth_samtools/main.nf @@ -0,0 +1,38 @@ +// +// QC on CRAM +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { SAMTOOLS_STATS } from '../../../modules/nf-core/samtools/stats/main' +include { MOSDEPTH } from '../../../modules/nf-core/mosdepth/main' + +workflow CRAM_QC_MOSDEPTH_SAMTOOLS { + take: + cram // channel: [mandatory] [ meta, cram, crai ] + fasta // channel: [mandatory] [ fasta ] + intervals + + main: + versions = Channel.empty() + reports = Channel.empty() + + // Reports run on cram + SAMTOOLS_STATS(cram, fasta.map{ it -> [ [ id:'fasta' ], it ] }) + + MOSDEPTH(cram.combine(intervals.map{ meta, bed -> [ bed?:[] ] }), fasta.map{ it -> [ [ id:'fasta' ], it ] }) + + // Gather all reports generated + reports = reports.mix(SAMTOOLS_STATS.out.stats) + reports = reports.mix(MOSDEPTH.out.global_txt) + reports = reports.mix(MOSDEPTH.out.regions_txt) + + // Gather versions of all tools used + versions = versions.mix(MOSDEPTH.out.versions) + versions = versions.mix(SAMTOOLS_STATS.out.versions.first()) + + emit: + reports + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/fastq_align_bwamem_mem2/main.nf b/subworkflows/local/fastq_align_bwamem_mem2/main.nf new file mode 100644 index 0000000..0a24a92 --- /dev/null +++ b/subworkflows/local/fastq_align_bwamem_mem2/main.nf @@ -0,0 +1,56 @@ +// +// MAPPING +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { BWAMEM2_MEM } from '../../../modules/nf-core/bwamem2/mem/main' +include { BWA_MEM as BWAMEM1_MEM } from '../../../modules/nf-core/bwa/mem/main' +//include { DRAGMAP_ALIGN } from '../../../modules/nf-core/dragmap/align/main' +//include { SENTIEON_BWAMEM } from '../../../modules/nf-core/sentieon/bwamem/main' + +workflow FASTQ_ALIGN_BWAMEM_MEM2 { + take: + reads // channel: [mandatory] meta, reads + index // channel: [mandatory] index + sort // boolean: [mandatory] true -> sort, false -> don't sort + fasta + fasta_fai + + main: + + versions = Channel.empty() + reports = Channel.empty() + + // Only one of the following 
should be run + BWAMEM1_MEM(reads, index.map{ it -> [ [ id:'index' ], it ] }, sort) // If aligner is bwa-mem + BWAMEM2_MEM(reads, index.map{ it -> [ [ id:'index' ], it ] }, sort) // If aligner is bwa-mem2 + //DRAGMAP_ALIGN(reads, index.map{ it -> [ [ id:'index' ], it ] }, sort) // If aligner is dragmap + // The sentieon-bwamem-module does sorting as part of the conversion from sam to bam. + //SENTIEON_BWAMEM(reads, index.map{ it -> [ [ id:'index' ], it ] }, fasta.map{fa -> [[:], fa]}, fasta_fai.map{fai -> [[:], fai]}) // If aligner is sentieon-bwamem + + // Get the bam files from the aligner + // Only one aligner is run + bam = Channel.empty() + bam = bam.mix(BWAMEM1_MEM.out.bam) + bam = bam.mix(BWAMEM2_MEM.out.bam) + //bam = bam.mix(DRAGMAP_ALIGN.out.bam) + //bam = bam.mix(SENTIEON_BWAMEM.out.bam_and_bai.map{ meta, bam, bai -> [ meta, bam ] }) + + //bai = SENTIEON_BWAMEM.out.bam_and_bai.map{ meta, bam, bai -> [ meta, bai ] } + + // Gather reports of all tools used + //reports = reports.mix(DRAGMAP_ALIGN.out.log) + + // Gather versions of all tools used + versions = versions.mix(BWAMEM1_MEM.out.versions) + versions = versions.mix(BWAMEM2_MEM.out.versions) + //versions = versions.mix(DRAGMAP_ALIGN.out.versions) + //versions = versions.mix(SENTIEON_BWAMEM.out.versions) + + emit: + bam // channel: [ [meta], bam ] + //bai // channel: [ [meta], bai ] + reports + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/prepare_cache/main.nf b/subworkflows/local/prepare_cache/main.nf new file mode 100644 index 0000000..0b6dc58 --- /dev/null +++ b/subworkflows/local/prepare_cache/main.nf @@ -0,0 +1,39 @@ +// +// PREPARE CACHE +// + +// Initialize channels based on params or indices that were just built +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run +// Condition is based on params.step and params.tools +// If and extra condition exists, it's specified in comments + + + +include { ENSEMBLVEP_DOWNLOAD } from '../../../modules/nf-core/ensemblvep/download/main' +include { SNPEFF_DOWNLOAD } from '../../../modules/nf-core/snpeff/download/main' + +workflow PREPARE_CACHE { + take: + ensemblvep_info + snpeff_info + + + main: + versions = Channel.empty() + + ENSEMBLVEP_DOWNLOAD(ensemblvep_info) + SNPEFF_DOWNLOAD(snpeff_info) + + // Gather versions of all tools used + versions = versions.mix(ENSEMBLVEP_DOWNLOAD.out.versions) + versions = versions.mix(SNPEFF_DOWNLOAD.out.versions) + + emit: + ensemblvep_cache = ENSEMBLVEP_DOWNLOAD.out.cache.collect() // channel: [ meta, cache ] + snpeff_cache = SNPEFF_DOWNLOAD.out.cache.collect() // channel: [ meta, cache ] + + versions // channel: [ versions.yml ] +} + + diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf new file mode 100644 index 0000000..48f5b2a --- /dev/null +++ b/subworkflows/local/prepare_genome/main.nf @@ -0,0 +1,143 @@ +// +// PREPARE GENOME +// + +// Initialize channels based on params or indices that were just built +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run +// Condition is based on params.step and params.tools +// If and extra condition exists, it's specified in comments + + +include { BWA_INDEX as BWAMEM1_INDEX } from '../../../modules/nf-core/bwa/index/main' +include { BWAMEM2_INDEX } from '../../../modules/nf-core/bwamem2/index/main' +include { GATK4_CREATESEQUENCEDICTIONARY } from 
'../../../modules/nf-core/gatk4/createsequencedictionary/main' +include { MSISENSORPRO_SCAN } from '../../../modules/nf-core/msisensorpro/scan/main' +include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx/main' +include { TABIX_TABIX as TABIX_DBSNP } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_GERMLINE_RESOURCE } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_KNOWN_SNPS } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_PON } from '../../../modules/nf-core/tabix/tabix/main' +include { UNTAR as UNTAR_CHR_DIR } from '../../../modules/nf-core/untar/main' +include { UNZIP as UNZIP_ALLELES } from '../../../modules/nf-core/unzip/main' +include { UNZIP as UNZIP_GC } from '../../../modules/nf-core/unzip/main' +include { UNZIP as UNZIP_LOCI } from '../../../modules/nf-core/unzip/main' +include { UNZIP as UNZIP_RT } from '../../../modules/nf-core/unzip/main' + + +workflow PREPARE_GENOME { + take: + ascat_alleles // channel: [optional] ascat allele files + ascat_loci // channel: [optional] ascat loci files + ascat_loci_gc // channel: [optional] ascat gc content file + ascat_loci_rt // channel: [optional] ascat replictiming file + chr_dir // channel: [optional] chromosome files + dbsnp // channel: [optional] dbsnp + fasta // channel: [mandatory] fasta + fasta_fai // channel: [optional] fasta_fai + germline_resource // channel: [optional] germline_resource + known_indels // channel: [optional] known_indels + known_snps // channel: [optional] known_snps + pon // channel: [optional] pon + + + main: + fasta = fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] } + versions = Channel.empty() + + + BWAMEM1_INDEX(fasta) // If aligner is bwa-mem + BWAMEM2_INDEX(fasta) // If aligner is bwa-mem2 + + GATK4_CREATESEQUENCEDICTIONARY(fasta) + MSISENSORPRO_SCAN(fasta) + SAMTOOLS_FAIDX(fasta, [['id':null], []]) + + + // the following are flattened and mapped in case the user supplies more than one value for the param + // written for KNOWN_INDELS, but preemptively applied to the rest + // [ file1, file2 ] becomes [ [ meta1, file1 ], [ meta2, file2 ] ] + // outputs are collected to maintain a single channel for relevant TBI files + TABIX_DBSNP(dbsnp.flatten().map{ it -> [ [ id:it.baseName ], it ] }) + TABIX_GERMLINE_RESOURCE(germline_resource.flatten().map{ it -> [ [ id:it.baseName ], it ] }) + TABIX_KNOWN_SNPS(known_snps.flatten().map{ it -> [ [ id:it.baseName ], it ] } ) + TABIX_KNOWN_INDELS(known_indels.flatten().map{ it -> [ [ id:it.baseName ], it ] } ) + TABIX_PON(pon.flatten().map{ it -> [ [ id:it.baseName ], it ] }) + + + // prepare ascat reference files + allele_files = ascat_alleles + if (params.ascat_alleles && params.ascat_alleles.endsWith('.zip')) { + UNZIP_ALLELES(ascat_alleles.map{ it -> [[id:it[0].baseName], it]}) + allele_files = UNZIP_ALLELES.out.unzipped_archive.map{ it[1] } + versions = versions.mix(UNZIP_ALLELES.out.versions) + } + + loci_files = ascat_loci + if (params.ascat_loci && params.ascat_loci.endsWith('.zip')) { + UNZIP_LOCI(ascat_loci.map{ it -> [[id:it[0].baseName], it]}) + loci_files = UNZIP_LOCI.out.unzipped_archive.map{ it[1] } + versions = versions.mix(UNZIP_LOCI.out.versions) + } + gc_file = ascat_loci_gc + if (params.ascat_loci_gc && params.ascat_loci_gc.endsWith('.zip')) { + UNZIP_GC(ascat_loci_gc.map{ it -> [[id:it[0].baseName], it]}) + gc_file = 
UNZIP_GC.out.unzipped_archive.map{ it[1] } + versions = versions.mix(UNZIP_GC.out.versions) + } + rt_file = ascat_loci_rt + if (params.ascat_loci_rt && params.ascat_loci_rt.endsWith('.zip')) { + UNZIP_RT(ascat_loci_rt.map{ it -> [[id:it[0].baseName], it]}) + rt_file = UNZIP_RT.out.unzipped_archive.map{ it[1] } + versions = versions.mix(UNZIP_RT.out.versions) + } + + + chr_files = chr_dir + if (params.chr_dir && params.chr_dir.endsWith('tar.gz')) { + UNTAR_CHR_DIR(chr_dir.map{ it -> [ [ id:'chr_dir' ], it ] }) + chr_files = UNTAR_CHR_DIR.out.untar.map{ it[1] } + versions = versions.mix(UNTAR_CHR_DIR.out.versions) + } + + + // Gather versions of all tools used + versions = versions.mix(SAMTOOLS_FAIDX.out.versions) + versions = versions.mix(BWAMEM1_INDEX.out.versions) + versions = versions.mix(BWAMEM2_INDEX.out.versions) + versions = versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) + versions = versions.mix(MSISENSORPRO_SCAN.out.versions) + versions = versions.mix(TABIX_DBSNP.out.versions) + versions = versions.mix(TABIX_GERMLINE_RESOURCE.out.versions) + versions = versions.mix(TABIX_KNOWN_SNPS.out.versions) + versions = versions.mix(TABIX_KNOWN_INDELS.out.versions) + versions = versions.mix(TABIX_PON.out.versions) + + + + emit: + bwa = BWAMEM1_INDEX.out.index.map{ meta, index -> [index] }.collect() // path: bwa/* + bwamem2 = BWAMEM2_INDEX.out.index.map{ meta, index -> [index] }.collect() // path: bwamem2/* + dbsnp_tbi = TABIX_DBSNP.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: dbsnb.vcf.gz.tbi + dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict + fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai + germline_resource_tbi = TABIX_GERMLINE_RESOURCE.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: germline_resource.vcf.gz.tbi + known_snps_tbi = TABIX_KNOWN_SNPS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi + known_indels_tbi = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi + msisensorpro_scan = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] } // path: genome_msi.list + pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi + allele_files + chr_files + gc_file + loci_files + rt_file + + versions // channel: [ versions.yml ] +} + + + + + + diff --git a/subworkflows/local/prepare_intervals/main.nf b/subworkflows/local/prepare_intervals/main.nf new file mode 100644 index 0000000..f4079e3 --- /dev/null +++ b/subworkflows/local/prepare_intervals/main.nf @@ -0,0 +1,113 @@ +// +// PREPARE INTERVALS +// + +// Initialize channels based on params or indices that were just built +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { BUILD_INTERVALS } from '../../../modules/local/build_intervals/main' +include { CREATE_INTERVALS_BED } from '../../../modules/local/create_intervals_bed/main' +include { GATK4_INTERVALLISTTOBED } from '../../../modules/nf-core/gatk4/intervallisttobed/main' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../../modules/nf-core/tabix/bgziptabix/main' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_COMBINED } from '../../../modules/nf-core/tabix/bgziptabix/main' + +workflow PREPARE_INTERVALS { + take: + fasta_fai // mandatory [ fasta_fai ] + intervals // [ params.intervals ] + no_intervals // [ params.no_intervals ] + + main: + versions = Channel.empty() + + 
intervals_bed = Channel.empty() // List of [ bed, num_intervals ], one for each region + intervals_bed_gz_tbi = Channel.empty() // List of [ bed.gz, bed,gz.tbi, num_intervals ], one for each region + intervals_combined = Channel.empty() // Single bed file containing all intervals + + if (no_intervals) { + file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" + file("${params.outdir}/no_intervals.bed.gz").text = "no_intervals\n" + file("${params.outdir}/no_intervals.bed.gz.tbi").text = "no_intervals\n" + + intervals_bed = Channel.fromPath(file("${params.outdir}/no_intervals.bed")).map{ it -> [ it, 0 ] } + intervals_bed_gz_tbi = Channel.fromPath(file("${params.outdir}/no_intervals.bed.{gz,gz.tbi}")).collect().map{ it -> [ it, 0 ] } + intervals_combined = Channel.fromPath(file("${params.outdir}/no_intervals.bed")).map{ it -> [ [ id:it.simpleName ], it ] } + } else if (params.step != 'annotate' && params.step != 'controlfreec') { + // If no interval/target file is provided, then generated intervals from FASTA file + if (!intervals) { + BUILD_INTERVALS(fasta_fai.map{it -> [ [ id:it.baseName ], it ] }) + + intervals_combined = BUILD_INTERVALS.out.bed + + CREATE_INTERVALS_BED(intervals_combined.map{ meta, path -> path }).bed + + intervals_bed = CREATE_INTERVALS_BED.out.bed + + versions = versions.mix(BUILD_INTERVALS.out.versions) + versions = versions.mix(CREATE_INTERVALS_BED.out.versions) + } else { + intervals_combined = Channel.fromPath(file(intervals)).map{it -> [ [ id:it.baseName ], it ] } + intervals_bed = CREATE_INTERVALS_BED(file(intervals)).bed + + versions = versions.mix(CREATE_INTERVALS_BED.out.versions) + + // If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format + if (intervals.endsWith(".interval_list")) { + GATK4_INTERVALLISTTOBED(intervals_combined) + intervals_combined = GATK4_INTERVALLISTTOBED.out.bed + versions = versions.mix(GATK4_INTERVALLISTTOBED.out.versions) + } + } + + // Now for the intervals.bed the following operations are done: + // 1. Intervals file is split up into multiple bed files for scatter/gather + // 2. Each bed file is indexed + + // 1. Intervals file is split up into multiple bed files for scatter/gather & grouping together small intervals + intervals_bed = intervals_bed.flatten() + .map{ intervalFile -> + def duration = 0.0 + for (line in intervalFile.readLines()) { + final fields = line.split('\t') + if (fields.size() >= 5) duration += fields[4].toFloat() + else { + start = fields[1].toInteger() + end = fields[2].toInteger() + duration += (end - start) / params.nucleotides_per_second + } + } + [ duration, intervalFile ] + }.toSortedList({ a, b -> b[0] <=> a[0] }) + .flatten().collate(2).map{ duration, intervalFile -> intervalFile }.collect() + // Adding number of intervals as elements + .map{ it -> [ it, it.size() ] } + .transpose() + + // 2. Create bed.gz and bed.gz.tbi for each interval file. 
They are split by region (see above) + TABIX_BGZIPTABIX_INTERVAL_SPLIT(intervals_bed.map{ file, num_intervals -> [ [ id:file.baseName], file ] }) + + intervals_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_SPLIT.out.gz_tbi.map{ meta, bed, tbi -> [ bed, tbi ] }.toList() + // Adding number of intervals as elements + .map{ it -> [ it, it.size() ] } + .transpose() + + versions = versions.mix(TABIX_BGZIPTABIX_INTERVAL_SPLIT.out.versions) + } + + TABIX_BGZIPTABIX_INTERVAL_COMBINED(intervals_combined) + versions = versions.mix(TABIX_BGZIPTABIX_INTERVAL_COMBINED.out.versions) + + intervals_bed_combined = intervals_combined.map{meta, bed -> bed }.collect() + intervals_bed_gz_tbi_combined = TABIX_BGZIPTABIX_INTERVAL_COMBINED.out.gz_tbi.map{meta, gz, tbi -> [gz, tbi] }.collect() + + emit: + // Intervals split for parallel execution + intervals_bed // [ intervals.bed, num_intervals ] + intervals_bed_gz_tbi // [ intervals.bed.gz, intervals.bed.gz.tbi, num_intervals ] + // All intervals in one file + intervals_bed_combined // [ intervals.bed ] + intervals_bed_gz_tbi_combined // [ intervals.bed.gz, intervals.bed.gz.tbi] + + versions // [ versions.yml ] +} diff --git a/workflows/.heisenbio.nf.swp b/workflows/.heisenbio.nf.swp new file mode 100644 index 0000000..479c0f5 Binary files /dev/null and b/workflows/.heisenbio.nf.swp differ diff --git a/workflows/heisenbio.nf b/workflows/heisenbio.nf index 808761e..ebea1de 100644 --- a/workflows/heisenbio.nf +++ b/workflows/heisenbio.nf @@ -4,8 +4,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' - +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' def summary_params = paramsSummaryMap(workflow) @@ -13,18 +12,182 @@ def summary_params = paramsSummaryMap(workflow) // Print parameter summary log to screen log.info logo + paramsSummaryLog(workflow) + citation +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + VALIDATE INPUTS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// Check input path parameters to see if they exist +def checkPathParamList = [ + params.ascat_alleles, + params.ascat_loci, + params.ascat_loci_gc, + params.ascat_loci_rt, + params.bwa, + params.bwamem2, + params.cf_chrom_len, + params.chr_dir, + params.cnvkit_reference, + params.dbnsfp, + params.dbnsfp_tbi, + params.dbsnp, + params.dbsnp_tbi, + params.dict, + params.fasta, + params.fasta_fai, + params.germline_resource, + params.germline_resource_tbi, + params.input, + params.intervals, + params.known_indels, + params.known_indels_tbi, + params.known_snps, + params.known_snps_tbi, + params.mappability, + params.multiqc_config, + params.pon, + params.pon_tbi +] +// only check if we are using the tools +if (params.tools && params.tools.contains("snpeff")) checkPathParamList.add(params.snpeff_cache) +if (params.tools && params.tools.contains("vep")) checkPathParamList.add(params.vep_cache) + +if (params.tools && params.tools.contains("svaba")) checkPathParamList.add(params.indel_mask) +if (params.tools && params.tools.contains("svaba")) checkPathParamList.add(params.germ_sv_db) +if (params.tools && params.tools.contains("svaba")) checkPathParamList.add(params.simple_seq_db) + +if (params.tools && params.tools.contains("gridss")) 
checkPathParamList.add(params.blacklist_gridss) +if (params.tools && params.tools.contains("gridss")) checkPathParamList.add(params.pon_gridss) + + +// Validate input parameters WorkflowHeisenbio.initialise(params, log) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - CONFIG FILES + Check mandatory parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() -ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() -ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) +for (param in checkPathParamList) if (param) file(param, checkIfExists: true) + +// Set input, can either be from --input or from automatic retrieval in WorkflowSarek.groovy + +if (params.input) { + ch_from_samplesheet = params.build_only_index ? Channel.empty() : Channel.fromSamplesheet("input") +} else { + ch_from_samplesheet = params.build_only_index ? Channel.empty() : Channel.fromSamplesheet("input_restart") +} + +input_sample = ch_from_samplesheet + .map{ meta, fastq_1, fastq_2, table, cram, crai, bam, bai, vcf, variantcaller -> + // generate patient_sample key to group lanes together + [ meta.patient + meta.sample, [meta, fastq_1, fastq_2, table, cram, crai, bam, bai, vcf, variantcaller] ] + } + .tap{ ch_with_patient_sample } // save the channel + .groupTuple() //group by patient_sample to get all lanes + .map { patient_sample, ch_items -> + // get number of lanes per sample + [ patient_sample, ch_items.size() ] + } + .combine(ch_with_patient_sample, by: 0) // for each entry add numLanes + .map { patient_sample, num_lanes, ch_items -> + + (meta, fastq_1, fastq_2, table, cram, crai, bam, bai, vcf, variantcaller) = ch_items + if (meta.lane && fastq_2) { + meta = meta + [id: "${meta.sample}-${meta.lane}".toString()] + def CN = params.seq_center ? "CN:${params.seq_center}\\t" : '' + + def flowcell = flowcellLaneFromFastq(fastq_1) + // Don't use a random element for ID, it breaks resuming + def read_group = "\"@RG\\tID:${flowcell}.${meta.sample}.${meta.lane}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\"" + + meta = meta - meta.subMap('lane') + [num_lanes: num_lanes.toInteger(), read_group: read_group.toString(), data_type: 'fastq', size: 1] + + if (params.step == 'alignment') return [ meta, [ fastq_1, fastq_2 ] ] + else { + error("Samplesheet contains fastq files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // start from BAM + } else if (meta.lane && bam) { + if (params.step != 'alignment' && !bai) { + error("BAM index (bai) should be provided.") + } + meta = meta + [id: "${meta.sample}-${meta.lane}".toString()] + def CN = params.seq_center ? 
"CN:${params.seq_center}\\t" : '' + def read_group = "\"@RG\\tID:${meta.sample}_${meta.lane}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\"" + + meta = meta - meta.subMap('lane') + [num_lanes: num_lanes.toInteger(), read_group: read_group.toString(), data_type: 'bam', size: 1] + + if (params.step != 'annotate') return [ meta - meta.subMap('lane'), bam, bai ] + else { + error("Samplesheet contains bam files but step is `annotate`. The pipeline is expecting vcf files for the annotation. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // recalibration + } else if (table && cram) { + meta = meta + [id: meta.sample, data_type: 'cram'] + + if (!(params.step == 'alignment' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), cram, crai, table ] + else { + error("Samplesheet contains cram files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // recalibration when skipping MarkDuplicates + } else if (table && bam) { + meta = meta + [id: meta.sample, data_type: 'bam'] + + if (!(params.step == 'alignment' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), bam, bai, table ] + else { + error("Samplesheet contains bam files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // prepare_recalibration or variant_calling + } else if (cram) { + meta = meta + [id: meta.sample, data_type: 'cram'] + + if (!(params.step == 'alignment' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), cram, crai ] + else { + error("Samplesheet contains cram files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // prepare_recalibration when skipping MarkDuplicates or `--step markduplicates` + } else if (bam) { + meta = meta + [id: meta.sample, data_type: 'bam'] + + if (!(params.step == 'alignment' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), bam, bai ] + else { + error("Samplesheet contains bam files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // annotation + } else if (vcf) { + meta = meta + [id: meta.sample, data_type: 'vcf', variantcaller: variantcaller ?: ''] + + if (params.step == 'annotate') return [ meta - meta.subMap('lane'), vcf ] + else { + error("Samplesheet contains vcf files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + } else { + error("Missing or unknown field in csv file header. Please check your samplesheet") + } + } +if (!params.dbsnp && !params.known_indels) { + if (params.step in ['alignment', 'markduplicates', 'prepare_recalibration', 'recalibrate'] && (!params.skip_tools || (params.skip_tools && !params.skip_tools.split(',').contains('baserecalibrator')))) { + log.warn "Base quality score recalibration requires at least one resource file. Please provide at least one of `--dbsnp` or `--known_indels`\nYou can skip this step in the workflow by adding `--skip_tools baserecalibrator` to the command." 
+    }
+    if (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper'))) {
+        log.warn "If GATK's HaplotypeCaller or Sentieon's Haplotyper is specified without `--dbsnp` or `--known_indels`, no filtering will be done. For filtering, please provide at least one of `--dbsnp` or `--known_indels`.\nFor more information see FilterVariantTranches (single-sample, default): https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches\nFor more information see VariantRecalibrator (--joint_germline): https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator\nFor more information on GATK Best Practices germline variant calling: https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-"
+    }
+}
+
+
+
+if ((params.download_cache) && (params.snpeff_cache || params.vep_cache)) {
+    error("Please specify either `--download_cache` or `--snpeff_cache`, `--vep_cache`.\nhttps://nf-co.re/sarek/dev/usage#how-to-customise-snpeff-and-vep-annotation")
+}

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -32,84 +195,667 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */

-//
-// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
-//
-include { INPUT_CHECK } from '../subworkflows/local/input_check'
+// Initialize file channels based on params, defined in the params.genomes[params.genome] scope
+ascat_alleles = params.ascat_alleles ? Channel.fromPath(params.ascat_alleles).collect() : Channel.empty()
+ascat_loci = params.ascat_loci ? Channel.fromPath(params.ascat_loci).collect() : Channel.empty()
+ascat_loci_gc = params.ascat_loci_gc ? Channel.fromPath(params.ascat_loci_gc).collect() : Channel.value([])
+ascat_loci_rt = params.ascat_loci_rt ? Channel.fromPath(params.ascat_loci_rt).collect() : Channel.value([])
+cf_chrom_len = params.cf_chrom_len ? Channel.fromPath(params.cf_chrom_len).collect() : []
+chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : Channel.value([])
+dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([])
+fasta = params.fasta ? Channel.fromPath(params.fasta).first() : Channel.empty()
+fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : Channel.empty()
+germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.value([]) // Mutect2 does not require a germline resource, so set to optional input
+known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([])
+known_snps = params.known_snps ? Channel.fromPath(params.known_snps).collect() : Channel.value([])
+mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : Channel.value([])
+pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.value([]) // PON is optional for Mutect2 (but highly recommended)
+indel_mask = params.indel_mask ? Channel.fromPath(params.indel_mask).collect() : Channel.empty() // This is the indel mask for SVABA
+germ_sv_db = params.germ_sv_db ? Channel.fromPath(params.germ_sv_db).collect() : Channel.empty() // This is the germline SV mask for Svaba
+simple_seq_db = params.simple_seq_db ?
Channel.fromPath(params.simple_seq_db).collect() : Channel.empty() // This is the file containing sites of simple DNA that can confuse the contig re-alignment for SVABA +blacklist_gridss = params.blacklist_gridss ? Channel.fromPath(params.blacklist_gridss).collect() : Channel.empty() // This is the mask for gridss SV calls +pon_gridss = params.pon_gridss ? Channel.fromPath(params.pon_gridss).collect() : Channel.empty() //This is the pon directory for GRIDSS SOMATIC. (MUST CONTAIN .bed and .bedpe files) +// Initialize value channels based on params, defined in the params.genomes[params.genome] scope +ascat_genome = params.ascat_genome ?: Channel.empty() +dbsnp_vqsr = params.dbsnp_vqsr ? Channel.value(params.dbsnp_vqsr) : Channel.empty() +known_indels_vqsr = params.known_indels_vqsr ? Channel.value(params.known_indels_vqsr) : Channel.empty() +known_snps_vqsr = params.known_snps_vqsr ? Channel.value(params.known_snps_vqsr) : Channel.empty() +snpeff_db = params.snpeff_db ?: Channel.empty() +vep_cache_version = params.vep_cache_version ?: Channel.empty() +vep_genome = params.vep_genome ?: Channel.empty() +vep_species = params.vep_species ?: Channel.empty() +error_rate = params.error_rate ?: Channel.empty() + +// Initialize files channels based on params, not defined within the params.genomes[params.genome] scope +if (params.snpeff_cache && params.tools && params.tools.contains("snpeff")) { + def snpeff_annotation_cache_key = params.use_annotation_cache_keys ? "${params.snpeff_genome}.${params.snpeff_db}/" : "" + def snpeff_cache_dir = "${snpeff_annotation_cache_key}${params.snpeff_genome}.${params.snpeff_db}" + def snpeff_cache_path_full = file("$params.snpeff_cache/$snpeff_cache_dir", type: 'dir') + if ( !snpeff_cache_path_full.exists() || !snpeff_cache_path_full.isDirectory() ) { + error("Files within --snpeff_cache invalid. Make sure there is a directory named ${snpeff_cache_dir} in ${params.snpeff_cache}.\nhttps://nf-co.re/sarek/dev/usage#how-to-customise-snpeff-and-vep-annotation") + } + snpeff_cache = Channel.fromPath(file("${params.snpeff_cache}/${snpeff_annotation_cache_key}"), checkIfExists: true).collect() + .map{ cache -> [ [ id:"${params.snpeff_genome}.${params.snpeff_db}" ], cache ] } +} else snpeff_cache = [] + +if (params.vep_cache && params.tools && params.tools.contains("vep")) { + def vep_annotation_cache_key = params.use_annotation_cache_keys ? "${params.vep_cache_version}_${params.vep_genome}/" : "" + def vep_cache_dir = "${vep_annotation_cache_key}${params.vep_species}/${params.vep_cache_version}_${params.vep_genome}" + def vep_cache_path_full = file("$params.vep_cache/$vep_cache_dir", type: 'dir') + if ( !vep_cache_path_full.exists() || !vep_cache_path_full.isDirectory() ) { + error("Files within --vep_cache invalid. 
Make sure there is a directory named ${vep_cache_dir} in ${params.vep_cache}.\nhttps://nf-co.re/sarek/dev/usage#how-to-customise-snpeff-and-vep-annotation") + } + vep_cache = Channel.fromPath(file("${params.vep_cache}/${vep_annotation_cache_key}"), checkIfExists: true).collect() +} else vep_cache = [] + +vep_extra_files = [] + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT NF-CORE MODULES/SUBWORKFLOWS + IMPORT LOCAL/NF-CORE MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// MODULE: Installed directly from nf-core/modules -// -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +// Create samplesheets to restart from different steps +include { CHANNEL_ALIGN_CREATE_CSV } from '../subworkflows/local/channel_align_create_csv/main' +include { CHANNEL_MARKDUPLICATES_CREATE_CSV } from '../subworkflows/local/channel_markduplicates_create_csv/main' +include { CHANNEL_BASERECALIBRATOR_CREATE_CSV } from '../subworkflows/local/channel_baserecalibrator_create_csv/main' +include { CHANNEL_APPLYBQSR_CREATE_CSV } from '../subworkflows/local/channel_applybqsr_create_csv/main' +include { CHANNEL_SVCALLING_CREATE_CSV } from '../subworkflows/local/channel_svcalling_create_csv/main' + +// Download annotation cache if needed +include { PREPARE_CACHE } from '../subworkflows/local/prepare_cache/main' + +// Build indices if needed +include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome/main' + +// Build intervals if needed +include { PREPARE_INTERVALS } from '../subworkflows/local/prepare_intervals/main' + +// Convert BAM files to FASTQ files +include { BAM_CONVERT_SAMTOOLS as CONVERT_FASTQ_INPUT } from '../subworkflows/local/bam_convert_samtools/main' + +// Run FASTQC +include { FASTQC } from '../modules/nf-core/fastqc/main' + +// TRIM/SPLIT FASTQ Files +include { FASTP } from '../modules/nf-core/fastp/main' + +// Loading the MULTIQC module +include { MULTIQC } from '../modules/nf-core/multiqc/main' + +// Loading the module that dumps the versions of software being used +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +// Map input reads to reference genome +include { FASTQ_ALIGN_BWAMEM_MEM2 } from '../subworkflows/local/fastq_align_bwamem_mem2/main' + +// Merge and index BAM files (optional) +include { BAM_MERGE_INDEX_SAMTOOLS } from '../subworkflows/local/bam_merge_index_samtools/main' + +// Convert BAM files +include { SAMTOOLS_CONVERT as BAM_TO_CRAM } from '../modules/nf-core/samtools/convert/main' +include { SAMTOOLS_CONVERT as BAM_TO_CRAM_MAPPING } from '../modules/nf-core/samtools/convert/main' + +// Convert CRAM files (optional) +include { SAMTOOLS_CONVERT as CRAM_TO_BAM } from '../modules/nf-core/samtools/convert/main' +include { SAMTOOLS_CONVERT as CRAM_TO_BAM_RECAL } from '../modules/nf-core/samtools/convert/main' + +// Mark Duplicates (+QC) +include { BAM_MARKDUPLICATES } from '../subworkflows/local/bam_markduplicates/main' + +// QC on CRAM +include { CRAM_QC_MOSDEPTH_SAMTOOLS as CRAM_QC_NO_MD } from '../subworkflows/local/cram_qc_mosdepth_samtools/main' +include { CRAM_QC_MOSDEPTH_SAMTOOLS as CRAM_QC_RECAL } from '../subworkflows/local/cram_qc_mosdepth_samtools/main' + +// Create recalibration tables +include { BAM_BASERECALIBRATOR } from 
'../subworkflows/local/bam_baserecalibrator/main' + +// Create recalibrated cram files to use for variant calling (+QC) +include { BAM_APPLYBQSR } from '../subworkflows/local/bam_applybqsr/main' + +// Svaba +include { BAM_SVCALLING_SVABA } from '../subworkflows/local/bam_svcalling_svaba/main' + +//GRIDSS +include { BAM_SVCALLING_GRIDSS } from '../subworkflows/local/bam_svcalling_gridss/main' +include { BAM_SVCALLING_GRIDSS_SOMATIC } from '../subworkflows/local/bam_svcalling_gridss/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// Info required for completion email and summary -def multiqc_report = [] - workflow HEISENBIO { - ch_versions = Channel.empty() - - // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files - // - INPUT_CHECK ( - file(params.input) - ) - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - // ! There is currently no tooling to help you write a sample sheet schema - - // - // MODULE: Run FastQC - // - FASTQC ( - INPUT_CHECK.out.reads - ) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) - - // - // MODULE: MultiQC - // - workflow_summary = WorkflowHeisenbio.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) - - methods_description = WorkflowHeisenbio.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) - ch_methods_description = Channel.value(methods_description) - - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - multiqc_report = MULTIQC.out.report.toList() + // MULTIQC + ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) + ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() + ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() + ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + + // To gather all QC reports for MultiQC + reports = Channel.empty() + // To gather used softwares versions for MultiQC + versions = Channel.empty() + + // Download cache if needed + // Assuming that if the cache is provided, the user has already downloaded it + ensemblvep_info = params.vep_cache ? 
[] : Channel.of([ [ id:"${params.vep_cache_version}_${params.vep_genome}" ], params.vep_genome, params.vep_species, params.vep_cache_version ]) + snpeff_info = params.snpeff_cache ? [] : Channel.of([ [ id:"${params.snpeff_genome}.${params.snpeff_db}" ], params.snpeff_genome, params.snpeff_db ]) + + if (params.download_cache) { + PREPARE_CACHE(ensemblvep_info, snpeff_info) + snpeff_cache = PREPARE_CACHE.out.snpeff_cache + vep_cache = PREPARE_CACHE.out.ensemblvep_cache.map{ meta, cache -> [ cache ] } + + versions = versions.mix(PREPARE_CACHE.out.versions) + } + // Build indices if needed + PREPARE_GENOME( + ascat_alleles, + ascat_loci, + ascat_loci_gc, + ascat_loci_rt, + chr_dir, + dbsnp, + fasta, + fasta_fai, + germline_resource, + known_indels, + known_snps, + pon) + + // Gather built indices or get them from the params + // Built from the fasta file: + dict = params.dict ? Channel.fromPath(params.dict).map{ it -> [ [id:'dict'], it ] }.collect() + : PREPARE_GENOME.out.dict + fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() + : PREPARE_GENOME.out.fasta_fai + bwa = params.bwa ? Channel.fromPath(params.bwa).collect() + : PREPARE_GENOME.out.bwa + bwamem2 = params.bwamem2 ? Channel.fromPath(params.bwamem2).collect() + : PREPARE_GENOME.out.bwamem2 + + // Gather index for mapping given the chosen aligner + index_alignement = (params.aligner == "bwa-mem") ? bwa : + params.aligner == "bwa-mem2" ? bwamem2 : null + + // TODO: add a params for msisensorpro_scan + msisensorpro_scan = PREPARE_GENOME.out.msisensorpro_scan + + // For ASCAT, extracted from zip or tar.gz files: + allele_files = PREPARE_GENOME.out.allele_files + chr_files = PREPARE_GENOME.out.chr_files + gc_file = PREPARE_GENOME.out.gc_file + loci_files = PREPARE_GENOME.out.loci_files + rt_file = PREPARE_GENOME.out.rt_file + + // Tabix indexed vcf files: + dbsnp_tbi = params.dbsnp ? params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : PREPARE_GENOME.out.dbsnp_tbi : Channel.value([]) + germline_resource_tbi = params.germline_resource ? params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : PREPARE_GENOME.out.germline_resource_tbi : [] //do not change to Channel.value([]), the check for its existence then fails for Getpileupsumamries + known_indels_tbi = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : PREPARE_GENOME.out.known_indels_tbi : Channel.value([]) + known_snps_tbi = params.known_snps ? params.known_snps_tbi ? Channel.fromPath(params.known_snps_tbi).collect() : PREPARE_GENOME.out.known_snps_tbi : Channel.value([]) + pon_tbi = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : PREPARE_GENOME.out.pon_tbi : Channel.value([]) + + // known_sites is made by grouping both the dbsnp and the known snps/indels resources + // Which can either or both be optional + known_sites_indels = dbsnp.concat(known_indels).collect() + known_sites_indels_tbi = dbsnp_tbi.concat(known_indels_tbi).collect() + + known_sites_snps = dbsnp.concat(known_snps).collect() + known_sites_snps_tbi = dbsnp_tbi.concat(known_snps_tbi).collect() + + // Build intervals if needed + PREPARE_INTERVALS(fasta_fai, params.intervals, params.no_intervals) + + // Intervals for speed up preprocessing/variant calling by spread/gather + // [interval.bed] all intervals in one file + intervals_bed_combined = params.no_intervals ? 
Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined + intervals_bed_gz_tbi_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_gz_tbi_combined + + // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS) + intervals_for_preprocessing = params.wes ? + intervals_bed_combined.map{it -> [ [ id:it.baseName ], it ]}.collect() : + Channel.value([ [ id:'null' ], [] ]) + + intervals = PREPARE_INTERVALS.out.intervals_bed // [ interval, num_intervals ] multiple interval.bed files, divided by useful intervals for scatter/gather + intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [ interval_bed, tbi, num_intervals ] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather + + intervals_and_num_intervals = intervals.map{ interval, num_intervals -> + if ( num_intervals < 1 ) [ [], num_intervals ] + else [ interval, num_intervals ] + } + + intervals_bed_gz_tbi_and_num_intervals = intervals_bed_gz_tbi.map{ intervals, num_intervals -> + if ( num_intervals < 1 ) [ [], [], num_intervals ] + else [ intervals[0], intervals[1], num_intervals ] + } + + // Gather used softwares versions + versions = versions.mix(PREPARE_GENOME.out.versions) + versions = versions.mix(PREPARE_INTERVALS.out.versions) + + if (params.step == 'alignment') { + + // Figure out if input is bam or fastq + input_sample_type = input_sample.branch{ + bam: it[0].data_type == "bam" + fastq: it[0].data_type == "fastq" + } + + // Convert any bam input to fastq + // fasta are not needed when converting bam to fastq -> [ id:"fasta" ], [] + // No need for fasta.fai -> [] + interleave_input = false // Currently don't allow interleaved input + CONVERT_FASTQ_INPUT( + input_sample_type.bam, + [ [ id:"fasta" ], [] ], // fasta + [ [ id:'null' ], [] ], // fasta_fai + interleave_input) + + // Gather fastq (inputed or converted) + // Theorically this could work on mixed input (fastq for one sample and bam for another) + // But not sure how to handle that with the samplesheet + // Or if we really want users to be able to do that + input_fastq = input_sample_type.fastq.mix(CONVERT_FASTQ_INPUT.out.reads) + //input_fastq.view() + // STEP 0: QC & TRIMMING + // `--skip_tools fastqc` to skip fastqc + // Trim only with `--trim_fastq` + // Additional options to be set up + + // QC + if (!(params.skip_tools && params.skip_tools.split(',').contains('fastqc'))) { + FASTQC(input_fastq) + + reports = reports.mix(FASTQC.out.zip.collect{ meta, logs -> logs }) + versions = versions.mix(FASTQC.out.versions.first()) + } + + //skipping the UMI Conscensus calling step for now + reads_for_fastp = input_fastq + + // Trimming and/or splitting + if (params.trim_fastq && params.split_fastq > 0) { + log.warn "You have mentioned trim_fastq to `$params.trim_fastq`, will do trimming" + save_trimmed_fail = false + save_merged = false + FASTP( + reads_for_fastp, + [], // we are not using any adapter fastas at the moment + save_trimmed_fail, + save_merged + ) + + reports = reports.mix(FASTP.out.json.collect{ meta, json -> json }) + reports = reports.mix(FASTP.out.html.collect{ meta, html -> html }) + + if (params.split_fastq) { + reads_for_alignment = FASTP.out.reads.map{ meta, reads -> + read_files = reads.sort(false) { a,b -> a.getName().tokenize('.')[0] <=> b.getName().tokenize('.')[0] }.collate(2) + [ meta + [ size:read_files.size() ], read_files ] + }.transpose() + } else reads_for_alignment = FASTP.out.reads + + versions = 
versions.mix(FASTP.out.versions) + + } else { + println "Skipping trimming since trim_fastq is false" + reads_for_alignment = reads_for_fastp + } + + // STEP 1: MAPPING READS TO REFERENCE GENOME + // reads will be sorted + reads_for_alignment = reads_for_alignment.map{ meta, reads -> + // Update meta.id to meta.sample no multiple lanes or splitted fastqs + if (meta.size * meta.num_lanes == 1) [ meta + [ id:meta.sample ], reads ] + else [ meta, reads ] + } + + //reads_for_alignment.view() + + println "Starting Alignment Process..." + sort_bam = true + //fasta.view() + //fasta_fai.view() + //index_alignement.view() + FASTQ_ALIGN_BWAMEM_MEM2(reads_for_alignment, index_alignement, sort_bam, fasta, fasta_fai) + // Grouping the bams from the same samples not to stall the workflow + bam_mapped = FASTQ_ALIGN_BWAMEM_MEM2.out.bam.map{ meta, bam -> + + // Update meta.id to be meta.sample, ditching sample-lane that is not needed anymore + // Update meta.data_type + // Remove no longer necessary fields: + // read_group: Now in the BAM header + // num_lanes: only needed for mapping + // size: only needed for mapping + + // Use groupKey to make sure that the correct group can advance as soon as it is complete + // and not stall the workflow until all reads from all channels are mapped + [ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bam', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bam ] + }.groupTuple() + + if ( + params.save_mapped || + ( + (params.skip_tools && params.skip_tools.split(',').contains('markduplicates')) && + !(params.tools && params.tools.split(',').contains('sentieon_dedup')) + ) + ) { + // bams are merged (when multiple lanes from the same sample), indexed and then converted to cram + BAM_MERGE_INDEX_SAMTOOLS(bam_mapped) + + BAM_TO_CRAM_MAPPING(BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai, fasta, fasta_fai) + // Create CSV to restart from this step + params.save_output_as_bam ? CHANNEL_ALIGN_CREATE_CSV(BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai) : CHANNEL_ALIGN_CREATE_CSV(BAM_TO_CRAM_MAPPING.out.alignment_index) + + // Gather used softwares versions + versions = versions.mix(BAM_MERGE_INDEX_SAMTOOLS.out.versions) + versions = versions.mix(BAM_TO_CRAM_MAPPING.out.versions) + } + + // Gather used softwares versions + versions = versions.mix(CONVERT_FASTQ_INPUT.out.versions) + versions = versions.mix(FASTQ_ALIGN_BWAMEM_MEM2.out.versions) + } + + if (params.step in ['alignment', 'markduplicates']) { + + // ch_cram_no_markduplicates_restart = Channel.empty() + cram_markduplicates_no_spark = Channel.empty() + + // STEP 2: markduplicates (+QC) + convert to CRAM + // ch_bam_for_markduplicates will contain bam mapped with FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON when step is mapping + // Or bams that are specified in the samplesheet.csv when step is prepare_recalibration + cram_for_markduplicates = params.step == 'alignment' ? bam_mapped : input_sample.map{ meta, input, index -> [ meta, input ] } + // if no MD is done, then run QC on mapped & converted CRAM files + // or the input BAM (+converted) or CRAM files + cram_skip_markduplicates = Channel.empty() + + // Should it be possible to restart from converted crams? 
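The lane-grouping above relies on Nextflow's groupKey so that groupTuple can emit a sample as soon as all of its mapped lanes have arrived, instead of stalling until every element of the channel is seen. Below is a minimal, self-contained sketch of that pattern; the sample names, lane counts and file names are invented purely for illustration and are not part of the pipeline.

// Toy illustration of the groupKey + groupTuple pattern (made-up data)
workflow {
    mapped = Channel.of(
        [ [ sample:'tumorA',  num_lanes:2 ], 'tumorA_L001.bam' ],
        [ [ sample:'tumorA',  num_lanes:2 ], 'tumorA_L002.bam' ],
        [ [ sample:'normalB', num_lanes:1 ], 'normalB_L001.bam' ] )

    mapped
        // Key on the sample-level meta only, and declare how many items to expect,
        // so each group is released as soon as it is complete
        .map{ meta, bam -> [ groupKey(meta.subMap('sample'), meta.num_lanes), bam ] }
        .groupTuple()
        .view() // e.g. [ [sample:tumorA], [tumorA_L001.bam, tumorA_L002.bam] ]
}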
+ // For now, conversion from bam to cram is only done when skipping markduplicates + + if ( + params.skip_tools && + params.skip_tools.split(',').contains('markduplicates') + ) { + if (params.step == 'alignment') { + cram_skip_markduplicates = BAM_TO_CRAM_MAPPING.out.alignment_index + } else { + input_markduplicates_convert = input_sample.branch{ + bam: it[0].data_type == "bam" + cram: it[0].data_type == "cram" + } + // Convert any input BAMs to CRAM + BAM_TO_CRAM(input_markduplicates_convert.bam, fasta, fasta_fai) + versions = versions.mix(BAM_TO_CRAM.out.versions) + + cram_skip_markduplicates = Channel.empty().mix(input_markduplicates_convert.cram, BAM_TO_CRAM.out.alignment_index) + } + CRAM_QC_NO_MD(cram_skip_markduplicates, fasta, intervals_for_preprocessing) + + // Gather QC reports + reports = reports.mix(CRAM_QC_NO_MD.out.reports.collect{ meta, report -> report }) + + // Gather used softwares versions + versions = versions.mix(CRAM_QC_NO_MD.out.versions) + } else { + BAM_MARKDUPLICATES( + cram_for_markduplicates, + fasta, + fasta_fai, + intervals_for_preprocessing) + + cram_markduplicates_no_spark = BAM_MARKDUPLICATES.out.cram + + // Gather QC reports + reports = reports.mix(BAM_MARKDUPLICATES.out.reports.collect{ meta, report -> report }) + + // Gather used softwares versions + versions = versions.mix(BAM_MARKDUPLICATES.out.versions) + } + // ch_md_cram_for_restart contains either: + // - crams from markduplicates + // - crams from sentieon_dedup + // - crams from markduplicates_spark + // - crams from input step markduplicates --> from the converted ones only? + ch_md_cram_for_restart = Channel.empty().mix(cram_markduplicates_no_spark) + // Make sure correct data types are carried through + .map{ meta, cram, crai -> [ meta + [data_type: "cram"], cram, crai ] } + //ch_md_cram_for_restart.view() + // If params.save_output_as_bam, then convert CRAM files to BAM + CRAM_TO_BAM(ch_md_cram_for_restart, fasta, fasta_fai) + versions = versions.mix(CRAM_TO_BAM.out.versions) + + // CSV should be written for the file actually out, either CRAM or BAM + // Create CSV to restart from this step + csv_subfolder = (params.tools && params.tools.split(',').contains('sentieon_dedup')) ? 'sentieon_dedup' : 'markduplicates' + + params.save_output_as_bam ? 
CHANNEL_MARKDUPLICATES_CREATE_CSV(CRAM_TO_BAM.out.alignment_index, csv_subfolder, params.outdir, params.save_output_as_bam) : CHANNEL_MARKDUPLICATES_CREATE_CSV(ch_md_cram_for_restart, csv_subfolder, params.outdir, params.save_output_as_bam) + } + + if (params.step in ['alignment', 'markduplicates', 'prepare_recalibration']) { + // Run if starting from step "prepare_recalibration" + if (params.step == 'prepare_recalibration') { + // Support if starting from BAM or CRAM files + input_prepare_recal_convert = input_sample.branch{ + bam: it[0].data_type == "bam" + cram: it[0].data_type == "cram" + } + // BAM files first must be converted to CRAM files since from this step on we base everything on CRAM format + BAM_TO_CRAM(input_prepare_recal_convert.bam, fasta, fasta_fai) + versions = versions.mix(BAM_TO_CRAM.out.versions) + + ch_cram_from_bam = BAM_TO_CRAM.out.alignment_index + // Make sure correct data types are carried through + .map{ meta, cram, crai -> [ meta + [data_type: "cram"], cram, crai ] } + + ch_cram_for_bam_baserecalibrator = Channel.empty().mix(ch_cram_from_bam, input_prepare_recal_convert.cram) + ch_md_cram_for_restart = ch_cram_from_bam + + } else { + // ch_cram_for_bam_baserecalibrator contains either: + // - crams from markduplicates + // - crams from markduplicates_spark + // - crams converted from bam mapped when skipping markduplicates + // - input cram files, when start from step markduplicates + ch_cram_for_bam_baserecalibrator = Channel.empty().mix(ch_md_cram_for_restart, cram_skip_markduplicates ) + // Make sure correct data types are carried through + .map{ meta, cram, crai -> [ meta + [data_type: "cram"], cram, crai ] } + + } + if (!(params.skip_tools && params.skip_tools.split(',').contains('baserecalibrator'))) { + + ch_table_bqsr_tab = Channel.empty() + + BAM_BASERECALIBRATOR( + ch_cram_for_bam_baserecalibrator, + dict, + fasta, + fasta_fai, + intervals_and_num_intervals, + known_sites_indels, + known_sites_indels_tbi) + + ch_table_bqsr_tab = BAM_BASERECALIBRATOR.out.table_bqsr + + versions = versions.mix(BAM_BASERECALIBRATOR.out.versions) + } + // ch_table_bqsr contains table from baserecalibrator + ch_table_bqsr = Channel.empty().mix(ch_table_bqsr_tab) + + reports = reports.mix(ch_table_bqsr.collect{ meta, table -> table }) + cram_applybqsr = ch_cram_for_bam_baserecalibrator.join(ch_table_bqsr, failOnDuplicate: true, failOnMismatch: true) + + // Create CSV to restart from this step + CHANNEL_BASERECALIBRATOR_CREATE_CSV(ch_md_cram_for_restart.join(ch_table_bqsr, failOnDuplicate: true), params.tools, params.skip_tools, params.save_output_as_bam, params.outdir) + } + + // STEP 4: RECALIBRATING + if (params.step in ['alignment', 'markduplicates', 'prepare_recalibration', 'recalibrate']) { + // Run if starting from step "prepare_recalibration" + if (params.step == 'recalibrate') { + // Support if starting from BAM or CRAM files + input_recal_convert = input_sample.branch{ + bam: it[0].data_type == "bam" + cram: it[0].data_type == "cram" + } + + // If BAM file, split up table and mapped file to convert BAM to CRAM + input_only_table = input_recal_convert.bam.map{ meta, bam, bai, table -> [ meta, table ] } + input_only_bam = input_recal_convert.bam.map{ meta, bam, bai, table -> [ meta, bam, bai ] } + + // BAM files first must be converted to CRAM files since from this step on we base everything on CRAM format + BAM_TO_CRAM(input_only_bam, fasta, fasta_fai) + versions = versions.mix(BAM_TO_CRAM.out.versions) + + cram_applybqsr = Channel.empty().mix( + 
BAM_TO_CRAM.out.alignment_index.join(input_only_table, failOnDuplicate: true, failOnMismatch: true), + input_recal_convert.cram) + // Join together converted cram with input tables + .map{ meta, cram, crai, table -> [ meta + [data_type: "cram"], cram, crai, table ]} + + } + if (!(params.skip_tools && params.skip_tools.split(',').contains('baserecalibrator'))) { + cram_variant_calling_tab = Channel.empty() + + BAM_APPLYBQSR( + cram_applybqsr, + dict, + fasta, + fasta_fai, + intervals_and_num_intervals + ) + + cram_variant_calling_tab = BAM_APPLYBQSR.out.cram + + // Gather used softwares versions + versions = versions.mix(BAM_APPLYBQSR.out.versions) + + cram_variant_calling = Channel.empty().mix(cram_variant_calling_tab) + + CRAM_QC_RECAL( + cram_variant_calling, + fasta, + intervals_for_preprocessing + ) + + // Gather QC + reports = reports.mix(CRAM_QC_RECAL.out.reports.collect{ meta, report -> report }) + + // Gather software versions + versions = versions.mix(CRAM_QC_RECAL.out.versions) + + // If params.save_output_as_bam, then convert CRAM files to BAM + CRAM_TO_BAM_RECAL(cram_variant_calling, fasta, fasta_fai) + versions = versions.mix(CRAM_TO_BAM_RECAL.out.versions) + + // CSV should be written for the file actually out out, either CRAM or BAM + csv_recalibration = Channel.empty() + csv_recalibration = params.save_output_as_bam ? CRAM_TO_BAM_RECAL.out.alignment_index : cram_variant_calling + + // Create CSV to restart from this step + CHANNEL_APPLYBQSR_CREATE_CSV(csv_recalibration) + + } else if (params.step == 'recalibrate') { + // cram_variant_calling contains either: + // - input bams converted to crams, if started from step recal + skip BQSR + // - input crams if started from step recal + skip BQSR + cram_variant_calling = Channel.empty().mix( + BAM_TO_CRAM.out.alignment_index, + input_recal_convert.cram.map{ meta, cram, crai, table -> [ meta, cram, crai ] }) + } else { + // cram_variant_calling contains either: + // - crams from markduplicates = ch_cram_for_bam_baserecalibrator if skip BQSR but not started from step recalibration + cram_variant_calling = Channel.empty().mix(ch_cram_for_bam_baserecalibrator) + } + cram_sv_calling = cram_variant_calling + } + + + + if (params.step in ['alignment', 'markduplicates', 'prepare_recalibration', 'recalibrate', 'sv_calling']) { + //when starting from sv_calling + if (params.step == 'sv_calling') { + input_sv_calling_convert = input_sample.branch{ + bam: it[0].data_type == "bam" + cram: it[0].data_type == "cram" + } + // BAM files first must be converted to CRAM files since from this step on we base everything on CRAM format + BAM_TO_CRAM(input_sv_calling_convert.bam, fasta, fasta_fai) + versions = versions.mix(BAM_TO_CRAM.out.versions) + + cram_sv_calling = Channel.empty().mix(BAM_TO_CRAM.out.alignment_index, input_sv_calling_convert.cram) + + } + + // getting the tumor and normal cram files separated + cram_sv_calling_status = cram_sv_calling.branch{ + normal: it[0].status == 0 + tumor: it[0].status == 1 + } + + // All normal samples + cram_sv_calling_normal_to_cross = cram_sv_calling_status.normal.map{ meta, cram, crai -> [ meta.patient, meta, cram, crai ] } + + // All tumor samples + cram_sv_calling_tumor_to_cross = cram_sv_calling_status.tumor.map{ meta, cram, crai -> [ meta.patient, meta, cram, crai ] } + + // Crossing the normal and tumor samples to create tumor and normal pairs + cram_sv_calling_pair = cram_sv_calling_normal_to_cross.cross(cram_sv_calling_tumor_to_cross) + .map { normal, tumor -> + def meta = [:] + + meta.id = 
"${tumor[1].sample}_vs_${normal[1].sample}".toString() + meta.normal_id = normal[1].sample + meta.patient = normal[0] + meta.sex = normal[1].sex + meta.tumor_id = tumor[1].sample + + [ meta, normal[2], normal[3], tumor[2], tumor[3] ] + } + + //cram_sv_calling_pair.view() + if (params.tools && params.tools.split(',').contains('svaba')) { + BAM_SVCALLING_SVABA(cram_sv_calling_pair, fasta, fasta_fai, bwa, dbsnp, dbsnp_tbi, indel_mask, germ_sv_db, simple_seq_db, error_rate) + + versions = versions.mix(BAM_SVCALLING_SVABA.out.versions) + + vcf_from_sv_calling = Channel.empty() + vcf_from_sv_calling = vcf_from_sv_calling.mix(BAM_SVCALLING_SVABA.out.all_output) //This one contains multiple files of vcf, to get individual files, call individual output + + } + + if (params.tools && params.tools.split(',').contains('gridss')) { + BAM_SVCALLING_GRIDSS(cram_sv_calling_pair, fasta, fasta_fai, bwa, blacklist_gridss) + + versions = versions.mix(BAM_SVCALLING_GRIDSS.out.versions) + + vcf_from_gridss_gridss = Channel.empty() + vcf_from_gridss_gridss = vcf_from_gridss_gridss.mix(BAM_SVCALLING_GRIDSS.out.vcf) // This one contain only one vcf + //vcf_from_gridss_gridss.view() + + + BAM_SVCALLING_GRIDSS_SOMATIC(vcf_from_gridss_gridss, pon_gridss) + versions = versions.mix(BAM_SVCALLING_GRIDSS_SOMATIC.out.versions) + + vcf_from_sv_calling = Channel.empty().mix(BAM_SVCALLING_GRIDSS_SOMATIC.out.all_vcf) + vcf_from_sv_calling.view() + } + + //CHANNEL_SVCALLING_CREATE_CSV(vcf_from_sv_calling, params.tools, params.outdir) // Need to fix this!!!!! + + } + } + + + + + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ COMPLETION EMAIL AND SUMMARY @@ -117,13 +863,42 @@ workflow HEISENBIO { */ workflow.onComplete { - if (params.email || params.email_on_fail) { - NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) - } + if (params.email || params.email_on_fail) NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) NfcoreTemplate.summary(workflow, params, log) - if (params.hook_url) { - NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) + if (params.hook_url) NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ +// Parse first line of a FASTQ file, return the flowcell id and lane number. +def flowcellLaneFromFastq(path) { + // expected format: + // xx:yy:FLOWCELLID:LANE:... (seven fields) + // or + // FLOWCELLID:LANE:xx:... 
(five fields) + def line + path.withInputStream { + InputStream gzipStream = new java.util.zip.GZIPInputStream(it) + Reader decoder = new InputStreamReader(gzipStream, 'ASCII') + BufferedReader buffered = new BufferedReader(decoder) + line = buffered.readLine() + } + assert line.startsWith('@') + line = line.substring(1) + def fields = line.split(':') + String fcid + + if (fields.size() >= 7) { + // CASAVA 1.8+ format, from https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm + // "@::::::: :::" + fcid = fields[2] + } else if (fields.size() == 5) { + fcid = fields[0] } + return fcid } /* @@ -131,3 +906,25 @@ workflow.onComplete { THE END ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ + + + + + + + + + + + + + + + + + + + + + + diff --git a/workflows/test_true.nf b/workflows/test_true.nf new file mode 100644 index 0000000..a5e7e83 --- /dev/null +++ b/workflows/test_true.nf @@ -0,0 +1,528 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + PRINT PARAMS SUMMARY +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' + +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) + +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation +// log.info paramsSummaryLog(workflow) + + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + VALIDATE INPUTS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// Check input path parameters to see if they exist +def checkPathParamList = [ + params.ascat_alleles, + params.ascat_loci, + params.ascat_loci_gc, + params.ascat_loci_rt, + params.bwa, + params.bwamem2, + params.cf_chrom_len, + params.chr_dir, + //params.cnvkit_reference, + params.dbnsfp, + params.dbnsfp_tbi, + params.dbsnp, + params.dbsnp_tbi, + params.dict, + //params.dragmap, + params.fasta, + params.fasta_fai, + params.germline_resource, + params.germline_resource_tbi, + params.input, + params.intervals, + params.known_indels, + params.known_indels_tbi, + params.known_snps, + params.known_snps_tbi, + params.mappability, + params.multiqc_config, + params.pon, + params.pon_tbi, + params.spliceai_indel, + params.spliceai_indel_tbi, + params.spliceai_snv, + params.spliceai_snv_tbi +] + + + +// Validating the input parameters +WorkflowHeisenbio.initialise(params, log) + + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Check mandatory parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ +println "This is the output of build_only_index: $params.build_only_index" +for (param in checkPathParamList) if (param) file(param, checkIfExists: true) + + +// Set input, can either be from --input or from automatic retrieval in WorkflowHeisenbio.groovy +if (params.input) { + ch_from_samplesheet = params.build_only_index ? Channel.empty() : Channel.fromSamplesheet("input") +} else { + ch_from_samplesheet = params.build_only_index ? 
Channel.empty() : Channel.fromSamplesheet("input_restart") +} + +input_sample = ch_from_samplesheet + .map{ meta, fastq_1, fastq_2, table, cram, crai, bam, bai, vcf, variantcaller -> + // generate patient_sample key to group lanes together + [ meta.patient + meta.sample, [meta, fastq_1, fastq_2, table, cram, crai, bam, bai, vcf, variantcaller] ] + } + + .tap{ ch_with_patient_sample } // save the channel + .groupTuple() //group by patient_sample to get all lanes + .map { patient_sample, ch_items -> + // get number of lanes per sample + [ patient_sample, ch_items.size() ] + } + .combine(ch_with_patient_sample, by: 0) // for each entry add numLanes + + .map { patient_sample, num_lanes, ch_items -> + + (meta, fastq_1, fastq_2, table, cram, crai, bam, bai, vcf, variantcaller) = ch_items + + if (meta.lane && fastq_2) { + meta = meta + [id: "${meta.sample}-${meta.lane}".toString()] + def CN = params.seq_center ? "CN:${params.seq_center}\\t" : '' + + def flowcell = flowcellLaneFromFastq(fastq_1) + // Don't use a random element for ID, it breaks resuming + def read_group = "\"@RG\\tID:${flowcell}.${meta.sample}.${meta.lane}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\"" + + meta = meta - meta.subMap('lane') + [num_lanes: num_lanes.toInteger(), read_group: read_group.toString(), data_type: 'fastq', size: 1] + + if (params.step == 'alignment') return [ meta, [ fastq_1, fastq_2 ] ] + else { + error("Samplesheet contains fastq files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // start from BAM if BAM files are provided + } else if (meta.lane && bam) { + if (params.step != 'alignment' && !bai) { + error("BAM index (bai) should be provided.") + } + meta = meta + [id: "${meta.sample}-${meta.lane}".toString()] + def CN = params.seq_center ? "CN:${params.seq_center}\\t" : '' + def read_group = "\"@RG\\tID:${meta.sample}_${meta.lane}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\"" + + meta = meta - meta.subMap('lane') + [num_lanes: num_lanes.toInteger(), read_group: read_group.toString(), data_type: 'bam', size: 1] + + if (params.step != 'annotate') return [ meta - meta.subMap('lane'), bam, bai ] + else { + error("Samplesheet contains bam files but step is `annotate`. The pipeline is expecting vcf files for the annotation. Please check your samplesheet or adjust the step parameter.") + } + + // start from recalibration step + } else if (table && cram) { + meta = meta + [id: meta.sample, data_type: 'cram'] + + if (!(params.step == 'alignment' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), cram, crai, table ] + else { + error("Samplesheet contains cram files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.") + } + + // starting from recalibration step when skipping MarkDuplicates + } else if (table && bam) { + meta = meta + [id: meta.sample, data_type: 'bam'] + + if (!(params.step == 'alignment' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), bam, bai, table ] + else { + error("Samplesheet contains bam files but step is `$params.step`. 
Please check your samplesheet or adjust the step parameter.") + } + + // when starting from prepare_recalibration or variant_calling + } else if (cram) { + meta = meta + [id: meta.sample, data_type: 'cram'] + + if (!(params.step == 'alignment' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), cram, crai ] + else { + error("Samplesheet contains cram files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.") + } + + // when starting from prepare_recalibration when skipping MarkDuplicates or `--step markduplicates` + } else if (bam) { + meta = meta + [id: meta.sample, data_type: 'bam'] + + if (!(params.step == 'alignment' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), bam, bai ] + else { + error("Samplesheet contains bam files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.") + } + + // when starting from annotation step + } else if (vcf) { + meta = meta + [id: meta.sample, data_type: 'vcf', variantcaller: variantcaller ?: ''] + + if (params.step == 'annotate') return [ meta - meta.subMap('lane'), vcf ] + else { + error("Samplesheet contains vcf files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.") + } + } else { + error("Missing or unknown field in csv file header. Please check your samplesheet") + } + } + + +//skipping some lines from sarek that does checks for some downstream steps, not focussing on them now... + + +// Fails when missing resources for baserecalibrator +// Warns when missing resources for haplotypecaller +if (!params.dbsnp && !params.known_indels) { + if (params.step in ['alignment', 'markduplicates', 'prepare_recalibration', 'recalibrate'] && (!params.skip_tools || (params.skip_tools && !params.skip_tools.split(',').contains('baserecalibrator')))) { + error("Base quality score recalibration requires at least one resource file. Please provide at least one of `--dbsnp` or `--known_indels`\nYou can skip this step in the workflow by adding `--skip_tools baserecalibrator` to the command.") + } + if (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper'))) { + log.warn "If GATK's Haplotypecaller or Sentieon's Haplotyper is specified, without `--dbsnp` or `--known_indels no filtering will be done. For filtering, please provide at least one of `--dbsnp` or `--known_indels`.\nFor more information see FilterVariantTranches (single-sample, default): https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches\nFor more information see VariantRecalibration (--joint_germline): https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator\nFor more information on GATK Best practice germline variant calling: https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-" + } +} + +if (params.joint_germline && (!params.tools || !(params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper')))) { + error("The GATK's Haplotypecaller or Sentieon's Haplotyper should be specified as one of the tools when doing joint germline variant calling.) 
") +} + +if (params.joint_germline && (!params.dbsnp || !params.known_indels || !params.known_snps || params.no_intervals)) { + log.warn "If GATK's Haplotypecaller or Sentieon's Haplotyper is specified, without `--dbsnp`, `--known_snps`, `--known_indels` or the associated resource labels (ie `known_snps_vqsr`), no variant recalibration will be done. For recalibration you must provide all of these resources.\nFor more information see VariantRecalibration: https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator \nJoint germline variant calling also requires intervals in order to genotype the samples. As a result, if `--no_intervals` is set to `true` the joint germline variant calling will not be performed." +} + + +//skipping some regarding mutect2 for now and ascat and variant calling and annotate + +if ((params.download_cache) && (params.snpeff_cache || params.vep_cache)) { + error("Please specify either `--download_cache` or `--snpeff_cache`, `--vep_cache`") +} + + + + + + + + + + + + + + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CONFIG FILES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules +// +//include { INPUT_CHECK } from '../subworkflows/local/input_check' + +// Initialize file channels based on params, defined in the params.genomes[params.genome] scope (commenting out some for now for downstream steps) + +ascat_alleles = params.ascat_alleles ? Channel.fromPath(params.ascat_alleles).collect() : Channel.empty() +ascat_loci = params.ascat_loci ? Channel.fromPath(params.ascat_loci).collect() : Channel.empty() +ascat_loci_gc = params.ascat_loci_gc ? Channel.fromPath(params.ascat_loci_gc).collect() : Channel.value([]) +ascat_loci_rt = params.ascat_loci_rt ? Channel.fromPath(params.ascat_loci_rt).collect() : Channel.value([]) +cf_chrom_len = params.cf_chrom_len ? Channel.fromPath(params.cf_chrom_len).collect() : [] +chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : Channel.value([]) +dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([]) +fasta = params.fasta ? Channel.fromPath(params.fasta).first() : Channel.empty() +fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : Channel.empty() +known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([]) +known_snps = params.known_snps ? Channel.fromPath(params.known_snps).collect() : Channel.value([]) + +germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.value([]) // Mutect2 does not require a germline resource, so set to optional input +mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : Channel.value([]) +pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.value([]) // PON is optional for Mutect2 (but highly recommended) +ascat_genome = params.ascat_genome ?: Channel.empty() +dbsnp_vqsr = params.dbsnp_vqsr ? Channel.value(params.dbsnp_vqsr) : Channel.empty() +known_indels_vqsr = params.known_indels_vqsr ? 
Channel.value(params.known_indels_vqsr) : Channel.empty() +known_snps_vqsr = params.known_snps_vqsr ? Channel.value(params.known_snps_vqsr) : Channel.empty() +snpeff_db = params.snpeff_db ?: Channel.empty() +vep_cache_version = params.vep_cache_version ?: Channel.empty() +vep_genome = params.vep_genome ?: Channel.empty() +vep_species = params.vep_species ?: Channel.empty() + + + + + +// Initialize files channels based on params, not defined within the params.genomes[params.genome] scope +if (params.snpeff_cache && params.tools && params.tools.contains("snpeff")) { + def snpeff_annotation_cache_key = params.use_annotation_cache_keys ? "${params.snpeff_genome}.${params.snpeff_db}/" : "" + def snpeff_cache_dir = "${snpeff_annotation_cache_key}${params.snpeff_genome}.${params.snpeff_db}" + def snpeff_cache_path_full = file("$params.snpeff_cache/$snpeff_cache_dir", type: 'dir') + if ( !snpeff_cache_path_full.exists() || !snpeff_cache_path_full.isDirectory() ) { + error("Files within --snpeff_cache invalid. Make sure there is a directory named ${snpeff_cache_dir} in ${params.snpeff_cache}.\n") + } + snpeff_cache = Channel.fromPath(file("${params.snpeff_cache}/${snpeff_annotation_cache_key}"), checkIfExists: true).collect() + .map{ cache -> [ [ id:"${params.snpeff_genome}.${params.snpeff_db}" ], cache ] } +} else snpeff_cache = [] + +if (params.vep_cache && params.tools && params.tools.contains("vep")) { + def vep_annotation_cache_key = params.use_annotation_cache_keys ? "${params.vep_cache_version}_${params.vep_genome}/" : "" + def vep_cache_dir = "${vep_annotation_cache_key}${params.vep_species}/${params.vep_cache_version}_${params.vep_genome}" + def vep_cache_path_full = file("$params.vep_cache/$vep_cache_dir", type: 'dir') + if ( !vep_cache_path_full.exists() || !vep_cache_path_full.isDirectory() ) { + error("Files within --vep_cache invalid. 
Make sure there is a directory named ${vep_cache_dir} in ${params.vep_cache}.\n") + } + vep_cache = Channel.fromPath(file("${params.vep_cache}/${vep_annotation_cache_key}"), checkIfExists: true).collect() +} else vep_cache = [] + +vep_extra_files = [] + +//if (params.dbnsfp && params.dbnsfp_tbi) { +// vep_extra_files.add(file(params.dbnsfp, checkIfExists: true)) +// vep_extra_files.add(file(params.dbnsfp_tbi, checkIfExists: true)) +//} + +//if (params.spliceai_snv && params.spliceai_snv_tbi && params.spliceai_indel && params.spliceai_indel_tbi) { +// vep_extra_files.add(file(params.spliceai_indel, checkIfExists: true)) +// vep_extra_files.add(file(params.spliceai_indel_tbi, checkIfExists: true)) +// vep_extra_files.add(file(params.spliceai_snv, checkIfExists: true)) +// vep_extra_files.add(file(params.spliceai_snv_tbi, checkIfExists: true)) +//} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL/NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// Create samplesheets to restart from different steps +include { CHANNEL_ALIGN_CREATE_CSV } from '../subworkflows/local/channel_align_create_csv/main' +include { CHANNEL_MARKDUPLICATES_CREATE_CSV } from '../subworkflows/local/channel_markduplicates_create_csv/main' +include { CHANNEL_BASERECALIBRATOR_CREATE_CSV } from '../subworkflows/local/channel_baserecalibrator_create_csv/main' +include { CHANNEL_APPLYBQSR_CREATE_CSV } from '../subworkflows/local/channel_applybqsr_create_csv/main' +//include { CHANNEL_VARIANT_CALLING_CREATE_CSV } from '../subworkflows/local/channel_variant_calling_create_csv/main' + + +// Download annotation cache if needed +include { PREPARE_CACHE } from '../subworkflows/local/prepare_cache/main' + + +// Build indices if needed +include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome/main' + + +// Build intervals if needed +include { PREPARE_INTERVALS } from '../subworkflows/local/prepare_intervals/main' + + +// Convert BAM files to FASTQ files +include { BAM_CONVERT_SAMTOOLS as CONVERT_FASTQ_INPUT } from '../subworkflows/local/bam_convert_samtools/main' +//include { BAM_CONVERT_SAMTOOLS as CONVERT_FASTQ_UMI } from '../subworkflows/local/bam_convert_samtools/main' + + +// Map input reads to reference genome +include { FASTQ_ALIGN_BWAMEM_MEM2 } from '../subworkflows/local/fastq_align_bwamem_mem2/main' + + +// Merge and index BAM files (optional) +include { BAM_MERGE_INDEX_SAMTOOLS } from '../subworkflows/local/bam_merge_index_samtools/main' + + +// Convert BAM files +include { SAMTOOLS_CONVERT as BAM_TO_CRAM } from '../modules/nf-core/samtools/convert/main' +include { SAMTOOLS_CONVERT as BAM_TO_CRAM_MAPPING } from '../modules/nf-core/samtools/convert/main' + +// Convert CRAM files (optional) +include { SAMTOOLS_CONVERT as CRAM_TO_BAM } from '../modules/nf-core/samtools/convert/main' +include { SAMTOOLS_CONVERT as CRAM_TO_BAM_RECAL } from '../modules/nf-core/samtools/convert/main' + + +// Mark Duplicates (+QC) +include { BAM_MARKDUPLICATES } from '../subworkflows/local/bam_markduplicates/main' +//include { BAM_SENTIEON_DEDUP } from '../subworkflows/local/bam_sentieon_dedup/main' + + +// QC on CRAM +include { CRAM_QC_MOSDEPTH_SAMTOOLS as CRAM_QC_NO_MD } from '../subworkflows/local/cram_qc_mosdepth_samtools/main' +include { CRAM_QC_MOSDEPTH_SAMTOOLS as CRAM_QC_RECAL } from '../subworkflows/local/cram_qc_mosdepth_samtools/main' + + +// Create recalibration tables +include { 
BAM_BASERECALIBRATOR } from '../subworkflows/local/bam_baserecalibrator/main'
+
+
+// Create recalibrated cram files to use for variant calling (+QC)
+include { BAM_APPLYBQSR } from '../subworkflows/local/bam_applybqsr/main'
+
+
+//
+// MODULE: Installed directly from nf-core/modules
+//
+
+// Loading the FASTQC module
+include { FASTQC } from '../modules/nf-core/fastqc/main'
+
+
+// TRIM/SPLIT FASTQ Files
+include { FASTP } from '../modules/nf-core/fastp/main'
+
+
+// Loading the MULTIQC module
+include { MULTIQC } from '../modules/nf-core/multiqc/main'
+
+
+// Loading the module that dumps the versions of software being used
+include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
+
+
+// SKIPPING SOME FROM SAREK FOR NOW.....
+
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    RUN MAIN WORKFLOW
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+workflow HEISENBIO {
+
+    ch_multiqc_config                     = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true)
+    ch_multiqc_custom_config              = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty()
+    ch_multiqc_logo                       = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty()
+    ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true)
+
+    // To gather all QC reports for MultiQC
+    reports  = Channel.empty()
+    // To gather used software versions for MultiQC
+    versions = Channel.empty()
+
+    if (params.download_cache) {
+        PREPARE_CACHE(ensemblvep_info, snpeff_info)
+        snpeff_cache = PREPARE_CACHE.out.snpeff_cache
+        vep_cache    = PREPARE_CACHE.out.ensemblvep_cache.map{ meta, cache -> [ cache ] }
+
+        versions = versions.mix(PREPARE_CACHE.out.versions)
+    }
+
+    // Build indices if needed
+    PREPARE_GENOME(
+        ascat_alleles,
+        ascat_loci,
+        ascat_loci_gc,
+        ascat_loci_rt,
+        chr_dir,
+        dbsnp,
+        fasta,
+        fasta_fai,
+        germline_resource,
+        known_indels,
+        known_snps,
+        pon)
+
+    // Gather built indices or get them from the params: if an index is supplied it is used as-is, otherwise it is built by PREPARE_GENOME from the fasta file
+    dict       = params.dict      ? Channel.fromPath(params.dict).map{ it -> [ [id:'dict'], it ] }.collect()
+                                  : PREPARE_GENOME.out.dict
+    fasta_fai  = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect()
+                                  : PREPARE_GENOME.out.fasta_fai
+    bwa        = params.bwa       ? Channel.fromPath(params.bwa).collect()
+                                  : PREPARE_GENOME.out.bwa
+    bwamem2    = params.bwamem2   ? Channel.fromPath(params.bwamem2).collect()
+                                  : PREPARE_GENOME.out.bwamem2
+    // Gather index for mapping given the chosen aligner
+    index_alignement = (params.aligner == "bwa-mem" || params.aligner == "sentieon-bwamem") ? bwa :
+        params.aligner == "bwa-mem2" ? bwamem2 : null
+}
+    //Skipping some VCFs, ASCAT, and msisensorpro variables loading for now....
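Throughout the block above, reference inputs supplied via params are wrapped as Channel.fromPath(...).collect(). A short, self-contained sketch of why: .collect() turns the single path into a value channel that every task instance can read, whereas a plain queue channel would be consumed by the first task only. The file names and the params key (params.example_reference) below are hypothetical and exist only for this illustration.

// Sketch: a reference read by many tasks must be a value channel (made-up files/params)
params.example_reference = 'reference.fasta' // hypothetical parameter for this sketch

process COUNT_LINES {
    input:
    path sample
    path reference

    output:
    stdout

    script:
    """
    wc -l $sample $reference
    """
}

workflow {
    samples   = Channel.fromPath( [ 's1.txt', 's2.txt', 's3.txt' ] )    // queue channel: three tasks
    reference = Channel.fromPath( params.example_reference ).collect()  // value channel: reused by all three tasks
    COUNT_LINES(samples, reference).view()
}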
+    //versions = versions.mix(PREPARE_GENOME.out.versions)
+    // ALIGNMENT STARTING
+
+    // MODULE: MultiQC
+    //if (!(params.skip_tools && params.skip_tools.split(',').contains('multiqc'))) {
+    //    workflow_summary    = WorkflowHeisenbio.paramsSummaryMultiqc(workflow, summary_params)
+    //    ch_workflow_summary = Channel.value(workflow_summary)
+    //
+    //    methods_description    = WorkflowHeisenbio.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params)
+    //    ch_methods_description = Channel.value(methods_description)
+    //
+    //    ch_multiqc_files = Channel.empty()
+    //    multiqc_files = multiqc_files.mix(version_yaml)
+    //    multiqc_files = multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
+    //    multiqc_files = multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
+    //    multiqc_files = multiqc_files.mix(reports.collect().ifEmpty([]))
+    //
+    //    MULTIQC(multiqc_files.collect(), ch_multiqc_config.collect().ifEmpty([]), ch_multiqc_custom_config.collect().ifEmpty([]), ch_multiqc_logo.collect().ifEmpty([]))
+    //    multiqc_report = MULTIQC.out.report.toList()
+    //    versions = versions.mix(MULTIQC.out.versions)
+    //}
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    COMPLETION EMAIL AND SUMMARY
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+workflow.onComplete {
+    if (params.email || params.email_on_fail) {
+        NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report)
+    }
+    NfcoreTemplate.summary(workflow, params, log)
+    if (params.hook_url) {
+        NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log)
+    }
+}
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    FUNCTIONS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+// Parse first line of a FASTQ file, return the flowcell id and lane number.
+def flowcellLaneFromFastq(path) {
+    // expected format:
+    // xx:yy:FLOWCELLID:LANE:... (seven fields)
+    // or
+    // FLOWCELLID:LANE:xx:... (five fields)
+    def line
+    path.withInputStream {
+        InputStream gzipStream = new java.util.zip.GZIPInputStream(it)
+        Reader decoder = new InputStreamReader(gzipStream, 'ASCII')
+        BufferedReader buffered = new BufferedReader(decoder)
+        line = buffered.readLine()
+    }
+    assert line.startsWith('@')
+    line = line.substring(1)
+    def fields = line.split(':')
+    String fcid
+
+    if (fields.size() >= 7) {
+        // CASAVA 1.8+ format, from https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
+        // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>"
+        fcid = fields[2]
+    } else if (fields.size() == 5) {
+        fcid = fields[0]
+    }
+    return fcid
+}
+
+
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    THE END
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
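For reference, this is what flowcellLaneFromFastq extracts from a typical read header. The header string below is made up for illustration only; the field handling mirrors the function above (strip the leading '@', split on ':', CASAVA 1.8+ headers have at least seven fields and carry the flowcell ID in the third one).

// Worked example with a hypothetical CASAVA 1.8+ header (not a real run)
def line = '@A01234:56:HVWXYZDRXX:1:2101:1234:5678 1:N:0:ACGTACGT'
assert line.startsWith('@')
def fields = line.substring(1).split(':')
assert fields.size() >= 7          // ten fields here, so the CASAVA 1.8+ branch applies
assert fields[2] == 'HVWXYZDRXX'   // the flowcell ID used to build the read group
// An older five-field header would start with the flowcell ID instead:
assert 'HVWXYZDRXX:1:2101:1234:5678'.split(':')[0] == 'HVWXYZDRXX'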