diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 7a38bd0..dccc7cf 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -2,53 +2,53 @@ name: Bug report description: Report something that is broken or incorrect labels: bug body: -- type: textarea - id: description - attributes: - label: Description of the bug - description: A clear and concise description of what the bug is. - validations: - required: true -- type: textarea - id: command_used - attributes: - label: Command used and terminal output - description: Steps to reproduce the behaviour. Please paste the command you used - to launch the pipeline and the output from your terminal. - render: console - placeholder: '$ nextflow run ... - - - Some output where something broke - - ' -- type: textarea - id: files - attributes: - label: Relevant files - description: 'Please drag and drop the relevant files here. Create a `.zip` archive - if the extension is not allowed. - - Your verbose log file `.nextflow.log` is often useful _(this is a hidden file - in the directory where you launched the pipeline)_ as well as custom Nextflow - configuration files. - - ' -- type: textarea - id: system - attributes: - label: System information - description: '* Nextflow version _(eg. 23.04.0)_ - - * Hardware _(eg. HPC, Desktop, Cloud)_ - - * Executor _(eg. slurm, local, awsbatch)_ - - * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, - or Apptainer)_ - - * OS _(eg. CentOS Linux, macOS, Linux Mint)_ - - * Version of mskcc/sif _(eg. 1.1, 1.5, 1.8.2)_ - - ' + - type: textarea + id: description + attributes: + label: Description of the bug + description: A clear and concise description of what the bug is. + validations: + required: true + - type: textarea + id: command_used + attributes: + label: Command used and terminal output + description: Steps to reproduce the behaviour. Please paste the command you used + to launch the pipeline and the output from your terminal. + render: console + placeholder: "$ nextflow run ... + + + Some output where something broke + + " + - type: textarea + id: files + attributes: + label: Relevant files + description: "Please drag and drop the relevant files here. Create a `.zip` archive + if the extension is not allowed. + + Your verbose log file `.nextflow.log` is often useful _(this is a hidden file + in the directory where you launched the pipeline)_ as well as custom Nextflow + configuration files. + + " + - type: textarea + id: system + attributes: + label: System information + description: "* Nextflow version _(eg. 23.04.0)_ + + * Hardware _(eg. HPC, Desktop, Cloud)_ + + * Executor _(eg. slurm, local, awsbatch)_ + + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, + or Apptainer)_ + + * OS _(eg. CentOS Linux, macOS, Linux Mint)_ + + * Version of mskcc/sif _(eg. 
1.1, 1.5, 1.8.2)_ + + " diff --git a/.nf-core.yml b/.nf-core.yml index d35f6c0..4d32798 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,23 +1,23 @@ lint: files_exist: - - CODE_OF_CONDUCT.md - - assets/nf-core-sif_logo_light.png - - docs/images/nf-core-sif_logo_light.png - - docs/images/nf-core-sif_logo_dark.png - - .github/ISSUE_TEMPLATE/config.yml - - .github/workflows/awstest.yml - - .github/workflows/awsfulltest.yml + - CODE_OF_CONDUCT.md + - assets/nf-core-sif_logo_light.png + - docs/images/nf-core-sif_logo_light.png + - docs/images/nf-core-sif_logo_dark.png + - .github/ISSUE_TEMPLATE/config.yml + - .github/workflows/awstest.yml + - .github/workflows/awsfulltest.yml files_unchanged: - - CODE_OF_CONDUCT.md - - assets/nf-core-sif_logo_light.png - - docs/images/nf-core-sif_logo_light.png - - docs/images/nf-core-sif_logo_dark.png - - .github/ISSUE_TEMPLATE/bug_report.yml + - CODE_OF_CONDUCT.md + - assets/nf-core-sif_logo_light.png + - docs/images/nf-core-sif_logo_light.png + - docs/images/nf-core-sif_logo_dark.png + - .github/ISSUE_TEMPLATE/bug_report.yml multiqc_config: - - report_comment + - report_comment nextflow_config: - - manifest.name - - manifest.homePage + - manifest.name + - manifest.homePage repository_type: pipeline template: prefix: mskcc diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 658754f..bf90038 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,8 +1,8 @@ report_comment: > - + This report has been generated by the mskcc/sif analysis pipeline. - + report_section_order: "mskcc-sif-methods-description": order: -1000 diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 4a758fe..8e44707 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -24,39 +24,32 @@ class RowChecker: """ - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) + VALID_FORMATS = ".bam" def __init__( self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", + pairId="pairId", + tumorBam="tumorBam", + normalBam="normalBam", + assay="assay", + normalType="normalType", + bedFile="bedFile", **kwargs, ): """ Initialize the row checker with the expected column names. Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). + """ super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col + self._pairId = pairId + self._tumorBam = tumorBam + self._normalBam = normalBam + self._assay = assay + self._normalType = normalType + self._bedFile = bedFile self._seen = set() self.modified = [] @@ -69,65 +62,53 @@ def validate_and_transform(self, row): (values). 
""" - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) + self._validate_names(row) + self._validate_bams(row) + self._validate_normalType(row) + self._validate_bed_format(row) + self._seen.add((row[self._pairId])) self.modified.append(row) - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. - row[self._sample_col] = row[self._sample_col].replace(" ", "_") + def _validate_names(self, row): + """Assert that the sample names exist""" + if len(row[self._pairId]) <= 0: + raise AssertionError("pairId is required.") + + def _validate_pairId_format(self, row): + id_value = row[self._pairId] + if "." in id_value: + raise AssertionError("pairId:{} cannot contain any periods ('.') ".format(id_value)) - def _validate_first(self, row): + def _validate_bams(self, row): """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): + if len(row[self._tumorBam]) <= 0 or len(row[self._normalBam]) <= 0: + raise AssertionError("Both bam files are required.") + self._validate_bam_format(row[self._tumorBam]) + self._validate_bam_format(row[self._normalBam]) + + def _validate_normalType(self, row): + """Assert that bait set exists.""" + if len(row[self._normalType]) <= 0: + raise AssertionError("normalType is required.") + + def _validate_bam_format(self, filename): """Assert that a given filename has one of the expected FASTQ extensions.""" if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" + f"The BAM file has an unrecognized extension: {filename}\n" f"It should be one of: {', '.join(self.VALID_FORMATS)}" ) - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. 
- - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" + def _validate_bed_format(self, row): + """Assert that a given filename has one of the expected BED extensions.""" + filename = row[self._bedFile] + if filename and filename != "NONE": + if not filename.endswith(".bed"): + raise AssertionError( + f"The BED file has an unrecognized extension: {filename}\n" + f"It should be .bed\n" + f"If you would like one generated for you leave it bank or enter 'NONE'\n" + ) def read_head(handle, num_lines=10): @@ -164,10 +145,9 @@ def sniff_format(handle): def check_samplesheet(file_in, file_out): """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. + Check that the tabular samplesheet has the structure expected by the ODIN pipeline. - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. + Validate the general shape of the table, expected columns, and each row. Args: file_in (pathlib.Path): The given tabular samplesheet. The format can be either @@ -179,19 +159,14 @@ def check_samplesheet(file_in, file_out): This function checks that the samplesheet follows the following structure, see also the `viral recon samplesheet`_:: - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv + pairId,tumorBam,normalBam,assay,normalType,bedFile + SAMPLE_TUMOR.SAMPLE_NORMAL,BAM_TUMOR,BAM_NORMAL,BAITS,NORMAL_TYPE,BED_FILE """ - required_columns = {"sample", "fastq_1", "fastq_2"} + required_columns = {"pairId", "tumorBam", "normalBam", "assay", "normalType", "bedFile"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle), delimiter=",") # Validate the existence of the expected header columns. if not required_columns.issubset(reader.fieldnames): req_cols = ", ".join(required_columns) @@ -205,9 +180,7 @@ def check_samplesheet(file_in, file_out): except AssertionError as error: logger.critical(f"{str(error)} On line {i + 2}.") sys.exit(1) - checker.validate_unique_samples() header = list(reader.fieldnames) - header.insert(1, "single_end") # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. with file_out.open(mode="w", newline="") as out_handle: writer = csv.DictWriter(out_handle, header, delimiter=",") diff --git a/bin/concat_with_comments.sh b/bin/concat_with_comments.sh new file mode 100755 index 0000000..74ce43f --- /dev/null +++ b/bin/concat_with_comments.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eux + +# This script will concatenate table files, preserving the unique comment lines from all input files and adding a new comment line +# +# USAGE: +# $ concat_with_comments.sh comment_label comment_value output.txt input1.txt input2.txt ... 
inputn.txt +# +# EXAMPLE: +# $ bin/concat_with_comments.sh helix_filters_01 concat-with-comments-0-ga478e4e output.txt ../test_data/maf/*.muts.maf +# +# EXTENDED EXAMPLE: +# $ cat input1.txt +# # comment 1 +# HEADER +# foo1 +# bar1 +# +# $ cat input2.txt +# # comment 2 +# HEADER +# foo2 +# bar2 +# +# $ bin/concat_with_comments.sh label value output.txt input1.txt input2.txt +# +# $ cat output.txt +# # comment 1 +# # comment 2 +# #label: value +# HEADER +# foo1 +# bar1 +# foo2 +# bar2 + +comment_key="${1}" +comment_value="${2}" +output_file="${3}" +shift +shift +shift + +# all the remaining args should be filenames +input_files=( "$@" ) +# echo ${input_files[@]} + +# get the unique header lines from all files +# NOTE: grep exits with code 1 if no comments are found +if grep -q '#' ${input_files[@]}; then + printf "%s\n" "$(grep --no-filename '#' ${input_files[@]} | sort -u)" > "$output_file" +fi + +# make new comment line +new_comment_line="#${comment_key}: ${comment_value}" +echo "${new_comment_line}" >> "$output_file" + +# turn off set -e because it seems to have issues when the input files lack comments +set +e + +# add the header line from the first file +grep -v '#' ${input_files[0]} | head -1 >> "$output_file" + +# get all the non-comment, non-header lines from all files +for i in ${input_files[@]}; do + grep -v '#' "$i" | tail -n +2 >> "$output_file" +done diff --git a/bin/format_maf.sh b/bin/format_maf.sh new file mode 100755 index 0000000..692ae6f --- /dev/null +++ b/bin/format_maf.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +#USAGE: format_maf.sh [prefix] [input_maf] + +## Remove comments + +grep \ + '^[^#;]' \ + $2 \ + > \ + $1.grepped.txt + +## Extract columns + +awk \ + -F "\t" \ + 'NR==1 { for(i=1;i<=NF;i++) \ + { \ + f[$i]=i \ + } \ + print "Hugo_Symbol\tEntrez_Gene_Id\tCenter\tTumor_Sample_Barcode\tFusion\tMethod\tFrame" \ + } \ + NR>1 \ + { \ + print $(f["Hugo_Symbol"])"\t"$(f["Entrez_Gene_Id"])"\t"$(f["Center"])"\t"$(f["Tumor_Sample_Barcode"])"\t"$(f["Fusion"])"\t"$(f["Method"])"\t"$(f["Frame"]) \ + }' \ + $1.grepped.txt \ + > \ + $1.extracted.txt + +## Add two columns - RNA_support and no, DNA_support and yes + +sed \ + '1s/$/\tDNA_support\tRNA_support/;2,$s/$/\tyes\tno/' \ + $1.extracted.txt \ + > \ + $1.columns_added.txt + +## Portal format output +awk \ + -F "\t" \ + 'NR==1 \ + { \ + for(i=1;i<=NF;i++) \ + { \ + f[$i]=i \ + } \ + } \ + { \ + print $(f["Hugo_Symbol"])"\t"$(f["Entrez_Gene_Id"])"\t"$(f["Center"])"\t"$(f["Tumor_Sample_Barcode"])"\t"$(f["Fusion"])"\t"$(f["DNA_support"])"\t"$(f["RNA_support"])"\t"$(f["Method"])"\t"$(f["Frame"]) \ + }' \ + $1.columns_added.txt \ + > \ + $1.portal.txt diff --git a/conf/juno_resources.config b/conf/juno_resources.config new file mode 100644 index 0000000..f908e1f --- /dev/null +++ b/conf/juno_resources.config @@ -0,0 +1,178 @@ +genome_resources { + genomes { + 'GRCh37' { + fasta = '/juno/work/ci/resources/genomes/GRCh37/fasta/b37.fasta' + dbsnp = '/juno/work/ci/resources/genomes/GRCh37/dbsnp/129/dbsnp_138.b37.excluding_sites_after_129.vcf' + facets_snps = '/juno/work/ci/resources/genomes/GRCh37/facets_snps/dbsnp_137.b37__RmDupsClean__plusPseudo50__DROP_SORT.vcf' + delly = '/juno/work/ci/resources/genomes/GRCh37/delly/human.hg19.excl.tsv' + cosmic = '/juno/work/ci/resources/genomes/GRCh37/cosmic/67/CosmicCodingMuts_v67_b37_20131024__NDS.vcf' + intervals = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'] + } + } + + resources { + hotspot = 
'/juno/work/ci/resources/qc_resources/hotspot-list-union-v1-v2.vcf' + exac_filter = '/juno/work/ci/resources/vep/cache/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz' + } + + assay_coverage = [ + 'IMPACT341' : 896637, + 'IMPACT410' : 1016335, + 'IMPACT468' : 1139294, + 'IMPACT505' : 1213770, + 'HemePACT_v4' : 1412046, + 'IMPACT_Heme' : 1314920 + ] + + curated_bams = [ + 'IMPACT468' : '/juno/work/ci/resources/curated_bams/IMPACT468_b37', + 'AgilentExon_51MB' : '/juno/work/ci/resources/curated_bams/AgilentExon_51MB_b37_v3', + 'IDT_Exome' : '/juno/work/ci/resources/curated_bams/IDT_Exome_v1_FP_b37', + 'IMPACT_Heme' : '/juno/work/ci/resources/curated_bams/IMPACT-Heme_v2_BAITS' + ] + + targets { + 'AgilentExon_51MB' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_51MB_b37_v3/b37/AgilentExon_51MB_b37_v3_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_51MB_b37_v3/b37/AgilentExon_51MB_b37_v3_FP_tiling_intervals.intervals' + baits_bed = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_51MB_b37_v3/b37/AgilentExon_51MB_b37_v3_baits.bed' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_51MB_b37_v3/b37/AgilentExon_51MB_b37_v3_baits.intervals' + targets_bed = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_51MB_b37_v3/b37/AgilentExon_51MB_b37_v3_targets.bed' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_51MB_b37_v3/b37/AgilentExon_51MB_b37_v3_targets.intervals' + } + + 'IDT_Exome' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/IDT_Exome_v1_FP/b37/FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/IDT_Exome_v1_FP/b37/FP_tiling_intervals.intervals' + baits_bed = '/juno/work/ci/resources/genomic_resources/targets/IDT_Exome_v1_FP/b37/IDT_Exome_v1_FP_b37_baits.bed' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/IDT_Exome_v1_FP/b37/IDT_Exome_v1_FP_b37_baits.ilist' + targets_bed = '/juno/work/ci/resources/genomic_resources/targets/IDT_Exome_v1_FP/b37/IDT_Exome_v1_FP_b37_targets.bed' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/IDT_Exome_v1_FP/b37/IDT_Exome_v1_FP_b37_targets.ilist' + } + + 'E90_NimbleGeneV3_WES' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/E90_NimbleGeneV3_WES/b37/E90_NimbleGeneV3_WES_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/E90_NimbleGeneV3_WES/b37/E90_NimbleGeneV3_WES_FP_tiling_intervals.intervals' + baits_bed = '/juno/work/ci/resources/genomic_resources/targets/E90_NimbleGeneV3_WES/b37/E90_NimbleGeneV3_WES_b37_baits.bed' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/E90_NimbleGeneV3_WES/b37/E90_NimbleGeneV3_WES_b37_baits.ilist' + targets_bed = '/juno/work/ci/resources/genomic_resources/targets/E90_NimbleGeneV3_WES/b37/E90_NimbleGeneV3_WES_b37_targets.bed' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/E90_NimbleGeneV3_WES/b37/E90_NimbleGeneV3_WES_b37_targets.ilist' + } + + 'IMPACT341' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/IMPACT341/b37/FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/IMPACT341/b37/FP_tiling_intervals.list' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT341/b37/picard_baits.interval_list' + targets_list = 
'/juno/work/ci/resources/genomic_resources/targets/IMPACT341/b37/picard_targets.interval_list' + } + + 'IMPACT410' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/IMPACT410/b37/FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/IMPACT410/b37/FP_tiling_intervals.list' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT410/b37/picard_baits.interval_list' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT410/b37/picard_targets.interval_list' + } + + 'IMPACT468' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468/b37/FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468/b37/FP_tiling_intervals.list' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468/b37/picard_baits.interval_list' + baits_bed = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468/b37/IMPACT468_b37_baits.bed' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468/b37/picard_targets.interval_list' + } + + 'IMPACT468_mm10' { + FP_genotypes = '/juno/work/ci/resources/genomes/GRCh37_mm10/targets/FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomes/GRCh37_mm10/targets/FP_tiling_intervals.list' + baits_list = '/juno/work/ci/resources/genomes/GRCh37_mm10/targets/picard_baits.interval_list' + targets_list = '/juno/work/ci/resources/genomes/GRCh37_mm10/targets/picard_targets.interval_list' + } + + 'IMPACT468_08390' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468_08390/b37/IMPACT468_08390_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468_08390/b37/IMPACT468_08390_FP_tiling_intervals.intervals' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468_08390/b37/IMPACT468_08390_b37_baits.ilist' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468_08390/b37/IMPACT468_08390_b37_targets.ilist' + } + + 'IMPACT468_08050' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468_08050/b37/IMPACT468_08050_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468_08050/b37/IMPACT468_08050_FP_tiling_intervals.intervals' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468_08050/b37/IMPACT468_08050_b37_baits.ilist' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT468_08050/b37/IMPACT468_08050_b37_targets.ilist' + } + + 'IMPACT505' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/IMPACT505/b37/IMPACT505_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/IMPACT505/b37/IMPACT505_FP_tiling_intervals.intervals' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT505/b37/IMPACT505_b37_baits.ilist' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT505/b37/IMPACT505_b37_targets.ilist' + } + + 'Agilent_v4_51MB_Human' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/Agilent_v4_51MB_Human/b37/Agilent_v4_51MB_Human_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/Agilent_v4_51MB_Human/b37/Agilent_v4_51MB_Human_FP_tiling_intervals.intervals' + baits_bed = '/juno/work/ci/resources/genomic_resources/targets/Agilent_v4_51MB_Human/b37/Agilent_v4_51MB_Human_b37_baits.bed' + baits_list = 
'/juno/work/ci/resources/genomic_resources/targets/Agilent_v4_51MB_Human/b37/Agilent_v4_51MB_Human_b37_baits.ilist' + targets_bed = '/juno/work/ci/resources/genomic_resources/targets/Agilent_v4_51MB_Human/b37/Agilent_v4_51MB_Human_b37_targets.bed' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/Agilent_v4_51MB_Human/b37/Agilent_v4_51MB_Human_b37_targets.ilist' + } + + 'AgilentExon_v2' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v2/b37/AgilentExon_v2_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v2/b37/AgilentExon_v2_FP_tiling_intervals.intervals' + baits_bed = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v2/b37/AgilentExon_v2_b37_baits.bed' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v2/b37/AgilentExon_v2_b37_baits.ilist' + targets_bed = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v2/b37/AgilentExon_v2_b37_targets.bed' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v2/b37/AgilentExon_v2_b37_targets.ilist' + } + + 'AgilentExon_v5' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v5/b37/AgilentExon_v5_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v5/b37/AgilentExon_v5_FP_tiling_intervals.intervals' + baits_bed = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v5/b37/AgilentExon_v5_b37_baits.bed' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v5/b37/AgilentExon_v5_b37_baits.ilist' + targets_bed = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v5/b37/AgilentExon_v5_b37_targets.bed' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/AgilentExon_v5/b37/AgilentExon_v5_b37_targets.ilist' + } + + 'IlluminaExome_38MB' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/IlluminaExome_38MB/b37/IlluminaExome_38MB_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/IlluminaExome_38MB/b37/IlluminaExome_38MB_FP_tiling_intervals.intervals' + baits_bed = '/juno/work/ci/resources/genomic_resources/targets/IlluminaExome_38MB/b37/IlluminaExome_38MB_b37_baits.bed' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/IlluminaExome_38MB/b37/IlluminaExome_38MB_b37_baits.ilist' + targets_bed = '/juno/work/ci/resources/genomic_resources/targets/IlluminaExome_38MB/b37/IlluminaExome_38MB_b37_targets.bed' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/IlluminaExome_38MB/b37/IlluminaExome_38MB_b37_targets.ilist' + } + + 'SeqCap_EZ_Exome' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/SeqCap_EZ_Exome_v3/b37/SeqCap_EZ_Exome_v3_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/SeqCap_EZ_Exome_v3/b37/SeqCap_EZ_Exome_v3_FP_tiling_intervals.intervals' + baits_bed = '/juno/work/ci/resources/genomic_resources/targets/SeqCap_EZ_Exome_v3/b37/SeqCap_EZ_Exome_v3_b37_baits.bed' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/SeqCap_EZ_Exome_v3/b37/SeqCap_EZ_Exome_v3_b37_baits.ilist' + targets_bed = '/juno/work/ci/resources/genomic_resources/targets/SeqCap_EZ_Exome_v3/b37/SeqCap_EZ_Exome_v3_b37_targets.bed' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/SeqCap_EZ_Exome_v3/b37/SeqCap_EZ_Exome_v3_b37_targets.ilist' + } + + 'HemePACT_v3' { + FP_genotypes = 
'/juno/work/ci/resources/genomic_resources/targets/HemePACT_v3/b37/HemePACT_v3_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/HemePACT_v3/b37/HemePACT_v3_FP_tiling_intervals.intervals' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/HemePACT_v3/b37/HemePACT_v3_b37_baits.ilist' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/HemePACT_v3/b37/HemePACT_v3_b37_targets.ilist' + } + + 'HemePACT_v4' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/HemePACT_v4/b37/HemePACT_v4_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/HemePACT_v4/b37/HemePACT_v4_FP_tiling_intervals.intervals' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/HemePACT_v4/b37/HemePACT_v4_b37_baits.ilist' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/HemePACT_v4/b37/HemePACT_v4_b37_targets.ilist' + } + + 'IMPACT-Heme' { + FP_genotypes = '/juno/work/ci/resources/genomic_resources/targets/IMPACT-Heme/IMPACT-Heme_v4/b37/IMPACT-Heme_v4_FP_tiling_genotypes.txt' + FP_intervals = '/juno/work/ci/resources/genomic_resources/targets/IMPACT-Heme/IMPACT-Heme_v4/b37/IMPACT-Heme_v4_FP_tiling_intervals.intervals' + baits_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT-Heme/IMPACT-Heme_v4/b37/IMPACT-Heme_v4_baits.ilist' + targets_list = '/juno/work/ci/resources/genomic_resources/targets/IMPACT-Heme/IMPACT-Heme_v4/b37/IMPACT-Heme_v4_targets.ilist' + } + } +} diff --git a/conf/modules.config b/conf/modules.config index d91c6ab..bf2f1d2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -11,7 +11,6 @@ */ process { - publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, @@ -26,8 +25,27 @@ process { ] } - withName: FASTQC { - ext.args = '--quiet' + withName: 'SAMTOOLS_HEADER_VIEW' { + ext.args = 'view -h -H' + ext.args2 = '\\(SM:\\S*\\)' + ext.args3 = 's/SM://g' + + } + + withName: 'VCF2MAF' { + ext.args = '--ncbi-build GRCh37 --species homo_sapiens --maf-center mskcc.org --max-filter-ac 10 --min-hom-vaf 0.7 --cache-version 86 --buffer-size 5000 --vep-data /var/cache --vep-path /usr/bin/vep --custom-enst /usr/bin/vcf2maf/data/isoform_overrides_at_mskcc' + } + + withName: 'BCFTOOLS_CONCAT' { + ext.args = '--output-type v --allow-overlaps --rm-dups all' + } + + withName: 'DELLY_CALL' { + ext.args = '--map-qual 1 --mad-cutoff 9 --geno-qual 5' + } + + withName: 'DELLY_FILTER' { + ext.args = '--filter somatic --altaf 0.04 --minsize 500 --maxsize 500000000 --ratiogeno 0.0 --pass --coverage 10 --controlcontamination 0 --gq 15 --rddel 0.800000012 --rddup 1.20000005' } withName: CUSTOM_DUMPSOFTWAREVERSIONS { @@ -37,14 +55,4 @@ process { pattern: '*_versions.yml' ] } - - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } - publishDir = [ - path: { "${params.outdir}/multiqc" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - } diff --git a/conf/test_juno.config b/conf/test_juno.config new file mode 100644 index 0000000..ba1a535 --- /dev/null +++ b/conf/test_juno.config @@ -0,0 +1,58 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/phoenix -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +includeConfig 'juno_resources.config' + +process { + beforeScript = "module load singularity/3.7.1; unset R_LIBS; catch_term () { echo 'caught USR2/TERM signal'; set +e; false; on_exit ; } ; trap catch_term USR2 TERM" + maxRetries = 3 + + singularity.cacheDir = '/juno/work/ci/dev/dev_phoenix/singularity_cachedir' + +// executor = "lsf" +// clusterOptions = "-sla CMOVOYAGER" +// queueSize = 500 +// perJobMemLimit = true +} + +params { + config_profile_name = 'Juno profile' + config_profile_description = 'Minimal Juno profile' + + // Limit resources so that this can run on GitHub Actions + + // max_cpus = 2 + // max_memory = '6.GB' + // max_time = '6.h' + + // Genome references + genome = 'GRCh37' + + // Ignore MultiQC + + multiqc_config = null + multiqc_logo = null + multiqc_methods_description = null + + //Ignore igenomes + igenomes_ignore = true + + // Reference genome options + // genome = null + fasta = "${genome_resources.genomes[ params.genome ][ 'fasta' ]}" + fasta_index_dict = "${fasta.replaceAll('fasta$','dict')}" + fasta_index = ["${fasta}.fai","${fasta}.amb","${fasta}.ann","${fasta}.bwt","${fasta}.pac","${fasta}.sa","${fasta_index_dict}"] + exac_filter = "${genome_resources.resources.exac_filter}" + exac_filter_index = "${exac_filter}.tbi" + delly_type = ['DUP', 'DEL', 'INV', 'INS', 'BND'] + delly_exclude = "${genome_resources.genomes[params.genome]['delly']}" +} diff --git a/lib/WorkflowSif.groovy b/lib/WorkflowSif.groovy index b9aba05..964b6b7 100755 --- a/lib/WorkflowSif.groovy +++ b/lib/WorkflowSif.groovy @@ -12,12 +12,12 @@ class WorkflowSif { // public static void initialise(params, log) { - genomeExistsError(params, log) + // genomeExistsError(params, log) - if (!params.fasta) { - Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - } + //if (!params.fasta) { + // Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." + //} } // diff --git a/modules/local/add_maf_comment.nf b/modules/local/add_maf_comment.nf new file mode 100644 index 0000000..5cbb667 --- /dev/null +++ b/modules/local/add_maf_comment.nf @@ -0,0 +1,54 @@ +process ADD_MAF_COMMENT { + tag "$meta.id" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'docker://mskcc/alpine:3.19-with-bash': + 'docker.io/mskcc/alpine:3.19-with-bash' }" + + publishDir "${params.outdir}/${meta.id}/", pattern: "${meta.id}.*", mode: params.publish_dir_mode + + containerOptions "--bind $projectDir" + + input: + tuple val(meta), path(input_maf) + val(tool_name) + val(tool_version) + + output: + tuple val(meta), path("*.svs.maf") , emit: maf + path "versions.yml" , emit: versions + + script: + task.ext.when == null || task.ext.when + def prefix = task.ext.prefix ?: "${meta.id}" + def tool_name_trim = "${tool_name}".trim() + def tool_version_trim = "${tool_version}".trim() + + """ + $projectDir/bin/concat_with_comments.sh \\ + ${tool_name_trim} \\ + ${tool_version_trim} \\ + ${prefix}.svs.maf \\ + ${input_maf} + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + grep: BusyBox v1.36.1 + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + + touch ${prefix}.svs.maf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + grep: BusyBox v1.36.1 + END_VERSIONS + """ +} diff --git a/modules/local/bcftools_concat.nf b/modules/local/bcftools_concat.nf new file mode 100644 index 0000000..6413c7f --- /dev/null +++ b/modules/local/bcftools_concat.nf @@ -0,0 +1,49 @@ +process BCFTOOLS_CONCAT { + + + tag "$meta.id" + label 'process_medium' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/htslib:1.9': + 'docker.io/mskcc/htslib:1.9' }" + + input: + tuple val(meta), path(inputVcfs), path(inputVcfTbis) + + output: + tuple val(meta), path("*.vcf") , emit: vcf + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_vcfs = inputVcfs.join(" ") + def output_vcf = "${prefix}.combined.svs.vcf" + """ + /usr/bin/bcftools concat \\ + ${input_vcfs} \\ + ${args} \\ + --output ${output_vcf} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: 1.9 + htslib: 1.9 + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.combined.svs.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: 1.9 + htslib: 1.9 + END_VERSIONS + """ + +} diff --git a/modules/local/delly_call.nf b/modules/local/delly_call.nf new file mode 100644 index 0000000..97f21ef --- /dev/null +++ b/modules/local/delly_call.nf @@ -0,0 +1,58 @@ +process DELLY_CALL { + + + tag "$meta.id" + label 'process_medium' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'docker://mskcc/delly:1.2.6': + 'docker.io/mskcc/delly:1.2.6' }" + + input: + tuple val(meta), path(normal), path(normal_index) + tuple val(meta2), path(tumor), path(tumor_index) + tuple val(meta3), path(fasta) + tuple val(meta4), path(fai) + tuple val(meta4), path(exclude) + each delly_type + + output: + tuple val(meta), val(delly_type), path("*.bcf"), path("*.bcf.csi") , emit: sv_output + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + /opt/delly/bin/delly \\ + call \\ + ${args} \\ + --genome ${fasta} \\ + --exclude ${exclude} \\ + --outfile ${prefix}.${delly_type}.bcf \\ + ${tumor} \\ + ${normal} + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + delly: 1.2.6 + htslib: 1.15.1 + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.${delly_type}.bcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + delly: 1.2.6 + htslib: 1.15.1 + END_VERSIONS + """ + +} diff --git a/modules/local/delly_filter.nf b/modules/local/delly_filter.nf new file mode 100644 index 0000000..637815b --- /dev/null +++ b/modules/local/delly_filter.nf @@ -0,0 +1,56 @@ +process DELLY_FILTER { + + + tag "$meta.id" + label 'process_medium' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/delly:1.2.6': + 'docker.io/mskcc/delly:1.2.6' }" + + input: + tuple val(meta), val(delly_type), path(sv_output), path(sv_index) + + output: + tuple val(meta), path("*.bcf"), path("*.bcf.csi") , emit: sv_pass_output + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def pair_file_name = "tn_pair.txt" + + """ + cat <<-END_PAIR > ${pair_file_name} + ${meta.tumorSampleName} tumor + ${meta.normalSampleName} control + END_PAIR + + /opt/delly/bin/delly \\ + filter \\ + ${args} \\ + --samples ${pair_file_name} \\ + --outfile ${prefix}.${delly_type}.pass.bcf \\ + ${sv_output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + delly: 1.2.6 + htslib: 1.15.1 + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.${delly_type}.pass.bcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + delly: 1.2.6 + htslib: 1.15.1 + END_VERSIONS + """ + +} diff --git a/modules/local/format_maf.nf b/modules/local/format_maf.nf new file mode 100644 index 0000000..904908c --- /dev/null +++ b/modules/local/format_maf.nf @@ -0,0 +1,53 @@ +process FORMAT_MAF { + + + tag "$meta.id" + label 'process_medium' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'docker://mskcc/alpine:3.19-with-bash': + 'docker.io/mskcc/alpine:3.19-with-bash' }" + + containerOptions "--bind $projectDir" + + publishDir "${params.outdir}/${meta.id}/", pattern: "${meta.id}.portal.txt", mode: params.publish_dir_mode + + input: + tuple val(meta), path(inputMaf) + + output: + tuple val(meta), path("*.portal.txt") , emit: portal + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + $projectDir/bin/format_maf.sh \\ + ${prefix} \\ + ${inputMaf} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + grep: BusyBox v1.36.1 + awk: BusyBox v1.36.1 + sed: BusyBox v1.36.1 + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.portal.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + grep: BusyBox v1.36.1 + awk: BusyBox v1.36.1 + sed: BusyBox v1.36.1 + END_VERSIONS + """ + +} diff --git a/modules/local/get_bam_header.nf b/modules/local/get_bam_header.nf new file mode 100644 index 0000000..909eb2a --- /dev/null +++ b/modules/local/get_bam_header.nf @@ -0,0 +1,36 @@ +process SAMTOOLS_HEADER_VIEW { + + + tag "$meta.id" + label 'process_medium' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/htslib:1.9': + 'docker.io/mskcc/htslib:1.9' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), stdout , emit: sample_name + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + /usr/local/bin/samtools \\ + ${args} \\ + ${bam} | grep -o '${args2}' | sed '${args3}' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: 1.9 + htslib: 1.9 + END_VERSIONS + """ + +} diff --git a/modules/local/get_tool_version.nf b/modules/local/get_tool_version.nf new file mode 100644 index 0000000..b55cfaa --- /dev/null +++ b/modules/local/get_tool_version.nf @@ -0,0 +1,44 @@ +process GET_TOOL_VERSION { + + + tag "get_version_$tool" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/alpine:3.19-with-bash': + 'docker.io/mskcc/alpine:3.19-with-bash' }" + + input: + val(tool) + path(version_yaml), stageAs: "tool_version.yml" + + output: + stdout emit: tool_version + path "versions.yml" , emit: versions + + script: + task.ext.when == null || task.ext.when + def prefix = task.ext.prefix + + """ + grep '${tool}:' tool_version.yml | tail -n1 | awk '{ print \$2}' + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + grep: BusyBox v1.36.1 + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix + """ + echo "1.0" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + grep: BusyBox v1.36.1 + END_VERSIONS + """ +} diff --git a/modules/local/vcf2maf.nf b/modules/local/vcf2maf.nf new file mode 100644 index 0000000..f2a402b --- /dev/null +++ b/modules/local/vcf2maf.nf @@ -0,0 +1,52 @@ +process VCF2MAF { + + + tag "$meta.id" + label 'process_medium' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'docker://mskcc/vcf2maf:1.6.17': + 'docker.io/mskcc/vcf2maf:1.6.17' }" + + input: + tuple val(meta), path(inputVcf) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(exac_filter) + tuple val(meta5), path(exac_filter_tbi) + + output: + tuple val(meta), path("*.maf") , emit: maf + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def forks = task.cpus * 2 + + """ + perl /usr/bin/vcf2maf/vcf2maf.pl \\ + ${args} \\ + --input-vcf ${inputVcf} \\ + --ref-fasta ${fasta} \\ + --vcf-tumor-id ${meta.tumorSampleName} \\ + --tumor-id ${meta.tumorSampleName} \\ + --vcf-normal-id ${meta.normalSampleName} \\ + --normal-id ${meta.normalSampleName} \\ + --filter-vcf ${exac_filter} \\ + --vep-forks ${forks} \\ + --output-maf ${prefix}.maf + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + perl: \$(perl -v | head -n 2 | grep -o '(v.*)' | sed 's/[()]//g') + vcf2maf: 1.6.17 + VEP: 86 + htslib: 1.9 + samtools: 1.9 + bcftools: 1.9 + END_VERSIONS + """ + +} diff --git a/nextflow.config b/nextflow.config index 1e79485..c6c2ac9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,7 +16,7 @@ params { genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false - + // MultiQC options multiqc_config = null @@ -43,7 +43,7 @@ params { custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null - + // Max resource options // Defaults only, expecting to be overwritten @@ -170,6 +170,7 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } + test_juno { includeConfig 'conf/test_juno.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 0aecf87..f212995 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -3,6 +3,7 @@ // include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { SAMTOOLS_HEADER_VIEW as normal_header; SAMTOOLS_HEADER_VIEW as tumor_header} from '../../modules/local/get_bam_header' workflow INPUT_CHECK { take: @@ -12,33 +13,121 @@ workflow INPUT_CHECK { SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } + .map { create_bam_channel(it) } + .set { bam_files } + tumor_sample = bam_files + .map { + new Tuple(it[0],it[1][0]) + } + normal_sample = bam_files + .map { + new Tuple(it[0],it[1][1]) + } + tumor_header( tumor_sample ) + normal_header( normal_sample ) + + combined_bams = tuple_join(bam_files, tumor_header.out.sample_name) + combined_bams = tuple_join(combined_bams,normal_header.out.sample_name ) + + bams = combined_bams + .map{ set_samplename_meta(it) } + + ch_versions = Channel.empty() + ch_versions = ch_versions.mix(SAMPLESHEET_CHECK.out.versions) + ch_versions = ch_versions.mix(tumor_header.out.versions) + ch_versions = ch_versions.mix(normal_header.out.versions) emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + bams = bams // channel: [ val(meta), [ bams ] ] + versions = ch_versions // channel: [ versions.yml ] } -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { +def tuple_join(first, 
second) { + first_channel = first + .map{ + new Tuple(it[0].id,it) + } + second_channel = second + .map{ + new Tuple(it[0].id,it) + } + mergedWithKey = first_channel + .join(second_channel) + merged = mergedWithKey + .map{ + it[1] + it[2][1] + } + return merged + +} + +// Function to get list of [ meta, [ tumorBam, normalBam, assay, normalType ] ] +def create_bam_channel(LinkedHashMap row) { // create meta map def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() - - // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" + meta.id = row.pairId + meta.assay = row.assay + meta.normalType = row.normalType + + // add path(s) of the bam files to the meta map + def bams = [] + def bedFile = null + if (!file(row.tumorBam).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Tumor BAM file does not exist!\n${row.tumorBam}" + } + if (!file(row.normalBam).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Normal BAM file does not exist!\n${row.normalBam}" + } + + def tumorBai = "${row.tumorBam}.bai" + def normalBai = "${row.normalBam}.bai" + def tumorBaiAlt = "${row.tumorBam}".replaceAll('bam$', 'bai') + def normalBaiAlt = "${row.normalBam}".replaceAll('bam$', 'bai') + + def foundTumorBai = "" + def foundNormalBai = "" + + + if (file(tumorBai).exists()) { + foundTumorBai = tumorBai + } + else{ + if(file(tumorBaiAlt).exists()){ + foundTumorBai = tumorBaiAlt + } + else{ + exit 1, "ERROR: Please verify inputs -> Tumor BAI file does not exist!\n${row.tumorBam}" + } + } + if (file(normalBai).exists()) { + foundNormalBai = normalBai + } + else{ + if(file(normalBaiAlt).exists()){ + foundNormalBai = normalBaiAlt + } + else{ + exit 1, "ERROR: Please verify inputs -> Normal BAI file does not exist!\n${row.normalBam}" } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] } - return fastq_meta + + + bams = [ meta, [ file(row.tumorBam), file(row.normalBam) ], [ file(foundTumorBai), file(foundNormalBai) ]] + return bams +} + +def set_samplename_meta(List bams) { + meta = bams[0] + def tumorSample = bams[3] + def normalSample = bams[4] + if( tumorSample == null || tumorSample.isEmpty() ){ + exit 1, "ERROR: No sample name found for tumor sample, please make sure the SM tag is set in the bam\n${tumorBam}" + } + if( normalSample == null || normalSample.isEmpty() ){ + exit 1, "ERROR: No sample name found for normal sample, please make sure the SM tag is set in the bam\n${normalBam}" + } + meta.tumorSampleName = tumorSample.trim() + meta.normalSampleName = normalSample.trim() + return [ meta, bams[1], bams[2] ] + } diff --git a/subworkflows/local/sv.nf b/subworkflows/local/sv.nf new file mode 100644 index 0000000..fa50d35 --- /dev/null +++ b/subworkflows/local/sv.nf @@ -0,0 +1,105 @@ +include { DELLY_CALL } from '../../modules/local/delly_call' +include { DELLY_FILTER } from '../../modules/local/delly_filter' +include { BCFTOOLS_CONCAT as concat_sv; BCFTOOLS_CONCAT as concat_filtered_sv} from '../../modules/local/bcftools_concat' +include { VCF2MAF } from '../../modules/local/vcf2maf' +include { FORMAT_MAF } from '../../modules/local/format_maf' +include { GET_TOOL_VERSION 
} from '../../modules/local/get_tool_version' +include { ADD_MAF_COMMENT } from '../../modules/local/add_maf_comment' + + +workflow SV { + take: + ch_normal // normal bam + ch_tumor // tumor bam + ch_fasta_ref // fasta path + ch_fasta_fai_ref // fasta_fai path + ch_delly_exclude // delly exclude file + delly_type // delly type list + ch_exac_filter // Exac filter vcf + ch_exac_filter_index // Exac filter index + + main: + + DELLY_CALL ( + ch_normal, + ch_tumor, + ch_fasta_ref, + ch_fasta_fai_ref, + ch_delly_exclude, + delly_type + ) + + delly_call_output = DELLY_CALL.out.sv_output.transpose() + + DELLY_FILTER ( + delly_call_output + ) + + combined_sv = delly_call_output + .map{ + new Tuple(it[0].id,it[0],it[2],it[3]) + } + .groupTuple() + .map{ + new Tuple(it[1][0],it[2], it[3]) + } + + combined_filtered_sv = DELLY_FILTER.out.sv_pass_output + .map{ + new Tuple(it[0].id,it[0],it[1], it[2]) + } + .groupTuple() + .map{ + new Tuple(it[1][0],it[2], it[3]) + } + + concat_sv ( + combined_sv + ) + + concat_filtered_sv ( + combined_filtered_sv + ) + + VCF2MAF ( + concat_filtered_sv.out.vcf, + ch_fasta_ref, + ch_fasta_fai_ref, + ch_exac_filter, + ch_exac_filter_index + ) + + delly_tool = Channel.value("delly") + + GET_TOOL_VERSION ( + delly_tool, + DELLY_CALL.out.versions + ) + + ADD_MAF_COMMENT ( + VCF2MAF.out.maf, + delly_tool, + GET_TOOL_VERSION.out.tool_version + ) + + FORMAT_MAF ( + VCF2MAF.out.maf + ) + + ch_versions = Channel.empty() + ch_versions = ch_versions.mix(DELLY_CALL.out.versions) + ch_versions = ch_versions.mix(DELLY_FILTER.out.versions) + ch_versions = ch_versions.mix(concat_sv.out.versions) + ch_versions = ch_versions.mix(concat_filtered_sv.out.versions) + ch_versions = ch_versions.mix(VCF2MAF.out.versions) + ch_versions = ch_versions.mix(GET_TOOL_VERSION.out.versions) + ch_versions = ch_versions.mix(ADD_MAF_COMMENT.out.versions) + ch_versions = ch_versions.mix(FORMAT_MAF.out.versions) + + emit: + sv = concat_sv.out.vcf + sv_filtered = concat_filtered_sv.out.vcf + maf_file = ADD_MAF_COMMENT.out.maf + portal = FORMAT_MAF.out.portal + versions = ch_versions +} diff --git a/workflows/sif.nf b/workflows/sif.nf index 61a2220..2c960e1 100644 --- a/workflows/sif.nf +++ b/workflows/sif.nf @@ -35,7 +35,9 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // + include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { SV } from '../subworkflows/local/sv' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -46,8 +48,6 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* @@ -56,9 +56,6 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// Info required for completion email and summary -def multiqc_report = [] - workflow SIF { ch_versions = Channel.empty() @@ -69,45 +66,42 @@ workflow SIF { INPUT_CHECK ( file(params.input) ) - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - // ! There is currently no tooling to help you write a sample sheet schema - // - // MODULE: Run FastQC - // - FASTQC ( - INPUT_CHECK.out.reads + ch_fasta_ref = Channel.value([ "reference_genome", file(params.fasta) ]) + ref_index_list = [] + for(single_genome_ref in params.fasta_index){ + ref_index_list.add(file(single_genome_ref)) + } + ch_fasta_fai_ref = Channel.value([ "reference_genome_index",ref_index_list]) + ch_delly_exclude = Channel.value([ "delly_exclude", file(params.delly_exclude) ]) + ch_exac_filter = Channel.value(["exac_filter", file(params.exac_filter)]) + ch_exac_filter_index = Channel.value(["exac_filter_index", file(params.exac_filter_index)]) + ch_normal = INPUT_CHECK.out.bams + .map{ + new Tuple(it[0],it[1][1], it[2][1]) + } + ch_tumor = INPUT_CHECK.out.bams + .map{ + new Tuple(it[0],it[1][0], it[2][0]) + } + + SV ( + ch_normal, + ch_tumor, + ch_fasta_ref, + ch_fasta_fai_ref, + ch_delly_exclude, + params.delly_type, + ch_exac_filter, + ch_exac_filter_index ) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + + ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + ch_versions = ch_versions.mix(SV.out.versions) CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) - - // - // MODULE: MultiQC - // - workflow_summary = WorkflowSif.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) - - methods_description = WorkflowSif.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) - ch_methods_description = Channel.value(methods_description) - - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - multiqc_report = 
MULTIQC.out.report.toList() } /*
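
For quick reference, the samplesheet consumed by the updated bin/check_samplesheet.py is a comma-separated file with the header pairId,tumorBam,normalBam,assay,normalType,bedFile. Both BAM columns must point to existing .bam files with a .bai index alongside each, and bedFile may be left blank or set to NONE to have one generated. A minimal sketch of an end-to-end run against the new test_juno profile might look like the following; the pair ID, BAM paths, assay and normalType values, and the exact profile/flag combination are illustrative assumptions, not files or settings shipped with this change:

    $ cat samplesheet.csv
    pairId,tumorBam,normalBam,assay,normalType,bedFile
    PAIR_1,/path/to/TUMOR.bam,/path/to/NORMAL.bam,IMPACT505,NORMAL_TYPE,NONE

    $ nextflow run main.nf -profile test_juno,singularity --input samplesheet.csv --outdir results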
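
Note also that the tumor and normal sample names used downstream (meta.tumorSampleName / meta.normalSampleName) are not taken from the samplesheet: INPUT_CHECK reads them from the SM tag of each BAM header via SAMTOOLS_HEADER_VIEW. With the ext.args values set in conf/modules.config, that module assembles a command along these lines (the BAM path is assumed for illustration):

    $ samtools view -h -H /path/to/TUMOR.bam | grep -o '\(SM:\S*\)' | sed 's/SM://g'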