
Commit

Merge pull request #1 from mskcc/develop
Merge develop with dev
nikhil authored Jan 11, 2024
2 parents e952895 + 2494687 commit 431c090
Showing 22 changed files with 1,158 additions and 229 deletions.
100 changes: 50 additions & 50 deletions .github/ISSUE_TEMPLATE/bug_report.yml
@@ -2,53 +2,53 @@ name: Bug report
description: Report something that is broken or incorrect
labels: bug
body:
- type: textarea
id: description
attributes:
label: Description of the bug
description: A clear and concise description of what the bug is.
validations:
required: true
- type: textarea
id: command_used
attributes:
label: Command used and terminal output
description: Steps to reproduce the behaviour. Please paste the command you used
to launch the pipeline and the output from your terminal.
render: console
placeholder: '$ nextflow run ...
Some output where something broke
'
- type: textarea
id: files
attributes:
label: Relevant files
description: 'Please drag and drop the relevant files here. Create a `.zip` archive
if the extension is not allowed.
Your verbose log file `.nextflow.log` is often useful _(this is a hidden file
in the directory where you launched the pipeline)_ as well as custom Nextflow
configuration files.
'
- type: textarea
id: system
attributes:
label: System information
description: '* Nextflow version _(eg. 23.04.0)_
* Hardware _(eg. HPC, Desktop, Cloud)_
* Executor _(eg. slurm, local, awsbatch)_
* Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud,
or Apptainer)_
* OS _(eg. CentOS Linux, macOS, Linux Mint)_
* Version of mskcc/sif _(eg. 1.1, 1.5, 1.8.2)_
'
- type: textarea
id: description
attributes:
label: Description of the bug
description: A clear and concise description of what the bug is.
validations:
required: true
- type: textarea
id: command_used
attributes:
label: Command used and terminal output
description: Steps to reproduce the behaviour. Please paste the command you used
to launch the pipeline and the output from your terminal.
render: console
placeholder: "$ nextflow run ...
Some output where something broke
"
- type: textarea
id: files
attributes:
label: Relevant files
description: "Please drag and drop the relevant files here. Create a `.zip` archive
if the extension is not allowed.
Your verbose log file `.nextflow.log` is often useful _(this is a hidden file
in the directory where you launched the pipeline)_ as well as custom Nextflow
configuration files.
"
- type: textarea
id: system
attributes:
label: System information
description: "* Nextflow version _(eg. 23.04.0)_
* Hardware _(eg. HPC, Desktop, Cloud)_
* Executor _(eg. slurm, local, awsbatch)_
* Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud,
or Apptainer)_
* OS _(eg. CentOS Linux, macOS, Linux Mint)_
* Version of mskcc/sif _(eg. 1.1, 1.5, 1.8.2)_
"
30 changes: 15 additions & 15 deletions .nf-core.yml
@@ -1,23 +1,23 @@
lint:
files_exist:
- CODE_OF_CONDUCT.md
- assets/nf-core-sif_logo_light.png
- docs/images/nf-core-sif_logo_light.png
- docs/images/nf-core-sif_logo_dark.png
- .github/ISSUE_TEMPLATE/config.yml
- .github/workflows/awstest.yml
- .github/workflows/awsfulltest.yml
- CODE_OF_CONDUCT.md
- assets/nf-core-sif_logo_light.png
- docs/images/nf-core-sif_logo_light.png
- docs/images/nf-core-sif_logo_dark.png
- .github/ISSUE_TEMPLATE/config.yml
- .github/workflows/awstest.yml
- .github/workflows/awsfulltest.yml
files_unchanged:
- CODE_OF_CONDUCT.md
- assets/nf-core-sif_logo_light.png
- docs/images/nf-core-sif_logo_light.png
- docs/images/nf-core-sif_logo_dark.png
- .github/ISSUE_TEMPLATE/bug_report.yml
- CODE_OF_CONDUCT.md
- assets/nf-core-sif_logo_light.png
- docs/images/nf-core-sif_logo_light.png
- docs/images/nf-core-sif_logo_dark.png
- .github/ISSUE_TEMPLATE/bug_report.yml
multiqc_config:
- report_comment
- report_comment
nextflow_config:
- manifest.name
- manifest.homePage
- manifest.name
- manifest.homePage
repository_type: pipeline
template:
prefix: mskcc
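These lint entries tell `nf-core lint` which template files and config checks this pipeline intentionally opts out of (the logos, the AWS test workflows, the manifest name/homePage checks, and the MultiQC report comment). As a small illustrative sketch, not part of this commit and assuming PyYAML is installed and the file sits at the repository root, the exemptions can be tallied like so:

import yaml
from pathlib import Path

# Summarize which lint checks .nf-core.yml opts out of.
config = yaml.safe_load(Path(".nf-core.yml").read_text())
for check, entries in config.get("lint", {}).items():
    print(f"{check}: {len(entries)} exemption(s)")
# e.g. "files_exist: 7 exemption(s)", "files_unchanged: 5 exemption(s)", ...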
4 changes: 2 additions & 2 deletions assets/multiqc_config.yml
@@ -1,8 +1,8 @@
report_comment: >
This report has been generated by the <a href="https://github.com/mskcc/sif/releases/tag/1.0.0" target="_blank">mskcc/sif</a>
analysis pipeline.
report_section_order:
"mskcc-sif-methods-description":
order: -1000
141 changes: 57 additions & 84 deletions bin/check_samplesheet.py
@@ -24,39 +24,32 @@ class RowChecker:
"""

VALID_FORMATS = (
".fq.gz",
".fastq.gz",
)
VALID_FORMATS = (".bam",)  # tuple, so the endswith() check below tests the full extension

def __init__(
self,
sample_col="sample",
first_col="fastq_1",
second_col="fastq_2",
single_col="single_end",
pairId="pairId",
tumorBam="tumorBam",
normalBam="normalBam",
assay="assay",
normalType="normalType",
bedFile="bedFile",
**kwargs,
):
"""
Initialize the row checker with the expected column names.
Args:
sample_col (str): The name of the column that contains the sample name
(default "sample").
first_col (str): The name of the column that contains the first (or only)
FASTQ file path (default "fastq_1").
second_col (str): The name of the column that contains the second (if any)
FASTQ file path (default "fastq_2").
single_col (str): The name of the new column that will be inserted and
records whether the sample contains single- or paired-end sequencing
reads (default "single_end").
"""
super().__init__(**kwargs)
self._sample_col = sample_col
self._first_col = first_col
self._second_col = second_col
self._single_col = single_col
self._pairId = pairId
self._tumorBam = tumorBam
self._normalBam = normalBam
self._assay = assay
self._normalType = normalType
self._bedFile = bedFile
self._seen = set()
self.modified = []
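As an aside (not part of this commit): the constructor's keyword defaults double as the expected samplesheet column names, and they can be overridden if a sheet uses different headers. A minimal sketch; the alternative header names below are hypothetical:

# Defaults expect: pairId, tumorBam, normalBam, assay, normalType, bedFile
checker = RowChecker()

# Column names can be remapped for a differently labelled sheet (hypothetical names).
legacy_checker = RowChecker(tumorBam="tumor_bam", normalBam="normal_bam")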

@@ -69,65 +62,53 @@ def validate_and_transform(self, row):
(values).
"""
self._validate_sample(row)
self._validate_first(row)
self._validate_second(row)
self._validate_pair(row)
self._seen.add((row[self._sample_col], row[self._first_col]))
self._validate_names(row)
self._validate_bams(row)
self._validate_normalType(row)
self._validate_bed_format(row)
self._seen.add((row[self._pairId]))
self.modified.append(row)

def _validate_sample(self, row):
"""Assert that the sample name exists and convert spaces to underscores."""
if len(row[self._sample_col]) <= 0:
raise AssertionError("Sample input is required.")
# Sanitize samples slightly.
row[self._sample_col] = row[self._sample_col].replace(" ", "_")
def _validate_names(self, row):
"""Assert that the sample names exist"""
if len(row[self._pairId]) <= 0:
raise AssertionError("pairId is required.")

def _validate_pairId_format(self, row):
id_value = row[self._pairId]
if "." in id_value:
raise AssertionError("pairId:{} cannot contain any periods ('.') ".format(id_value))

def _validate_first(self, row):
def _validate_bams(self, row):
"""Assert that both BAM entries are non-empty and have the right format."""
if len(row[self._first_col]) <= 0:
raise AssertionError("At least the first FASTQ file is required.")
self._validate_fastq_format(row[self._first_col])

def _validate_second(self, row):
"""Assert that the second FASTQ entry has the right format if it exists."""
if len(row[self._second_col]) > 0:
self._validate_fastq_format(row[self._second_col])

def _validate_pair(self, row):
"""Assert that read pairs have the same file extension. Report pair status."""
if row[self._first_col] and row[self._second_col]:
row[self._single_col] = False
first_col_suffix = Path(row[self._first_col]).suffixes[-2:]
second_col_suffix = Path(row[self._second_col]).suffixes[-2:]
if first_col_suffix != second_col_suffix:
raise AssertionError("FASTQ pairs must have the same file extensions.")
else:
row[self._single_col] = True

def _validate_fastq_format(self, filename):
if len(row[self._tumorBam]) <= 0 or len(row[self._normalBam]) <= 0:
raise AssertionError("Both bam files are required.")
self._validate_bam_format(row[self._tumorBam])
self._validate_bam_format(row[self._normalBam])

def _validate_normalType(self, row):
"""Assert that the normal type exists."""
if len(row[self._normalType]) <= 0:
raise AssertionError("normalType is required.")

def _validate_bam_format(self, filename):
"""Assert that a given filename has one of the expected BAM extensions."""
if not any(filename.endswith(extension) for extension in self.VALID_FORMATS):
raise AssertionError(
f"The FASTQ file has an unrecognized extension: {filename}\n"
f"The BAM file has an unrecognized extension: {filename}\n"
f"It should be one of: {', '.join(self.VALID_FORMATS)}"
)

def validate_unique_samples(self):
"""
Assert that the combination of sample name and FASTQ filename is unique.
In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment.
"""
if len(self._seen) != len(self.modified):
raise AssertionError("The pair of sample name and FASTQ must be unique.")
seen = Counter()
for row in self.modified:
sample = row[self._sample_col]
seen[sample] += 1
row[self._sample_col] = f"{sample}_T{seen[sample]}"
def _validate_bed_format(self, row):
"""Assert that a given filename has one of the expected BED extensions."""
filename = row[self._bedFile]
if filename and filename != "NONE":
if not filename.endswith(".bed"):
raise AssertionError(
f"The BED file has an unrecognized extension: {filename}\n"
f"It should be .bed\n"
f"If you would like one generated for you, leave it blank or enter 'NONE'\n"
)
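To see the new row-level checks end to end, here is an illustrative usage sketch (not part of this commit; the assay and normal-type values are hypothetical placeholders):

checker = RowChecker()
row = {
    "pairId": "PATIENT1_TUMOR_NORMAL",
    "tumorBam": "patient1_tumor.bam",
    "normalBam": "patient1_normal.bam",
    "assay": "IMPACT505",      # stored as-is; not validated by the checks above
    "normalType": "MATCHED",   # must be non-empty
    "bedFile": "NONE",         # empty or 'NONE' asks the pipeline to generate one
}
checker.validate_and_transform(row)   # passes

bad = dict(row, tumorBam="patient1_tumor.fastq.gz")
checker.validate_and_transform(bad)   # raises AssertionError: unrecognized extension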


def read_head(handle, num_lines=10):
@@ -164,10 +145,9 @@ def sniff_format(handle):

def check_samplesheet(file_in, file_out):
"""
Check that the tabular samplesheet has the structure expected by nf-core pipelines.
Check that the tabular samplesheet has the structure expected by the ODIN pipeline.
Validate the general shape of the table, expected columns, and each row. Also add
an additional column which records whether one or two FASTQ reads were found.
Validate the general shape of the table, expected columns, and each row.
Args:
file_in (pathlib.Path): The given tabular samplesheet. The format can be either
@@ -179,19 +159,14 @@ def check_samplesheet(file_in, file_out):
This function checks that the samplesheet follows the following structure,
see also the `viral recon samplesheet`_::
sample,fastq_1,fastq_2
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
.. _viral recon samplesheet:
https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
pairId,tumorBam,normalBam,assay,normalType,bedFile
SAMPLE_TUMOR.SAMPLE_NORMAL,BAM_TUMOR,BAM_NORMAL,BAITS,NORMAL_TYPE,BED_FILE
"""
required_columns = {"sample", "fastq_1", "fastq_2"}
required_columns = {"pairId", "tumorBam", "normalBam", "assay", "normalType", "bedFile"}
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_in.open(newline="") as in_handle:
reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle), delimiter=",")
# Validate the existence of the expected header columns.
if not required_columns.issubset(reader.fieldnames):
req_cols = ", ".join(required_columns)
@@ -205,9 +180,7 @@ def check_samplesheet(file_in, file_out):
except AssertionError as error:
logger.critical(f"{str(error)} On line {i + 2}.")
sys.exit(1)
checker.validate_unique_samples()
header = list(reader.fieldnames)
header.insert(1, "single_end")
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_out.open(mode="w", newline="") as out_handle:
writer = csv.DictWriter(out_handle, header, delimiter=",")
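Finally, an end-to-end sketch of how the rewritten entry point might be exercised (illustrative only: file names and column values are made up, and it assumes the functions above are importable from bin/check_samplesheet.py):

from pathlib import Path

samplesheet = Path("samplesheet.csv")
samplesheet.write_text(
    "pairId,tumorBam,normalBam,assay,normalType,bedFile\n"
    "PATIENT1_TUMOR_NORMAL,patient1_tumor.bam,patient1_normal.bam,IMPACT505,MATCHED,NONE\n"
)

# Validates the header and every row, then writes the checked sheet to the output path.
check_samplesheet(samplesheet, Path("samplesheet.valid.csv"))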
