
Commit

Merge branch '405-somatic-variant-qc' of github.com:bihealth/snappy-pipeline into 405-somatic-variant-qc
giacuong171 committed Nov 20, 2023
2 parents e9dbc5e + 8d1d2c8 commit 70d5bb0
Showing 163 changed files with 55,617 additions and 2,623 deletions.
3 changes: 3 additions & 0 deletions docs/dev_intro.rst
@@ -99,6 +99,9 @@ Usually, you define a :class:`BaseStep <snappy_pipeline.workflows.abstract.BaseS
The current configuration is passed into the constructor of this class, which then "takes over": it applies default settings, generates cluster resource settings, etc.
Then, you pass the result of method calls to your :class:`BaseStep <snappy_pipeline.workflows.abstract.BaseStep>` instance as the values for the ``input:``, ``output:``, etc. sections of your ``Snakefile``.

.. warning::
By convention, your new workflow step should be instantiated as ``wf = StepClass(...)`` in the ``Snakefile`` during object setup. Otherwise, tools such as cubi-tk may not be able to detect and parse your step. See an existing workflow ``Snakefile`` for reference.

The :class:`BaseStep <snappy_pipeline.workflows.abstract.BaseStep>` sub class itself uses :class:`BaseStepPart <snappy_pipeline.workflows.abstract.BaseStepPart>` sub classes for the implementation of the individual parts.
One part might be linking in FASTQ files from the raw input directory or linking from the ``work/`` to the ``output/`` directory.
Another part might be somatic variant calling using mutect or WGS SV calling using Delly2.
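
A minimal sketch of this convention, with a hypothetical step module and class (the exact constructor arguments vary; copy them from an existing workflow ``Snakefile``):

.. code-block:: python

    import os

    from snappy_pipeline.workflows.my_step import MyStepWorkflow  # hypothetical module

    # Instantiate the step as ``wf`` so that tools such as cubi-tk can detect it;
    # ``config``, ``lookup_paths`` and ``config_paths`` come from the usual
    # configuration-loading preamble of the Snakefile.
    wf = MyStepWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())

    rule all:
        input:
            wf.get_result_files(),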
1 change: 1 addition & 0 deletions requirements/test.txt
@@ -5,6 +5,7 @@ pytest
coverage
pytest-cov
pytest-mock
pytest-subprocess

# Fake file system for testing
pyfakefs
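
pytest-subprocess provides an ``fp`` fixture that fakes external commands, so tests never invoke the real binaries. A minimal sketch of typical usage (the faked command and its output are invented for illustration):

    import subprocess

    def test_tool_version(fp):
        # Register the fake command; the real binary is never executed.
        fp.register(["bcftools", "--version"], stdout="bcftools 1.17")
        result = subprocess.run(["bcftools", "--version"], capture_output=True, text=True)
        assert "bcftools" in result.stdout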
4 changes: 4 additions & 0 deletions snappy_pipeline/apps/snappy_snake.py
@@ -25,12 +25,14 @@
helper_gcnv_model_targeted,
helper_gcnv_model_wgs,
hla_typing,
homologous_recombination_deficiency,
igv_session_generation,
ngs_data_qc,
ngs_mapping,
ngs_sanity_checking,
panel_of_normals,
repeat_expansion,
somatic_cnv_checking,
somatic_gene_fusion_calling,
somatic_hla_loh_calling,
somatic_msi_calling,
@@ -80,12 +82,14 @@
"helper_gcnv_model_targeted": helper_gcnv_model_targeted,
"helper_gcnv_model_wgs": helper_gcnv_model_wgs,
"hla_typing": hla_typing,
"homologous_recombination_deficiency": homologous_recombination_deficiency,
"igv_session_generation": igv_session_generation,
"ngs_mapping": ngs_mapping,
"ngs_data_qc": ngs_data_qc,
"panel_of_normals": panel_of_normals,
"repeat_analysis": repeat_expansion,
"ngs_sanity_checking": ngs_sanity_checking,
"somatic_cnv_checking": somatic_cnv_checking,
"somatic_gene_fusion_calling": somatic_gene_fusion_calling,
"somatic_hla_loh_calling": somatic_hla_loh_calling,
"somatic_msi_calling": somatic_msi_calling,
228 changes: 99 additions & 129 deletions snappy_pipeline/workflows/cbioportal_export/Snakefile
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""CUBI Pipeline cbioportal_export step Snakefile"""

import csv
import os

from snappy_pipeline import expand_ref
@@ -28,92 +29,52 @@ exclude_flag = wf.w_config["step_config"]["cbioportal_export"]["exclude_variant_


localrules:
# Linking files from work/ to output/ should be done locally
cbioportal_export_link_out_run,
# Assembling meta files & concatenating results should be done locally
# The sub-steps requiring R for merging (CNA) *CANNOT* be done locally,
# nor can the expression sub-step (computation of RPKM); see the short
# ``localrules`` illustration after this block
cbioportal_export_meta_files,
cbioportal_export_patient_data,
cbioportal_export_case_lists,
cbioportal_export_concatenate_maf,
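
``localrules`` marks the listed rules to run on the submitting host rather than as cluster jobs, which is why only cheap work (linking, concatenation, metadata assembly) appears above while the R-based merging does not. A minimal illustration (rule name and paths hypothetical):

    localrules:
        link_out,

    rule link_out:
        input:
            "work/sample1.bam",
        output:
            "output/sample1.bam",
        shell:
            "ln -sr {input} {output}"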


rule all:
input:
wf.get_result_files(),


# House-Keeping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Generic linking out ---------------------------------------------------------


rule cbioportal_export_link_out_run:
input:
wf.get_input_files("link_out", "run"),
output:
wf.get_output_files("link_out", "run"),
run:
shell(wf.get_shell_cmd("link_out", "run", wildcards))


# cbioportal study metadata ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


rule cbioportal_export_study_metadata:
output:
meta_file="work/upload/meta_study.txt",
threads: wf.get_resource("cbioportal_study_meta_files", "run", "threads")
resources:
time=wf.get_resource("cbioportal_study_meta_files", "run", "time"),
memory=wf.get_resource("cbioportal_study_meta_files", "run", "memory"),
partition=wf.get_resource("cbioportal_study_meta_files", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_study_meta_files", "run", "tmpdir"),
wrapper:
wf.wrapper_path("cbioportal/study_meta")


rule cbioportal_export_meta_files:
output:
wf.get_output_files("cbioportal_meta_files", "run"),
threads: wf.get_resource("cbioportal_meta_files", "run", "threads")
resources:
time=wf.get_resource("cbioportal_meta_files", "run", "time"),
memory=wf.get_resource("cbioportal_meta_files", "run", "memory"),
partition=wf.get_resource("cbioportal_meta_files", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_meta_files", "run", "tmpdir"),
wrapper:
wf.wrapper_path("cbioportal/meta_files")


# cbioportal patient and sample metadata ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# cbioportal patient and sample data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


rule cbioportal_export_patient_metadata:
rule cbioportal_export_patient_data:
output:
**wf.get_output_files("cbioportal_clinical_data", "run"),
threads: wf.get_resource("cbioportal_clinical_data", "run", "threads")
resources:
time=wf.get_resource("cbioportal_clinical_data", "run", "time"),
memory=wf.get_resource("cbioportal_clinical_data", "run", "memory"),
partition=wf.get_resource("cbioportal_clinical_data", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_clinical_data", "run", "tmpdir"),
params:
sheet=wf.substep_dispatch("cbioportal_clinical_data", "get_sample_sheets", "run"),
**wf.get_args("cbioportal_clinical_data", "run"),
wrapper:
wf.wrapper_path("cbioportal/clinical_data")


rule cbioportal_export_case_lists:
output:
**wf.get_output_files("cbioportal_case_lists", "run"),
threads: wf.get_resource("cbioportal_case_lists", "run", "threads")
resources:
time=wf.get_resource("cbioportal_case_lists", "run", "time"),
memory=wf.get_resource("cbioportal_case_lists", "run", "memory"),
partition=wf.get_resource("cbioportal_case_lists", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_case_lists", "run", "tmpdir"),
params:
sheet=wf.substep_dispatch("cbioportal_clinical_data", "get_sample_sheets", "run"),
**wf.get_args("cbioportal_case_lists", "run"),
wrapper:
wf.wrapper_path("cbioportal/case_lists")


# cbioportal data preparation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Mutation file (data_mutation_extended.txt) ==================================

# Create MAF ------------------------------------------------------------------

@@ -130,126 +91,135 @@ rule cbioportal_export_generate_mafs:
partition=wf.get_resource("cbioportal_vcf2maf", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_vcf2maf", "run", "tmpdir"),
params:
args=wf.substep_dispatch("cbioportal_vcf2maf", "get_args", "run"),
args=wf.get_args("cbioportal_vcf2maf", "run"),
log:
**wf.get_log_file("cbioportal_vcf2maf", "run"),
wrapper:
wf.wrapper_path("vcf2maf/vcf2maf")
wf.wrapper_path("vcf2maf/vcf_to_table")


# Create one MAF from all MAFs
# Merge all sample MAF files --------------------------------------------------


rule cbioportal_export_concatenate_maf:
# TODO: could this be included in the `localrules`? Or simply incorporated into another step?
# It looks like some postprocessing...
input:
*wf.get_input_files("cbioportal_maf", "run"),
**wf.get_input_files("cbioportal_mutations", "run"),
output:
"work/upload/data_mutation_extended.txt",
threads: wf.get_resource("cbioportal_maf", "run", "threads")
resources:
time=wf.get_resource("cbioportal_maf", "run", "time"),
memory=wf.get_resource("cbioportal_maf", "run", "memory"),
partition=wf.get_resource("cbioportal_maf", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_maf", "run", "tmpdir"),
wf.get_output_files("cbioportal_mutations", "run"),
shell:
r"""
cat \
<( head -n2 {input[0]} ) \
<( tail -q -n +3 {input}) \
| grep -v {exclude_flag} \
<( head -n 1 {input[0]} ) \
<( tail -q -n +2 {input}) \
| (if [[ -n "{exclude_flag}" ]] ; then grep -v {exclude_flag} ; else cat ; fi) \
> {output}
"""


# Create CNA gistic file ------------------------------------------------------
rule cbioportal_export_CNA_calls:
# Copy number files (data_cna_log2.txt, data_cna_gistic.txt, data_segment.txt)

# Create CNA files (discrete, continuous from cns files) ----------------------


rule cbioportal_export_generate_cna:
input:
wf.get_input_files("cbioportal_cna_data", "gistic"),
**wf.get_input_files("cbioportal_cns2cna", "run"),
output:
"work/upload/data_CNA_gistic.txt",
threads: wf.get_resource("cbioportal_cna_data", "gistic", "threads")
**wf.get_output_files("cbioportal_cns2cna", "run"),
threads: wf.get_resource("cbioportal_cns2cna", "run", "threads")
resources:
time=wf.get_resource("cbioportal_cna_data", "gistic", "time"),
memory=wf.get_resource("cbioportal_cna_data", "gistic", "memory"),
partition=wf.get_resource("cbioportal_cna_data", "gistic", "partition"),
tmpdir=wf.get_resource("cbioportal_cna_data", "gistic", "tmpdir"),
time=wf.get_resource("cbioportal_cns2cna", "run", "time"),
memory=wf.get_resource("cbioportal_cns2cna", "run", "memory"),
partition=wf.get_resource("cbioportal_cns2cna", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_cns2cna", "run", "tmpdir"),
params:
datatype="int",
**wf.get_args("cbioportal_cns2cna", "run"),
log:
**wf.get_log_file("cbioportal_cns2cna", "run"),
wrapper:
wf.wrapper_path("cbioportal/merge_tables")
wf.wrapper_path("cbioportal/generate_cna")


# Create CNA log2 file --------------------------------------------------------
# Merge sample-based CNA files to cBioPortal ----------------------------------


rule cbioportal_export_CNA_log2:
rule cbioportal_export_concatenate_cna_log2:
input:
wf.get_input_files("cbioportal_cna_data", "log2"),
**wf.get_input_files("cbioportal_cna", "log2"),
output:
"work/upload/data_CNA_log2.txt",
threads: wf.get_resource("cbioportal_cna_data", "log2", "threads")
resources:
time=wf.get_resource("cbioportal_cna_data", "log2", "time"),
memory=wf.get_resource("cbioportal_cna_data", "log2", "memory"),
partition=wf.get_resource("cbioportal_cna_data", "log2", "partition"),
tmpdir=wf.get_resource("cbioportal_cna_data", "log2", "tmpdir"),
wf.get_output_files("cbioportal_cna", "log2"),
params:
datatype="float",
**wf.get_args("cbioportal_cna", "log2"),
threads: wf.get_resource("cbioportal_cna", "log2", "threads")
resources:
time=wf.get_resource("cbioportal_cna", "log2", "time"),
memory=wf.get_resource("cbioportal_cna", "log2", "memory"),
partition=wf.get_resource("cbioportal_cna", "log2", "partition"),
tmpdir=wf.get_resource("cbioportal_cna", "log2", "tmpdir"),
log:
**wf.get_log_file("cbioportal_cna", "log2"),
wrapper:
wf.wrapper_path("cbioportal/merge_tables")


# Create CNA segmentation file ------------------------------------------------


rule cbioportal_export_segments:
rule cbioportal_export_concatenate_cna_gistic:
input:
wf.get_input_files("cbioportal_cna_data", "segments"),
**wf.get_input_files("cbioportal_cna", "gistic"),
output:
"work/upload/data_segment.txt",
threads: wf.get_resource("cbioportal_cna_data", "segments", "threads")
wf.get_output_files("cbioportal_cna", "gistic"),
params:
**wf.get_args("cbioportal_cna", "gistic"),
threads: wf.get_resource("cbioportal_cna", "gistic", "threads")
resources:
time=wf.get_resource("cbioportal_cna_data", "segments", "time"),
memory=wf.get_resource("cbioportal_cna_data", "segments", "memory"),
partition=wf.get_resource("cbioportal_cna_data", "segments", "partition"),
tmpdir=wf.get_resource("cbioportal_cna_data", "segments", "tmpdir"),
shell:
r"""
cat \
<( head -n1 {input[0]} ) \
<( tail -q -n +2 {input}) \
| sed 's/-DNA.-W[EG]S.//' \
> {output}
"""
time=wf.get_resource("cbioportal_cna", "gistic", "time"),
memory=wf.get_resource("cbioportal_cna", "gistic", "memory"),
partition=wf.get_resource("cbioportal_cna", "gistic", "partition"),
tmpdir=wf.get_resource("cbioportal_cna", "gistic", "tmpdir"),
log:
**wf.get_log_file("cbioportal_cna", "gistic"),
wrapper:
wf.wrapper_path("cbioportal/merge_tables")


# Create expression z-scores --------------------------------------------------
# Merge sample-based segment files to cBioPortal ------------------------------


rule cbioportal_create_dataframe_matching_results:
# TODO: could this be included in the `localrules`?
rule cbioportal_export_concatenate_segments:
input:
**wf.get_input_files("cbioportal_segment", "run"),
output:
"work/zscores_mapping_df.tsv",
threads: wf.get_resource("cbioportal_zscores", "run", "threads")
wf.get_output_files("cbioportal_segment", "run"),
params:
action_type="segment",
threads: wf.get_resource("cbioportal_segment", "run", "threads")
resources:
time=wf.get_resource("cbioportal_zscores", "run", "time"),
memory=wf.get_resource("cbioportal_zscores", "run", "memory"),
partition=wf.get_resource("cbioportal_zscores", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_zscores", "run", "tmpdir"),
run:
wf.substep_dispatch("cbioportal_zscores", "get_df", output)
time=wf.get_resource("cbioportal_segment", "run", "time"),
memory=wf.get_resource("cbioportal_segment", "run", "memory"),
partition=wf.get_resource("cbioportal_segment", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_segment", "run", "tmpdir"),
log:
**wf.get_log_file("cbioportal_segment", "run"),
wrapper:
wf.wrapper_path("cbioportal/merge_tables")


# Create expression RPKMs -----------------------------------------------------


rule cbioportal_compute_zscores:
rule cbioportal_export_expression:
input:
tsv="work/zscores_mapping_df.tsv",
**wf.get_input_files("cbioportal_expression", "run"),
output:
tsv="work/upload/data_expression_zscores.txt",
threads: wf.get_resource("cbioportal_compute_zscores", "run", "threads")
wf.get_output_files("cbioportal_expression", "run"),
params:
**wf.get_args("cbioportal_expression", "run"),
threads: wf.get_resource("cbioportal_expression", "run", "threads")
resources:
time=wf.get_resource("cbioportal_compute_zscores", "run", "time"),
memory=wf.get_resource("cbioportal_compute_zscores", "run", "memory"),
partition=wf.get_resource("cbioportal_compute_zscores", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_compute_zscores", "run", "tmpdir"),
time=wf.get_resource("cbioportal_expression", "run", "time"),
memory=wf.get_resource("cbioportal_expression", "run", "memory"),
partition=wf.get_resource("cbioportal_expression", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_expression", "run", "tmpdir"),
log:
**wf.get_log_file("cbioportal_expression", "run"),
wrapper:
wf.wrapper_path("cbioportal/zscores")
wf.wrapper_path("cbioportal/merge_tables")