
Commit

Merge branch '405-somatic-variant-qc' of github.com:bihealth/snappy-pipeline into 405-somatic-variant-qc
giacuong171 committed Nov 20, 2023
2 parents e9dbc5e + 8d1d2c8 commit 70d5bb0
Showing 163 changed files with 55,617 additions and 2,623 deletions.
3 changes: 3 additions & 0 deletions docs/dev_intro.rst
@@ -99,6 +99,9 @@ Usually, you define a :class:`BaseStep <snappy_pipeline.workflows.abstract.BaseS
The current configuration is passed into the constructor of this class, which then "takes over": it applies default settings, generates cluster resource settings, etc.
Then, you pass the result of method calls to your :class:`BaseStep <snappy_pipeline.workflows.abstract.BaseStep>` instance as the values for the ``input:``, ``output:``, etc. sections of your ``Snakefile``.

.. warning::
By convention, your new workflow step should be instantiated as ``wf = StepClass(...)`` in the ``Snakefile`` during object setup. Otherwise, tools such as cubi-tk may not be able to detect and parse your step. See an existing workflow ``Snakefile`` for reference.

The :class:`BaseStep <snappy_pipeline.workflows.abstract.BaseStep>` sub class itself uses :class:`BaseStepPart <snappy_pipeline.workflows.abstract.BaseStepPart>` sub classes for the implementation of the individual parts.
One part might be linking in FASTQ files from the raw input directory or linking from the ``work/`` to the ``output/`` directory.
Another part might be somatic variant calling using mutect or WGS SV calling using Delly2.
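
A minimal sketch of this convention, with a hypothetical step module and class (the exact constructor arguments vary; copy them from an existing workflow ``Snakefile``):

.. code-block:: python

    import os

    from snappy_pipeline.workflows.my_step import MyStepWorkflow  # hypothetical module

    # Instantiate the step as ``wf`` so that tools such as cubi-tk can detect it;
    # ``config``, ``lookup_paths`` and ``config_paths`` come from the usual
    # configuration-loading preamble of the Snakefile.
    wf = MyStepWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())

    rule all:
        input:
            wf.get_result_files(),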
1 change: 1 addition & 0 deletions requirements/test.txt
@@ -5,6 +5,7 @@ pytest
coverage
pytest-cov
pytest-mock
pytest-subprocess

# Fake file system for testing
pyfakefs
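
pytest-subprocess provides an ``fp`` fixture that fakes external commands, so tests never invoke the real binaries. A minimal sketch of typical usage (the faked command and its output are invented for illustration):

    import subprocess

    def test_tool_version(fp):
        # Register the fake command; the real binary is never executed.
        fp.register(["bcftools", "--version"], stdout="bcftools 1.17")
        result = subprocess.run(["bcftools", "--version"], capture_output=True, text=True)
        assert "bcftools" in result.stdout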
4 changes: 4 additions & 0 deletions snappy_pipeline/apps/snappy_snake.py
@@ -25,12 +25,14 @@
helper_gcnv_model_targeted,
helper_gcnv_model_wgs,
hla_typing,
homologous_recombination_deficiency,
igv_session_generation,
ngs_data_qc,
ngs_mapping,
ngs_sanity_checking,
panel_of_normals,
repeat_expansion,
somatic_cnv_checking,
somatic_gene_fusion_calling,
somatic_hla_loh_calling,
somatic_msi_calling,
@@ -80,12 +82,14 @@
"helper_gcnv_model_targeted": helper_gcnv_model_targeted,
"helper_gcnv_model_wgs": helper_gcnv_model_wgs,
"hla_typing": hla_typing,
"homologous_recombination_deficiency": homologous_recombination_deficiency,
"igv_session_generation": igv_session_generation,
"ngs_mapping": ngs_mapping,
"ngs_data_qc": ngs_data_qc,
"panel_of_normals": panel_of_normals,
"repeat_analysis": repeat_expansion,
"ngs_sanity_checking": ngs_sanity_checking,
"somatic_cnv_checking": somatic_cnv_checking,
"somatic_gene_fusion_calling": somatic_gene_fusion_calling,
"somatic_hla_loh_calling": somatic_hla_loh_calling,
"somatic_msi_calling": somatic_msi_calling,
228 changes: 99 additions & 129 deletions snappy_pipeline/workflows/cbioportal_export/Snakefile
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""CUBI Pipeline cbioportal_export step Snakefile"""

import csv
import os

from snappy_pipeline import expand_ref
@@ -28,92 +29,52 @@ exclude_flag = wf.w_config["step_config"]["cbioportal_export"]["exclude_variant_


localrules:
# Linking files from work/ to output/ should be done locally
cbioportal_export_link_out_run,
# Assembling meta files & concatenating results should be done locally
# The sub-steps requiring R for merging (CNA) *CANNOT* be done locally,
# nor can the expression sub-step (computation of RPKM); see the short
# ``localrules`` illustration after this block
cbioportal_export_meta_files,
cbioportal_export_patient_data,
cbioportal_export_case_lists,
cbioportal_export_concatenate_maf,
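
``localrules`` marks the listed rules to run on the submitting host rather than as cluster jobs, which is why only cheap work (linking, concatenation, metadata assembly) appears above while the R-based merging does not. A minimal illustration (rule name and paths hypothetical):

    localrules:
        link_out,

    rule link_out:
        input:
            "work/sample1.bam",
        output:
            "output/sample1.bam",
        shell:
            "ln -sr {input} {output}"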


rule all:
input:
wf.get_result_files(),


# House-Keeping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Generic linking out ---------------------------------------------------------


rule cbioportal_export_link_out_run:
input:
wf.get_input_files("link_out", "run"),
output:
wf.get_output_files("link_out", "run"),
run:
shell(wf.get_shell_cmd("link_out", "run", wildcards))


# cbioportal study metadata ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


rule cbioportal_export_study_metadata:
output:
meta_file="work/upload/meta_study.txt",
threads: wf.get_resource("cbioportal_study_meta_files", "run", "threads")
resources:
time=wf.get_resource("cbioportal_study_meta_files", "run", "time"),
memory=wf.get_resource("cbioportal_study_meta_files", "run", "memory"),
partition=wf.get_resource("cbioportal_study_meta_files", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_study_meta_files", "run", "tmpdir"),
wrapper:
wf.wrapper_path("cbioportal/study_meta")


rule cbioportal_export_meta_files:
output:
wf.get_output_files("cbioportal_meta_files", "run"),
threads: wf.get_resource("cbioportal_meta_files", "run", "threads")
resources:
time=wf.get_resource("cbioportal_meta_files", "run", "time"),
memory=wf.get_resource("cbioportal_meta_files", "run", "memory"),
partition=wf.get_resource("cbioportal_meta_files", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_meta_files", "run", "tmpdir"),
wrapper:
wf.wrapper_path("cbioportal/meta_files")


# cbioportal patient and sample metadata ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# cbioportal patient and sample data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


rule cbioportal_export_patient_metadata:
rule cbioportal_export_patient_data:
output:
**wf.get_output_files("cbioportal_clinical_data", "run"),
threads: wf.get_resource("cbioportal_clinical_data", "run", "threads")
resources:
time=wf.get_resource("cbioportal_clinical_data", "run", "time"),
memory=wf.get_resource("cbioportal_clinical_data", "run", "memory"),
partition=wf.get_resource("cbioportal_clinical_data", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_clinical_data", "run", "tmpdir"),
params:
sheet=wf.substep_dispatch("cbioportal_clinical_data", "get_sample_sheets", "run"),
**wf.get_args("cbioportal_clinical_data", "run"),
wrapper:
wf.wrapper_path("cbioportal/clinical_data")


rule cbioportal_export_case_lists:
output:
**wf.get_output_files("cbioportal_case_lists", "run"),
threads: wf.get_resource("cbioportal_case_lists", "run", "threads")
resources:
time=wf.get_resource("cbioportal_case_lists", "run", "time"),
memory=wf.get_resource("cbioportal_case_lists", "run", "memory"),
partition=wf.get_resource("cbioportal_case_lists", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_case_lists", "run", "tmpdir"),
params:
sheet=wf.substep_dispatch("cbioportal_clinical_data", "get_sample_sheets", "run"),
**wf.get_args("cbioportal_case_lists", "run"),
wrapper:
wf.wrapper_path("cbioportal/case_lists")


# cbioportal data preparation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Mutation file (data_mutation_extended.txt) ==================================

# Create MAF ------------------------------------------------------------------

@@ -130,126 +91,135 @@ rule cbioportal_export_generate_mafs:
partition=wf.get_resource("cbioportal_vcf2maf", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_vcf2maf", "run", "tmpdir"),
params:
args=wf.substep_dispatch("cbioportal_vcf2maf", "get_args", "run"),
args=wf.get_args("cbioportal_vcf2maf", "run"),
log:
**wf.get_log_file("cbioportal_vcf2maf", "run"),
wrapper:
wf.wrapper_path("vcf2maf/vcf2maf")
wf.wrapper_path("vcf2maf/vcf_to_table")


# Create one MAF from all MAFs
# Merge all sample MAF files --------------------------------------------------


rule cbioportal_export_concatenate_maf:
# TODO: could this be included in the `localrules`? Or simply incorporated into another step?
# It looks like some postprocessing...
input:
*wf.get_input_files("cbioportal_maf", "run"),
**wf.get_input_files("cbioportal_mutations", "run"),
output:
"work/upload/data_mutation_extended.txt",
threads: wf.get_resource("cbioportal_maf", "run", "threads")
resources:
time=wf.get_resource("cbioportal_maf", "run", "time"),
memory=wf.get_resource("cbioportal_maf", "run", "memory"),
partition=wf.get_resource("cbioportal_maf", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_maf", "run", "tmpdir"),
wf.get_output_files("cbioportal_mutations", "run"),
shell:
r"""
cat \
<( head -n2 {input[0]} ) \
<( tail -q -n +3 {input}) \
| grep -v {exclude_flag} \
<( head -n 1 {input[0]} ) \
<( tail -q -n +2 {input}) \
| (if [[ -n "{exclude_flag}" ]] ; then grep -v {exclude_flag} ; else cat ; fi) \
> {output}
"""


# Create CNA gistic file ------------------------------------------------------
rule cbioportal_export_CNA_calls:
# Copy number files (data_cna_log2.txt, data_cna_gistic.txt, data_segment.txt)

# Create CNA files (discrete, continuous from cns files) ----------------------


rule cbioportal_export_generate_cna:
input:
wf.get_input_files("cbioportal_cna_data", "gistic"),
**wf.get_input_files("cbioportal_cns2cna", "run"),
output:
"work/upload/data_CNA_gistic.txt",
threads: wf.get_resource("cbioportal_cna_data", "gistic", "threads")
**wf.get_output_files("cbioportal_cns2cna", "run"),
threads: wf.get_resource("cbioportal_cns2cna", "run", "threads")
resources:
time=wf.get_resource("cbioportal_cna_data", "gistic", "time"),
memory=wf.get_resource("cbioportal_cna_data", "gistic", "memory"),
partition=wf.get_resource("cbioportal_cna_data", "gistic", "partition"),
tmpdir=wf.get_resource("cbioportal_cna_data", "gistic", "tmpdir"),
time=wf.get_resource("cbioportal_cns2cna", "run", "time"),
memory=wf.get_resource("cbioportal_cns2cna", "run", "memory"),
partition=wf.get_resource("cbioportal_cns2cna", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_cns2cna", "run", "tmpdir"),
params:
datatype="int",
**wf.get_args("cbioportal_cns2cna", "run"),
log:
**wf.get_log_file("cbioportal_cns2cna", "run"),
wrapper:
wf.wrapper_path("cbioportal/merge_tables")
wf.wrapper_path("cbioportal/generate_cna")


# Create CNA log2 file --------------------------------------------------------
# Merge sample-based CNA files to cBioPortal ----------------------------------


rule cbioportal_export_CNA_log2:
rule cbioportal_export_concatenate_cna_log2:
input:
wf.get_input_files("cbioportal_cna_data", "log2"),
**wf.get_input_files("cbioportal_cna", "log2"),
output:
"work/upload/data_CNA_log2.txt",
threads: wf.get_resource("cbioportal_cna_data", "log2", "threads")
resources:
time=wf.get_resource("cbioportal_cna_data", "log2", "time"),
memory=wf.get_resource("cbioportal_cna_data", "log2", "memory"),
partition=wf.get_resource("cbioportal_cna_data", "log2", "partition"),
tmpdir=wf.get_resource("cbioportal_cna_data", "log2", "tmpdir"),
wf.get_output_files("cbioportal_cna", "log2"),
params:
datatype="float",
**wf.get_args("cbioportal_cna", "log2"),
threads: wf.get_resource("cbioportal_cna", "log2", "threads")
resources:
time=wf.get_resource("cbioportal_cna", "log2", "time"),
memory=wf.get_resource("cbioportal_cna", "log2", "memory"),
partition=wf.get_resource("cbioportal_cna", "log2", "partition"),
tmpdir=wf.get_resource("cbioportal_cna", "log2", "tmpdir"),
log:
**wf.get_log_file("cbioportal_cna", "log2"),
wrapper:
wf.wrapper_path("cbioportal/merge_tables")


# Create CNA segmentation file ------------------------------------------------


rule cbioportal_export_segments:
rule cbioportal_export_concatenate_cna_gistic:
input:
wf.get_input_files("cbioportal_cna_data", "segments"),
**wf.get_input_files("cbioportal_cna", "gistic"),
output:
"work/upload/data_segment.txt",
threads: wf.get_resource("cbioportal_cna_data", "segments", "threads")
wf.get_output_files("cbioportal_cna", "gistic"),
params:
**wf.get_args("cbioportal_cna", "gistic"),
threads: wf.get_resource("cbioportal_cna", "gistic", "threads")
resources:
time=wf.get_resource("cbioportal_cna_data", "segments", "time"),
memory=wf.get_resource("cbioportal_cna_data", "segments", "memory"),
partition=wf.get_resource("cbioportal_cna_data", "segments", "partition"),
tmpdir=wf.get_resource("cbioportal_cna_data", "segments", "tmpdir"),
shell:
r"""
cat \
<( head -n1 {input[0]} ) \
<( tail -q -n +2 {input}) \
| sed 's/-DNA.-W[EG]S.//' \
> {output}
"""
time=wf.get_resource("cbioportal_cna", "gistic", "time"),
memory=wf.get_resource("cbioportal_cna", "gistic", "memory"),
partition=wf.get_resource("cbioportal_cna", "gistic", "partition"),
tmpdir=wf.get_resource("cbioportal_cna", "gistic", "tmpdir"),
log:
**wf.get_log_file("cbioportal_cna", "gistic"),
wrapper:
wf.wrapper_path("cbioportal/merge_tables")


# Create expression z-scores --------------------------------------------------
# Merge sample-based segment files to cBioPortal ------------------------------


rule cbioportal_create_dataframe_matching_results:
# TODO: could this be included in the `localrules`?
rule cbioportal_export_concatenate_segments:
input:
**wf.get_input_files("cbioportal_segment", "run"),
output:
"work/zscores_mapping_df.tsv",
threads: wf.get_resource("cbioportal_zscores", "run", "threads")
wf.get_output_files("cbioportal_segment", "run"),
params:
action_type="segment",
threads: wf.get_resource("cbioportal_segment", "run", "threads")
resources:
time=wf.get_resource("cbioportal_zscores", "run", "time"),
memory=wf.get_resource("cbioportal_zscores", "run", "memory"),
partition=wf.get_resource("cbioportal_zscores", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_zscores", "run", "tmpdir"),
run:
wf.substep_dispatch("cbioportal_zscores", "get_df", output)
time=wf.get_resource("cbioportal_segment", "run", "time"),
memory=wf.get_resource("cbioportal_segment", "run", "memory"),
partition=wf.get_resource("cbioportal_segment", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_segment", "run", "tmpdir"),
log:
**wf.get_log_file("cbioportal_segment", "run"),
wrapper:
wf.wrapper_path("cbioportal/merge_tables")


# Create expression RPKMs -----------------------------------------------------


rule cbioportal_compute_zscores:
rule cbioportal_export_expression:
input:
tsv="work/zscores_mapping_df.tsv",
**wf.get_input_files("cbioportal_expression", "run"),
output:
tsv="work/upload/data_expression_zscores.txt",
threads: wf.get_resource("cbioportal_compute_zscores", "run", "threads")
wf.get_output_files("cbioportal_expression", "run"),
params:
**wf.get_args("cbioportal_expression", "run"),
threads: wf.get_resource("cbioportal_expression", "run", "threads")
resources:
time=wf.get_resource("cbioportal_compute_zscores", "run", "time"),
memory=wf.get_resource("cbioportal_compute_zscores", "run", "memory"),
partition=wf.get_resource("cbioportal_compute_zscores", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_compute_zscores", "run", "tmpdir"),
time=wf.get_resource("cbioportal_expression", "run", "time"),
memory=wf.get_resource("cbioportal_expression", "run", "memory"),
partition=wf.get_resource("cbioportal_expression", "run", "partition"),
tmpdir=wf.get_resource("cbioportal_expression", "run", "tmpdir"),
log:
**wf.get_log_file("cbioportal_expression", "run"),
wrapper:
wf.wrapper_path("cbioportal/zscores")
wf.wrapper_path("cbioportal/merge_tables")