LCR-BCCRC · mannycruz · Jan 31, 2023 · Feb 2, 2023 · Feb 2, 2023 · Feb 2, 2023
diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml
@@ -0,0 +1,120 @@
+lcr-modules:
+
+    igv:
+
+        inputs:
+            # Available wildcards: {seq_type} {tumour_id} {normal_sample_id} {pair_status} {genome_build}
+            maf: "__UPDATE__"
+
+            # Available wildcards: {seq_type} {sample_id} {genome_build}
+            bam_path: "__UPDATE__"
+            bai_path: "__UPDATE__"
+
+
+        regions: 
+            # Provide regions files as lists in their respective genome builds so that liftover of coordinates occurs properly
+            # Please provide at least one regions file to filter MAF variants
+            oncodriveclustl:
+                grch37: ["__UPDATE__"]
+                hg38: []
+            hotmaps:
+                grch37: []
+                hg38: []
+            bed:
+                grch37: []
+                hg38: []
+            maf:
+                grch37: []
+                hg38: []
+            mutation_id:
+                # mutation_id format: at minimum, the file needs a header line containing a "mutation_id_{regions_build}" column whose values are in {chr}:{pos} format
+                # e.g.
+                # mutation_id_grch37
+                # chr22:23230361
+                grch37: [] # requires at least the column mutation_id_grch37
+                hg38: [] # requires at least the column mutation_id_hg38
+
+        # Stop the workflow after the MAF filtering step to estimate the total number of snapshots that would be taken, without running IGV
+        estimate_only: False 
+
+        options:
+
+            igv_version: "https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip"
+
+            genome_map: 
+                # Maps metadata builds to either grch37 or hg38 so that MAF file locations are determined correctly. Additional genome builds can be added as necessary.
+                grch37: ["grch37","hg19","hs37d5"]
+                hg38: ["hg38","grch38"] 
+
+            liftover_regions:
+                liftover_minMatch: "0.95" # Minimum ratio of bases (0 to 1) that must remap when lifting regions over to a different genome build
+
+            generate_batch_script:
+                padding: 100 # Base pairs upstream and downstream of variant position
+                max_height: 1000 # Maximum height of snapshot
+                sleep_timer: 2000 # Batch scripts with more options may require longer sleep intervals
+                igv_options:
+                    # Presets for IGV snapshots
+                    # Available igv options: https://github.com/igvteam/igv/wiki/Batch-commands
+                    default: ["preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort"]
+                    pairs: ["viewaspairs", "preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort QUALITY"]
+
+            igv_presets: ["default"] # Available options: "default" "pairs"
+
+            xvfb_parameters:
+                # Server options for running xvfb
+                server_number: "99"
+                server_args: ""
+
+            quality_control:
+                # Heights of truncated snapshots previously observed for dimensions 1920x1080x24
+                truncated: [506,533,545,547,559,570]
+                # Kurtosis and skewness values observed in blank snapshots at different height values
+                blank:
+                    "547":
+                        kurtosis: 18.5
+                        skewness: -4
+                    "559":
+                        kurtosis: 18.2
+                        skewness: -4
+                    "708":
+                        kurtosis: 26
+                        skewness: -5
+                # Previously observed heights of snapshots that fail IGV
+                failed: [506,533]
+
+        scripts:
+            format_regions: "etc/format_regions.py"
+            filter_script: "etc/filter_maf.py"
+            region_liftover_script: "{SCRIPTSDIR}/liftover/1.0/liftover.sh"
+            batch_script_per_variant: "etc/generate_batch_script_per_variant.py"
+            quality_control: "etc/quality_control.py"
+
+        scratch_subdirectories: []
+
+        conda_envs:
+            liftover: "{SCRIPTSDIR}/liftover/1.0/liftover.yaml"
+            wget: "{MODSDIR}/envs/wget/wget-1.20.1.yaml"
+
+        threads: 4
+
+        resources:
+            _igv_liftover_regions:
+                mem_mb: 2000
+            _igv_run:
+                mem_mb: 2500
+            _igv_quality_control:
+                mem_mb: 2500
+            _igv_symlink:
+                symlink: 1
+
+        pairing_config:
+            genome:
+                run_paired_tumours: True
+                run_unpaired_tumours_with: "unmatched_normal"
+                run_paired_tumours_as_unpaired: False
+            capture:
+                run_paired_tumours: True
+                run_unpaired_tumours_with: "unmatched_normal"
+                run_paired_tumours_as_unpaired: False
+
diff --git a/modules/igv/1.0/envs/wget-1.20.1.yaml b/modules/igv/1.0/envs/wget-1.20.1.yaml
@@ -0,0 +1 @@
+../../../../envs/wget/wget-1.20.1.yaml
diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import logging
+import traceback
+import pandas as pd
+import oncopipe as op
+
+
+def log_exceptions(exctype, value, tb):
+    """Report an uncaught exception through the root logger at CRITICAL level.
+
+    Takes the (type, value, traceback) triple that `sys.excepthook` receives
+    and emits two records: the formatted traceback frames joined into one
+    message, then "<exception type>: <exception value>".
+    """
+    frames = traceback.format_tb(tb)
+    logging.critical(''.join(frames))
+    summary = '{0}: {1}'.format(exctype, value)
+    logging.critical(summary)
+
+# Install the hook so exceptions that escape the script are reported via logging.
+sys.excepthook = log_exceptions
+
+def main():
+    """Filter one MAF file against one regions file for a Snakemake job.
+
+    Everything is read from the `snakemake` object that Snakemake injects
+    into scripts: input[0] is the MAF, input[1] the regions file, params[0]
+    the regions format, params[1] the metadata, output[0] the destination
+    and log[0] the file that receives anything printed to stdout.
+
+    A MAF that is exactly one line long is treated as header-only: it is
+    written out unfiltered, after adding any of the "seq_type",
+    "genome_build" and "chr_std" columns it lacks.
+
+    Raises:
+        Exception: any error raised while filtering is logged with its
+            traceback and then re-raised.
+    """
+    # Imported here so the block does not depend on the file's import list.
+    from contextlib import redirect_stdout
+
+    # redirect_stdout puts sys.stdout back when the block exits; assigning
+    # sys.stdout directly would leave it bound to the closed log file.
+    with open(snakemake.log[0], "w") as stdout, redirect_stdout(stdout):
+
+        try:
+
+            maf_file = snakemake.input[0]
+
+            regions_file = snakemake.input[1]
+            regions_format = snakemake.params[0]
+
+            metadata = snakemake.params[1]
+
+            output_file = snakemake.output[0]
+
+            # A single line means the MAF holds only its column header.
+            # NOTE(review): leading "#" comment lines are counted too, so a
+            # header-only MAF that has them skips this branch - confirm that
+            # the inputs never carry such lines.
+            line_count = count_lines(maf_file)
+            if line_count == 1:
+                empty_maf = pd.read_table(maf_file, comment="#", sep="\t")
+                # Add columns required by workflow
+                required_columns = ["seq_type", "genome_build", "chr_std"]
+                missing = {col: None for col in required_columns if col not in empty_maf.columns}
+                empty_maf = empty_maf.assign(**missing)
+                write_output(empty_maf, output_file)
+                # Nothing to filter; return rather than call the site-provided exit().
+                return
+
+            maf = maf_add_columns(maf=maf_file, metadata=metadata, wildcards=snakemake.wildcards)
+
+            # Perform filtering
+            filtered_maf = maf_filter(
+                maf=maf,
+                regions=regions_file,
+                regions_format=regions_format,
+            )
+
+            write_output(filtered_maf, output_file)
+
+        except Exception as e:
+            logging.error(e, exc_info=1)
+            raise
+
+def count_lines(maf):
+    """Return the number of lines in the text file at path `maf`.
+
+    Every line counts, including headers and "#" comment lines. The file is
+    streamed rather than loaded whole, so memory use does not grow with the
+    size of the file.
+    """
+    with open(maf, "r") as handle:
+        return sum(1 for _ in handle)
+
+def filter_by_bed(maf, regions):
+    """Keep the MAF rows whose position is listed in a BED-style regions table.
+
+    Args:
+        maf: DataFrame with "Chromosome" and "Start_Position" columns. The
+            "chr_std" and "genomic_pos_std" columns are added to it in place.
+        regions: DataFrame whose columns are labelled by integer position;
+            column 0 holds the chromosome and column 1 the position.
+
+    Returns:
+        The rows of `maf` whose "genomic_pos_std" value ("chr<name>:<position>")
+        also occurs among the regions.
+    """
+    # Compare chromosomes as text: a column holding only bare chromosome
+    # numbers is parsed as integers, which have no `.str` accessor.
+    chrom = regions[0].astype(str)
+
+    # Remove row containing column names. Copying the result lets columns be
+    # added below without writing to a slice of the caller's frame.
+    is_header = chrom.str.contains("chrom")
+    regions = regions[~is_header].copy()
+    chrom = chrom[~is_header]
+
+    # Create common columns between BED and MAF
+    regions["chr_std"] = "chr" + chrom.str.replace("chr", "", regex=False)
+    regions["genomic_pos_std"] = regions["chr_std"] + ":" + regions[1].map(str)
+
+    maf["chr_std"] = "chr" + maf["Chromosome"].astype(str).str.replace("chr", "", regex=False)
+    maf["genomic_pos_std"] = maf["chr_std"] + ":" + maf["Start_Position"].map(str)
+
+    filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])]
+    return filtered_maf
+
+def filter_by_maf(maf, regions):
+    """Keep the MAF rows whose position also occurs in a MAF-style regions table.
+
+    Args:
+        maf: DataFrame with "Chromosome" and "Start_Position" columns.
+        regions: DataFrame with the same two columns.
+
+    Both frames gain "chr_std" and "genomic_pos_std" columns in place.
+
+    Returns:
+        The rows of `maf` whose "genomic_pos_std" value ("chr<name>:<position>")
+        is present in `regions`.
+    """
+    # Create common column by which to subset MAF. Whole-column string
+    # operations replace the per-row apply, which builds a Series for every row.
+    for df in (maf, regions):
+        bare_chrom = df["Chromosome"].astype(str).str.replace("chr", "", regex=False)
+        df["chr_std"] = "chr" + bare_chrom
+        df["genomic_pos_std"] = df["chr_std"] + ":" + df["Start_Position"].map(str)
+
+    # Subset the MAF
+    filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])]
+    return filtered_maf
+
+def maf_filter(maf, regions, regions_format):
+    """Subset a MAF DataFrame to the positions listed in a regions file.
+
+    Args:
+        maf: DataFrame holding the MAF to filter.
+        regions: path of the tab-separated regions file.
+        regions_format: "maf" or "bed"; selects both how the regions file
+            is parsed and which filter function is applied.
+
+    Returns:
+        `maf` itself when it has no rows, otherwise the result of the
+        filter function registered for `regions_format`.
+
+    Raises:
+        KeyError: if `maf` has rows and `regions_format` is neither "maf"
+            nor "bed".
+    """
+    # "bed" files are parsed with header=None so their columns are labelled
+    # by integer position; every other format takes its labels from line one.
+    read_options = {"comment": "#", "sep": "\t"}
+    if regions_format == "bed":
+        read_options["header"] = None
+    regions_df = pd.read_table(regions, **read_options)
+
+    # Return empty dataframe without filtering if df is empty
+    if len(maf) == 0:
+        return maf
+
+    apply_filter = {"maf": filter_by_maf, "bed": filter_by_bed}[regions_format]
+    return apply_filter(maf, regions_df)
+
+def maf_add_columns(maf, metadata, wildcards):
+    """Read a MAF file and annotate every row with the job's wildcard values.
+
+    Args:
+        maf: path of the tab-separated MAF file; lines starting with "#"
+            are skipped.
+        metadata: accepted for interface compatibility; not used here.
+        wildcards: mapping-like object indexable by the names "seq_type",
+            "genome_build", "normal_sample_id" and "pair_status".
+
+    Returns:
+        The MAF as a DataFrame with those four columns set to the
+        corresponding wildcard value on every row.
+    """
+    # Read input MAF as df
+    maf = pd.read_table(maf, comment="#", sep="\t")
+
+    # Use the `wildcards` argument rather than the global `snakemake` object,
+    # so the function depends only on what it is given.
+    for column in ("seq_type", "genome_build", "normal_sample_id", "pair_status"):
+        maf[column] = wildcards[column]
+
+    return maf
+
+def write_output(maf, outfile):
+    """Write `maf` to `outfile` as tab-separated text.
+
+    The index is left out and missing values are written as "NA".
+    """
+    csv_options = {"sep": "\t", "na_rep": "NA", "index": False}
+    maf.to_csv(outfile, **csv_options)
+
+if __name__ == "__main__":
+    # Send log records of every level to the second Snakemake log file,
+    # overwriting whatever an earlier run left there.
+    log_settings = {
+        "level": logging.DEBUG,
+        "filename": snakemake.log[1],
+        "filemode": 'w',
+    }
+    logging.basicConfig(**log_settings)
+
+    main()