From bcf9916991d2a6c6171ba56cbad247429ef67ea7 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 31 Jan 2023 02:17:17 -0800 Subject: [PATCH 001/132] initial IGV module, specify regions using MAF --- modules/igv/1.0/config/default.yaml | 70 +++++ modules/igv/1.0/envs/crossmap.yaml | 56 ++++ modules/igv/1.0/etc/generate_batch_scripts.py | 297 ++++++++++++++++++ modules/igv/1.0/etc/liftover_regions.sh | 49 +++ modules/igv/1.0/igv.smk | 269 ++++++++++++++++ modules/igv/1.0/schemas/base-1.0.yaml | 1 + 6 files changed, 742 insertions(+) create mode 100644 modules/igv/1.0/config/default.yaml create mode 100644 modules/igv/1.0/envs/crossmap.yaml create mode 100755 modules/igv/1.0/etc/generate_batch_scripts.py create mode 100755 modules/igv/1.0/etc/liftover_regions.sh create mode 100644 modules/igv/1.0/igv.smk create mode 120000 modules/igv/1.0/schemas/base-1.0.yaml diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml new file mode 100644 index 00000000..a02a0ab7 --- /dev/null +++ b/modules/igv/1.0/config/default.yaml @@ -0,0 +1,70 @@ +lcr-modules: + + igv: + + # TODO: Update the list of available wildcards, if applicable + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + master_maf: # MAFs to draw sample_id and variants from + metadata: # Metadata containing bam link names, sample_id, and genome_build columns + + regions_file: "__UPDATE__" # Path to a MAF, VCF, BED or BEDPE file containing regions of interest to create snapshots of. + regions_format: "__UPDATE__" + regions_build: "__UPDATE__" # Genome build of regions file, will be lifted over to filter MAFs on opposite genome builds + + filter_maf: + oncodriveclustl_options: # These parameters will filter down the OncodriveCLUSTL cluster results file. + q_value: # Desired q-value of OncodriveCLUSTL clusters + scores: # Desired scores of OncodriveCLUSTL clusters + n_samples: # Desired number of samples in OncodriveCLUSTL clusters + + + genome_map: # Map different builds in metadata + grch37: ["grch37","hg19","hg19-clc","hg19-reddy","hs37d5"] + hg38: ["hg38","hg38-nci","hg38-panea","grch38"] + + liftover_regions: + reference_chain_file: + grch37: "genomes/grch37/chains/grch37/hg19ToHg38.over.chain" + hg38: "genomes/hg38/chains/grch38/hg38ToHg19.over.chain" + target_reference: + grch37: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh37-lite/Sequence/WholeGenomeFasta/genome.fa" + hg38: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh37-lite/Sequence/WholeGenomeFasta/genome.fa" + + generate_batch_script: + padding: 300 + max_height: 400 + n_snapshots: 20 # Number of snapshots to take per unique variant. Default is 20. + batch_script_per_sample: # True/False. Default is False. If True, a batch script will be created to accompany each snapshot for easy access in IGV. 
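For orientation, these generate_batch_script options feed into a plain-text IGV batch file, one block of commands per variant/BAM pair, produced by etc/generate_batch_scripts.py further down in this patch. A minimal sketch of one such block is shown below; the BAM path, sample ID, gene name and coordinates are placeholders, and the snapshot directory follows the <seq_type>--<genome_build>/<chrom>/<1000-nt interval> layout used by the script (grch37 is mapped to IGV's hg19 genome id):

    load /path/to/bams/TUMOR-001.bam
    maxPanelHeight 400
    snapshotDirectory <snapshot_dir>/genome--grch37/chr3/187462000_187463000/
    genome hg19
    goto chr3:187462241-187462841
    sort
    collapse
    snapshot chr3:187462241-187462841--BCL6--TUMOR-001.png
    new
    exit

The goto interval is the variant position extended by the configured padding on each side, and a single exit is appended once per script so IGV terminates after the last snapshot.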
+ + scripts: + region_liftover_script: "{MODSDIR}/etc/liftover_regions.sh" + batch_script: "{MODSDIR}/etc/generate_batch_scripts.py" + + scratch_subdirectories: [] + + options: + step_1: "" + step_2: "" + + conda_envs: + liftover_regions: "{MODSDIR}/envs/crossmap.yaml" + batch_script: "{MODSDIR}/envs/samtools-1.9.yaml" + wget: "{REPODIR}/envs/wget/wget-1.20.1.yaml" + + threads: + step_1: 4 + + resources: + step_1: + mem_mb: 2000 + + pairing_config: + genome: + run_paired_tumours: False + run_unpaired_tumours_with: "no_normal" + run_paired_tumours_as_unpaired: False + mrna: + run_paired_tumours: False + run_unpaired_tumours_with: "no_normal" + run_paired_tumours_as_unpaired: False diff --git a/modules/igv/1.0/envs/crossmap.yaml b/modules/igv/1.0/envs/crossmap.yaml new file mode 100644 index 00000000..cdbfbed3 --- /dev/null +++ b/modules/igv/1.0/envs/crossmap.yaml @@ -0,0 +1,56 @@ +name: crossmap +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - bx-python=0.9.0 + - bzip2=1.0.8 + - ca-certificates=2022.12.7 + - crossmap=0.6.5 + - curl=7.71.1 + - cython=0.29.32 + - krb5=1.17.2 + - ld_impl_linux-64=2.40 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.71.1 + - libdeflate=1.0 + - libedit=3.1.20191231 + - libffi=3.4.2 + - libgcc=7.2.0 + - libgcc-ng=12.2.0 + - libgfortran-ng=12.2.0 + - libgfortran5=12.2.0 + - libgomp=12.2.0 + - liblapack=3.9.0 + - libnsl=2.0.0 + - libopenblas=0.3.21 + - libpng=1.6.39 + - libsqlite=3.40.0 + - libssh2=1.10.0 + - libstdcxx-ng=12.2.0 + - libuuid=2.32.1 + - libzlib=1.2.13 + - lzo=2.10 + - mysql-connector-c=6.1.6 + - ncurses=6.3 + - numpy=1.21.6 + - openssl=1.1.1s + - pip=22.3.1 + - pybigwig=0.3.18 + - pysam=0.15.3 + - python=3.7.12 + - python-lzo=1.14 + - python_abi=3.7 + - readline=8.1.2 + - setuptools=66.1.1 + - sqlite=3.40.0 + - tk=8.6.12 + - ucsc-wigtobigwig=357 + - wheel=0.38.4 + - xz=5.2.6 + - zlib=1.2.13 +prefix: /home/mcruz/miniconda3/envs/crossmap diff --git a/modules/igv/1.0/etc/generate_batch_scripts.py b/modules/igv/1.0/etc/generate_batch_scripts.py new file mode 100755 index 00000000..9ff0626a --- /dev/null +++ b/modules/igv/1.0/etc/generate_batch_scripts.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 + +import os +import warnings +import argparse +import numpy as np +import pandas as pd +import oncopipe as op +import math + +def main(): + # Parse arguments + args = parse_arguments() + + # Read MAF file containing variants and create a dataframe linking regions, sample_ids, and bam paths + regions = get_regions_df( + args.input_maf, + metadata=args.metadata, + seq_type=args.seq_type, + padding=args.padding) + + # Format and output the batch script + generate_igv_batch( + regions = regions, + output = args.output, + max_height = args.max_height, + seq_type = args.seq_type, + genome_build = args.genome_build, + snapshot_dir=args.snapshot_dir, + n_snapshots=args.n_snapshots) + + close_files(args) + +def parse_arguments(): + parser = argparse.ArgumentParser(description=__doc__) + + parser.add_argument( + "input_maf", + type=argparse.FileType("r"), + default="-", + help=f"Input MAF. Can be '-' for stdin" + ) + + parser.add_argument( + "--output", + "-o", + metavar="OUTPUT_FILE", + type=argparse.FileType("w"), + help="Output IGV batch script." 
+ ) + + parser.add_argument( + "--metadata", + "-v", + metavar="METADATA", + type=argparse.FileType("r"), + help="Metadata mapping sample IDs to BAM paths" + ) + + default_padding = 300 + parser.add_argument( + "--padding", + "-p", + type=int, + default=default_padding, + help=( + "Amount of padding added before and after each locus. " + f"Default padding is {default_padding}" + ), + ) + + default_max_height = 400 + parser.add_argument( + "--max_height", + "-m", + type=int, + default=default_max_height, + help="Maximum panel height in IGV. Default max height is {default_max_height}" + ) + + parser.add_argument( + "--snapshot_dir", + "-d", + required=True, + help=( + "Parent directory where {chromosome}/{region} subdirectories will be " + "populated with IGV snapshots." + ) + ) + + parser.add_argument( + "--n_snapshots", + "-n", + type=int, + default=20, + help=( + "Maximum number of different snapshots for each position." + ) + ) + + parser.add_argument( + "--genome_build", + "-g", + required=True, + help="Specify IGV genome build for snapshots." + ) + + parser.add_argument( + "--seq_type", + "-s", + required=True, + type=str, + help="Specify sequencing type for BAM extraction." + ) + + args = parser.parse_args() + + return args + +def get_regions_df(input_maf, metadata, seq_type, padding): + # Read MAF as dataframe + maf = pd.read_table(input_maf, comment="#") + + # Read metadata as dataframe + metadata = pd.read_table(metadata, comment="#") + + # Filter metadata down to only samples of required seq_type + metadata = metadata[metadata["seq_type"]==seq_type] + metadata = metadata[["sample_id","link_name"]] + + # Make sure required minimum columns are present in the maf + columns = [ + "Chromosome", + "Start_Position", + "End_Position", + "Tumor_Sample_Barcode", + ] + + assert(all(c in list(maf.columns) for c in columns)), ( + "The following required columns are missing: " + f"{[columns[missing_ix] for missing_ix in [ix for ix, bool_val in enumerate([col not in list(maf.columns) for col in columns]) if bool_val==True]]}" + ) + + # Check if there are issues with input file + for column in columns: + column_values = maf[column] + is_any_na = pd.isna(column_values).any() + assert not is_any_na, ( + f"The '{column}' column contains NA values. This might be caused " + "by an incorrectly formatted input MAF file. Please ensure that " + f"all of the following columns have values: {', '.join(columns)}." 
+ f"Here's a preview of the MAF file after being parsed:\n\n {maf}" + ) + + # Create a pandas dataframe with to link regions with sample_ids and bam files + + chrom = (maf["Chromosome"].astype(str)).apply(lambda x: x.replace("chr","")) + + # Snapshots will be held in parent directories of 1000-nt intervals for easier navigation + dir_start = ((maf["Start_Position"] / 1000).apply(lambda x: math.trunc(x)) * 1000).astype(str) + dir_end = (dir_start.astype(int) + 1000).astype(str) + dir_regions = "chr" + chrom + ":" + dir_start + "_" + dir_end + + # Specify the regions that will be captured by IGV based on variant positions and padding + region_start = (maf["Start_Position"] - padding).astype(str) + region_end = (maf["End_Position"] + padding).astype(str) + regions = "chr" + chrom + ":" + region_start + "-" + region_end + + regions_df = pd.DataFrame( + {"dir_regions": dir_regions, + "regions": regions, + "region_name": maf.Hugo_Symbol, + "sample_id": maf.Tumor_Sample_Barcode, + } + ) + + # Link bam paths to regions by merging metadata and regions dataframes by sample_id + regions_df = pd.merge(regions_df, metadata, on="sample_id", how="left") + + return regions_df + +def generate_igv_batch_header(bam_file, max_height, snapshot_dir, genome_build): + lines = [] + + bam_file = os.path.realpath(bam_file) + lines.append(f"load {bam_file}") + + lines.append(f"maxPanelHeight {max_height}") + lines.append(f"snapshotDirectory {snapshot_dir}") + lines.append(f"genome {genome_build}") + + return lines + +def generate_igv_batch_per_row(regions, snapshot_filename): + lines = [] + lines.append(f"goto {regions}") + lines.append("sort") + lines.append("collapse") + lines.append(f"snapshot {snapshot_filename}") + lines.append("new") + + return lines + +def generate_igv_batch_per_region(regions, max_height, genome_build, snapshot_dir): + + # Lines of batch script + lines = [] + + # Add lines to batch script for each region + for _, row in regions.iterrows(): + filename = [] + + filename.append(row.regions) + if "region_name" in row: + filename.append(row.region_name) + filename.append(row.sample_id) + + filename = "--".join(filename) + ".png" + filename = filename.replace(" ", "_") + + bam_file = row.link_name + + genome_build = genome_build.replace("grch37","hg19").replace("grch38","hg38") + + header = generate_igv_batch_header( + bam_file, max_height, snapshot_dir, genome_build + ) + lines.extend(header) + + row_lines = generate_igv_batch_per_row(regions = row.regions, snapshot_filename = filename) + + lines.extend(row_lines) + + return lines + +def close_files(args): + args_dict = vars(args) + for arg_value in args_dict.values(): + if hasattr(arg_value, "close"): + arg_value.close() + +def generate_igv_batch_footer(): + lines = [] + lines.append("exit") + return lines + +def output_lines(lines, output): + lines.append("") + text = "\n".join(lines) + output.write(text) + +def generate_igv_batch(regions, output, max_height, seq_type, genome_build, snapshot_dir, n_snapshots): + + # The lines for the batch script encompassing all regions and sample_ids + all_lines = [] + + # Create batch scripts per unique 1000-nt interval region + dir_regions = regions.dir_regions.unique() + + for dir_region in dir_regions: + + # Get chromosome string for parent directory + dir_chrom = dir_region.split(":")[0] + # Get 1000nt interval region for subdirectory + dir_interval = dir_region.split(":")[1] + + seq_type_build = f"{seq_type}--{genome_build}" + + region_snapshot_dir = os.path.join(snapshot_dir, seq_type_build, dir_chrom, 
dir_interval, "") + + # Subset all regions down to those in the 1000nt interval + regions_in_dir = regions[regions["dir_regions"]==dir_region] + + # Iterate through unique regions within interval + for unique_region in regions_in_dir.regions.unique(): + + # Subset rows by number of snapshots desired per region + regions_subset = regions_in_dir[regions_in_dir["regions"]==unique_region][:n_snapshots] + + # Generate lines of batch script per region subset + lines = generate_igv_batch_per_region( + regions = regions_subset, + max_height=max_height, + genome_build=genome_build, + snapshot_dir=region_snapshot_dir) + + if lines is not None: + all_lines.extend(lines) + + footer = generate_igv_batch_footer() + all_lines.extend(footer) + + output_lines(all_lines, output) + +if __name__ == "__main__": + main() + diff --git a/modules/igv/1.0/etc/liftover_regions.sh b/modules/igv/1.0/etc/liftover_regions.sh new file mode 100755 index 00000000..45b86b2f --- /dev/null +++ b/modules/igv/1.0/etc/liftover_regions.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Use CrossMap.py to convert genomic coordinates between GRCh37 and GRCh38 for MAF, VCF, BED or BEDPE files. +# +# Usage: liftover_regions.sh \ +# \ +# \ +# \ +# \ +# \ +# \ +# + +input_regions=$1 +input_type=$2 +target_build=$3 +output_file=$4 +chain_file=$5 +target_ref=$6 + +echo "Input regions file: $input_regions" +echo "Input regions type: $input_type" +echo "Target genome build: $target_build" +echo "Output file: $output_file" +echo "Chain file: $chain_file" +echo "Target reference: $target_ref" + +intermediate_output_file=$(echo $output_file)_int + +# MAFs +# Check genome build of incoming MAF file to determine what build it needs to be changed to +if [ "$input_type" == "maf" ] ; +then + echo "Proceeding with MAF input..." + + if grep -q $target_build $input_regions ; + then + echo "WARNING: Input regions file $input_regions is already $target_build. Copying contents of $input_regions to $output_file"; + cut -f 1,5,6,7,9,10,11,13,16 $input_regions > $output_file + else + echo "Input regions file $input_regions does not appear to be $target_build. Proceeding with conversion to $target_build" + echo "CrossMap.py maf $chain_file $input_regions $target_ref $target_build $output_file" + CrossMap.py maf $chain_file $input_regions $target_ref $target_build $intermediate_output_file + cut -f 1,5,6,7,9,10,11,13,16 $intermediate_output_file > $output_file + rm $intermediate_output_file + fi + echo "Finished MAF block." +fi +echo "End of bash script" diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk new file mode 100644 index 00000000..3adb4864 --- /dev/null +++ b/modules/igv/1.0/igv.smk @@ -0,0 +1,269 @@ +#!/usr/bin/env snakemake + + +##### ATTRIBUTION ##### + + +# Original Author: N/A +# Module Author: Manuela Cruz +# Contributors: N/A + + +##### SETUP ##### + +# Import package with useful functions for developing analysis modules +import oncopipe as op +import pandas as pd + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. 
Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + print('\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + '\x1b[0m') + print('\x1b[0;31;40m' + f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m') + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + +# Setup module and store module-specific configuration in `CFG` +# `CFG` is a shortcut to `config["lcr-modules"]["igv"]` +CFG = op.setup_module( + name = "igv", + version = "1.0", + subdirectories = ["inputs", "batch_scripts", "igv", "snapshots", "outputs"], +) + + +# Define rules to be run locally when using a compute cluster +# TODO: Replace with actual rules once you change the rule names +localrules: + _igv_symlink_regions_file, + _igv_symlink_metadata, + _igv_symlink_maf, + _igv_liftover_regions, + _igv_create_batch_script, + _igv_download_igv, + _igv_run + + +##### RULES ##### + + +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _igv_symlink_regions_file: + input: + regions_file = CFG["inputs"]["regions_file"] + output: + regions_file = CFG["dirs"]["inputs"] + "regions/regions_file.txt" + run: + op.absolute_symlink(input.regions_file, output.regions_file) + +rule _igv_symlink_metadata: + input: + metadata = CFG["inputs"]["metadata"] + output: + metadata = CFG["dirs"]["inputs"] + "metadata/metadata.tsv" + run: + op.absolute_symlink(input.metadata, output.metadata) + +rule _igv_symlink_maf: + input: + maf = CFG["inputs"]["master_maf"] + output: + maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}.maf" + run: + op.absolute_symlink(input.maf, output.maf) + +rule _igv_reduce_maf_cols: + input: + maf = str(rules._igv_symlink_maf.output.maf) + output: + maf = temp(CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}_cols.maf") + shell: + op.as_one_line(""" + cut -f 1,5,6,7,9,10,11,13,16 {input.maf} > {output.maf} + """) + + +rule _igv_liftover_regions: + input: + regions = str(rules._igv_symlink_regions_file.output.regions_file), + liftover_script = CFG["scripts"]["region_liftover_script"] + output: + regions_lifted = CFG["dirs"]["inputs"] + "regions/regions_file_{genome_build}.txt" + params: + chain_file = reference_files(CFG["liftover_regions"]["reference_chain_file"][(CFG["inputs"]["regions_build"]).replace("hg19","grch37").replace("grch38","hg38")]), + target_reference = lambda w: config["lcr-modules"]["igv"]["liftover_regions"]["target_reference"][w.genome_build], + regions_type = CFG["inputs"]["regions_format"].lower(), + target_build = lambda w: w.genome_build.replace("grch37","GRCh37").replace("hg38", "GRCh38") + conda: + CFG["conda_envs"]["liftover_regions"] + log: + stdout = CFG["logs"]["inputs"] + "liftover_regions_{genome_build}.stdout.log", + stderr = CFG["logs"]["inputs"] + "liftover_regions_{genome_build}.stderr.log" + shell: + op.as_one_line(""" + {input.liftover_script} {input.regions} + {params.regions_type} {params.target_build} + {output.regions_lifted} {params.chain_file} + {params.target_reference} > {log.stdout} 2> {log.stderr} 
+ """) + +# Filter the MAF based on regions file +rule _igv_filter_maf: + input: + maf = str(rules._igv_reduce_maf_cols.output.maf), + regions = str(rules._igv_liftover_regions.output.regions_lifted) + output: + maf_filtered = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}_cols_filtered.maf" + params: + regions_format = CFG["inputs"]["regions_format"].lower(), + metadata = CFG["inputs"]["metadata"], + genome_build = lambda w: w.genome_build, + seq_type = lambda w: w.seq_type, + genome_map = CFG["genome_map"], + oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"] + run: + # Read input MAF and regions into pandas df + maf_df = pd.read_table(input.maf, comment="#", sep="\t") + regions_df = pd.read_table(input.regions, comment="#", sep="\t") + if params.regions_format in ["maf"]: + # Create common columns to subset the larger MAF down + count = 0 + for df in [maf_df, regions_df]: + count += 1 + if count == 1: + print(f"Working on maf df") + if count == 2: + print(f"Working on regions df") + df["chr_std"] = df.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) + df["genomic_pos_std"] = df["chr_std"] + ":" + df["Start_Position"].map(str) + "_" + df["End_Position"].map(str) + + # Filter larger MAF + filtered_maf = maf_df[maf_df["genomic_pos_std"].isin(regions_df["genomic_pos_std"])] + + # Filter only to BAM files of corresponding build and seq_type + SAMPLES = op.load_samples(params.metadata) + SAMPLES = op.filter_samples(SAMPLES, seq_type=params.seq_type) + genome_build_list = params.genome_map[params.genome_build] + print(f"Only including samples of these builds: {genome_build_list}") + BUILD_SAMPLES = op.filter_samples(SAMPLES, genome_build=genome_build_list) + filtered_maf = filtered_maf[filtered_maf["Tumor_Sample_Barcode"].isin(BUILD_SAMPLES.sample_id)] + + # Write output + filtered_maf.to_csv(output.maf_filtered, sep="\t") + + +# Pass filtered MAF to create batch script +rule _igv_create_batch_script: + input: + maf_filtered = str(rules._igv_filter_maf.output.maf_filtered), + metadata = str(rules._igv_symlink_metadata.output.metadata) + output: + batch_script = temp(CFG["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}.batch") + params: + py_script = CFG["scripts"]["batch_script"], + snapshot_dir = CFG["dirs"]["snapshots"], + genome_build = lambda w: w.genome_build, + seq_type = lambda w: w.seq_type, + padding = CFG["generate_batch_script"]["padding"], + max_height = CFG["generate_batch_script"]["max_height"], + n_snapshots = CFG["generate_batch_script"]["n_snapshots"] + log: + stdout = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}_batch_script.stdout.log", + stderr = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}_batch_script.stderr.log" + shell: + op.as_one_line(""" + {params.py_script} {input.maf_filtered} + --output {output.batch_script} --metadata {input.metadata} + --padding {params.padding} --max_height {params.max_height} + --snapshot_dir {params.snapshot_dir} --n_snapshots {params.n_snapshots} + --genome_build {params.genome_build} --seq_type {params.seq_type} > {log.stdout} 2> {log.stderr} + """) + +#rule _igv_merge_batch_scripts: +# input: +# batch_scripts = expand(str(rules._igv_create_batch_script.output.batch_script), genome_build=["hg38","grch37"], seq_type=["capture","genome"]), +# output: +# merged_batch = CFG["dirs"]["batch_scripts"] + "merged_script.batch" +# params: +# script_dir = CFG["dirs"]["batch_scripts"] +# shell: +# op.as_one_line(""" +# batch_dir={params.script_dir} && +# cat <(cat $(echo 
$batch_dir)/*.batch) <(echo end) | awk '{{ if ($0 !~ "exit") print $0 }}' | sed 's/end/exit\n/g' > {output.merged_batch} +# """) + + +#### WHEN LAST CHECKED DRY RUN WORKS UP TO HERE B-) +rule _igv_download_igv: + output: + igv_zip = CFG["dirs"]["igv"] + "IGV_2.7.2.zip", + igv_installed = CFG["dirs"]["igv"] + "igv_2.7.2.installed" + conda: + CFG["conda_envs"]["wget"] + log: + stdout = CFG["logs"]["igv"] + "download_igv.stdout.log", + stderr = CFG["logs"]["igv"] + "download_igv.stderr.log" + shell: + op.as_one_line(""" + wget -O {output.igv_zip} https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip && + unzip {output.igv_zip} -d $(dirname {output.igv_zip}) > {log.stdout} 2> {log.stderr} && + touch {output.igv_installed} + """) + +rule _igv_run: + input: + batch_script = str(rules._igv_create_batch_script.output.batch_script), + igv_installed = str(rules._igv_download_igv.output.igv_installed), + output: + success = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}_snapshot.finished" + params: + #igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" + igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" + log: + stdout = CFG["logs"]["igv"] + "run_igv_{seq_type}--{genome_build}.stdout.log", + stderr = CFG["logs"]["igv"] + "run_igv_{seq_type}--{genome_build}.stderr.log" + shell: + op.as_one_line(""" + xvfb-run --auto-servernum {params.igv} -b {input.batch_script} > {log.stdout} 2> {log.stderr} && + touch {output.success} + """) + +#rule _igv_run: +# input: +# batch_script = str(rules._igv_merge_batch_scripts.output.merged_batch), +# igv_installed = str(rules._igv_download_igv.output.igv_installed), +# output: +# success = CFG["dirs"]["outputs"] + "merged_batch_snapshot.finished" +# params: +# igv = CFG["dirs"]["igv"] + "IGV_Linux_2.16.0/igv.sh" +# log: +# stdout = CFG["logs"]["igv"] + "run_igv_merged_batch.stdout.log", +# stderr = CFG["logs"]["igv"] + "run_igv_merged_batch.stderr.log" +# shell: +# op.as_one_line(""" +# xvfb-run --auto-servernum {params.igv} -b {input.batch_script} > {log.stdout} 2> {log.stderr} && +# touch {output.success} +# """) + +# Generates the target sentinels for each run, which generate the symlinks +rule _igv_all: + input: + expand(rules._igv_run.output.success, seq_type=["genome","capture"], genome_build=["hg38","grch37"]) + + +##### CLEANUP ##### + + +# Perform some clean-up tasks, including storing the module-specific +# configuration on disk and deleting the `CFG` variable +op.cleanup_module(CFG) diff --git a/modules/igv/1.0/schemas/base-1.0.yaml b/modules/igv/1.0/schemas/base-1.0.yaml new file mode 120000 index 00000000..0a69d1ce --- /dev/null +++ b/modules/igv/1.0/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file From f2091a44040a072bc4fd21745ded68a8fcf22b18 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 1 Feb 2023 18:43:20 -0800 Subject: [PATCH 002/132] Add rules to reformat input regions files --- modules/igv/1.0/igv.smk | 100 +++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 57 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 3adb4864..ec6a36d3 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -48,6 +48,7 @@ localrules: _igv_symlink_regions_file, _igv_symlink_metadata, _igv_symlink_maf, + _igv_format_regions_file, _igv_liftover_regions, _igv_create_batch_script, _igv_download_igv, @@ -92,17 +93,36 @@ rule _igv_reduce_maf_cols: cut -f 1,5,6,7,9,10,11,13,16 {input.maf} > 
{output.maf} """) +# Prepare regions file for liftover +rule _igv_format_regions_file: + input: + regions = str(rules._igv_symlink_regions_file.output.regions_file) + output: + regions = config["lcr-modules"]["igv"]["dirs"]["inputs"] + "regions/regions_file_formatted.txt" + params: + regions_format = config["lcr-modules"]["igv"]["inputs"]["regions_format"], + oncodriveclustl_params = config["lcr-modules"]["igv"]["filter_maf"]["oncodriveclustl_options"] + script: + config["lcr-modules"]["igv"]["scripts"]["format_regions"] + +REGIONS_FORMAT = { + "maf": "maf", + "oncodriveclustl": "bed", + "hotmaps": "bed", + "genomic_regions": "bed" +} rule _igv_liftover_regions: input: - regions = str(rules._igv_symlink_regions_file.output.regions_file), + regions = str(rules._igv_format_regions_file.output.regions), liftover_script = CFG["scripts"]["region_liftover_script"] output: regions_lifted = CFG["dirs"]["inputs"] + "regions/regions_file_{genome_build}.txt" params: chain_file = reference_files(CFG["liftover_regions"]["reference_chain_file"][(CFG["inputs"]["regions_build"]).replace("hg19","grch37").replace("grch38","hg38")]), target_reference = lambda w: config["lcr-modules"]["igv"]["liftover_regions"]["target_reference"][w.genome_build], - regions_type = CFG["inputs"]["regions_format"].lower(), + regions_type = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], + regions_build = CFG["inputs"]["regions_build"].replace("grch37","GRCh37").replace("hg38","GRCh38"), target_build = lambda w: w.genome_build.replace("grch37","GRCh37").replace("hg38", "GRCh38") conda: CFG["conda_envs"]["liftover_regions"] @@ -112,55 +132,27 @@ rule _igv_liftover_regions: shell: op.as_one_line(""" {input.liftover_script} {input.regions} - {params.regions_type} {params.target_build} + {params.regions_type} {params.regions_build} {params.target_build} {output.regions_lifted} {params.chain_file} {params.target_reference} > {log.stdout} 2> {log.stderr} """) -# Filter the MAF based on regions file +# Pass metadata as a pandas dataframe directly from the samples value specified in config rule _igv_filter_maf: input: maf = str(rules._igv_reduce_maf_cols.output.maf), regions = str(rules._igv_liftover_regions.output.regions_lifted) output: - maf_filtered = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}_cols_filtered.maf" + maf_filtered = config["lcr-modules"]["igv"]["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}_cols_filtered.maf" params: - regions_format = CFG["inputs"]["regions_format"].lower(), - metadata = CFG["inputs"]["metadata"], + regions_format = REGIONS_FORMAT[config["lcr-modules"]["igv"]["inputs"]["regions_format"].lower()], + metadata = config["lcr-modules"]["igv"]["samples"] if CFG["inputs"]["metadata"] is None else CFG["inputs"]["metadata"], genome_build = lambda w: w.genome_build, seq_type = lambda w: w.seq_type, - genome_map = CFG["genome_map"], - oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"] - run: - # Read input MAF and regions into pandas df - maf_df = pd.read_table(input.maf, comment="#", sep="\t") - regions_df = pd.read_table(input.regions, comment="#", sep="\t") - if params.regions_format in ["maf"]: - # Create common columns to subset the larger MAF down - count = 0 - for df in [maf_df, regions_df]: - count += 1 - if count == 1: - print(f"Working on maf df") - if count == 2: - print(f"Working on regions df") - df["chr_std"] = df.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) - df["genomic_pos_std"] = df["chr_std"] + ":" + df["Start_Position"].map(str) 
+ "_" + df["End_Position"].map(str) - - # Filter larger MAF - filtered_maf = maf_df[maf_df["genomic_pos_std"].isin(regions_df["genomic_pos_std"])] - - # Filter only to BAM files of corresponding build and seq_type - SAMPLES = op.load_samples(params.metadata) - SAMPLES = op.filter_samples(SAMPLES, seq_type=params.seq_type) - genome_build_list = params.genome_map[params.genome_build] - print(f"Only including samples of these builds: {genome_build_list}") - BUILD_SAMPLES = op.filter_samples(SAMPLES, genome_build=genome_build_list) - filtered_maf = filtered_maf[filtered_maf["Tumor_Sample_Barcode"].isin(BUILD_SAMPLES.sample_id)] - - # Write output - filtered_maf.to_csv(output.maf_filtered, sep="\t") - + genome_map = config["lcr-modules"]["igv"]["genome_map"], + oncodriveclustl_params = config["lcr-modules"]["igv"]["filter_maf"]["oncodriveclustl_options"] + script: + config["lcr-modules"]["igv"]["scripts"]["filter_script"] # Pass filtered MAF to create batch script rule _igv_create_batch_script: @@ -189,21 +181,6 @@ rule _igv_create_batch_script: --genome_build {params.genome_build} --seq_type {params.seq_type} > {log.stdout} 2> {log.stderr} """) -#rule _igv_merge_batch_scripts: -# input: -# batch_scripts = expand(str(rules._igv_create_batch_script.output.batch_script), genome_build=["hg38","grch37"], seq_type=["capture","genome"]), -# output: -# merged_batch = CFG["dirs"]["batch_scripts"] + "merged_script.batch" -# params: -# script_dir = CFG["dirs"]["batch_scripts"] -# shell: -# op.as_one_line(""" -# batch_dir={params.script_dir} && -# cat <(cat $(echo $batch_dir)/*.batch) <(echo end) | awk '{{ if ($0 !~ "exit") print $0 }}' | sed 's/end/exit\n/g' > {output.merged_batch} -# """) - - -#### WHEN LAST CHECKED DRY RUN WORKS UP TO HERE B-) rule _igv_download_igv: output: igv_zip = CFG["dirs"]["igv"] + "IGV_2.7.2.zip", @@ -256,9 +233,18 @@ rule _igv_run: # """) # Generates the target sentinels for each run, which generate the symlinks -rule _igv_all: - input: - expand(rules._igv_run.output.success, seq_type=["genome","capture"], genome_build=["hg38","grch37"]) +if CFG["test_run"] is False: + rule _igv_all: + input: + expand(rules._igv_run.output.success, seq_type=["genome","capture"], genome_build=["hg38","grch37"]) + + +if CFG["test_run"] is True: + rule _igv_all: + input: + expand(rules._igv_filter_maf.output.maf_filtered, seq_type=["genome","capture"], genome_build=["hg38","grch37"]) + + ##### CLEANUP ##### From f6980b9585b3b17f94e27d690194b90c9e39ee68 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 1 Feb 2023 18:43:58 -0800 Subject: [PATCH 003/132] Add script to perform regions reformatting --- modules/igv/1.0/etc/format_regions.py | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100755 modules/igv/1.0/etc/format_regions.py diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py new file mode 100755 index 00000000..89fdd6a7 --- /dev/null +++ b/modules/igv/1.0/etc/format_regions.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +import os +import pandas as pd +import oncopipe as op + +def format_clustl(clustl_regions): + # Convert OncodriveCLUSTL cluster coordinates to BED format + p_filter = CLUSTL_PARAMS["p_value"] + score_filter = CLUSTL_PARAMS["score"] + n_samples_filter = CLUSTL_PARAMS["n_samples"] + + for key, filter_value in {"P": p_filter, "SCORE": score_filter, "N_SAMPLES": n_samples_filter}.items(): + if filter_value is not None: + if key != "P": + clustl_regions = clustl_regions[clustl_regions[key] >= float(filter_value)] + 
if key == "P": + clustl_regions = clustl_regions[clustl_regions[key] <= float(filter_value)] + + # Reformat CLUSTL coordinates to handle clusters that cross introns (when CLUSTL concatenated mode is used) + clustl_regions = clustl_regions.assign(COORDINATES = clustl_regions.COORDINATES.str.split(";")).explode("COORDINATES") + clustl_regions["COORDINATES"] = clustl_regions.apply( + lambda x: list( + range( + int(str(x["COORDINATES"]).split(",")[0]), int(str(x["COORDINATES"]).split(",")[1]) + 1 + ) + ) + if str(x["COORDINATES"]).split(",")[0] != str(x["COORDINATES"]).split(",")[1] else int(str(x["COORDINATES"]).split(",")[0]), + axis = 1 + ) + clustl_regions = clustl_regions.explode("COORDINATES") + + # Create columnsn required for BED format + chr_str = "chr" + clustl_regions["CHROMOSOME"].map(str) + clustl_reformatted = pd.DataFrame( + { + "chrom": chr_str, + "start": clustl_regions["COORDINATES"], + "end": clustl_regions["COORDINATES"] + } + ) + return clustl_reformatted + +def format_maf(regions): + # If the regions format is a MAF, don't need to reformat for liftover + return regions + +def format_regions(regions, regions_format): + format_functions = { + "maf": format_maf, + "oncodriveclustl": format_clustl, + "genomic_pos": format_genomic_pos + } + + return format_functions[regions_format](regions) + +regions_file = snakemake.input[0] +regions_format = snakemake.params[0] + +output_file = snakemake.output[0] + +if regions_format == "oncodriveclustl": + CLUSTL_PARAMS = snakemake.params[1] + +# Read regions into dataframe +regions_df = pd.read_table(regions_file, comment="#", sep="\t") + +regions_formatted = format_regions(regions_df, regions_format) + +regions_formatted.to_csv(output_file, sep="\t", index=False) \ No newline at end of file From e54b85c6f425e6bc00e2f0a256c99d6b52ffa60a Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 1 Feb 2023 18:44:32 -0800 Subject: [PATCH 004/132] Add script to filter maf based on BED or MAF --- modules/igv/1.0/etc/filter_maf.py | 96 +++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 modules/igv/1.0/etc/filter_maf.py diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py new file mode 100644 index 00000000..beab48b8 --- /dev/null +++ b/modules/igv/1.0/etc/filter_maf.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +import os +import math +import pandas as pd +import oncopipe as op + +def filter_by_bed(maf, regions, metadata): + + # Remove rows that contain column names + regions = regions[regions[0].str.contains("chrom")==False] + + # Create common columns between BED and MAF + regions["chr_std"] = regions.apply(lambda x: str(x[0]).replace("chr",""), axis=1) + regions["genomic_pos_std"] = regions["chr_std"] + ":" + regions[1].map(str) + + maf["chr_std"] = maf.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) + maf["genomic_pos_std"] = maf["chr_std"] + ":" + maf["Start_Position"].map(str) + + filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])] + return filtered_maf + +def filter_by_maf(maf, regions, metadata): + + # Create common column by which to subset MAF + for df in [maf, regions]: + df["chr_std"] = df.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) + df["genomic_pos_std"] = df["chr_std"] + ":" + df["Start_Position"].map(str) + "_" + df["End_Position"].map(str) + + # Subset the MAF + filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])] + return filtered_maf + +def maf_filter(maf, regions, regions_format, 
metadata, genome_build, seq_type, genome_map): + # Read input MAF and regions file as dataframes + maf_df = pd.read_table(maf, comment="#", sep="\t") + + if regions_format != "bed": + regions_df = pd.read_table(regions, comment="#", sep="\t") + else: + regions_df = pd.read_table(regions, comment="#", sep="\t", header=None) + + # Select rows in MAF containing correct seq_type and build + metadata = op.filter_samples(metadata, seq_type=seq_type) + genome_build_list = genome_map[genome_build] + metadata = op.filter_samples(metadata, genome_build=genome_build_list) + + maf_df = maf_df[maf_df["Tumor_Sample_Barcode"].isin(metadata.sample_id)] + + filter_functions = { + "maf": filter_by_maf, + "bed": filter_by_bed + } + + return filter_functions[regions_format](maf_df, regions_df, metadata) + +def write_output(maf, outfile): + maf.to_csv(outfile, sep="\t", index=False) + +# assign snakemake values to variables + +maf_file = snakemake.input[0] + +regions_file = snakemake.input[1] +regions_format = snakemake.params[0] + +if regions_format == "oncodriveclustl": + # This should act as a global variable + CLUSTL_PARAMS = snakemake.params[5] + +# Metadata file or dataframe mapping sample_ids to bam file paths +metadata = snakemake.params[1] +if not isinstance(metadata, pd.DataFrame): + metadata = pd.read_table(metadata, comment="#", sep="\t") + +maf_genome_build = snakemake.params[2] +maf_seq_type = snakemake.params[3] + +# Dictionary of genome builds present in the MAF to label as grch37 / hg38 +genome_map = snakemake.params[4] + +output_file = snakemake.output[0] + +# Peform filtering + +filtered_maf = maf_filter( + maf=maf_file, + regions=regions_file, + regions_format=regions_format, + metadata=metadata, + genome_build=maf_genome_build, + seq_type=maf_seq_type, + genome_map=genome_map + ) + +write_output(filtered_maf, output_file) From f46ff3faafba3399a6647dfdc446561b1ef33471 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 1 Feb 2023 18:46:21 -0800 Subject: [PATCH 005/132] Modify liftover script to accomodate BED regions --- modules/igv/1.0/etc/liftover_regions.sh | 29 ++++++++++++++++++++----- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/modules/igv/1.0/etc/liftover_regions.sh b/modules/igv/1.0/etc/liftover_regions.sh index 45b86b2f..be39f72b 100755 --- a/modules/igv/1.0/etc/liftover_regions.sh +++ b/modules/igv/1.0/etc/liftover_regions.sh @@ -13,13 +13,15 @@ input_regions=$1 input_type=$2 -target_build=$3 -output_file=$4 -chain_file=$5 -target_ref=$6 +regions_build=$3 +target_build=$4 +output_file=$5 +chain_file=$6 +target_ref=$7 echo "Input regions file: $input_regions" echo "Input regions type: $input_type" +echo "Input regions build: $regions_build" echo "Target genome build: $target_build" echo "Output file: $output_file" echo "Chain file: $chain_file" @@ -32,8 +34,7 @@ intermediate_output_file=$(echo $output_file)_int if [ "$input_type" == "maf" ] ; then echo "Proceeding with MAF input..." - - if grep -q $target_build $input_regions ; + if [ $regions_build == $target_build ] ; then echo "WARNING: Input regions file $input_regions is already $target_build. Copying contents of $input_regions to $output_file"; cut -f 1,5,6,7,9,10,11,13,16 $input_regions > $output_file @@ -46,4 +47,20 @@ then fi echo "Finished MAF block." fi + +if [ "$input_type" == "bed" ] ; + then + echo "Proceeding with BED input..." + if [ $regions_build == $target_build ] ; + then + echo "WARNING: Input regions file $input_regions is already $target_build. 
Copying contents of $input_regions to $output_file"; + cat $input_regions > $output_file + else + echo "Input regions file $input_regions does not appear to be $target_build. Proceeding with conversion to $target_build" + echo "CrossMap.py bed $chain_file $input_regions $output_file" + CrossMap.py bed $chain_file $input_regions $output_file + fi + echo "Finished BED block." +fi + echo "End of bash script" From d0693a61ec4c066beefd42ff727d588b1ba9bbb3 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 1 Feb 2023 18:47:37 -0800 Subject: [PATCH 006/132] Update config for changes made to module --- modules/igv/1.0/config/default.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index a02a0ab7..1abfdcbd 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -12,12 +12,13 @@ lcr-modules: regions_format: "__UPDATE__" regions_build: "__UPDATE__" # Genome build of regions file, will be lifted over to filter MAFs on opposite genome builds + test_run: True # Create only the filtered MAF, to get an estimate of how many snapshots will be taken + filter_maf: oncodriveclustl_options: # These parameters will filter down the OncodriveCLUSTL cluster results file. - q_value: # Desired q-value of OncodriveCLUSTL clusters - scores: # Desired scores of OncodriveCLUSTL clusters + p_value: # Desired q-value of OncodriveCLUSTL clusters + score: # Desired scores of OncodriveCLUSTL clusters n_samples: # Desired number of samples in OncodriveCLUSTL clusters - genome_map: # Map different builds in metadata grch37: ["grch37","hg19","hg19-clc","hg19-reddy","hs37d5"] @@ -38,6 +39,8 @@ lcr-modules: batch_script_per_sample: # True/False. Default is False. If True, a batch script will be created to accompany each snapshot for easy access in IGV. 
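The renamed OncodriveCLUSTL options (p_value, score, n_samples) are applied by etc/format_regions.py before the cluster coordinates are exploded into per-position BED intervals; roughly, p_value acts as an upper bound and the other two as lower bounds. A minimal pandas sketch with made-up cutoffs and a placeholder input path:

    import pandas as pd

    # Illustrative thresholds only; column names follow the OncodriveCLUSTL
    # clusters output consumed by etc/format_regions.py.
    clusters = pd.read_table("clusters_results.tsv")        # placeholder path
    thresholds = {"P": 0.01, "SCORE": 20.0, "N_SAMPLES": 5}
    for column, cutoff in thresholds.items():
        if cutoff is None:
            continue
        if column == "P":
            # p_value: keep clusters at or below the cutoff
            clusters = clusters[clusters[column] <= float(cutoff)]
        else:
            # score / n_samples: keep clusters at or above the cutoff
            clusters = clusters[clusters[column] >= float(cutoff)]

Leaving any of the three options blank in the config skips that filter, which is why the loop tolerates None cutoffs.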
scripts: + format_regions: "etc/format_regions.py" + filter_script: "etc/filter_maf.py" region_liftover_script: "{MODSDIR}/etc/liftover_regions.sh" batch_script: "{MODSDIR}/etc/generate_batch_scripts.py" From 8fa8babefd112c45194a6996e6934417d55d760d Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 1 Feb 2023 18:49:13 -0800 Subject: [PATCH 007/132] Remove commented out text --- modules/igv/1.0/igv.smk | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index ec6a36d3..57620c98 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -215,23 +215,6 @@ rule _igv_run: touch {output.success} """) -#rule _igv_run: -# input: -# batch_script = str(rules._igv_merge_batch_scripts.output.merged_batch), -# igv_installed = str(rules._igv_download_igv.output.igv_installed), -# output: -# success = CFG["dirs"]["outputs"] + "merged_batch_snapshot.finished" -# params: -# igv = CFG["dirs"]["igv"] + "IGV_Linux_2.16.0/igv.sh" -# log: -# stdout = CFG["logs"]["igv"] + "run_igv_merged_batch.stdout.log", -# stderr = CFG["logs"]["igv"] + "run_igv_merged_batch.stderr.log" -# shell: -# op.as_one_line(""" -# xvfb-run --auto-servernum {params.igv} -b {input.batch_script} > {log.stdout} 2> {log.stderr} && -# touch {output.success} -# """) - # Generates the target sentinels for each run, which generate the symlinks if CFG["test_run"] is False: rule _igv_all: From d3b8e1c492f6fe970b3fea5aeaeb895455801bc2 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Sat, 4 Feb 2023 12:33:03 -0800 Subject: [PATCH 008/132] Add function to reformat hotmaps MAF results --- modules/igv/1.0/etc/format_regions.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 89fdd6a7..64688fd4 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -4,6 +4,21 @@ import pandas as pd import oncopipe as op +def format_hotmaps(hotmaps_regions): + # Convert HotMAPS coordinates to BED format + + hotmaps_regions["chr_std"] = hotmaps_regions.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) + chr_std = "chr" + hotmaps_regions["Chromosome"].map(str) + + hotmaps_reformatted = pd.DataFrame( + { + "chrom": chr_std, + "start": hotmaps_regions["Start_Position"], + "end": hotmaps_regions["Start_Position"] + } + ) + return hotmaps_reformatted + def format_clustl(clustl_regions): # Convert OncodriveCLUSTL cluster coordinates to BED format p_filter = CLUSTL_PARAMS["p_value"] @@ -49,6 +64,7 @@ def format_regions(regions, regions_format): format_functions = { "maf": format_maf, "oncodriveclustl": format_clustl, + "hotmaps": format_hotmaps, "genomic_pos": format_genomic_pos } @@ -65,6 +81,8 @@ def format_regions(regions, regions_format): # Read regions into dataframe regions_df = pd.read_table(regions_file, comment="#", sep="\t") +# Reformat for liftover based on regions format regions_formatted = format_regions(regions_df, regions_format) +# Output regions file regions_formatted.to_csv(output_file, sep="\t", index=False) \ No newline at end of file From 3791dd1507da2d0b3401f7bbabba09a5e70f407a Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 6 Feb 2023 16:11:37 -0800 Subject: [PATCH 009/132] Add function to format mutation_id regions file --- modules/igv/1.0/etc/format_regions.py | 51 +++++++++++++++++++++++++-- modules/igv/1.0/igv.smk | 5 +-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git 
a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 64688fd4..f3aa954b 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -4,6 +4,48 @@ import pandas as pd import oncopipe as op +def format_mutation_id(mutation_id): + ## Modify dataframe to handle NA values in columns + #mutation_id = mutation_id.fillna(0) +# + ## Filter dataframe based on config option + #max_distinct_genome_cohorts = MUTATION_ID_PARAMS["max_distinct_genome_cohorts"] + #min_total_genome_samples = MUTATION_ID_PARAMS["min_total_genome_samples"] + # + #max_distinct_capture_cohorts = MUTATION_ID_PARAMS["max_distinct_capture_cohorts"] + #min_total_capture_samples = MUTATION_ID_PARAMS["min_total_capture_samples"] +# + #for key, filter_value in { + # "distinct_genome_cohorts": max_distinct_genome_cohorts, + # "total_genome": min_total_genome_samples, + # "distinct_capture_cohorts": max_distinct_capture_cohorts, + # "total_capture": min_total_capture_samples + #}.items(): + # if filter_value is not None: + # if key in ["distinct_genome_cohorts", "distinct_capture_cohorts"]: + # mutation_id = mutation_id[mutation_id[key] <= float(filter_value)] + # else: + # mutation_id = mutation_id[mutation_id[key] >= float(filter_value)] + + # Create columns required for liftover in BED format + genomic_pos_col = f"mutation_id_{REGIONS_BUILD}" + + for col, idx in {"chr_std": 0, "start": 1, "end": 2}.items(): + mutation_id[col] = mutation_id.apply(lambda x: str(x[genomic_pos_col]).split(":")[idx].replace("chr",""), axis=1) + + mutation_id_reformatted = pd.DataFrame( + { + "chrom": "chr" + mutation_id["chr_std"], + "start": mutation_id["start"], + "end": mutation_id["end"] + } + ) + + # Remove duplicate rows + mutation_id_reformatted = mutation_id_reformatted.drop_duplicates(keep='first') + + return mutation_id_reformatted + def format_hotmaps(hotmaps_regions): # Convert HotMAPS coordinates to BED format @@ -20,7 +62,6 @@ def format_hotmaps(hotmaps_regions): return hotmaps_reformatted def format_clustl(clustl_regions): - # Convert OncodriveCLUSTL cluster coordinates to BED format p_filter = CLUSTL_PARAMS["p_value"] score_filter = CLUSTL_PARAMS["score"] n_samples_filter = CLUSTL_PARAMS["n_samples"] @@ -34,6 +75,8 @@ def format_clustl(clustl_regions): # Reformat CLUSTL coordinates to handle clusters that cross introns (when CLUSTL concatenated mode is used) clustl_regions = clustl_regions.assign(COORDINATES = clustl_regions.COORDINATES.str.split(";")).explode("COORDINATES") + + # Convert OncodriveCLUSTL cluster coordinates to BED format clustl_regions["COORDINATES"] = clustl_regions.apply( lambda x: list( range( @@ -65,7 +108,7 @@ def format_regions(regions, regions_format): "maf": format_maf, "oncodriveclustl": format_clustl, "hotmaps": format_hotmaps, - "genomic_pos": format_genomic_pos + "mutation_id": format_mutation_id } return format_functions[regions_format](regions) @@ -78,6 +121,10 @@ def format_regions(regions, regions_format): if regions_format == "oncodriveclustl": CLUSTL_PARAMS = snakemake.params[1] +if regions_format == "mutation_id": + REGIONS_BUILD = snakemake.params[2] + REGIONS_BUILD = REGIONS_BUILD.lower() + # Read regions into dataframe regions_df = pd.read_table(regions_file, comment="#", sep="\t") diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 57620c98..9c5a5380 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -101,7 +101,8 @@ rule _igv_format_regions_file: regions = 
config["lcr-modules"]["igv"]["dirs"]["inputs"] + "regions/regions_file_formatted.txt" params: regions_format = config["lcr-modules"]["igv"]["inputs"]["regions_format"], - oncodriveclustl_params = config["lcr-modules"]["igv"]["filter_maf"]["oncodriveclustl_options"] + oncodriveclustl_params = config["lcr-modules"]["igv"]["filter_maf"]["oncodriveclustl_options"], + regions_build = config["lcr-modules"]["igv"]["inputs"]["regions_build"] script: config["lcr-modules"]["igv"]["scripts"]["format_regions"] @@ -109,7 +110,7 @@ REGIONS_FORMAT = { "maf": "maf", "oncodriveclustl": "bed", "hotmaps": "bed", - "genomic_regions": "bed" + "mutation_id": "bed" } rule _igv_liftover_regions: From e0c184dc86def4e7ca9cab504d7e84dda27ac258 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 6 Feb 2023 19:14:31 -0800 Subject: [PATCH 010/132] Move constraint on n snaps/variant to filter step --- modules/igv/1.0/config/default.yaml | 2 +- modules/igv/1.0/etc/filter_maf.py | 12 ++++++++++-- modules/igv/1.0/etc/generate_batch_scripts.py | 17 +++-------------- modules/igv/1.0/igv.smk | 8 ++++---- 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 1abfdcbd..bc45693a 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -19,6 +19,7 @@ lcr-modules: p_value: # Desired q-value of OncodriveCLUSTL clusters score: # Desired scores of OncodriveCLUSTL clusters n_samples: # Desired number of samples in OncodriveCLUSTL clusters + n_snapshots: 20 # Number of snapshots to take per unique variant position. Default is set to max (1000000). genome_map: # Map different builds in metadata grch37: ["grch37","hg19","hg19-clc","hg19-reddy","hs37d5"] @@ -35,7 +36,6 @@ lcr-modules: generate_batch_script: padding: 300 max_height: 400 - n_snapshots: 20 # Number of snapshots to take per unique variant. Default is 20. batch_script_per_sample: # True/False. Default is False. If True, a batch script will be created to accompany each snapshot for easy access in IGV. 
scripts: diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index beab48b8..711cadb2 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -54,11 +54,15 @@ def maf_filter(maf, regions, regions_format, metadata, genome_build, seq_type, g return filter_functions[regions_format](maf_df, regions_df, metadata) +def maf_reduce_snapshots(maf, snapshots): + # Only include max of number of snapshots for each variant + maf = maf.groupby(["Chromosome","Start_Position", "End_Position", "Reference_Allele", "Tumor_Seq_Allele2"]).head(n=snapshots) + + return maf + def write_output(maf, outfile): maf.to_csv(outfile, sep="\t", index=False) -# assign snakemake values to variables - maf_file = snakemake.input[0] regions_file = snakemake.input[1] @@ -68,6 +72,8 @@ def write_output(maf, outfile): # This should act as a global variable CLUSTL_PARAMS = snakemake.params[5] +n_snapshots = snakemake.params[6] + # Metadata file or dataframe mapping sample_ids to bam file paths metadata = snakemake.params[1] if not isinstance(metadata, pd.DataFrame): @@ -93,4 +99,6 @@ def write_output(maf, outfile): genome_map=genome_map ) +filtered_maf = maf_reduce_snapshots(maf=filtered_maf, snapshots=n_snapshots) + write_output(filtered_maf, output_file) diff --git a/modules/igv/1.0/etc/generate_batch_scripts.py b/modules/igv/1.0/etc/generate_batch_scripts.py index 9ff0626a..2b920f65 100755 --- a/modules/igv/1.0/etc/generate_batch_scripts.py +++ b/modules/igv/1.0/etc/generate_batch_scripts.py @@ -26,8 +26,7 @@ def main(): max_height = args.max_height, seq_type = args.seq_type, genome_build = args.genome_build, - snapshot_dir=args.snapshot_dir, - n_snapshots=args.n_snapshots) + snapshot_dir=args.snapshot_dir) close_files(args) @@ -88,16 +87,6 @@ def parse_arguments(): ) ) - parser.add_argument( - "--n_snapshots", - "-n", - type=int, - default=20, - help=( - "Maximum number of different snapshots for each position." 
- ) - ) - parser.add_argument( "--genome_build", "-g", @@ -249,7 +238,7 @@ def output_lines(lines, output): text = "\n".join(lines) output.write(text) -def generate_igv_batch(regions, output, max_height, seq_type, genome_build, snapshot_dir, n_snapshots): +def generate_igv_batch(regions, output, max_height, seq_type, genome_build, snapshot_dir): # The lines for the batch script encompassing all regions and sample_ids all_lines = [] @@ -275,7 +264,7 @@ def generate_igv_batch(regions, output, max_height, seq_type, genome_build, snap for unique_region in regions_in_dir.regions.unique(): # Subset rows by number of snapshots desired per region - regions_subset = regions_in_dir[regions_in_dir["regions"]==unique_region][:n_snapshots] + regions_subset = regions_in_dir[regions_in_dir["regions"]==unique_region] # Generate lines of batch script per region subset lines = generate_igv_batch_per_region( diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 9c5a5380..1544aaa3 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -151,7 +151,8 @@ rule _igv_filter_maf: genome_build = lambda w: w.genome_build, seq_type = lambda w: w.seq_type, genome_map = config["lcr-modules"]["igv"]["genome_map"], - oncodriveclustl_params = config["lcr-modules"]["igv"]["filter_maf"]["oncodriveclustl_options"] + oncodriveclustl_params = config["lcr-modules"]["igv"]["filter_maf"]["oncodriveclustl_options"], + n_snapshots = config["lcr-modules"]["igv"]["filter_maf"]["n_snapshots"] if config["lcr-modules"]["igv"]["filter_maf"]["n_snapshots"] is not None else 1000000 script: config["lcr-modules"]["igv"]["scripts"]["filter_script"] @@ -168,8 +169,7 @@ rule _igv_create_batch_script: genome_build = lambda w: w.genome_build, seq_type = lambda w: w.seq_type, padding = CFG["generate_batch_script"]["padding"], - max_height = CFG["generate_batch_script"]["max_height"], - n_snapshots = CFG["generate_batch_script"]["n_snapshots"] + max_height = CFG["generate_batch_script"]["max_height"] log: stdout = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}_batch_script.stdout.log", stderr = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}_batch_script.stderr.log" @@ -178,7 +178,7 @@ rule _igv_create_batch_script: {params.py_script} {input.maf_filtered} --output {output.batch_script} --metadata {input.metadata} --padding {params.padding} --max_height {params.max_height} - --snapshot_dir {params.snapshot_dir} --n_snapshots {params.n_snapshots} + --snapshot_dir {params.snapshot_dir} --genome_build {params.genome_build} --seq_type {params.seq_type} > {log.stdout} 2> {log.stderr} """) From 07bea1630f3fcb7332bf1b7f1ca727222ac20d9b Mon Sep 17 00:00:00 2001 From: mannycruz Date: Fri, 10 Feb 2023 16:14:20 -0800 Subject: [PATCH 011/132] Remove metadata file option, fix pairing config --- modules/igv/1.0/config/default.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index bc45693a..e53b9990 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -6,13 +6,12 @@ lcr-modules: inputs: # Available wildcards: {seq_type} {genome_build} {sample_id} master_maf: # MAFs to draw sample_id and variants from - metadata: # Metadata containing bam link names, sample_id, and genome_build columns regions_file: "__UPDATE__" # Path to a MAF, VCF, BED or BEDPE file containing regions of interest to create snapshots of. 
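The regions_format option that follows accepts maf, oncodriveclustl, hotmaps or mutation_id (see the REGIONS_FORMAT mapping in igv.smk); everything except maf is reduced to a simple chrom/start/end BED before liftover. For the mutation_id case, etc/format_regions.py splits the colon-delimited mutation_id_<build> column into those three fields, roughly as in this sketch (the example value is illustrative only):

    import pandas as pd

    # Toy regions table with a colon-delimited mutation_id column, as assumed
    # by format_mutation_id().
    regions = pd.DataFrame({"mutation_id_grch37": ["chr3:187462541:187462541"]})
    for col, idx in {"chrom": 0, "start": 1, "end": 2}.items():
        regions[col] = regions["mutation_id_grch37"].str.split(":").str[idx]
    bed = pd.DataFrame({
        "chrom": "chr" + regions["chrom"].str.replace("chr", "", regex=False),
        "start": regions["start"],
        "end": regions["end"],
    })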
regions_format: "__UPDATE__" regions_build: "__UPDATE__" # Genome build of regions file, will be lifted over to filter MAFs on opposite genome builds - test_run: True # Create only the filtered MAF, to get an estimate of how many snapshots will be taken + test_run: False # Create only the filtered MAF, to get an estimate of how many snapshots will be taken filter_maf: oncodriveclustl_options: # These parameters will filter down the OncodriveCLUSTL cluster results file. @@ -37,12 +36,12 @@ lcr-modules: padding: 300 max_height: 400 batch_script_per_sample: # True/False. Default is False. If True, a batch script will be created to accompany each snapshot for easy access in IGV. - + scripts: format_regions: "etc/format_regions.py" filter_script: "etc/filter_maf.py" region_liftover_script: "{MODSDIR}/etc/liftover_regions.sh" - batch_script: "{MODSDIR}/etc/generate_batch_scripts.py" + batch_script: "etc/generate_batch_scripts.py" scratch_subdirectories: [] @@ -67,7 +66,7 @@ lcr-modules: run_paired_tumours: False run_unpaired_tumours_with: "no_normal" run_paired_tumours_as_unpaired: False - mrna: + capture: run_paired_tumours: False run_unpaired_tumours_with: "no_normal" run_paired_tumours_as_unpaired: False From ed501fb24850ceda0ced96aafde1dad280ac3ab7 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Fri, 10 Feb 2023 16:14:38 -0800 Subject: [PATCH 012/132] Grammar --- modules/igv/1.0/etc/filter_maf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 711cadb2..40723ad2 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -7,7 +7,7 @@ def filter_by_bed(maf, regions, metadata): - # Remove rows that contain column names + # Remove row containing column names regions = regions[regions[0].str.contains("chrom")==False] # Create common columns between BED and MAF From 2bb98e7e3adc16a88eecdd8eb4c9238ec0b1f886 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Fri, 10 Feb 2023 16:15:29 -0800 Subject: [PATCH 013/132] Convert from snakemake shell to script directive --- modules/igv/1.0/etc/generate_batch_scripts.py | 54 ++++++++++++++----- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_scripts.py b/modules/igv/1.0/etc/generate_batch_scripts.py index 2b920f65..6fa13656 100755 --- a/modules/igv/1.0/etc/generate_batch_scripts.py +++ b/modules/igv/1.0/etc/generate_batch_scripts.py @@ -10,25 +10,51 @@ def main(): # Parse arguments - args = parse_arguments() + #args = parse_arguments() + + # Read MAF file containing variants and create a dataframe linking regions, sample_ids, and bam paths + #regions = get_regions_df( + # args.input_maf, + # metadata=args.metadata, + # seq_type=args.seq_type, + # padding=args.padding) + + # Format and output the batch script + #generate_igv_batch( + # regions = regions, + # output = args.output, + # max_height = args.max_height, + # seq_type = args.seq_type, + # genome_build = args.genome_build, + # snapshot_dir=args.snapshot_dir) + + input_maf = open(snakemake.input[0], "r") # Read MAF file containing variants and create a dataframe linking regions, sample_ids, and bam paths regions = get_regions_df( - args.input_maf, - metadata=args.metadata, - seq_type=args.seq_type, - padding=args.padding) + input_maf, + metadata=snakemake.params[0], + seq_type=snakemake.params[3], + padding=snakemake.params[4] + ) + + input_maf.close() + + outfile = open(snakemake.output[0], "w") # Format and output the batch script 
generate_igv_batch( regions = regions, - output = args.output, - max_height = args.max_height, - seq_type = args.seq_type, - genome_build = args.genome_build, - snapshot_dir=args.snapshot_dir) + output = outfile, + max_height = snakemake.params[5], + seq_type = snakemake.params[3], + genome_build = snakemake.params[2], + snapshot_dir = snakemake.params[1] + ) - close_files(args) + outfile.close() + + #close_files(args) def parse_arguments(): parser = argparse.ArgumentParser(description=__doc__) @@ -109,9 +135,9 @@ def parse_arguments(): def get_regions_df(input_maf, metadata, seq_type, padding): # Read MAF as dataframe maf = pd.read_table(input_maf, comment="#") - - # Read metadata as dataframe - metadata = pd.read_table(metadata, comment="#") + + # Metadata should already be a dataframe + assert isinstance(metadata, pd.DataFrame), "Metadata is not in Pandas dataframe format." # Filter metadata down to only samples of required seq_type metadata = metadata[metadata["seq_type"]==seq_type] From ff248e6bc789e523722548191a6263e05bb349b5 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Fri, 10 Feb 2023 16:17:31 -0800 Subject: [PATCH 014/132] Add sample_id tracking, metadata = CFG["samples"] --- modules/igv/1.0/igv.smk | 64 ++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 1544aaa3..aa1218d1 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -41,6 +41,10 @@ CFG = op.setup_module( subdirectories = ["inputs", "batch_scripts", "igv", "snapshots", "outputs"], ) +# Rename genome builds in metadata to match up with MAFs? +CFG["samples"]["genome_build"].mask(CFG["samples"]["genome_build"].isin(CFG["genome_map"]["grch37"]), "grch37", inplace=True) +CFG["samples"]["genome_build"].mask(CFG["samples"]["genome_build"].isin(CFG["genome_map"]["hg38"]), "hg38", inplace=True) + # Define rules to be run locally when using a compute cluster # TODO: Replace with actual rules once you change the rule names @@ -54,6 +58,14 @@ localrules: _igv_download_igv, _igv_run +##### FUNCTIONS ##### + + +def get_bams(wildcards): + metadata = config["lcr-modules"]["igv"]["samples"] + return expand("data/{{seq_type}}_bams/{{sample_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + + ##### RULES ##### @@ -67,13 +79,13 @@ rule _igv_symlink_regions_file: run: op.absolute_symlink(input.regions_file, output.regions_file) -rule _igv_symlink_metadata: +rule _igv_symlink_bams: input: - metadata = CFG["inputs"]["metadata"] + bam = get_bams output: - metadata = CFG["dirs"]["inputs"] + "metadata/metadata.tsv" + bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{sample_id}.bam" run: - op.absolute_symlink(input.metadata, output.metadata) + op.absolute_symlink(input.bam, output.bam) rule _igv_symlink_maf: input: @@ -147,7 +159,7 @@ rule _igv_filter_maf: maf_filtered = config["lcr-modules"]["igv"]["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}_cols_filtered.maf" params: regions_format = REGIONS_FORMAT[config["lcr-modules"]["igv"]["inputs"]["regions_format"].lower()], - metadata = config["lcr-modules"]["igv"]["samples"] if CFG["inputs"]["metadata"] is None else CFG["inputs"]["metadata"], + metadata = config["lcr-modules"]["igv"]["samples"], genome_build = lambda w: w.genome_build, seq_type = lambda w: w.seq_type, genome_map = config["lcr-modules"]["igv"]["genome_map"], @@ -159,28 +171,18 @@ rule _igv_filter_maf: # 
Pass filtered MAF to create batch script rule _igv_create_batch_script: input: - maf_filtered = str(rules._igv_filter_maf.output.maf_filtered), - metadata = str(rules._igv_symlink_metadata.output.metadata) + maf_filtered = str(rules._igv_filter_maf.output.maf_filtered) output: - batch_script = temp(CFG["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}.batch") + batch_script = temp(config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}.batch") params: - py_script = CFG["scripts"]["batch_script"], - snapshot_dir = CFG["dirs"]["snapshots"], + metadata = config["lcr-modules"]["igv"]["samples"], + snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], genome_build = lambda w: w.genome_build, seq_type = lambda w: w.seq_type, - padding = CFG["generate_batch_script"]["padding"], - max_height = CFG["generate_batch_script"]["max_height"] - log: - stdout = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}_batch_script.stdout.log", - stderr = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}_batch_script.stderr.log" - shell: - op.as_one_line(""" - {params.py_script} {input.maf_filtered} - --output {output.batch_script} --metadata {input.metadata} - --padding {params.padding} --max_height {params.max_height} - --snapshot_dir {params.snapshot_dir} - --genome_build {params.genome_build} --seq_type {params.seq_type} > {log.stdout} 2> {log.stderr} - """) + padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], + max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"] + script: + config["lcr-modules"]["igv"]["scripts"]["batch_script"] rule _igv_download_igv: output: @@ -198,12 +200,21 @@ rule _igv_download_igv: touch {output.igv_installed} """) +rule _igv_track_samples: + input: + bam = str(rules._igv_symlink_bams.output.bam) + output: + finished = CFG["dirs"]["outputs"] + "samples/{seq_type}/{sample_id}_" + CFG["inputs"]["regions_format"] + ".track" + shell: + "touch {output.finished}" + rule _igv_run: input: batch_script = str(rules._igv_create_batch_script.output.batch_script), igv_installed = str(rules._igv_download_igv.output.igv_installed), + sample_track = expand(str(rules._igv_track_samples.output.finished), zip, seq_type=CFG["samples"]["seq_type"], sample_id=CFG["samples"]["sample_id"]) output: - success = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}_snapshot.finished" + success = CFG["dirs"]["outputs"] + "snapshots/{seq_type}--{genome_build}_snapshot.finished" params: #igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" @@ -220,13 +231,12 @@ rule _igv_run: if CFG["test_run"] is False: rule _igv_all: input: - expand(rules._igv_run.output.success, seq_type=["genome","capture"], genome_build=["hg38","grch37"]) - + expand(str(rules._igv_run.output.success), seq_type=CFG["samples"]["seq_type"], genome_build=CFG["samples"]["genome_build"]) if CFG["test_run"] is True: rule _igv_all: input: - expand(rules._igv_filter_maf.output.maf_filtered, seq_type=["genome","capture"], genome_build=["hg38","grch37"]) + expand(rules._igv_filter_maf.output.maf_filtered, seq_type=CFG["samples"]["seq_type"], genome_build=CFG["samples"]["seq_type"]) From e389982d960d8682f4fc8a4475ba84ce36515bea Mon Sep 17 00:00:00 2001 From: mannycruz Date: Sat, 25 Feb 2023 17:52:56 -0800 Subject: [PATCH 015/132] Overhaul to sample_id-dependent workflow --- modules/igv/1.0/config/default.yaml | 28 +- modules/igv/1.0/etc/filter_maf.py | 38 +-- 
modules/igv/1.0/etc/generate_batch_scripts.py | 273 ++++++------------ modules/igv/1.0/igv.smk | 154 ++++++---- 4 files changed, 215 insertions(+), 278 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index e53b9990..f353490f 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -2,19 +2,17 @@ lcr-modules: igv: - # TODO: Update the list of available wildcards, if applicable inputs: - # Available wildcards: {seq_type} {genome_build} {sample_id} - master_maf: # MAFs to draw sample_id and variants from - - regions_file: "__UPDATE__" # Path to a MAF, VCF, BED or BEDPE file containing regions of interest to create snapshots of. + # Available wildcards: {seq_type} {tumour_sample_id} {normal_sample_id} {pair_status} {genome_build} + maf: "__UPDATE__" + regions_file: "__UPDATE__" # Path to a MAF, VCF, BED, OncodriveCLUSTL clusters file, HotMAPS results file, or mutation_id file containing regions of interest to snapshot. regions_format: "__UPDATE__" - regions_build: "__UPDATE__" # Genome build of regions file, will be lifted over to filter MAFs on opposite genome builds + regions_build: "__UPDATE__" # Genome build of regions file, which will be lifted over as needed to filter MAFs on opposite genome builds - test_run: False # Create only the filtered MAF, to get an estimate of how many snapshots will be taken + test_run: True # Stop after MAF filtering step to get an estimate of how many snapshots will be taken filter_maf: - oncodriveclustl_options: # These parameters will filter down the OncodriveCLUSTL cluster results file. + oncodriveclustl_options: # These parameters will filter the OncodriveCLUSTL cluster results file. p_value: # Desired q-value of OncodriveCLUSTL clusters score: # Desired scores of OncodriveCLUSTL clusters n_samples: # Desired number of samples in OncodriveCLUSTL clusters @@ -30,12 +28,14 @@ lcr-modules: hg38: "genomes/hg38/chains/grch38/hg38ToHg19.over.chain" target_reference: grch37: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh37-lite/Sequence/WholeGenomeFasta/genome.fa" - hg38: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh37-lite/Sequence/WholeGenomeFasta/genome.fa" + hg38: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh38/Sequence/WholeGenomeFasta/genome.fa" generate_batch_script: + temp: True # Create temporary batch files + image_format: ".svg" padding: 300 max_height: 400 - batch_script_per_sample: # True/False. Default is False. If True, a batch script will be created to accompany each snapshot for easy access in IGV. 
+ igv_options: [] scripts: format_regions: "etc/format_regions.py" @@ -63,10 +63,10 @@ lcr-modules: pairing_config: genome: - run_paired_tumours: False - run_unpaired_tumours_with: "no_normal" + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" run_paired_tumours_as_unpaired: False capture: - run_paired_tumours: False - run_unpaired_tumours_with: "no_normal" + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" run_paired_tumours_as_unpaired: False diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 40723ad2..dd5e03ef 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -5,7 +5,7 @@ import pandas as pd import oncopipe as op -def filter_by_bed(maf, regions, metadata): +def filter_by_bed(maf, regions): # Remove row containing column names regions = regions[regions[0].str.contains("chrom")==False] @@ -20,7 +20,7 @@ def filter_by_bed(maf, regions, metadata): filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])] return filtered_maf -def filter_by_maf(maf, regions, metadata): +def filter_by_maf(maf, regions): # Create common column by which to subset MAF for df in [maf, regions]: @@ -31,7 +31,7 @@ def filter_by_maf(maf, regions, metadata): filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])] return filtered_maf -def maf_filter(maf, regions, regions_format, metadata, genome_build, seq_type, genome_map): +def maf_filter(maf, regions, regions_format): # Read input MAF and regions file as dataframes maf_df = pd.read_table(maf, comment="#", sep="\t") @@ -40,19 +40,16 @@ def maf_filter(maf, regions, regions_format, metadata, genome_build, seq_type, g else: regions_df = pd.read_table(regions, comment="#", sep="\t", header=None) - # Select rows in MAF containing correct seq_type and build - metadata = op.filter_samples(metadata, seq_type=seq_type) - genome_build_list = genome_map[genome_build] - metadata = op.filter_samples(metadata, genome_build=genome_build_list) - - maf_df = maf_df[maf_df["Tumor_Sample_Barcode"].isin(metadata.sample_id)] + # Return empty dataframe without filtering if df is empty + if len(maf_df)==0: + return maf_df filter_functions = { "maf": filter_by_maf, "bed": filter_by_bed } - return filter_functions[regions_format](maf_df, regions_df, metadata) + return filter_functions[regions_format](maf_df, regions_df) def maf_reduce_snapshots(maf, snapshots): # Only include max of number of snapshots for each variant @@ -70,20 +67,9 @@ def write_output(maf, outfile): if regions_format == "oncodriveclustl": # This should act as a global variable - CLUSTL_PARAMS = snakemake.params[5] - -n_snapshots = snakemake.params[6] - -# Metadata file or dataframe mapping sample_ids to bam file paths -metadata = snakemake.params[1] -if not isinstance(metadata, pd.DataFrame): - metadata = pd.read_table(metadata, comment="#", sep="\t") - -maf_genome_build = snakemake.params[2] -maf_seq_type = snakemake.params[3] + CLUSTL_PARAMS = snakemake.params[1] -# Dictionary of genome builds present in the MAF to label as grch37 / hg38 -genome_map = snakemake.params[4] +n_snapshots = snakemake.params[2] output_file = snakemake.output[0] @@ -92,11 +78,7 @@ def write_output(maf, outfile): filtered_maf = maf_filter( maf=maf_file, regions=regions_file, - regions_format=regions_format, - metadata=metadata, - genome_build=maf_genome_build, - seq_type=maf_seq_type, - genome_map=genome_map + regions_format=regions_format ) filtered_maf = 
maf_reduce_snapshots(maf=filtered_maf, snapshots=n_snapshots) diff --git a/modules/igv/1.0/etc/generate_batch_scripts.py b/modules/igv/1.0/etc/generate_batch_scripts.py index 6fa13656..26fe00b8 100755 --- a/modules/igv/1.0/etc/generate_batch_scripts.py +++ b/modules/igv/1.0/etc/generate_batch_scripts.py @@ -9,146 +9,62 @@ import math def main(): - # Parse arguments - #args = parse_arguments() - # Read MAF file containing variants and create a dataframe linking regions, sample_ids, and bam paths - #regions = get_regions_df( - # args.input_maf, - # metadata=args.metadata, - # seq_type=args.seq_type, - # padding=args.padding) - - # Format and output the batch script - #generate_igv_batch( - # regions = regions, - # output = args.output, - # max_height = args.max_height, - # seq_type = args.seq_type, - # genome_build = args.genome_build, - # snapshot_dir=args.snapshot_dir) + input_bam = snakemake.input[0] + input_bai = snakemake.input[1] + input_maf = open(snakemake.input[2], "r") + outfile = open(snakemake.output[0], "w") - input_maf = open(snakemake.input[0], "r") + # Skip sample if no variants in filtered MAF file + line_count = 0 + for line in input_maf: + line_count += 1 + if line_count > 1: + break + if line_count < 2: + line = generate_igv_batch_footer() + output_lines(line, outfile) + input_maf.close() + outfile.close() + exit() + + # Return to top of MAF + input_maf.seek(0) # Read MAF file containing variants and create a dataframe linking regions, sample_ids, and bam paths regions = get_regions_df( input_maf, - metadata=snakemake.params[0], - seq_type=snakemake.params[3], - padding=snakemake.params[4] + seq_type=snakemake.params[2], + padding=snakemake.params[3] ) input_maf.close() - outfile = open(snakemake.output[0], "w") - # Format and output the batch script generate_igv_batch( + bam = input_bam, + bai = input_bai, regions = regions, output = outfile, - max_height = snakemake.params[5], - seq_type = snakemake.params[3], - genome_build = snakemake.params[2], - snapshot_dir = snakemake.params[1] + max_height = snakemake.params[4], + seq_type = snakemake.params[2], + genome_build = snakemake.params[1], + snapshot_dir = snakemake.params[0], + igv_options = snakemake.params[5], + image_format = snakemake.params[6] ) outfile.close() - #close_files(args) - -def parse_arguments(): - parser = argparse.ArgumentParser(description=__doc__) - - parser.add_argument( - "input_maf", - type=argparse.FileType("r"), - default="-", - help=f"Input MAF. Can be '-' for stdin" - ) - - parser.add_argument( - "--output", - "-o", - metavar="OUTPUT_FILE", - type=argparse.FileType("w"), - help="Output IGV batch script." - ) - - parser.add_argument( - "--metadata", - "-v", - metavar="METADATA", - type=argparse.FileType("r"), - help="Metadata mapping sample IDs to BAM paths" - ) - - default_padding = 300 - parser.add_argument( - "--padding", - "-p", - type=int, - default=default_padding, - help=( - "Amount of padding added before and after each locus. " - f"Default padding is {default_padding}" - ), - ) - - default_max_height = 400 - parser.add_argument( - "--max_height", - "-m", - type=int, - default=default_max_height, - help="Maximum panel height in IGV. Default max height is {default_max_height}" - ) - - parser.add_argument( - "--snapshot_dir", - "-d", - required=True, - help=( - "Parent directory where {chromosome}/{region} subdirectories will be " - "populated with IGV snapshots." 
- ) - ) - - parser.add_argument( - "--genome_build", - "-g", - required=True, - help="Specify IGV genome build for snapshots." - ) - - parser.add_argument( - "--seq_type", - "-s", - required=True, - type=str, - help="Specify sequencing type for BAM extraction." - ) - - args = parser.parse_args() - - return args - -def get_regions_df(input_maf, metadata, seq_type, padding): +def get_regions_df(input_maf, seq_type, padding): # Read MAF as dataframe - maf = pd.read_table(input_maf, comment="#") - - # Metadata should already be a dataframe - assert isinstance(metadata, pd.DataFrame), "Metadata is not in Pandas dataframe format." - - # Filter metadata down to only samples of required seq_type - metadata = metadata[metadata["seq_type"]==seq_type] - metadata = metadata[["sample_id","link_name"]] + maf = pd.read_table(input_maf, comment="#", sep="\t") # Make sure required minimum columns are present in the maf columns = [ "Chromosome", "Start_Position", "End_Position", - "Tumor_Sample_Barcode", ] assert(all(c in list(maf.columns) for c in columns)), ( @@ -168,7 +84,6 @@ def get_regions_df(input_maf, metadata, seq_type, padding): ) # Create a pandas dataframe with to link regions with sample_ids and bam files - chrom = (maf["Chromosome"].astype(str)).apply(lambda x: x.replace("chr","")) # Snapshots will be held in parent directories of 1000-nt intervals for easier navigation @@ -188,64 +103,103 @@ def get_regions_df(input_maf, metadata, seq_type, padding): "sample_id": maf.Tumor_Sample_Barcode, } ) - - # Link bam paths to regions by merging metadata and regions dataframes by sample_id - regions_df = pd.merge(regions_df, metadata, on="sample_id", how="left") return regions_df -def generate_igv_batch_header(bam_file, max_height, snapshot_dir, genome_build): +def generate_igv_batch_header(bam_file, index_file, max_height, genome_build): lines = [] + genome_build = genome_build.replace("grch37","hg19").replace("grch38","hg38") + bam_file = os.path.realpath(bam_file) lines.append(f"load {bam_file}") + bai_file = os.path.realpath(index_file) + lines.append(f"index={bai_file}") + lines.append(f"maxPanelHeight {max_height}") - lines.append(f"snapshotDirectory {snapshot_dir}") lines.append(f"genome {genome_build}") return lines -def generate_igv_batch_per_row(regions, snapshot_filename): +def generate_igv_batch_per_row(regions, snapshot_filename, igv_options): lines = [] lines.append(f"goto {regions}") lines.append("sort") lines.append("collapse") + for option in igv_options: + lines.append(option) lines.append(f"snapshot {snapshot_filename}") - lines.append("new") return lines -def generate_igv_batch_per_region(regions, max_height, genome_build, snapshot_dir): +def generate_igv_batch(bam, bai, regions, output, max_height, seq_type, genome_build, snapshot_dir, igv_options, image_format): + + # Lines for batch script encompassing all regions and sample_ids + all_lines = [] + + header = generate_igv_batch_header( + bam, bai, max_height, genome_build + ) + + all_lines.extend(header) + + for dir_region in regions.dir_regions.unique(): + regions_in_dir = regions[regions["dir_regions"]==dir_region] + + lines = generate_igv_batch_per_region( + regions=regions_in_dir, + max_height=max_height, + seq_type=seq_type, + genome_build=genome_build, + snapshot_dir=snapshot_dir, + options=igv_options, + image_format=image_format + ) + + if lines is not None: + all_lines.extend(lines) + + footer = generate_igv_batch_footer() + all_lines.extend(footer) + + output_lines(all_lines, output) + + +def 
generate_igv_batch_per_region(regions, max_height, seq_type, genome_build, snapshot_dir, options, image_format): - # Lines of batch script + # Batch script lines lines = [] - # Add lines to batch script for each region + # Set up snapshot directory string + dir_chrom = regions.dir_regions.unique()[0].split(":")[0] + dir_interval = regions.dir_regions.unique()[0].split(":")[1] + seq_type_build = f"{seq_type}--{genome_build}" + + # Add snapshot directory line to batch script + snapshot_regions_dir = os.path.join(snapshot_dir, seq_type_build, dir_chrom, dir_interval, "") + lines.append(f"snapshotDirectory {snapshot_regions_dir}") + + # Add lines to batch script for each sample for _, row in regions.iterrows(): + # Add components of filename as a list filename = [] filename.append(row.regions) + + # Include gene name if available if "region_name" in row: filename.append(row.region_name) filename.append(row.sample_id) - filename = "--".join(filename) + ".png" - filename = filename.replace(" ", "_") - - bam_file = row.link_name - - genome_build = genome_build.replace("grch37","hg19").replace("grch38","hg38") + if not image_format.startswith("."): + image_format = "." + image_format - header = generate_igv_batch_header( - bam_file, max_height, snapshot_dir, genome_build - ) - lines.extend(header) + filename = "--".join(filename) + image_format - row_lines = generate_igv_batch_per_row(regions = row.regions, snapshot_filename = filename) + row_lines = generate_igv_batch_per_row(regions = row.regions, snapshot_filename = filename, igv_options = options) lines.extend(row_lines) - return lines def close_files(args): @@ -264,49 +218,6 @@ def output_lines(lines, output): text = "\n".join(lines) output.write(text) -def generate_igv_batch(regions, output, max_height, seq_type, genome_build, snapshot_dir): - - # The lines for the batch script encompassing all regions and sample_ids - all_lines = [] - - # Create batch scripts per unique 1000-nt interval region - dir_regions = regions.dir_regions.unique() - - for dir_region in dir_regions: - - # Get chromosome string for parent directory - dir_chrom = dir_region.split(":")[0] - # Get 1000nt interval region for subdirectory - dir_interval = dir_region.split(":")[1] - - seq_type_build = f"{seq_type}--{genome_build}" - - region_snapshot_dir = os.path.join(snapshot_dir, seq_type_build, dir_chrom, dir_interval, "") - - # Subset all regions down to those in the 1000nt interval - regions_in_dir = regions[regions["dir_regions"]==dir_region] - - # Iterate through unique regions within interval - for unique_region in regions_in_dir.regions.unique(): - - # Subset rows by number of snapshots desired per region - regions_subset = regions_in_dir[regions_in_dir["regions"]==unique_region] - - # Generate lines of batch script per region subset - lines = generate_igv_batch_per_region( - regions = regions_subset, - max_height=max_height, - genome_build=genome_build, - snapshot_dir=region_snapshot_dir) - - if lines is not None: - all_lines.extend(lines) - - footer = generate_igv_batch_footer() - all_lines.extend(footer) - - output_lines(all_lines, output) - if __name__ == "__main__": main() diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index aa1218d1..b8a4f8b0 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -38,22 +38,24 @@ if version.parse(current_version) < version.parse(min_oncopipe_version): CFG = op.setup_module( name = "igv", version = "1.0", - subdirectories = ["inputs", "batch_scripts", "igv", "snapshots", "outputs"], + 
subdirectories = ["inputs", "maf_filtered", "regions_lifted", "batch_scripts", "igv", "snapshots", "outputs"], ) # Rename genome builds in metadata to match up with MAFs? -CFG["samples"]["genome_build"].mask(CFG["samples"]["genome_build"].isin(CFG["genome_map"]["grch37"]), "grch37", inplace=True) -CFG["samples"]["genome_build"].mask(CFG["samples"]["genome_build"].isin(CFG["genome_map"]["hg38"]), "hg38", inplace=True) +CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["genome_map"]["grch37"]), "grch37", inplace=True) +CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["genome_map"]["hg38"]), "hg38", inplace=True) # Define rules to be run locally when using a compute cluster -# TODO: Replace with actual rules once you change the rule names localrules: _igv_symlink_regions_file, - _igv_symlink_metadata, + _igv_symlink_bam, + _igv_symlink_bai, _igv_symlink_maf, + _igv_reduce_maf_cols, _igv_format_regions_file, _igv_liftover_regions, + _igv_filter_maf, _igv_create_batch_script, _igv_download_igv, _igv_run @@ -63,7 +65,15 @@ localrules: def get_bams(wildcards): metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{sample_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + return expand("data/{{seq_type}}_bams/{{tumour_sample_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.tumour_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + +def get_bai(wildcards): + metadata = config["lcr-modules"]["igv"]["samples"] + return expand("data/{{seq_type}}_bams/{{tumour_sample_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.tumour_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + +def get_maf(wildcards): + unix_group = config["unix_group"] + return expand(config["lcr-modules"]["igv"]["inputs"]["maf"], allow_missing=True, unix_group=unix_group) @@ -79,27 +89,36 @@ rule _igv_symlink_regions_file: run: op.absolute_symlink(input.regions_file, output.regions_file) -rule _igv_symlink_bams: +rule _igv_symlink_bam: input: bam = get_bams output: - bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{sample_id}.bam" + bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{tumour_sample_id}.bam" run: op.absolute_symlink(input.bam, output.bam) +rule _igv_symlink_bai: + input: + bai = get_bai + output: + bai = CFG["dirs"]["inputs"] + "bams/{seq_type}/{tumour_sample_id}.bam.bai" + run: + op.absolute_symlink(input.bai, output.bai) + rule _igv_symlink_maf: input: - maf = CFG["inputs"]["master_maf"] + maf = get_maf output: - maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}.maf" + maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf" run: op.absolute_symlink(input.maf, output.maf) +# Filter to essential columns to prevent errors in parsing with pandas rule _igv_reduce_maf_cols: input: maf = str(rules._igv_symlink_maf.output.maf) output: - maf = temp(CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}_cols.maf") + maf = temp(CFG["dirs"]["inputs"] + "maf/temp/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf") shell: op.as_one_line(""" cut -f 1,5,6,7,9,10,11,13,16 {input.maf} > {output.maf} @@ -130,7 +149,7 @@ rule _igv_liftover_regions: regions = str(rules._igv_format_regions_file.output.regions), 
liftover_script = CFG["scripts"]["region_liftover_script"] output: - regions_lifted = CFG["dirs"]["inputs"] + "regions/regions_file_{genome_build}.txt" + regions = CFG["dirs"]["regions_lifted"] + "regions_file_{genome_build}.txt" params: chain_file = reference_files(CFG["liftover_regions"]["reference_chain_file"][(CFG["inputs"]["regions_build"]).replace("hg19","grch37").replace("grch38","hg38")]), target_reference = lambda w: config["lcr-modules"]["igv"]["liftover_regions"]["target_reference"][w.genome_build], @@ -154,35 +173,55 @@ rule _igv_liftover_regions: rule _igv_filter_maf: input: maf = str(rules._igv_reduce_maf_cols.output.maf), - regions = str(rules._igv_liftover_regions.output.regions_lifted) + regions = str(rules._igv_liftover_regions.output.regions) output: - maf_filtered = config["lcr-modules"]["igv"]["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}_cols_filtered.maf" + maf = CFG["dirs"]["maf_filtered"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf" params: - regions_format = REGIONS_FORMAT[config["lcr-modules"]["igv"]["inputs"]["regions_format"].lower()], - metadata = config["lcr-modules"]["igv"]["samples"], - genome_build = lambda w: w.genome_build, - seq_type = lambda w: w.seq_type, - genome_map = config["lcr-modules"]["igv"]["genome_map"], - oncodriveclustl_params = config["lcr-modules"]["igv"]["filter_maf"]["oncodriveclustl_options"], - n_snapshots = config["lcr-modules"]["igv"]["filter_maf"]["n_snapshots"] if config["lcr-modules"]["igv"]["filter_maf"]["n_snapshots"] is not None else 1000000 + regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], + oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], + n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000 script: config["lcr-modules"]["igv"]["scripts"]["filter_script"] - -# Pass filtered MAF to create batch script -rule _igv_create_batch_script: - input: - maf_filtered = str(rules._igv_filter_maf.output.maf_filtered) - output: - batch_script = temp(config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}.batch") - params: - metadata = config["lcr-modules"]["igv"]["samples"], - snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], - genome_build = lambda w: w.genome_build, - seq_type = lambda w: w.seq_type, - padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], - max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"] - script: - config["lcr-modules"]["igv"]["scripts"]["batch_script"] + +if CFG["generate_batch_script"]["temp"] == False: + rule _igv_create_batch_script: + input: + bam_file = str(rules._igv_symlink_bam.output.bam), + bai_file = str(rules._igv_symlink_bai.output.bai), + maf_filtered = str(rules._igv_filter_maf.output.maf) + output: + batch_script = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.batch" + params: + snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], + genome_build = lambda w: w.genome_build, + seq_type = lambda w: w.seq_type, + padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], + max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"], + igv_options = config["lcr-modules"]["igv"]["generate_batch_script"]["igv_options"], + image_format = config["lcr-modules"]["igv"]["generate_batch_script"]["image_format"] + 
wildcard_constraints: genome_build='[a-zA-Z0-9]+' + script: + config["lcr-modules"]["igv"]["scripts"]["batch_script"] + +elif CFG["generate_batch_script"]["temp"] == True: + rule _igv_create_batch_script: + input: + bam_file = str(rules._igv_symlink_bam.output.bam), + bai_file = str(rules._igv_symlink_bai.output.bai), + maf_filtered = str(rules._igv_filter_maf.output.maf) + output: + batch_script = temp(config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.batch") + params: + snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], + genome_build = lambda w: w.genome_build, + seq_type = lambda w: w.seq_type, + padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], + max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"], + igv_options = config["lcr-modules"]["igv"]["generate_batch_script"]["igv_options"], + image_format = config["lcr-modules"]["igv"]["generate_batch_script"]["image_format"] + wildcard_constraints: genome_build='[a-zA-Z0-9]+' + script: + config["lcr-modules"]["igv"]["scripts"]["batch_script"] rule _igv_download_igv: output: @@ -200,30 +239,25 @@ rule _igv_download_igv: touch {output.igv_installed} """) -rule _igv_track_samples: - input: - bam = str(rules._igv_symlink_bams.output.bam) - output: - finished = CFG["dirs"]["outputs"] + "samples/{seq_type}/{sample_id}_" + CFG["inputs"]["regions_format"] + ".track" - shell: - "touch {output.finished}" - rule _igv_run: input: batch_script = str(rules._igv_create_batch_script.output.batch_script), - igv_installed = str(rules._igv_download_igv.output.igv_installed), - sample_track = expand(str(rules._igv_track_samples.output.finished), zip, seq_type=CFG["samples"]["seq_type"], sample_id=CFG["samples"]["sample_id"]) + igv_installed = str(rules._igv_download_igv.output.igv_installed) output: - success = CFG["dirs"]["outputs"] + "snapshots/{seq_type}--{genome_build}_snapshot.finished" + success = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.finished" params: #igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" log: - stdout = CFG["logs"]["igv"] + "run_igv_{seq_type}--{genome_build}.stdout.log", - stderr = CFG["logs"]["igv"] + "run_igv_{seq_type}--{genome_build}.stderr.log" + stdout = CFG["logs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.stdout.log", + stderr = CFG["logs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.stderr.log" shell: op.as_one_line(""" - xvfb-run --auto-servernum {params.igv} -b {input.batch_script} > {log.stdout} 2> {log.stderr} && + lines=$(wc -l < {input.batch_script}) ; + if [ $lines > 1 ] ; + then + xvfb-run --auto-servernum {params.igv} -b {input.batch_script} > {log.stdout} 2> {log.stderr} ; + fi ; touch {output.success} """) @@ -231,14 +265,24 @@ rule _igv_run: if CFG["test_run"] is False: rule _igv_all: input: - expand(str(rules._igv_run.output.success), seq_type=CFG["samples"]["seq_type"], genome_build=CFG["samples"]["genome_build"]) + expand(str(rules._igv_run.output.success), + zip, + seq_type=CFG["runs"]["tumour_seq_type"], + tumour_sample_id=CFG["runs"]["tumour_sample_id"], + normal_sample_id=CFG["runs"]["normal_sample_id"], + pair_status=CFG["runs"]["pair_status"], + genome_build=CFG["runs"]["tumour_genome_build"]) if 
CFG["test_run"] is True: rule _igv_all: input: - expand(rules._igv_filter_maf.output.maf_filtered, seq_type=CFG["samples"]["seq_type"], genome_build=CFG["samples"]["seq_type"]) - - + expand(rules._igv_filter_maf.output.maf, + zip, + seq_type=CFG["runs"]["tumour_seq_type"], + tumour_sample_id=CFG["runs"]["tumour_sample_id"], + normal_sample_id=CFG["runs"]["normal_sample_id"], + pair_status=CFG["runs"]["pair_status"], + genome_build=CFG["runs"]["tumour_genome_build"]) ##### CLEANUP ##### From 31cba38bb456c5c6c3e6cf97fbe34bcdde699568 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Sun, 26 Feb 2023 00:41:19 -0800 Subject: [PATCH 016/132] Add capability for VCF files as regions --- modules/igv/1.0/config/default.yaml | 2 +- modules/igv/1.0/envs/format_regions.yaml | 38 +++++++++ modules/igv/1.0/etc/format_regions.py | 98 ++++++++++++++++-------- modules/igv/1.0/igv.smk | 2 + 4 files changed, 109 insertions(+), 31 deletions(-) create mode 100644 modules/igv/1.0/envs/format_regions.yaml diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index f353490f..cfed784b 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -51,7 +51,7 @@ lcr-modules: conda_envs: liftover_regions: "{MODSDIR}/envs/crossmap.yaml" - batch_script: "{MODSDIR}/envs/samtools-1.9.yaml" + format_regions: "{MODSDIR}/envs/format_regions.yaml" wget: "{REPODIR}/envs/wget/wget-1.20.1.yaml" threads: diff --git a/modules/igv/1.0/envs/format_regions.yaml b/modules/igv/1.0/envs/format_regions.yaml new file mode 100644 index 00000000..cd9e0993 --- /dev/null +++ b/modules/igv/1.0/envs/format_regions.yaml @@ -0,0 +1,38 @@ +name: test-generate_igv_batch +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - ca-certificates=2020.4.5.1 + - certifi=2020.4.5.1 + - ld_impl_linux-64=2.34 + - libblas=3.8.0 + - libcblas=3.8.0 + - libffi=3.2.1 + - libgcc-ng=9.2.0 + - libgfortran-ng=7.3.0 + - liblapack=3.8.0 + - libopenblas=0.3.9 + - libstdcxx-ng=9.2.0 + - llvm-openmp=10.0.0 + - ncurses=6.1 + - numpy=1.18.1 + - openssl=1.1.1f + - pandas=1.0.3 + - pip=20.0.2 + - python=3.8.2 + - python-dateutil=2.8.1 + - python_abi=3.8 + - pytz=2019.3 + - pyvcf=0.6.8 + - readline=8.0 + - setuptools=46.1.3 + - six=1.14.0 + - sqlite=3.30.1 + - tk=8.6.10 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 +prefix: /home/mcruz/miniconda3/envs/test-generate_igv_batch diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index f3aa954b..d1d68bee 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -3,29 +3,12 @@ import os import pandas as pd import oncopipe as op +import vcf +import shutil def format_mutation_id(mutation_id): - ## Modify dataframe to handle NA values in columns - #mutation_id = mutation_id.fillna(0) -# - ## Filter dataframe based on config option - #max_distinct_genome_cohorts = MUTATION_ID_PARAMS["max_distinct_genome_cohorts"] - #min_total_genome_samples = MUTATION_ID_PARAMS["min_total_genome_samples"] - # - #max_distinct_capture_cohorts = MUTATION_ID_PARAMS["max_distinct_capture_cohorts"] - #min_total_capture_samples = MUTATION_ID_PARAMS["min_total_capture_samples"] -# - #for key, filter_value in { - # "distinct_genome_cohorts": max_distinct_genome_cohorts, - # "total_genome": min_total_genome_samples, - # "distinct_capture_cohorts": max_distinct_capture_cohorts, - # "total_capture": min_total_capture_samples - #}.items(): - # if filter_value is not None: - # if 
key in ["distinct_genome_cohorts", "distinct_capture_cohorts"]: - # mutation_id = mutation_id[mutation_id[key] <= float(filter_value)] - # else: - # mutation_id = mutation_id[mutation_id[key] >= float(filter_value)] + # Read regions into dataframe + mutation_id = pd.read_table(mutation_id, comment="#", sep="\t") # Create columns required for liftover in BED format genomic_pos_col = f"mutation_id_{REGIONS_BUILD}" @@ -47,6 +30,9 @@ def format_mutation_id(mutation_id): return mutation_id_reformatted def format_hotmaps(hotmaps_regions): + # Read regions into dataframe + hotmaps_regions = pd.read_table(hotmaps_regions, comment="#", sep="\t") + # Convert HotMAPS coordinates to BED format hotmaps_regions["chr_std"] = hotmaps_regions.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) @@ -62,6 +48,9 @@ def format_hotmaps(hotmaps_regions): return hotmaps_reformatted def format_clustl(clustl_regions): + # Read regions into dataframe + clustl_regions = pd.read_table(clustl_regions, comment="#", sep="\t") + p_filter = CLUSTL_PARAMS["p_value"] score_filter = CLUSTL_PARAMS["score"] n_samples_filter = CLUSTL_PARAMS["n_samples"] @@ -99,16 +88,63 @@ def format_clustl(clustl_regions): ) return clustl_reformatted -def format_maf(regions): - # If the regions format is a MAF, don't need to reformat for liftover - return regions +def format_vcf(regions): + # Load VCF file + vcf_reader = vcf.Reader(open(regions, "rb")) + + # Convert VCF records to BED format + chroms = [] + pos = [] + events_seen = set() + + for record in vcf_reader: + if len(record.FILTER) > 0: + continue + + # Skip SVs with ID matching previous record + if record.ID in events_seen: + continue + + chromosome = "chr" + str(record.CHROM).replace("chr","") + position = record.POS + + chroms.append(chromosome) + pos.append(position) + + if record.is_sv and "END" in record.INFO: + # Add end position of SV to regions of interest + end = record.INFO["END"][0] + + chroms.append(chromosome) + pos.append(end) + + if record.is_sv and record.INFO["SVTYPE"] == "BND": + # Add end position of SV to regions of interest + chromosome = "chr" + str(record.ALT[0].chr).replace("chr","") + position = record.ALT[0].pos + + chroms.append(chromosome) + pos.append(position) + + # To skip mate event in VCF file + events_seen.add(record.INFO["MATEID"]) + + vcf_reformatted = pd.DataFrame( + { + "chrom": chroms, + "start": pos, + "end": pos + } + ) + + return vcf_reformatted def format_regions(regions, regions_format): format_functions = { - "maf": format_maf, "oncodriveclustl": format_clustl, "hotmaps": format_hotmaps, - "mutation_id": format_mutation_id + "mutation_id": format_mutation_id, + "vcf": format_vcf, } return format_functions[regions_format](regions) @@ -125,11 +161,13 @@ def format_regions(regions, regions_format): REGIONS_BUILD = snakemake.params[2] REGIONS_BUILD = REGIONS_BUILD.lower() -# Read regions into dataframe -regions_df = pd.read_table(regions_file, comment="#", sep="\t") +if regions_format == "bed" or regions_format == "maf": + # Do not need to reformat for liftover + shutil.copy(regions_file, output_file) + exit() # Reformat for liftover based on regions format -regions_formatted = format_regions(regions_df, regions_format) +regions_formatted = format_regions(regions_file, regions_format) # Output regions file -regions_formatted.to_csv(output_file, sep="\t", index=False) \ No newline at end of file +regions_formatted.to_csv(output_file, sep="\t", index=False) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 
b8a4f8b0..2c0c26db 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -134,6 +134,8 @@ rule _igv_format_regions_file: regions_format = config["lcr-modules"]["igv"]["inputs"]["regions_format"], oncodriveclustl_params = config["lcr-modules"]["igv"]["filter_maf"]["oncodriveclustl_options"], regions_build = config["lcr-modules"]["igv"]["inputs"]["regions_build"] + conda: + CFG["conda_envs"]["format_regions"] script: config["lcr-modules"]["igv"]["scripts"]["format_regions"] From 77f9d76cc10b153f04e99492469c186021e2f0b3 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 28 Feb 2023 12:53:18 -0800 Subject: [PATCH 017/132] Allow filtered MAF files to be temp --- modules/igv/1.0/config/default.yaml | 3 ++ modules/igv/1.0/igv.smk | 49 ++++++++++++++++++++--------- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index cfed784b..84a36ea8 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -11,6 +11,8 @@ lcr-modules: test_run: True # Stop after MAF filtering step to get an estimate of how many snapshots will be taken + view_as_pairs: False # Toggle pairwise orientation in IGV + filter_maf: oncodriveclustl_options: # These parameters will filter the OncodriveCLUSTL cluster results file. p_value: # Desired q-value of OncodriveCLUSTL clusters @@ -35,6 +37,7 @@ lcr-modules: image_format: ".svg" padding: 300 max_height: 400 + # Available batch script options: https://github.com/igvteam/igv/wiki/Batch-commands igv_options: [] scripts: diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 2c0c26db..139601dd 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -118,7 +118,7 @@ rule _igv_reduce_maf_cols: input: maf = str(rules._igv_symlink_maf.output.maf) output: - maf = temp(CFG["dirs"]["inputs"] + "maf/temp/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf") + maf = temp(CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf.temp") shell: op.as_one_line(""" cut -f 1,5,6,7,9,10,11,13,16 {input.maf} > {output.maf} @@ -171,19 +171,35 @@ rule _igv_liftover_regions: {params.target_reference} > {log.stdout} 2> {log.stderr} """) -# Pass metadata as a pandas dataframe directly from the samples value specified in config -rule _igv_filter_maf: - input: - maf = str(rules._igv_reduce_maf_cols.output.maf), - regions = str(rules._igv_liftover_regions.output.regions) - output: - maf = CFG["dirs"]["maf_filtered"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf" - params: - regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], - oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], - n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000 - script: - config["lcr-modules"]["igv"]["scripts"]["filter_script"] +if CFG["test_run"] == False: + # Pass metadata as a pandas dataframe directly from the samples value specified in config + rule _igv_filter_maf: + input: + maf = str(rules._igv_reduce_maf_cols.output.maf), + regions = str(rules._igv_liftover_regions.output.regions) + output: + maf = temp(CFG["dirs"]["maf_filtered"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf") + params: + regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], + oncodriveclustl_params = 
CFG["filter_maf"]["oncodriveclustl_options"], + n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000 + script: + config["lcr-modules"]["igv"]["scripts"]["filter_script"] + +elif CFG["test_run"] == True: + # Pass metadata as a pandas dataframe directly from the samples value specified in config + rule _igv_filter_maf: + input: + maf = str(rules._igv_reduce_maf_cols.output.maf), + regions = str(rules._igv_liftover_regions.output.regions) + output: + maf = CFG["dirs"]["maf_filtered"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf" + params: + regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], + oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], + n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000 + script: + config["lcr-modules"]["igv"]["scripts"]["filter_script"] if CFG["generate_batch_script"]["temp"] == False: rule _igv_create_batch_script: @@ -241,12 +257,15 @@ rule _igv_download_igv: touch {output.igv_installed} """) +# Add suffix for running as pairs +RUN_SUFFIX = ".pairs" if CFG["view_as_pairs"] is True else "" + rule _igv_run: input: batch_script = str(rules._igv_create_batch_script.output.batch_script), igv_installed = str(rules._igv_download_igv.output.igv_installed) output: - success = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.finished" + success = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}" + RUN_SUFFIX + ".finished" params: #igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" From 0bf93ade26b27eb5dce6ec7247202f7810898b95 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 2 Mar 2023 10:42:42 -0800 Subject: [PATCH 018/132] Track what snapshots will be created (draft) --- modules/igv/1.0/etc/filter_maf.py | 38 +++- modules/igv/1.0/etc/generate_batch_scripts.py | 54 +++--- modules/igv/1.0/igv.smk | 166 +++++++++++------- 3 files changed, 156 insertions(+), 102 deletions(-) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index dd5e03ef..3b548054 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -25,31 +25,29 @@ def filter_by_maf(maf, regions): # Create common column by which to subset MAF for df in [maf, regions]: df["chr_std"] = df.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) - df["genomic_pos_std"] = df["chr_std"] + ":" + df["Start_Position"].map(str) + "_" + df["End_Position"].map(str) + df["genomic_pos_std"] = df["chr_std"] + ":" + df["Start_Position"].map(str) # Subset the MAF filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])] return filtered_maf def maf_filter(maf, regions, regions_format): - # Read input MAF and regions file as dataframes - maf_df = pd.read_table(maf, comment="#", sep="\t") - + if regions_format != "bed": regions_df = pd.read_table(regions, comment="#", sep="\t") else: regions_df = pd.read_table(regions, comment="#", sep="\t", header=None) # Return empty dataframe without filtering if df is empty - if len(maf_df)==0: - return maf_df + if len(maf)==0: + return maf filter_functions = { "maf": filter_by_maf, "bed": filter_by_bed } - return filter_functions[regions_format](maf_df, regions_df) + return filter_functions[regions_format](maf, regions_df) 
def maf_reduce_snapshots(maf, snapshots): # Only include max of number of snapshots for each variant @@ -57,6 +55,26 @@ def maf_reduce_snapshots(maf, snapshots): return maf +def maf_add_columns(maf, metadata): + # Read input MAF as df + maf = pd.read_table(maf, comment="#", sep="\t") + + sample_id = maf["Tumor_Sample_Barcode"].unique()[0] + + row = metadata[metadata["tumour_sample_id"]==sample_id] + + seq_type = row["tumour_seq_type"].item() + genome_build = row["tumour_genome_build"].item() + normal_sample_id = row["normal_sample_id"].item() + pair_status = row["pair_status"].item() + + maf["seq_type"] = seq_type + maf["genome_build"] = genome_build + maf["normal_sample_id"] = normal_sample_id + maf["pair_status"] = pair_status + + return maf + def write_output(maf, outfile): maf.to_csv(outfile, sep="\t", index=False) @@ -65,6 +83,8 @@ def write_output(maf, outfile): regions_file = snakemake.input[1] regions_format = snakemake.params[0] +metadata = snakemake.params[3] + if regions_format == "oncodriveclustl": # This should act as a global variable CLUSTL_PARAMS = snakemake.params[1] @@ -73,10 +93,12 @@ def write_output(maf, outfile): output_file = snakemake.output[0] +maf = maf_add_columns(maf=maf_file, metadata=metadata) + # Peform filtering filtered_maf = maf_filter( - maf=maf_file, + maf=maf, regions=regions_file, regions_format=regions_format ) diff --git a/modules/igv/1.0/etc/generate_batch_scripts.py b/modules/igv/1.0/etc/generate_batch_scripts.py index 26fe00b8..207e6ff6 100755 --- a/modules/igv/1.0/etc/generate_batch_scripts.py +++ b/modules/igv/1.0/etc/generate_batch_scripts.py @@ -65,6 +65,7 @@ def get_regions_df(input_maf, seq_type, padding): "Chromosome", "Start_Position", "End_Position", + "Hugo_Symbol", ] assert(all(c in list(maf.columns) for c in columns)), ( @@ -86,21 +87,20 @@ def get_regions_df(input_maf, seq_type, padding): # Create a pandas dataframe with to link regions with sample_ids and bam files chrom = (maf["Chromosome"].astype(str)).apply(lambda x: x.replace("chr","")) - # Snapshots will be held in parent directories of 1000-nt intervals for easier navigation - dir_start = ((maf["Start_Position"] / 1000).apply(lambda x: math.trunc(x)) * 1000).astype(str) - dir_end = (dir_start.astype(int) + 1000).astype(str) - dir_regions = "chr" + chrom + ":" + dir_start + "_" + dir_end - - # Specify the regions that will be captured by IGV based on variant positions and padding - region_start = (maf["Start_Position"] - padding).astype(str) - region_end = (maf["End_Position"] + padding).astype(str) - regions = "chr" + chrom + ":" + region_start + "-" + region_end + # Specify the regions that will be captured by IGV based on variant positions + region_position = (maf["Start_Position"]).astype(str) + snapshot_start = (maf["Start_Position"] - padding).astype(str) + snapshot_end = (maf["End_Position"] + padding).astype(str) + snapshot_coordinates = "chr" + chrom + ":" + snapshot_start + "-" + snapshot_end + regions = "chr" + chrom + ":" + region_position regions_df = pd.DataFrame( - {"dir_regions": dir_regions, - "regions": regions, + {"chromosome": "chr" + chrom, + "region": regions, "region_name": maf.Hugo_Symbol, "sample_id": maf.Tumor_Sample_Barcode, + "snapshot_coordinates": snapshot_coordinates, + "padding": padding } ) @@ -122,9 +122,9 @@ def generate_igv_batch_header(bam_file, index_file, max_height, genome_build): return lines -def generate_igv_batch_per_row(regions, snapshot_filename, igv_options): +def generate_igv_batch_per_row(coordinates, snapshot_filename, 
igv_options): lines = [] - lines.append(f"goto {regions}") + lines.append(f"goto {coordinates}") lines.append("sort") lines.append("collapse") for option in igv_options: @@ -144,11 +144,11 @@ def generate_igv_batch(bam, bai, regions, output, max_height, seq_type, genome_b all_lines.extend(header) - for dir_region in regions.dir_regions.unique(): - regions_in_dir = regions[regions["dir_regions"]==dir_region] + for chrom in regions.chromosome.unique(): + chrom_regions = regions[regions["chromosome"]==chrom] lines = generate_igv_batch_per_region( - regions=regions_in_dir, + regions=chrom_regions, max_height=max_height, seq_type=seq_type, genome_build=genome_build, @@ -172,12 +172,11 @@ def generate_igv_batch_per_region(regions, max_height, seq_type, genome_build, s lines = [] # Set up snapshot directory string - dir_chrom = regions.dir_regions.unique()[0].split(":")[0] - dir_interval = regions.dir_regions.unique()[0].split(":")[1] + dir_chrom = regions.chromosome.unique()[0].split(":")[0] seq_type_build = f"{seq_type}--{genome_build}" # Add snapshot directory line to batch script - snapshot_regions_dir = os.path.join(snapshot_dir, seq_type_build, dir_chrom, dir_interval, "") + snapshot_regions_dir = os.path.join(snapshot_dir, seq_type_build, dir_chrom, "") lines.append(f"snapshotDirectory {snapshot_regions_dir}") # Add lines to batch script for each sample @@ -185,19 +184,20 @@ def generate_igv_batch_per_region(regions, max_height, seq_type, genome_build, s # Add components of filename as a list filename = [] - filename.append(row.regions) + filename.append(row.region) + + filename.append(str(row.padding)) - # Include gene name if available - if "region_name" in row: - filename.append(row.region_name) + filename.append(row.region_name) + filename.append(row.sample_id) - if not image_format.startswith("."): - image_format = "." + image_format + #if not image_format.startswith("."): + # image_format = "." + image_format - filename = "--".join(filename) + image_format + filename = "--".join(filename) + ".png" - row_lines = generate_igv_batch_per_row(regions = row.regions, snapshot_filename = filename, igv_options = options) + row_lines = generate_igv_batch_per_row(coordinates = row.snapshot_coordinates, snapshot_filename = filename, igv_options = options) lines.extend(row_lines) return lines diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 139601dd..da713e0e 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -15,6 +15,11 @@ import oncopipe as op import pandas as pd +# Needed for getting snapshot paths +import os +# Needed for creating table of snapshot dirs from MAF +import math + # Check that the oncopipe dependency is up-to-date. 
Add all the following lines to any module that uses new features in oncopipe min_oncopipe_version="1.0.11" import pkg_resources @@ -58,7 +63,10 @@ localrules: _igv_filter_maf, _igv_create_batch_script, _igv_download_igv, - _igv_run + _igv_run, + _igv_symlink_snapshots, + _igv_dispatch + ##### FUNCTIONS ##### @@ -76,7 +84,6 @@ def get_maf(wildcards): return expand(config["lcr-modules"]["igv"]["inputs"]["maf"], allow_missing=True, unix_group=unix_group) - ##### RULES ##### @@ -167,7 +174,7 @@ rule _igv_liftover_regions: op.as_one_line(""" {input.liftover_script} {input.regions} {params.regions_type} {params.regions_build} {params.target_build} - {output.regions_lifted} {params.chain_file} + {output.regions} {params.chain_file} {params.target_reference} > {log.stdout} 2> {log.stderr} """) @@ -178,11 +185,12 @@ if CFG["test_run"] == False: maf = str(rules._igv_reduce_maf_cols.output.maf), regions = str(rules._igv_liftover_regions.output.regions) output: - maf = temp(CFG["dirs"]["maf_filtered"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf") + maf = CFG["dirs"]["maf_filtered"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf" params: regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], - n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000 + n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000, + metadata = CFG["runs"] script: config["lcr-modules"]["igv"]["scripts"]["filter_script"] @@ -197,49 +205,49 @@ elif CFG["test_run"] == True: params: regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], - n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000 + n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000, + metadata = CFG["runs"] script: config["lcr-modules"]["igv"]["scripts"]["filter_script"] -if CFG["generate_batch_script"]["temp"] == False: - rule _igv_create_batch_script: - input: - bam_file = str(rules._igv_symlink_bam.output.bam), - bai_file = str(rules._igv_symlink_bai.output.bai), - maf_filtered = str(rules._igv_filter_maf.output.maf) - output: - batch_script = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.batch" - params: - snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], - genome_build = lambda w: w.genome_build, - seq_type = lambda w: w.seq_type, - padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], - max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"], - igv_options = config["lcr-modules"]["igv"]["generate_batch_script"]["igv_options"], - image_format = config["lcr-modules"]["igv"]["generate_batch_script"]["image_format"] - wildcard_constraints: genome_build='[a-zA-Z0-9]+' - script: - config["lcr-modules"]["igv"]["scripts"]["batch_script"] - -elif CFG["generate_batch_script"]["temp"] == True: - rule _igv_create_batch_script: - input: - bam_file = str(rules._igv_symlink_bam.output.bam), - bai_file = str(rules._igv_symlink_bai.output.bai), - maf_filtered = str(rules._igv_filter_maf.output.maf) - output: - batch_script = 
temp(config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.batch") - params: - snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], - genome_build = lambda w: w.genome_build, - seq_type = lambda w: w.seq_type, - padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], - max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"], - igv_options = config["lcr-modules"]["igv"]["generate_batch_script"]["igv_options"], - image_format = config["lcr-modules"]["igv"]["generate_batch_script"]["image_format"] - wildcard_constraints: genome_build='[a-zA-Z0-9]+' - script: - config["lcr-modules"]["igv"]["scripts"]["batch_script"] +# Merge filtered MAF to create samples table that includes variant coordinates +#rule _igv_aggregate_mafs: +# input: +# expand(str(rules._igv_filter_maf.output.maf), +# zip, +# seq_type=CFG["runs"]["tumour_seq_type"], +# genome_build=CFG["runs"]["tumour_genome_build"], +# tumour_sample_id=CFG["runs"]["tumour_sample_id"], +# normal_sample_id=CFG["runs"]["normal_sample_id"], +# pair_status=CFG["runs"]["pair_status"]) +# output: +# maf = CFG["dirs"]["maf_filtered"] + "merged_filtered_maf.maf" +# run: +# merged_df = pd.DataFrame() +# for filename in input: +# maf = pd.read_table(filename, sep="\t") +# merged_df = pd.concat([merged_df, maf], ignore_index=True) +# merged_df.to_csv(output, sep="\t", index=False) + + +rule _igv_create_batch_script: + input: + bam_file = str(rules._igv_symlink_bam.output.bam), + bai_file = str(rules._igv_symlink_bai.output.bai), + maf_filtered = str(rules._igv_filter_maf.output.maf) + output: + batch_script = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.batch" + params: + snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], + genome_build = lambda w: w.genome_build, + seq_type = lambda w: w.seq_type, + padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], + max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"], + igv_options = config["lcr-modules"]["igv"]["generate_batch_script"]["igv_options"], + image_format = config["lcr-modules"]["igv"]["generate_batch_script"]["image_format"] + wildcard_constraints: genome_build='[a-zA-Z0-9]+' + script: + config["lcr-modules"]["igv"]["scripts"]["batch_script"] rule _igv_download_igv: output: @@ -260,50 +268,74 @@ rule _igv_download_igv: # Add suffix for running as pairs RUN_SUFFIX = ".pairs" if CFG["view_as_pairs"] is True else "" -rule _igv_run: +checkpoint _igv_run: input: - batch_script = str(rules._igv_create_batch_script.output.batch_script), + batch_script = expand(str(rules._igv_create_batch_script.output.batch_script), zip, normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], allow_missing=True), igv_installed = str(rules._igv_download_igv.output.igv_installed) output: - success = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}" + RUN_SUFFIX + ".finished" + snapshots = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}.finished" params: #igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" - log: - stdout = CFG["logs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.stdout.log", - 
stderr = CFG["logs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.stderr.log" + #log: + #stdout = CFG["logs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.stdout.log", + #stderr = CFG["logs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.stderr.log" shell: op.as_one_line(""" lines=$(wc -l < {input.batch_script}) ; if [ $lines > 1 ] ; then - xvfb-run --auto-servernum {params.igv} -b {input.batch_script} > {log.stdout} 2> {log.stderr} ; + xvfb-run --auto-servernum {params.igv} -b {input.batch_script} ; fi ; - touch {output.success} + touch {output.snapshots} """) +rule _igv_symlink_snapshots: + input: + snap = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{snap_path}--{tumour_sample_id}.png" + output: + snap = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{snap_path}--{tumour_sample_id}.png" + run: + op.relative_symlink(input.snap, output.snap) + +# Get genomic positions that will dictate filenames +#def aggregate_snapshots(wildcards): +# maf_table = pd.read_table(rules._igv_aggregate_mafs.output.maf) +# checkpoint_output = checkpoints._igv_run.get(**wildcards).output[0] +# return expand(config["lcr-modules"]["igv"]["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chrom}/{position}--{padding}--{hugo}--{tumour_sample_id}.png", +# zip, +# seq_type = wildcards.seq_type, +# genome_build = wildcards.genome_build, +# chrom, position, +# chrom=maf_table["chr_std"], position=maf_table["genomic_pos_std"], padding=config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], hugo=maf_table["Hugo_Symbol"], tumour_sample_id = wildcards.tumour_sample_id) + +def _evaluate_snapshots(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_output = checkpoints._igv_run.get(**wildcards).output.snapshots + parent_dir = CFG["dirs"]["snapshots"] + f"{wildcards.seq_type}--{wildcards.genome_build}" + chromosome_dirs = os.listdir(parent_dir) + sample_snapshots = [] + for chrom in chromosome_dirs: + snapshots = os.listdir(os.path.join(parent_dir,chrom)) + sample_snapshots.extend([os.path.join(chrom, s.replace(f"--{wildcards.tumour_sample_id}.png","")) for s in snapshots if s.endswith(f"{wildcards.tumour_sample_id}.png")]) + return expand(rules._igv_symlink_snapshots.output.snap, zip, seq_type = wildcards.seq_type, genome_build = wildcards.genome_build, tumour_sample_id = wildcards.tumour_sample_id, snap_path=sample_snapshots) + +rule _igv_dispatch: + input: + _evaluate_snapshots + output: + touch(CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.dispatched") + # Generates the target sentinels for each run, which generate the symlinks if CFG["test_run"] is False: rule _igv_all: input: - expand(str(rules._igv_run.output.success), - zip, - seq_type=CFG["runs"]["tumour_seq_type"], - tumour_sample_id=CFG["runs"]["tumour_sample_id"], - normal_sample_id=CFG["runs"]["normal_sample_id"], - pair_status=CFG["runs"]["pair_status"], - genome_build=CFG["runs"]["tumour_genome_build"]) + expand(str(rules._igv_dispatch.output), zip, seq_type=CFG["runs"]["tumour_seq_type"], tumour_sample_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], genome_build=CFG["runs"]["tumour_genome_build"]) if CFG["test_run"] is True: rule _igv_all: input: - expand(rules._igv_filter_maf.output.maf, - zip, - seq_type=CFG["runs"]["tumour_seq_type"], - 
tumour_sample_id=CFG["runs"]["tumour_sample_id"], - normal_sample_id=CFG["runs"]["normal_sample_id"], - pair_status=CFG["runs"]["pair_status"], - genome_build=CFG["runs"]["tumour_genome_build"]) + expand(rules._igv_filter_maf.output.maf, zip, seq_type=CFG["runs"]["tumour_seq_type"], tumour_sample_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], genome_build=CFG["runs"]["tumour_genome_build"]) ##### CLEANUP ##### From f0047d5d42b7b4fae10de3a95308f581975957db Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 8 Mar 2023 12:55:34 -0800 Subject: [PATCH 019/132] Workflow changed to run per sample-variant combo --- modules/igv/1.0/config/default.yaml | 1 + modules/igv/1.0/etc/filter_maf.py | 6 +- .../etc/generate_batch_script_per_variant.py | 154 ++++++++++++++++++ modules/igv/1.0/igv.smk | 123 ++++++-------- 4 files changed, 205 insertions(+), 79 deletions(-) create mode 100644 modules/igv/1.0/etc/generate_batch_script_per_variant.py diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 84a36ea8..e9cddb6f 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -44,6 +44,7 @@ lcr-modules: format_regions: "etc/format_regions.py" filter_script: "etc/filter_maf.py" region_liftover_script: "{MODSDIR}/etc/liftover_regions.sh" + batch_script_per_variant: "etc/generate_batch_script_per_variant.py" batch_script: "etc/generate_batch_scripts.py" scratch_subdirectories: [] diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 3b548054..2aac3ba2 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -11,10 +11,10 @@ def filter_by_bed(maf, regions): regions = regions[regions[0].str.contains("chrom")==False] # Create common columns between BED and MAF - regions["chr_std"] = regions.apply(lambda x: str(x[0]).replace("chr",""), axis=1) + regions["chr_std"] = regions.apply(lambda x: "chr" + str(x[0]).replace("chr",""), axis=1) regions["genomic_pos_std"] = regions["chr_std"] + ":" + regions[1].map(str) - maf["chr_std"] = maf.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) + maf["chr_std"] = maf.apply(lambda x: "chr" + str(x["Chromosome"]).replace("chr",""), axis=1) maf["genomic_pos_std"] = maf["chr_std"] + ":" + maf["Start_Position"].map(str) filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])] @@ -24,7 +24,7 @@ def filter_by_maf(maf, regions): # Create common column by which to subset MAF for df in [maf, regions]: - df["chr_std"] = df.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) + df["chr_std"] = df.apply(lambda x: "chr" + str(x["Chromosome"]).replace("chr",""), axis=1) df["genomic_pos_std"] = df["chr_std"] + ":" + df["Start_Position"].map(str) # Subset the MAF diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py new file mode 100644 index 00000000..b3466aba --- /dev/null +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 + +import os +import warnings +import numpy as np +import pandas as pd +import oncopipe as op + +def main(): + + input_maf = open(snakemake.input[0], "r") + input_bam = snakemake.input[1] + input_bai = snakemake.input[2] + + # Skip if no variants in outfile + line_count = 0 + for line in input_maf: + line_count += 1 + if line_count > 1: + break + if line_count < 2: + input_maf.close() + 
touch_output.close() + exit() + + # Return to top of MAF + input_maf.seek(0) + + # Read MAF file and create dataframe + regions = get_regions_df( + input_maf, + seq_type=snakemake.params[2], + padding=snakemake.params[4] + ) + + input_maf.close() + + # Create the batch scripts + generate_igv_batches( + regions = regions, + bam = input_bam, + bai = input_bai, + output_dir = snakemake.params[0], + snapshot_dir = snakemake.params[1], + genome_build = snakemake.params[2], + seq_type = snakemake.params[3], + igv_options = snakemake.params[5], + max_height = snakemake.params[6] + ) + + touch_output = open(snakemake.output[0], "w") + touch_output.close() + +def get_regions_df(input_maf, seq_type, padding): + # Read MAF as dataframe + maf = pd.read_table(input_maf, comment="#", sep="\t") + + chrom = (maf["Chromosome"].astype(str)).apply(lambda x: x.replace("chr","")) + + # Specify regions that will be captured by IGV based on variant positions + region_start = (maf["Start_Position"]).astype(str) + snapshot_start = (maf["Start_Position"] - padding).astype(str) + snapshot_end = (maf["End_Position"] + padding).astype(str) + snapshot_coordinates = "chr" + chrom + ":" + snapshot_start + "-" + snapshot_end + regions = "chr" + chrom + ":" + region_start + + regions_df = pd.DataFrame( + {"chromosome": "chr" + chrom, + "region": regions, + "region_name": maf.Hugo_Symbol, + "sample_id": maf.Tumor_Sample_Barcode, + "snapshot_coordinates": snapshot_coordinates, + "padding": padding + } + ) + + return regions_df + +def output_lines(lines, batch_output): + output = open(batch_output, "w") + lines.append("") + text = "\n".join(lines) + output.write(text) + output.close() + +def generate_igv_batch_per_row(coordinates, snapshot_filename, igv_options): + lines = [] + lines.append(f"goto {coordinates}") + lines.append("sort") + lines.append("collapse") + for option in igv_options: + lines.append(option) + lines.append(f"snapshot {snapshot_filename}") + + return lines + +def generate_igv_batch_header(bam, index, max_height, genome_build): + lines = [] + + genome_build = genome_build.replace("grch37","hg19") + + bam_file = os.path.realpath(bam) + bai_file = os.path.realpath(index) + lines.append(f"load {bam_file} index={bai_file}") + + lines.append(f"maxPanelHeight {max_height}") + lines.append(f"genome {genome_build}") + + return lines + +def generate_igv_batch_footer(): + lines = [] + lines.append("exit") + return lines + +def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, igv_options, max_height): + for _, row in regions.iterrows(): + all_lines = [] + + header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build) + all_lines.extend(header) + + dir_chrom = row.chromosome + seq_type_build = f"{seq_type}--{genome_build}" + + snapshot_regions_dir = os.path.join(snapshot_dir, seq_type_build, dir_chrom, "") + all_lines.append(f"snapshotDirectory {snapshot_regions_dir}") + + filename = [] + filename.append(row.region), + filename.append(str(row.padding)) + filename.append(row.region_name) + filename.append(row.sample_id) + + batch_filename = "--".join(filename) + ".batch" + filename = "--".join(filename) + ".png" + + lines = generate_igv_batch_per_row( + coordinates = row.snapshot_coordinates, + snapshot_filename = filename, + igv_options = igv_options + ) + + all_lines.extend(lines) + + footer = generate_igv_batch_footer() + all_lines.extend(footer) + + batch_file_path = os.path.join(output_dir, seq_type_build, batch_filename) + + 
output_lines(all_lines, batch_file_path) + +if __name__ == "__main__": + main() diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index da713e0e..d4c56d01 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -61,11 +61,11 @@ localrules: _igv_format_regions_file, _igv_liftover_regions, _igv_filter_maf, - _igv_create_batch_script, + _igv_create_batch_script_per_variant, _igv_download_igv, _igv_run, - _igv_symlink_snapshots, - _igv_dispatch + _igv_symlink_snapshot, + _igv_check_outputs, ##### FUNCTIONS ##### @@ -210,44 +210,24 @@ elif CFG["test_run"] == True: script: config["lcr-modules"]["igv"]["scripts"]["filter_script"] -# Merge filtered MAF to create samples table that includes variant coordinates -#rule _igv_aggregate_mafs: -# input: -# expand(str(rules._igv_filter_maf.output.maf), -# zip, -# seq_type=CFG["runs"]["tumour_seq_type"], -# genome_build=CFG["runs"]["tumour_genome_build"], -# tumour_sample_id=CFG["runs"]["tumour_sample_id"], -# normal_sample_id=CFG["runs"]["normal_sample_id"], -# pair_status=CFG["runs"]["pair_status"]) -# output: -# maf = CFG["dirs"]["maf_filtered"] + "merged_filtered_maf.maf" -# run: -# merged_df = pd.DataFrame() -# for filename in input: -# maf = pd.read_table(filename, sep="\t") -# merged_df = pd.concat([merged_df, maf], ignore_index=True) -# merged_df.to_csv(output, sep="\t", index=False) - - -rule _igv_create_batch_script: +# Create multiple batch scripts per sample for each variant +checkpoint _igv_create_batch_script_per_variant: input: + filter_maf = expand(str(rules._igv_filter_maf.output.maf), zip, normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], allow_missing=True), bam_file = str(rules._igv_symlink_bam.output.bam), - bai_file = str(rules._igv_symlink_bai.output.bai), - maf_filtered = str(rules._igv_filter_maf.output.maf) + bai_file = str(rules._igv_symlink_bai.output.bai) output: - batch_script = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.batch" + sample_batch = CFG["dirs"]["batch_scripts"] + "completed/{seq_type}--{genome_build}/{tumour_sample_id}.complete" params: + batch_dir = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], genome_build = lambda w: w.genome_build, seq_type = lambda w: w.seq_type, padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], - max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"], igv_options = config["lcr-modules"]["igv"]["generate_batch_script"]["igv_options"], - image_format = config["lcr-modules"]["igv"]["generate_batch_script"]["image_format"] - wildcard_constraints: genome_build='[a-zA-Z0-9]+' + max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"] script: - config["lcr-modules"]["igv"]["scripts"]["batch_script"] + config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] rule _igv_download_igv: output: @@ -265,72 +245,63 @@ rule _igv_download_igv: touch {output.igv_installed} """) -# Add suffix for running as pairs -RUN_SUFFIX = ".pairs" if CFG["view_as_pairs"] is True else "" - -checkpoint _igv_run: +rule _igv_run: input: - batch_script = expand(str(rules._igv_create_batch_script.output.batch_script), zip, normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], allow_missing=True), - igv_installed = str(rules._igv_download_igv.output.igv_installed) + igv = 
str(rules._igv_download_igv.output.igv_installed), + batch_script = CFG["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batch" output: - snapshots = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}.finished" + #snapshot_completed = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.snapshot_completed", + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" params: - #igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" - #log: - #stdout = CFG["logs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.stdout.log", - #stderr = CFG["logs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.stderr.log" + resources: + runtime = "30s" shell: op.as_one_line(""" - lines=$(wc -l < {input.batch_script}) ; - if [ $lines > 1 ] ; - then - xvfb-run --auto-servernum {params.igv} -b {input.batch_script} ; - fi ; - touch {output.snapshots} + xvfb-run --auto-servernum {params.igv} -b {input.batch_script} """) -rule _igv_symlink_snapshots: +rule _igv_symlink_snapshot: input: - snap = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{snap_path}--{tumour_sample_id}.png" + snapshot = str(rules._igv_run.output.snapshot) output: - snap = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{snap_path}--{tumour_sample_id}.png" + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" run: - op.relative_symlink(input.snap, output.snap) - -# Get genomic positions that will dictate filenames -#def aggregate_snapshots(wildcards): -# maf_table = pd.read_table(rules._igv_aggregate_mafs.output.maf) -# checkpoint_output = checkpoints._igv_run.get(**wildcards).output[0] -# return expand(config["lcr-modules"]["igv"]["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chrom}/{position}--{padding}--{hugo}--{tumour_sample_id}.png", -# zip, -# seq_type = wildcards.seq_type, -# genome_build = wildcards.genome_build, -# chrom, position, -# chrom=maf_table["chr_std"], position=maf_table["genomic_pos_std"], padding=config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], hugo=maf_table["Hugo_Symbol"], tumour_sample_id = wildcards.tumour_sample_id) + op.relative_symlink(input.snapshot, output.snapshot) def _evaluate_snapshots(wildcards): CFG = config["lcr-modules"]["igv"] - checkpoint_output = checkpoints._igv_run.get(**wildcards).output.snapshots - parent_dir = CFG["dirs"]["snapshots"] + f"{wildcards.seq_type}--{wildcards.genome_build}" - chromosome_dirs = os.listdir(parent_dir) - sample_snapshots = [] - for chrom in chromosome_dirs: - snapshots = os.listdir(os.path.join(parent_dir,chrom)) - sample_snapshots.extend([os.path.join(chrom, s.replace(f"--{wildcards.tumour_sample_id}.png","")) for s in snapshots if s.endswith(f"{wildcards.tumour_sample_id}.png")]) - return expand(rules._igv_symlink_snapshots.output.snap, zip, seq_type = wildcards.seq_type, genome_build = wildcards.genome_build, tumour_sample_id = wildcards.tumour_sample_id, snap_path=sample_snapshots) - -rule _igv_dispatch: + checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.sample_batch + maf = 
expand(rules._igv_filter_maf.output.maf, zip, seq_type=wildcards.seq_type, genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_sample_id, normal_sample_id=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["normal_sample_id"], pair_status=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["pair_status"]) + + maf_table = pd.read_table(maf[0], comment="#", sep="\t") + + return expand( + expand( + str(rules._igv_symlink_snapshot.output.snapshot), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_sample_id = maf_table["Tumor_Sample_Barcode"], + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"], + allow_missing=True + ), + padding=str(CFG["generate_batch_script"]["padding"]) + ) + +rule _igv_check_outputs: input: - _evaluate_snapshots + snapshots = _evaluate_snapshots output: - touch(CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.dispatched") + touch(CFG["dirs"]["outputs"] + "completed/.{tumour_sample_id}--{seq_type}--{genome_build}.complete") # Generates the target sentinels for each run, which generate the symlinks if CFG["test_run"] is False: rule _igv_all: input: - expand(str(rules._igv_dispatch.output), zip, seq_type=CFG["runs"]["tumour_seq_type"], tumour_sample_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], genome_build=CFG["runs"]["tumour_genome_build"]) + expand(str(rules._igv_check_outputs.output), zip, tumour_sample_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) if CFG["test_run"] is True: rule _igv_all: From a9a27fe23ef65812720c23459d52276dd4b25516 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 9 Mar 2023 11:09:55 -0800 Subject: [PATCH 020/132] Merge variant batch scripts to prevent IGV crash --- .../etc/generate_batch_script_per_variant.py | 12 +- modules/igv/1.0/igv.smk | 157 +++++++++++++----- 2 files changed, 120 insertions(+), 49 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index b3466aba..d36b2e0c 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -108,10 +108,10 @@ def generate_igv_batch_header(bam, index, max_height, genome_build): return lines -def generate_igv_batch_footer(): - lines = [] - lines.append("exit") - return lines +#def generate_igv_batch_footer(): +# lines = [] +# lines.append("exit") +# return lines def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, igv_options, max_height): for _, row in regions.iterrows(): @@ -143,8 +143,8 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui all_lines.extend(lines) - footer = generate_igv_batch_footer() - all_lines.extend(footer) + #footer = generate_igv_batch_footer() + #all_lines.extend(footer) batch_file_path = os.path.join(output_dir, seq_type_build, batch_filename) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 
d4c56d01..bd6f8533 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -17,8 +17,8 @@ import pandas as pd # Needed for getting snapshot paths import os -# Needed for creating table of snapshot dirs from MAF -import math +# Needed for copying contents of individual variant batch scripts to a merged sample batch_script +import shutil # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe min_oncopipe_version="1.0.11" @@ -83,6 +83,10 @@ def get_maf(wildcards): unix_group = config["unix_group"] return expand(config["lcr-modules"]["igv"]["inputs"]["maf"], allow_missing=True, unix_group=unix_group) +BATCH_SCRIPT_FILE = CFG["dirs"]["batch_scripts"] + "batch_script.batch" +# Open file as 'w' to overwrite contents +BATCH_SCRIPT = open(BATCH_SCRIPT_FILE, "w") +BATCH_SCRIPT.close() ##### RULES ##### @@ -217,7 +221,7 @@ checkpoint _igv_create_batch_script_per_variant: bam_file = str(rules._igv_symlink_bam.output.bam), bai_file = str(rules._igv_symlink_bai.output.bai) output: - sample_batch = CFG["dirs"]["batch_scripts"] + "completed/{seq_type}--{genome_build}/{tumour_sample_id}.complete" + sample_batch = CFG["dirs"]["batch_scripts"] + "merged_batches/{seq_type}--{genome_build}/{tumour_sample_id}.batch" params: batch_dir = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], @@ -229,47 +233,28 @@ checkpoint _igv_create_batch_script_per_variant: script: config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] -rule _igv_download_igv: - output: - igv_zip = CFG["dirs"]["igv"] + "IGV_2.7.2.zip", - igv_installed = CFG["dirs"]["igv"] + "igv_2.7.2.installed" - conda: - CFG["conda_envs"]["wget"] - log: - stdout = CFG["logs"]["igv"] + "download_igv.stdout.log", - stderr = CFG["logs"]["igv"] + "download_igv.stderr.log" - shell: - op.as_one_line(""" - wget -O {output.igv_zip} https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip && - unzip {output.igv_zip} -d $(dirname {output.igv_zip}) > {log.stdout} 2> {log.stderr} && - touch {output.igv_installed} - """) - -rule _igv_run: +rule _igv_batches_to_merge: input: - igv = str(rules._igv_download_igv.output.igv_installed), batch_script = CFG["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batch" output: - #snapshot_completed = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.snapshot_completed", - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" + batched = CFG["dirs"]["batch_scripts"] + "batched/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batched" params: - igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" - resources: - runtime = "30s" - shell: - op.as_one_line(""" - xvfb-run --auto-servernum {params.igv} -b {input.batch_script} - """) - -rule _igv_symlink_snapshot: - input: - snapshot = str(rules._igv_run.output.snapshot) - output: - snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" + batch_script_file = str(rules._igv_create_batch_script_per_variant.output.sample_batch) run: - op.relative_symlink(input.snapshot, output.snapshot) + batch_script_path 
= os.path.abspath(input.batch_script) + output_file = os.path.abspath(params.batch_script_file) + + batch_script = open(batch_script_path, "r") + + with open(output_file, "a") as handle: + for line in batch_script: + handle.write(line) + batch_script.close() + + output_touch = open(output.batched, "w") + output_touch.close() -def _evaluate_snapshots(wildcards): +def _evaluate_batches(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.sample_batch maf = expand(rules._igv_filter_maf.output.maf, zip, seq_type=wildcards.seq_type, genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_sample_id, normal_sample_id=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["normal_sample_id"], pair_status=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["pair_status"]) @@ -278,7 +263,7 @@ def _evaluate_snapshots(wildcards): return expand( expand( - str(rules._igv_symlink_snapshot.output.snapshot), + str(rules._igv_batches_to_merge.output.batched), zip, chromosome = maf_table["chr_std"], start_position = maf_table["Start_Position"], @@ -291,17 +276,103 @@ def _evaluate_snapshots(wildcards): padding=str(CFG["generate_batch_script"]["padding"]) ) -rule _igv_check_outputs: +rule _igv_download_igv: + output: + igv_zip = CFG["dirs"]["igv"] + "IGV_2.7.2.zip", + igv_installed = CFG["dirs"]["igv"] + "igv_2.7.2.installed" + conda: + CFG["conda_envs"]["wget"] + log: + stdout = CFG["logs"]["igv"] + "download_igv.stdout.log", + stderr = CFG["logs"]["igv"] + "download_igv.stderr.log" + shell: + op.as_one_line(""" + wget -O {output.igv_zip} https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip && + unzip {output.igv_zip} -d $(dirname {output.igv_zip}) > {log.stdout} 2> {log.stderr} && + touch {output.igv_installed} + """) + +#TODO: check if i should add line to only run batch scripts with more than one line + +rule _igv_run: input: - snapshots = _evaluate_snapshots + igv = str(rules._igv_download_igv.output.igv_installed), + batch_script = _evaluate_batches output: - touch(CFG["dirs"]["outputs"] + "completed/.{tumour_sample_id}--{seq_type}--{genome_build}.complete") + complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}/{tumour_sample_id}.completed" + params: + merged_batch = str(rules._igv_create_batch_script_per_variant.output.sample_batch), + igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" + shell: + op.as_one_line(""" + echo 'exit' >> {params.merged_batch} ; + xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} ; + touch {output.complete} + """) + + +#rule _igv_run: +# input: +# igv = str(rules._igv_download_igv.output.igv_installed), +# batch_script = CFG["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batch" +# output: +# #snapshot_completed = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.snapshot_completed", +# snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" +# params: +# igv = 
"/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" +# resources: +# runtime = "30s" +# shell: +# op.as_one_line(""" +# xvfb-run --auto-servernum {params.igv} -b {input.batch_script} +# """) + +#rule _igv_symlink_snapshot: +# input: +# snapshot = str(rules._igv_run.output.snapshot) +# output: +# snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" +# run: +# op.relative_symlink(input.snapshot, output.snapshot) + +#def _evaluate_snapshots(wildcards): +# CFG = config["lcr-modules"]["igv"] +# checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.sample_batch +# maf = expand(rules._igv_filter_maf.output.maf, zip, seq_type=wildcards.seq_type, genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_sample_id, normal_sample_id=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["normal_sample_id"], pair_status=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["pair_status"]) +# +# maf_table = pd.read_table(maf[0], comment="#", sep="\t") +# +# return expand( +# expand( +# str(rules._igv_symlink_snapshot.output.snapshot), +# zip, +# chromosome = maf_table["chr_std"], +# start_position = maf_table["Start_Position"], +# gene = maf_table["Hugo_Symbol"], +# tumour_sample_id = maf_table["Tumor_Sample_Barcode"], +# seq_type = maf_table["seq_type"], +# genome_build = maf_table["genome_build"], +# allow_missing=True +# ), +# padding=str(CFG["generate_batch_script"]["padding"]) +# ) + +#rule _igv_check_outputs: +# input: +# snapshots = _evaluate_snapshots +# output: +# touch(CFG["dirs"]["outputs"] + "completed/.{tumour_sample_id}--{seq_type}--{genome_build}.complete") # Generates the target sentinels for each run, which generate the symlinks if CFG["test_run"] is False: rule _igv_all: input: - expand(str(rules._igv_check_outputs.output), zip, tumour_sample_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + expand(str(rules._igv_run.output.complete), zip, tumour_sample_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + +#if CFG["test_run"] is False: +# rule _igv_all: +# input: +# expand(str(rules._igv_check_outputs.output), zip, tumour_sample_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) if CFG["test_run"] is True: rule _igv_all: From 28c54f1b6cd35b7beef7d400fcb1b63e55917fdf Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 13 Mar 2023 13:38:09 -0700 Subject: [PATCH 021/132] Remove conda environment in filter_maf rule --- modules/igv/1.0/etc/format_regions.py | 54 +-------------------------- modules/igv/1.0/igv.smk | 2 - 2 files changed, 1 insertion(+), 55 deletions(-) diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index d1d68bee..e7a46404 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -3,7 +3,6 @@ import os import pandas as pd import oncopipe as op -import vcf import shutil def format_mutation_id(mutation_id): @@ -88,63 +87,12 @@ def 
format_clustl(clustl_regions): ) return clustl_reformatted -def format_vcf(regions): - # Load VCF file - vcf_reader = vcf.Reader(open(regions, "rb")) - - # Convert VCF records to BED format - chroms = [] - pos = [] - events_seen = set() - - for record in vcf_reader: - if len(record.FILTER) > 0: - continue - - # Skip SVs with ID matching previous record - if record.ID in events_seen: - continue - - chromosome = "chr" + str(record.CHROM).replace("chr","") - position = record.POS - - chroms.append(chromosome) - pos.append(position) - - if record.is_sv and "END" in record.INFO: - # Add end position of SV to regions of interest - end = record.INFO["END"][0] - - chroms.append(chromosome) - pos.append(end) - - if record.is_sv and record.INFO["SVTYPE"] == "BND": - # Add end position of SV to regions of interest - chromosome = "chr" + str(record.ALT[0].chr).replace("chr","") - position = record.ALT[0].pos - - chroms.append(chromosome) - pos.append(position) - - # To skip mate event in VCF file - events_seen.add(record.INFO["MATEID"]) - - vcf_reformatted = pd.DataFrame( - { - "chrom": chroms, - "start": pos, - "end": pos - } - ) - - return vcf_reformatted def format_regions(regions, regions_format): format_functions = { "oncodriveclustl": format_clustl, "hotmaps": format_hotmaps, - "mutation_id": format_mutation_id, - "vcf": format_vcf, + "mutation_id": format_mutation_id } return format_functions[regions_format](regions) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index bd6f8533..ef746a47 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -145,8 +145,6 @@ rule _igv_format_regions_file: regions_format = config["lcr-modules"]["igv"]["inputs"]["regions_format"], oncodriveclustl_params = config["lcr-modules"]["igv"]["filter_maf"]["oncodriveclustl_options"], regions_build = config["lcr-modules"]["igv"]["inputs"]["regions_build"] - conda: - CFG["conda_envs"]["format_regions"] script: config["lcr-modules"]["igv"]["scripts"]["format_regions"] From 4318b8d87e64aa1c11d87bbefa3f1496946f6029 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 13 Mar 2023 13:42:59 -0700 Subject: [PATCH 022/132] Add symlinked snapshots to workflow targets --- modules/igv/1.0/igv.smk | 111 +++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 31 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index ef746a47..f4445020 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -62,10 +62,11 @@ localrules: _igv_liftover_regions, _igv_filter_maf, _igv_create_batch_script_per_variant, + _igv_batches_to_merge, _igv_download_igv, _igv_run, _igv_symlink_snapshot, - _igv_check_outputs, + _igv_check_snapshots, ##### FUNCTIONS ##### @@ -83,10 +84,6 @@ def get_maf(wildcards): unix_group = config["unix_group"] return expand(config["lcr-modules"]["igv"]["inputs"]["maf"], allow_missing=True, unix_group=unix_group) -BATCH_SCRIPT_FILE = CFG["dirs"]["batch_scripts"] + "batch_script.batch" -# Open file as 'w' to overwrite contents -BATCH_SCRIPT = open(BATCH_SCRIPT_FILE, "w") -BATCH_SCRIPT.close() ##### RULES ##### @@ -219,7 +216,7 @@ checkpoint _igv_create_batch_script_per_variant: bam_file = str(rules._igv_symlink_bam.output.bam), bai_file = str(rules._igv_symlink_bai.output.bai) output: - sample_batch = CFG["dirs"]["batch_scripts"] + "merged_batches/{seq_type}--{genome_build}/{tumour_sample_id}.batch" + variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_sample_id}.batch" params: batch_dir = 
config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], @@ -233,11 +230,11 @@ checkpoint _igv_create_batch_script_per_variant: rule _igv_batches_to_merge: input: - batch_script = CFG["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batch" + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batch" output: - batched = CFG["dirs"]["batch_scripts"] + "batched/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batched" + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batched" params: - batch_script_file = str(rules._igv_create_batch_script_per_variant.output.sample_batch) + batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch) run: batch_script_path = os.path.abspath(input.batch_script) output_file = os.path.abspath(params.batch_script_file) @@ -249,30 +246,33 @@ rule _igv_batches_to_merge: handle.write(line) batch_script.close() - output_touch = open(output.batched, "w") + output_touch = open(output.dispatched_batch_script, "w") output_touch.close() def _evaluate_batches(wildcards): CFG = config["lcr-modules"]["igv"] - checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.sample_batch + checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch maf = expand(rules._igv_filter_maf.output.maf, zip, seq_type=wildcards.seq_type, genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_sample_id, normal_sample_id=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["normal_sample_id"], pair_status=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["pair_status"]) - maf_table = pd.read_table(maf[0], comment="#", sep="\t") - - return expand( - expand( - str(rules._igv_batches_to_merge.output.batched), - zip, - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_sample_id = maf_table["Tumor_Sample_Barcode"], - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"], - allow_missing=True - ), - padding=str(CFG["generate_batch_script"]["padding"]) - ) + if os.path.exists(maf[0]): + maf_table = pd.read_table(maf[0], comment="#", sep="\t") + + return expand( + expand( + str(rules._igv_batches_to_merge.output.dispatched_batch_script), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_sample_id = maf_table["Tumor_Sample_Barcode"], + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"], + allow_missing=True + ), + padding=str(CFG["generate_batch_script"]["padding"]) + ) + else: + return [] rule _igv_download_igv: output: @@ -292,14 +292,14 @@ rule _igv_download_igv: #TODO: check if i should add line to only run batch scripts with more than one line -rule _igv_run: 
+checkpoint _igv_run: input: igv = str(rules._igv_download_igv.output.igv_installed), batch_script = _evaluate_batches output: - complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}/{tumour_sample_id}.completed" + complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_sample_id}.completed" params: - merged_batch = str(rules._igv_create_batch_script_per_variant.output.sample_batch), + merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" shell: op.as_one_line(""" @@ -308,6 +308,55 @@ rule _igv_run: touch {output.complete} """) +rule _igv_symlink_snapshot: + input: + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" + output: + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" + run: + op.relative_symlink(input.snapshot, output.snapshot) + +def _symlink_snapshot(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + maf = expand( + str(rules._igv_filter_maf.output.maf), + zip, seq_type=wildcards.seq_type, + genome_build=wildcards.genome_build, + tumour_sample_id=wildcards.tumour_sample_id, + normal_sample_id=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["normal_sample_id"], + pair_status=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["pair_status"] + ) + + if os.path.exists(maf[0]): + maf_table = pd.read_table(maf[0], comment="#", sep="\t") + + return expand( + expand( + str(rules._igv_symlink_snapshot.output.snapshot), + zip, + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"], + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_sample_id = maf_table["Tumor_Sample_Barcode"], + allow_missing=True + ), + padding = str(CFG["generate_batch_script"]["padding"]) + ) + else: + return [] + +rule _igv_check_snapshots: + input: + snapshots = _symlink_snapshot, + igv_completed = str(rules._igv_run.output.complete) + output: + snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_sample_id}.completed" + shell: + "touch {output.snapshots}" + #rule _igv_run: # input: @@ -365,7 +414,7 @@ rule _igv_run: if CFG["test_run"] is False: rule _igv_all: input: - expand(str(rules._igv_run.output.complete), zip, tumour_sample_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_sample_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) #if CFG["test_run"] is False: # rule _igv_all: From 7805531d16523e64a8f4f8ccdbd92959aca21858 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 13 Mar 2023 13:44:02 -0700 Subject: [PATCH 023/132] Clean up comment lines --- modules/igv/1.0/igv.smk | 55 
----------------------------------------- 1 file changed, 55 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index f4445020..6753740b 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -290,8 +290,6 @@ rule _igv_download_igv: touch {output.igv_installed} """) -#TODO: check if i should add line to only run batch scripts with more than one line - checkpoint _igv_run: input: igv = str(rules._igv_download_igv.output.igv_installed), @@ -357,59 +355,6 @@ rule _igv_check_snapshots: shell: "touch {output.snapshots}" - -#rule _igv_run: -# input: -# igv = str(rules._igv_download_igv.output.igv_installed), -# batch_script = CFG["dirs"]["batch_scripts"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batch" -# output: -# #snapshot_completed = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.snapshot_completed", -# snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" -# params: -# igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" -# resources: -# runtime = "30s" -# shell: -# op.as_one_line(""" -# xvfb-run --auto-servernum {params.igv} -b {input.batch_script} -# """) - -#rule _igv_symlink_snapshot: -# input: -# snapshot = str(rules._igv_run.output.snapshot) -# output: -# snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" -# run: -# op.relative_symlink(input.snapshot, output.snapshot) - -#def _evaluate_snapshots(wildcards): -# CFG = config["lcr-modules"]["igv"] -# checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.sample_batch -# maf = expand(rules._igv_filter_maf.output.maf, zip, seq_type=wildcards.seq_type, genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_sample_id, normal_sample_id=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["normal_sample_id"], pair_status=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["pair_status"]) -# -# maf_table = pd.read_table(maf[0], comment="#", sep="\t") -# -# return expand( -# expand( -# str(rules._igv_symlink_snapshot.output.snapshot), -# zip, -# chromosome = maf_table["chr_std"], -# start_position = maf_table["Start_Position"], -# gene = maf_table["Hugo_Symbol"], -# tumour_sample_id = maf_table["Tumor_Sample_Barcode"], -# seq_type = maf_table["seq_type"], -# genome_build = maf_table["genome_build"], -# allow_missing=True -# ), -# padding=str(CFG["generate_batch_script"]["padding"]) -# ) - -#rule _igv_check_outputs: -# input: -# snapshots = _evaluate_snapshots -# output: -# touch(CFG["dirs"]["outputs"] + "completed/.{tumour_sample_id}--{seq_type}--{genome_build}.complete") - # Generates the target sentinels for each run, which generate the symlinks if CFG["test_run"] is False: rule _igv_all: From 0f4c924bc7c5e3ded98fc5a4ea1b5e7e1695ebb3 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 13 Mar 2023 13:46:18 -0700 Subject: [PATCH 024/132] Remove "exit" line from position batch scripts --- 
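The per-variant fragments no longer end in exit because they are concatenated into one merged script per sample (_igv_batches_to_merge) and a single exit is echoed onto the merged file just before IGV runs; an exit left inside the first fragment would make IGV quit before the remaining snapshots are taken. A condensed Python sketch of the merge step, mirroring the run block of _igv_batches_to_merge with hypothetical file names:

    def append_fragment(fragment: str, merged: str, sentinel: str) -> None:
        # Append one per-variant batch fragment onto the per-sample merged script
        with open(fragment) as src, open(merged, "a") as dest:
            for line in src:
                dest.write(line)
        # Touch a sentinel file so Snakemake can track that this fragment was dispatched
        open(sentinel, "w").close()

    append_fragment(
        "single_batch_scripts/genome--grch37/chr3:187442856--300--BCL6--TUMOUR-01.batch",
        "merged_batch_scripts/genome--grch37/TUMOUR-01.batch",
        "dispatched_batch_scripts/genome--grch37/chr3:187442856--300--BCL6--TUMOUR-01.batched",
    )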
.../igv/1.0/etc/generate_batch_script_per_variant.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index d36b2e0c..85984241 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -108,11 +108,6 @@ def generate_igv_batch_header(bam, index, max_height, genome_build): return lines -#def generate_igv_batch_footer(): -# lines = [] -# lines.append("exit") -# return lines - def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, igv_options, max_height): for _, row in regions.iterrows(): all_lines = [] @@ -143,10 +138,11 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui all_lines.extend(lines) - #footer = generate_igv_batch_footer() - #all_lines.extend(footer) + for subdir in [os.path.join(output_dir, "single_batch_scripts"), os.path.join(output_dir, "single_batch_scripts", seq_type_build)]: + if not os.path.exists(subdir): + os.mkdir(subdir) - batch_file_path = os.path.join(output_dir, seq_type_build, batch_filename) + batch_file_path = os.path.join(output_dir, "single_batch_scripts", seq_type_build, batch_filename) output_lines(all_lines, batch_file_path) From 383ff17a67fb37a57a3b465e1800e60b312dda05 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 13 Mar 2023 15:21:48 -0700 Subject: [PATCH 025/132] Fix variable referenced before assignment error --- modules/igv/1.0/etc/generate_batch_script_per_variant.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 85984241..fc338d8f 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -20,6 +20,7 @@ def main(): break if line_count < 2: input_maf.close() + touch_output = open(snakemake.output[0],"w") touch_output.close() exit() From 13d6992f2d7f1d2e37e55d08fa2183bc6c4d8ea4 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 14 Mar 2023 14:17:10 -0700 Subject: [PATCH 026/132] Add log outputs to igv run --- modules/igv/1.0/igv.smk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 6753740b..c4ec9ed0 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -296,9 +296,12 @@ checkpoint _igv_run: batch_script = _evaluate_batches output: complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_sample_id}.completed" + log: + stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_sample_id}_igv_run.stdout", + stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_sample_id}_igv_run.stderr" params: merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), - igv = "/projects/rmorin/projects/RNA_seq_ssm/test/bin/IGV_Linux_2.7.2/igv.sh" + igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" shell: op.as_one_line(""" echo 'exit' >> {params.merged_batch} ; From c9a8abc5a1743c66b1dd704e1f83a97d015c7a3c Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 14 Mar 2023 14:20:47 -0700 Subject: [PATCH 027/132] Add dependency to regions files to checkpoint rule --- modules/igv/1.0/igv.smk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 
c4ec9ed0..b8b19e95 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -214,7 +214,9 @@ checkpoint _igv_create_batch_script_per_variant: input: filter_maf = expand(str(rules._igv_filter_maf.output.maf), zip, normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], allow_missing=True), bam_file = str(rules._igv_symlink_bam.output.bam), - bai_file = str(rules._igv_symlink_bai.output.bai) + bai_file = str(rules._igv_symlink_bai.output.bai), + regions_lifted = str(rules._igv_liftover_regions.output.regions), + regions_formatted = str(rules._igv_format_regions_file.output.regions) output: variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_sample_id}.batch" params: From 6bb0861e1d4ff1f0291e1e00f6b06c5cd6a2fde3 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 14 Mar 2023 14:21:51 -0700 Subject: [PATCH 028/132] Set thread limits on batch creation and IGV run --- modules/igv/1.0/igv.smk | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index b8b19e95..1273e131 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -137,11 +137,14 @@ rule _igv_format_regions_file: input: regions = str(rules._igv_symlink_regions_file.output.regions_file) output: - regions = config["lcr-modules"]["igv"]["dirs"]["inputs"] + "regions/regions_file_formatted.txt" + regions = CFG["dirs"]["inputs"] + "regions/regions_file_formatted.txt" params: - regions_format = config["lcr-modules"]["igv"]["inputs"]["regions_format"], - oncodriveclustl_params = config["lcr-modules"]["igv"]["filter_maf"]["oncodriveclustl_options"], - regions_build = config["lcr-modules"]["igv"]["inputs"]["regions_build"] + regions_format = CFG["inputs"]["regions_format"], + oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], + regions_build = CFG["inputs"]["regions_build"] + log: + stdout = CFG["logs"]["inputs"] + "format_regions.stdout.log", + stderr = CFG["logs"]["inputs"] + "format_regions.stderr.log" script: config["lcr-modules"]["igv"]["scripts"]["format_regions"] @@ -237,6 +240,7 @@ rule _igv_batches_to_merge: dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batched" params: batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch) + threads: (workflow.cores / 10) run: batch_script_path = os.path.abspath(input.batch_script) output_file = os.path.abspath(params.batch_script_file) @@ -304,10 +308,11 @@ checkpoint _igv_run: params: merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" + threads: (workflow.cores / 5) shell: op.as_one_line(""" echo 'exit' >> {params.merged_batch} ; - xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} ; + xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} ; touch {output.complete} """) From 351019f2ac886ccedecd013aa4a369ff1b4835dc Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 14 Mar 2023 14:22:21 -0700 Subject: [PATCH 029/132] Clean up format_regions script --- modules/igv/1.0/etc/format_regions.py | 51 +++++++++++++++------------ 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index e7a46404..770f5ffa 100755 
--- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -5,6 +5,33 @@ import oncopipe as op import shutil +def main(): + + regions_file = snakemake.input[0] + regions_format = snakemake.params[0] + + output_file = snakemake.output[0] + + if regions_format == "oncodriveclustl": + global CLUSTL_PARAMS + CLUSTL_PARAMS = snakemake.params[1] + + if regions_format == "mutation_id": + global REGIONS_BUILD + REGIONS_BUILD = snakemake.params[2] + REGIONS_BUILD = REGIONS_BUILD.lower() + + if regions_format == "bed" or regions_format == "maf": + # Do not need to reformat for liftover + shutil.copy(regions_file, output_file) + exit() + + # Reformat for liftover based on regions format + regions_formatted = format_regions(regions_file, regions_format) + + # Output regions file + regions_formatted.to_csv(output_file, sep="\t", index=False) + def format_mutation_id(mutation_id): # Read regions into dataframe mutation_id = pd.read_table(mutation_id, comment="#", sep="\t") @@ -97,25 +124,5 @@ def format_regions(regions, regions_format): return format_functions[regions_format](regions) -regions_file = snakemake.input[0] -regions_format = snakemake.params[0] - -output_file = snakemake.output[0] - -if regions_format == "oncodriveclustl": - CLUSTL_PARAMS = snakemake.params[1] - -if regions_format == "mutation_id": - REGIONS_BUILD = snakemake.params[2] - REGIONS_BUILD = REGIONS_BUILD.lower() - -if regions_format == "bed" or regions_format == "maf": - # Do not need to reformat for liftover - shutil.copy(regions_file, output_file) - exit() - -# Reformat for liftover based on regions format -regions_formatted = format_regions(regions_file, regions_format) - -# Output regions file -regions_formatted.to_csv(output_file, sep="\t", index=False) +if __name__ == "__main__": + main() \ No newline at end of file From 97641019edf807821c31ffcfeb09a166d4d28460 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 15 Mar 2023 09:37:39 -0700 Subject: [PATCH 030/132] Clean up subdirectories --- modules/igv/1.0/igv.smk | 53 +++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 1273e131..80782417 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -43,7 +43,7 @@ if version.parse(current_version) < version.parse(min_oncopipe_version): CFG = op.setup_module( name = "igv", version = "1.0", - subdirectories = ["inputs", "maf_filtered", "regions_lifted", "batch_scripts", "igv", "snapshots", "outputs"], + subdirectories = ["inputs", "batch_scripts", "igv", "snapshots", "outputs"], ) # Rename genome builds in metadata to match up with MAFs? 
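The consolidated _igv_filter_maf rule below subsets each sample MAF to the lifted-over regions by matching a standardized chromosome:position key on both sides (see filter_maf.py earlier in the series). A condensed pandas sketch of that matching, assuming both tables expose Chromosome and Start_Position columns:

    import pandas as pd

    def filter_maf_by_regions(maf: pd.DataFrame, regions: pd.DataFrame) -> pd.DataFrame:
        # Standardize chromosome naming so that '3' and 'chr3' both become 'chr3',
        # then build a chrom:start key on both tables
        for df in (maf, regions):
            df["chr_std"] = "chr" + df["Chromosome"].astype(str).str.replace("chr", "", regex=False)
            df["genomic_pos_std"] = df["chr_std"] + ":" + df["Start_Position"].astype(str)
        # Keep only MAF rows whose key falls within the regions of interest
        return maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])]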
@@ -160,7 +160,7 @@ rule _igv_liftover_regions: regions = str(rules._igv_format_regions_file.output.regions), liftover_script = CFG["scripts"]["region_liftover_script"] output: - regions = CFG["dirs"]["regions_lifted"] + "regions_file_{genome_build}.txt" + regions = CFG["dirs"]["inputs"] + "regions/regions_file_{genome_build}.crossmap.txt" params: chain_file = reference_files(CFG["liftover_regions"]["reference_chain_file"][(CFG["inputs"]["regions_build"]).replace("hg19","grch37").replace("grch38","hg38")]), target_reference = lambda w: config["lcr-modules"]["igv"]["liftover_regions"]["target_reference"][w.genome_build], @@ -180,42 +180,27 @@ rule _igv_liftover_regions: {params.target_reference} > {log.stdout} 2> {log.stderr} """) -if CFG["test_run"] == False: - # Pass metadata as a pandas dataframe directly from the samples value specified in config - rule _igv_filter_maf: - input: - maf = str(rules._igv_reduce_maf_cols.output.maf), - regions = str(rules._igv_liftover_regions.output.regions) - output: - maf = CFG["dirs"]["maf_filtered"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf" - params: - regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], - oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], - n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000, - metadata = CFG["runs"] - script: - config["lcr-modules"]["igv"]["scripts"]["filter_script"] - -elif CFG["test_run"] == True: - # Pass metadata as a pandas dataframe directly from the samples value specified in config - rule _igv_filter_maf: - input: - maf = str(rules._igv_reduce_maf_cols.output.maf), - regions = str(rules._igv_liftover_regions.output.regions) - output: - maf = CFG["dirs"]["maf_filtered"] + "{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf" - params: - regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], - oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], - n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000, - metadata = CFG["runs"] - script: - config["lcr-modules"]["igv"]["scripts"]["filter_script"] +# Pass metadata as a pandas dataframe directly from the samples value specified in config +rule _igv_filter_maf: + input: + maf = str(rules._igv_reduce_maf_cols.output.maf), + regions = str(rules._igv_liftover_regions.output.regions) + output: + maf = CFG["dirs"]["inputs"] + "maf/filtered_maf/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf" + params: + regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], + oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], + n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000, + metadata = CFG["runs"] + wildcard_constraints: + seq_type = "[a-zA-Z]+" + script: + config["lcr-modules"]["igv"]["scripts"]["filter_script"] # Create multiple batch scripts per sample for each variant checkpoint _igv_create_batch_script_per_variant: input: - filter_maf = expand(str(rules._igv_filter_maf.output.maf), zip, normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], allow_missing=True), + filter_maf = expand(str(rules._igv_filter_maf.output.maf), zip, normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], allow_missing=True)[0], bam_file = 
str(rules._igv_symlink_bam.output.bam), bai_file = str(rules._igv_symlink_bai.output.bai), regions_lifted = str(rules._igv_liftover_regions.output.regions), From 874d594012792f81c8b46cc6f6fd0beea69c0a63 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 15 Mar 2023 09:59:24 -0700 Subject: [PATCH 031/132] Add log outputs to script rules --- modules/igv/1.0/etc/filter_maf.py | 67 ++++++++++++++++----------- modules/igv/1.0/etc/format_regions.py | 42 ++++++++++------- modules/igv/1.0/igv.smk | 3 ++ 3 files changed, 66 insertions(+), 46 deletions(-) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 2aac3ba2..3107c2b4 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -1,10 +1,47 @@ #!/usr/bin/env python import os +import sys import math import pandas as pd import oncopipe as op +def main(): + + with open(snakemake.log[0], "w") as stdout, open(snakemake.log[1], "w") as stderr: + # Set up logging + sys.stdout = stdout + sys.stderr = stderr + + maf_file = snakemake.input[0] + + regions_file = snakemake.input[1] + regions_format = snakemake.params[0] + + metadata = snakemake.params[3] + + if regions_format == "oncodriveclustl": + global CLUSTL_PARAMS + CLUSTL_PARAMS = snakemake.params[1] + + n_snapshots = snakemake.params[2] + + output_file = snakemake.output[0] + + maf = maf_add_columns(maf=maf_file, metadata=metadata) + + # Peform filtering + + filtered_maf = maf_filter( + maf=maf, + regions=regions_file, + regions_format=regions_format + ) + + filtered_maf = maf_reduce_snapshots(maf=filtered_maf, snapshots=n_snapshots) + + write_output(filtered_maf, output_file) + def filter_by_bed(maf, regions): # Remove row containing column names @@ -78,31 +115,5 @@ def maf_add_columns(maf, metadata): def write_output(maf, outfile): maf.to_csv(outfile, sep="\t", index=False) -maf_file = snakemake.input[0] - -regions_file = snakemake.input[1] -regions_format = snakemake.params[0] - -metadata = snakemake.params[3] - -if regions_format == "oncodriveclustl": - # This should act as a global variable - CLUSTL_PARAMS = snakemake.params[1] - -n_snapshots = snakemake.params[2] - -output_file = snakemake.output[0] - -maf = maf_add_columns(maf=maf_file, metadata=metadata) - -# Peform filtering - -filtered_maf = maf_filter( - maf=maf, - regions=regions_file, - regions_format=regions_format - ) - -filtered_maf = maf_reduce_snapshots(maf=filtered_maf, snapshots=n_snapshots) - -write_output(filtered_maf, output_file) +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 770f5ffa..3c9eae9b 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -1,36 +1,42 @@ #!/usr/bin/env python import os +import sys import pandas as pd import oncopipe as op import shutil def main(): - regions_file = snakemake.input[0] - regions_format = snakemake.params[0] + with open(snakemake.log[0], "w") as stdout, open(snakemake.log[1], "w") as stderr: + # Set up logging + sys.stdout = stdout + sys.stderr = stderr - output_file = snakemake.output[0] + regions_file = snakemake.input[0] + regions_format = snakemake.params[0] - if regions_format == "oncodriveclustl": - global CLUSTL_PARAMS - CLUSTL_PARAMS = snakemake.params[1] + output_file = snakemake.output[0] - if regions_format == "mutation_id": - global REGIONS_BUILD - REGIONS_BUILD = snakemake.params[2] - REGIONS_BUILD = REGIONS_BUILD.lower() + if regions_format == 
"oncodriveclustl": + global CLUSTL_PARAMS + CLUSTL_PARAMS = snakemake.params[1] - if regions_format == "bed" or regions_format == "maf": - # Do not need to reformat for liftover - shutil.copy(regions_file, output_file) - exit() + if regions_format == "mutation_id": + global REGIONS_BUILD + REGIONS_BUILD = snakemake.params[2] + REGIONS_BUILD = REGIONS_BUILD.lower() - # Reformat for liftover based on regions format - regions_formatted = format_regions(regions_file, regions_format) + if regions_format == "bed" or regions_format == "maf": + # Do not need to reformat for liftover + shutil.copy(regions_file, output_file) + exit() - # Output regions file - regions_formatted.to_csv(output_file, sep="\t", index=False) + # Reformat for liftover based on regions format + regions_formatted = format_regions(regions_file, regions_format) + + # Output regions file + regions_formatted.to_csv(output_file, sep="\t", index=False) def format_mutation_id(mutation_id): # Read regions into dataframe diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 80782417..0dc66b4b 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -192,6 +192,9 @@ rule _igv_filter_maf: oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000, metadata = CFG["runs"] + log: + stdout = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}/filter_maf.stdout.log", + stderr = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}/filter_maf.stderr.log" wildcard_constraints: seq_type = "[a-zA-Z]+" script: From 4c343345a6d5b0b4b5bb419d0a43dc10885eb2f7 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 16 Mar 2023 17:02:17 -0700 Subject: [PATCH 032/132] Add descriptions to rules, remove redundant header statements from merged batch --- modules/igv/1.0/igv.smk | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 0dc66b4b..c191a9fc 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -14,11 +14,7 @@ # Import package with useful functions for developing analysis modules import oncopipe as op import pandas as pd - -# Needed for getting snapshot paths import os -# Needed for copying contents of individual variant batch scripts to a merged sample batch_script -import shutil # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe min_oncopipe_version="1.0.11" @@ -46,7 +42,7 @@ CFG = op.setup_module( subdirectories = ["inputs", "batch_scripts", "igv", "snapshots", "outputs"], ) -# Rename genome builds in metadata to match up with MAFs? 
+# Rename genome_build values in sample metadata to correlate with MAF values CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["genome_map"]["grch37"]), "grch37", inplace=True) CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["genome_map"]["hg38"]), "hg38", inplace=True) @@ -121,7 +117,7 @@ rule _igv_symlink_maf: run: op.absolute_symlink(input.maf, output.maf) -# Filter to essential columns to prevent errors in parsing with pandas +# Reduce MAF columns to prevent parsing errors in Pandas rule _igv_reduce_maf_cols: input: maf = str(rules._igv_symlink_maf.output.maf) @@ -132,7 +128,7 @@ rule _igv_reduce_maf_cols: cut -f 1,5,6,7,9,10,11,13,16 {input.maf} > {output.maf} """) -# Prepare regions file for liftover +# Convert input regions file into BED format rule _igv_format_regions_file: input: regions = str(rules._igv_symlink_regions_file.output.regions_file) @@ -180,7 +176,7 @@ rule _igv_liftover_regions: {params.target_reference} > {log.stdout} 2> {log.stderr} """) -# Pass metadata as a pandas dataframe directly from the samples value specified in config +# Filter MAF to lines containing positions of interest rule _igv_filter_maf: input: maf = str(rules._igv_reduce_maf_cols.output.maf), @@ -200,7 +196,7 @@ rule _igv_filter_maf: script: config["lcr-modules"]["igv"]["scripts"]["filter_script"] -# Create multiple batch scripts per sample for each variant +# Create batch scripts for each variant within sample_id's filtered MAF checkpoint _igv_create_batch_script_per_variant: input: filter_maf = expand(str(rules._igv_filter_maf.output.maf), zip, normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], allow_missing=True)[0], @@ -221,6 +217,7 @@ checkpoint _igv_create_batch_script_per_variant: script: config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] +# Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id rule _igv_batches_to_merge: input: batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batch" @@ -235,14 +232,21 @@ rule _igv_batches_to_merge: batch_script = open(batch_script_path, "r") + with open(output_file, "r") as f: + merged_lines = len(f.readlines()) + with open(output_file, "a") as handle: for line in batch_script: + if merged_lines > 0: + if line.startswith(("load","maxPanelHeight","genome")): + continue handle.write(line) batch_script.close() output_touch = open(output.dispatched_batch_script, "w") output_touch.close() +# Return list of all batch scripts that were created from the filtered maf and merged def _evaluate_batches(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch @@ -284,6 +288,7 @@ rule _igv_download_igv: touch {output.igv_installed} """) +# Run IGV once all individual variant batch scripts have been merged into one script per sample_id checkpoint _igv_run: input: igv = str(rules._igv_download_igv.output.igv_installed), @@ -296,7 +301,7 @@ checkpoint _igv_run: params: merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" - threads: (workflow.cores / 5) + threads: (workflow.cores) shell: op.as_one_line(""" echo 'exit' >> {params.merged_batch} ; @@ -304,6 +309,7 @@ 
checkpoint _igv_run: touch {output.complete} """) +# Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" @@ -312,6 +318,7 @@ rule _igv_symlink_snapshot: run: op.relative_symlink(input.snapshot, output.snapshot) +# Return a list of all snapshots that were taken during IGV def _symlink_snapshot(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete @@ -344,6 +351,7 @@ def _symlink_snapshot(wildcards): else: return [] +# Check that snapshots have been symlinked rule _igv_check_snapshots: input: snapshots = _symlink_snapshot, From 901ab8de1adb760e2c3472d983549cfb6da4a44a Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 16 Mar 2023 17:37:45 -0700 Subject: [PATCH 033/132] Skip IGV run if empty batch script --- modules/igv/1.0/igv.smk | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index c191a9fc..30ef8321 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -296,16 +296,20 @@ checkpoint _igv_run: output: complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_sample_id}.completed" log: - stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_sample_id}_igv_run.stdout", - stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_sample_id}_igv_run.stderr" + stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_sample_id}_igv_run.stdout.log", + stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_sample_id}_igv_run.stderr.log" params: merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" threads: (workflow.cores) shell: op.as_one_line(""" + lines=$(wc -l < {params.merged_batch}) ; + if [ $lines -gt 0 ] ; + then echo 'exit' >> {params.merged_batch} ; xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} ; + fi ; touch {output.complete} """) From c229e744116f10931061502f4fe0676507679e6b Mon Sep 17 00:00:00 2001 From: mannycruz Date: Fri, 17 Mar 2023 12:34:52 -0700 Subject: [PATCH 034/132] Clean up input functions --- modules/igv/1.0/igv.smk | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 30ef8321..f8407c7c 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -250,7 +250,21 @@ rule _igv_batches_to_merge: def _evaluate_batches(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch - maf = expand(rules._igv_filter_maf.output.maf, zip, seq_type=wildcards.seq_type, genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_sample_id, normal_sample_id=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["normal_sample_id"], pair_status=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["pair_status"]) + + this_sample = 
op.filter_runs(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_genome_build = wildcards.genome_build, tuomur_seq_type = wildcards.seq_type) + + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + maf = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type=wildcards.seq_type, + genome_build=wildcards.genome_build, + tumour_sample_id=wildcards.tumour_sample_id, + normal_sample_id=normal_sample_id, + pair_status=pair_status + ) if os.path.exists(maf[0]): maf_table = pd.read_table(maf[0], comment="#", sep="\t") @@ -308,9 +322,10 @@ checkpoint _igv_run: if [ $lines -gt 0 ] ; then echo 'exit' >> {params.merged_batch} ; - xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} ; - fi ; + xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; + else touch {output.complete} + fi """) # Symlinks the final output files into the module results directory (under '99-outputs/') @@ -326,13 +341,20 @@ rule _igv_symlink_snapshot: def _symlink_snapshot(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + + this_sample = op.filter_runs(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + maf = expand( str(rules._igv_filter_maf.output.maf), - zip, seq_type=wildcards.seq_type, + zip, + seq_type=wildcards.seq_type, genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_sample_id, - normal_sample_id=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["normal_sample_id"], - pair_status=CFG["runs"][(CFG["runs"]["tumour_sample_id"]==wildcards.tumour_sample_id) & (CFG["runs"]["tumour_seq_type"]==wildcards.seq_type) & (CFG["runs"]["tumour_genome_build"]==wildcards.genome_build)]["pair_status"] + normal_sample_id=normal_sample_id, + pair_status=pair_status ) if os.path.exists(maf[0]): From a5e04c4f6bcda964ed39e5a0173051f2758ce684 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Fri, 17 Mar 2023 13:26:46 -0700 Subject: [PATCH 035/132] Rename tumour_sample_id wildcard to tumour_id --- modules/igv/1.0/igv.smk | 44 ++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index f8407c7c..a5fd0926 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -70,11 +70,11 @@ localrules: def get_bams(wildcards): metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{tumour_sample_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.tumour_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + return expand("data/{{seq_type}}_bams/{{tumour_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.tumour_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) def get_bai(wildcards): metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{tumour_sample_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.tumour_sample_id) & (metadata.seq_type == 
wildcards.seq_type)]["genome_build"]) + return expand("data/{{seq_type}}_bams/{{tumour_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.tumour_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) def get_maf(wildcards): unix_group = config["unix_group"] @@ -97,7 +97,7 @@ rule _igv_symlink_bam: input: bam = get_bams output: - bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{tumour_sample_id}.bam" + bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{tumour_id}.bam" run: op.absolute_symlink(input.bam, output.bam) @@ -105,7 +105,7 @@ rule _igv_symlink_bai: input: bai = get_bai output: - bai = CFG["dirs"]["inputs"] + "bams/{seq_type}/{tumour_sample_id}.bam.bai" + bai = CFG["dirs"]["inputs"] + "bams/{seq_type}/{tumour_id}.bam.bai" run: op.absolute_symlink(input.bai, output.bai) @@ -113,7 +113,7 @@ rule _igv_symlink_maf: input: maf = get_maf output: - maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf" + maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf" run: op.absolute_symlink(input.maf, output.maf) @@ -122,7 +122,7 @@ rule _igv_reduce_maf_cols: input: maf = str(rules._igv_symlink_maf.output.maf) output: - maf = temp(CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf.temp") + maf = temp(CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf.temp") shell: op.as_one_line(""" cut -f 1,5,6,7,9,10,11,13,16 {input.maf} > {output.maf} @@ -182,15 +182,15 @@ rule _igv_filter_maf: maf = str(rules._igv_reduce_maf_cols.output.maf), regions = str(rules._igv_liftover_regions.output.regions) output: - maf = CFG["dirs"]["inputs"] + "maf/filtered_maf/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.maf" + maf = CFG["dirs"]["inputs"] + "maf/filtered_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf" params: regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000, metadata = CFG["runs"] log: - stdout = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}/filter_maf.stdout.log", - stderr = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}/filter_maf.stderr.log" + stdout = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}/filter_maf.stdout.log", + stderr = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}/filter_maf.stderr.log" wildcard_constraints: seq_type = "[a-zA-Z]+" script: @@ -205,7 +205,7 @@ checkpoint _igv_create_batch_script_per_variant: regions_lifted = str(rules._igv_liftover_regions.output.regions), regions_formatted = str(rules._igv_format_regions_file.output.regions) output: - variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_sample_id}.batch" + variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_id}.batch" params: batch_dir = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], snapshot_dir = 
config["lcr-modules"]["igv"]["dirs"]["snapshots"], @@ -220,9 +220,9 @@ checkpoint _igv_create_batch_script_per_variant: # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id rule _igv_batches_to_merge: input: - batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batch" + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.batch" output: - dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.batched" + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.batched" params: batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch) threads: (workflow.cores / 10) @@ -251,7 +251,7 @@ def _evaluate_batches(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch - this_sample = op.filter_runs(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_genome_build = wildcards.genome_build, tuomur_seq_type = wildcards.seq_type) + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_genome_build = wildcards.genome_build, tuomur_seq_type = wildcards.seq_type) normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] @@ -308,10 +308,10 @@ checkpoint _igv_run: igv = str(rules._igv_download_igv.output.igv_installed), batch_script = _evaluate_batches output: - complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_sample_id}.completed" + complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" log: - stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_sample_id}_igv_run.stdout.log", - stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_sample_id}_igv_run.stderr.log" + stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stdout.log", + stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stderr.log" params: merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" @@ -331,9 +331,9 @@ checkpoint _igv_run: # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.png" output: - snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_sample_id}.png" + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.png" run: op.relative_symlink(input.snapshot, output.snapshot) @@ -342,7 +342,7 @@ def 
_symlink_snapshot(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete - this_sample = op.filter_runs(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] @@ -383,7 +383,7 @@ rule _igv_check_snapshots: snapshots = _symlink_snapshot, igv_completed = str(rules._igv_run.output.complete) output: - snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_sample_id}.completed" + snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" shell: "touch {output.snapshots}" @@ -391,7 +391,7 @@ rule _igv_check_snapshots: if CFG["test_run"] is False: rule _igv_all: input: - expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_sample_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) #if CFG["test_run"] is False: # rule _igv_all: @@ -401,7 +401,7 @@ if CFG["test_run"] is False: if CFG["test_run"] is True: rule _igv_all: input: - expand(rules._igv_filter_maf.output.maf, zip, seq_type=CFG["runs"]["tumour_seq_type"], tumour_sample_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], genome_build=CFG["runs"]["tumour_genome_build"]) + expand(rules._igv_filter_maf.output.maf, zip, seq_type=CFG["runs"]["tumour_seq_type"], tumour_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], genome_build=CFG["runs"]["tumour_genome_build"]) ##### CLEANUP ##### From bf0154e03def929d6941b589b313f5387a7fa57a Mon Sep 17 00:00:00 2001 From: mannycruz Date: Fri, 17 Mar 2023 14:10:40 -0700 Subject: [PATCH 036/132] Fix typos --- modules/igv/1.0/igv.smk | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index a5fd0926..9b238a2c 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -70,11 +70,11 @@ localrules: def get_bams(wildcards): metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{tumour_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.tumour_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + return expand("data/{{seq_type}}_bams/{{tumour_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.tumour_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) def get_bai(wildcards): metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{tumour_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.tumour_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + return expand("data/{{seq_type}}_bams/{{tumour_id}}.{genome_build}.bam.bai", 
genome_build=metadata[(metadata.sample_id == wildcards.tumour_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) def get_maf(wildcards): unix_group = config["unix_group"] @@ -251,7 +251,7 @@ def _evaluate_batches(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_genome_build = wildcards.genome_build, tuomur_seq_type = wildcards.seq_type) + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_genome_build = wildcards.genome_build, tumour_seq_type = wildcards.seq_type) normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] @@ -261,7 +261,7 @@ def _evaluate_batches(wildcards): zip, seq_type=wildcards.seq_type, genome_build=wildcards.genome_build, - tumour_sample_id=wildcards.tumour_sample_id, + tumour_id=wildcards.tumour_id, normal_sample_id=normal_sample_id, pair_status=pair_status ) @@ -276,7 +276,7 @@ def _evaluate_batches(wildcards): chromosome = maf_table["chr_std"], start_position = maf_table["Start_Position"], gene = maf_table["Hugo_Symbol"], - tumour_sample_id = maf_table["Tumor_Sample_Barcode"], + tumour_id = maf_table["Tumor_Sample_Barcode"], seq_type = maf_table["seq_type"], genome_build = maf_table["genome_build"], allow_missing=True @@ -324,7 +324,7 @@ checkpoint _igv_run: echo 'exit' >> {params.merged_batch} ; xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; else - touch {output.complete} + touch {output.complete} ; fi """) @@ -342,7 +342,7 @@ def _symlink_snapshot(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + this_sample = op.filter_samples(CFG["runs"], tumour_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] @@ -352,7 +352,7 @@ def _symlink_snapshot(wildcards): zip, seq_type=wildcards.seq_type, genome_build=wildcards.genome_build, - tumour_sample_id=wildcards.tumour_sample_id, + tumour_id=wildcards.tumour_id, normal_sample_id=normal_sample_id, pair_status=pair_status ) @@ -369,7 +369,7 @@ def _symlink_snapshot(wildcards): chromosome = maf_table["chr_std"], start_position = maf_table["Start_Position"], gene = maf_table["Hugo_Symbol"], - tumour_sample_id = maf_table["Tumor_Sample_Barcode"], + tumour_id = maf_table["Tumor_Sample_Barcode"], allow_missing=True ), padding = str(CFG["generate_batch_script"]["padding"]) From b15107008a9e37f85b0222594d4dc6b9862c70e5 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Fri, 17 Mar 2023 14:11:05 -0700 Subject: [PATCH 037/132] Remove bam and index file reformatting --- modules/igv/1.0/etc/generate_batch_script_per_variant.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index fc338d8f..14439e69 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -100,8 +100,8 @@ def 
generate_igv_batch_header(bam, index, max_height, genome_build): genome_build = genome_build.replace("grch37","hg19") - bam_file = os.path.realpath(bam) - bai_file = os.path.realpath(index) + bam_file = bam + bai_file = index lines.append(f"load {bam_file} index={bai_file}") lines.append(f"maxPanelHeight {max_height}") From 0e9e62ce079c7f18ecc39d9bfba83fda2593c121 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 20 Mar 2023 12:45:24 -0700 Subject: [PATCH 038/132] Fix typo in subset of runs table --- modules/igv/1.0/igv.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 9b238a2c..7cabab55 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -342,7 +342,7 @@ def _symlink_snapshot(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete - this_sample = op.filter_samples(CFG["runs"], tumour_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] From 80dcd76d347a8e918cdb14867d45b65b6dc7a1e7 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 22 Mar 2023 14:17:44 -0700 Subject: [PATCH 039/132] Add timeout proportional to batch lines --- modules/igv/1.0/igv.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 7cabab55..9f7bf1ff 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -322,7 +322,8 @@ checkpoint _igv_run: if [ $lines -gt 0 ] ; then echo 'exit' >> {params.merged_batch} ; - xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; + maxtime=$(($(wc -l < {params.merged_batch}) * 5 + 15)) ; + timeout $maxtime xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; else touch {output.complete} ; fi From 774ed17fb196f1603da0817c861531f9c6398d78 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 22 Mar 2023 14:21:10 -0700 Subject: [PATCH 040/132] Make dispatched batch file names cleaner --- modules/igv/1.0/igv.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 9f7bf1ff..7c8bce92 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -222,7 +222,7 @@ rule _igv_batches_to_merge: input: batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.batch" output: - dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.batched" + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.batch" params: batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch) threads: (workflow.cores / 10) From cae1f6efce7869882faeaa8f8bc3b9c7bbd4d965 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 22 Mar 2023 14:27:18 -0700 Subject: [PATCH 041/132] Improve test run output and 
functionality --- modules/igv/1.0/igv.smk | 96 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 6 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 7c8bce92..d1fd8d09 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -63,6 +63,8 @@ localrules: _igv_run, _igv_symlink_snapshot, _igv_check_snapshots, + _igv_mock_merge_batches, + _igv_estimate_snapshots ##### FUNCTIONS ##### @@ -388,21 +390,103 @@ rule _igv_check_snapshots: shell: "touch {output.snapshots}" +if CFG["test_run"] is True: + def _estimate_batches(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_outputs = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch + + this_sample = op.filter_samples(CFG["runs"], tumour_seq_type=wildcards.seq_type, tumour_genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_id) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + maf = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + tumour_id = wildcards.tumour_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status + ) + + if os.path.exists(maf[0]): + maf_table = pd.read_table(maf[0], comment="#", sep="\t") + + return expand( + expand( + CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.batch", + zip, + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"], + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_id = maf_table["Tumor_Sample_Barcode"], + allow_missing = True + ), + padding = str(CFG["generate_batch_script"]["padding"]) + ) + else: + return [] + + rule _igv_mock_merge_batches: + input: + batch_script = _estimate_batches + output: + batch_script = temp(CFG["dirs"]["batch_scripts"] + "mock_batches/{seq_type}--{genome_build}--{tumour_id}.batch") + run: + CFG = config["lcr-modules"]["igv"] + if not os.path.exists(CFG["dirs"]["batch_scripts"] + "mock_batches"): + os.mkdir(CFG["dirs"]["batch_scripts"] + "mock_batches") + with open(output.batch_script, "a") as out: + for batch in input.batch_script: + out.write(batch + "\n") + + rule _igv_estimate_snapshots: + input: + batch_scripts = expand(str(rules._igv_mock_merge_batches.output.batch_script), zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + output: + summary = CFG["dirs"]["batch_scripts"] + "mock_batches/snapshot_summary.txt" + params: + dispatch_dir = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/" + run: + gene_ids = [] + + for sample_batches in input.batch_scripts: + with open(sample_batches, "r") as handle: + for batch_path in handle: + batch_path = batch_path.split("/") + snapshot_name = batch_path[-1] + seq_type = batch_path[-2].split("--")[0] + genome_build = batch_path[-2].split("--")[1] + + potential_dispatch_file = params.dispatch_dir + f"{seq_type}--{genome_build}/{snapshot_name}" + + if not os.path.exists(potential_dispatch_file): + gene = snapshot_name.split("--")[2] + gene_ids.append(gene) + + snapshot_summary = pd.DataFrame( + { + "gene": list(set(gene_ids)), + "snapshots": map(gene_ids.count, list(set(gene_ids))) + } + ) + + snapshot_summary.loc["Total"] = snapshot_summary.sum() + 
snapshot_summary["gene"]["Total"] = "Total" + snapshot_summary.to_csv(output.summary, sep="\t", index=False) + # Generates the target sentinels for each run, which generate the symlinks if CFG["test_run"] is False: rule _igv_all: input: expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) -#if CFG["test_run"] is False: -# rule _igv_all: -# input: -# expand(str(rules._igv_check_outputs.output), zip, tumour_sample_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) - if CFG["test_run"] is True: rule _igv_all: input: - expand(rules._igv_filter_maf.output.maf, zip, seq_type=CFG["runs"]["tumour_seq_type"], tumour_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], genome_build=CFG["runs"]["tumour_genome_build"]) + str(rules._igv_estimate_snapshots.output.summary) ##### CLEANUP ##### From a376b4bffb05dd9fc738e64e1ad05cce75e5bde6 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 23 Mar 2023 11:12:52 -0700 Subject: [PATCH 042/132] Increase sleep interval to prevent cut-off snaps --- modules/igv/1.0/etc/generate_batch_script_per_variant.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 14439e69..fcb7ad5d 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -91,6 +91,7 @@ def generate_igv_batch_per_row(coordinates, snapshot_filename, igv_options): lines.append("collapse") for option in igv_options: lines.append(option) + lines.append("setSleepInterval 50") lines.append(f"snapshot {snapshot_filename}") return lines From 9490642b0fd058a50647c21a8f7670fd081248a7 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 23 Mar 2023 11:14:43 -0700 Subject: [PATCH 043/132] Prevent multiple exit statements added to batches --- modules/igv/1.0/igv.smk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index d1fd8d09..1335cfe5 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -323,7 +323,10 @@ checkpoint _igv_run: lines=$(wc -l < {params.merged_batch}) ; if [ $lines -gt 0 ] ; then + if ! 
grep -q -e "exit" {params.merged_batch} ; + then echo 'exit' >> {params.merged_batch} ; + fi ; maxtime=$(($(wc -l < {params.merged_batch}) * 5 + 15)) ; timeout $maxtime xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; else From 63ca90955b7a0188747b0bece1ed6309b4b706cf Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 23 Mar 2023 14:26:36 -0700 Subject: [PATCH 044/132] Add ability to take snapshots in pair orientation --- .../etc/generate_batch_script_per_variant.py | 23 +++++--- modules/igv/1.0/igv.smk | 55 +++++++++++-------- 2 files changed, 48 insertions(+), 30 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index fcb7ad5d..76e16652 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -46,7 +46,8 @@ def main(): genome_build = snakemake.params[2], seq_type = snakemake.params[3], igv_options = snakemake.params[5], - max_height = snakemake.params[6] + max_height = snakemake.params[6], + as_pairs = snakemake.params[7] ) touch_output = open(snakemake.output[0], "w") @@ -91,12 +92,12 @@ def generate_igv_batch_per_row(coordinates, snapshot_filename, igv_options): lines.append("collapse") for option in igv_options: lines.append(option) - lines.append("setSleepInterval 50") + lines.append("setSleepInterval 100") lines.append(f"snapshot {snapshot_filename}") return lines -def generate_igv_batch_header(bam, index, max_height, genome_build): +def generate_igv_batch_header(bam, index, max_height, genome_build, as_pairs): lines = [] genome_build = genome_build.replace("grch37","hg19") @@ -108,13 +109,16 @@ def generate_igv_batch_header(bam, index, max_height, genome_build): lines.append(f"maxPanelHeight {max_height}") lines.append(f"genome {genome_build}") + if as_pairs: + lines.append("viewaspairs") + return lines -def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, igv_options, max_height): +def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, igv_options, max_height, as_pairs=False): for _, row in regions.iterrows(): all_lines = [] - header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build) + header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build, as_pairs=as_pairs) all_lines.extend(header) dir_chrom = row.chromosome @@ -129,8 +133,13 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui filename.append(row.region_name) filename.append(row.sample_id) - batch_filename = "--".join(filename) + ".batch" - filename = "--".join(filename) + ".png" + if as_pairs: + suffix = ".pairs" + else: + suffix = "" + + batch_filename = "--".join(filename) + suffix + ".batch" + filename = "--".join(filename) + suffix + ".png" lines = generate_igv_batch_per_row( coordinates = row.snapshot_coordinates, diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 1335cfe5..6db330e1 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -85,6 +85,11 @@ def get_maf(wildcards): ##### RULES ##### +if CFG["view_as_pairs"]: + file_suffix = "pairs." 
+else: + file_suffix = "" + # Symlinks the input files into the module results directory (under '00-inputs/') rule _igv_symlink_regions_file: @@ -207,7 +212,7 @@ checkpoint _igv_create_batch_script_per_variant: regions_lifted = str(rules._igv_liftover_regions.output.regions), regions_formatted = str(rules._igv_format_regions_file.output.regions) output: - variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_id}.batch" + variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_id}.{suffix}batch" params: batch_dir = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], @@ -215,16 +220,17 @@ checkpoint _igv_create_batch_script_per_variant: seq_type = lambda w: w.seq_type, padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], igv_options = config["lcr-modules"]["igv"]["generate_batch_script"]["igv_options"], - max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"] + max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"], + view_pairs = config["lcr-modules"]["igv"]["view_as_pairs"] script: config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id rule _igv_batches_to_merge: input: - batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.batch" + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.{suffix}batch" output: - dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.batch" + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.{suffix}batch" params: batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch) threads: (workflow.cores / 10) @@ -240,7 +246,7 @@ rule _igv_batches_to_merge: with open(output_file, "a") as handle: for line in batch_script: if merged_lines > 0: - if line.startswith(("load","maxPanelHeight","genome")): + if line.startswith(("load","maxPanelHeight","genome","viewaspairs")): continue handle.write(line) batch_script.close() @@ -283,7 +289,8 @@ def _evaluate_batches(wildcards): genome_build = maf_table["genome_build"], allow_missing=True ), - padding=str(CFG["generate_batch_script"]["padding"]) + padding = str(CFG["generate_batch_script"]["padding"]), + suffix = file_suffix ) else: return [] @@ -310,10 +317,10 @@ checkpoint _igv_run: igv = str(rules._igv_download_igv.output.igv_installed), batch_script = _evaluate_batches output: - complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" + complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}.{suffix}completed" log: - stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stdout.log", - stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stderr.log" + stdout = CFG["logs"]["igv"] + 
"{seq_type}--{genome_build}/{tumour_id}_igv_run.{suffix}stdout.log", + stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.{suffix}stderr.log" params: merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" @@ -337,9 +344,9 @@ checkpoint _igv_run: # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.png" + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.{suffix}png" output: - snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.png" + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.{suffix}png" run: op.relative_symlink(input.snapshot, output.snapshot) @@ -378,7 +385,8 @@ def _symlink_snapshot(wildcards): tumour_id = maf_table["Tumor_Sample_Barcode"], allow_missing=True ), - padding = str(CFG["generate_batch_script"]["padding"]) + padding = str(CFG["generate_batch_script"]["padding"]), + suffix = file_suffix ) else: return [] @@ -389,7 +397,7 @@ rule _igv_check_snapshots: snapshots = _symlink_snapshot, igv_completed = str(rules._igv_run.output.complete) output: - snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" + snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_id}.{suffix}completed" shell: "touch {output.snapshots}" @@ -405,11 +413,11 @@ if CFG["test_run"] is True: maf = expand( str(rules._igv_filter_maf.output.maf), zip, - seq_type = wildcards.seq_type, - genome_build = wildcards.genome_build, - tumour_id = wildcards.tumour_id, - normal_sample_id = normal_sample_id, - pair_status = pair_status + seq_type=wildcards.seq_type, + genome_build=wildcards.genome_build, + tumour_id=wildcards.tumour_id, + normal_sample_id=normal_sample_id, + pair_status=pair_status ) if os.path.exists(maf[0]): @@ -417,7 +425,7 @@ if CFG["test_run"] is True: return expand( expand( - CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.batch", + CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.{suffix}batch", zip, seq_type = maf_table["seq_type"], genome_build = maf_table["genome_build"], @@ -427,7 +435,8 @@ if CFG["test_run"] is True: tumour_id = maf_table["Tumor_Sample_Barcode"], allow_missing = True ), - padding = str(CFG["generate_batch_script"]["padding"]) + padding = str(CFG["generate_batch_script"]["padding"]), + suffix = file_suffix ) else: return [] @@ -436,7 +445,7 @@ if CFG["test_run"] is True: input: batch_script = _estimate_batches output: - batch_script = temp(CFG["dirs"]["batch_scripts"] + "mock_batches/{seq_type}--{genome_build}--{tumour_id}.batch") + batch_script = temp(CFG["dirs"]["batch_scripts"] + "mock_batches/{seq_type}--{genome_build}--{tumour_id}.{suffix}batch") run: CFG = config["lcr-modules"]["igv"] if not os.path.exists(CFG["dirs"]["batch_scripts"] + "mock_batches"): @@ -447,7 +456,7 @@ if CFG["test_run"] is True: rule _igv_estimate_snapshots: 
input: - batch_scripts = expand(str(rules._igv_mock_merge_batches.output.batch_script), zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + batch_scripts = expand(str(rules._igv_mock_merge_batches.output.batch_script), zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], suffix=file_suffix) output: summary = CFG["dirs"]["batch_scripts"] + "mock_batches/snapshot_summary.txt" params: @@ -484,7 +493,7 @@ if CFG["test_run"] is True: if CFG["test_run"] is False: rule _igv_all: input: - expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], suffix=file_suffix) if CFG["test_run"] is True: rule _igv_all: From 58529cccdee8a10b7c0281d304cf454c34bf69c7 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Fri, 24 Mar 2023 01:26:31 -0700 Subject: [PATCH 045/132] Increase sleep interval even more --- modules/igv/1.0/etc/generate_batch_script_per_variant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 76e16652..492e4702 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -92,7 +92,7 @@ def generate_igv_batch_per_row(coordinates, snapshot_filename, igv_options): lines.append("collapse") for option in igv_options: lines.append(option) - lines.append("setSleepInterval 100") + lines.append("setSleepInterval 1000") lines.append(f"snapshot {snapshot_filename}") return lines From 5da0a49cdb73721aba75a3ed05975e21e5f4c773 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 27 Mar 2023 12:02:10 -0700 Subject: [PATCH 046/132] Add ability to handle empty MAF files --- modules/igv/1.0/etc/filter_maf.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 3107c2b4..08d9f8b2 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -28,6 +28,16 @@ def main(): output_file = snakemake.output[0] + # Return empty dataframe if no lines in MAF + line_count = count_lines(maf_file) + if line_count == 1: + empty_maf = pd.read_table(maf_file, comment="#", sep="\t") + # Add columns required by workflow + required_columns = ["seq_type","genome_build","chr_std"] + maf_table = maf_table.assign(**{col:None for col in required_columns if col not in empty_maf.columns}) + write_output(empty_maf, output_file) + exit() + maf = maf_add_columns(maf=maf_file, metadata=metadata) # Peform filtering @@ -42,6 +52,11 @@ def main(): write_output(filtered_maf, output_file) +def count_lines(maf): + with open(maf, "r") as handle: + total_lines = len(handle.readlines()) + return total_lines + def filter_by_bed(maf, regions): # Remove row containing column names From 95900fdef05315bb6f23ce2dde59c18baaef3c35 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 27 Mar 2023 12:03:49 -0700 Subject: [PATCH 047/132] Fix issues with pairs version 
file suffix --- modules/igv/1.0/igv.smk | 69 ++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 6db330e1..966481c9 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -86,9 +86,9 @@ def get_maf(wildcards): ##### RULES ##### if CFG["view_as_pairs"]: - file_suffix = "pairs." + SUFFIX = ".pairs" else: - file_suffix = "" + SUFFIX = "" # Symlinks the input files into the module results directory (under '00-inputs/') @@ -203,7 +203,7 @@ rule _igv_filter_maf: script: config["lcr-modules"]["igv"]["scripts"]["filter_script"] -# Create batch scripts for each variant within sample_id's filtered MAF +# Create batch scripts for each variant checkpoint _igv_create_batch_script_per_variant: input: filter_maf = expand(str(rules._igv_filter_maf.output.maf), zip, normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], allow_missing=True)[0], @@ -212,7 +212,7 @@ checkpoint _igv_create_batch_script_per_variant: regions_lifted = str(rules._igv_liftover_regions.output.regions), regions_formatted = str(rules._igv_format_regions_file.output.regions) output: - variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_id}.{suffix}batch" + variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".batch" params: batch_dir = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], @@ -228,9 +228,9 @@ checkpoint _igv_create_batch_script_per_variant: # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id rule _igv_batches_to_merge: input: - batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.{suffix}batch" + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}" + SUFFIX + ".batch" output: - dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.{suffix}batch" + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}" + SUFFIX + ".batch" params: batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch) threads: (workflow.cores / 10) @@ -289,8 +289,7 @@ def _evaluate_batches(wildcards): genome_build = maf_table["genome_build"], allow_missing=True ), - padding = str(CFG["generate_batch_script"]["padding"]), - suffix = file_suffix + padding = str(CFG["generate_batch_script"]["padding"]) ) else: return [] @@ -317,10 +316,10 @@ checkpoint _igv_run: igv = str(rules._igv_download_igv.output.igv_installed), batch_script = _evaluate_batches output: - complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}.{suffix}completed" + complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" log: - stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.{suffix}stdout.log", - stderr = CFG["logs"]["igv"] + 
"{seq_type}--{genome_build}/{tumour_id}_igv_run.{suffix}stderr.log" + stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stdout.log", + stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stderr.log" params: merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" @@ -344,9 +343,9 @@ checkpoint _igv_run: # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.{suffix}png" + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}" + SUFFIX + ".png" output: - snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.{suffix}png" + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}" + SUFFIX + ".png" run: op.relative_symlink(input.snapshot, output.snapshot) @@ -385,8 +384,7 @@ def _symlink_snapshot(wildcards): tumour_id = maf_table["Tumor_Sample_Barcode"], allow_missing=True ), - padding = str(CFG["generate_batch_script"]["padding"]), - suffix = file_suffix + padding = str(CFG["generate_batch_script"]["padding"]) ) else: return [] @@ -397,7 +395,7 @@ rule _igv_check_snapshots: snapshots = _symlink_snapshot, igv_completed = str(rules._igv_run.output.complete) output: - snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_id}.{suffix}completed" + snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" shell: "touch {output.snapshots}" @@ -425,7 +423,7 @@ if CFG["test_run"] is True: return expand( expand( - CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}.{suffix}batch", + CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}" + SUFFIX + ".batch", zip, seq_type = maf_table["seq_type"], genome_build = maf_table["genome_build"], @@ -435,8 +433,7 @@ if CFG["test_run"] is True: tumour_id = maf_table["Tumor_Sample_Barcode"], allow_missing = True ), - padding = str(CFG["generate_batch_script"]["padding"]), - suffix = file_suffix + padding = str(CFG["generate_batch_script"]["padding"]) ) else: return [] @@ -445,7 +442,7 @@ if CFG["test_run"] is True: input: batch_script = _estimate_batches output: - batch_script = temp(CFG["dirs"]["batch_scripts"] + "mock_batches/{seq_type}--{genome_build}--{tumour_id}.{suffix}batch") + batch_script = temp(CFG["dirs"]["batch_scripts"] + "mock_batches/{seq_type}--{genome_build}--{tumour_id}.batch") run: CFG = config["lcr-modules"]["igv"] if not os.path.exists(CFG["dirs"]["batch_scripts"] + "mock_batches"): @@ -456,44 +453,44 @@ if CFG["test_run"] is True: rule _igv_estimate_snapshots: input: - batch_scripts = expand(str(rules._igv_mock_merge_batches.output.batch_script), zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], suffix=file_suffix) + batch_scripts = expand(str(rules._igv_mock_merge_batches.output.batch_script), zip, 
tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) output: summary = CFG["dirs"]["batch_scripts"] + "mock_batches/snapshot_summary.txt" params: - dispatch_dir = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/" + dispatch_dir = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/", + suffix = SUFFIX run: - gene_ids = [] + sample_dictionary = {} for sample_batches in input.batch_scripts: with open(sample_batches, "r") as handle: for batch_path in handle: batch_path = batch_path.split("/") - snapshot_name = batch_path[-1] + snapshot_name = batch_path[-1].split(f"{params.suffix}.batch")[0] seq_type = batch_path[-2].split("--")[0] - genome_build = batch_path[-2].split("--")[1] - - potential_dispatch_file = params.dispatch_dir + f"{seq_type}--{genome_build}/{snapshot_name}" + genome_build = batch_path[-2].split("--")[1] + sample_id = snapshot_name.split("--")[3] + potential_dispatch_file = params.dispatch_dir + f"{seq_type}--{genome_build}/{snapshot_name}{params.suffix}.batch" if not os.path.exists(potential_dispatch_file): gene = snapshot_name.split("--")[2] - gene_ids.append(gene) + if not sample_id in list(sample_dictionary): + sample_dictionary[sample_id] = [gene] + else: + sample_dictionary[sample_id].append(gene) - snapshot_summary = pd.DataFrame( - { - "gene": list(set(gene_ids)), - "snapshots": map(gene_ids.count, list(set(gene_ids))) - } - ) + snapshot_summary = pd.DataFrame(list(sample_dictionary.items()), columns=["sample_id","snapshots"]) + snapshot_summary["snapshots"] = snapshot_summary["snapshots"].apply(lambda x: len(x)) snapshot_summary.loc["Total"] = snapshot_summary.sum() - snapshot_summary["gene"]["Total"] = "Total" + snapshot_summary["sample_id"]["Total"] = "Total" snapshot_summary.to_csv(output.summary, sep="\t", index=False) # Generates the target sentinels for each run, which generate the symlinks if CFG["test_run"] is False: rule _igv_all: input: - expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], suffix=file_suffix) + expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) if CFG["test_run"] is True: rule _igv_all: From 7a9b0f362a548d107802ad67d9a69007a36ff1b5 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 5 Apr 2023 15:03:34 -0700 Subject: [PATCH 048/132] Remove outdated lines --- modules/igv/1.0/config/default.yaml | 8 ++------ modules/igv/1.0/etc/filter_maf.py | 4 +--- modules/igv/1.0/igv.smk | 1 - 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index e9cddb6f..2a20b3eb 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -18,7 +18,6 @@ lcr-modules: p_value: # Desired q-value of OncodriveCLUSTL clusters score: # Desired scores of OncodriveCLUSTL clusters n_samples: # Desired number of samples in OncodriveCLUSTL clusters - n_snapshots: 20 # Number of snapshots to take per unique variant position. Default is set to max (1000000). 
genome_map: # Map different builds in metadata grch37: ["grch37","hg19","hg19-clc","hg19-reddy","hs37d5"] @@ -33,10 +32,8 @@ lcr-modules: hg38: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh38/Sequence/WholeGenomeFasta/genome.fa" generate_batch_script: - temp: True # Create temporary batch files - image_format: ".svg" - padding: 300 - max_height: 400 + padding: 80 + max_height: 1000 # Available batch script options: https://github.com/igvteam/igv/wiki/Batch-commands igv_options: [] @@ -55,7 +52,6 @@ lcr-modules: conda_envs: liftover_regions: "{MODSDIR}/envs/crossmap.yaml" - format_regions: "{MODSDIR}/envs/format_regions.yaml" wget: "{REPODIR}/envs/wget/wget-1.20.1.yaml" threads: diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 08d9f8b2..360844f0 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -18,14 +18,12 @@ def main(): regions_file = snakemake.input[1] regions_format = snakemake.params[0] - metadata = snakemake.params[3] + metadata = snakemake.params[2] if regions_format == "oncodriveclustl": global CLUSTL_PARAMS CLUSTL_PARAMS = snakemake.params[1] - n_snapshots = snakemake.params[2] - output_file = snakemake.output[0] # Return empty dataframe if no lines in MAF diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 966481c9..725a5dcf 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -193,7 +193,6 @@ rule _igv_filter_maf: params: regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], - n_snapshots = CFG["filter_maf"]["n_snapshots"] if CFG["filter_maf"]["n_snapshots"] is not None else 1000000, metadata = CFG["runs"] log: stdout = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}/filter_maf.stdout.log", From 2054e4177268e8981a30b56f6f98fe7c63a91997 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 5 Apr 2023 15:15:24 -0700 Subject: [PATCH 049/132] Add sleep timer + igv options to batch scripts + change how suffix is added to filename --- modules/igv/1.0/config/default.yaml | 3 +- .../etc/generate_batch_script_per_variant.py | 21 ++++--- modules/igv/1.0/igv.smk | 55 +++++++++---------- 3 files changed, 39 insertions(+), 40 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 2a20b3eb..6b19c7c8 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -34,6 +34,7 @@ lcr-modules: generate_batch_script: padding: 80 max_height: 1000 + sleep_timer: 2000 # Batch scripts with more options may require longer sleep intervals # Available batch script options: https://github.com/igvteam/igv/wiki/Batch-commands igv_options: [] @@ -53,7 +54,7 @@ lcr-modules: conda_envs: liftover_regions: "{MODSDIR}/envs/crossmap.yaml" wget: "{REPODIR}/envs/wget/wget-1.20.1.yaml" - + threads: step_1: 4 diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 492e4702..55599d45 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -47,7 +47,9 @@ def main(): seq_type = snakemake.params[3], igv_options = snakemake.params[5], max_height = snakemake.params[6], - as_pairs = snakemake.params[7] + suffix = snakemake.params[7], + as_pairs = snakemake.params[8], + sleep_timer = snakemake.params[9] ) touch_output 
= open(snakemake.output[0], "w") @@ -85,19 +87,17 @@ def output_lines(lines, batch_output): output.write(text) output.close() -def generate_igv_batch_per_row(coordinates, snapshot_filename, igv_options): +def generate_igv_batch_per_row(coordinates, snapshot_filename, sleep_timer): lines = [] lines.append(f"goto {coordinates}") lines.append("sort") lines.append("collapse") - for option in igv_options: - lines.append(option) - lines.append("setSleepInterval 1000") + lines.append(f"setSleepInterval {sleep_timer}") lines.append(f"snapshot {snapshot_filename}") return lines -def generate_igv_batch_header(bam, index, max_height, genome_build, as_pairs): +def generate_igv_batch_header(bam, index, max_height, genome_build, igv_options, as_pairs): lines = [] genome_build = genome_build.replace("grch37","hg19") @@ -108,17 +108,20 @@ def generate_igv_batch_header(bam, index, max_height, genome_build, as_pairs): lines.append(f"maxPanelHeight {max_height}") lines.append(f"genome {genome_build}") + + for option in igv_options: + lines.append(option) if as_pairs: lines.append("viewaspairs") return lines -def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, igv_options, max_height, as_pairs=False): +def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, igv_options, max_height, suffix, as_pairs=False, sleep_timer=2000): for _, row in regions.iterrows(): all_lines = [] - header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build, as_pairs=as_pairs) + header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build, igv_options=igv_options, as_pairs=as_pairs) all_lines.extend(header) dir_chrom = row.chromosome @@ -144,7 +147,7 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui lines = generate_igv_batch_per_row( coordinates = row.snapshot_coordinates, snapshot_filename = filename, - igv_options = igv_options + sleep_timer = sleep_timer ) all_lines.extend(lines) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 725a5dcf..fdd02d17 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -46,6 +46,10 @@ CFG = op.setup_module( CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["genome_map"]["grch37"]), "grch37", inplace=True) CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["genome_map"]["hg38"]), "hg38", inplace=True) +# Define output file suffix based on config parameters +SUFFIX = ".pad" + str(CFG["generate_batch_script"]["padding"]) +if CFG["view_as_pairs"]: + SUFFIX = SUFFIX + ".pairs" # Define rules to be run locally when using a compute cluster localrules: @@ -85,10 +89,6 @@ def get_maf(wildcards): ##### RULES ##### -if CFG["view_as_pairs"]: - SUFFIX = ".pairs" -else: - SUFFIX = "" # Symlinks the input files into the module results directory (under '00-inputs/') @@ -220,18 +220,21 @@ checkpoint _igv_create_batch_script_per_variant: padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], igv_options = config["lcr-modules"]["igv"]["generate_batch_script"]["igv_options"], max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"], - view_pairs = config["lcr-modules"]["igv"]["view_as_pairs"] + suffix = SUFFIX, + view_pairs = config["lcr-modules"]["igv"]["view_as_pairs"], + sleep_timer = config["lcr-modules"]["igv"]["generate_batch_script"]["sleep_timer"] script: 
config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id rule _igv_batches_to_merge: input: - batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}" + SUFFIX + ".batch" + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" output: - dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}" + SUFFIX + ".batch" + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" params: - batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch) + batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch), + igv_options = CFG["generate_batch_script"]["igv_options"] threads: (workflow.cores / 10) run: batch_script_path = os.path.abspath(input.batch_script) @@ -247,6 +250,8 @@ rule _igv_batches_to_merge: if merged_lines > 0: if line.startswith(("load","maxPanelHeight","genome","viewaspairs")): continue + if line.startswith(tuple(params.igv_options)): + continue handle.write(line) batch_script.close() @@ -277,7 +282,6 @@ def _evaluate_batches(wildcards): maf_table = pd.read_table(maf[0], comment="#", sep="\t") return expand( - expand( str(rules._igv_batches_to_merge.output.dispatched_batch_script), zip, chromosome = maf_table["chr_std"], @@ -285,11 +289,8 @@ def _evaluate_batches(wildcards): gene = maf_table["Hugo_Symbol"], tumour_id = maf_table["Tumor_Sample_Barcode"], seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"], - allow_missing=True - ), - padding = str(CFG["generate_batch_script"]["padding"]) - ) + genome_build = maf_table["genome_build"] + ) else: return [] @@ -313,7 +314,8 @@ rule _igv_download_igv: checkpoint _igv_run: input: igv = str(rules._igv_download_igv.output.igv_installed), - batch_script = _evaluate_batches + batch_script = _evaluate_batches, + merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch) output: complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" log: @@ -321,7 +323,8 @@ checkpoint _igv_run: stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stderr.log" params: merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), - igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh" + igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", + max_time = CFG["generate_batch_script"]["sleep_timer"] threads: (workflow.cores) shell: op.as_one_line(""" @@ -342,9 +345,9 @@ checkpoint _igv_run: # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}" + SUFFIX + ".png" + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" output: - snapshot = CFG["dirs"]["outputs"] + 
"{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}" + SUFFIX + ".png" + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" run: op.relative_symlink(input.snapshot, output.snapshot) @@ -372,7 +375,6 @@ def _symlink_snapshot(wildcards): maf_table = pd.read_table(maf[0], comment="#", sep="\t") return expand( - expand( str(rules._igv_symlink_snapshot.output.snapshot), zip, seq_type = maf_table["seq_type"], @@ -380,11 +382,8 @@ def _symlink_snapshot(wildcards): chromosome = maf_table["chr_std"], start_position = maf_table["Start_Position"], gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"], - allow_missing=True - ), - padding = str(CFG["generate_batch_script"]["padding"]) - ) + tumour_id = maf_table["Tumor_Sample_Barcode"] + ) else: return [] @@ -421,18 +420,14 @@ if CFG["test_run"] is True: maf_table = pd.read_table(maf[0], comment="#", sep="\t") return expand( - expand( - CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{padding}--{gene}--{tumour_id}" + SUFFIX + ".batch", + CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch", zip, seq_type = maf_table["seq_type"], genome_build = maf_table["genome_build"], chromosome = maf_table["chr_std"], start_position = maf_table["Start_Position"], gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"], - allow_missing = True - ), - padding = str(CFG["generate_batch_script"]["padding"]) + tumour_id = maf_table["Tumor_Sample_Barcode"] ) else: return [] From aa2d167cf94eb6e2ef18da03a83471dd37eff329 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 5 Apr 2023 15:17:33 -0700 Subject: [PATCH 050/132] Add option for time limit on IGV run based on lines in batch script in case it errors and never exists --- modules/igv/1.0/igv.smk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index fdd02d17..6a3f9677 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -335,9 +335,10 @@ checkpoint _igv_run: then echo 'exit' >> {params.merged_batch} ; fi ; - maxtime=$(($(wc -l < {params.merged_batch}) * 5 + 15)) ; - timeout $maxtime xvfb-run --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; + maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; + timeout $maxtime xvfb-run -s "-screen 0 1920x1080x24" --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; else + echo 'Skipping sample {wildcards.tumour_id} because it either has no variants to snapshot or all variants have been already been snapshot.' 
; touch {output.complete} ; fi """) From b5ff4f952f5ef5f428e4ed003dd14606026fe443 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 5 Apr 2023 15:19:56 -0700 Subject: [PATCH 051/132] Fix typos causing list errors --- modules/igv/1.0/igv.smk | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 6a3f9677..4756d81b 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -458,21 +458,20 @@ if CFG["test_run"] is True: sample_dictionary = {} for sample_batches in input.batch_scripts: + tumour_id = sample_batches.split("--")[-1].replace(".batch","") + sample_dictionary[tumour_id] = [] with open(sample_batches, "r") as handle: for batch_path in handle: batch_path = batch_path.split("/") snapshot_name = batch_path[-1].split(f"{params.suffix}.batch")[0] seq_type = batch_path[-2].split("--")[0] genome_build = batch_path[-2].split("--")[1] - sample_id = snapshot_name.split("--")[3] + sample_id = snapshot_name.split("--")[2] potential_dispatch_file = params.dispatch_dir + f"{seq_type}--{genome_build}/{snapshot_name}{params.suffix}.batch" if not os.path.exists(potential_dispatch_file): - gene = snapshot_name.split("--")[2] - if not sample_id in list(sample_dictionary): - sample_dictionary[sample_id] = [gene] - else: - sample_dictionary[sample_id].append(gene) + gene = snapshot_name.split("--")[1] + sample_dictionary[sample_id].append(gene) snapshot_summary = pd.DataFrame(list(sample_dictionary.items()), columns=["sample_id","snapshots"]) snapshot_summary["snapshots"] = snapshot_summary["snapshots"].apply(lambda x: len(x)) From 6717cd7b091edaac9886b37873c60dcfebd8f8a7 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 5 Apr 2023 15:21:26 -0700 Subject: [PATCH 052/132] Add example IGV options to default config --- modules/igv/1.0/config/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 6b19c7c8..569e0d43 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -36,7 +36,7 @@ lcr-modules: max_height: 1000 sleep_timer: 2000 # Batch scripts with more options may require longer sleep intervals # Available batch script options: https://github.com/igvteam/igv/wiki/Batch-commands - igv_options: [] + igv_options: ["colorBy INSERT_SIZE", "preference SAM.SHOW_CENTER_LINE TRUE", "expand 100", "NAME_PANEL_WIDTH 70"] scripts: format_regions: "etc/format_regions.py" From 60b84e8a1535e5147f1a53d05e9a163953d196f3 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 5 Apr 2023 15:22:39 -0700 Subject: [PATCH 053/132] Fix typos --- modules/igv/1.0/etc/filter_maf.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 360844f0..6f7eb6f2 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -32,14 +32,13 @@ def main(): empty_maf = pd.read_table(maf_file, comment="#", sep="\t") # Add columns required by workflow required_columns = ["seq_type","genome_build","chr_std"] - maf_table = maf_table.assign(**{col:None for col in required_columns if col not in empty_maf.columns}) + empty_maf = empty_maf.assign(**{col:None for col in required_columns if col not in empty_maf.columns}) write_output(empty_maf, output_file) exit() maf = maf_add_columns(maf=maf_file, metadata=metadata) - # Peform filtering - + # Perform filtering filtered_maf = maf_filter( maf=maf, 
regions=regions_file, From c5f443f8d30941b755b6cd74685b3cd8dd047b44 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 5 Apr 2023 15:23:52 -0700 Subject: [PATCH 054/132] Move padding value from filename to file extension --- modules/igv/1.0/etc/generate_batch_script_per_variant.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 55599d45..51a84da1 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -132,15 +132,9 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui filename = [] filename.append(row.region), - filename.append(str(row.padding)) filename.append(row.region_name) filename.append(row.sample_id) - if as_pairs: - suffix = ".pairs" - else: - suffix = "" - batch_filename = "--".join(filename) + suffix + ".batch" filename = "--".join(filename) + suffix + ".png" From c5444f6d7eed0b262841cc6bec9b0367440256c3 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 2 May 2023 12:02:30 -0700 Subject: [PATCH 055/132] Fix stderr logging, remove outdated fxn --- modules/igv/1.0/etc/filter_maf.py | 83 +++++++++++++++++-------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 6f7eb6f2..611c0c91 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -2,52 +2,63 @@ import os import sys -import math +import logging +import traceback import pandas as pd import oncopipe as op + +def log_exceptions(exctype, value, tb): + logging.critical(''.join(traceback.format_tb(tb))) + logging.critical('{0}: {1}'.format(exctype, value)) + +sys.excepthook = log_exceptions + def main(): - with open(snakemake.log[0], "w") as stdout, open(snakemake.log[1], "w") as stderr: + with open(snakemake.log[0], "w") as stdout: # Set up logging sys.stdout = stdout - sys.stderr = stderr - - maf_file = snakemake.input[0] + + try: - regions_file = snakemake.input[1] - regions_format = snakemake.params[0] + maf_file = snakemake.input[0] - metadata = snakemake.params[2] + regions_file = snakemake.input[1] + regions_format = snakemake.params[0] - if regions_format == "oncodriveclustl": - global CLUSTL_PARAMS - CLUSTL_PARAMS = snakemake.params[1] + metadata = snakemake.params[2] - output_file = snakemake.output[0] + if regions_format == "oncodriveclustl": + global CLUSTL_PARAMS + CLUSTL_PARAMS = snakemake.params[1] - # Return empty dataframe if no lines in MAF - line_count = count_lines(maf_file) - if line_count == 1: - empty_maf = pd.read_table(maf_file, comment="#", sep="\t") - # Add columns required by workflow - required_columns = ["seq_type","genome_build","chr_std"] - empty_maf = empty_maf.assign(**{col:None for col in required_columns if col not in empty_maf.columns}) - write_output(empty_maf, output_file) - exit() + output_file = snakemake.output[0] - maf = maf_add_columns(maf=maf_file, metadata=metadata) + # Return empty dataframe if no lines in MAF + line_count = count_lines(maf_file) + if line_count == 1: + empty_maf = pd.read_table(maf_file, comment="#", sep="\t") + # Add columns required by workflow + required_columns = ["seq_type","genome_build","chr_std"] + empty_maf = empty_maf.assign(**{col:None for col in required_columns if col not in empty_maf.columns}) + write_output(empty_maf, output_file) + exit() - # Perform filtering - filtered_maf = maf_filter( 
- maf=maf, - regions=regions_file, - regions_format=regions_format - ) + maf = maf_add_columns(maf=maf_file, metadata=metadata) - filtered_maf = maf_reduce_snapshots(maf=filtered_maf, snapshots=n_snapshots) + # Perform filtering + filtered_maf = maf_filter( + maf=maf, + regions=regions_file, + regions_format=regions_format + ) - write_output(filtered_maf, output_file) + write_output(filtered_maf, output_file) + + except Exception as e: + logging.error(e, exc_info=1) + raise def count_lines(maf): with open(maf, "r") as handle: @@ -98,12 +109,6 @@ def maf_filter(maf, regions, regions_format): return filter_functions[regions_format](maf, regions_df) -def maf_reduce_snapshots(maf, snapshots): - # Only include max of number of snapshots for each variant - maf = maf.groupby(["Chromosome","Start_Position", "End_Position", "Reference_Allele", "Tumor_Seq_Allele2"]).head(n=snapshots) - - return maf - def maf_add_columns(maf, metadata): # Read input MAF as df maf = pd.read_table(maf, comment="#", sep="\t") @@ -128,4 +133,10 @@ def write_output(maf, outfile): maf.to_csv(outfile, sep="\t", index=False) if __name__ == "__main__": + logging.basicConfig( + level=logging.DEBUG, + filename=snakemake.log[1], + filemode='w' + ) + main() \ No newline at end of file From bf68f6cc831a8b17527c668d618dc444c26cf77f Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 2 May 2023 12:03:58 -0700 Subject: [PATCH 056/132] Fix stderr logging --- modules/igv/1.0/etc/format_regions.py | 57 +++++++++++++++++---------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 3c9eae9b..866f4321 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -5,38 +5,50 @@ import pandas as pd import oncopipe as op import shutil +import logging +import traceback + +def log_exceptions(exctype, value, tb): + logging.critical(''.join(traceback.format_tb(tb))) + logging.critical('{0}: {1}'.format(exctype, value)) + +sys.excepthook = log_exceptions def main(): - with open(snakemake.log[0], "w") as stdout, open(snakemake.log[1], "w") as stderr: + with open(snakemake.log[0], "w") as stdout: # Set up logging sys.stdout = stdout - sys.stderr = stderr - regions_file = snakemake.input[0] - regions_format = snakemake.params[0] + try: + regions_file = snakemake.input[0] + regions_format = snakemake.params[0] - output_file = snakemake.output[0] + output_file = snakemake.output[0] - if regions_format == "oncodriveclustl": - global CLUSTL_PARAMS - CLUSTL_PARAMS = snakemake.params[1] + if regions_format == "oncodriveclustl": + global CLUSTL_PARAMS + CLUSTL_PARAMS = snakemake.params[1] - if regions_format == "mutation_id": - global REGIONS_BUILD - REGIONS_BUILD = snakemake.params[2] - REGIONS_BUILD = REGIONS_BUILD.lower() + if regions_format == "mutation_id": + global REGIONS_BUILD + REGIONS_BUILD = snakemake.params[2] + REGIONS_BUILD = REGIONS_BUILD.lower() - if regions_format == "bed" or regions_format == "maf": - # Do not need to reformat for liftover - shutil.copy(regions_file, output_file) - exit() + if regions_format == "bed" or regions_format == "maf": + # Do not need to reformat for liftover + shutil.copy(regions_file, output_file) + exit() - # Reformat for liftover based on regions format - regions_formatted = format_regions(regions_file, regions_format) + # Reformat for liftover based on regions format + regions_formatted = format_regions(regions_file, regions_format) - # Output regions file - 
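The logging pattern introduced in this commit (and mirrored in the other module scripts) redirects sys.stdout to the rule's stdout log and routes any uncaught traceback to the stderr log via logging plus a custom sys.excepthook. A self-contained sketch of the mechanism, using a made-up log filename in place of snakemake.log[1]:

    import sys
    import logging
    import traceback

    # Stand-in for snakemake.log[1]; the module points this at the rule's stderr log
    logging.basicConfig(level=logging.DEBUG, filename="example.stderr.log", filemode="w")

    def log_exceptions(exctype, value, tb):
        # Uncaught exceptions are written to the log file instead of being lost
        logging.critical("".join(traceback.format_tb(tb)))
        logging.critical("{0}: {1}".format(exctype, value))

    sys.excepthook = log_exceptions

    raise ValueError("demonstration error")  # lands in example.stderr.log; exit code is still non-zero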
regions_formatted.to_csv(output_file, sep="\t", index=False) + # Output regions file + regions_formatted.to_csv(output_file, sep="\t", index=False) + + except Exception as e: + logging.error(e, exc_info=1) + raise def format_mutation_id(mutation_id): # Read regions into dataframe @@ -131,4 +143,9 @@ def format_regions(regions, regions_format): return format_functions[regions_format](regions) if __name__ == "__main__": + logging.basicConfig( + level=logging.DEBUG, + filename=snakemake.log[1], + filemode='w' + ) main() \ No newline at end of file From 1f305bca870202d6877ce43508eec97ea160154a Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 2 May 2023 12:05:02 -0700 Subject: [PATCH 057/132] Fix stderr logging, move sleep interval to header --- .../etc/generate_batch_script_per_variant.py | 131 ++++++++++-------- 1 file changed, 77 insertions(+), 54 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 51a84da1..8df4f3e9 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -5,55 +5,72 @@ import numpy as np import pandas as pd import oncopipe as op +import sys +import logging +import traceback -def main(): +def log_exceptions(exctype, value, tb): + logging.critical(''.join(traceback.format_tb(tb))) + logging.critical('{0}: {1}'.format(exctype, value)) - input_maf = open(snakemake.input[0], "r") - input_bam = snakemake.input[1] - input_bai = snakemake.input[2] - - # Skip if no variants in outfile - line_count = 0 - for line in input_maf: - line_count += 1 - if line_count > 1: - break - if line_count < 2: - input_maf.close() - touch_output = open(snakemake.output[0],"w") - touch_output.close() - exit() - - # Return to top of MAF - input_maf.seek(0) - - # Read MAF file and create dataframe - regions = get_regions_df( - input_maf, - seq_type=snakemake.params[2], - padding=snakemake.params[4] - ) +sys.excepthook = log_exceptions - input_maf.close() - - # Create the batch scripts - generate_igv_batches( - regions = regions, - bam = input_bam, - bai = input_bai, - output_dir = snakemake.params[0], - snapshot_dir = snakemake.params[1], - genome_build = snakemake.params[2], - seq_type = snakemake.params[3], - igv_options = snakemake.params[5], - max_height = snakemake.params[6], - suffix = snakemake.params[7], - as_pairs = snakemake.params[8], - sleep_timer = snakemake.params[9] - ) - - touch_output = open(snakemake.output[0], "w") - touch_output.close() +def main(): + with open(snakemake.log[0], "w") as stdout: + # Set up logging + sys.stdout = stdout + + try: + input_maf = open(snakemake.input[0], "r") + input_bam = snakemake.input[1] + input_bai = snakemake.input[2] + + # Skip if no variants in outfile + line_count = 0 + for line in input_maf: + line_count += 1 + if line_count > 1: + break + if line_count < 2: + input_maf.close() + touch_output = open(snakemake.output[0],"w") + touch_output.close() + exit() + + # Return to top of MAF + input_maf.seek(0) + + # Read MAF file and create dataframe + regions = get_regions_df( + input_maf, + seq_type=snakemake.params[2], + padding=snakemake.params[4] + ) + + input_maf.close() + + # Create the batch scripts + generate_igv_batches( + regions = regions, + bam = input_bam, + bai = input_bai, + output_dir = snakemake.params[0], + snapshot_dir = snakemake.params[1], + genome_build = snakemake.params[2], + seq_type = snakemake.params[3], + igv_options = snakemake.params[5], + max_height 
= snakemake.params[6], + suffix = snakemake.params[7], + as_pairs = snakemake.params[8], + sleep_timer = snakemake.params[9] + ) + + touch_output = open(snakemake.output[0], "w") + touch_output.close() + + except Exception as e: + logging.error(e, exc_info=1) + raise def get_regions_df(input_maf, seq_type, padding): # Read MAF as dataframe @@ -87,17 +104,16 @@ def output_lines(lines, batch_output): output.write(text) output.close() -def generate_igv_batch_per_row(coordinates, snapshot_filename, sleep_timer): +def generate_igv_batch_per_row(coordinates, snapshot_filename): lines = [] lines.append(f"goto {coordinates}") lines.append("sort") lines.append("collapse") - lines.append(f"setSleepInterval {sleep_timer}") lines.append(f"snapshot {snapshot_filename}") return lines -def generate_igv_batch_header(bam, index, max_height, genome_build, igv_options, as_pairs): +def generate_igv_batch_header(bam, index, max_height, genome_build, igv_options, sleep_timer, as_pairs): lines = [] genome_build = genome_build.replace("grch37","hg19") @@ -109,8 +125,11 @@ def generate_igv_batch_header(bam, index, max_height, genome_build, igv_options, lines.append(f"maxPanelHeight {max_height}") lines.append(f"genome {genome_build}") - for option in igv_options: - lines.append(option) + if igv_options is not None: + for option in igv_options: + lines.append(option) + + lines.append(f"setSleepInterval {sleep_timer}") if as_pairs: lines.append("viewaspairs") @@ -121,7 +140,7 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui for _, row in regions.iterrows(): all_lines = [] - header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build, igv_options=igv_options, as_pairs=as_pairs) + header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build, igv_options=igv_options, sleep_timer=sleep_timer, as_pairs=as_pairs) all_lines.extend(header) dir_chrom = row.chromosome @@ -140,8 +159,7 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui lines = generate_igv_batch_per_row( coordinates = row.snapshot_coordinates, - snapshot_filename = filename, - sleep_timer = sleep_timer + snapshot_filename = filename ) all_lines.extend(lines) @@ -155,4 +173,9 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui output_lines(all_lines, batch_file_path) if __name__ == "__main__": + logging.basicConfig( + level=logging.DEBUG, + filename=snakemake.log[1], + filemode='w' + ) main() From d4d163d722687e450ad008afe449cbbf638ad8e9 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 2 May 2023 12:11:36 -0700 Subject: [PATCH 058/132] Add log to batch script rule --- modules/igv/1.0/igv.smk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 4756d81b..504f3b9b 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -223,6 +223,9 @@ checkpoint _igv_create_batch_script_per_variant: suffix = SUFFIX, view_pairs = config["lcr-modules"]["igv"]["view_as_pairs"], sleep_timer = config["lcr-modules"]["igv"]["generate_batch_script"]["sleep_timer"] + log: + stdout = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stdout.log", + stderr = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stderr.log" script: 
config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] @@ -248,7 +251,7 @@ rule _igv_batches_to_merge: with open(output_file, "a") as handle: for line in batch_script: if merged_lines > 0: - if line.startswith(("load","maxPanelHeight","genome","viewaspairs")): + if line.startswith(("load","maxPanelHeight","genome","viewaspairs","setSleepInterval")): continue if line.startswith(tuple(params.igv_options)): continue From 00ff6dd9ceda4b9ec5b2ca5d73c8d0653c0ed2a0 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 2 May 2023 13:08:15 -0700 Subject: [PATCH 059/132] Track changelog --- modules/igv/CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 modules/igv/CHANGELOG.md diff --git a/modules/igv/CHANGELOG.md b/modules/igv/CHANGELOG.md new file mode 100644 index 00000000..24815232 --- /dev/null +++ b/modules/igv/CHANGELOG.md @@ -0,0 +1,12 @@ +# Changelog + +All notable changes to the `igv` module will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [1.0] - 2023-01-10 + +This release was authored by Manuela Cruz. + +- No module design decisions explained here yet. From 2ac9e62aac3bf7498cb27afe738be1dbaba7b6a9 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 10 May 2023 09:55:31 -0700 Subject: [PATCH 060/132] Merge remote tracking branch with module/igv/1.0 From 2a5462bd2ded686cd1b678d7c6d7cb90e3c88d9d Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 7 Jun 2023 23:55:56 -0700 Subject: [PATCH 061/132] Organize config for clarity --- modules/igv/1.0/config/default.yaml | 61 +++++++++++++++-------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 569e0d43..3470b3f3 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -6,37 +6,41 @@ lcr-modules: # Available wildcards: {seq_type} {tumour_sample_id} {normal_sample_id} {pair_status} {genome_build} maf: "__UPDATE__" regions_file: "__UPDATE__" # Path to a MAF, VCF, BED, OncodriveCLUSTL clusters file, HotMAPS results file, or mutation_id file containing regions of interest to snapshot. - regions_format: "__UPDATE__" + regions_format: "__UPDATE__" # Available options are "bed", "maf", "oncodriveclust", "hotmaps" or "mutation_id" regions_build: "__UPDATE__" # Genome build of regions file, which will be lifted over as needed to filter MAFs on opposite genome builds - test_run: True # Stop after MAF filtering step to get an estimate of how many snapshots will be taken + test_run: True # Stop after MAF filtering step to estimate total number of snapshots - view_as_pairs: False # Toggle pairwise orientation in IGV + options: + filter_maf: + oncodriveclustl_options: # These parameters will filter the OncodriveCLUSTL cluster results file. + p_value: 0.001 # Maximum p-value of OncodriveCLUSTL clusters + score: # Minimum score of OncodriveCLUSTL clusters + n_samples: 5 # Desired number of samples in OncodriveCLUSTL clusters - filter_maf: - oncodriveclustl_options: # These parameters will filter the OncodriveCLUSTL cluster results file. 
- p_value: # Desired q-value of OncodriveCLUSTL clusters - score: # Desired scores of OncodriveCLUSTL clusters - n_samples: # Desired number of samples in OncodriveCLUSTL clusters + genome_map: # Map builds between metadata and MAFs + grch37: ["grch37","hg19","hg19-clc","hg19-reddy","hs37d5"] + hg38: ["hg38","hg38-nci","hg38-panea","grch38"] - genome_map: # Map different builds in metadata - grch37: ["grch37","hg19","hg19-clc","hg19-reddy","hs37d5"] - hg38: ["hg38","hg38-nci","hg38-panea","grch38"] + liftover_regions: + reference_chain_file: + grch37: "genomes/grch37/chains/grch37/hg19ToHg38.over.chain" + hg38: "genomes/hg38/chains/grch38/hg38ToHg19.over.chain" + target_reference: + grch37: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh37-lite/Sequence/WholeGenomeFasta/genome.fa" + hg38: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - liftover_regions: - reference_chain_file: - grch37: "genomes/grch37/chains/grch37/hg19ToHg38.over.chain" - hg38: "genomes/hg38/chains/grch38/hg38ToHg19.over.chain" - target_reference: - grch37: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh37-lite/Sequence/WholeGenomeFasta/genome.fa" - hg38: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh38/Sequence/WholeGenomeFasta/genome.fa" + generate_batch_script: + padding: 100 # Base pairs upstream and downstream of variant position + max_height: 1000 + sleep_timer: 2000 # Batch scripts with more options may require longer sleep intervals + # Available igv options: https://github.com/igvteam/igv/wiki/Batch-commands + igv_options: ["preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.DOWNSAMPLE_READS FALSE"] + view_as_pairs: False # Toggle pairwise orientation in IGV - generate_batch_script: - padding: 80 - max_height: 1000 - sleep_timer: 2000 # Batch scripts with more options may require longer sleep intervals - # Available batch script options: https://github.com/igvteam/igv/wiki/Batch-commands - igv_options: ["colorBy INSERT_SIZE", "preference SAM.SHOW_CENTER_LINE TRUE", "expand 100", "NAME_PANEL_WIDTH 70"] + xvfb_parameters: + server_number: "99" + server_args: "" scripts: format_regions: "etc/format_regions.py" @@ -47,16 +51,15 @@ lcr-modules: scratch_subdirectories: [] - options: - step_1: "" - step_2: "" - conda_envs: liftover_regions: "{MODSDIR}/envs/crossmap.yaml" wget: "{REPODIR}/envs/wget/wget-1.20.1.yaml" threads: - step_1: 4 + _igv_symlink_bam: 30 + _igv_symlink_bai: 30 + _igv_quality_control: 20 + _igv_symlink_snapshot: 20 resources: step_1: From 0491dc0affc6dd3d42b5946717c0e9bd246ccad5 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 8 Jun 2023 00:00:05 -0700 Subject: [PATCH 062/132] Match values to new config structure --- modules/igv/1.0/igv.smk | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 504f3b9b..26fba0c2 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -143,7 +143,7 @@ rule _igv_format_regions_file: regions = CFG["dirs"]["inputs"] + "regions/regions_file_formatted.txt" params: regions_format = CFG["inputs"]["regions_format"], - oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], + oncodriveclustl_params = CFG["options"]["filter_maf"]["oncodriveclustl_options"], regions_build = CFG["inputs"]["regions_build"] log: stdout = CFG["logs"]["inputs"] + "format_regions.stdout.log", @@ -165,8 +165,8 @@ rule _igv_liftover_regions: output: 
regions = CFG["dirs"]["inputs"] + "regions/regions_file_{genome_build}.crossmap.txt" params: - chain_file = reference_files(CFG["liftover_regions"]["reference_chain_file"][(CFG["inputs"]["regions_build"]).replace("hg19","grch37").replace("grch38","hg38")]), - target_reference = lambda w: config["lcr-modules"]["igv"]["liftover_regions"]["target_reference"][w.genome_build], + chain_file = reference_files(CFG["options"]["liftover_regions"]["reference_chain_file"][(CFG["inputs"]["regions_build"]).replace("hg19","grch37").replace("grch38","hg38")]), + target_reference = lambda w: config["lcr-modules"]["igv"]["options"]["liftover_regions"]["target_reference"][w.genome_build], regions_type = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], regions_build = CFG["inputs"]["regions_build"].replace("grch37","GRCh37").replace("hg38","GRCh38"), target_build = lambda w: w.genome_build.replace("grch37","GRCh37").replace("hg38", "GRCh38") @@ -192,7 +192,7 @@ rule _igv_filter_maf: maf = CFG["dirs"]["inputs"] + "maf/filtered_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf" params: regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], - oncodriveclustl_params = CFG["filter_maf"]["oncodriveclustl_options"], + oncodriveclustl_params = CFG["options"]["filter_maf"]["oncodriveclustl_options"], metadata = CFG["runs"] log: stdout = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}/filter_maf.stdout.log", @@ -217,12 +217,12 @@ checkpoint _igv_create_batch_script_per_variant: snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], genome_build = lambda w: w.genome_build, seq_type = lambda w: w.seq_type, - padding = config["lcr-modules"]["igv"]["generate_batch_script"]["padding"], - igv_options = config["lcr-modules"]["igv"]["generate_batch_script"]["igv_options"], - max_height = config["lcr-modules"]["igv"]["generate_batch_script"]["max_height"], + padding = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["padding"], + igv_options = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"], + max_height = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["max_height"], suffix = SUFFIX, - view_pairs = config["lcr-modules"]["igv"]["view_as_pairs"], - sleep_timer = config["lcr-modules"]["igv"]["generate_batch_script"]["sleep_timer"] + view_pairs = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["view_as_pairs"], + sleep_timer = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["sleep_timer"] log: stdout = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stdout.log", stderr = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stderr.log" @@ -237,7 +237,7 @@ rule _igv_batches_to_merge: dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" params: batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch), - igv_options = CFG["generate_batch_script"]["igv_options"] + igv_options = CFG["options"]["generate_batch_script"]["igv_options"] threads: (workflow.cores / 10) run: batch_script_path = os.path.abspath(input.batch_script) @@ -327,7 +327,9 @@ checkpoint _igv_run: params: merged_batch = 
str(rules._igv_create_batch_script_per_variant.output.variant_batch), igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", - max_time = CFG["generate_batch_script"]["sleep_timer"] + max_time = CFG["options"]["generate_batch_script"]["sleep_timer"], + server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", + server_args = CFG["options"]["xvfb_parameters"]["server_args"] threads: (workflow.cores) shell: op.as_one_line(""" From c4c12906e9f94036661883bd414bf58b87d31a91 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 8 Jun 2023 00:02:57 -0700 Subject: [PATCH 063/132] Access genome map from new config structure --- modules/igv/1.0/igv.smk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 26fba0c2..d035a341 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -43,12 +43,12 @@ CFG = op.setup_module( ) # Rename genome_build values in sample metadata to correlate with MAF values -CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["genome_map"]["grch37"]), "grch37", inplace=True) -CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["genome_map"]["hg38"]), "hg38", inplace=True) +CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["grch37"]), "grch37", inplace=True) +CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["hg38"]), "hg38", inplace=True) # Define output file suffix based on config parameters -SUFFIX = ".pad" + str(CFG["generate_batch_script"]["padding"]) -if CFG["view_as_pairs"]: +SUFFIX = ".pad" + str(CFG["options"]["generate_batch_script"]["padding"]) +if CFG["options"]["generate_batch_script"]["view_as_pairs"]: SUFFIX = SUFFIX + ".pairs" # Define rules to be run locally when using a compute cluster From dba738397aa2764a7723a9aeb8cba38f4ad69f17 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 8 Jun 2023 00:07:35 -0700 Subject: [PATCH 064/132] Set server number and xvfb arguments in igv rule --- modules/igv/1.0/igv.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index d035a341..cc5133f1 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -341,7 +341,7 @@ checkpoint _igv_run: echo 'exit' >> {params.merged_batch} ; fi ; maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; - timeout $maxtime xvfb-run -s "-screen 0 1920x1080x24" --auto-servernum {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; + timeout $maxtime xvfb-run -s "-screen 0 1920x1080x24" {params.server_number} {params.server_args} {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; else echo 'Skipping sample {wildcards.tumour_id} because it either has no variants to snapshot or all variants have been already been snapshot.' 
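The genome_map handling above collapses whatever build aliases appear in the sample metadata onto the two builds used by the MAFs, so downstream wildcards only ever see grch37 or hg38. A small standalone illustration of the same pandas idiom, using a made-up runs table (the module applies equivalent masks in place on CFG["runs"]):

    import pandas as pd

    runs = pd.DataFrame({"tumour_genome_build": ["hg19", "hs37d5", "grch38", "hg38-nci"]})
    genome_map = {
        "grch37": ["grch37", "hg19", "hg19-clc", "hg19-reddy", "hs37d5"],
        "hg38": ["hg38", "hg38-nci", "hg38-panea", "grch38"],
    }

    builds = runs["tumour_genome_build"]
    builds = builds.mask(builds.isin(genome_map["grch37"]), "grch37")
    builds = builds.mask(builds.isin(genome_map["hg38"]), "hg38")
    runs["tumour_genome_build"] = builds
    print(runs["tumour_genome_build"].tolist())  # ['grch37', 'grch37', 'hg38', 'hg38']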
; touch {output.complete} ; From 94f20a8fb326d01dc7f3adc9d13572d3405e6ecf Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 8 Jun 2023 00:09:12 -0700 Subject: [PATCH 065/132] Add check for IGV exit status to handle if xvfb-run error occurs but all snapshots have been taken succesfully --- modules/igv/1.0/igv.smk | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index cc5133f1..7a7593bd 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -342,6 +342,16 @@ checkpoint _igv_run: fi ; maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; timeout $maxtime xvfb-run -s "-screen 0 1920x1080x24" {params.server_number} {params.server_args} {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; + exit=$? ; + if [ $exit -ne 0 ] ; + then + if grep -q -e "No such process" {log.stderr} && grep -q -e "Executing Command: exit" {log.stdout} ; + then + echo "All IGV batch script commands have completed succesfully, but an Xvfb-run kill error has occurred." >> {log.stdout} && touch {output.complete} ; + else + false ; + fi ; + fi ; else echo 'Skipping sample {wildcards.tumour_id} because it either has no variants to snapshot or all variants have been already been snapshot.' ; touch {output.complete} ; From 41c5effd04c27bfd1935894bd4047afe290be2c6 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 8 Jun 2023 00:16:10 -0700 Subject: [PATCH 066/132] Add QC to catch truncated/wrong dimensions snapshots --- modules/igv/1.0/igv.smk | 82 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 7a7593bd..3c2aec87 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -358,15 +358,92 @@ checkpoint _igv_run: fi """) +rule _igv_quality_control: + input: + igv = str(rules._igv_run.output.complete), + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + output: + snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".qc") + params: + batch_script = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"single_batch_scripts/{w.seq_type}--{w.genome_build}/{w.chromosome}:{w.start_position}--{w.gene}--{w.tumour_id}" + SUFFIX + ".batch", + merged_batch = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"merged_batch_scripts/{w.seq_type}--{w.genome_build}/{w.tumour_id}" + SUFFIX + ".batch", + igv = config["lcr-modules"]["igv"]["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", + server_number = "-n " + config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_number"] if config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", + server_args = config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_args"], + batch_temp = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"single_batch_scripts/{w.seq_type}--{w.genome_build}/{w.chromosome}:{w.start_position}--{w.gene}--{w.tumour_id}" + SUFFIX + ".batch.temp" + log: + stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", + stderr = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + 
SUFFIX + "_quality_control.stderr.log" + threads: + CFG["threads"]["_igv_quality_control"] + run: + import subprocess + height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + if height == "559" or height == "506" or height=="547": + print(f"{input.snapshot} appears to be truncated. Rerunning IGV with increased sleep interval.") + os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 10000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') + os.system(f'echo "Snapshot appears to be truncated. Rerunning IGV on batch script {params.batch_script} with increased sleep interval" > {log.stdout}') + os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') + os.system(f'rm {params.batch_temp}') + if width == "640": + with open(log.stdout, "a") as handle: + handle.write(f"Snapshot {input.snapshot} width is 640. Improper dimensions might be due to xvfb-run unable to connect to current server {params.server_number} due to a server lock file existing in directory '/tmp/.X*'. Attempting to run on a different server number...") + if params.server_number == "--auto-servernum" or params.server_number == "-n 99": + new_server = 1 + while width == "640": + handle.write(f'Attempting...\nxvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') + os.system(f'maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') + width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + new_server += 1 + os.system(f'touch {output.snapshot_qc}') + # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" output: snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + threads: + CFG["threads"]["_igv_symlink_snapshot"] run: op.relative_symlink(input.snapshot, output.snapshot) +# Return a list of all snapshots that have undergone quality control +def _quality_control(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + maf = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type=wildcards.seq_type, + genome_build=wildcards.genome_build, + tumour_id=wildcards.tumour_id, + normal_sample_id=normal_sample_id, + pair_status=pair_status + ) + + if 
os.path.exists(maf[0]): + maf_table = pd.read_table(maf[0], comment="#", sep="\t") + return expand( + str(rules._igv_quality_control.output.snapshot_qc), + zip, + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"], + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_id = maf_table["Tumor_Sample_Barcode"] + ) + else: + return [] + # Return a list of all snapshots that were taken during IGV def _symlink_snapshot(wildcards): CFG = config["lcr-modules"]["igv"] @@ -403,11 +480,12 @@ def _symlink_snapshot(wildcards): else: return [] -# Check that snapshots have been symlinked +# Check that snapshots have been symlinked and quality controlled rule _igv_check_snapshots: input: snapshots = _symlink_snapshot, - igv_completed = str(rules._igv_run.output.complete) + igv_completed = str(rules._igv_run.output.complete), + quality_control = _quality_control output: snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" shell: From 636dc7bca66395e58c6f5ef593538aa8ddba6eb7 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Thu, 8 Jun 2023 00:16:43 -0700 Subject: [PATCH 067/132] Add thread limits to bam/bai symlinking rules --- modules/igv/1.0/igv.smk | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 3c2aec87..ec66a6f0 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -105,6 +105,8 @@ rule _igv_symlink_bam: bam = get_bams output: bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{tumour_id}.bam" + threads: + CFG["threads"]["_igv_symlink_bam"] run: op.absolute_symlink(input.bam, output.bam) @@ -113,6 +115,8 @@ rule _igv_symlink_bai: bai = get_bai output: bai = CFG["dirs"]["inputs"] + "bams/{seq_type}/{tumour_id}.bam.bai" + threads: + CFG["threads"]["_igv_symlink_bai"] run: op.absolute_symlink(input.bai, output.bai) From 1f4ecfa718604915d50413263b75dc4848b286cd Mon Sep 17 00:00:00 2001 From: mannycruz Date: Tue, 13 Jun 2023 11:28:25 -0700 Subject: [PATCH 068/132] Fix input error using input function --- modules/igv/1.0/igv.smk | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index ec66a6f0..764f6ac2 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -206,10 +206,30 @@ rule _igv_filter_maf: script: config["lcr-modules"]["igv"]["scripts"]["filter_script"] +def _get_maf(wildcards): + CFG = config["lcr-modules"]["igv"] + + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) + genome_build = this_sample["tumour_genome_build"] + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + return ( + expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = wildcards.seq_type, + genome_build = genome_build, + tumour_id = wildcards.tumour_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status + ) + ) + # Create batch scripts for each variant checkpoint _igv_create_batch_script_per_variant: input: - filter_maf = expand(str(rules._igv_filter_maf.output.maf), zip, normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"], allow_missing=True)[0], + filter_maf = _get_maf, bam_file = str(rules._igv_symlink_bam.output.bam), bai_file = str(rules._igv_symlink_bai.output.bai), regions_lifted = str(rules._igv_liftover_regions.output.regions), 
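For illustration, the following is a minimal, standalone sketch of the per-sample input-function pattern that PATCH 068/132 introduces with _get_maf. The runs table and MAF path template below are hypothetical stand-ins for this sketch only; the module itself resolves them through op.filter_samples() on CFG["runs"] and through rules._igv_filter_maf.output.maf.

import pandas as pd

# Hypothetical stand-ins for CFG["runs"] and the filtered-MAF output path template.
RUNS = pd.DataFrame({
    "tumour_sample_id": ["TumourA"],
    "tumour_seq_type": ["genome"],
    "tumour_genome_build": ["grch37"],
    "normal_sample_id": ["NormalA"],
    "pair_status": ["matched"],
})
MAF_TEMPLATE = "02-filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf"

def get_filtered_maf(tumour_id, seq_type):
    # Subset the runs table to this job's tumour_id/seq_type combination, then fill in
    # the remaining wildcards from that single row instead of zipping over the whole table.
    row = RUNS[(RUNS["tumour_sample_id"] == tumour_id) & (RUNS["tumour_seq_type"] == seq_type)].iloc[0]
    return MAF_TEMPLATE.format(
        seq_type=seq_type,
        genome_build=row["tumour_genome_build"],
        tumour_id=tumour_id,
        normal_sample_id=row["normal_sample_id"],
        pair_status=row["pair_status"],
    )

print(get_filtered_maf("TumourA", "genome"))  # 02-filter_maf/genome--grch37/TumourA--NormalA--matched.maf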
From f2a5fe3b0f83bfba7144bf6ef5899b29d1bcad63 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 19 Jun 2023 14:26:52 -0700 Subject: [PATCH 069/132] Add resource limits to config --- modules/igv/1.0/config/default.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 3470b3f3..2b1e17f1 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -62,8 +62,12 @@ lcr-modules: _igv_symlink_snapshot: 20 resources: - step_1: + _igv_liftover_regions: mem_mb: 2000 + _igv_run: + mem_mb: 2500 + _igv_quality_control: + mem_mb: 2500 pairing_config: genome: From 1af181bf29b554e4b3b3cde59489ef0b7940e3fe Mon Sep 17 00:00:00 2001 From: mannycruz Date: Mon, 19 Jun 2023 14:28:35 -0700 Subject: [PATCH 070/132] Add resource limits, handle missing MAFs, fix server assignment typo --- modules/igv/1.0/igv.smk | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 764f6ac2..f1b3ce3e 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -46,6 +46,14 @@ CFG = op.setup_module( CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["grch37"]), "grch37", inplace=True) CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["hg38"]), "hg38", inplace=True) +# Remove samples if MAF files don't exist +maf_path = CFG["inputs"]["maf"] +def get_maf_path(row): + return maf_path.format(unix_group=row["tumour_unix_group"], seq_type=row["tumour_seq_type"], tumour_id=row["tumour_sample_id"], normal_sample_id=row["normal_sample_id"], pair_status=row["pair_status"], genome_build=row["tumour_genome_build"]) + +CFG["runs"]["maf_path"] = CFG["runs"].apply(get_maf_path, axis=1) +CFG["runs"] = CFG["runs"][CFG["runs"]["maf_path"].apply(os.path.exists)] + # Define output file suffix based on config parameters SUFFIX = ".pad" + str(CFG["options"]["generate_batch_script"]["padding"]) if CFG["options"]["generate_batch_script"]["view_as_pairs"]: @@ -176,6 +184,8 @@ rule _igv_liftover_regions: target_build = lambda w: w.genome_build.replace("grch37","GRCh37").replace("hg38", "GRCh38") conda: CFG["conda_envs"]["liftover_regions"] + resources: + **CFG["resources"]["_igv_liftover_regions"] log: stdout = CFG["logs"]["inputs"] + "liftover_regions_{genome_build}.stdout.log", stderr = CFG["logs"]["inputs"] + "liftover_regions_{genome_build}.stderr.log" @@ -345,15 +355,17 @@ checkpoint _igv_run: merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch) output: complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" - log: - stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stdout.log", - stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stderr.log" params: merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", max_time = CFG["options"]["generate_batch_script"]["sleep_timer"], server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", server_args = CFG["options"]["xvfb_parameters"]["server_args"] + resources: + **CFG["resources"]["_igv_run"] + log: + stdout = CFG["logs"]["igv"] + 
"{seq_type}--{genome_build}/{tumour_id}_igv_run.stdout.log", + stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stderr.log" threads: (workflow.cores) shell: op.as_one_line(""" @@ -395,6 +407,8 @@ rule _igv_quality_control: server_number = "-n " + config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_number"] if config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", server_args = config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_args"], batch_temp = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"single_batch_scripts/{w.seq_type}--{w.genome_build}/{w.chromosome}:{w.start_position}--{w.gene}--{w.tumour_id}" + SUFFIX + ".batch.temp" + resources: + **CFG["resources"]["_igv_quality_control"] log: stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", stderr = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stderr.log" @@ -413,8 +427,10 @@ rule _igv_quality_control: if width == "640": with open(log.stdout, "a") as handle: handle.write(f"Snapshot {input.snapshot} width is 640. Improper dimensions might be due to xvfb-run unable to connect to current server {params.server_number} due to a server lock file existing in directory '/tmp/.X*'. Attempting to run on a different server number...") - if params.server_number == "--auto-servernum" or params.server_number == "-n 99": + if params.server_number == "--auto-servernum" or int(params.server_number.replace("-n ","")) >= 99: new_server = 1 + else: + new_server = int(params.server_number.replace("-n ","")) + 1 while width == "640": handle.write(f'Attempting...\nxvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') os.system(f'maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') @@ -425,7 +441,7 @@ rule _igv_quality_control: # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + snapshot = ancient(CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png") output: snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" threads: @@ -573,6 +589,7 @@ if CFG["test_run"] is True: suffix = SUFFIX run: sample_dictionary = {} + seen = [] for sample_batches in input.batch_scripts: tumour_id = sample_batches.split("--")[-1].replace(".batch","") @@ -581,6 +598,10 @@ if CFG["test_run"] is True: for batch_path in handle: batch_path = batch_path.split("/") snapshot_name = batch_path[-1].split(f"{params.suffix}.batch")[0] + if not snapshot_name in seen: + seen.append(snapshot_name) + else: + continue seq_type = batch_path[-2].split("--")[0] genome_build = batch_path[-2].split("--")[1] sample_id = snapshot_name.split("--")[2] From 838efdf9eabfed6ffe4379aa2c829976fffae109 Mon Sep 17 
00:00:00 2001 From: mannycruz Date: Wed, 12 Jul 2023 15:10:59 -0700 Subject: [PATCH 071/132] Improve quality control log descriptions, add thresholds to check for blank snaps, add while loop for multiple attempts, make symlink rule depend on successful completion of quality control step --- modules/igv/1.0/igv.smk | 137 ++++++++++++++++++++++------------------ 1 file changed, 74 insertions(+), 63 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index f1b3ce3e..b1ad896e 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -46,14 +46,6 @@ CFG = op.setup_module( CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["grch37"]), "grch37", inplace=True) CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["hg38"]), "hg38", inplace=True) -# Remove samples if MAF files don't exist -maf_path = CFG["inputs"]["maf"] -def get_maf_path(row): - return maf_path.format(unix_group=row["tumour_unix_group"], seq_type=row["tumour_seq_type"], tumour_id=row["tumour_sample_id"], normal_sample_id=row["normal_sample_id"], pair_status=row["pair_status"], genome_build=row["tumour_genome_build"]) - -CFG["runs"]["maf_path"] = CFG["runs"].apply(get_maf_path, axis=1) -CFG["runs"] = CFG["runs"][CFG["runs"]["maf_path"].apply(os.path.exists)] - # Define output file suffix based on config parameters SUFFIX = ".pad" + str(CFG["options"]["generate_batch_script"]["padding"]) if CFG["options"]["generate_batch_script"]["view_as_pairs"]: @@ -412,36 +404,56 @@ rule _igv_quality_control: log: stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", stderr = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stderr.log" - threads: - CFG["threads"]["_igv_quality_control"] + threads: (workflow.cores) run: import subprocess + success = True height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - if height == "559" or height == "506" or height=="547": - print(f"{input.snapshot} appears to be truncated. Rerunning IGV with increased sleep interval.") + if height in ["506","547","559"]: + attempts = 0 os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 10000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') - os.system(f'echo "Snapshot appears to be truncated. Rerunning IGV on batch script {params.batch_script} with increased sleep interval" > {log.stdout}') - os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') - os.system(f'rm {params.batch_temp}') + while height in ["506","547","559"] and attempts < 4: + os.system(f'echo "Snapshot may be truncated. Current snapshot height is {height}. 
Rerunning IGV batch script {params.batch_script} with increased sleep interval.\n" >> {log.stdout}') + attempts += 1 + os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stdout}') + os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stderr}') + os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') + height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + if height in ["547","559"]: + kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {input.snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] + blank_kurtosis = {"547": 18.5, "559": 18.2} + blank_skew = -4 + if kurtosis > blank_kurtosis[height] and skewness < blank_skew: + os.system(f'echo "Snapshot may be blank. Current values:\nHeight:{height}, kurtosis: {str(kurtosis)}, skewness: {str(skewness)}\nSnapshots with height of 547, kurtosis greater than 18.5, and skewness less than 4 are likely blank and may be due to errors reading BAM file headers or Java address bind errors. Snapshots with height of 559, kurtosis greater than 18.2, and skewness less than 4 are likely blank and may be due to errors during IGV run." >> {log.stdout}') + success = False + if height == "506": + os.system(f'echo "Snapshot height is {height} and may still be truncated or improperly loaded. Check snapshot {input.snapshot}" >> {log.stdout}') + success = False if width == "640": - with open(log.stdout, "a") as handle: - handle.write(f"Snapshot {input.snapshot} width is 640. Improper dimensions might be due to xvfb-run unable to connect to current server {params.server_number} due to a server lock file existing in directory '/tmp/.X*'. Attempting to run on a different server number...") - if params.server_number == "--auto-servernum" or int(params.server_number.replace("-n ","")) >= 99: - new_server = 1 - else: - new_server = int(params.server_number.replace("-n ","")) + 1 - while width == "640": - handle.write(f'Attempting...\nxvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') - os.system(f'maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') - width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - new_server += 1 - os.system(f'touch {output.snapshot_qc}') + attempts = 0 + os.system(f'echo "Snapshot appears to be in incorrect dimensions. Current width is {width} and might be due to xvfb-run unable to connect to current server {params.server_number} due to a server lock. Attempting to run on different server number..." >> {log.stdout}') + if params.server_number == "--auto-servernum" or int(params.server_number.replace("-n ","")) >= 99: + new_server = 1 + else: + new_server = int(params.server_number.replace("-n ","")) + 1 + while width == "640" and attempts < 5: + os.system(f'echo "Attempting with server number {new_server}..." >> {log.stdout}') + os.system(f'echo "Attempting with server number {new_server}..." 
>> {log.stderr}') + os.system(f'maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') + width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + new_server += 1 + if width == "640": + os.system(f'echo "Snapshot still appears to be in improper dimensions. Double check xvfb-run parameters." >> {log.stdout}') + success = False + if success == True: + os.system(f'touch {output.snapshot_qc}') # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: - snapshot = ancient(CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png") + snapshot = ancient(CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png"), + snapshot_qc = str(rules._igv_quality_control.output.snapshot_qc) output: snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" threads: @@ -450,39 +462,39 @@ rule _igv_symlink_snapshot: op.relative_symlink(input.snapshot, output.snapshot) # Return a list of all snapshots that have undergone quality control -def _quality_control(wildcards): - CFG = config["lcr-modules"]["igv"] - checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete - - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) - - normal_sample_id = this_sample["normal_sample_id"] - pair_status = this_sample["pair_status"] - - maf = expand( - str(rules._igv_filter_maf.output.maf), - zip, - seq_type=wildcards.seq_type, - genome_build=wildcards.genome_build, - tumour_id=wildcards.tumour_id, - normal_sample_id=normal_sample_id, - pair_status=pair_status - ) - - if os.path.exists(maf[0]): - maf_table = pd.read_table(maf[0], comment="#", sep="\t") - return expand( - str(rules._igv_quality_control.output.snapshot_qc), - zip, - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"], - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"] - ) - else: - return [] +#def _quality_control(wildcards): +# CFG = config["lcr-modules"]["igv"] +# checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete +# +# this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) +# +# normal_sample_id = this_sample["normal_sample_id"] +# pair_status = this_sample["pair_status"] +# +# maf = expand( +# str(rules._igv_filter_maf.output.maf), +# zip, +# seq_type=wildcards.seq_type, +# genome_build=wildcards.genome_build, +# tumour_id=wildcards.tumour_id, +# normal_sample_id=normal_sample_id, +# pair_status=pair_status +# ) +# +# if os.path.exists(maf[0]): +# maf_table = pd.read_table(maf[0], comment="#", sep="\t") +# return expand( +# str(rules._igv_quality_control.output.snapshot_qc), +# zip, +# seq_type = maf_table["seq_type"], +# genome_build = maf_table["genome_build"], +# chromosome = maf_table["chr_std"], 
+# start_position = maf_table["Start_Position"], +# gene = maf_table["Hugo_Symbol"], +# tumour_id = maf_table["Tumor_Sample_Barcode"] +# ) +# else: +# return [] # Return a list of all snapshots that were taken during IGV def _symlink_snapshot(wildcards): @@ -523,9 +535,8 @@ def _symlink_snapshot(wildcards): # Check that snapshots have been symlinked and quality controlled rule _igv_check_snapshots: input: - snapshots = _symlink_snapshot, igv_completed = str(rules._igv_run.output.complete), - quality_control = _quality_control + snapshots = _symlink_snapshot output: snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" shell: From 28ad3c27731e1f2345106ede4ba01e1d440e1462 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 12 Jul 2023 15:14:19 -0700 Subject: [PATCH 072/132] Use rule wildcards to set additional columns in MAF instead of extracting values from metadata matches for cases where sample id matches more than one seq_type --- modules/igv/1.0/etc/filter_maf.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 611c0c91..d3583767 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -45,7 +45,7 @@ def main(): write_output(empty_maf, output_file) exit() - maf = maf_add_columns(maf=maf_file, metadata=metadata) + maf = maf_add_columns(maf=maf_file, metadata=metadata, wildcards=snakemake.wildcards) # Perform filtering filtered_maf = maf_filter( @@ -109,18 +109,15 @@ def maf_filter(maf, regions, regions_format): return filter_functions[regions_format](maf, regions_df) -def maf_add_columns(maf, metadata): +def maf_add_columns(maf, metadata, wildcards): # Read input MAF as df maf = pd.read_table(maf, comment="#", sep="\t") - sample_id = maf["Tumor_Sample_Barcode"].unique()[0] - - row = metadata[metadata["tumour_sample_id"]==sample_id] - - seq_type = row["tumour_seq_type"].item() - genome_build = row["tumour_genome_build"].item() - normal_sample_id = row["normal_sample_id"].item() - pair_status = row["pair_status"].item() + sample_id = snakemake.wildcards["tumour_id"] + seq_type = snakemake.wildcards["seq_type"] + genome_build = snakemake.wildcards["genome_build"] + normal_sample_id = snakemake.wildcards["normal_sample_id"] + pair_status = snakemake.wildcards["pair_status"] maf["seq_type"] = seq_type maf["genome_build"] = genome_build From e6a1c3c8d55b01a2de61035e3670cd2d310ec1d2 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 12 Jul 2023 15:14:38 -0700 Subject: [PATCH 073/132] Fix typo in HotMAPS formatting function --- modules/igv/1.0/etc/format_regions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 866f4321..8dff0f3d 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -80,7 +80,7 @@ def format_hotmaps(hotmaps_regions): # Convert HotMAPS coordinates to BED format hotmaps_regions["chr_std"] = hotmaps_regions.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) - chr_std = "chr" + hotmaps_regions["Chromosome"].map(str) + chr_std = "chr" + hotmaps_regions["chr_std"].map(str) hotmaps_reformatted = pd.DataFrame( { From 6f71a23f81becdbf9e25df7911d54f1df18c3500 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 12 Jul 2023 20:13:07 -0700 Subject: [PATCH 074/132] Remove quality control thread config value --- modules/igv/1.0/config/default.yaml | 
1 - 1 file changed, 1 deletion(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 2b1e17f1..ba1a047a 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -58,7 +58,6 @@ lcr-modules: threads: _igv_symlink_bam: 30 _igv_symlink_bai: 30 - _igv_quality_control: 20 _igv_symlink_snapshot: 20 resources: From c06b404c1f5c364732b5f142405fa441e1a2cab9 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 9 Aug 2023 21:33:23 -0700 Subject: [PATCH 075/132] Handle corrupt snapshots and minimize sleep interval + tries --- modules/igv/1.0/igv.smk | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index b1ad896e..80af4ee7 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -408,12 +408,25 @@ rule _igv_quality_control: run: import subprocess success = True - height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + corrupt_checks = 0 + is_corrupt = True + while corrupt_checks < 2 and is_corrupt == True: + corrupt_checks += 1 + try: + height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + is_corrupt = False + except: + os.system(f'echo "Snapshot may be corrupt. Rerunning IGV, attempt {str(corrupt_checks)}:" >> {log.stdout}') + os.system(f'cat {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') + os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') + if is_corrupt == True: + os.system(f'echo "Snapshot may be corrupt." >> {log.stdout}') + success = False if height in ["506","547","559"]: attempts = 0 - os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 10000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') - while height in ["506","547","559"] and attempts < 4: + os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 5000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') + while height in ["506","547","559"] and attempts < 3: os.system(f'echo "Snapshot may be truncated. Current snapshot height is {height}. Rerunning IGV batch script {params.batch_script} with increased sleep interval.\n" >> {log.stdout}') attempts += 1 os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stdout}') @@ -425,7 +438,7 @@ rule _igv_quality_control: blank_kurtosis = {"547": 18.5, "559": 18.2} blank_skew = -4 if kurtosis > blank_kurtosis[height] and skewness < blank_skew: - os.system(f'echo "Snapshot may be blank. Current values:\nHeight:{height}, kurtosis: {str(kurtosis)}, skewness: {str(skewness)}\nSnapshots with height of 547, kurtosis greater than 18.5, and skewness less than 4 are likely blank and may be due to errors reading BAM file headers or Java address bind errors. 
Snapshots with height of 559, kurtosis greater than 18.2, and skewness less than 4 are likely blank and may be due to errors during IGV run." >> {log.stdout}') + os.system(f'echo "Snapshot may be blank. Current values:\nHeight:{height}, kurtosis: {str(kurtosis)}, skewness: {str(skewness)}\nSnapshots with height of 547, kurtosis greater than 18.5, and skewness less than 4 are likely blank, snapshots with height of 559, kurtosis greater than 18.2, and skewness less than 4 may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors or other errors occurring during IGV run." >> {log.stdout}') success = False if height == "506": os.system(f'echo "Snapshot height is {height} and may still be truncated or improperly loaded. Check snapshot {input.snapshot}" >> {log.stdout}') From 1b9facba591632115f8da06ef6151617e75bd67f Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 9 Aug 2023 21:36:00 -0700 Subject: [PATCH 076/132] Clean up commented out lines --- modules/igv/1.0/igv.smk | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 80af4ee7..b4b875c5 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -474,42 +474,7 @@ rule _igv_symlink_snapshot: run: op.relative_symlink(input.snapshot, output.snapshot) -# Return a list of all snapshots that have undergone quality control -#def _quality_control(wildcards): -# CFG = config["lcr-modules"]["igv"] -# checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete -# -# this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) -# -# normal_sample_id = this_sample["normal_sample_id"] -# pair_status = this_sample["pair_status"] -# -# maf = expand( -# str(rules._igv_filter_maf.output.maf), -# zip, -# seq_type=wildcards.seq_type, -# genome_build=wildcards.genome_build, -# tumour_id=wildcards.tumour_id, -# normal_sample_id=normal_sample_id, -# pair_status=pair_status -# ) -# -# if os.path.exists(maf[0]): -# maf_table = pd.read_table(maf[0], comment="#", sep="\t") -# return expand( -# str(rules._igv_quality_control.output.snapshot_qc), -# zip, -# seq_type = maf_table["seq_type"], -# genome_build = maf_table["genome_build"], -# chromosome = maf_table["chr_std"], -# start_position = maf_table["Start_Position"], -# gene = maf_table["Hugo_Symbol"], -# tumour_id = maf_table["Tumor_Sample_Barcode"] -# ) -# else: -# return [] - -# Return a list of all snapshots that were taken during IGV +# Return a list of all snapshots that were taken during IGV for each specific tumour_id, tumour_seq_type, and tumour_genome_build combination def _symlink_snapshot(wildcards): CFG = config["lcr-modules"]["igv"] checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete From 6a2a468150ad5b571fd563306dc7f250bdbe424a Mon Sep 17 00:00:00 2001 From: mannycruz Date: Sun, 13 Aug 2023 19:30:16 -0700 Subject: [PATCH 077/132] Rename snapshot estimate parameter for clarity --- modules/igv/1.0/config/default.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 2b1e17f1..7de7feca 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -6,10 +6,10 @@ lcr-modules: # Available wildcards: {seq_type} {tumour_sample_id} {normal_sample_id} {pair_status} 
{genome_build} maf: "__UPDATE__" regions_file: "__UPDATE__" # Path to a MAF, VCF, BED, OncodriveCLUSTL clusters file, HotMAPS results file, or mutation_id file containing regions of interest to snapshot. - regions_format: "__UPDATE__" # Available options are "bed", "maf", "oncodriveclust", "hotmaps" or "mutation_id" + regions_format: "__UPDATE__" # Available options are "bed", "maf", "oncodriveclustl", "hotmaps" or "mutation_id" regions_build: "__UPDATE__" # Genome build of regions file, which will be lifted over as needed to filter MAFs on opposite genome builds - test_run: True # Stop after MAF filtering step to estimate total number of snapshots + estimate_only: True # Stop after MAF filtering step to estimate total number of snapshots options: filter_maf: From fe60b9420cb4e682ce5fd9d918dc7d3042e90369 Mon Sep 17 00:00:00 2001 From: mannycruz Date: Sun, 13 Aug 2023 19:31:43 -0700 Subject: [PATCH 078/132] Add function to estimate snapshots --- modules/igv/1.0/igv.smk | 601 ++++++++++++++++++++-------------------- 1 file changed, 298 insertions(+), 303 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index b4b875c5..5ff57ed7 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -255,287 +255,243 @@ checkpoint _igv_create_batch_script_per_variant: script: config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] -# Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id -rule _igv_batches_to_merge: - input: - batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" - output: - dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" - params: - batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch), - igv_options = CFG["options"]["generate_batch_script"]["igv_options"] - threads: (workflow.cores / 10) - run: - batch_script_path = os.path.abspath(input.batch_script) - output_file = os.path.abspath(params.batch_script_file) - - batch_script = open(batch_script_path, "r") +if CFG["estimate_only"] == False: + # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id + rule _igv_batches_to_merge: + input: + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + output: + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + params: + batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch), + igv_options = CFG["options"]["generate_batch_script"]["igv_options"] + threads: (workflow.cores / 10) + run: + batch_script_path = os.path.abspath(input.batch_script) + output_file = os.path.abspath(params.batch_script_file) - with open(output_file, "r") as f: - merged_lines = len(f.readlines()) + batch_script = open(batch_script_path, "r") - with open(output_file, "a") as handle: - for line in batch_script: - if merged_lines > 0: - if line.startswith(("load","maxPanelHeight","genome","viewaspairs","setSleepInterval")): - continue - if 
line.startswith(tuple(params.igv_options)): - continue - handle.write(line) - batch_script.close() + with open(output_file, "r") as f: + merged_lines = len(f.readlines()) - output_touch = open(output.dispatched_batch_script, "w") - output_touch.close() + with open(output_file, "a") as handle: + for line in batch_script: + if merged_lines > 0: + if line.startswith(("load","maxPanelHeight","genome","viewaspairs","setSleepInterval")): + continue + if line.startswith(tuple(params.igv_options)): + continue + handle.write(line) + batch_script.close() -# Return list of all batch scripts that were created from the filtered maf and merged -def _evaluate_batches(wildcards): - CFG = config["lcr-modules"]["igv"] - checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch - - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_genome_build = wildcards.genome_build, tumour_seq_type = wildcards.seq_type) - - normal_sample_id = this_sample["normal_sample_id"] - pair_status = this_sample["pair_status"] + output_touch = open(output.dispatched_batch_script, "w") + output_touch.close() - maf = expand( - str(rules._igv_filter_maf.output.maf), - zip, - seq_type=wildcards.seq_type, - genome_build=wildcards.genome_build, - tumour_id=wildcards.tumour_id, - normal_sample_id=normal_sample_id, - pair_status=pair_status - ) - - if os.path.exists(maf[0]): - maf_table = pd.read_table(maf[0], comment="#", sep="\t") - - return expand( - str(rules._igv_batches_to_merge.output.dispatched_batch_script), - zip, - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"], - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"] - ) - else: - return [] - -rule _igv_download_igv: - output: - igv_zip = CFG["dirs"]["igv"] + "IGV_2.7.2.zip", - igv_installed = CFG["dirs"]["igv"] + "igv_2.7.2.installed" - conda: - CFG["conda_envs"]["wget"] - log: - stdout = CFG["logs"]["igv"] + "download_igv.stdout.log", - stderr = CFG["logs"]["igv"] + "download_igv.stderr.log" - shell: - op.as_one_line(""" - wget -O {output.igv_zip} https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip && - unzip {output.igv_zip} -d $(dirname {output.igv_zip}) > {log.stdout} 2> {log.stderr} && - touch {output.igv_installed} - """) + # Return list of all batch scripts that were created from the filtered maf and merged + def _evaluate_batches(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch -# Run IGV once all individual variant batch scripts have been merged into one script per sample_id -checkpoint _igv_run: - input: - igv = str(rules._igv_download_igv.output.igv_installed), - batch_script = _evaluate_batches, - merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch) - output: - complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" - params: - merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), - igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", - max_time = CFG["options"]["generate_batch_script"]["sleep_timer"], - server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", - server_args = 
CFG["options"]["xvfb_parameters"]["server_args"] - resources: - **CFG["resources"]["_igv_run"] - log: - stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stdout.log", - stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stderr.log" - threads: (workflow.cores) - shell: - op.as_one_line(""" - lines=$(wc -l < {params.merged_batch}) ; - if [ $lines -gt 0 ] ; - then - if ! grep -q -e "exit" {params.merged_batch} ; - then - echo 'exit' >> {params.merged_batch} ; - fi ; - maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; - timeout $maxtime xvfb-run -s "-screen 0 1920x1080x24" {params.server_number} {params.server_args} {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; - exit=$? ; - if [ $exit -ne 0 ] ; - then - if grep -q -e "No such process" {log.stderr} && grep -q -e "Executing Command: exit" {log.stdout} ; - then - echo "All IGV batch script commands have completed succesfully, but an Xvfb-run kill error has occurred." >> {log.stdout} && touch {output.complete} ; - else - false ; - fi ; - fi ; - else - echo 'Skipping sample {wildcards.tumour_id} because it either has no variants to snapshot or all variants have been already been snapshot.' ; - touch {output.complete} ; - fi - """) + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_genome_build = wildcards.genome_build, tumour_seq_type = wildcards.seq_type) -rule _igv_quality_control: - input: - igv = str(rules._igv_run.output.complete), - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" - output: - snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".qc") - params: - batch_script = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"single_batch_scripts/{w.seq_type}--{w.genome_build}/{w.chromosome}:{w.start_position}--{w.gene}--{w.tumour_id}" + SUFFIX + ".batch", - merged_batch = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"merged_batch_scripts/{w.seq_type}--{w.genome_build}/{w.tumour_id}" + SUFFIX + ".batch", - igv = config["lcr-modules"]["igv"]["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", - server_number = "-n " + config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_number"] if config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", - server_args = config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_args"], - batch_temp = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"single_batch_scripts/{w.seq_type}--{w.genome_build}/{w.chromosome}:{w.start_position}--{w.gene}--{w.tumour_id}" + SUFFIX + ".batch.temp" - resources: - **CFG["resources"]["_igv_quality_control"] - log: - stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", - stderr = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stderr.log" - threads: (workflow.cores) - run: - import subprocess - success = True - corrupt_checks = 0 - is_corrupt = True - while corrupt_checks < 2 and is_corrupt == True: - corrupt_checks += 1 - try: - height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", 
shell=True)).split("'")[1].split("\\n")[0] - width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - is_corrupt = False - except: - os.system(f'echo "Snapshot may be corrupt. Rerunning IGV, attempt {str(corrupt_checks)}:" >> {log.stdout}') - os.system(f'cat {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') - os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') - if is_corrupt == True: - os.system(f'echo "Snapshot may be corrupt." >> {log.stdout}') - success = False - if height in ["506","547","559"]: - attempts = 0 - os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 5000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') - while height in ["506","547","559"] and attempts < 3: - os.system(f'echo "Snapshot may be truncated. Current snapshot height is {height}. Rerunning IGV batch script {params.batch_script} with increased sleep interval.\n" >> {log.stdout}') - attempts += 1 - os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stdout}') - os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stderr}') - os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') - height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - if height in ["547","559"]: - kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {input.snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] - blank_kurtosis = {"547": 18.5, "559": 18.2} - blank_skew = -4 - if kurtosis > blank_kurtosis[height] and skewness < blank_skew: - os.system(f'echo "Snapshot may be blank. Current values:\nHeight:{height}, kurtosis: {str(kurtosis)}, skewness: {str(skewness)}\nSnapshots with height of 547, kurtosis greater than 18.5, and skewness less than 4 are likely blank, snapshots with height of 559, kurtosis greater than 18.2, and skewness less than 4 may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors or other errors occurring during IGV run." >> {log.stdout}') - success = False - if height == "506": - os.system(f'echo "Snapshot height is {height} and may still be truncated or improperly loaded. Check snapshot {input.snapshot}" >> {log.stdout}') - success = False - if width == "640": - attempts = 0 - os.system(f'echo "Snapshot appears to be in incorrect dimensions. Current width is {width} and might be due to xvfb-run unable to connect to current server {params.server_number} due to a server lock. Attempting to run on different server number..." >> {log.stdout}') - if params.server_number == "--auto-servernum" or int(params.server_number.replace("-n ","")) >= 99: - new_server = 1 - else: - new_server = int(params.server_number.replace("-n ","")) + 1 - while width == "640" and attempts < 5: - os.system(f'echo "Attempting with server number {new_server}..." >> {log.stdout}') - os.system(f'echo "Attempting with server number {new_server}..." 
>> {log.stderr}') - os.system(f'maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') - width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - new_server += 1 - if width == "640": - os.system(f'echo "Snapshot still appears to be in improper dimensions. Double check xvfb-run parameters." >> {log.stdout}') - success = False - if success == True: - os.system(f'touch {output.snapshot_qc}') + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] -# Symlinks the final output files into the module results directory (under '99-outputs/') -rule _igv_symlink_snapshot: - input: - snapshot = ancient(CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png"), - snapshot_qc = str(rules._igv_quality_control.output.snapshot_qc) - output: - snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" - threads: - CFG["threads"]["_igv_symlink_snapshot"] - run: - op.relative_symlink(input.snapshot, output.snapshot) + maf = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type=wildcards.seq_type, + genome_build=wildcards.genome_build, + tumour_id=wildcards.tumour_id, + normal_sample_id=normal_sample_id, + pair_status=pair_status + ) -# Return a list of all snapshots that were taken during IGV for each specific tumour_id, tumour_seq_type, and tumour_genome_build combination -def _symlink_snapshot(wildcards): - CFG = config["lcr-modules"]["igv"] - checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete - - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + if os.path.exists(maf[0]): + maf_table = pd.read_table(maf[0], comment="#", sep="\t") - normal_sample_id = this_sample["normal_sample_id"] - pair_status = this_sample["pair_status"] + return expand( + str(rules._igv_batches_to_merge.output.dispatched_batch_script), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_id = maf_table["Tumor_Sample_Barcode"], + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"] + ) + else: + return [] - maf = expand( - str(rules._igv_filter_maf.output.maf), - zip, - seq_type=wildcards.seq_type, - genome_build=wildcards.genome_build, - tumour_id=wildcards.tumour_id, - normal_sample_id=normal_sample_id, - pair_status=pair_status - ) + rule _igv_download_igv: + output: + igv_zip = CFG["dirs"]["igv"] + "IGV_2.7.2.zip", + igv_installed = CFG["dirs"]["igv"] + "igv_2.7.2.installed" + conda: + CFG["conda_envs"]["wget"] + log: + stdout = CFG["logs"]["igv"] + "download_igv.stdout.log", + stderr = CFG["logs"]["igv"] + "download_igv.stderr.log" + shell: + op.as_one_line(""" + wget -O {output.igv_zip} https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip && + unzip {output.igv_zip} -d $(dirname {output.igv_zip}) > {log.stdout} 2> {log.stderr} && + touch {output.igv_installed} + """) + + # Run IGV once all individual variant batch scripts have been merged into one script per sample_id + checkpoint _igv_run: + 
input: + igv = str(rules._igv_download_igv.output.igv_installed), + batch_script = _evaluate_batches, + merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch) + output: + complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" + params: + merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), + igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", + max_time = CFG["options"]["generate_batch_script"]["sleep_timer"], + server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", + server_args = CFG["options"]["xvfb_parameters"]["server_args"] + resources: + **CFG["resources"]["_igv_run"] + log: + stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stdout.log", + stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stderr.log" + threads: (workflow.cores) + shell: + op.as_one_line(""" + lines=$(wc -l < {params.merged_batch}) ; + if [ $lines -gt 0 ] ; + then + if ! grep -q -e "exit" {params.merged_batch} ; + then + echo 'exit' >> {params.merged_batch} ; + fi ; + maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; + timeout $maxtime xvfb-run -s "-screen 0 1920x1080x24" {params.server_number} {params.server_args} {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; + exit=$? ; + if [ $exit -ne 0 ] ; + then + if grep -q -e "No such process" {log.stderr} && grep -q -e "Executing Command: exit" {log.stdout} ; + then + echo "All IGV batch script commands have completed successfully, but an Xvfb-run kill error has occurred." >> {log.stdout} && touch {output.complete} ; + else + false ; + fi ; + fi ; + else + echo 'Skipping sample {wildcards.tumour_id} because it either has no variants to snapshot or all variants have already been snapshot.'
; + touch {output.complete} ; + fi + """) + + rule _igv_quality_control: + input: + igv = str(rules._igv_run.output.complete), + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + output: + snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".qc") + params: + batch_script = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"single_batch_scripts/{w.seq_type}--{w.genome_build}/{w.chromosome}:{w.start_position}--{w.gene}--{w.tumour_id}" + SUFFIX + ".batch", + merged_batch = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"merged_batch_scripts/{w.seq_type}--{w.genome_build}/{w.tumour_id}" + SUFFIX + ".batch", + igv = config["lcr-modules"]["igv"]["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", + server_number = "-n " + config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_number"] if config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", + server_args = config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_args"], + batch_temp = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"single_batch_scripts/{w.seq_type}--{w.genome_build}/{w.chromosome}:{w.start_position}--{w.gene}--{w.tumour_id}" + SUFFIX + ".batch.temp" + resources: + **CFG["resources"]["_igv_quality_control"] + log: + stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", + stderr = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stderr.log" + threads: (workflow.cores) + run: + import subprocess + success = True + corrupt_checks = 0 + is_corrupt = True + while corrupt_checks < 2 and is_corrupt == True: + corrupt_checks += 1 + try: + height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + is_corrupt = False + except: + os.system(f'echo "Snapshot may be corrupt. Rerunning IGV, attempt {str(corrupt_checks)}:" >> {log.stdout}') + os.system(f'cat {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') + os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') + if is_corrupt == True: + os.system(f'echo "Snapshot may be corrupt." >> {log.stdout}') + success = False + if height in ["506","547","559"]: + attempts = 0 + os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 5000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') + while height in ["506","547","559"] and attempts < 3: + os.system(f'echo "Snapshot may be truncated. Current snapshot height is {height}. 
Rerunning IGV batch script {params.batch_script} with increased sleep interval.\n" >> {log.stdout}') + attempts += 1 + os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stdout}') + os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stderr}') + os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') + height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + if height in ["547","559"]: + kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {input.snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] + blank_kurtosis = {"547": 18.5, "559": 18.2} + blank_skew = -4 + if kurtosis > blank_kurtosis[height] and skewness < blank_skew: + os.system(f'echo "Snapshot may be blank. Current values:\nHeight:{height}, kurtosis: {str(kurtosis)}, skewness: {str(skewness)}\nSnapshots with height of 547, kurtosis greater than 18.5, and skewness less than 4 are likely blank, snapshots with height of 559, kurtosis greater than 18.2, and skewness less than 4 may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors or other errors occurring during IGV run." >> {log.stdout}') + success = False + if height == "506": + os.system(f'echo "Snapshot height is {height} and may still be truncated or improperly loaded. Check snapshot {input.snapshot}" >> {log.stdout}') + success = False + if width == "640": + attempts = 0 + os.system(f'echo "Snapshot appears to be in incorrect dimensions. Current width is {width} and might be due to xvfb-run unable to connect to current server {params.server_number} due to a server lock. Attempting to run on different server number..." >> {log.stdout}') + if params.server_number == "--auto-servernum" or int(params.server_number.replace("-n ","")) >= 99: + new_server = 1 + else: + new_server = int(params.server_number.replace("-n ","")) + 1 + while width == "640" and attempts < 5: + os.system(f'echo "Attempting with server number {new_server}..." >> {log.stdout}') + os.system(f'echo "Attempting with server number {new_server}..." >> {log.stderr}') + os.system(f'maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') + width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + new_server += 1 + if width == "640": + os.system(f'echo "Snapshot still appears to be in improper dimensions. Double check xvfb-run parameters." 
>> {log.stdout}') + success = False + if success == True: + os.system(f'touch {output.snapshot_qc}') - if os.path.exists(maf[0]): - maf_table = pd.read_table(maf[0], comment="#", sep="\t") - - return expand( - str(rules._igv_symlink_snapshot.output.snapshot), - zip, - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"], - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"] - ) - else: - return [] - -# Check that snapshots have been symlinked and quality controlled -rule _igv_check_snapshots: - input: - igv_completed = str(rules._igv_run.output.complete), - snapshots = _symlink_snapshot - output: - snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" - shell: - "touch {output.snapshots}" + # Symlinks the final output files into the module results directory (under '99-outputs/') + rule _igv_symlink_snapshot: + input: + snapshot = ancient(CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png"), + snapshot_qc = str(rules._igv_quality_control.output.snapshot_qc) + output: + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + threads: + CFG["threads"]["_igv_symlink_snapshot"] + run: + op.relative_symlink(input.snapshot, output.snapshot) -if CFG["test_run"] is True: - def _estimate_batches(wildcards): + # Return a list of all snapshots that were taken during IGV for each specific tumour_id, tumour_seq_type, and tumour_genome_build combination + def _symlink_snapshot(wildcards): CFG = config["lcr-modules"]["igv"] - checkpoint_outputs = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch + checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) - this_sample = op.filter_samples(CFG["runs"], tumour_seq_type=wildcards.seq_type, tumour_genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_id) normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] maf = expand( str(rules._igv_filter_maf.output.maf), zip, - seq_type=wildcards.seq_type, - genome_build=wildcards.genome_build, - tumour_id=wildcards.tumour_id, - normal_sample_id=normal_sample_id, + seq_type=wildcards.seq_type, + genome_build=wildcards.genome_build, + tumour_id=wildcards.tumour_id, + normal_sample_id=normal_sample_id, pair_status=pair_status ) @@ -543,7 +499,7 @@ if CFG["test_run"] is True: maf_table = pd.read_table(maf[0], comment="#", sep="\t") return expand( - CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch", + str(rules._igv_symlink_snapshot.output.snapshot), zip, seq_type = maf_table["seq_type"], genome_build = maf_table["genome_build"], @@ -555,68 +511,107 @@ if CFG["test_run"] is True: else: return [] - rule _igv_mock_merge_batches: + # Check that snapshots have been symlinked and quality controlled + rule _igv_check_snapshots: + input: + igv_completed = str(rules._igv_run.output.complete), + snapshots = _symlink_snapshot + output: + snapshots = CFG["dirs"]["outputs"] + 
"completed/{seq_type}--{genome_build}--{tumour_id}.completed" + shell: + "touch {output.snapshots}" + +if CFG["estimate_only"] is True: + + rule _igv_touch_summary: input: - batch_script = _estimate_batches + finished_batch_scripts = expand(str(rules._igv_create_batch_script_per_variant.output.variant_batch), zip, seq_type = CFG["runs"]["tumour_seq_type"], tumour_id=CFG["runs"]["tumour_sample_id"], genome_build=CFG["runs"]["tumour_genome_build"]) output: - batch_script = temp(CFG["dirs"]["batch_scripts"] + "mock_batches/{seq_type}--{genome_build}--{tumour_id}.batch") + snapshot_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/snapshot_summary.txt", + summary_ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/touch_summary.completed") run: - CFG = config["lcr-modules"]["igv"] - if not os.path.exists(CFG["dirs"]["batch_scripts"] + "mock_batches"): - os.mkdir(CFG["dirs"]["batch_scripts"] + "mock_batches") - with open(output.batch_script, "a") as out: - for batch in input.batch_script: - out.write(batch + "\n") + header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position"]) + with open(output.snapshot_summary,"w") as handle: + handle.write(header + "\n") + ready = open(output.summary_ready, "w") + ready.close() rule _igv_estimate_snapshots: input: - batch_scripts = expand(str(rules._igv_mock_merge_batches.output.batch_script), zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + single_batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch", + summary_file_ready = str(rules._igv_touch_summary.output.summary_ready) output: - summary = CFG["dirs"]["batch_scripts"] + "mock_batches/snapshot_summary.txt" + mock_dispatch_batch_script = temp(CFG["dirs"]["batch_scripts"] + "mock_dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch") params: - dispatch_dir = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/", - suffix = SUFFIX + snapshot_summary = str(rules._igv_touch_summary.output.snapshot_summary), + real_dispatch_file = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" run: - sample_dictionary = {} - seen = [] - - for sample_batches in input.batch_scripts: - tumour_id = sample_batches.split("--")[-1].replace(".batch","") - sample_dictionary[tumour_id] = [] - with open(sample_batches, "r") as handle: - for batch_path in handle: - batch_path = batch_path.split("/") - snapshot_name = batch_path[-1].split(f"{params.suffix}.batch")[0] - if not snapshot_name in seen: - seen.append(snapshot_name) - else: - continue - seq_type = batch_path[-2].split("--")[0] - genome_build = batch_path[-2].split("--")[1] - sample_id = snapshot_name.split("--")[2] + if not os.path.exists(params.real_dispatch_file): + with open(params.snapshot_summary, "a") as handle: + tumour_id = wildcards.tumour_id + seq_type = wildcards.seq_type + genome_build = wildcards.genome_build + gene = wildcards.gene + chromosome = wildcards.chromosome + start_position = wildcards.start_position + + outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, start_position]) + handle.write(outline + "\n") + finished = open(output.mock_dispatch_batch_script, "w") + finished.close() + + def 
_estimate_batches(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_outputs = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch + + this_sample = op.filter_samples(CFG["runs"], tumour_seq_type=wildcards.seq_type, tumour_genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_id) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + maf = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type=wildcards.seq_type, + genome_build=wildcards.genome_build, + tumour_id=wildcards.tumour_id, + normal_sample_id=normal_sample_id, + pair_status=pair_status + ) - potential_dispatch_file = params.dispatch_dir + f"{seq_type}--{genome_build}/{snapshot_name}{params.suffix}.batch" - if not os.path.exists(potential_dispatch_file): - gene = snapshot_name.split("--")[1] - sample_dictionary[sample_id].append(gene) + if os.path.exists(maf[0]): + maf_table = pd.read_table(maf[0], sep="\t", comment="#") - snapshot_summary = pd.DataFrame(list(sample_dictionary.items()), columns=["sample_id","snapshots"]) - snapshot_summary["snapshots"] = snapshot_summary["snapshots"].apply(lambda x: len(x)) + return expand( + str(rules._igv_estimate_snapshots.output.mock_dispatch_batch_script), + zip, + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"], + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_id = maf_table["Tumor_Sample_Barcode"] + ) + else: + return [] - snapshot_summary.loc["Total"] = snapshot_summary.sum() - snapshot_summary["sample_id"]["Total"] = "Total" - snapshot_summary.to_csv(output.summary, sep="\t", index=False) + rule _igv_snapshot_estimate_finished: + input: + _estimate_batches + output: + mock_merge_finished = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/completed/{seq_type}--{genome_build}/{tumour_id}.completed") + shell: + "touch {output.mock_merge_finished}" # Generates the target sentinels for each run, which generate the symlinks -if CFG["test_run"] is False: +if CFG["estimate_only"] is False: rule _igv_all: input: expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) -if CFG["test_run"] is True: +if CFG["estimate_only"] is True: rule _igv_all: input: - str(rules._igv_estimate_snapshots.output.summary) + expand(str(rules._igv_snapshot_estimate_finished.output.mock_merge_finished), zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) ##### CLEANUP ##### From e76c9f633f232758459266d09ee8822a4c46dd8e Mon Sep 17 00:00:00 2001 From: mannycruz Date: Wed, 16 Aug 2023 12:49:00 -0700 Subject: [PATCH 079/132] Add functions for estimating snapshots and finding failed snaps --- modules/igv/1.0/config/default.yaml | 1 + modules/igv/1.0/igv.smk | 102 ++++++++++++++++++++++++++-- 2 files changed, 97 insertions(+), 6 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 7de7feca..016cd4fe 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -10,6 +10,7 @@ lcr-modules: regions_build: "__UPDATE__" # Genome build of regions file, which will be lifted over as needed to filter MAFs on opposite genome builds estimate_only: True # Stop after MAF 
filtering step to estimate total number of snapshots + identify_failed_snaps: False options: filter_maf: diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 5ff57ed7..101089a3 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -67,8 +67,9 @@ localrules: _igv_run, _igv_symlink_snapshot, _igv_check_snapshots, - _igv_mock_merge_batches, - _igv_estimate_snapshots + _igv_touch_summary, + _igv_estimate_snapshots, + _igv_snapshot_estimate_finished ##### FUNCTIONS ##### @@ -255,7 +256,7 @@ checkpoint _igv_create_batch_script_per_variant: script: config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] -if CFG["estimate_only"] == False: +if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id rule _igv_batches_to_merge: input: @@ -411,6 +412,8 @@ if CFG["estimate_only"] == False: success = True corrupt_checks = 0 is_corrupt = True + height = None + width = None while corrupt_checks < 2 and is_corrupt == True: corrupt_checks += 1 try: @@ -521,7 +524,7 @@ if CFG["estimate_only"] == False: shell: "touch {output.snapshots}" -if CFG["estimate_only"] is True: +if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: rule _igv_touch_summary: input: @@ -602,17 +605,104 @@ if CFG["estimate_only"] is True: shell: "touch {output.mock_merge_finished}" +if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: + rule _igv_touch_failed: + input: + finished_batch_scripts = expand(str(rules._igv_create_batch_script_per_variant.output.variant_batch), zip, seq_type = CFG["runs"]["tumour_seq_type"], tumour_id=CFG["runs"]["tumour_sample_id"], genome_build=CFG["runs"]["tumour_genome_build"]) + output: + failed_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.txt", + failed_ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/touch_failed.completed") + run: + header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","snapshot_path"]) + with open(output.failed_summary, "w") as handle: + handle.write(header + "\n") + ready = open(output.failed_ready, "w") + ready.close() + + rule _igv_find_failed: + input: + single_batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch", + failed_file_ready = str(rules._igv_touch_failed.output.failed_ready) + output: + mock_dispatch_batch_script = temp(CFG["dirs"]["batch_scripts"] + "mock_dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch") + params: + failed_summary = str(rules._igv_touch_failed.output.failed_summary), + theoretical_snap = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png", + theoretical_symlink = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + run: + if not os.path.exists(params.theoretical_symlink) and os.path.exists(params.theoretical_snap): + with open(params.failed_summary, "a") as handle: + tumour_id = wildcards.tumour_id, + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + gene = wildcards.gene, + chromosome = wildcards.chromosome, + position = wildcards.start_position, + 
snapshot_path = params.theoretical_snap + + outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, position, snapshot_path]) + handle.write(outline + "\n") + finished = open(output.mock_dispatch_batch_script, "w") + finished.close() + + def _find_failed(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_outputs = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch + + this_sample = op.filter_samples(CFG["runs"], tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build, tumour_sample_id = wildcards.tumour_id) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + maf = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type=wildcards.seq_type, + genome_build=wildcards.genome_build, + tumour_id=wildcards.tumour_id, + normal_sample_id=normal_sample_id, + pair_status=pair_status + ) + + if os.path.exists(maf[0]): + maf_table = pd.read_table(maf[0], sep="\t", comment="#") + + return expand( + str(rules._igv_find_failed.output.mock_dispatch_batch_script), + zip, + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"], + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_id = maf_table["Tumor_Sample_Barcode"] + ) + else: + return [] + + rule _igv_failed_estimate_finished: + input: + _find_failed + output: + mock_merge_finished = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/completed/{seq_type}--{genome_build}/{tumour_id}.failed.completed") + shell: + "touch {output.mock_merge_finished}" + # Generates the target sentinels for each run, which generate the symlinks -if CFG["estimate_only"] is False: +if CFG["estimate_only"] is False and CFG["identify_failed_snaps"] is False: rule _igv_all: input: expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) -if CFG["estimate_only"] is True: +if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: rule _igv_all: input: expand(str(rules._igv_snapshot_estimate_finished.output.mock_merge_finished), zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) +if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: + rule _igv_all: + input: + expand(str(rules._igv_failed_estimate_finished.output.mock_merge_finished), zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + ##### CLEANUP ##### From 801f30853ed6e28a5e079cc3942df85fa4070906 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 31 Aug 2023 23:53:41 -0700 Subject: [PATCH 080/132] Use filter maf rule outputs to estimate snapshots (faster) --- modules/igv/1.0/igv.smk | 254 ++++++++++++++++------------------------ 1 file changed, 104 insertions(+), 150 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 101089a3..cd779135 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -229,34 +229,34 @@ def _get_maf(wildcards): ) ) -# Create batch scripts for each variant -checkpoint _igv_create_batch_script_per_variant: - input: - filter_maf = _get_maf, - bam_file = str(rules._igv_symlink_bam.output.bam), - 
bai_file = str(rules._igv_symlink_bai.output.bai), - regions_lifted = str(rules._igv_liftover_regions.output.regions), - regions_formatted = str(rules._igv_format_regions_file.output.regions) - output: - variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".batch" - params: - batch_dir = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], - snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], - genome_build = lambda w: w.genome_build, - seq_type = lambda w: w.seq_type, - padding = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["padding"], - igv_options = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"], - max_height = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["max_height"], - suffix = SUFFIX, - view_pairs = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["view_as_pairs"], - sleep_timer = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["sleep_timer"] - log: - stdout = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stdout.log", - stderr = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stderr.log" - script: - config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] - if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: + # Create batch scripts for each variant + checkpoint _igv_create_batch_script_per_variant: + input: + filter_maf = _get_maf, + bam_file = str(rules._igv_symlink_bam.output.bam), + bai_file = str(rules._igv_symlink_bai.output.bai), + regions_lifted = str(rules._igv_liftover_regions.output.regions), + regions_formatted = str(rules._igv_format_regions_file.output.regions) + output: + variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".batch" + params: + batch_dir = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], + snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], + genome_build = lambda w: w.genome_build, + seq_type = lambda w: w.seq_type, + padding = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["padding"], + igv_options = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"], + max_height = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["max_height"], + suffix = SUFFIX, + view_pairs = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["view_as_pairs"], + sleep_timer = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["sleep_timer"] + log: + stdout = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stdout.log", + stderr = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stderr.log" + script: + config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] + # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id rule _igv_batches_to_merge: input: @@ -528,7 +528,7 @@ if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: rule _igv_touch_summary: input: - finished_batch_scripts = expand(str(rules._igv_create_batch_script_per_variant.output.variant_batch), zip, seq_type = 
CFG["runs"]["tumour_seq_type"], tumour_id=CFG["runs"]["tumour_sample_id"], genome_build=CFG["runs"]["tumour_genome_build"]) + finished_filtered_mafs = expand(str(rules._igv_filter_maf.output.maf), zip, seq_type = CFG["runs"]["tumour_seq_type"], tumour_id=CFG["runs"]["tumour_sample_id"], genome_build=CFG["runs"]["tumour_genome_build"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"]) output: snapshot_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/snapshot_summary.txt", summary_ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/touch_summary.completed") @@ -541,74 +541,40 @@ if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: rule _igv_estimate_snapshots: input: - single_batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch", + maf = str(rules._igv_filter_maf.output.maf), summary_file_ready = str(rules._igv_touch_summary.output.summary_ready) output: - mock_dispatch_batch_script = temp(CFG["dirs"]["batch_scripts"] + "mock_dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch") + estimate_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_batch_scripts/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") params: - snapshot_summary = str(rules._igv_touch_summary.output.snapshot_summary), - real_dispatch_file = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + snapshot_summary = str(rules._igv_touch_summary.output.snapshot_summary) + threads: (workflow.cores) run: - if not os.path.exists(params.real_dispatch_file): - with open(params.snapshot_summary, "a") as handle: - tumour_id = wildcards.tumour_id - seq_type = wildcards.seq_type - genome_build = wildcards.genome_build - gene = wildcards.gene - chromosome = wildcards.chromosome - start_position = wildcards.start_position - - outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, start_position]) - handle.write(outline + "\n") - finished = open(output.mock_dispatch_batch_script, "w") + CFG = config["lcr-modules"]["igv"] + maf_table = pd.read_table(input.maf, sep="\t", comment="#") + + seq_type = wildcards.seq_type + genome_build = wildcards.genome_build + tumour_id = wildcards.tumour_id + + for index, row in maf_table.iterrows(): + gene = row["Hugo_Symbol"] + chromosome = row["chr_std"] + position = str(row["Start_Position"]) + + dispatch_path = CFG["dirs"]["batch_scripts"] + f"dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + + if not os.path.exists(dispatch_path): + with open(params.snapshot_summary, "a") as handle: + outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, position]) + handle.write(outline + "\n") + + finished = open(output.estimate_finished, "w") finished.close() - def _estimate_batches(wildcards): - CFG = config["lcr-modules"]["igv"] - checkpoint_outputs = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch - - this_sample = op.filter_samples(CFG["runs"], tumour_seq_type=wildcards.seq_type, tumour_genome_build=wildcards.genome_build, tumour_sample_id=wildcards.tumour_id) - normal_sample_id = this_sample["normal_sample_id"] - pair_status = this_sample["pair_status"] - - maf = expand( - 
str(rules._igv_filter_maf.output.maf), - zip, - seq_type=wildcards.seq_type, - genome_build=wildcards.genome_build, - tumour_id=wildcards.tumour_id, - normal_sample_id=normal_sample_id, - pair_status=pair_status - ) - - if os.path.exists(maf[0]): - maf_table = pd.read_table(maf[0], sep="\t", comment="#") - - return expand( - str(rules._igv_estimate_snapshots.output.mock_dispatch_batch_script), - zip, - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"], - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"] - ) - else: - return [] - - rule _igv_snapshot_estimate_finished: - input: - _estimate_batches - output: - mock_merge_finished = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/completed/{seq_type}--{genome_build}/{tumour_id}.completed") - shell: - "touch {output.mock_merge_finished}" - if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: rule _igv_touch_failed: input: - finished_batch_scripts = expand(str(rules._igv_create_batch_script_per_variant.output.variant_batch), zip, seq_type = CFG["runs"]["tumour_seq_type"], tumour_id=CFG["runs"]["tumour_sample_id"], genome_build=CFG["runs"]["tumour_genome_build"]) + filter_maf = expand(str(rules._igv_filter_maf.output.maf), zip, seq_type = CFG["runs"]["tumour_seq_type"], tumour_id = CFG["runs"]["tumour_sample_id"], genome_build=CFG["runs"]["tumour_genome_build"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"]) output: failed_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.txt", failed_ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/touch_failed.completed") @@ -621,72 +587,60 @@ if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: rule _igv_find_failed: input: - single_batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch", - failed_file_ready = str(rules._igv_touch_failed.output.failed_ready) + maf = str(rules._igv_filter_maf.output.maf), + failed_ready = str(rules._igv_touch_failed.output.failed_ready) output: - mock_dispatch_batch_script = temp(CFG["dirs"]["batch_scripts"] + "mock_dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch") + failed_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_failed_scripts/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") params: - failed_summary = str(rules._igv_touch_failed.output.failed_summary), - theoretical_snap = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png", - theoretical_symlink = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + failed_summary = str(rules._igv_touch_failed.output.failed_summary) + threads: (workflow.cores) run: - if not os.path.exists(params.theoretical_symlink) and os.path.exists(params.theoretical_snap): - with open(params.failed_summary, "a") as handle: - tumour_id = wildcards.tumour_id, - seq_type = wildcards.seq_type, - genome_build = wildcards.genome_build, - gene = wildcards.gene, - chromosome = wildcards.chromosome, - position = wildcards.start_position, - snapshot_path = params.theoretical_snap - - outline = 
"\t".join([tumour_id, seq_type, genome_build, gene, chromosome, position, snapshot_path]) - handle.write(outline + "\n") - finished = open(output.mock_dispatch_batch_script, "w") + import subprocess + CFG = config["lcr-modules"]["igv"] + maf_table = pd.read_table(input.maf, sep="\t", comment="#") + + seq_type = wildcards.seq_type + genome_build = wildcards.genome_build + tumour_id = wildcards.tumour_id + + for index, row in maf_table.iterrows(): + gene = row["Hugo_Symbol"] + chromosome = row["chr_std"] + position = str(row["Start_Position"]) + + snapshot = CFG["dirs"]["snapshots"] + f"{seq_type}--{genome_build}/{chromosome}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".png" + snapshot_symlink = CFG["dirs"]["outputs"] + f"{seq_type}--{genome_build}/{chromosome}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".png" + + success = True + if not os.path.exists(snapshot): + print(f"{snapshot} doesn't exist") + if os.path.exists(snapshot): + if not os.path.exists(snapshot_symlink): + success = False + if success is True: + try: + height = str(subprocess.check_output(f"identify -format '%h' {snapshot}", shell=True)).split("'")[1].split("\\n")[0] + width = str(subprocess.check_output(f"identify -format '%w' {snapshot}", shell=True)).split("'")[1].split("\\n")[0] + print(f"Height is {height} for {tumour_id} {gene} {chromosome}:{position}") + except: + success = False + if success is True and height in ["547","559"]: + kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] + print(f"Kurtosis is {kurtosis} for {tumour_id} {gene} {chromosome}:{position}") + blank_kurtosis = {"547": 18.5, "559": 18.2} + blank_skew = -4 + if kurtosis > blank_kurtosis[height] and skewness < blank_skew: + success = False + print(f"Success value is {success} for {tumour_id} {gene} {chromosome}:{position}") + if success is False: + with open(params.failed_summary, "a") as handle: + print("Writing line to failed file") + outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, position, snapshot]) + handle.write(outline + "\n") + + finished = open(output.failed_finished, "w") finished.close() - def _find_failed(wildcards): - CFG = config["lcr-modules"]["igv"] - checkpoint_outputs = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch - - this_sample = op.filter_samples(CFG["runs"], tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build, tumour_sample_id = wildcards.tumour_id) - normal_sample_id = this_sample["normal_sample_id"] - pair_status = this_sample["pair_status"] - - maf = expand( - str(rules._igv_filter_maf.output.maf), - zip, - seq_type=wildcards.seq_type, - genome_build=wildcards.genome_build, - tumour_id=wildcards.tumour_id, - normal_sample_id=normal_sample_id, - pair_status=pair_status - ) - - if os.path.exists(maf[0]): - maf_table = pd.read_table(maf[0], sep="\t", comment="#") - - return expand( - str(rules._igv_find_failed.output.mock_dispatch_batch_script), - zip, - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"], - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"] - ) - else: - return [] - - rule _igv_failed_estimate_finished: - input: - _find_failed - output: - mock_merge_finished = temp(CFG["dirs"]["outputs"] + 
"snapshot_estimates/completed/{seq_type}--{genome_build}/{tumour_id}.failed.completed") - shell: - "touch {output.mock_merge_finished}" - # Generates the target sentinels for each run, which generate the symlinks if CFG["estimate_only"] is False and CFG["identify_failed_snaps"] is False: rule _igv_all: @@ -696,12 +650,12 @@ if CFG["estimate_only"] is False and CFG["identify_failed_snaps"] is False: if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: rule _igv_all: input: - expand(str(rules._igv_snapshot_estimate_finished.output.mock_merge_finished), zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + expand(str(rules._igv_estimate_snapshots.output.estimate_finished), zip, seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], tumour_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"]) if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: rule _igv_all: input: - expand(str(rules._igv_failed_estimate_finished.output.mock_merge_finished), zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + expand(str(rules._igv_find_failed.output.failed_finished), zip, seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], tumour_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"]) ##### CLEANUP ##### From 5f971b797eed6930873ee6eb1c7f2678650758fe Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 6 Sep 2023 23:28:36 -0700 Subject: [PATCH 081/132] Add ability to load multiple bam files and image presets into batch scripts --- .../etc/generate_batch_script_per_variant.py | 107 ++++++++++++------ 1 file changed, 70 insertions(+), 37 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 8df4f3e9..f9e74712 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -21,9 +21,18 @@ def main(): sys.stdout = stdout try: - input_maf = open(snakemake.input[0], "r") - input_bam = snakemake.input[1] - input_bai = snakemake.input[2] + # Handle matched samples with matched normal BAMs + input_bams = snakemake.input[0:len(snakemake.input) - 3] + input_bam = input_bams[:int(len(input_bams) / 2)] + input_bai = input_bams[int(len(input_bams)/2):] + + inputs = snakemake.input[-3:len(snakemake.input)] + batch_options = snakemake.params[4] + + # Print run info for logging + print(f"Setting up batch scripts using the following inputs:\nBam files:\t{input_bam}\nBai files:\t{input_bai}\nParameters:\t{snakemake.params[6]}\nBatch options:\t{batch_options}") + + input_maf = open(inputs[0], "r") # Skip if no variants in outfile line_count = 0 @@ -43,8 +52,7 @@ def main(): # Read MAF file and create dataframe regions = get_regions_df( input_maf, - seq_type=snakemake.params[2], - padding=snakemake.params[4] + padding=batch_options["padding"] ) input_maf.close() @@ -58,11 +66,11 @@ def main(): snapshot_dir = snakemake.params[1], genome_build = snakemake.params[2], seq_type = snakemake.params[3], - igv_options = snakemake.params[5], - max_height = snakemake.params[6], - suffix = snakemake.params[7], 
- as_pairs = snakemake.params[8], - sleep_timer = snakemake.params[9] + suffix = snakemake.params[5], + igv_presets = snakemake.params[6], + igv_options = batch_options["igv_options"], + max_height = batch_options["max_height"], + sleep_timer = batch_options["sleep_timer"] ) touch_output = open(snakemake.output[0], "w") @@ -72,7 +80,7 @@ def main(): logging.error(e, exc_info=1) raise -def get_regions_df(input_maf, seq_type, padding): +def get_regions_df(input_maf, padding): # Read MAF as dataframe maf = pd.read_table(input_maf, comment="#", sep="\t") @@ -91,7 +99,8 @@ def get_regions_df(input_maf, seq_type, padding): "region_name": maf.Hugo_Symbol, "sample_id": maf.Tumor_Sample_Barcode, "snapshot_coordinates": snapshot_coordinates, - "padding": padding + "padding": padding, + "pair_status": maf.pair_status } ) @@ -104,50 +113,67 @@ def output_lines(lines, batch_output): output.write(text) output.close() -def generate_igv_batch_per_row(coordinates, snapshot_filename): +def generate_igv_batch_per_row(sleep_interval, presets, options, coordinates, directory, child_dir, seq_build, chrom_directory, snapshot_filename): lines = [] - lines.append(f"goto {coordinates}") - lines.append("sort") - lines.append("collapse") - lines.append(f"snapshot {snapshot_filename}") + paired_lines = [] - return lines + lines.append(f"goto {coordinates}") -def generate_igv_batch_header(bam, index, max_height, genome_build, igv_options, sleep_timer, as_pairs): + for preset in presets: + snapshot_regions_dir = os.path.join(directory, seq_build, child_dir, preset, chrom_directory, "") + if preset == "paired_reads": + # Low sleep interval to speed up process + paired_lines.append("setSleepInterval 1") + paired_lines.append(f"snapshotDirectory {snapshot_regions_dir}") + for igv_option in options["paired_reads"]: + paired_lines.append(igv_option) + paired_lines.append(f"setSleepInterval {sleep_interval}") + paired_lines.append("collapse") + paired_lines.append(f"snapshot {snapshot_filename}") + else: + # Low sleep interval to speed up process + lines.append("setSleepInterval 1") + lines.append(f"snapshotDirectory {snapshot_regions_dir}") + for igv_option in options[preset]: + lines.append(igv_option) + lines.append(f"setSleepInterval {sleep_interval}") + lines.append("collapse") + lines.append(f"snapshot {snapshot_filename}") + + # Paired lines go last because `View as Pairs` setting remains on until IGV session ends + variant_lines = lines + paired_lines + + return variant_lines + +def generate_igv_batch_header(bam, index, max_height, genome_build): lines = [] genome_build = genome_build.replace("grch37","hg19") - bam_file = bam - bai_file = index - lines.append(f"load {bam_file} index={bai_file}") + assert len(bam) == len(index), "Error while generating batch script: number of .bam files and .bai files are not equal" + + for i in range(0,len(bam)): + lines.append(f"load {bam[i]} index={index[i]}") lines.append(f"maxPanelHeight {max_height}") lines.append(f"genome {genome_build}") - - if igv_options is not None: - for option in igv_options: - lines.append(option) - - lines.append(f"setSleepInterval {sleep_timer}") - - if as_pairs: - lines.append("viewaspairs") return lines -def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, igv_options, max_height, suffix, as_pairs=False, sleep_timer=2000): +def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, suffix, igv_presets, igv_options, max_height, sleep_timer=2000): for _, row in 
regions.iterrows(): all_lines = [] - header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build, igv_options=igv_options, sleep_timer=sleep_timer, as_pairs=as_pairs) + header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build) all_lines.extend(header) - dir_chrom = row.chromosome - seq_type_build = f"{seq_type}--{genome_build}" + if row.pair_status == "matched": + child_directory = "tumour_normal_pair" + elif row.pair_status == "unmatched": + child_directory = "tumour_only" - snapshot_regions_dir = os.path.join(snapshot_dir, seq_type_build, dir_chrom, "") - all_lines.append(f"snapshotDirectory {snapshot_regions_dir}") + seq_type_build = f"{seq_type}--{genome_build}" + chrom_dir = row.chromosome filename = [] filename.append(row.region), @@ -158,7 +184,14 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui filename = "--".join(filename) + suffix + ".png" lines = generate_igv_batch_per_row( + sleep_interval = sleep_timer, + presets = igv_presets, + options = igv_options, coordinates = row.snapshot_coordinates, + directory = snapshot_dir, + child_dir = child_directory, + seq_build = seq_type_build, + chrom_directory = chrom_dir, snapshot_filename = filename ) From d35faaa2bb861585f760a3c49e779a7424eafd08 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 6 Sep 2023 23:29:42 -0700 Subject: [PATCH 082/132] Update directory syntax for tumour normal pairs and igv preferences presets --- modules/igv/1.0/igv.smk | 111 +++++++++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 24 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index cd779135..08dd238f 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -47,9 +47,7 @@ CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin( CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["hg38"]), "hg38", inplace=True) # Define output file suffix based on config parameters -SUFFIX = ".pad" + str(CFG["options"]["generate_batch_script"]["padding"]) -if CFG["options"]["generate_batch_script"]["view_as_pairs"]: - SUFFIX = SUFFIX + ".pairs" +SUFFIX = ".pad" + str(CFG["options"]["generate_batch_script"]["padding"]) # Define rules to be run locally when using a compute cluster localrules: @@ -83,6 +81,14 @@ def get_bai(wildcards): metadata = config["lcr-modules"]["igv"]["samples"] return expand("data/{{seq_type}}_bams/{{tumour_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.tumour_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) +def get_normal_bam(wildcards): + metadata = config["lcr-modules"]["igv"]["samples"] + return expand("data/{{seq_type}}_bams/{{normal_sample_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.normal_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + +def get_normal_bai(wildcards): + metadata = config["lcr-modules"]["igv"]["samples"] + return expand("data/{{seq_type}}_bams/{{normal_sample_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.normal_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + def get_maf(wildcards): unix_group = config["unix_group"] return expand(config["lcr-modules"]["igv"]["inputs"]["maf"], allow_missing=True, unix_group=unix_group) @@ -121,6 +127,22 @@ 
rule _igv_symlink_bai: run: op.absolute_symlink(input.bai, output.bai) +rule _igv_symlink_normal_bam: + input: + bam = get_normal_bam + output: + bam = CFG["dirs"]["inputs"] + "normal_bams/{seq_type}/{normal_sample_id}.bam" + run: + op.absolute_symlink(input.bam, output.bam) + +rule _igv_symlink_normal_bai: + input: + bai = get_normal_bai + output: + bai = CFG["dirs"]["inputs"] + "normal_bams/{seq_type}/{normal_sample_id}.bam.bai" + run: + op.absolute_symlink(input.bai, output.bai) + rule _igv_symlink_maf: input: maf = get_maf @@ -229,13 +251,46 @@ def _get_maf(wildcards): ) ) +def _get_bam_files(wildcards): + CFG = config["lcr-modules"]["igv"] + + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + if pair_status.item() == "matched": + tumour_bam_file = expand(str(rules._igv_symlink_bam.output.bam), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id)[0] + normal_bam_file = expand(str(rules._igv_symlink_normal_bam.output.bam), zip, seq_type=wildcards.seq_type, normal_sample_id=normal_sample_id)[0] + return([tumour_bam_file, normal_bam_file]) + + if pair_status.item() == "unmatched": + return ( + expand(str(rules._igv_symlink_bam.output.bam), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id) + ) + +def _get_bai_files(wildcards): + CFG = config["lcr-modules"]["igv"] + + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + if pair_status.item() == "matched": + tumour_bai_file = expand(str(rules._igv_symlink_bai.output.bai), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id)[0] + normal_bai_file = expand(str(rules._igv_symlink_normal_bai.output.bai), zip, seq_type=wildcards.seq_type, normal_sample_id=normal_sample_id)[0] + return([tumour_bai_file, normal_bai_file]) + if pair_status.item() == "unmatched": + return( + expand(str(rules._igv_symlink_bai.output.bai), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id) + ) + if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: # Create batch scripts for each variant checkpoint _igv_create_batch_script_per_variant: input: + bam_file = _get_bam_files, + bai_file = _get_bai_files, filter_maf = _get_maf, - bam_file = str(rules._igv_symlink_bam.output.bam), - bai_file = str(rules._igv_symlink_bai.output.bai), regions_lifted = str(rules._igv_liftover_regions.output.regions), regions_formatted = str(rules._igv_format_regions_file.output.regions) output: @@ -245,12 +300,9 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], genome_build = lambda w: w.genome_build, seq_type = lambda w: w.seq_type, - padding = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["padding"], - igv_options = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"], - max_height = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["max_height"], + batch_options = config["lcr-modules"]["igv"]["options"]["generate_batch_script"], suffix = SUFFIX, - view_pairs = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["view_as_pairs"], - sleep_timer = config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["sleep_timer"] + 
igv_presets = config["lcr-modules"]["igv"]["options"]["igv_presets"] log: stdout = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stdout.log", stderr = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stderr.log" @@ -391,9 +443,9 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: rule _igv_quality_control: input: igv = str(rules._igv_run.output.complete), - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" output: - snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".qc") + snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".qc") params: batch_script = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"single_batch_scripts/{w.seq_type}--{w.genome_build}/{w.chromosome}:{w.start_position}--{w.gene}--{w.tumour_id}" + SUFFIX + ".batch", merged_batch = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"merged_batch_scripts/{w.seq_type}--{w.genome_build}/{w.tumour_id}" + SUFFIX + ".batch", @@ -404,8 +456,8 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: resources: **CFG["resources"]["_igv_quality_control"] log: - stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", - stderr = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stderr.log" + stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", + stderr = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stderr.log" threads: (workflow.cores) run: import subprocess @@ -469,10 +521,10 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: - snapshot = ancient(CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png"), + snapshot = ancient(CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png"), snapshot_qc = str(rules._igv_quality_control.output.snapshot_qc) output: - snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" 
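# Illustrative note on the new wildcards (hypothetical sample, gene and locus): the
# {pair_status_directory} and {preset_directory} levels mirror the subfolders written by the
# batch-script generator, so a final snapshot symlinked under '99-outputs/' looks like
#   genome--grch37/tumour_normal_pair/default/chr3/chr3:187462000--BCL6--TUMOUR_A.pad100.png
# where tumour_normal_pair/tumour_only reflects pair_status, default/paired_reads reflects the
# IGV preset, and with the default padding of 100 the suffix is ".pad100".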
threads: CFG["threads"]["_igv_symlink_snapshot"] run: @@ -488,6 +540,9 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] + # Assign pair_status_directories based on pair_status value + PAIR_STATUS_DICT = {"matched": "tumour_normal_pair", "unmatched": "tumour_only"} + maf = expand( str(rules._igv_filter_maf.output.maf), zip, @@ -502,14 +557,22 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: maf_table = pd.read_table(maf[0], comment="#", sep="\t") return expand( - str(rules._igv_symlink_snapshot.output.snapshot), - zip, - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"], - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"] + expand( + expand( + str(rules._igv_symlink_snapshot.output.snapshot), + zip, + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"], + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_id = maf_table["Tumor_Sample_Barcode"], + allow_missing = True + ), + pair_status_directory = PAIR_STATUS_DICT[pair_status.item()], + allow_missing = True + ), + preset_directory = CFG["options"]["igv_presets"] ) else: return [] From 91c0355069ec2ccef01dc58e5561f5ae2f990178 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 6 Sep 2023 23:30:20 -0700 Subject: [PATCH 083/132] Add ability to define igv presets of different IGV parameters --- modules/igv/1.0/config/default.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 016cd4fe..c51e6802 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -9,7 +9,7 @@ lcr-modules: regions_format: "__UPDATE__" # Available options are "bed", "maf", "oncodriveclustl", "hotmaps" or "mutation_id" regions_build: "__UPDATE__" # Genome build of regions file, which will be lifted over as needed to filter MAFs on opposite genome builds - estimate_only: True # Stop after MAF filtering step to estimate total number of snapshots + estimate_only: False # Stop after MAF filtering step to estimate total number of snapshots identify_failed_snaps: False options: @@ -31,12 +31,16 @@ lcr-modules: grch37: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh37-lite/Sequence/WholeGenomeFasta/genome.fa" hg38: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh38/Sequence/WholeGenomeFasta/genome.fa" + igv_presets: ["default", "paired_reads"] + generate_batch_script: padding: 100 # Base pairs upstream and downstream of variant position max_height: 1000 sleep_timer: 2000 # Batch scripts with more options may require longer sleep intervals # Available igv options: https://github.com/igvteam/igv/wiki/Batch-commands - igv_options: ["preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.DOWNSAMPLE_READS FALSE"] + igv_options: + default: ["preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort"] + paired_reads: ["viewaspairs", "preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", 
"preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort QUALITY"] view_as_pairs: False # Toggle pairwise orientation in IGV xvfb_parameters: From 1665b7fd77a031441556c8be2914379d32fe6d43 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 7 Sep 2023 16:02:48 -0700 Subject: [PATCH 084/132] Add preset and pair directory wildcards to batch scripts to track which variant/present combos have been run through IGV --- .../etc/generate_batch_script_per_variant.py | 126 +++++++-------- modules/igv/1.0/igv.smk | 144 +++++++++++------- 2 files changed, 142 insertions(+), 128 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index f9e74712..9fa9b9f7 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -113,37 +113,23 @@ def output_lines(lines, batch_output): output.write(text) output.close() -def generate_igv_batch_per_row(sleep_interval, presets, options, coordinates, directory, child_dir, seq_build, chrom_directory, snapshot_filename): +def generate_igv_batch_per_row(sleep_interval, preset, options, coordinates, directory, child_dir, seq_build, chrom_directory, snapshot_filename): lines = [] - paired_lines = [] lines.append(f"goto {coordinates}") - for preset in presets: - snapshot_regions_dir = os.path.join(directory, seq_build, child_dir, preset, chrom_directory, "") - if preset == "paired_reads": - # Low sleep interval to speed up process - paired_lines.append("setSleepInterval 1") - paired_lines.append(f"snapshotDirectory {snapshot_regions_dir}") - for igv_option in options["paired_reads"]: - paired_lines.append(igv_option) - paired_lines.append(f"setSleepInterval {sleep_interval}") - paired_lines.append("collapse") - paired_lines.append(f"snapshot {snapshot_filename}") - else: - # Low sleep interval to speed up process - lines.append("setSleepInterval 1") - lines.append(f"snapshotDirectory {snapshot_regions_dir}") - for igv_option in options[preset]: - lines.append(igv_option) - lines.append(f"setSleepInterval {sleep_interval}") - lines.append("collapse") - lines.append(f"snapshot {snapshot_filename}") - - # Paired lines go last because `View as Pairs` setting remains on until IGV session ends - variant_lines = lines + paired_lines - - return variant_lines + snapshot_regions_dir = os.path.join(directory, seq_build, child_dir, preset, chrom_directory, "") + + # Low sleep interval to speed up process + lines.append("setSleepInterval 1") + lines.append(f"snapshotDirectory {snapshot_regions_dir}") + for igv_option in options[preset]: + lines.append(igv_option) + lines.append(f"setSleepInterval {sleep_interval}") + lines.append("collapse") + lines.append(f"snapshot {snapshot_filename}") + + return lines def generate_igv_batch_header(bam, index, max_height, genome_build): lines = [] @@ -161,49 +147,49 @@ def generate_igv_batch_header(bam, index, max_height, genome_build): return lines def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, suffix, igv_presets, igv_options, max_height, sleep_timer=2000): - for _, row in regions.iterrows(): - all_lines = [] - - header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build) - all_lines.extend(header) - - if row.pair_status == "matched": - child_directory = "tumour_normal_pair" - elif 
row.pair_status == "unmatched": - child_directory = "tumour_only" - - seq_type_build = f"{seq_type}--{genome_build}" - chrom_dir = row.chromosome - - filename = [] - filename.append(row.region), - filename.append(row.region_name) - filename.append(row.sample_id) - - batch_filename = "--".join(filename) + suffix + ".batch" - filename = "--".join(filename) + suffix + ".png" - - lines = generate_igv_batch_per_row( - sleep_interval = sleep_timer, - presets = igv_presets, - options = igv_options, - coordinates = row.snapshot_coordinates, - directory = snapshot_dir, - child_dir = child_directory, - seq_build = seq_type_build, - chrom_directory = chrom_dir, - snapshot_filename = filename - ) - - all_lines.extend(lines) - - for subdir in [os.path.join(output_dir, "single_batch_scripts"), os.path.join(output_dir, "single_batch_scripts", seq_type_build)]: - if not os.path.exists(subdir): - os.mkdir(subdir) - - batch_file_path = os.path.join(output_dir, "single_batch_scripts", seq_type_build, batch_filename) - - output_lines(all_lines, batch_file_path) + for preset in igv_presets: + for _, row in regions.iterrows(): + all_lines = [] + + header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build) + all_lines.extend(header) + + if row.pair_status == "matched": + child_directory = "tumour_normal_pair" + elif row.pair_status == "unmatched": + child_directory = "tumour_only" + + seq_type_build = f"{seq_type}--{genome_build}" + chrom_dir = row.chromosome + + filename = [] + filename.append(row.region), + filename.append(row.region_name) + filename.append(row.sample_id) + + batch_filename = "--".join(filename) + suffix + ".batch" + filename = "--".join(filename) + suffix + ".png" + + lines = generate_igv_batch_per_row( + sleep_interval = sleep_timer, + preset = preset, + options = igv_options, + coordinates = row.snapshot_coordinates, + directory = snapshot_dir, + child_dir = child_directory, + seq_build = seq_type_build, + chrom_directory = chrom_dir, + snapshot_filename = filename + ) + + all_lines.extend(lines) + + # Make subdirectories if necessary because snakemake won't make them since rule is a checkpoint + os.makedirs(os.path.join(output_dir, "single_batch_scripts", seq_type_build, child_directory, preset), exist_ok=True) + + batch_file_path = os.path.join(output_dir, "single_batch_scripts", seq_type_build, child_directory, preset, batch_filename) + + output_lines(all_lines, batch_file_path) if __name__ == "__main__": logging.basicConfig( diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 08dd238f..fbda6904 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -49,6 +49,12 @@ CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin( # Define output file suffix based on config parameters SUFFIX = ".pad" + str(CFG["options"]["generate_batch_script"]["padding"]) +# Reorganize presets so paired_reads is last +PRESETS = CFG["options"]["igv_presets"] +if "paired_reads" in PRESETS: + paired_idx = PRESETS.index("paired_reads") + PRESETS[paired_idx], PRESETS[len(PRESETS)-1] = PRESETS[len(PRESETS)-1], PRESETS[paired_idx] + # Define rules to be run locally when using a compute cluster localrules: _igv_symlink_regions_file, @@ -231,66 +237,79 @@ rule _igv_filter_maf: script: config["lcr-modules"]["igv"]["scripts"]["filter_script"] -def _get_maf(wildcards): - CFG = config["lcr-modules"]["igv"] - - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, 
tumour_seq_type=wildcards.seq_type) - genome_build = this_sample["tumour_genome_build"] - normal_sample_id = this_sample["normal_sample_id"] - pair_status = this_sample["pair_status"] - - return ( - expand( - str(rules._igv_filter_maf.output.maf), - zip, - seq_type = wildcards.seq_type, - genome_build = genome_build, - tumour_id = wildcards.tumour_id, - normal_sample_id = normal_sample_id, - pair_status = pair_status - ) - ) +if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: -def _get_bam_files(wildcards): - CFG = config["lcr-modules"]["igv"] + # Trigger batch scripts to be created if new presets are specified + rule _igv_touch_presets: + output: + preset = CFG["dirs"]["inputs"] + "presets/{preset}.touch" + shell: + "touch {output.preset}" - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) - normal_sample_id = this_sample["normal_sample_id"] - pair_status = this_sample["pair_status"] + def _get_maf(wildcards): + CFG = config["lcr-modules"]["igv"] - if pair_status.item() == "matched": - tumour_bam_file = expand(str(rules._igv_symlink_bam.output.bam), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id)[0] - normal_bam_file = expand(str(rules._igv_symlink_normal_bam.output.bam), zip, seq_type=wildcards.seq_type, normal_sample_id=normal_sample_id)[0] - return([tumour_bam_file, normal_bam_file]) + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) + genome_build = this_sample["tumour_genome_build"] + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] - if pair_status.item() == "unmatched": return ( - expand(str(rules._igv_symlink_bam.output.bam), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id) + expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = wildcards.seq_type, + genome_build = genome_build, + tumour_id = wildcards.tumour_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status + ) ) -def _get_bai_files(wildcards): - CFG = config["lcr-modules"]["igv"] + def _get_bam_files(wildcards): + CFG = config["lcr-modules"]["igv"] - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) - normal_sample_id = this_sample["normal_sample_id"] - pair_status = this_sample["pair_status"] + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] - if pair_status.item() == "matched": - tumour_bai_file = expand(str(rules._igv_symlink_bai.output.bai), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id)[0] - normal_bai_file = expand(str(rules._igv_symlink_normal_bai.output.bai), zip, seq_type=wildcards.seq_type, normal_sample_id=normal_sample_id)[0] - return([tumour_bai_file, normal_bai_file]) - if pair_status.item() == "unmatched": - return( - expand(str(rules._igv_symlink_bai.output.bai), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id) - ) + if pair_status.item() == "matched": + tumour_bam_file = expand(str(rules._igv_symlink_bam.output.bam), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id)[0] + normal_bam_file = expand(str(rules._igv_symlink_normal_bam.output.bam), zip, seq_type=wildcards.seq_type, normal_sample_id=normal_sample_id)[0] + return([tumour_bam_file, normal_bam_file]) + + 
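# A minimal, illustrative sketch of the pandas pattern these input functions rely on:
# op.filter_samples() is expected to match exactly one row of CFG["runs"], so that
# Series.item() returns a scalar pair_status (it raises if zero or several rows match).
# The sample values below are made up for illustration only.
import pandas as pd

example_runs = pd.DataFrame({"tumour_sample_id": ["T01"], "pair_status": ["matched"]})
example_match = example_runs[example_runs["tumour_sample_id"] == "T01"]
assert example_match["pair_status"].item() == "matched"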
if pair_status.item() == "unmatched": + return ( + expand(str(rules._igv_symlink_bam.output.bam), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id) + ) + + def _get_bai_files(wildcards): + CFG = config["lcr-modules"]["igv"] + + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + if pair_status.item() == "matched": + tumour_bai_file = expand(str(rules._igv_symlink_bai.output.bai), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id)[0] + normal_bai_file = expand(str(rules._igv_symlink_normal_bai.output.bai), zip, seq_type=wildcards.seq_type, normal_sample_id=normal_sample_id)[0] + return([tumour_bai_file, normal_bai_file]) + if pair_status.item() == "unmatched": + return( + expand(str(rules._igv_symlink_bai.output.bai), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id) + ) + + def _get_presets(wildcards): + CFG = config["lcr-modules"]["igv"] + return(expand(str(rules._igv_touch_presets.output.preset), preset=CFG["options"]["igv_presets"])) -if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: # Create batch scripts for each variant checkpoint _igv_create_batch_script_per_variant: input: bam_file = _get_bam_files, bai_file = _get_bai_files, filter_maf = _get_maf, + presets = _get_presets, regions_lifted = str(rules._igv_liftover_regions.output.regions), regions_formatted = str(rules._igv_format_regions_file.output.regions) output: @@ -312,9 +331,9 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id rule _igv_batches_to_merge: input: - batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" output: - dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" params: batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch), igv_options = CFG["options"]["generate_batch_script"]["igv_options"] @@ -331,9 +350,7 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: with open(output_file, "a") as handle: for line in batch_script: if merged_lines > 0: - if line.startswith(("load","maxPanelHeight","genome","viewaspairs","setSleepInterval")): - continue - if line.startswith(tuple(params.igv_options)): + if line.startswith(("load","maxPanelHeight","genome")): continue handle.write(line) batch_script.close() @@ -351,6 +368,9 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] + # Assign pair_status_directories based on pair_status value + PAIR_STATUS_DICT = {"matched": 
"tumour_normal_pair", "unmatched": "tumour_only"} + maf = expand( str(rules._igv_filter_maf.output.maf), zip, @@ -365,15 +385,23 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: maf_table = pd.read_table(maf[0], comment="#", sep="\t") return expand( - str(rules._igv_batches_to_merge.output.dispatched_batch_script), - zip, - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"], - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"] - ) + expand( + expand( + str(rules._igv_batches_to_merge.output.dispatched_batch_script), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_id = maf_table["Tumor_Sample_Barcode"], + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"], + allow_missing = True + ), + pair_status_directory = PAIR_STATUS_DICT[pair_status.item()], + allow_missing = True + ), + preset_directory = PRESETS + ) else: return [] From 8b3c52018af5e68981ba5f0b0d2d3be6bdf19e27 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Tue, 12 Sep 2023 15:26:41 -0700 Subject: [PATCH 085/132] Create one merged batch per IGV preset, add quality control input function to fix error where not all snapshots were being checked, clean up quality control params --- .../etc/generate_batch_script_per_variant.py | 16 +- modules/igv/1.0/igv.smk | 140 +++++++++++------- 2 files changed, 96 insertions(+), 60 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 9fa9b9f7..4d4951cb 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -120,8 +120,6 @@ def generate_igv_batch_per_row(sleep_interval, preset, options, coordinates, dir snapshot_regions_dir = os.path.join(directory, seq_build, child_dir, preset, chrom_directory, "") - # Low sleep interval to speed up process - lines.append("setSleepInterval 1") lines.append(f"snapshotDirectory {snapshot_regions_dir}") for igv_option in options[preset]: lines.append(igv_option) @@ -151,6 +149,8 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui for _, row in regions.iterrows(): all_lines = [] + merged_batch_suffix = row.sample_id + suffix + ".batch" + header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build) all_lines.extend(header) @@ -185,11 +185,19 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui all_lines.extend(lines) # Make subdirectories if necessary because snakemake won't make them since rule is a checkpoint - os.makedirs(os.path.join(output_dir, "single_batch_scripts", seq_type_build, child_directory, preset), exist_ok=True) + os.makedirs(os.path.join(output_dir, "single_batch_scripts", seq_type_build, preset), exist_ok=True) - batch_file_path = os.path.join(output_dir, "single_batch_scripts", seq_type_build, child_directory, preset, batch_filename) + batch_file_path = os.path.join(output_dir, "single_batch_scripts", seq_type_build, preset, batch_filename) output_lines(all_lines, batch_file_path) + + os.makedirs(os.path.join(output_dir, "merged_batch_scripts", seq_type_build, preset), exist_ok=True) + + merged_preset_path = os.path.join(output_dir, 
"merged_batch_scripts", seq_type_build, preset, merged_batch_suffix) + + merged_preset_touch = open(merged_preset_path, "w") + merged_preset_touch.close() + if __name__ == "__main__": logging.basicConfig( diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index fbda6904..edd57b18 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -49,11 +49,8 @@ CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin( # Define output file suffix based on config parameters SUFFIX = ".pad" + str(CFG["options"]["generate_batch_script"]["padding"]) -# Reorganize presets so paired_reads is last -PRESETS = CFG["options"]["igv_presets"] -if "paired_reads" in PRESETS: - paired_idx = PRESETS.index("paired_reads") - PRESETS[paired_idx], PRESETS[len(PRESETS)-1] = PRESETS[len(PRESETS)-1], PRESETS[paired_idx] +# Assign pair_status_directory value based on pair_status value +PAIR_STATUS_DICT = {"matched": "tumour_normal_pair", "unmatched": "tumour_only"} # Define rules to be run locally when using a compute cluster localrules: @@ -239,13 +236,6 @@ rule _igv_filter_maf: if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: - # Trigger batch scripts to be created if new presets are specified - rule _igv_touch_presets: - output: - preset = CFG["dirs"]["inputs"] + "presets/{preset}.touch" - shell: - "touch {output.preset}" - def _get_maf(wildcards): CFG = config["lcr-modules"]["igv"] @@ -299,21 +289,17 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: expand(str(rules._igv_symlink_bai.output.bai), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id) ) - def _get_presets(wildcards): - CFG = config["lcr-modules"]["igv"] - return(expand(str(rules._igv_touch_presets.output.preset), preset=CFG["options"]["igv_presets"])) - # Create batch scripts for each variant checkpoint _igv_create_batch_script_per_variant: input: bam_file = _get_bam_files, bai_file = _get_bai_files, filter_maf = _get_maf, - presets = _get_presets, regions_lifted = str(rules._igv_liftover_regions.output.regions), regions_formatted = str(rules._igv_format_regions_file.output.regions) output: - variant_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".batch" + batches_finished = CFG["dirs"]["batch_scripts"] + "completed/{seq_type}--{genome_build}/{tumour_id}.finished", + variant_batch = expand(CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{{seq_type}}--{{genome_build}}/{preset_directory}/{{tumour_id}}" + SUFFIX + ".batch", preset_directory = CFG["options"]["igv_presets"], allow_missing=True), params: batch_dir = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], @@ -331,16 +317,16 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id rule _igv_batches_to_merge: input: - batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" output: - dispatched_batch_script = CFG["dirs"]["batch_scripts"] + 
"dispatched_batch_scripts/{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" params: - batch_script_file = str(rules._igv_create_batch_script_per_variant.output.variant_batch), - igv_options = CFG["options"]["generate_batch_script"]["igv_options"] + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{tumour_id}" + SUFFIX + ".batch", + igv_options = lambda w: config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"][w.preset_directory] threads: (workflow.cores / 10) run: batch_script_path = os.path.abspath(input.batch_script) - output_file = os.path.abspath(params.batch_script_file) + output_file = os.path.abspath(params.merged_batch) batch_script = open(batch_script_path, "r") @@ -350,7 +336,9 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: with open(output_file, "a") as handle: for line in batch_script: if merged_lines > 0: - if line.startswith(("load","maxPanelHeight","genome")): + if line.startswith(("load","maxPanelHeight","genome", "setSleepInterval", "collapse")): + continue + if line.startswith(tuple(params.igv_options)) and not line.startswith("sort"): continue handle.write(line) batch_script.close() @@ -368,9 +356,6 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] - # Assign pair_status_directories based on pair_status value - PAIR_STATUS_DICT = {"matched": "tumour_normal_pair", "unmatched": "tumour_only"} - maf = expand( str(rules._igv_filter_maf.output.maf), zip, @@ -385,7 +370,6 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: maf_table = pd.read_table(maf[0], comment="#", sep="\t") return expand( - expand( expand( str(rules._igv_batches_to_merge.output.dispatched_batch_script), zip, @@ -397,11 +381,8 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: genome_build = maf_table["genome_build"], allow_missing = True ), - pair_status_directory = PAIR_STATUS_DICT[pair_status.item()], - allow_missing = True - ), - preset_directory = PRESETS - ) + preset_directory = wildcards.preset_directory + ) else: return [] @@ -426,11 +407,11 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: input: igv = str(rules._igv_download_igv.output.igv_installed), batch_script = _evaluate_batches, - merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch) + finished_batches = str(rules._igv_create_batch_script_per_variant.output.batches_finished) output: - complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" + complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}--{preset_directory}.completed" params: - merged_batch = str(rules._igv_create_batch_script_per_variant.output.variant_batch), + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{tumour_id}" + SUFFIX + ".batch", igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", max_time = CFG["options"]["generate_batch_script"]["sleep_timer"], server_number = "-n " + 
CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", @@ -438,8 +419,8 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: resources: **CFG["resources"]["_igv_run"] log: - stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stdout.log", - stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{tumour_id}_igv_run.stderr.log" + stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{preset_directory}/{tumour_id}_igv_run.stdout.log", + stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{preset_directory}/{tumour_id}_igv_run.stderr.log" threads: (workflow.cores) shell: op.as_one_line(""" @@ -475,12 +456,12 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: output: snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".qc") params: - batch_script = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"single_batch_scripts/{w.seq_type}--{w.genome_build}/{w.chromosome}:{w.start_position}--{w.gene}--{w.tumour_id}" + SUFFIX + ".batch", - merged_batch = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"merged_batch_scripts/{w.seq_type}--{w.genome_build}/{w.tumour_id}" + SUFFIX + ".batch", - igv = config["lcr-modules"]["igv"]["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", - server_number = "-n " + config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_number"] if config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", - server_args = config["lcr-modules"]["igv"]["options"]["xvfb_parameters"]["server_args"], - batch_temp = lambda w: config["lcr-modules"]["igv"]["dirs"]["batch_scripts"] + f"single_batch_scripts/{w.seq_type}--{w.genome_build}/{w.chromosome}:{w.start_position}--{w.gene}--{w.tumour_id}" + SUFFIX + ".batch.temp" + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch", + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{tumour_id}" + SUFFIX + ".batch", + igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", + server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", + server_args = CFG["options"]["xvfb_parameters"]["server_args"], + batch_temp = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch.temp" resources: **CFG["resources"]["_igv_quality_control"] log: @@ -549,7 +530,7 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: - snapshot = ancient(CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png"), + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png", snapshot_qc = 
str(rules._igv_quality_control.output.snapshot_qc) output: snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" @@ -568,9 +549,6 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] - # Assign pair_status_directories based on pair_status value - PAIR_STATUS_DICT = {"matched": "tumour_normal_pair", "unmatched": "tumour_only"} - maf = expand( str(rules._igv_filter_maf.output.maf), zip, @@ -585,7 +563,6 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: maf_table = pd.read_table(maf[0], comment="#", sep="\t") return expand( - expand( expand( str(rules._igv_symlink_snapshot.output.snapshot), zip, @@ -598,20 +575,58 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: allow_missing = True ), pair_status_directory = PAIR_STATUS_DICT[pair_status.item()], - allow_missing = True - ), - preset_directory = CFG["options"]["igv_presets"] - ) + preset_directory = wildcards.preset_directory + ) else: return [] + def _quality_control(wildcards): + CFG = config["lcr-modules"]["igv"] + + checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + maf = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + tumour_id = wildcards.tumour_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status + ) + + if os.path.exists(maf[0]): + maf_table = pd.read_table(maf[0], comment="#", sep="\t") + + return expand( + expand( + str(rules._igv_quality_control.output.snapshot_qc), + zip, + seq_type = maf_table["seq_type"], + genome_build = maf_table["genome_build"], + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + tumour_id = maf_table["Tumor_Sample_Barcode"], + allow_missing = True + ), + pair_status_directory = PAIR_STATUS_DICT[pair_status.item()], + preset_directory = wildcards.preset_directory + ) + # Check that snapshots have been symlinked and quality controlled rule _igv_check_snapshots: input: igv_completed = str(rules._igv_run.output.complete), - snapshots = _symlink_snapshot + snapshots = _symlink_snapshot, + quality_control = _quality_control output: - snapshots = CFG["dirs"]["outputs"] + "completed/{seq_type}--{genome_build}--{tumour_id}.completed" + snapshots = CFG["dirs"]["outputs"] + "completed/{preset_directory}/{seq_type}--{genome_build}--{tumour_id}.completed" shell: "touch {output.snapshots}" @@ -736,7 +751,20 @@ if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: if CFG["estimate_only"] is False and CFG["identify_failed_snaps"] is False: rule _igv_all: input: - expand([str(rules._igv_run.output.complete), str(rules._igv_check_snapshots.output.snapshots)], zip, tumour_id=CFG["runs"]["tumour_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"]) + expand( + expand( + [ + str(rules._igv_run.output.complete), + str(rules._igv_check_snapshots.output.snapshots) + ], + zip, + 
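# Nested expand pattern used throughout these targets: the inner expand zips the per-run
# wildcards while allow_missing=True leaves {preset_directory} as a literal placeholder,
# and the outer expand then repeats the resulting target list once per preset listed in
# CFG["options"]["igv_presets"].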
tumour_id=CFG["runs"]["tumour_sample_id"], + seq_type=CFG["runs"]["tumour_seq_type"], + genome_build=CFG["runs"]["tumour_genome_build"], + allow_missing=True + ), + preset_directory=CFG["options"]["igv_presets"] + ) if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: rule _igv_all: From 91df3cf8ef02e85bff880c8e69d3bbadd156ff04 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Tue, 12 Sep 2023 16:44:56 -0700 Subject: [PATCH 086/132] Update snapshot estimate for igv presets --- modules/igv/1.0/igv.smk | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index edd57b18..93a9626c 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -637,11 +637,14 @@ if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: finished_filtered_mafs = expand(str(rules._igv_filter_maf.output.maf), zip, seq_type = CFG["runs"]["tumour_seq_type"], tumour_id=CFG["runs"]["tumour_sample_id"], genome_build=CFG["runs"]["tumour_genome_build"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"]) output: snapshot_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/snapshot_summary.txt", + snapshot_estimate = CFG["dirs"]["outputs"] + "snapshot_estimates/snapshot_estimate.txt", summary_ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/touch_summary.completed") run: - header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position"]) + header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position", "igv_preset"]) with open(output.snapshot_summary,"w") as handle: handle.write(header + "\n") + with open(output.snapshot_estimate, "w") as handle: + handle.write(header + "\n") ready = open(output.summary_ready, "w") ready.close() @@ -650,9 +653,10 @@ if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: maf = str(rules._igv_filter_maf.output.maf), summary_file_ready = str(rules._igv_touch_summary.output.summary_ready) output: - estimate_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_batch_scripts/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") + estimate_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") params: - snapshot_summary = str(rules._igv_touch_summary.output.snapshot_summary) + snapshot_summary = str(rules._igv_touch_summary.output.snapshot_summary), + snapshot_estimate = str(rules._igv_touch_summary.output.snapshot_estimate) threads: (workflow.cores) run: CFG = config["lcr-modules"]["igv"] @@ -661,18 +665,23 @@ if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: seq_type = wildcards.seq_type genome_build = wildcards.genome_build tumour_id = wildcards.tumour_id + preset = wildcards.preset_directory + + snapshot_summary = open(params.snapshot_summary, "a") + snapshot_estimate = open(params.snapshot_estimate, "a") for index, row in maf_table.iterrows(): gene = row["Hugo_Symbol"] chromosome = row["chr_std"] position = str(row["Start_Position"]) - dispatch_path = CFG["dirs"]["batch_scripts"] + f"dispatched_batch_scripts/{seq_type}--{genome_build}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + dispatch_path = CFG["dirs"]["batch_scripts"] + 
f"dispatched_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + + outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, position, preset]) + snapshot_summary.write(outline + "\n") if not os.path.exists(dispatch_path): - with open(params.snapshot_summary, "a") as handle: - outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, position]) - handle.write(outline + "\n") + snapshot_estimate.write(outline + "\n") finished = open(output.estimate_finished, "w") finished.close() @@ -769,7 +778,19 @@ if CFG["estimate_only"] is False and CFG["identify_failed_snaps"] is False: if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: rule _igv_all: input: - expand(str(rules._igv_estimate_snapshots.output.estimate_finished), zip, seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], tumour_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"]) + expand( + expand( + str(rules._igv_estimate_snapshots.output.estimate_finished), + zip, + seq_type=CFG["runs"]["tumour_seq_type"], + genome_build=CFG["runs"]["tumour_genome_build"], + tumour_id=CFG["runs"]["tumour_sample_id"], + normal_sample_id=CFG["runs"]["normal_sample_id"], + pair_status=CFG["runs"]["pair_status"], + allow_missing=True + ), + preset_directory=CFG["options"]["igv_presets"] + ) if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: rule _igv_all: From d7c8444d4bf8bb15c5247490b22588399cae6ee8 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Tue, 12 Sep 2023 17:01:40 -0700 Subject: [PATCH 087/132] Update failed snap estimate for presets --- modules/igv/1.0/igv.smk | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 93a9626c..d318be5c 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -694,7 +694,7 @@ if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: failed_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.txt", failed_ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/touch_failed.completed") run: - header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","snapshot_path"]) + header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","preset","snapshot_path"]) with open(output.failed_summary, "w") as handle: handle.write(header + "\n") ready = open(output.failed_ready, "w") @@ -705,7 +705,7 @@ if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: maf = str(rules._igv_filter_maf.output.maf), failed_ready = str(rules._igv_touch_failed.output.failed_ready) output: - failed_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_failed_scripts/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") + failed_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_failed_scripts/{seq_type}--{genome_build}/{preset_directory}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") params: failed_summary = str(rules._igv_touch_failed.output.failed_summary) threads: (workflow.cores) @@ -717,18 +717,20 @@ if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: seq_type = wildcards.seq_type genome_build = wildcards.genome_build tumour_id = wildcards.tumour_id + preset = 
wildcards.preset_directory + pair_status_directory = PAIR_STATUS_DICT[wildcards.pair_status] for index, row in maf_table.iterrows(): gene = row["Hugo_Symbol"] chromosome = row["chr_std"] position = str(row["Start_Position"]) - snapshot = CFG["dirs"]["snapshots"] + f"{seq_type}--{genome_build}/{chromosome}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".png" - snapshot_symlink = CFG["dirs"]["outputs"] + f"{seq_type}--{genome_build}/{chromosome}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".png" + snapshot = CFG["dirs"]["snapshots"] + f"{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".png" + snapshot_symlink = CFG["dirs"]["outputs"] + f"{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".png" success = True if not os.path.exists(snapshot): - print(f"{snapshot} doesn't exist") + print(f"{snapshot} doesn't exist yet, skipping...") if os.path.exists(snapshot): if not os.path.exists(snapshot_symlink): success = False @@ -750,7 +752,7 @@ if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: if success is False: with open(params.failed_summary, "a") as handle: print("Writing line to failed file") - outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, position, snapshot]) + outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, position, preset, snapshot]) handle.write(outline + "\n") finished = open(output.failed_finished, "w") @@ -795,7 +797,19 @@ if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: rule _igv_all: input: - expand(str(rules._igv_find_failed.output.failed_finished), zip, seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], tumour_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"]) + expand( + expand( + str(rules._igv_find_failed.output.failed_finished), + zip, + seq_type=CFG["runs"]["tumour_seq_type"], + genome_build=CFG["runs"]["tumour_genome_build"], + tumour_id=CFG["runs"]["tumour_sample_id"], + normal_sample_id=CFG["runs"]["normal_sample_id"], + pair_status=CFG["runs"]["pair_status"], + allow_missing=True + ), + preset_directory=CFG["options"]["igv_presets"] + ) ##### CLEANUP ##### From 97ff3670405d020741158bfd826b680e4a2a3a18 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 13 Sep 2023 20:55:58 -0700 Subject: [PATCH 088/132] Clean up rule _igv_touch_summary --- modules/igv/1.0/igv.smk | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index d318be5c..41cf8d1f 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -633,8 +633,6 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: rule _igv_touch_summary: - input: - finished_filtered_mafs = expand(str(rules._igv_filter_maf.output.maf), zip, seq_type = CFG["runs"]["tumour_seq_type"], tumour_id=CFG["runs"]["tumour_sample_id"], genome_build=CFG["runs"]["tumour_genome_build"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"]) output: snapshot_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/snapshot_summary.txt", snapshot_estimate 
= CFG["dirs"]["outputs"] + "snapshot_estimates/snapshot_estimate.txt", From 9573e5922298b8fb5ca027766bf24afed9ea4618 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 13 Sep 2023 20:57:01 -0700 Subject: [PATCH 089/132] Add function to variant batch generator script to handle samples with no variants to snapshot --- .../etc/generate_batch_script_per_variant.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 4d4951cb..c1da0bb1 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -42,8 +42,15 @@ def main(): break if line_count < 2: input_maf.close() - touch_output = open(snakemake.output[0],"w") - touch_output.close() + touch_outputs( + output_dir = snakemake.params[0], + seq_type = snakemake.wildcards["seq_type"], + genome_build = snakemake.wildcards["genome_build"], + presets = snakemake.params[6], + tumour_id = snakemake.wildcards["tumour_id"], + suffix = snakemake.params[5], + finished_file = snakemake.output[0] + ) exit() # Return to top of MAF @@ -80,6 +87,16 @@ def main(): logging.error(e, exc_info=1) raise +def touch_outputs(output_dir, seq_type, genome_build, presets, tumour_id, suffix, finished_file): + tumour_suffix = tumour_id + suffix + ".batch" + for preset in presets: + os.makedirs(os.path.join(output_dir, "--".join([seq_type, genome_build]), preset), exist_ok = True) + merged_batch = os.path.join(output_dir, "merged_batch_scripts", "--".join([seq_type, genome_build]), preset, tumour_suffix) + merged_file = open(merged_batch, "w") + merged_file.close() + touch_finished = open(finished_file, "w") + touch_finished.close() + def get_regions_df(input_maf, padding): # Read MAF as dataframe maf = pd.read_table(input_maf, comment="#", sep="\t") From ad03a99e2bbb77916b5708666033e969d9180469 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 13 Sep 2023 21:24:27 -0700 Subject: [PATCH 090/132] Add ability to track failed snaps while snapshots are being taken --- modules/igv/1.0/igv.smk | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 41cf8d1f..7645d36e 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -449,10 +449,22 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: fi """) + rule _igv_track_failed: + output: + failed_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.txt", + ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.completed") + run: + header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","preset","snapshot_path"]) + with open(output.failed_summary, "w") as handle: + handle.write(header + "\n") + ready = open(output.ready, "w") + ready.close() + rule _igv_quality_control: input: igv = str(rules._igv_run.output.complete), - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png", + failed_summary = 
str(rules._igv_track_failed.output.ready) output: snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".qc") params: @@ -461,7 +473,8 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", server_args = CFG["options"]["xvfb_parameters"]["server_args"], - batch_temp = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch.temp" + batch_temp = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch.temp", + failed_summary = str(rules._igv_track_failed.output.failed_summary) resources: **CFG["resources"]["_igv_quality_control"] log: @@ -526,6 +539,10 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: success = False if success == True: os.system(f'touch {output.snapshot_qc}') + if success == False: + outline = "\t".join([wildcards.tumour_id, wildcards.seq_type, wildcards.genome_build, wildcards.gene, wildcards.chromosome, wildcards.start_position, wildcards.preset_directory, input.snapshot]) + with open(params.failed_summary, "a") as handle: + handle.write(outline + "\n") # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: From 8d0af4e2cbc2140ff7a1d784c7c9903998e2bde9 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 13 Sep 2023 21:40:22 -0700 Subject: [PATCH 091/132] Clean up preset wildcard --- modules/igv/1.0/igv.smk | 60 ++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 7645d36e..8d542c0e 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -299,7 +299,7 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: regions_formatted = str(rules._igv_format_regions_file.output.regions) output: batches_finished = CFG["dirs"]["batch_scripts"] + "completed/{seq_type}--{genome_build}/{tumour_id}.finished", - variant_batch = expand(CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{{seq_type}}--{{genome_build}}/{preset_directory}/{{tumour_id}}" + SUFFIX + ".batch", preset_directory = CFG["options"]["igv_presets"], allow_missing=True), + variant_batch = expand(CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{{seq_type}}--{{genome_build}}/{preset}/{{tumour_id}}" + SUFFIX + ".batch", preset = CFG["options"]["igv_presets"], allow_missing=True), params: batch_dir = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], @@ -317,12 +317,12 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id rule _igv_batches_to_merge: input: - batch_script = CFG["dirs"]["batch_scripts"] + 
"single_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" output: - dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" params: - merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{tumour_id}" + SUFFIX + ".batch", - igv_options = lambda w: config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"][w.preset_directory] + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{tumour_id}" + SUFFIX + ".batch", + igv_options = lambda w: config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"][w.preset] threads: (workflow.cores / 10) run: batch_script_path = os.path.abspath(input.batch_script) @@ -381,7 +381,7 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: genome_build = maf_table["genome_build"], allow_missing = True ), - preset_directory = wildcards.preset_directory + preset = wildcards.preset ) else: return [] @@ -409,9 +409,9 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: batch_script = _evaluate_batches, finished_batches = str(rules._igv_create_batch_script_per_variant.output.batches_finished) output: - complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}--{preset_directory}.completed" + complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}--{preset}.completed" params: - merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{tumour_id}" + SUFFIX + ".batch", + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{tumour_id}" + SUFFIX + ".batch", igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", max_time = CFG["options"]["generate_batch_script"]["sleep_timer"], server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", @@ -419,8 +419,8 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: resources: **CFG["resources"]["_igv_run"] log: - stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{preset_directory}/{tumour_id}_igv_run.stdout.log", - stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{preset_directory}/{tumour_id}_igv_run.stderr.log" + stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{preset}/{tumour_id}_igv_run.stdout.log", + stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{preset}/{tumour_id}_igv_run.stderr.log" threads: (workflow.cores) shell: op.as_one_line(""" @@ -463,23 +463,23 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: rule _igv_quality_control: input: igv = str(rules._igv_run.output.complete), - snapshot = CFG["dirs"]["snapshots"] + 
"{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png", + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png", failed_summary = str(rules._igv_track_failed.output.ready) output: - snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".qc") + snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".qc") params: - batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch", - merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{tumour_id}" + SUFFIX + ".batch", + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch", + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{tumour_id}" + SUFFIX + ".batch", igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", server_args = CFG["options"]["xvfb_parameters"]["server_args"], - batch_temp = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch.temp", + batch_temp = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch.temp", failed_summary = str(rules._igv_track_failed.output.failed_summary) resources: **CFG["resources"]["_igv_quality_control"] log: - stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", - stderr = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stderr.log" + stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", + stderr = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stderr.log" threads: (workflow.cores) run: import subprocess @@ -540,17 +540,17 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: if success == True: os.system(f'touch {output.snapshot_qc}') if success == False: - outline = "\t".join([wildcards.tumour_id, wildcards.seq_type, wildcards.genome_build, wildcards.gene, wildcards.chromosome, wildcards.start_position, wildcards.preset_directory, input.snapshot]) + outline = "\t".join([wildcards.tumour_id, wildcards.seq_type, wildcards.genome_build, wildcards.gene, 
wildcards.chromosome, wildcards.start_position, wildcards.preset, input.snapshot]) with open(params.failed_summary, "a") as handle: handle.write(outline + "\n") # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: input: - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png", + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png", snapshot_qc = str(rules._igv_quality_control.output.snapshot_qc) output: - snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset_directory}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" threads: CFG["threads"]["_igv_symlink_snapshot"] run: @@ -592,7 +592,7 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: allow_missing = True ), pair_status_directory = PAIR_STATUS_DICT[pair_status.item()], - preset_directory = wildcards.preset_directory + preset = wildcards.preset ) else: return [] @@ -633,7 +633,7 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: allow_missing = True ), pair_status_directory = PAIR_STATUS_DICT[pair_status.item()], - preset_directory = wildcards.preset_directory + preset = wildcards.preset ) # Check that snapshots have been symlinked and quality controlled @@ -643,7 +643,7 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: snapshots = _symlink_snapshot, quality_control = _quality_control output: - snapshots = CFG["dirs"]["outputs"] + "completed/{preset_directory}/{seq_type}--{genome_build}--{tumour_id}.completed" + snapshots = CFG["dirs"]["outputs"] + "completed/{preset}/{seq_type}--{genome_build}--{tumour_id}.completed" shell: "touch {output.snapshots}" @@ -668,7 +668,7 @@ if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: maf = str(rules._igv_filter_maf.output.maf), summary_file_ready = str(rules._igv_touch_summary.output.summary_ready) output: - estimate_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_batch_scripts/{seq_type}--{genome_build}/{preset_directory}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") + estimate_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_batch_scripts/{seq_type}--{genome_build}/{preset}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") params: snapshot_summary = str(rules._igv_touch_summary.output.snapshot_summary), snapshot_estimate = str(rules._igv_touch_summary.output.snapshot_estimate) @@ -680,7 +680,7 @@ if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: seq_type = wildcards.seq_type genome_build = wildcards.genome_build tumour_id = wildcards.tumour_id - preset = wildcards.preset_directory + preset = wildcards.preset snapshot_summary = open(params.snapshot_summary, "a") snapshot_estimate = open(params.snapshot_estimate, "a") @@ -720,7 +720,7 @@ if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: maf = str(rules._igv_filter_maf.output.maf), failed_ready = str(rules._igv_touch_failed.output.failed_ready) output: - failed_finished = 
temp(CFG["dirs"]["batch_scripts"] + "estimate_failed_scripts/{seq_type}--{genome_build}/{preset_directory}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") + failed_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_failed_scripts/{seq_type}--{genome_build}/{preset}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") params: failed_summary = str(rules._igv_touch_failed.output.failed_summary) threads: (workflow.cores) @@ -732,7 +732,7 @@ if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: seq_type = wildcards.seq_type genome_build = wildcards.genome_build tumour_id = wildcards.tumour_id - preset = wildcards.preset_directory + preset = wildcards.preset pair_status_directory = PAIR_STATUS_DICT[wildcards.pair_status] for index, row in maf_table.iterrows(): @@ -789,7 +789,7 @@ if CFG["estimate_only"] is False and CFG["identify_failed_snaps"] is False: genome_build=CFG["runs"]["tumour_genome_build"], allow_missing=True ), - preset_directory=CFG["options"]["igv_presets"] + preset=CFG["options"]["igv_presets"] ) if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: @@ -806,7 +806,7 @@ if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: pair_status=CFG["runs"]["pair_status"], allow_missing=True ), - preset_directory=CFG["options"]["igv_presets"] + preset=CFG["options"]["igv_presets"] ) if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: @@ -823,7 +823,7 @@ if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: pair_status=CFG["runs"]["pair_status"], allow_missing=True ), - preset_directory=CFG["options"]["igv_presets"] + preset=CFG["options"]["igv_presets"] ) From f02f0fb65985ad272d7773cf544a36e916f850eb Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 13 Sep 2023 21:47:18 -0700 Subject: [PATCH 092/132] Clean up rule _igv_touch_failed --- modules/igv/1.0/igv.smk | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 8d542c0e..ba8efb8a 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -703,8 +703,6 @@ if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: rule _igv_touch_failed: - input: - filter_maf = expand(str(rules._igv_filter_maf.output.maf), zip, seq_type = CFG["runs"]["tumour_seq_type"], tumour_id = CFG["runs"]["tumour_sample_id"], genome_build=CFG["runs"]["tumour_genome_build"], normal_sample_id=CFG["runs"]["normal_sample_id"], pair_status=CFG["runs"]["pair_status"]) output: failed_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.txt", failed_ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/touch_failed.completed") From 870068ab2250015e612fb67242419efa852ec3c6 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:40:46 -0700 Subject: [PATCH 093/132] Add ability to provide multiple regions files in config --- modules/igv/1.0/config/default.yaml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index c51e6802..da4980ef 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -5,9 +5,23 @@ lcr-modules: inputs: # Available wildcards: {seq_type} {tumour_sample_id} {normal_sample_id} {pair_status} {genome_build} maf: "__UPDATE__" - 
regions_file: "__UPDATE__" # Path to a MAF, VCF, BED, OncodriveCLUSTL clusters file, HotMAPS results file, or mutation_id file containing regions of interest to snapshot. - regions_format: "__UPDATE__" # Available options are "bed", "maf", "oncodriveclustl", "hotmaps" or "mutation_id" - regions_build: "__UPDATE__" # Genome build of regions file, which will be lifted over as needed to filter MAFs on opposite genome builds + + regions: + oncodriveclustl: + grch37: [] + hg38: [] + hotmaps: + grch37: [] + hg38: [] + bed: + grch37: [] + hg38: [] + maf: + grch37: [] + hg38: [] + mutation_id: + grch37: [] + hg38: [] estimate_only: False # Stop after MAF filtering step to estimate total number of snapshots identify_failed_snaps: False From 4ac86285e257cf5e1b0727431b42780046f0e30a Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:42:42 -0700 Subject: [PATCH 094/132] Add rule to merge regions file of same tool + build combo --- modules/igv/1.0/igv.smk | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index ba8efb8a..15ed65e8 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -165,6 +165,25 @@ rule _igv_reduce_maf_cols: cut -f 1,5,6,7,9,10,11,13,16 {input.maf} > {output.maf} """) +rule _igv_merge_regions: + input: + input_regions = lambda w: config["lcr-modules"]["igv"]["regions"][w.tool_type][w.tool_build] + output: + merged_regions = CFG["dirs"]["inputs"] + "regions/{tool_type}_merged.{tool_build}.tsv" + log: + stdout = CFG["logs"]["inputs"] + "merge_{tool_type}_regions.{tool_build}.stdout.log", + stderr = CFG["logs"]["inputs"] + "merge_{tool_type}_regions.{tool_build}.stderr.log" + run: + merged_df = pd.DataFrame() + for result in input.input_regions: + try: + df = pd.read_table(result, comment="#", sep="\t") + merged_df = pd.concat([merged_df, df]) + except: + with open(log.stdout, "a") as header: + header.write(f"Error reading or merging file {result}\n") + merged_df.to_csv(output.merged_regions, sep="\t", index=False) + # Convert input regions file into BED format rule _igv_format_regions_file: input: From c447af1cb18489723bc1b7b731a1bab0eba90b45 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:45:07 -0700 Subject: [PATCH 095/132] Reformat regions for each tool and build combo --- modules/igv/1.0/igv.smk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 15ed65e8..4a2fa9a5 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -185,18 +185,18 @@ rule _igv_merge_regions: merged_df.to_csv(output.merged_regions, sep="\t", index=False) # Convert input regions file into BED format -rule _igv_format_regions_file: +rule _igv_format_regions: input: - regions = str(rules._igv_symlink_regions_file.output.regions_file) + regions = str(rules._igv_merge_regions.output.merged_regions) output: - regions = CFG["dirs"]["inputs"] + "regions/regions_file_formatted.txt" + regions = CFG["dirs"]["inputs"] + "regions/{tool_type}_formatted.{tool_build}.tsv" params: - regions_format = CFG["inputs"]["regions_format"], + regions_format = lambda w: w.tool_type, oncodriveclustl_params = CFG["options"]["filter_maf"]["oncodriveclustl_options"], - regions_build = CFG["inputs"]["regions_build"] + regions_build = lambda w: w.tool_build log: - stdout = CFG["logs"]["inputs"] + "format_regions.stdout.log", - stderr = 
CFG["logs"]["inputs"] + "format_regions.stderr.log" + stdout = CFG["logs"]["inputs"] + "format_regions_{tool_type}.{tool_build}.stdout.log", + stderr = CFG["logs"]["inputs"] + "format_regions_{tool_type}.{tool_build}.stderr.log" script: config["lcr-modules"]["igv"]["scripts"]["format_regions"] From ba82c640949b3a16d00e7be22e9269da0a352e84 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:46:09 -0700 Subject: [PATCH 096/132] Touch snakemake output if no regions files provided for tool+build combo --- modules/igv/1.0/etc/format_regions.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 8dff0f3d..aea004b4 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -26,6 +26,17 @@ def main(): output_file = snakemake.output[0] + line_count = 0 + with open(regions_file, "r") as handle: + for line in handle: + line_count += 1 + if line_count > 1: + break + if line_count < 2: + touch_output = open(output_file, "w") + touch_output.close() + exit() + if regions_format == "oncodriveclustl": global CLUSTL_PARAMS CLUSTL_PARAMS = snakemake.params[1] From b04e6ce95170b8e8ee812520bcab90532c3e9c26 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:46:51 -0700 Subject: [PATCH 097/132] Fix typo --- modules/igv/1.0/etc/format_regions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index aea004b4..2839221f 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -133,10 +133,10 @@ def format_clustl(clustl_regions): clustl_regions = clustl_regions.explode("COORDINATES") # Create columnsn required for BED format - chr_str = "chr" + clustl_regions["CHROMOSOME"].map(str) + chr_std = "chr" + clustl_regions["CHROMOSOME"].map(str) clustl_reformatted = pd.DataFrame( { - "chrom": chr_str, + "chrom": chr_std, "start": clustl_regions["COORDINATES"], "end": clustl_regions["COORDINATES"] } From 9aee267f126ee8b930515640d631f8d995e21e8c Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:48:35 -0700 Subject: [PATCH 098/132] Add function to convert MAFs to BED format --- modules/igv/1.0/etc/format_regions.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 2839221f..501a4b09 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -46,7 +46,7 @@ def main(): REGIONS_BUILD = snakemake.params[2] REGIONS_BUILD = REGIONS_BUILD.lower() - if regions_format == "bed" or regions_format == "maf": + if regions_format == "bed": # Do not need to reformat for liftover shutil.copy(regions_file, output_file) exit() @@ -143,12 +143,29 @@ def format_clustl(clustl_regions): ) return clustl_reformatted +def format_maf(maf): + # Read regions into dataframe + maf_regions = pd.read_table(maf, comment="#", sep="\t") + + # Create dataframe in BED format + chr_std = "chr" + maf_regions["Chromosome"].map(str).replace("chr","") + + maf_reformatted = pd.DataFrame( + { + "chrom": chr_std, + "start": maf_regions["Start_Position"], + "end": maf_regions["End_Position"] + } + ) + + return maf_reformatted def format_regions(regions, 
regions_format): format_functions = { "oncodriveclustl": format_clustl, "hotmaps": format_hotmaps, - "mutation_id": format_mutation_id + "mutation_id": format_mutation_id, + "maf": format_maf } return format_functions[regions_format](regions) From 59719179985c3a732f71f9e95525bf0aa3ac3eee Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:51:50 -0700 Subject: [PATCH 099/132] Run liftover on each tool+build combo... for each required genome build in CFG["runs"]["tumour_genome_build"] --- modules/igv/1.0/igv.smk | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 4a2fa9a5..55f8f601 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -201,7 +201,8 @@ rule _igv_format_regions: config["lcr-modules"]["igv"]["scripts"]["format_regions"] REGIONS_FORMAT = { - "maf": "maf", + "bed": "bed", + "maf": "bed", "oncodriveclustl": "bed", "hotmaps": "bed", "mutation_id": "bed" @@ -209,28 +210,28 @@ REGIONS_FORMAT = { rule _igv_liftover_regions: input: - regions = str(rules._igv_format_regions_file.output.regions), + regions = str(rules._igv_format_regions.output.regions), liftover_script = CFG["scripts"]["region_liftover_script"] output: - regions = CFG["dirs"]["inputs"] + "regions/regions_file_{genome_build}.crossmap.txt" + regions = CFG["dirs"]["inputs"] + "regions/{tool_type}.{tool_build}To{genome_build}.crossmap.txt" params: - chain_file = reference_files(CFG["options"]["liftover_regions"]["reference_chain_file"][(CFG["inputs"]["regions_build"]).replace("hg19","grch37").replace("grch38","hg38")]), + chain_file = lambda w: reference_files(config["lcr-modules"]["igv"]["options"]["liftover_regions"]["reference_chain_file"][w.tool_build]), target_reference = lambda w: config["lcr-modules"]["igv"]["options"]["liftover_regions"]["target_reference"][w.genome_build], - regions_type = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], - regions_build = CFG["inputs"]["regions_build"].replace("grch37","GRCh37").replace("hg38","GRCh38"), - target_build = lambda w: w.genome_build.replace("grch37","GRCh37").replace("hg38", "GRCh38") + regions_type = lambda w: REGIONS_FORMAT[(w.tool_type).lower()], + regions_build = lambda w: (w.tool_build).replace("grch37","GRCh37").replace("hg38","GRCh38"), + target_build = lambda w: (w.genome_build).replace("grch37","GRCh37").replace("hg38","GRCh38") conda: CFG["conda_envs"]["liftover_regions"] resources: **CFG["resources"]["_igv_liftover_regions"] log: - stdout = CFG["logs"]["inputs"] + "liftover_regions_{genome_build}.stdout.log", - stderr = CFG["logs"]["inputs"] + "liftover_regions_{genome_build}.stderr.log" + stdout = CFG["logs"]["inputs"] + "liftover_regions_{tool_type}.{tool_build}To{genome_build}.stdout.log", + stderr = CFG["logs"]["inputs"] + "liftover_regions_{tool_type}.{tool_build}To{genome_build}.stderr.log" shell: op.as_one_line(""" - {input.liftover_script} {input.regions} - {params.regions_type} {params.regions_build} {params.target_build} - {output.regions} {params.chain_file} + {input.liftover_script} + {input.regions} {params.regions_type} {params.regions_build} + {params.target_build} {output.regions} {params.chain_file} {params.target_reference} > {log.stdout} 2> {log.stderr} """) From 57e96e098607975e5293a6265ea311f1ec169c18 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:55:57 -0700 Subject: [PATCH 100/132] Merge ALL 
tool+build combo files that are the same target build into one file --- modules/igv/1.0/igv.smk | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 55f8f601..f5881a08 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -235,6 +235,34 @@ rule _igv_liftover_regions: {params.target_reference} > {log.stdout} 2> {log.stderr} """) +def _get_lifted_regions(wildcards): + CFG = config["lcr-modules"]["igv"] + return expand( + expand( + str(rules._igv_liftover_regions.output.regions), + tool_type = list(CFG["regions"]), + tool_build = ["grch37","hg38"], + allow_missing = True + ), + genome_build = wildcards.genome_build + ) + +rule _igv_merge_lifted_regions: + input: + regions = _get_lifted_regions + output: + regions = CFG["dirs"]["inputs"] + "regions/regions.{genome_build}.txt" + run: + merged_df = pd.DataFrame() + for region in input.regions: + try: + df = pd.read_table(region, comment = "#", sep = "\t") + merged_df = pd.concat([merged_df, df]) + except: + print(f"Lifted regions file is empty: {region}") + merged_df = merged_df.drop_duplicates() + merged_df.to_csv(output.regions, sep="\t", index=False) + # Filter MAF to lines containing positions of interest rule _igv_filter_maf: input: From d6eabda39c29f6a39ac17f05ce1693f954013749 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:57:53 -0700 Subject: [PATCH 101/132] Filter MAFs based using merged regions file of same build --- modules/igv/1.0/etc/filter_maf.py | 4 ---- modules/igv/1.0/igv.smk | 5 ++--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index d3583767..11d0494e 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -29,10 +29,6 @@ def main(): metadata = snakemake.params[2] - if regions_format == "oncodriveclustl": - global CLUSTL_PARAMS - CLUSTL_PARAMS = snakemake.params[1] - output_file = snakemake.output[0] # Return empty dataframe if no lines in MAF diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index f5881a08..aaef1946 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -267,12 +267,11 @@ rule _igv_merge_lifted_regions: rule _igv_filter_maf: input: maf = str(rules._igv_reduce_maf_cols.output.maf), - regions = str(rules._igv_liftover_regions.output.regions) + regions = str(rules._igv_merge_lifted_regions.output.regions) output: maf = CFG["dirs"]["inputs"] + "maf/filtered_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf" params: - regions_format = REGIONS_FORMAT[CFG["inputs"]["regions_format"].lower()], - oncodriveclustl_params = CFG["options"]["filter_maf"]["oncodriveclustl_options"], + regions_format = "bed", metadata = CFG["runs"] log: stdout = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}/filter_maf.stdout.log", From a7f33e7db931b9c90b519d4077c4f4d0fd9679ae Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 21:04:27 -0700 Subject: [PATCH 102/132] Update metadata argument position --- modules/igv/1.0/etc/filter_maf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 11d0494e..37096d72 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ 
-27,7 +27,7 @@ def main(): regions_file = snakemake.input[1] regions_format = snakemake.params[0] - metadata = snakemake.params[2] + metadata = snakemake.params[1] output_file = snakemake.output[0] From ff04705b5d20f7d357c3b1a6f9e9df717781625f Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Sep 2023 21:05:30 -0700 Subject: [PATCH 103/132] Remove regions dependency in rule _igv_create_batch_script_per_variant --- modules/igv/1.0/etc/generate_batch_script_per_variant.py | 6 +++--- modules/igv/1.0/igv.smk | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index c1da0bb1..9fc87586 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -22,17 +22,17 @@ def main(): try: # Handle matched samples with matched normal BAMs - input_bams = snakemake.input[0:len(snakemake.input) - 3] + input_bams = snakemake.input[0:len(snakemake.input) - 1] input_bam = input_bams[:int(len(input_bams) / 2)] input_bai = input_bams[int(len(input_bams)/2):] - inputs = snakemake.input[-3:len(snakemake.input)] + maf = snakemake.input[-1] batch_options = snakemake.params[4] # Print run info for logging print(f"Setting up batch scripts using the following inputs:\nBam files:\t{input_bam}\nBai files:\t{input_bai}\nParameters:\t{snakemake.params[6]}\nBatch options:\t{batch_options}") - input_maf = open(inputs[0], "r") + input_maf = open(maf, "r") # Skip if no variants in outfile line_count = 0 diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index aaef1946..ccc323db 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -341,9 +341,7 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: input: bam_file = _get_bam_files, bai_file = _get_bai_files, - filter_maf = _get_maf, - regions_lifted = str(rules._igv_liftover_regions.output.regions), - regions_formatted = str(rules._igv_format_regions_file.output.regions) + filter_maf = _get_maf output: batches_finished = CFG["dirs"]["batch_scripts"] + "completed/{seq_type}--{genome_build}/{tumour_id}.finished", variant_batch = expand(CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{{seq_type}}--{{genome_build}}/{preset}/{{tumour_id}}" + SUFFIX + ".batch", preset = CFG["options"]["igv_presets"], allow_missing=True), From e62ce63114f43dbc4bf86db547d52bf8341c0cfd Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Mon, 18 Sep 2023 15:57:06 -0700 Subject: [PATCH 104/132] Skip liftover step if input regions file is empty --- modules/igv/1.0/etc/liftover_regions.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/etc/liftover_regions.sh b/modules/igv/1.0/etc/liftover_regions.sh index be39f72b..05c3cb61 100755 --- a/modules/igv/1.0/etc/liftover_regions.sh +++ b/modules/igv/1.0/etc/liftover_regions.sh @@ -29,12 +29,19 @@ echo "Target reference: $target_ref" intermediate_output_file=$(echo $output_file)_int +# Skip empty files +lines=$(wc -l < $input_regions) +if [ ! $lines -gt 0 ] ; +then + touch $output_file +fi + # MAFs # Check genome build of incoming MAF file to determine what build it needs to be changed to if [ "$input_type" == "maf" ] ; then echo "Proceeding with MAF input..." 
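+    # NOTE: $lines is checked again in each branch below because the empty-file guard above
+    # only touches $output_file and does not exit the script.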
- if [ $regions_build == $target_build ] ; + if [ $regions_build == $target_build ] && [ $lines -gt 0 ] ; then echo "WARNING: Input regions file $input_regions is already $target_build. Copying contents of $input_regions to $output_file"; cut -f 1,5,6,7,9,10,11,13,16 $input_regions > $output_file @@ -48,7 +55,7 @@ then echo "Finished MAF block." fi -if [ "$input_type" == "bed" ] ; +if [ "$input_type" == "bed" ] && [ $lines -gt 0 ] ; then echo "Proceeding with BED input..." if [ $regions_build == $target_build ] ; From 064ac93c2cfb332dbc41164e2f0e85eb569b87cb Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Fri, 22 Dec 2023 13:24:07 -0800 Subject: [PATCH 105/132] Added more flexibility in setting qc thresholds, because trying to stack normal and tumor bams were producing many more truncated and blank snaps with a variety of heights --- modules/igv/1.0/config/default.yaml | 19 +++++ modules/igv/1.0/igv.smk | 120 +++++++++++++++++++++++----- 2 files changed, 117 insertions(+), 22 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index da4980ef..82825615 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -61,6 +61,25 @@ lcr-modules: server_number: "99" server_args: "" + quality_control: + tumour_only: + truncated: ["506,545,547,559,570"] + blank: + "547": + kurtosis: 18.5 + skewness: -4 + "559": + kurtosis: 18.2 + skewness: -4 + failed: ["506"] + tumour_normal_pair: + truncated: ["533","1226","1055"] + blank: + "<1000": + kurtosis: 8 + skewness: -3 + failed: ["533"] + scripts: format_regions: "etc/format_regions.py" filter_script: "etc/filter_maf.py" diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index ccc323db..4edb4753 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -102,13 +102,6 @@ def get_maf(wildcards): # Symlinks the input files into the module results directory (under '00-inputs/') -rule _igv_symlink_regions_file: - input: - regions_file = CFG["inputs"]["regions_file"] - output: - regions_file = CFG["dirs"]["inputs"] + "regions/regions_file.txt" - run: - op.absolute_symlink(input.regions_file, output.regions_file) rule _igv_symlink_bam: input: @@ -256,12 +249,13 @@ rule _igv_merge_lifted_regions: merged_df = pd.DataFrame() for region in input.regions: try: - df = pd.read_table(region, comment = "#", sep = "\t") + df = pd.read_table(region, comment = "#", sep = "\t", header=None) + df.drop(df[df[0] == "chrom"].index, inplace = True) merged_df = pd.concat([merged_df, df]) except: print(f"Lifted regions file is empty: {region}") merged_df = merged_df.drop_duplicates() - merged_df.to_csv(output.regions, sep="\t", index=False) + merged_df.to_csv(output.regions, sep="\t", index=False, header=False) # Filter MAF to lines containing positions of interest rule _igv_filter_maf: @@ -499,7 +493,7 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: failed_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.txt", ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.completed") run: - header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","preset","snapshot_path"]) + header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","preset","status","snapshot_path"]) with open(output.failed_summary, "w") as handle: handle.write(header + "\n") ready = open(output.ready, "w") @@ -519,7 +513,8 @@ if 
CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", server_args = CFG["options"]["xvfb_parameters"]["server_args"], batch_temp = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch.temp", - failed_summary = str(rules._igv_track_failed.output.failed_summary) + failed_summary = str(rules._igv_track_failed.output.failed_summary), + thresholds = CFG["options"]["quality_control"] resources: **CFG["resources"]["_igv_quality_control"] log: @@ -546,26 +541,58 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: if is_corrupt == True: os.system(f'echo "Snapshot may be corrupt." >> {log.stdout}') success = False - if height in ["506","547","559"]: + # Check if truncated + if height in params.thresholds[wildcards.pair_status_directory]["truncated"]: attempts = 0 os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 5000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') - while height in ["506","547","559"] and attempts < 3: + while height in params.thresholds[wildcards.pair_status_directory]["truncated"] and attempts < 3: os.system(f'echo "Snapshot may be truncated. Current snapshot height is {height}. Rerunning IGV batch script {params.batch_script} with increased sleep interval.\n" >> {log.stdout}') attempts += 1 os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stdout}') os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stderr}') os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - if height in ["547","559"]: + # Check if blank + possibly_blank = False + blank_heights = list(params.tresholds[wildcards.pair_status_directory]["blank"]) + for height_threshold in blank_heights: + if any(symbol in height_threshold for symbol in ["<",">"]): + if "<" in height_threshold: + if float(height) < float(height_threshold.replace("<","")): + possibly_blank = True + if ">" in height_threshold: + if float(height) > float(height_threshold.replace("<","")): + possibly_blank = True + else: + if height == height_threshold: + possibly_blank = True + if possibly_blank == True: + # Determine if snap is blank based on kurtosis and skewness values + kurtosis_threshold = blank_heights[height_threshold]["kurtosis"] + skewness_threshold = blank_heights[height_threshold]["skewness"] kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {input.snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] - blank_kurtosis = {"547": 18.5, "559": 18.2} - blank_skew = -4 - if kurtosis > blank_kurtosis[height] and skewness < blank_skew: - os.system(f'echo "Snapshot may be blank. 
Current values:\nHeight:{height}, kurtosis: {str(kurtosis)}, skewness: {str(skewness)}\nSnapshots with height of 547, kurtosis greater than 18.5, and skewness less than 4 are likely blank, snapshots with height of 559, kurtosis greater than 18.2, and skewness less than 4 may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors or other errors occurring during IGV run." >> {log.stdout}') + if kurtosis > kurtosis_threshold and skewness < skewness_threshold and attempts == 0: + # Rerun if snapshot was not run during truncated check + os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 5000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') + while kurtosis > kurtosis_threshold and skewness < skewness_threshold and new_height == height and attempts < 3: + os.system(f'echo "Snapshot may be truncated. Current snapshot height is {height}. Rerunning IGV batch script {params.batch_script} with increased sleep interval.\n" >> {log.stdout}') + attempts += 1 + os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stdout}') + os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stderr}') + os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') + new_height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {input.snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] + if new_height != height: + if height_range(new_height, height, symbol): + do !kurtosis cchecks and stuff + if kurtosis > kurtosis_threshold and skewness < skewness_threshold: + os.system(f'echo "Snapshot may be blank. Current values:\nHeight:{height}, kurtosis: {str(kurtosis)}, skewness: {str(skewness)}\nSnapshots at this height with kurtosis values greater than {str(kurtosis_threshold)}, and skewness values less than {str(skewness_threshold)} may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors or other errors occurring during IGV run." >> {log.stdout}') success = False - if height == "506": - os.system(f'echo "Snapshot height is {height} and may still be truncated or improperly loaded. Check snapshot {input.snapshot}" >> {log.stdout}') - success = False + break + # Check if height matches heights of failed snapshots + if height in params.thresholds[wildcards.pair_status_directory]["failed"]: + os.system(f'echo "Snapshot height is {height} and may still be truncated or improperly loaded. Check snapshot {input.snapshot}" >> {log.stdout}') + success = False if width == "640": attempts = 0 os.system(f'echo "Snapshot appears to be in incorrect dimensions. Current width is {width} and might be due to xvfb-run unable to connect to current server {params.server_number} due to a server lock. Attempting to run on different server number..." 
>> {log.stdout}') @@ -585,9 +612,58 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: if success == True: os.system(f'touch {output.snapshot_qc}') if success == False: - outline = "\t".join([wildcards.tumour_id, wildcards.seq_type, wildcards.genome_build, wildcards.gene, wildcards.chromosome, wildcards.start_position, wildcards.preset, input.snapshot]) + qc_status = "failed" + outline = "\t".join([wildcards.tumour_id, wildcards.seq_type, wildcards.genome_build, wildcards.gene, wildcards.chromosome, wildcards.start_position, wildcards.preset, qc_status, input.snapshot]) with open(params.failed_summary, "a") as handle: handle.write(outline + "\n") + elif attempts == 3: + qc_status = "suspicious" + outline = "\t".join([wildcards.tumour_id, wildcards.seq_type, wildcards.genome_build, wildcards.gene, wildcards.chromosome, wildcards.start_position, wildcards.preset, qc_status, input.snapshot]) + with open(params.failed_summary, "a") as handle: + handle.write(outline + "\n") + + #if height in ["506","545","547","559","570"]: + # attempts = 0 + # os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 5000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') + # while height in ["506","545","547","559","570"] and attempts < 3: + # os.system(f'echo "Snapshot may be truncated. Current snapshot height is {height}. Rerunning IGV batch script {params.batch_script} with increased sleep interval.\n" >> {log.stdout}') + # attempts += 1 + # os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stdout}') + # os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stderr}') + # os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') + # height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + # if height in ["547","559"]: + # kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {input.snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] + # blank_kurtosis = {"547": 18.5, "559": 18.2} + # blank_skew = -4 + # if kurtosis > blank_kurtosis[height] and skewness < blank_skew: + # os.system(f'echo "Snapshot may be blank. Current values:\nHeight:{height}, kurtosis: {str(kurtosis)}, skewness: {str(skewness)}\nSnapshots with height of 547, kurtosis greater than 18.5, and skewness less than 4 are likely blank, snapshots with height of 559, kurtosis greater than 18.2, and skewness less than 4 may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors or other errors occurring during IGV run." >> {log.stdout}') + # success = False + # if height == "506": + # os.system(f'echo "Snapshot height is {height} and may still be truncated or improperly loaded. Check snapshot {input.snapshot}" >> {log.stdout}') + # success = False + #if width == "640": + # attempts = 0 + # os.system(f'echo "Snapshot appears to be in incorrect dimensions. Current width is {width} and might be due to xvfb-run unable to connect to current server {params.server_number} due to a server lock. Attempting to run on different server number..." 
>> {log.stdout}') + # if params.server_number == "--auto-servernum" or int(params.server_number.replace("-n ","")) >= 99: + # new_server = 1 + # else: + # new_server = int(params.server_number.replace("-n ","")) + 1 + # while width == "640" and attempts < 5: + # os.system(f'echo "Attempting with server number {new_server}..." >> {log.stdout}') + # os.system(f'echo "Attempting with server number {new_server}..." >> {log.stderr}') + # os.system(f'maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') + # width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] + # new_server += 1 + # if width == "640": + # os.system(f'echo "Snapshot still appears to be in improper dimensions. Double check xvfb-run parameters." >> {log.stdout}') + # success = False + #if success == True: + # os.system(f'touch {output.snapshot_qc}') + #if success == False: + # outline = "\t".join([wildcards.tumour_id, wildcards.seq_type, wildcards.genome_build, wildcards.gene, wildcards.chromosome, wildcards.start_position, wildcards.preset, input.snapshot]) + # with open(params.failed_summary, "a") as handle: + # handle.write(outline + "\n") # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: From d8fa50c34f47dd6005f26f7a1c86fb2359aa5124 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Fri, 29 Dec 2023 16:57:31 -0800 Subject: [PATCH 106/132] Move quality control process into a script --- modules/igv/1.0/config/default.yaml | 3 +- modules/igv/1.0/etc/quality_control.py | 251 +++++++++++++++++++++++++ modules/igv/1.0/igv.smk | 145 +------------- 3 files changed, 255 insertions(+), 144 deletions(-) create mode 100644 modules/igv/1.0/etc/quality_control.py diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 82825615..ae799ac6 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -63,7 +63,7 @@ lcr-modules: quality_control: tumour_only: - truncated: ["506,545,547,559,570"] + truncated: ["506","545","547","559",570"] blank: "547": kurtosis: 18.5 @@ -86,6 +86,7 @@ lcr-modules: region_liftover_script: "{MODSDIR}/etc/liftover_regions.sh" batch_script_per_variant: "etc/generate_batch_script_per_variant.py" batch_script: "etc/generate_batch_scripts.py" + quality_control: "etc/quality_control.py" scratch_subdirectories: [] diff --git a/modules/igv/1.0/etc/quality_control.py b/modules/igv/1.0/etc/quality_control.py new file mode 100644 index 00000000..d7d1847c --- /dev/null +++ b/modules/igv/1.0/etc/quality_control.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python + +import subprocess +import sys + +def increaseSleepInterval(batch_file): + """ + Increase sleep interval between batch commands by 5 seconds + """ + os.system(f'sleep=$(grep "Sleep" {batch_file} | cut -d " " -f2) && new_sleep=$(($sleep + 5000)) && sed -i "s/setSleepInterval $sleep/setSleepInterval $new_sleep/g" {batch_file}') + +def runIGV(batch_file, igv, status, message, attempt): + os.system(f'echo "Snapshot may be {status}. {message}... Rerunning IGV... 
Attempt {str(attempt)}:" >> {stdout} 2>> {stderr}')
+    os.system(f'maxtime=$(($(wc -l < {batch_file}) * 60 + 15)) && timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {igv["server_num"]} {igv["server_args"]} {igv["igv"]} -b {batch_file} >> {stdout} 2>> {stderr}')
+
+def getImageQualities(snapshot, batch_file, igv, summary_file, attempts = 0):
+    height = None
+    width = None
+    kurtosis = None
+    skewness = None
+    corrupt = True
+    while attempts < 4 and corrupt:
+        attempts += 1
+        try:
+            height = str(subprocess.check_output(f"identify -format '%h' {snapshot}", shell=True)).split("'")[1].split("\\n")[0]
+            width = str(subprocess.check_output(f"identify -format '%w' {snapshot}", shell=True)).split("'")[1].split("\\n")[0]
+            kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f"identify -verbose {snapshot} | grep -E 'kurtosis|skewness' | tail -n 2", shell=True)).replace("\\n'","").split("\\n ")]
+            corrupt = False
+        except:
+            status = "corrupt"
+            message = ""
+            if attempts < 3:
+                runIGV(batch_file, igv, status, message, attempts)
+    if attempts == 4 and corrupt:
+        # Quit because snapshot is corrupt and can't run quality control
+        qc_status = "failed"
+        logFailedSnapshots(snapshot, summary_file, qc_status)
+        sys.exit()
+
+    quality_dict = {
+        "height": height,
+        "width": width,
+        "kurtosis": kurtosis,
+        "skewness": skewness
+    }
+
+    return quality_dict
+
+def handleIncorrectDimensions(snapshot, img_values, thresholds, batch_file, igv, failed_summary):
+    status = "in incorrect dimensions"
+    attempts = 0
+
+    # Increase sleep interval
+    increaseSleepInterval(batch_file)
+
+    while float(img_values["width"]) == 640 and attempts < 5:
+        attempts += 1
+        previous_server_arg = igv["server_num"]
+
+        if igv["server_num"] == "--auto-servernum" or int(igv["server_num"].replace("-n ","")) >= 99:
+            new_server_arg = "-n 1"
+        else:
+            new_server_arg = f'-n {int(igv["server_num"].replace("-n ","")) + 1}'
+
+        message = f'Current snapshot width is {img_values["width"]}, while 1020 is expected. This might occur if xvfb-run is unable to connect to the current server ({previous_server_arg}) due to a server lock. Switching server numbers... Attempting new server argument: {new_server_arg}'
+
+        igv["server_num"] = new_server_arg
+
+        # Rerun IGV
+        runIGV(batch_file, igv, status, message, attempts)
+
+        # Update image values
+        img_values = getImageQualities(snapshot, batch_file, igv, failed_summary)
+
+    return attempts, img_values
+
+def handleTruncated(snapshot, img_values, thresholds, batch_file, igv, dim_attempts, failed_summary):
+    status = "truncated"
+    attempts = 0
+
+    # Increase sleep interval (only if it was not already increased while fixing dimensions)
+    #os.system(f'sleep=$(grep "Sleep" {batch_file} | cut -d " " -f 2) && new_sleep=$(($sleep + 5000)) && sed -i "s/setSleepInterval $sleep/setSleepInterval $new_sleep/g" {batch_file}')
+    if dim_attempts == 0:
+        increaseSleepInterval(batch_file)
+
+    # Rerun IGV until height is no longer truncated
+    while img_values["height"] in thresholds["truncated"] and attempts < 3:
+        attempts += 1
+        message = f'Current snapshot height is {img_values["height"]}.'
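+        # NOTE: each retry below re-runs the same batch script after the single sleep-interval
+        # bump above; heights listed under thresholds["truncated"] are the ones observed for
+        # partially rendered snapshots, so any other height ends the retry loop.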
+
+        # Rerun IGV
+        runIGV(batch_file, igv, status, message, attempts)
+
+        # Update image values
+        img_values = getImageQualities(snapshot, batch_file, igv, failed_summary)
+
+    return attempts, img_values
+
+def handleBlank(snapshot, img_values, thresholds, batch_file, igv, failed_summary, truncated_attempts, dim_attempts):
+    status = "blank"
+    attempts = 0
+
+    if dim_attempts == 0 and truncated_attempts == 0:
+        increaseSleepInterval(batch_file)
+
+    blank = True
+    while blank and attempts < 3:
+        attempts += 1
+        message = f'Current snapshot values are: {img_values["height"]} height, {img_values["kurtosis"]} kurtosis, and {img_values["skewness"]} skewness. Snapshots with these values may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors, or other errors that occur during the IGV run. Rerunning with increased sleep interval.'
+
+        # Rerun IGV
+        runIGV(batch_file, igv, status, message, attempts)
+
+        # Get updated values
+        img_values = getImageQualities(snapshot, batch_file, igv, failed_summary)
+
+        # Check if still blank
+        blank = is_blank(img_values, thresholds["blank"])
+
+    return attempts, img_values
+
+
+def is_blank(img_values, thresholds):
+    blank_check = any(
+        (
+            (
+                ("<" in height_threshold and float(img_values["height"]) < float(height_threshold.replace("<",""))) or
+                (">" in height_threshold and float(img_values["height"]) > float(height_threshold.replace(">",""))) or
+                (not any(symbol in height_threshold for symbol in ["<",">"]) and float(img_values["height"]) == float(height_threshold))
+            ) and
+            (float(img_values["kurtosis"]) > float(thresholds[height_threshold]["kurtosis"])) and
+            (float(img_values["skewness"]) < float(thresholds[height_threshold]["skewness"]))
+        )
+        for height_threshold in list(thresholds)
+    )
+
+    return blank_check
+
+def qualityControl(snapshot, batch_file, igv, img_values, thresholds, failed_summary, attempts=0):
+    # Set default values for attempts so sleep value is not perpetually increased
+    dimension_attempts = 0
+    truncated_attempts = 0
+    blank_attempts = 0
+
+    # Set default qc status
+    qc_status = "pass"
+
+    # Check width
+    if float(img_values["width"]) == 640:
+        dimension_attempts, img_values = handleIncorrectDimensions(snapshot, img_values, thresholds, batch_file, igv, failed_summary)
+
+    # Handle truncated attempts
+    if img_values["height"] in thresholds["truncated"]:
+        truncated_attempts, img_values = handleTruncated(snapshot, img_values, thresholds, batch_file, igv, dimension_attempts, failed_summary)
+
+    # Check if blank
+    blank_thresholds = thresholds["blank"]
+
+    blank = is_blank(img_values, blank_thresholds)
+
+    if blank:
+        blank_attempts, img_values = handleBlank(snapshot, img_values, thresholds, batch_file, igv, failed_summary, truncated_attempts, dimension_attempts)
+
+    #if blank:
+    #    status = "blank"
+    #    if truncated_attempts == 0:
+    #        # Increase sleep timer if it hasn't been increased before
+    #        os.system(f'sleep=$(grep "Sleep" {batch_file} | cut -d " " -f 2) && new_sleep=$(($sleep + 5000)) && sed -i "s/setSleepInterval $sleep/setSleepInterval $new_sleep/g" {batch_file}')
+    #    blank_attempts = 0
+    #    while blank and blank_attempts < 3:
+    #        blank_attempts += 1
+    #        message = f"Current snapshot values are: {img_values["height"]} height, {img_values["kurtosis"]}, and {img_values["skewness"]} skewness. Snapshots with these values may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors, or other errors occurring during IGV run."
+ # # Rerun IGV + # runIGV(batch_file, igv, status, message, blank_attempts) + # # Get updated values + # img_values = getImageQualities(snapshot, batch_file, igv, failed_summary) + # # Check if blank + # blank = is_blank(img_values, blank_heights) + + # Check final values and log failed/suspicious + if float(img_values["width"]) == 640: + os.system(f'echo "Snapshot width is {img_values["width"]}. Improper dimensions should be fixed and rerun. Check snapshot {snapshot}" >> {stdout}') + qc_status = "fail" + + if img_values["height"] in thresholds["failed"]: + os.system(f'echo "Snapshot height is {img_values["height"]} and may still be truncated or improperly loaded. Check snapshot {snapshot}"" >> {stdout}') + qc_status = "fail" + + if qc_status == "pass" and any(dimension_attempts >= 5, truncated_attempts >= 3, blank_attempts >= 3): + qc_status = "suspicious" + + return qc_status + +def logFailedSnapshots(snapshot, summary_file, qc_status): + outline = "\t".join([snakemake.wildcards["tumour_id"], + snakemake.wildcards["seq_type"], + snakemake.wildcards["genome_build"], + snakemake.wildcards["gene"], + snakemake.wildcards["chromosome"], + snakemake.wildcards["start_position"], + snakemake.wildcards["preset"], + qc_status, + snapshot]) + with open(summary_file, "a" as handle: + handle.write(outline + "\n")) + +def main(): + # Output file + outfile = snakemake.output["snapshot_qc"] + + ## Quality control variables + snapshot = snakemake.input["snapshot"] + qc_thresholds = snakemake.params["thresholds"][snakemake.wildcards["pair_status_directory"]] + + ## Batch scripts + + batch_script = snakemake.params["batch_script"] + merged_batch = snakemake.params["merged_batch"] + batch_temp = snakemake.params["batch_temp"] + # Set up the temporary batch script file + os.system('cat {batch_script} > {batch_temp} && echo "exit" >> {batch_temp}') + + ## Variables for running IGV + igv_exec = { + "igv": snakemake.params["igv"], + "server_num": snakemake.params["server_number"] + "server_args": snakemake.params["server_args"] + } + + ## Summary file to append failed snapshots to + f_summary = snakemake.params["failed_summary"] + + ## Logging files + global stdout + global stderr + + stdout = snakemake.log["stdout"] + stderr = snakemake.log["stderr"] + + # Get image qualities + img_values = getImageQualities(snapshot, batch_temp, igv_exec, f_summary) + + # Control the qualities + results = qualityControl(snapshot, batch_temp, igv_exec, img_values, qc_thresholds, f_summary) + + if results != "pass": + logFailedSnapshots(snapshot, f_summary, results) + if results != "fail": + os.system(f'touch {outfile}') + + + \ No newline at end of file diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 4edb4753..72916b61 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -521,149 +521,8 @@ if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", stderr = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stderr.log" threads: (workflow.cores) - run: - import subprocess - success = True - corrupt_checks = 0 - is_corrupt = True - height = None - width = None - while corrupt_checks < 2 and is_corrupt == True: - corrupt_checks += 1 - try: - height = 
str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - is_corrupt = False - except: - os.system(f'echo "Snapshot may be corrupt. Rerunning IGV, attempt {str(corrupt_checks)}:" >> {log.stdout}') - os.system(f'cat {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') - os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') - if is_corrupt == True: - os.system(f'echo "Snapshot may be corrupt." >> {log.stdout}') - success = False - # Check if truncated - if height in params.thresholds[wildcards.pair_status_directory]["truncated"]: - attempts = 0 - os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 5000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') - while height in params.thresholds[wildcards.pair_status_directory]["truncated"] and attempts < 3: - os.system(f'echo "Snapshot may be truncated. Current snapshot height is {height}. Rerunning IGV batch script {params.batch_script} with increased sleep interval.\n" >> {log.stdout}') - attempts += 1 - os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stdout}') - os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stderr}') - os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') - height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - # Check if blank - possibly_blank = False - blank_heights = list(params.tresholds[wildcards.pair_status_directory]["blank"]) - for height_threshold in blank_heights: - if any(symbol in height_threshold for symbol in ["<",">"]): - if "<" in height_threshold: - if float(height) < float(height_threshold.replace("<","")): - possibly_blank = True - if ">" in height_threshold: - if float(height) > float(height_threshold.replace("<","")): - possibly_blank = True - else: - if height == height_threshold: - possibly_blank = True - if possibly_blank == True: - # Determine if snap is blank based on kurtosis and skewness values - kurtosis_threshold = blank_heights[height_threshold]["kurtosis"] - skewness_threshold = blank_heights[height_threshold]["skewness"] - kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {input.snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] - if kurtosis > kurtosis_threshold and skewness < skewness_threshold and attempts == 0: - # Rerun if snapshot was not run during truncated check - os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 5000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') - while kurtosis > kurtosis_threshold and skewness < skewness_threshold and new_height == height and attempts < 3: - os.system(f'echo "Snapshot may be truncated. Current snapshot height is {height}. 
Rerunning IGV batch script {params.batch_script} with increased sleep interval.\n" >> {log.stdout}') - attempts += 1 - os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stdout}') - os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stderr}') - os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') - new_height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {input.snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] - if new_height != height: - if height_range(new_height, height, symbol): - do !kurtosis cchecks and stuff - if kurtosis > kurtosis_threshold and skewness < skewness_threshold: - os.system(f'echo "Snapshot may be blank. Current values:\nHeight:{height}, kurtosis: {str(kurtosis)}, skewness: {str(skewness)}\nSnapshots at this height with kurtosis values greater than {str(kurtosis_threshold)}, and skewness values less than {str(skewness_threshold)} may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors or other errors occurring during IGV run." >> {log.stdout}') - success = False - break - # Check if height matches heights of failed snapshots - if height in params.thresholds[wildcards.pair_status_directory]["failed"]: - os.system(f'echo "Snapshot height is {height} and may still be truncated or improperly loaded. Check snapshot {input.snapshot}" >> {log.stdout}') - success = False - if width == "640": - attempts = 0 - os.system(f'echo "Snapshot appears to be in incorrect dimensions. Current width is {width} and might be due to xvfb-run unable to connect to current server {params.server_number} due to a server lock. Attempting to run on different server number..." >> {log.stdout}') - if params.server_number == "--auto-servernum" or int(params.server_number.replace("-n ","")) >= 99: - new_server = 1 - else: - new_server = int(params.server_number.replace("-n ","")) + 1 - while width == "640" and attempts < 5: - os.system(f'echo "Attempting with server number {new_server}..." >> {log.stdout}') - os.system(f'echo "Attempting with server number {new_server}..." >> {log.stderr}') - os.system(f'maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') - width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - new_server += 1 - if width == "640": - os.system(f'echo "Snapshot still appears to be in improper dimensions. Double check xvfb-run parameters." 
>> {log.stdout}') - success = False - if success == True: - os.system(f'touch {output.snapshot_qc}') - if success == False: - qc_status = "failed" - outline = "\t".join([wildcards.tumour_id, wildcards.seq_type, wildcards.genome_build, wildcards.gene, wildcards.chromosome, wildcards.start_position, wildcards.preset, qc_status, input.snapshot]) - with open(params.failed_summary, "a") as handle: - handle.write(outline + "\n") - elif attempts == 3: - qc_status = "suspicious" - outline = "\t".join([wildcards.tumour_id, wildcards.seq_type, wildcards.genome_build, wildcards.gene, wildcards.chromosome, wildcards.start_position, wildcards.preset, qc_status, input.snapshot]) - with open(params.failed_summary, "a") as handle: - handle.write(outline + "\n") - - #if height in ["506","545","547","559","570"]: - # attempts = 0 - # os.system(f'sleep=$(grep "Sleep" {params.batch_script} | cut -d " " -f 2) && sed "s/setSleepInterval $sleep/setSleepInterval 5000/g" {params.batch_script} > {params.batch_temp} && echo "exit" >> {params.batch_temp}') - # while height in ["506","545","547","559","570"] and attempts < 3: - # os.system(f'echo "Snapshot may be truncated. Current snapshot height is {height}. Rerunning IGV batch script {params.batch_script} with increased sleep interval.\n" >> {log.stdout}') - # attempts += 1 - # os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stdout}') - # os.system(f'echo "IGV ATTEMPT #{attempts}:" >> {log.stderr}') - # os.system(f'maxtime=$(($(wc -l < {params.batch_temp}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {params.server_number} {params.server_args} {params.igv} -b {params.batch_temp} >> {log.stdout} 2>> {log.stderr}') - # height = str(subprocess.check_output(f"identify -format '%h' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - # if height in ["547","559"]: - # kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {input.snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] - # blank_kurtosis = {"547": 18.5, "559": 18.2} - # blank_skew = -4 - # if kurtosis > blank_kurtosis[height] and skewness < blank_skew: - # os.system(f'echo "Snapshot may be blank. Current values:\nHeight:{height}, kurtosis: {str(kurtosis)}, skewness: {str(skewness)}\nSnapshots with height of 547, kurtosis greater than 18.5, and skewness less than 4 are likely blank, snapshots with height of 559, kurtosis greater than 18.2, and skewness less than 4 may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors or other errors occurring during IGV run." >> {log.stdout}') - # success = False - # if height == "506": - # os.system(f'echo "Snapshot height is {height} and may still be truncated or improperly loaded. Check snapshot {input.snapshot}" >> {log.stdout}') - # success = False - #if width == "640": - # attempts = 0 - # os.system(f'echo "Snapshot appears to be in incorrect dimensions. Current width is {width} and might be due to xvfb-run unable to connect to current server {params.server_number} due to a server lock. Attempting to run on different server number..." >> {log.stdout}') - # if params.server_number == "--auto-servernum" or int(params.server_number.replace("-n ","")) >= 99: - # new_server = 1 - # else: - # new_server = int(params.server_number.replace("-n ","")) + 1 - # while width == "640" and attempts < 5: - # os.system(f'echo "Attempting with server number {new_server}..." 
>> {log.stdout}') - # os.system(f'echo "Attempting with server number {new_server}..." >> {log.stderr}') - # os.system(f'maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" -n {str(new_server)} {params.server_args} {params.igv} -b {params.merged_batch} >> {log.stdout} 2>> {log.stderr}') - # width = str(subprocess.check_output(f"identify -format '%w' {input.snapshot}", shell=True)).split("'")[1].split("\\n")[0] - # new_server += 1 - # if width == "640": - # os.system(f'echo "Snapshot still appears to be in improper dimensions. Double check xvfb-run parameters." >> {log.stdout}') - # success = False - #if success == True: - # os.system(f'touch {output.snapshot_qc}') - #if success == False: - # outline = "\t".join([wildcards.tumour_id, wildcards.seq_type, wildcards.genome_build, wildcards.gene, wildcards.chromosome, wildcards.start_position, wildcards.preset, input.snapshot]) - # with open(params.failed_summary, "a") as handle: - # handle.write(outline + "\n") + script: + CFG["scripts"]["quality_control"] # Symlinks the final output files into the module results directory (under '99-outputs/') rule _igv_symlink_snapshot: From f565b3088fbe00af8bade7455f1eb819570fb680 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 24 Jan 2024 02:17:02 -0800 Subject: [PATCH 107/132] Overhaul to take snaps of N and T if applicable --- modules/igv/1.0/etc/filter_maf.py | 2 +- .../etc/generate_batch_script_per_variant.py | 152 ++- modules/igv/1.0/igv.smk | 1042 +++++++++-------- modules/igv/CHANGELOG.md | 29 +- 4 files changed, 649 insertions(+), 576 deletions(-) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 37096d72..1f85571b 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -123,7 +123,7 @@ def maf_add_columns(maf, metadata, wildcards): return maf def write_output(maf, outfile): - maf.to_csv(outfile, sep="\t", index=False) + maf.to_csv(outfile, sep="\t", na_rep="NA", index=False) if __name__ == "__main__": logging.basicConfig( diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 9fc87586..1f7e08af 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import oncopipe as op +import copy import sys import logging import traceback @@ -22,61 +23,76 @@ def main(): try: # Handle matched samples with matched normal BAMs - input_bams = snakemake.input[0:len(snakemake.input) - 1] - input_bam = input_bams[:int(len(input_bams) / 2)] - input_bai = input_bams[int(len(input_bams)/2):] + input_bam = snakemake.input["bam_file"] + input_bai = snakemake.input["bai_file"] - maf = snakemake.input[-1] - batch_options = snakemake.params[4] + maf = snakemake.input["filter_maf"] + + batch_options = snakemake.params["batch_options"] # Print run info for logging - print(f"Setting up batch scripts using the following inputs:\nBam files:\t{input_bam}\nBai files:\t{input_bai}\nParameters:\t{snakemake.params[6]}\nBatch options:\t{batch_options}") - - input_maf = open(maf, "r") - - # Skip if no variants in outfile - line_count = 0 - for line in input_maf: - line_count += 1 - if line_count > 1: - break - if line_count < 2: - input_maf.close() - touch_outputs( - output_dir = snakemake.params[0], - seq_type = 
snakemake.wildcards["seq_type"], - genome_build = snakemake.wildcards["genome_build"], - presets = snakemake.params[6], - tumour_id = snakemake.wildcards["tumour_id"], - suffix = snakemake.params[5], - finished_file = snakemake.output[0] - ) - exit() - - # Return to top of MAF - input_maf.seek(0) + print(f"Setting up batch scripts using the following inputs:\n\ + Bam files:\t{input_bam}\n\ + Bai files:\t{input_bai}\n\ + Filtered maf:\t{maf}\n\ + Parameters:\t{snakemake.params[6]}\n\ + Batch options:\t{batch_options}") + + if not isinstance(maf, list): + maf = list(maf) + + empty_mafs = [] + + for m in maf: + # Skip if no variants in outfile + input_maf = open(m, "r") + + line_count = 0 + for line in input_maf: + line_count += 1 + if line_count > 1: + # Return to top of MAF + input_maf.seek(0) + break + if line_count < 2: + input_maf.close() + empty_mafs.append(m) + + if len(empty_mafs) != 0: + if all(m in empty_mafs for m in maf): + touch_outputs( + output_dir = snakemake.params["batch_dir"], + seq_type = snakemake.wildcards["seq_type"], + genome_build = snakemake.wildcards["genome_build"], + presets = snakemake.params["igv_presets"], + sample_id = snakemake.wildcards["sample_id"], + suffix = snakemake.params["suffix"], + finished_file = snakemake.output["batches_finished"] + ) + exit() + for e in empty_mafs: + maf.remove(e) # Read MAF file and create dataframe regions = get_regions_df( - input_maf, + maf, padding=batch_options["padding"] ) - input_maf.close() - # Create the batch scripts generate_igv_batches( regions = regions, bam = input_bam, bai = input_bai, - output_dir = snakemake.params[0], - snapshot_dir = snakemake.params[1], - genome_build = snakemake.params[2], - seq_type = snakemake.params[3], - suffix = snakemake.params[5], - igv_presets = snakemake.params[6], + output_dir = snakemake.params["batch_dir"], + snapshot_dir = snakemake.params["snapshot_dir"], + genome_build = snakemake.params["genome_build"], + seq_type = snakemake.params["seq_type"], + suffix = snakemake.params["suffix"], + igv_presets = snakemake.params["igv_presets"], igv_options = batch_options["igv_options"], max_height = batch_options["max_height"], + tissue_status = snakemake.params["tissue_status"], sleep_timer = batch_options["sleep_timer"] ) @@ -87,11 +103,11 @@ def main(): logging.error(e, exc_info=1) raise -def touch_outputs(output_dir, seq_type, genome_build, presets, tumour_id, suffix, finished_file): - tumour_suffix = tumour_id + suffix + ".batch" +def touch_outputs(output_dir, seq_type, genome_build, presets, sample_id, suffix, finished_file): + sample_suffix = sample_id + suffix + ".batch" for preset in presets: - os.makedirs(os.path.join(output_dir, "--".join([seq_type, genome_build]), preset), exist_ok = True) - merged_batch = os.path.join(output_dir, "merged_batch_scripts", "--".join([seq_type, genome_build]), preset, tumour_suffix) + os.makedirs(os.path.join(output_dir, "merged_batch_scripts", "--".join([seq_type, genome_build]), preset), exist_ok = True) + merged_batch = os.path.join(output_dir, "merged_batch_scripts", "--".join([seq_type, genome_build]), preset, sample_suffix) merged_file = open(merged_batch, "w") merged_file.close() touch_finished = open(finished_file, "w") @@ -99,7 +115,10 @@ def touch_outputs(output_dir, seq_type, genome_build, presets, tumour_id, suffix def get_regions_df(input_maf, padding): # Read MAF as dataframe - maf = pd.read_table(input_maf, comment="#", sep="\t") + if len(input_maf) > 1: + maf = pd.concat([pd.read_table(file, comment="#", sep="\t") for file in 
input_maf]) + else: + maf = pd.read_table(input_maf[0], comment="#", sep="\t") chrom = (maf["Chromosome"].astype(str)).apply(lambda x: x.replace("chr","")) @@ -114,13 +133,20 @@ def get_regions_df(input_maf, padding): {"chromosome": "chr" + chrom, "region": regions, "region_name": maf.Hugo_Symbol, - "sample_id": maf.Tumor_Sample_Barcode, + "tumour_id": maf.Tumor_Sample_Barcode, + "normal_id": maf.Matched_Norm_Sample_Barcode, + "ref_allele": maf.Reference_Allele, + "alt_allele": maf.Tumor_Seq_Allele2, "snapshot_coordinates": snapshot_coordinates, "padding": padding, "pair_status": maf.pair_status } ) + samp_id = snakemake.wildcards["sample_id"] + + assert len(regions_df["normal_id"].drop_duplicates()) == 1, f"More than one normal ID found within the MAF files' `Matched_Norm_Sample_Barcode` column for this sample: {samp_id}. Please double check MAF files: {input_maf}" + return regions_df def output_lines(lines, batch_output): @@ -138,10 +164,10 @@ def generate_igv_batch_per_row(sleep_interval, preset, options, coordinates, dir snapshot_regions_dir = os.path.join(directory, seq_build, child_dir, preset, chrom_directory, "") lines.append(f"snapshotDirectory {snapshot_regions_dir}") + lines.append("collapse") for igv_option in options[preset]: lines.append(igv_option) lines.append(f"setSleepInterval {sleep_interval}") - lines.append("collapse") lines.append(f"snapshot {snapshot_filename}") return lines @@ -151,30 +177,24 @@ def generate_igv_batch_header(bam, index, max_height, genome_build): genome_build = genome_build.replace("grch37","hg19") - assert len(bam) == len(index), "Error while generating batch script: number of .bam files and .bai files are not equal" - - for i in range(0,len(bam)): - lines.append(f"load {bam[i]} index={index[i]}") - + lines.append(f"load {bam} index={index}") lines.append(f"maxPanelHeight {max_height}") lines.append(f"genome {genome_build}") return lines -def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, suffix, igv_presets, igv_options, max_height, sleep_timer=2000): +def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, suffix, igv_presets, igv_options, max_height, tissue_status, sleep_timer=2000): for preset in igv_presets: + + merged_batch_suffix = snakemake.wildcards["sample_id"] + suffix + ".batch" + for _, row in regions.iterrows(): all_lines = [] - merged_batch_suffix = row.sample_id + suffix + ".batch" - header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build) all_lines.extend(header) - if row.pair_status == "matched": - child_directory = "tumour_normal_pair" - elif row.pair_status == "unmatched": - child_directory = "tumour_only" + child_directory = tissue_status seq_type_build = f"{seq_type}--{genome_build}" chrom_dir = row.chromosome @@ -182,10 +202,18 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui filename = [] filename.append(row.region), filename.append(row.region_name) - filename.append(row.sample_id) - batch_filename = "--".join(filename) + suffix + ".batch" - filename = "--".join(filename) + suffix + ".png" + batch_filename = filename.copy() + batch_filename.append(snakemake.wildcards["sample_id"]) + batch_filename = "--".join(batch_filename) + suffix + ".batch" + + snap_filename = filename.copy() + if tissue_status == "tumour": + snap_filename.append(f"{row.ref_allele}_{row.alt_allele}") + elif tissue_status == "normal": + snap_filename.append(f"{row.ref_allele}_{row.ref_allele}") 
+ snap_filename.append(snakemake.wildcards["sample_id"]) + snap_filename = "--".join(snap_filename) + suffix + ".png" lines = generate_igv_batch_per_row( sleep_interval = sleep_timer, @@ -196,7 +224,7 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui child_dir = child_directory, seq_build = seq_type_build, chrom_directory = chrom_dir, - snapshot_filename = filename + snapshot_filename = snap_filename ) all_lines.extend(lines) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 72916b61..52e969ad 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -14,6 +14,7 @@ # Import package with useful functions for developing analysis modules import oncopipe as op import pandas as pd +from datetime import datetime import os # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe @@ -46,11 +47,14 @@ CFG = op.setup_module( CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["grch37"]), "grch37", inplace=True) CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["hg38"]), "hg38", inplace=True) -# Define output file suffix based on config parameters +# Setup variables + SUFFIX = ".pad" + str(CFG["options"]["generate_batch_script"]["padding"]) +if "launch_date" in CFG: + LAUNCH_DATE = CFG["launch_date"] +else: + LAUNCH_DATE = datetime.today().strftime('%Y-%m-%d') -# Assign pair_status_directory value based on pair_status value -PAIR_STATUS_DICT = {"matched": "tumour_normal_pair", "unmatched": "tumour_only"} # Define rules to be run locally when using a compute cluster localrules: @@ -76,21 +80,13 @@ localrules: ##### FUNCTIONS ##### -def get_bams(wildcards): +def get_bam(wildcards): metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{tumour_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.tumour_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + return expand("data/{{seq_type}}_bams/{{sample_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) def get_bai(wildcards): metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{tumour_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.tumour_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) - -def get_normal_bam(wildcards): - metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{normal_sample_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.normal_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) - -def get_normal_bai(wildcards): - metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{normal_sample_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.normal_sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + return expand("data/{{seq_type}}_bams/{{sample_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) def get_maf(wildcards): unix_group = config["unix_group"] @@ -105,11 +101,9 @@ def get_maf(wildcards): rule _igv_symlink_bam: input: - bam = 
get_bams + bam = get_bam output: - bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{tumour_id}.bam" - threads: - CFG["threads"]["_igv_symlink_bam"] + bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{sample_id}.bam" run: op.absolute_symlink(input.bam, output.bam) @@ -117,25 +111,7 @@ rule _igv_symlink_bai: input: bai = get_bai output: - bai = CFG["dirs"]["inputs"] + "bams/{seq_type}/{tumour_id}.bam.bai" - threads: - CFG["threads"]["_igv_symlink_bai"] - run: - op.absolute_symlink(input.bai, output.bai) - -rule _igv_symlink_normal_bam: - input: - bam = get_normal_bam - output: - bam = CFG["dirs"]["inputs"] + "normal_bams/{seq_type}/{normal_sample_id}.bam" - run: - op.absolute_symlink(input.bam, output.bam) - -rule _igv_symlink_normal_bai: - input: - bai = get_normal_bai - output: - bai = CFG["dirs"]["inputs"] + "normal_bams/{seq_type}/{normal_sample_id}.bam.bai" + bai = CFG["dirs"]["inputs"] + "bams/{seq_type}/{sample_id}.bam.bai" run: op.absolute_symlink(input.bai, output.bai) @@ -155,7 +131,7 @@ rule _igv_reduce_maf_cols: maf = temp(CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf.temp") shell: op.as_one_line(""" - cut -f 1,5,6,7,9,10,11,13,16 {input.maf} > {output.maf} + cut -f 1,5,6,7,9,10,11,13,16,17 {input.maf} > {output.maf} """) rule _igv_merge_regions: @@ -164,8 +140,8 @@ rule _igv_merge_regions: output: merged_regions = CFG["dirs"]["inputs"] + "regions/{tool_type}_merged.{tool_build}.tsv" log: - stdout = CFG["logs"]["inputs"] + "merge_{tool_type}_regions.{tool_build}.stdout.log", - stderr = CFG["logs"]["inputs"] + "merge_{tool_type}_regions.{tool_build}.stderr.log" + stdout = CFG["logs"]["inputs"] + "regions/merge_{tool_type}_regions.{tool_build}.stdout.log", + stderr = CFG["logs"]["inputs"] + "regions/merge_{tool_type}_regions.{tool_build}.stderr.log" run: merged_df = pd.DataFrame() for result in input.input_regions: @@ -175,7 +151,7 @@ rule _igv_merge_regions: except: with open(log.stdout, "a") as header: header.write(f"Error reading or merging file {result}\n") - merged_df.to_csv(output.merged_regions, sep="\t", index=False) + merged_df.to_csv(output.merged_regions, sep="\t", na_rep="NA", index=False) # Convert input regions file into BED format rule _igv_format_regions: @@ -188,8 +164,8 @@ rule _igv_format_regions: oncodriveclustl_params = CFG["options"]["filter_maf"]["oncodriveclustl_options"], regions_build = lambda w: w.tool_build log: - stdout = CFG["logs"]["inputs"] + "format_regions_{tool_type}.{tool_build}.stdout.log", - stderr = CFG["logs"]["inputs"] + "format_regions_{tool_type}.{tool_build}.stderr.log" + stdout = CFG["logs"]["inputs"] + "regions/format_regions_{tool_type}.{tool_build}.stdout.log", + stderr = CFG["logs"]["inputs"] + "regions/format_regions_{tool_type}.{tool_build}.stderr.log" script: config["lcr-modules"]["igv"]["scripts"]["format_regions"] @@ -218,8 +194,8 @@ rule _igv_liftover_regions: resources: **CFG["resources"]["_igv_liftover_regions"] log: - stdout = CFG["logs"]["inputs"] + "liftover_regions_{tool_type}.{tool_build}To{genome_build}.stdout.log", - stderr = CFG["logs"]["inputs"] + "liftover_regions_{tool_type}.{tool_build}To{genome_build}.stderr.log" + stdout = CFG["logs"]["inputs"] + "regions/liftover_regions_{tool_type}.{tool_build}To{genome_build}.stdout.log", + stderr = CFG["logs"]["inputs"] + "regions/liftover_regions_{tool_type}.{tool_build}To{genome_build}.stderr.log" shell: op.as_one_line(""" {input.liftover_script} @@ -231,13 +207,10 @@ rule _igv_liftover_regions: def 
_get_lifted_regions(wildcards): CFG = config["lcr-modules"]["igv"] return expand( - expand( - str(rules._igv_liftover_regions.output.regions), - tool_type = list(CFG["regions"]), - tool_build = ["grch37","hg38"], - allow_missing = True - ), - genome_build = wildcards.genome_build + str(rules._igv_liftover_regions.output.regions), + tool_type = list(CFG["regions"]), + tool_build = ["grch37","hg38"], + allow_missing = True ) rule _igv_merge_lifted_regions: @@ -255,7 +228,7 @@ rule _igv_merge_lifted_regions: except: print(f"Lifted regions file is empty: {region}") merged_df = merged_df.drop_duplicates() - merged_df.to_csv(output.regions, sep="\t", index=False, header=False) + merged_df.to_csv(output.regions, sep="\t", na_rep="NA", index=False, header=False) # Filter MAF to lines containing positions of interest rule _igv_filter_maf: @@ -268,536 +241,581 @@ rule _igv_filter_maf: regions_format = "bed", metadata = CFG["runs"] log: - stdout = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}/filter_maf.stdout.log", - stderr = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}/filter_maf.stderr.log" + stdout = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.stdout.log", + stderr = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.stderr.log" wildcard_constraints: seq_type = "[a-zA-Z]+" script: config["lcr-modules"]["igv"]["scripts"]["filter_script"] -if CFG["estimate_only"] == False and CFG["identify_failed_snaps"]==False: - - def _get_maf(wildcards): - CFG = config["lcr-modules"]["igv"] +def _get_maf(wildcards): + CFG = config["lcr-modules"]["igv"] - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) - genome_build = this_sample["tumour_genome_build"] + if wildcards.sample_id in list(CFG["runs"]["tumour_sample_id"]): + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] - return ( - expand( - str(rules._igv_filter_maf.output.maf), - zip, - seq_type = wildcards.seq_type, - genome_build = genome_build, - tumour_id = wildcards.tumour_id, - normal_sample_id = normal_sample_id, - pair_status = pair_status - ) + return expand( + str(rules._igv_filter_maf.output.maf), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + tumour_id = wildcards.sample_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status ) - def _get_bam_files(wildcards): - CFG = config["lcr-modules"]["igv"] + if wildcards.sample_id in list(CFG["runs"]["normal_sample_id"]): + these_samples = op.filter_samples(CFG["runs"], normal_sample_id = wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build, pair_status = "matched") - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) - normal_sample_id = this_sample["normal_sample_id"] - pair_status = this_sample["pair_status"] + assert sorted(these_samples["tumour_genome_build"].drop_duplicates()) == sorted(these_samples["normal_genome_build"].drop_duplicates()), f"Different genome builds between normal ID and tumour ID for 
{wildcards.sample_id}" - if pair_status.item() == "matched": - tumour_bam_file = expand(str(rules._igv_symlink_bam.output.bam), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id)[0] - normal_bam_file = expand(str(rules._igv_symlink_normal_bam.output.bam), zip, seq_type=wildcards.seq_type, normal_sample_id=normal_sample_id)[0] - return([tumour_bam_file, normal_bam_file]) + return expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = these_samples["tumour_seq_type"], + genome_build = these_samples["tumour_genome_build"], + tumour_id = these_samples["tumour_sample_id"], + normal_sample_id = these_samples["normal_sample_id"], + pair_status = these_samples["pair_status"] + ) - if pair_status.item() == "unmatched": - return ( - expand(str(rules._igv_symlink_bam.output.bam), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id) - ) +checkpoint _igv_create_batch_script_per_variant: + input: + bam_file = str(rules._igv_symlink_bam.output.bam), + bai_file = str(rules._igv_symlink_bai.output.bai), + filter_maf = _get_maf + output: + finished = CFG["dirs"]["batch_scripts"] + "completed/{seq_type}--{genome_build}/{sample_id}.finished", + variant_batch = expand(CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{{seq_type}}--{{genome_build}}/{preset}/{{sample_id}}" + SUFFIX + ".batch", preset = CFG["options"]["igv_presets"], allow_missing=True) + params: + tissue_status = lambda w: "normal" if w.sample_id in list(config["lcr-modules"]["igv"]["runs"]["normal_sample_id"]) else "tumour" if w.sample_id in list(config["lcr-modules"]["igv"]["runs"]["tumour_sample_id"]) else "unknown", + batch_dir = CFG["dirs"]["batch_scripts"], + snapshot_dir = CFG["dirs"]["snapshots"], + genome_build = "{genome_build}", + seq_type = "{seq_type}", + batch_options = CFG["options"]["generate_batch_script"], + suffix = SUFFIX, + igv_presets = CFG["options"]["igv_presets"] + log: + stdout = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}/{sample_id}" + SUFFIX + ".stdout.log", + stderr = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}/{sample_id}" + SUFFIX + ".stderr.log" + script: + config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] - def _get_bai_files(wildcards): - CFG = config["lcr-modules"]["igv"] + +rule _igv_batches_to_merge: + input: + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch" + output: + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch" + params: + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}" + SUFFIX + ".batch", + igv_options = lambda w: config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"][w.preset] + threads: (workflow.cores / 10) + run: + batch_script_path = os.path.abspath(input.batch_script) + output_file = os.path.abspath(params.merged_batch) - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.tumour_id, tumour_seq_type=wildcards.seq_type) - normal_sample_id = this_sample["normal_sample_id"] - pair_status = this_sample["pair_status"] + batch_script = open(batch_script_path, "r") - if pair_status.item() == "matched": - tumour_bai_file = expand(str(rules._igv_symlink_bai.output.bai), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id)[0] 
- normal_bai_file = expand(str(rules._igv_symlink_normal_bai.output.bai), zip, seq_type=wildcards.seq_type, normal_sample_id=normal_sample_id)[0] - return([tumour_bai_file, normal_bai_file]) - if pair_status.item() == "unmatched": - return( - expand(str(rules._igv_symlink_bai.output.bai), zip, seq_type=wildcards.seq_type, tumour_id=wildcards.tumour_id) - ) + with open(output_file, "r") as f: + merged_lines = len(f.readlines()) - # Create batch scripts for each variant - checkpoint _igv_create_batch_script_per_variant: - input: - bam_file = _get_bam_files, - bai_file = _get_bai_files, - filter_maf = _get_maf - output: - batches_finished = CFG["dirs"]["batch_scripts"] + "completed/{seq_type}--{genome_build}/{tumour_id}.finished", - variant_batch = expand(CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{{seq_type}}--{{genome_build}}/{preset}/{{tumour_id}}" + SUFFIX + ".batch", preset = CFG["options"]["igv_presets"], allow_missing=True), - params: - batch_dir = config["lcr-modules"]["igv"]["dirs"]["batch_scripts"], - snapshot_dir = config["lcr-modules"]["igv"]["dirs"]["snapshots"], - genome_build = lambda w: w.genome_build, - seq_type = lambda w: w.seq_type, - batch_options = config["lcr-modules"]["igv"]["options"]["generate_batch_script"], - suffix = SUFFIX, - igv_presets = config["lcr-modules"]["igv"]["options"]["igv_presets"] - log: - stdout = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stdout.log", - stderr = CFG["logs"]["batch_scripts"] + "_igv_create_batch_script_per_variant/{seq_type}--{genome_build}/{tumour_id}" + SUFFIX + ".stderr.log" - script: - config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] - - # Keep track of which variant and sample_id combinations have been seen, merge individual variant batch scripts into a large batch script per sample_id - rule _igv_batches_to_merge: - input: - batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" - output: - dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch" - params: - merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{tumour_id}" + SUFFIX + ".batch", - igv_options = lambda w: config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"][w.preset] - threads: (workflow.cores / 10) - run: - batch_script_path = os.path.abspath(input.batch_script) - output_file = os.path.abspath(params.merged_batch) - - batch_script = open(batch_script_path, "r") - - with open(output_file, "r") as f: - merged_lines = len(f.readlines()) - - with open(output_file, "a") as handle: - for line in batch_script: - if merged_lines > 0: - if line.startswith(("load","maxPanelHeight","genome", "setSleepInterval", "collapse")): - continue - if line.startswith(tuple(params.igv_options)) and not line.startswith("sort"): - continue - handle.write(line) - batch_script.close() - - output_touch = open(output.dispatched_batch_script, "w") - output_touch.close() - - # Return list of all batch scripts that were created from the filtered maf and merged - def _evaluate_batches(wildcards): - CFG = config["lcr-modules"]["igv"] - checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch + with 
open(output_file, "a") as handle: + for line in batch_script: + if merged_lines > 0: + if line.startswith(("load", "maxPanelHeight", "genome","setSleepInterval", "collapse")): + continue + if line.startswith(tuple(params.igv_options)) and not line.startswith("sort"): + continue + handle.write(line) - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_genome_build = wildcards.genome_build, tumour_seq_type = wildcards.seq_type) + batch_script.close() + output_touch = open(output.dispatched_batch_script, "w") + output_touch.close() + +def _evaluate_batches(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch + + if wildcards.sample_id in list(CFG["runs"]["tumour_sample_id"]): + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] maf = expand( - str(rules._igv_filter_maf.output.maf), - zip, - seq_type=wildcards.seq_type, - genome_build=wildcards.genome_build, - tumour_id=wildcards.tumour_id, - normal_sample_id=normal_sample_id, - pair_status=pair_status + str(rules._igv_filter_maf.output.maf), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + tumour_id = wildcards.sample_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status ) - if os.path.exists(maf[0]): - maf_table = pd.read_table(maf[0], comment="#", sep="\t") - - return expand( - expand( - str(rules._igv_batches_to_merge.output.dispatched_batch_script), - zip, - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"], - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"], - allow_missing = True - ), - preset = wildcards.preset - ) - else: - return [] - - rule _igv_download_igv: - output: - igv_zip = CFG["dirs"]["igv"] + "IGV_2.7.2.zip", - igv_installed = CFG["dirs"]["igv"] + "igv_2.7.2.installed" - conda: - CFG["conda_envs"]["wget"] - log: - stdout = CFG["logs"]["igv"] + "download_igv.stdout.log", - stderr = CFG["logs"]["igv"] + "download_igv.stderr.log" - shell: - op.as_one_line(""" - wget -O {output.igv_zip} https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip && - unzip {output.igv_zip} -d $(dirname {output.igv_zip}) > {log.stdout} 2> {log.stderr} && - touch {output.igv_installed} - """) - - # Run IGV once all individual variant batch scripts have been merged into one script per sample_id - checkpoint _igv_run: - input: - igv = str(rules._igv_download_igv.output.igv_installed), - batch_script = _evaluate_batches, - finished_batches = str(rules._igv_create_batch_script_per_variant.output.batches_finished) - output: - complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}--{tumour_id}--{preset}.completed" - params: - merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{tumour_id}" + SUFFIX + ".batch", - igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", - max_time = CFG["options"]["generate_batch_script"]["sleep_timer"], - server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", - server_args = 
CFG["options"]["xvfb_parameters"]["server_args"] - resources: - **CFG["resources"]["_igv_run"] - log: - stdout = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{preset}/{tumour_id}_igv_run.stdout.log", - stderr = CFG["logs"]["igv"] + "{seq_type}--{genome_build}/{preset}/{tumour_id}_igv_run.stderr.log" - threads: (workflow.cores) - shell: - op.as_one_line(""" - lines=$(wc -l < {params.merged_batch}) ; - if [ $lines -gt 0 ] ; - then - if ! grep -q -e "exit" {params.merged_batch} ; - then - echo 'exit' >> {params.merged_batch} ; - fi ; - maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15)) ; - timeout $maxtime xvfb-run -s "-screen 0 1920x1080x24" {params.server_number} {params.server_args} {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; - exit=$? ; - if [ $exit -ne 0 ] ; - then - if grep -q -e "No such process" {log.stderr} && grep -q -e "Executing Command: exit" {log.stdout} ; - then - echo "All IGV batch script commands have completed succesfully, but an Xvfb-run kill error has occurred." >> {log.stdout} && touch {output.complete} ; - else - false ; - fi ; - fi ; - else - echo 'Skipping sample {wildcards.tumour_id} because it either has no variants to snapshot or all variants have been already been snapshot.' ; - touch {output.complete} ; - fi - """) - - rule _igv_track_failed: - output: - failed_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.txt", - ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.completed") - run: - header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","preset","status","snapshot_path"]) - with open(output.failed_summary, "w") as handle: - handle.write(header + "\n") - ready = open(output.ready, "w") - ready.close() - - rule _igv_quality_control: - input: - igv = str(rules._igv_run.output.complete), - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png", - failed_summary = str(rules._igv_track_failed.output.ready) - output: - snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".qc") - params: - batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch", - merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{tumour_id}" + SUFFIX + ".batch", - igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", - server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", - server_args = CFG["options"]["xvfb_parameters"]["server_args"], - batch_temp = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".batch.temp", - failed_summary = str(rules._igv_track_failed.output.failed_summary), - thresholds = CFG["options"]["quality_control"] - resources: - **CFG["resources"]["_igv_quality_control"] - log: - stdout = CFG["logs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stdout.log", - stderr = CFG["logs"]["snapshots"] + 
"{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + "_quality_control.stderr.log" - threads: (workflow.cores) - script: - CFG["scripts"]["quality_control"] - - # Symlinks the final output files into the module results directory (under '99-outputs/') - rule _igv_symlink_snapshot: - input: - snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png", - snapshot_qc = str(rules._igv_quality_control.output.snapshot_qc) - output: - snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{tumour_id}" + SUFFIX + ".png" - threads: - CFG["threads"]["_igv_symlink_snapshot"] - run: - op.relative_symlink(input.snapshot, output.snapshot) - - # Return a list of all snapshots that were taken during IGV for each specific tumour_id, tumour_seq_type, and tumour_genome_build combination - def _symlink_snapshot(wildcards): - CFG = config["lcr-modules"]["igv"] - checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + maf_table = pd.read_table(maf[0], comment = "#", sep="\t") - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + if wildcards.sample_id in list(CFG["runs"]["normal_sample_id"]): + these_samples = op.filter_samples(CFG["runs"], normal_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build, pair_status="matched") # only get MAFs from matched tumour + normal combos + mafs = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = these_samples["tumour_seq_type"], + genome_build = these_samples["tumour_genome_build"], + tumour_id = these_samples["tumour_sample_id"], + normal_sample_id = these_samples["normal_sample_id"], + pair_status = these_samples["pair_status"] + ) + + maf_table = pd.concat([pd.read_table(m, comment="#", sep="\t") for m in mafs]) + + return expand( + expand( + str(rules._igv_batches_to_merge.output.dispatched_batch_script), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + allow_missing = True + ), + preset = wildcards.preset, + allow_missing = True + ) + +rule _igv_download_igv: + output: + igv_zip = CFG["dirs"]["igv"] + "IGV_2.7.2.zip", + igv_installed = CFG["dirs"]["igv"] + "igv_2.7.2.installed" + conda: + CFG["conda_envs"]["wget"] + log: + stdout = CFG["logs"]["igv"] + "download/igv_download.stdout.log", + stderr = CFG["logs"]["igv"] + "download/igv_download.stderr.log" + shell: + op.as_one_line(""" + wget -O {output.igv_zip} https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip && + unzip {output.igv_zip} -d $(dirname {output.igv_zip}) > {log.stdout} 2> {log.stderr} && + touch {output.igv_installed} + """) + +checkpoint _igv_run: + input: + igv = str(rules._igv_download_igv.output.igv_installed), + finished_batches = str(rules._igv_create_batch_script_per_variant.output.finished), + batch_script = _evaluate_batches + output: + complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}/{preset}/{sample_id}.completed" + params: + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}" + SUFFIX + ".batch", + igv 
= CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", + sleep_time = CFG["options"]["generate_batch_script"]["sleep_timer"], + server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", + server_args = CFG["options"]["xvfb_parameters"]["server_args"] + resources: + **CFG["resources"]["_igv_run"] + threads: (workflow.cores) + log: + stdout = CFG["logs"]["igv"] + "igv_run/{seq_type}--{genome_build}/{preset}/{sample_id}.stdout.log", + stderr = CFG["logs"]["igv"] + "igv_run/{seq_type}--{genome_build}/{preset}/{sample_id}.stderr.log" + shell: + op.as_one_line(""" + lines=$(wc -l < {params.merged_batch}) ; + if [ $lines -gt 0 ] ; + then + if ! grep -q -e "exit" {params.merged_batch} ; + then + echo 'exit' >> {params.merged_batch} ; + fi ; + maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15 + {params.sleep_time})) ; + timeout $maxtime xvfb-run -s "-screen 0 1920x1080x24" {params.server_number} {params.server_args} {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; + exit=$? ; + if [ $exit -ne 0 ] ; + then + if grep -q -e "No such process" {log.stderr} && grep -q -e "Executing Command: exit" {log.stdout} ; + then + echo "All IGV batch script commands have completed succesfully, but an Xvfb-run kill error has occurred." >> {log.stdout} && touch {output.complete} ; + else + false ; + fi ; + fi ; + else + echo 'Skipping sample {wildcards.sample_id} because it either has no variants to snapshot or all variants have already been snapshot.' >> {log.stdout} ; + touch {output.complete} ; + fi + """) + +rule _igv_track_failed: + output: + failed_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary_" + LAUNCH_DATE + ".txt" + run: + header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","preset","status","snapshot_path"]) + with open(output.failed_summary, "w") as handle: + handle.write(header + "\n") + ready = open(output.ready, "w") + ready.close() + +rule _igv_quality_control: + input: + igv = str(rules._igv_run.output.complete), #"completed/{seq_type}--{genome_build}/{preset}/{sample_id}.completed" + failed_summary = str(rules._igv_track_failed.output.failed_summary), + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png" + output: + snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".qc") + params: + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch", + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}" + SUFFIX + ".batch", + igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", + server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", + server_args = CFG["options"]["xvfb_parameters"]["server_args"], + batch_temp = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch.temp", + thresholds = 
CFG["options"]["quality_control"] + resources: + **CFG["resources"]["_igv_quality_control"] + threads: (workflow.cores) + log: + stdout = CFG["logs"]["snapshots"] + "quality_control/{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".stdout.log", + stderr = CFG["logs"]["snapshots"] + "quality_control/{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".stderr.log" + script: + config["lcr-modules"]["igv"]["scripts"]["quality_control"] + +rule _igv_symlink_snapshot: + input: + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png", + snapshot_qc = str(rules._igv_quality_control.output.snapshot_qc) + output: + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png" + run: + op.relative_symlink(input.snapshot, output.snapshot) + +def _symlink_snapshot(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + + if wildcards.sample_id in list(CFG["runs"]["tumour_sample_id"]): + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] maf = expand( - str(rules._igv_filter_maf.output.maf), - zip, - seq_type=wildcards.seq_type, - genome_build=wildcards.genome_build, - tumour_id=wildcards.tumour_id, - normal_sample_id=normal_sample_id, - pair_status=pair_status + str(rules._igv_filter_maf.output.maf), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + tumour_id = wildcards.sample_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status ) - if os.path.exists(maf[0]): - maf_table = pd.read_table(maf[0], comment="#", sep="\t") - - return expand( - expand( - str(rules._igv_symlink_snapshot.output.snapshot), - zip, - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"], - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"], - allow_missing = True - ), - pair_status_directory = PAIR_STATUS_DICT[pair_status.item()], - preset = wildcards.preset - ) - else: - return [] - - def _quality_control(wildcards): - CFG = config["lcr-modules"]["igv"] + maf_table = pd.read_table(maf[0], comment = "#", sep="\t") - checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + tumour_snaps = expand( + expand( + str(rules._igv_symlink_snapshot.output.snapshot), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + ref_allele = maf_table["Reference_Allele"], + alt_allele = maf_table["Tumor_Seq_Allele2"], + sample_id = maf_table["Tumor_Sample_Barcode"], + allow_missing = True + ), + genome_build = wildcards.genome_build, + seq_type = wildcards.seq_type, + preset = wildcards.preset, + tissue_status = "tumour" + ) - this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id, tumour_seq_type = 
wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + return tumour_snaps + if wildcards.sample_id in list(CFG["runs"]["normal_sample_id"]): + these_samples = op.filter_samples(CFG["runs"], normal_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build, pair_status="matched") + + mafs = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = these_samples["tumour_seq_type"], + genome_build = these_samples["tumour_genome_build"], + tumour_id = these_samples["tumour_sample_id"], + normal_sample_id = these_samples["normal_sample_id"], + pair_status = these_samples["pair_status"] + ) + + maf_table = pd.concat([pd.read_table(m, comment="#", sep="\t") for m in mafs]) + + normal_snaps = expand( + expand( + str(rules._igv_symlink_snapshot.output.snapshot), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + ref_allele = maf_table["Reference_Allele"], + alt_allele = maf_table["Reference_Allele"], + sample_id = maf_table["Matched_Norm_Sample_Barcode"], + allow_missing = True + ), + genome_build = wildcards.genome_build, + seq_type = wildcards.seq_type, + preset = wildcards.preset, + tissue_status = "normal" + ) + + return normal_snaps + +def _quality_control(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + + if wildcards.sample_id in list(CFG["runs"]["tumour_sample_id"]): + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) normal_sample_id = this_sample["normal_sample_id"] pair_status = this_sample["pair_status"] maf = expand( str(rules._igv_filter_maf.output.maf), - zip, seq_type = wildcards.seq_type, genome_build = wildcards.genome_build, - tumour_id = wildcards.tumour_id, + tumour_id = wildcards.sample_id, normal_sample_id = normal_sample_id, pair_status = pair_status ) - if os.path.exists(maf[0]): - maf_table = pd.read_table(maf[0], comment="#", sep="\t") - - return expand( - expand( - str(rules._igv_quality_control.output.snapshot_qc), - zip, - seq_type = maf_table["seq_type"], - genome_build = maf_table["genome_build"], - chromosome = maf_table["chr_std"], - start_position = maf_table["Start_Position"], - gene = maf_table["Hugo_Symbol"], - tumour_id = maf_table["Tumor_Sample_Barcode"], - allow_missing = True - ), - pair_status_directory = PAIR_STATUS_DICT[pair_status.item()], - preset = wildcards.preset - ) - - # Check that snapshots have been symlinked and quality controlled - rule _igv_check_snapshots: - input: - igv_completed = str(rules._igv_run.output.complete), - snapshots = _symlink_snapshot, - quality_control = _quality_control - output: - snapshots = CFG["dirs"]["outputs"] + "completed/{preset}/{seq_type}--{genome_build}--{tumour_id}.completed" - shell: - "touch {output.snapshots}" - -if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: - - rule _igv_touch_summary: - output: - snapshot_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/snapshot_summary.txt", - snapshot_estimate = CFG["dirs"]["outputs"] + "snapshot_estimates/snapshot_estimate.txt", - summary_ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/touch_summary.completed") - run: - header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position", "igv_preset"]) - with open(output.snapshot_summary,"w") as handle: - 
handle.write(header + "\n") - with open(output.snapshot_estimate, "w") as handle: - handle.write(header + "\n") - ready = open(output.summary_ready, "w") - ready.close() - - rule _igv_estimate_snapshots: - input: - maf = str(rules._igv_filter_maf.output.maf), - summary_file_ready = str(rules._igv_touch_summary.output.summary_ready) - output: - estimate_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_batch_scripts/{seq_type}--{genome_build}/{preset}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") - params: - snapshot_summary = str(rules._igv_touch_summary.output.snapshot_summary), - snapshot_estimate = str(rules._igv_touch_summary.output.snapshot_estimate) - threads: (workflow.cores) - run: - CFG = config["lcr-modules"]["igv"] - maf_table = pd.read_table(input.maf, sep="\t", comment="#") - - seq_type = wildcards.seq_type - genome_build = wildcards.genome_build - tumour_id = wildcards.tumour_id - preset = wildcards.preset - - snapshot_summary = open(params.snapshot_summary, "a") - snapshot_estimate = open(params.snapshot_estimate, "a") - - for index, row in maf_table.iterrows(): - gene = row["Hugo_Symbol"] - chromosome = row["chr_std"] - position = str(row["Start_Position"]) - - dispatch_path = CFG["dirs"]["batch_scripts"] + f"dispatched_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".batch" - - outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, position, preset]) - snapshot_summary.write(outline + "\n") - - if not os.path.exists(dispatch_path): - snapshot_estimate.write(outline + "\n") - - finished = open(output.estimate_finished, "w") - finished.close() - -if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: - rule _igv_touch_failed: - output: - failed_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary.txt", - failed_ready = temp(CFG["dirs"]["outputs"] + "snapshot_estimates/touch_failed.completed") - run: - header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","preset","snapshot_path"]) - with open(output.failed_summary, "w") as handle: - handle.write(header + "\n") - ready = open(output.failed_ready, "w") - ready.close() - - rule _igv_find_failed: - input: - maf = str(rules._igv_filter_maf.output.maf), - failed_ready = str(rules._igv_touch_failed.output.failed_ready) - output: - failed_finished = temp(CFG["dirs"]["batch_scripts"] + "estimate_failed_scripts/{seq_type}--{genome_build}/{preset}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") - params: - failed_summary = str(rules._igv_touch_failed.output.failed_summary) - threads: (workflow.cores) - run: - import subprocess - CFG = config["lcr-modules"]["igv"] - maf_table = pd.read_table(input.maf, sep="\t", comment="#") - - seq_type = wildcards.seq_type - genome_build = wildcards.genome_build - tumour_id = wildcards.tumour_id - preset = wildcards.preset - pair_status_directory = PAIR_STATUS_DICT[wildcards.pair_status] - - for index, row in maf_table.iterrows(): - gene = row["Hugo_Symbol"] - chromosome = row["chr_std"] - position = str(row["Start_Position"]) - - snapshot = CFG["dirs"]["snapshots"] + f"{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".png" - snapshot_symlink = CFG["dirs"]["outputs"] + f"{seq_type}--{genome_build}/{pair_status_directory}/{preset}/{chromosome}/{chromosome}:{position}--{gene}--{tumour_id}" + SUFFIX + ".png" - - success = True - if not 
os.path.exists(snapshot): - print(f"{snapshot} doesn't exist yet, skipping...") - if os.path.exists(snapshot): - if not os.path.exists(snapshot_symlink): - success = False - if success is True: - try: - height = str(subprocess.check_output(f"identify -format '%h' {snapshot}", shell=True)).split("'")[1].split("\\n")[0] - width = str(subprocess.check_output(f"identify -format '%w' {snapshot}", shell=True)).split("'")[1].split("\\n")[0] - print(f"Height is {height} for {tumour_id} {gene} {chromosome}:{position}") - except: - success = False - if success is True and height in ["547","559"]: - kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f'identify -verbose {snapshot} | grep -E "kurtosis|skewness" | tail -n 2', shell=True)).replace("\\n'","").split("\\n ")] - print(f"Kurtosis is {kurtosis} for {tumour_id} {gene} {chromosome}:{position}") - blank_kurtosis = {"547": 18.5, "559": 18.2} - blank_skew = -4 - if kurtosis > blank_kurtosis[height] and skewness < blank_skew: - success = False - print(f"Success value is {success} for {tumour_id} {gene} {chromosome}:{position}") - if success is False: - with open(params.failed_summary, "a") as handle: - print("Writing line to failed file") - outline = "\t".join([tumour_id, seq_type, genome_build, gene, chromosome, position, preset, snapshot]) - handle.write(outline + "\n") - - finished = open(output.failed_finished, "w") - finished.close() - -# Generates the target sentinels for each run, which generate the symlinks -if CFG["estimate_only"] is False and CFG["identify_failed_snaps"] is False: + maf_table = pd.read_table(maf[0], comment = "#", sep="\t") + + tumour_snaps = expand( + expand( + str(rules._igv_quality_control.output.snapshot_qc), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + ref_allele = maf_table["Reference_Allele"], + alt_allele = maf_table["Tumor_Seq_Allele2"], + sample_id = maf_table["Tumor_Sample_Barcode"], + allow_missing = True + ), + genome_build = wildcards.genome_build, + seq_type = wildcards.seq_type, + preset = wildcards.preset, + tissue_status = "tumour" + ) + + return tumour_snaps + + if wildcards.sample_id in list(CFG["runs"]["normal_sample_id"]): + these_samples = op.filter_samples(CFG["runs"], normal_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build, pair_status="matched") + + mafs = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = these_samples["tumour_seq_type"], + genome_build = these_samples["tumour_genome_build"], + tumour_id = these_samples["tumour_sample_id"], + normal_sample_id = these_samples["normal_sample_id"], + pair_status = these_samples["pair_status"] + ) + + maf_table = pd.concat([pd.read_table(m, comment="#", sep="\t") for m in mafs]) + + normal_snaps = expand( + expand( + str(rules._igv_quality_control.output.snapshot_qc), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + ref_allele = maf_table["Reference_Allele"], + alt_allele = maf_table["Reference_Allele"], + sample_id = maf_table["Matched_Norm_Sample_Barcode"], + allow_missing = True + ), + genome_build = wildcards.genome_build, + seq_type = wildcards.seq_type, + preset = wildcards.preset, + tissue_status = "normal" + ) + + return normal_snaps + +rule _igv_check_snapshots: + input: + snapshots = _symlink_snapshot, + quality_control = _quality_control + output: + complete 
= CFG["dirs"]["outputs"] + "completed/check_snapshots/{preset}/{seq_type}--{genome_build}/{sample_id}.checked" + shell: + "touch {output.complete}" + +def _get_finished_samples(wildcards): + if wildcards.pair_status == "matched": + tumour_complete = ancient( + expand( + str(rules._igv_check_snapshots.output.complete), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + sample_id = wildcards.tumour_id, + preset = wildcards.preset + ) + ) + + normal_complete = ancient( + expand( + str(rules._igv_check_snapshots.output.complete), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + sample_id = wildcards.normal_sample_id, + preset = wildcards.preset + ) + ) + + return (tumour_complete + normal_complete) + + else: + return ancient( + expand( + str(rules._igv_check_snapshots.output.complete), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + sample_id = wildcards.tumour_id, + preset = wildcards.preset + ) + ) + +rule _igv_check_samples: + input: + igv_completed = _get_finished_samples, + output: + checked = CFG["dirs"]["outputs"] + "completed/check_samples/{preset}/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.completed" + shell: + "touch {output.checked}" + +##### Rules below will only run if CFG["estimate_only"] set to True + +rule _igv_setup_estimates: + output: + snapshot_summary = CFG["dirs"]["outputs"] + "snapshot_summaries/estimates/snapshot_summary_" + LAUNCH_DATE + ".txt", + snapshot_estimate = CFG["dirs"]["outputs"] + "snapshot_summaries/estimates/snapshot_estimate_" + LAUNCH_DATE + ".txt" + run: + header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position", "igv_preset"]) + with open(output.snapshot_summary,"w") as handle: + handle.write(header + "\n") + with open(output.snapshot_estimate, "w") as handle: + handle.write(header + "\n") + +rule _igv_estimate_snapshots: + input: + maf = _get_maf, + snapshot_summary = str(rules._igv_setup_estimates.output.snapshot_summary), + snapshot_estimate = str(rules._igv_setup_estimates.output.snapshot_estimate) + output: + complete = temp(CFG["dirs"]["batch_scripts"] + "estimate_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}.temp") + threads: (workflow.cores) + run: + CFG = config["lcr-modules"]["igv"] + snapshot_summary = open(input.snapshot_summary, "a") + snapshot_estimate = open(input.snapshot_estimate, "a") + + maf_table = pd.concat([pd.read_table(file, sep="\t", comment="#") for file in input.maf]) + + seq_type = wildcards.seq_type + genome_build = wildcards.genome_build + sample_id = wildcards.sample_id + preset = wildcards.preset + + for index, row in maf_table.iterrows(): + gene = row["Hugo_Symbol"] + chromosome = row["Chromosome"] + start_position = str(row["Start_Position"]) + + dispatch_path = CFG["dirs"]["batch_scripts"] + f"dispatched_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch" + + outline = "\t".join([sample_id, seq_type, genome_build, gene,chromosome, start_position, preset]) + snapshot_summary.write(outline + "\n") + + if not os.path.exists(dispatch_path): + snapshot_estimate.write(outline + "\n") + + finished = open(output.complete, "w") + finished.close() + snapshot_summary.close() + snapshot_estimate.close() + +def _check_estimates(wildcards): + if wildcards.pair_status == "matched": + tumour_complete = ancient( + expand( + str(rules._igv_estimate_snapshots.output.complete), + seq_type = wildcards.seq_type, + 
genome_build = wildcards.genome_build, + sample_id = wildcards.tumour_id, + preset = wildcards.preset + ) + ) + + normal_complete = ancient( + expand( + str(rules._igv_estimate_snapshots.output.complete), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + sample_id = wildcards.normal_sample_id, + preset = wildcards.preset + ) + ) + + return (tumour_complete + normal_complete) + + else: + return ancient( + expand( + str(rules._igv_estimate_snapshots.output.complete), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + sample_id = wildcards.tumour_id, + preset = wildcards.preset + ) + ) + +rule _igv_check_estimates: + input: + estimates_completed = _check_estimates + output: + sample_estimated = temp(CFG["dirs"]["outputs"] + "snapshot_summaries/temp/{seq_type}--{genome_build}/{preset}/{tumour_id}--{normal_sample_id}--{pair_status}.temp") + shell: + "touch {output.sample_estimated}" + +if CFG["estimate_only"] is False: rule _igv_all: input: expand( expand( [ - str(rules._igv_run.output.complete), - str(rules._igv_check_snapshots.output.snapshots) + str(rules._igv_check_samples.output.checked) ], zip, tumour_id=CFG["runs"]["tumour_sample_id"], + normal_sample_id=CFG["runs"]["normal_sample_id"], seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], + pair_status = CFG["runs"]["pair_status"], allow_missing=True ), preset=CFG["options"]["igv_presets"] ) -if CFG["estimate_only"] is True and CFG["identify_failed_snaps"] is False: - rule _igv_all: - input: - expand( - expand( - str(rules._igv_estimate_snapshots.output.estimate_finished), - zip, - seq_type=CFG["runs"]["tumour_seq_type"], - genome_build=CFG["runs"]["tumour_genome_build"], - tumour_id=CFG["runs"]["tumour_sample_id"], - normal_sample_id=CFG["runs"]["normal_sample_id"], - pair_status=CFG["runs"]["pair_status"], - allow_missing=True - ), - preset=CFG["options"]["igv_presets"] - ) -if CFG["identify_failed_snaps"] is True and CFG["estimate_only"] is False: +if CFG["estimate_only"] is True: rule _igv_all: input: expand( expand( - str(rules._igv_find_failed.output.failed_finished), + str(rules._igv_check_estimates.output.sample_estimated), zip, - seq_type=CFG["runs"]["tumour_seq_type"], - genome_build=CFG["runs"]["tumour_genome_build"], tumour_id=CFG["runs"]["tumour_sample_id"], normal_sample_id=CFG["runs"]["normal_sample_id"], + seq_type = CFG["runs"]["tumour_seq_type"], + genome_build = CFG["runs"]["tumour_genome_build"], pair_status=CFG["runs"]["pair_status"], allow_missing=True ), diff --git a/modules/igv/CHANGELOG.md b/modules/igv/CHANGELOG.md index 24815232..31639933 100644 --- a/modules/igv/CHANGELOG.md +++ b/modules/igv/CHANGELOG.md @@ -9,4 +9,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 This release was authored by Manuela Cruz. -- No module design decisions explained here yet. +- This module requires four file types: + * Regions file containing desired regions to be snapshot in BED format, MAF format, mutation_id format in which mutations are in "{chromosome}:{start_position}:{end_position}" format or OncodriveCLUSTL / HotMAPS results files. HotMAPS results must be preformatted using a script that is executed in the last step of the HotMAPS lcr-module, `rule _hotmaps_detailed_hotspots`. 
Regions file format must be specified in CFG["inputs"]["regions_format"] (Can/should probably include the HotMAPS script in here) + * BAM and BAI files for each sample to generate IGV screenshots + * MAF files for each sample to determine which variants are included in desired regions to be snapshot and extract the corresponding Chromosome, Start_Position and Hugo_Symbol values + +- BAM and BAI file locations are sourced based on the sample_id and the corresponding genome_build and seq_type metadata values set in CFG["samples"] + +- The regions files must be entered in the config under the correct key denoting the genome build of the regions file. This is required to perform proper liftover if necessary, which allows the workflow to correctly filter sample MAFs that are in builds opposite what is provided in the regions file. + +- IGV batch scripts are created for each individual variant of interest. This is a checkpoint rule as it depends on the file contents that are present in the filtered MAF files. Based on this checkpoint, Snakemake can determine what variant batch scripts have been created. + +Creation of IGV snapshots: + +- The individual IGV batch scripts are appended to a single large "merged" batch script for each sample_id and an empty "dispatched" file is created for each individual batch script. Importantly, this rule only runs if the dispatched file has not been created, in order to prevent variants that have already undergone IGV snapshots from being appended and rerun again. + +- IGV is then run on each sample's merged batch script. This is also a checkpoint rule as the specific snapshots that will be created depend on the contents of the merged batch scripts. + +Blank or truncated snapshots: + +- Truncated or blank snapshots can occur during the IGV run. The quality control rule checks for blank snapshots based on the image's kurtosis and skewness values, and checks for truncated snapshots based on the image's height and width. Three attempts are performed in order to resolve the affected snapshots, and if they remain unresolved the quality control rule will fail. Note that the flags for blank and truncated snapshots have been determined based on IGV image dimensions 1920x1080x24, and modifying image dimensions may result in more blank or truncated snapshots. + +- Increasing the milliseconds set in the CFG["options"]["generate_batch_script"]["sleep_timer] typically reduces the amount of snapshots with issues. + +Estimating snapshots: + +- To estimate the number of IGV snapshots that will be created, the config parameter "estimate_only" can be set to True. Summary files are created in "99-outputs/snapshot_summaries/estimates/" based on the individual batch scripts that have been created and do not have pre-existing "dispatch" files.
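For orientation, each per-variant batch script described above reduces to a short sequence of IGV batch commands (`load`, `goto`, `snapshot`, `exit`). The sketch below is a simplified stand-in for `etc/generate_batch_script_per_variant.py`; the paths, padding, sleep interval, and filename pattern are illustrative placeholders rather than the module's exact conventions.

```python
# Minimal sketch of writing one per-variant IGV batch script.
# Assumes standard IGV batch-command syntax; the defaults and the example
# filenames below are illustrative, not the module's actual values.

def write_igv_batch(bam, bai, genome_build, chrom, start, end, snapshot_dir,
                    snapshot_name, out_path, padding=100, max_height=1000,
                    sleep_ms=2000):
    lines = [
        f"load {bam} index={bai}",
        f"maxPanelHeight {max_height}",
        f"genome {genome_build}",
        f"setSleepInterval {sleep_ms}",        # longer sleeps reduce blank snapshots
        f"snapshotDirectory {snapshot_dir}",
        f"goto {chrom}:{start - padding}-{end + padding}",
        "sort",
        "collapse",
        f"snapshot {snapshot_name}",
        "exit",
    ]
    with open(out_path, "w") as handle:
        handle.write("\n".join(lines) + "\n")

# Hypothetical usage:
# write_igv_batch("T1.bam", "T1.bam.bai", "hg19", "chr3", 187447000, 187447000,
#                 "snapshots/genome--grch37/tumour/chr3/",
#                 "chr3:187447000--BCL6--C_T--T1.pad100.png",
#                 "chr3:187447000--BCL6--T1.pad100.batch")
```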
+ + From 95f8f271784da8ddfba9e9d065c4d9178fd0ab1d Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 24 Jan 2024 16:39:01 -0800 Subject: [PATCH 108/132] Polishing --- modules/igv/1.0/etc/format_regions.py | 15 +- .../etc/generate_batch_script_per_variant.py | 9 +- modules/igv/1.0/etc/generate_batch_scripts.py | 223 ------------------ modules/igv/1.0/etc/quality_control.py | 99 ++++---- modules/igv/1.0/igv.smk | 27 +-- modules/igv/CHANGELOG.md | 13 +- 6 files changed, 79 insertions(+), 307 deletions(-) delete mode 100755 modules/igv/1.0/etc/generate_batch_scripts.py diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 501a4b09..56acc0c1 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -1,5 +1,9 @@ #!/usr/bin/env python +""" +This script reformats MAF, HotMAPS, OncodriveCLUSTL or GENOMIC_POSITION files into BED files +""" + import os import sys import pandas as pd @@ -17,14 +21,13 @@ def log_exceptions(exctype, value, tb): def main(): with open(snakemake.log[0], "w") as stdout: - # Set up logging sys.stdout = stdout try: - regions_file = snakemake.input[0] - regions_format = snakemake.params[0] + regions_file = snakemake.input["regions"] + regions_format = snakemake.params["regions_format"] - output_file = snakemake.output[0] + output_file = snakemake.output["regions"] line_count = 0 with open(regions_file, "r") as handle: @@ -39,11 +42,11 @@ def main(): if regions_format == "oncodriveclustl": global CLUSTL_PARAMS - CLUSTL_PARAMS = snakemake.params[1] + CLUSTL_PARAMS = snakemake.params["oncodriveclustl_params"] if regions_format == "mutation_id": global REGIONS_BUILD - REGIONS_BUILD = snakemake.params[2] + REGIONS_BUILD = snakemake.params["regions_build"] REGIONS_BUILD = REGIONS_BUILD.lower() if regions_format == "bed": diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 1f7e08af..54d75ef6 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -35,11 +35,10 @@ def main(): Bam files:\t{input_bam}\n\ Bai files:\t{input_bai}\n\ Filtered maf:\t{maf}\n\ - Parameters:\t{snakemake.params[6]}\n\ Batch options:\t{batch_options}") if not isinstance(maf, list): - maf = list(maf) + maf = [maf] empty_mafs = [] @@ -67,7 +66,7 @@ def main(): presets = snakemake.params["igv_presets"], sample_id = snakemake.wildcards["sample_id"], suffix = snakemake.params["suffix"], - finished_file = snakemake.output["batches_finished"] + finished_file = snakemake.output["finished"] ) exit() for e in empty_mafs: @@ -194,8 +193,6 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build) all_lines.extend(header) - child_directory = tissue_status - seq_type_build = f"{seq_type}--{genome_build}" chrom_dir = row.chromosome @@ -221,7 +218,7 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui options = igv_options, coordinates = row.snapshot_coordinates, directory = snapshot_dir, - child_dir = child_directory, + child_dir = tissue_status, seq_build = seq_type_build, chrom_directory = chrom_dir, snapshot_filename = snap_filename diff --git a/modules/igv/1.0/etc/generate_batch_scripts.py b/modules/igv/1.0/etc/generate_batch_scripts.py deleted file mode 100755 index 
207e6ff6..00000000 --- a/modules/igv/1.0/etc/generate_batch_scripts.py +++ /dev/null @@ -1,223 +0,0 @@ -#!/usr/bin/env python3 - -import os -import warnings -import argparse -import numpy as np -import pandas as pd -import oncopipe as op -import math - -def main(): - - input_bam = snakemake.input[0] - input_bai = snakemake.input[1] - input_maf = open(snakemake.input[2], "r") - outfile = open(snakemake.output[0], "w") - - # Skip sample if no variants in filtered MAF file - line_count = 0 - for line in input_maf: - line_count += 1 - if line_count > 1: - break - if line_count < 2: - line = generate_igv_batch_footer() - output_lines(line, outfile) - input_maf.close() - outfile.close() - exit() - - # Return to top of MAF - input_maf.seek(0) - - # Read MAF file containing variants and create a dataframe linking regions, sample_ids, and bam paths - regions = get_regions_df( - input_maf, - seq_type=snakemake.params[2], - padding=snakemake.params[3] - ) - - input_maf.close() - - # Format and output the batch script - generate_igv_batch( - bam = input_bam, - bai = input_bai, - regions = regions, - output = outfile, - max_height = snakemake.params[4], - seq_type = snakemake.params[2], - genome_build = snakemake.params[1], - snapshot_dir = snakemake.params[0], - igv_options = snakemake.params[5], - image_format = snakemake.params[6] - ) - - outfile.close() - -def get_regions_df(input_maf, seq_type, padding): - # Read MAF as dataframe - maf = pd.read_table(input_maf, comment="#", sep="\t") - - # Make sure required minimum columns are present in the maf - columns = [ - "Chromosome", - "Start_Position", - "End_Position", - "Hugo_Symbol", - ] - - assert(all(c in list(maf.columns) for c in columns)), ( - "The following required columns are missing: " - f"{[columns[missing_ix] for missing_ix in [ix for ix, bool_val in enumerate([col not in list(maf.columns) for col in columns]) if bool_val==True]]}" - ) - - # Check if there are issues with input file - for column in columns: - column_values = maf[column] - is_any_na = pd.isna(column_values).any() - assert not is_any_na, ( - f"The '{column}' column contains NA values. This might be caused " - "by an incorrectly formatted input MAF file. Please ensure that " - f"all of the following columns have values: {', '.join(columns)}." 
- f"Here's a preview of the MAF file after being parsed:\n\n {maf}" - ) - - # Create a pandas dataframe with to link regions with sample_ids and bam files - chrom = (maf["Chromosome"].astype(str)).apply(lambda x: x.replace("chr","")) - - # Specify the regions that will be captured by IGV based on variant positions - region_position = (maf["Start_Position"]).astype(str) - snapshot_start = (maf["Start_Position"] - padding).astype(str) - snapshot_end = (maf["End_Position"] + padding).astype(str) - snapshot_coordinates = "chr" + chrom + ":" + snapshot_start + "-" + snapshot_end - regions = "chr" + chrom + ":" + region_position - - regions_df = pd.DataFrame( - {"chromosome": "chr" + chrom, - "region": regions, - "region_name": maf.Hugo_Symbol, - "sample_id": maf.Tumor_Sample_Barcode, - "snapshot_coordinates": snapshot_coordinates, - "padding": padding - } - ) - - return regions_df - -def generate_igv_batch_header(bam_file, index_file, max_height, genome_build): - lines = [] - - genome_build = genome_build.replace("grch37","hg19").replace("grch38","hg38") - - bam_file = os.path.realpath(bam_file) - lines.append(f"load {bam_file}") - - bai_file = os.path.realpath(index_file) - lines.append(f"index={bai_file}") - - lines.append(f"maxPanelHeight {max_height}") - lines.append(f"genome {genome_build}") - - return lines - -def generate_igv_batch_per_row(coordinates, snapshot_filename, igv_options): - lines = [] - lines.append(f"goto {coordinates}") - lines.append("sort") - lines.append("collapse") - for option in igv_options: - lines.append(option) - lines.append(f"snapshot {snapshot_filename}") - - return lines - -def generate_igv_batch(bam, bai, regions, output, max_height, seq_type, genome_build, snapshot_dir, igv_options, image_format): - - # Lines for batch script encompassing all regions and sample_ids - all_lines = [] - - header = generate_igv_batch_header( - bam, bai, max_height, genome_build - ) - - all_lines.extend(header) - - for chrom in regions.chromosome.unique(): - chrom_regions = regions[regions["chromosome"]==chrom] - - lines = generate_igv_batch_per_region( - regions=chrom_regions, - max_height=max_height, - seq_type=seq_type, - genome_build=genome_build, - snapshot_dir=snapshot_dir, - options=igv_options, - image_format=image_format - ) - - if lines is not None: - all_lines.extend(lines) - - footer = generate_igv_batch_footer() - all_lines.extend(footer) - - output_lines(all_lines, output) - - -def generate_igv_batch_per_region(regions, max_height, seq_type, genome_build, snapshot_dir, options, image_format): - - # Batch script lines - lines = [] - - # Set up snapshot directory string - dir_chrom = regions.chromosome.unique()[0].split(":")[0] - seq_type_build = f"{seq_type}--{genome_build}" - - # Add snapshot directory line to batch script - snapshot_regions_dir = os.path.join(snapshot_dir, seq_type_build, dir_chrom, "") - lines.append(f"snapshotDirectory {snapshot_regions_dir}") - - # Add lines to batch script for each sample - for _, row in regions.iterrows(): - # Add components of filename as a list - filename = [] - - filename.append(row.region) - - filename.append(str(row.padding)) - - filename.append(row.region_name) - - filename.append(row.sample_id) - - #if not image_format.startswith("."): - # image_format = "." 
+ image_format - - filename = "--".join(filename) + ".png" - - row_lines = generate_igv_batch_per_row(coordinates = row.snapshot_coordinates, snapshot_filename = filename, igv_options = options) - - lines.extend(row_lines) - return lines - -def close_files(args): - args_dict = vars(args) - for arg_value in args_dict.values(): - if hasattr(arg_value, "close"): - arg_value.close() - -def generate_igv_batch_footer(): - lines = [] - lines.append("exit") - return lines - -def output_lines(lines, output): - lines.append("") - text = "\n".join(lines) - output.write(text) - -if __name__ == "__main__": - main() - diff --git a/modules/igv/1.0/etc/quality_control.py b/modules/igv/1.0/etc/quality_control.py index d7d1847c..f086449d 100644 --- a/modules/igv/1.0/etc/quality_control.py +++ b/modules/igv/1.0/etc/quality_control.py @@ -2,6 +2,16 @@ import subprocess import sys +import os + +## Logging files +global stdout +global stderr +global stdout_f +global stderr_f + +stdout = snakemake.log["stdout"] +stderr = snakemake.log["stderr"] def increaseSleepInterval(batch_file): """ @@ -10,8 +20,8 @@ def increaseSleepInterval(batch_file): os.system(f'sleep=$(grep "Sleep" {batch_file} | cut -d " " -f2) && new_sleep=$(($sleep + 5000)) && sed -i "s/setSleepInterval $sleep/setSleepInterval $new_sleep/g" {batch_file}') def runIGV(batch_file, igv, status, message, attempt): - os.system(f'echo "Snapshot may be {status}. {message}... Rerunning IGV... Attempt {str(attempt)}: >> {stdout} 2>> {stderr}') - os.system(f'maxtime=$(($(wc -l < {temp_file}) * 60 + 15)) && timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {igv["server_num"]} {igv["server_args"]} {igv["igv"]} -b {temp_file} >> {stdout} 2>> {stderr}') + os.system(f'echo "Snapshot may be {status}. {message}... Rerunning IGV... Attempt {str(attempt)}:" >> {stdout} 2>> {stderr}') + os.system(f'maxtime=$(($(wc -l < {batch_file}) * 60 + 15)) && timeout --foreground $maxtime xvfb-run -s "-screen 0 1980x1020x24" {igv["server_num"]} {igv["server_args"]} {igv["igv"]} -b {batch_file} >> {stdout} 2>> {stderr}') def getImageQualities(snapshot, batch_file, igv, summary_file, attempts = 0): height = None @@ -32,7 +42,7 @@ def getImageQualities(snapshot, batch_file, igv, summary_file, attempts = 0): if attempts < 3: runIGV(batch_file, igv, status, message, attempts) if attempts == 4 and corrupt: - # Quit because snapshot is corrupt and can't run quality control + # Quit because snapshot is corrupt and can't run quality control qc_status = "failed" logFailedSnapshots(snapshot, summary_file, qc_status) sys.exit() @@ -51,7 +61,7 @@ def handleIncorrectDimensions(snapshot, img_values, thresholds, batch_file, igv, attempts = 0 # Increase sleep interval - increaseSleepInterval() + increaseSleepInterval(batch_file) while float(img_values["width"]) == 640 and attempts < 5: attempts += 1 @@ -60,9 +70,9 @@ def handleIncorrectDimensions(snapshot, img_values, thresholds, batch_file, igv, if igv["server_num"] == "--auto-servernum" or int(igv["server_num"].replace("-n ","")) >= 99: new_server_arg = "-n 1" else: - new_server_arg = f'-n {str(float(igv["server_num"]) + 1)} + new_server_arg = f'-n {str(float(igv["server_num"]) + 1)}' - messsage = f'Current snapshot width is {img_values["width"]}, while 1020 is expected. This might occurr if xvfb-run is unable to connect to current server ({previous_server_arg}) due to a server lock. Switching server numbers... 
Attempting new server argument: {new_server_arg}' + messsage = f'Current snapshot width is {img_values["width"]}, but at least 1020px is expected. This might occurr if xvfb-run is unable to connect to current server ({previous_server_arg}) due to a server lock. Switching server numbers... Attempting new server argument: {new_server_arg}' igv["server_num"] = new_server_arg @@ -79,14 +89,13 @@ def handleTruncated(snapshot, img_values, thresholds, batch_file, igv, dim_attem attempts = 0 # Increase sleep interval - #os.system(f'sleep=$(grep "Sleep" {batch_file} | cut -d " " -f 2) && new_sleep=$(($sleep + 5000)) && sed -i "s/setSleepInterval $sleep/setSleepInterval $new_sleep/g" {batch_file}') if dim_attempts == 0: - increaseSleepInterval() + increaseSleepInterval(batch_file) # Rerun IGV until height is no longer truncated - while img_values["height"] in thresholds["truncated"] and attempts < 3: + while float(img_values["height"]) in thresholds["truncated"] and attempts < 3: attempts += 1 - message = f"Current snapshot height is {img_values["height"]}." + message = f'Current snapshot height is {img_values["height"]}.' # Rerun IGV runIGV(batch_file, igv, status, message, attempts) @@ -96,12 +105,13 @@ def handleTruncated(snapshot, img_values, thresholds, batch_file, igv, dim_attem return attempts, img_values -def handleBlank(snapshot, img_values, thresholds, batch_file, igv, failed_summary, truncated_attempts, dim_attempts): +def handleBlank(snapshot, img_values, thresholds, batch_file, igv, truncated_attempts, dim_attempts, failed_summary): status = "blank" + blank = True attempts = 0 - if dim_attempts == 0 and truncated attempts == 0: - increaseSleepInterval() + if dim_attempts == 0 and truncated_attempts == 0: + increaseSleepInterval(batch_file) while blank and attempts < 3: attempts += 1 @@ -118,12 +128,11 @@ def handleBlank(snapshot, img_values, thresholds, batch_file, igv, failed_summar return attempts, img_values - def is_blank(img_values, thresholds): blank_check = any( ( ( - float(img_values["height"] == float(height_threshold)) or + float(img_values["height"]) == float(height_threshold) or ("<" in height_threshold and float(img_values["height"]) < float(height_threshold.replace("<",""))) or (">" in height_threshold and float(img_values["height"]) > float(height_threshold.replace(">",""))) ) and @@ -149,7 +158,7 @@ def qualityControl(snapshot, batch_file, igv, img_values, thresholds, failed_sum dimension_attempts, img_values = handleIncorrectDimensions(snapshot, img_values, thresholds, batch_file, igv, failed_summary) # Handle truncated attempts - if img_values["height"] in thresholds["truncated"]: + if float(img_values["height"]) in thresholds["truncated"]: truncated_attempts, img_values = handleTruncated(snapshot, img_values, thresholds, batch_file, igv, dimension_attempts, failed_summary) # Check if blank @@ -158,40 +167,24 @@ def qualityControl(snapshot, batch_file, igv, img_values, thresholds, failed_sum blank = is_blank(img_values, blank_thresholds) if blank: - blank_attempts, img_values = handleBlank(snapshot, img_values, thresholds, batch_file, igv, failed_summary, truncated_attempts) - - #if blank: - # status = "blank" - # if truncated_attempts == 0: - # # Increase sleep timer if it hasn't been increased before - # os.system(f'sleep=$(grep "Sleep" {batch_file} | cut -d " " -f 2) && new_sleep=$(($sleep + 5000)) && sed -i "s/setSleepInterval $sleep/setSleepInterval $new_sleep/g" {batch_file}') - # blank_attempts = 0 - # while blank and blank_attempts < 3: - # blank_attempts += 1 
- # message = f"Current snapshot values are: {img_values["height"]} height, {img_values["kurtosis"]}, and {img_values["skewness"]} skewness. Snapshots with these values may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind erors, or other errors occurring during IGV run. Rerunning with increased sleep interval." - # # Rerun IGV - # runIGV(batch_file, igv, status, message, blank_attempts) - # # Get updated values - # img_values = getImageQualities(snapshot, batch_file, igv, failed_summary) - # # Check if blank - # blank = is_blank(img_values, blank_heights) + blank_attempts, img_values = handleBlank(snapshot, img_values, blank_thresholds, batch_file, igv, truncated_attempts, dimension_attempts, failed_summary) # Check final values and log failed/suspicious if float(img_values["width"]) == 640: os.system(f'echo "Snapshot width is {img_values["width"]}. Improper dimensions should be fixed and rerun. Check snapshot {snapshot}" >> {stdout}') qc_status = "fail" - if img_values["height"] in thresholds["failed"]: - os.system(f'echo "Snapshot height is {img_values["height"]} and may still be truncated or improperly loaded. Check snapshot {snapshot}"" >> {stdout}') + if float(img_values["height"]) in thresholds["failed"]: + os.system(f'echo "Snapshot height is {img_values["height"]} and may still be truncated or improperly loaded. Check snapshot {snapshot}" >> {stdout}') qc_status = "fail" - if qc_status == "pass" and any(dimension_attempts >= 5, truncated_attempts >= 3, blank_attempts >= 3): + if qc_status == "pass" and (dimension_attempts >= 5 or truncated_attempts >= 3 or blank_attempts >= 3): qc_status = "suspicious" return qc_status def logFailedSnapshots(snapshot, summary_file, qc_status): - outline = "\t".join([snakemake.wildcards["tumour_id"], + outline = "\t".join([snakemake.wildcards["sample_id"], snakemake.wildcards["seq_type"], snakemake.wildcards["genome_build"], snakemake.wildcards["gene"], @@ -200,16 +193,19 @@ def logFailedSnapshots(snapshot, summary_file, qc_status): snakemake.wildcards["preset"], qc_status, snapshot]) - with open(summary_file, "a" as handle: - handle.write(outline + "\n")) + with open(summary_file, "a") as handle: + handle.write(outline + "\n") def main(): + stdout_f = open(snakemake.log["stdout"], "a") + stderr_f = open(snakemake.log["stderr"], "a") + # Output file outfile = snakemake.output["snapshot_qc"] ## Quality control variables snapshot = snakemake.input["snapshot"] - qc_thresholds = snakemake.params["thresholds"][snakemake.wildcards["pair_status_directory"]] + qc_thresholds = snakemake.params["thresholds"] ## Batch scripts @@ -217,35 +213,38 @@ def main(): merged_batch = snakemake.params["merged_batch"] batch_temp = snakemake.params["batch_temp"] # Set up the temporary batch script file - os.system('cat {batch_script} > {batch_temp} && echo "exit" >> {batch_temp}') + os.system(f'cat {batch_script} > {batch_temp} && echo "exit" >> {batch_temp}') ## Variables for running IGV igv_exec = { "igv": snakemake.params["igv"], - "server_num": snakemake.params["server_number"] + "server_num": snakemake.params["server_number"], "server_args": snakemake.params["server_args"] } ## Summary file to append failed snapshots to - f_summary = snakemake.params["failed_summary"] - - ## Logging files - global stdout - global stderr - - stdout = snakemake.log["stdout"] - stderr = snakemake.log["stderr"] + f_summary = snakemake.input["failed_summary"] # Get image qualities img_values = getImageQualities(snapshot, batch_temp, igv_exec, 
f_summary) + description_line = f'Initial image values are:\nHeight: {img_values["height"]}\nWidth: {img_values["width"]}\nKurtosis: {img_values["kurtosis"]}\nSkewness:{img_values["skewness"]}\n' + stdout_f.write(description_line) + # Control the qualities results = qualityControl(snapshot, batch_temp, igv_exec, img_values, qc_thresholds, f_summary) if results != "pass": logFailedSnapshots(snapshot, f_summary, results) + if results != "fail": os.system(f'touch {outfile}') + # Cleanup + os.remove(batch_temp) + stdout_f.close() + stderr_f.close() +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 52e969ad..5130d8ef 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -46,16 +46,16 @@ CFG = op.setup_module( # Rename genome_build values in sample metadata to correlate with MAF values CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["grch37"]), "grch37", inplace=True) CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["hg38"]), "hg38", inplace=True) +CFG["runs"]["normal_genome_build"].mask(CFG["runs"]["normal_genome_build"].isin(CFG["options"]["genome_map"]["grch37"]), "grch37", inplace=True) +CFG["runs"]["normal_genome_build"].mask(CFG["runs"]["normal_genome_build"].isin(CFG["options"]["genome_map"]["hg38"]), "hg38", inplace=True) # Setup variables - SUFFIX = ".pad" + str(CFG["options"]["generate_batch_script"]["padding"]) if "launch_date" in CFG: LAUNCH_DATE = CFG["launch_date"] else: LAUNCH_DATE = datetime.today().strftime('%Y-%m-%d') - # Define rules to be run locally when using a compute cluster localrules: _igv_symlink_regions_file, @@ -98,21 +98,15 @@ def get_maf(wildcards): # Symlinks the input files into the module results directory (under '00-inputs/') - -rule _igv_symlink_bam: - input: - bam = get_bam - output: - bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{sample_id}.bam" - run: - op.absolute_symlink(input.bam, output.bam) - -rule _igv_symlink_bai: +rule _igv_symlink_bams: input: + bam = get_bam, bai = get_bai output: + bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{sample_id}.bam", bai = CFG["dirs"]["inputs"] + "bams/{seq_type}/{sample_id}.bam.bai" run: + op.absolute_symlink(input.bam, output.bam) op.absolute_symlink(input.bai, output.bai) rule _igv_symlink_maf: @@ -282,8 +276,8 @@ def _get_maf(wildcards): checkpoint _igv_create_batch_script_per_variant: input: - bam_file = str(rules._igv_symlink_bam.output.bam), - bai_file = str(rules._igv_symlink_bai.output.bai), + bam_file = str(rules._igv_symlink_bams.output.bam), + bai_file = str(rules._igv_symlink_bams.output.bai), filter_maf = _get_maf output: finished = CFG["dirs"]["batch_scripts"] + "completed/{seq_type}--{genome_build}/{sample_id}.finished", @@ -302,7 +296,6 @@ checkpoint _igv_create_batch_script_per_variant: stderr = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}/{sample_id}" + SUFFIX + ".stderr.log" script: config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] - rule _igv_batches_to_merge: input: @@ -448,7 +441,7 @@ checkpoint _igv_run: rule _igv_track_failed: output: - failed_summary = CFG["dirs"]["outputs"] + "snapshot_estimates/failed_summary_" + LAUNCH_DATE + ".txt" + failed_summary = CFG["dirs"]["outputs"] + "snapshot_summaries/quality_control/qc_summary_" + LAUNCH_DATE + ".txt" run: header = 
"\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","preset","status","snapshot_path"]) with open(output.failed_summary, "w") as handle: @@ -458,7 +451,7 @@ rule _igv_track_failed: rule _igv_quality_control: input: - igv = str(rules._igv_run.output.complete), #"completed/{seq_type}--{genome_build}/{preset}/{sample_id}.completed" + igv = str(rules._igv_run.output.complete), failed_summary = str(rules._igv_track_failed.output.failed_summary), snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png" output: diff --git a/modules/igv/CHANGELOG.md b/modules/igv/CHANGELOG.md index 31639933..de3c8200 100644 --- a/modules/igv/CHANGELOG.md +++ b/modules/igv/CHANGELOG.md @@ -11,14 +11,15 @@ This release was authored by Manuela Cruz. - This module requires four file types: * Regions file containing desired regions to be snapshot in BED format, MAF format, mutation_id format in which mutations are in "{chromosome}:{start_position}:{end_position}" format or OncodriveCLUSTL / HotMAPS results files. HotMAPS results must be preformatted using a script that is executed in the last step of the HotMAPS lcr-module, `rule _hotmaps_detailed_hotspots`. Regions file format must be specified in CFG["inputs"]["regions_format"] (Can/should probably include the HotMAPS script in here) - * BAM and BAI files for each sample to generate IGV screenshots + * BAM and + * BAI files for each sample to generate IGV screenshots * MAF files for each sample to determine which variants are included in desired regions to be snapshot and extract the corresponding Chromosome, Start_Position and Hugo_Symbol values - BAM and BAI file locations are sourced based on the sample_id and the corresponding genome_build and seq_type metadata values set in CFG["samples"] - The regions files must be entered in the config under the correct key denoting the genome build of the regions file. This is required to perform proper liftover if necessary, which allows the workflow to correctly filter sample MAFs that are in builds opposite what is provided in the regions file. -- IGV batch scripts are created for each individual variant of interest. This is a checkpoint rule as it depends on the file contents that are present in the filtered MAF files. Based on this checkpoint, Snakemake can determine what variant batch scripts have been created. +- IGV batch scripts are created for each individual variant of interest. This is a checkpoint rule as it depends on the file contents of the filtered MAF files. Creation of IGV snapshots: @@ -28,12 +29,14 @@ Creation of IGV snapshots: Blank or truncated snapshots: -- Truncated or blank snapshots can occur during the IGV run. The quality control rule checks for blank snapshots based on the images kurtosis and skewness values, and checks for truncated snapshots based on the image's height and width. Three attempts are performed in order to resolve the affected snapshots, and if they remain unresolved the quality control rule will fail. Note that the flags for blank and truncated snapshots have been determined based on IGV image dimensions 1920x1080x24, and modifying image dimensions may result in more blank or truncated snapshots. +- Truncated or blank snapshots can occur during the IGV run. 
The quality control rule checks for blank snapshots based on the kurtosis and skewness values of the snapshot, and checks for truncated snapshots based on the height and width. Multiple attempts are performed in order to resolve the affected snapshots, and if they remain unresolved the quality control rule will fail. Note that the thresholds for blank and truncated snapshots have been determined based on IGV image dimensions 1920x1080x24, and modifying image dimensions may result in blank or truncated snapshots being missed during quality control. -- Increasing the milliseconds set in the CFG["options"]["generate_batch_script"]["sleep_timer] typically reduces the amount of snapshots with issues. +- Increasing the milliseconds set in CFG["options"]["generate_batch_script"]["sleep_timer] typically reduces the amount of snapshots with issues but increases workflow run time. + +- A summary file of failed or suspicious snapshots is created for each run in `99-outputs/snapshot_summaries/quality_control/`. Failed snapshots match specific height thresholds known to belong to blank or truncated snapshots. Suspicious snapshots are snapshots that reached the limit of re-run attempts during quality control. These may be blank, truncated, or they may be regular snapshots that display only a few reads and went through the quality control process due to their short heights. Estimating snapshots: -- To estimate the number of IGV snapshots that will be created, the config parameter "estimate_only" can be set to True. Summary files are created in "99-outputs/snapshot_summaries/estimates/" based on the individual batch scripts that have been created and do not have pre-existing "dispatch" files. +- To estimate the number of IGV snapshots that will be created, the config parameter "estimate_only" can be set to True. Summary files are created in `99-outputs/snapshot_summaries/estimates/` based on the individual batch scripts that have been created and do not have pre-existing "dispatch" files. 
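The blank/truncated checks above amount to measuring each snapshot's dimensions and pixel statistics and comparing them against the thresholds in CFG["options"]["quality_control"]. The sketch below assumes ImageMagick's `identify` is available for those measurements, which may differ from how `etc/quality_control.py` actually obtains them, and the comparison directions are illustrative.

```python
# Rough sketch of the snapshot QC classification. Assumes ImageMagick's
# `identify` is on PATH; the thresholds mirror CFG["options"]["quality_control"],
# but the exact comparisons used by etc/quality_control.py may differ.
import subprocess

def image_stats(png):
    fields = subprocess.run(
        ["identify", "-format", "%w %h %[kurtosis] %[skewness]", png],
        capture_output=True, text=True, check=True,
    ).stdout.split()
    return dict(zip(("width", "height", "kurtosis", "skewness"), map(float, fields)))

def classify(stats, thresholds):
    if stats["width"] == 640:                        # xvfb fell back to a 640px screen
        return "bad_dimensions"
    if stats["height"] in thresholds["failed"]:      # heights known to mean IGV failed
        return "fail"
    blank = thresholds["blank"].get(str(int(stats["height"])))
    if blank and stats["kurtosis"] >= blank["kurtosis"] and stats["skewness"] <= blank["skewness"]:
        return "blank"                               # near-empty panel: high kurtosis, strongly negative skew
    if stats["height"] in thresholds["truncated"]:   # heights previously seen on cut-off snapshots
        return "truncated"
    return "pass"

# Example thresholds (values copied from config/default.yaml):
# thresholds = {
#     "truncated": [506, 533, 545, 547, 559, 570],
#     "failed": [506, 533],
#     "blank": {"547": {"kurtosis": 18.5, "skewness": -4},
#               "559": {"kurtosis": 18.2, "skewness": -4}},
# }
```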
From 0a5efd7c3a315e4e74bf7dc909b8ac275e313593 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 24 Jan 2024 16:52:15 -0800 Subject: [PATCH 109/132] Update config --- modules/igv/1.0/config/default.yaml | 47 ++++++++++------------------- 1 file changed, 16 insertions(+), 31 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index ae799ac6..6f4c7b2f 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -24,7 +24,6 @@ lcr-modules: hg38: [] estimate_only: False # Stop after MAF filtering step to estimate total number of snapshots - identify_failed_snaps: False options: filter_maf: @@ -34,18 +33,18 @@ lcr-modules: n_samples: 5 # Desired number of samples in OncodriveCLUSTL clusters genome_map: # Map builds between metadata and MAFs - grch37: ["grch37","hg19","hg19-clc","hg19-reddy","hs37d5"] - hg38: ["hg38","hg38-nci","hg38-panea","grch38"] + grch37: ["__UPDATE__"] + hg38: ["__UPDATE__"] liftover_regions: reference_chain_file: - grch37: "genomes/grch37/chains/grch37/hg19ToHg38.over.chain" - hg38: "genomes/hg38/chains/grch38/hg38ToHg19.over.chain" + grch37: "__UPDATE__" + hg38: "__UPDATE__" target_reference: - grch37: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh37-lite/Sequence/WholeGenomeFasta/genome.fa" - hg38: "/projects/rmorin/reference/igenomes/Homo_sapiens/GSC/GRCh38/Sequence/WholeGenomeFasta/genome.fa" + grch37: "__UPDATE__" + hg38: "__UPDATE__" - igv_presets: ["default", "paired_reads"] + igv_presets: ["default"] #available options: default and paired_reads generate_batch_script: padding: 100 # Base pairs upstream and downstream of variant position @@ -55,37 +54,27 @@ lcr-modules: igv_options: default: ["preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort"] paired_reads: ["viewaspairs", "preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort QUALITY"] - view_as_pairs: False # Toggle pairwise orientation in IGV xvfb_parameters: server_number: "99" server_args: "" quality_control: - tumour_only: - truncated: ["506","545","547","559",570"] - blank: - "547": - kurtosis: 18.5 - skewness: -4 - "559": - kurtosis: 18.2 - skewness: -4 - failed: ["506"] - tumour_normal_pair: - truncated: ["533","1226","1055"] - blank: - "<1000": - kurtosis: 8 - skewness: -3 - failed: ["533"] + truncated: [506,533,545,547,559,570] + blank: + "547": + kurtosis: 18.5 + skewness: -4 + "559": + kurtosis: 18.2 + skewness: -4 + failed: [506,533] scripts: format_regions: "etc/format_regions.py" filter_script: "etc/filter_maf.py" region_liftover_script: "{MODSDIR}/etc/liftover_regions.sh" batch_script_per_variant: "etc/generate_batch_script_per_variant.py" - batch_script: "etc/generate_batch_scripts.py" quality_control: "etc/quality_control.py" scratch_subdirectories: [] @@ -95,10 +84,6 @@ lcr-modules: wget: "{REPODIR}/envs/wget/wget-1.20.1.yaml" threads: - _igv_symlink_bam: 30 - _igv_symlink_bai: 30 - _igv_quality_control: 20 - _igv_symlink_snapshot: 20 resources: _igv_liftover_regions: From f3e31836ef1e76461644494ba5e483ce2745a742 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Sat, 10 Feb 2024 06:36:50 -0800 Subject: [PATCH 
110/132] Add local rules, add threads limit to merging batches script to prevent duplicates, remove typo in rule _igv_track_failed, edit how snapshot estimates are written --- modules/igv/1.0/igv.smk | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 5130d8ef..5a258017 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -58,23 +58,27 @@ else: # Define rules to be run locally when using a compute cluster localrules: - _igv_symlink_regions_file, - _igv_symlink_bam, - _igv_symlink_bai, + _igv_symlink_bams, _igv_symlink_maf, _igv_reduce_maf_cols, - _igv_format_regions_file, + _igv_merge_regions, + _igv_format_regions, _igv_liftover_regions, + _igv_merge_lifted_regions, _igv_filter_maf, _igv_create_batch_script_per_variant, _igv_batches_to_merge, _igv_download_igv, _igv_run, + _igv_track_failed, + _igv_quality_control, _igv_symlink_snapshot, _igv_check_snapshots, + _igv_check_samples, _igv_touch_summary, + _igv_setup_estimates, _igv_estimate_snapshots, - _igv_snapshot_estimate_finished + _igv_check_estimates ##### FUNCTIONS ##### @@ -305,7 +309,7 @@ rule _igv_batches_to_merge: params: merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}" + SUFFIX + ".batch", igv_options = lambda w: config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"][w.preset] - threads: (workflow.cores / 10) + threads: (workflow.cores / 30) run: batch_script_path = os.path.abspath(input.batch_script) output_file = os.path.abspath(params.merged_batch) @@ -446,8 +450,6 @@ rule _igv_track_failed: header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","preset","status","snapshot_path"]) with open(output.failed_summary, "w") as handle: handle.write(header + "\n") - ready = open(output.ready, "w") - ready.close() rule _igv_quality_control: input: @@ -455,7 +457,7 @@ rule _igv_quality_control: failed_summary = str(rules._igv_track_failed.output.failed_summary), snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png" output: - snapshot_qc = temp(CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".qc") + snapshot_qc = CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".qc" params: batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch", merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}" + SUFFIX + ".batch", @@ -719,12 +721,12 @@ rule _igv_estimate_snapshots: for index, row in maf_table.iterrows(): gene = row["Hugo_Symbol"] - chromosome = row["Chromosome"] + chromosome = row["chr_std"] start_position = str(row["Start_Position"]) dispatch_path = CFG["dirs"]["batch_scripts"] + f"dispatched_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch" - outline = "\t".join([sample_id, seq_type, genome_build, gene,chromosome, start_position, preset]) + outline = 
"\t".join([sample_id, seq_type, genome_build, gene, chromosome, start_position, preset]) snapshot_summary.write(outline + "\n") if not os.path.exists(dispatch_path): From fea42ec78839281326b516fbc617286dcd7b5796 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Sat, 10 Feb 2024 06:37:28 -0800 Subject: [PATCH 111/132] Reorder config values to make it more understandable (i hope) --- modules/igv/1.0/config/default.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 6f4c7b2f..f76b51cc 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -44,8 +44,6 @@ lcr-modules: grch37: "__UPDATE__" hg38: "__UPDATE__" - igv_presets: ["default"] #available options: default and paired_reads - generate_batch_script: padding: 100 # Base pairs upstream and downstream of variant position max_height: 1000 @@ -53,7 +51,9 @@ lcr-modules: # Available igv options: https://github.com/igvteam/igv/wiki/Batch-commands igv_options: default: ["preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort"] - paired_reads: ["viewaspairs", "preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort QUALITY"] + pairs: ["viewaspairs", "preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort QUALITY"] + + igv_presets: ["default"] #available options: default and pairs xvfb_parameters: server_number: "99" From b0a69bfa765ce89dcb6a6d25ec10ad9d97db97aa Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Sat, 10 Feb 2024 06:40:32 -0800 Subject: [PATCH 112/132] Group variants by position so that snapshot instructions for multiple alleles in one position are incorporated into the batch scripts --- .../etc/generate_batch_script_per_variant.py | 76 +++++++++++++------ 1 file changed, 53 insertions(+), 23 deletions(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 54d75ef6..4723c9f2 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -187,44 +187,74 @@ def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_bui merged_batch_suffix = snakemake.wildcards["sample_id"] + suffix + ".batch" - for _, row in regions.iterrows(): + grouped_regions = regions.groupby("region") + + # Group by genomic coordinate + for coordinate, variants in grouped_regions: all_lines = [] header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build) all_lines.extend(header) seq_type_build = f"{seq_type}--{genome_build}" - chrom_dir = row.chromosome + chrom_dir = variants["chromosome"].unique()[0] filename = [] - filename.append(row.region), - filename.append(row.region_name) + filename.append(variants["region"].unique()[0]) + filename.append(variants["region_name"].unique()[0]) batch_filename = filename.copy() batch_filename.append(snakemake.wildcards["sample_id"]) 
batch_filename = "--".join(batch_filename) + suffix + ".batch" - - snap_filename = filename.copy() + if tissue_status == "tumour": - snap_filename.append(f"{row.ref_allele}_{row.alt_allele}") + # Iterate over variants at same position so that instructions to create multiple snapshots + # with different alleles in their filenames are added to the batch script. + + for _, row in variants.iterrows(): + + snap_filename = filename.copy() + snap_filename.append(f"{row.ref_allele}_{row.alt_allele}") + snap_filename.append(snakemake.wildcards["sample_id"]) + snap_filename = "--".join(snap_filename) + suffix + ".png" + + lines = generate_igv_batch_per_row( + sleep_interval = sleep_timer, + preset = preset, + options = igv_options, + coordinates = row.snapshot_coordinates, + directory = snapshot_dir, + child_dir = tissue_status, + seq_build = seq_type_build, + chrom_directory = chrom_dir, + snapshot_filename = snap_filename + ) + + all_lines.extend(lines) + elif tissue_status == "normal": - snap_filename.append(f"{row.ref_allele}_{row.ref_allele}") - snap_filename.append(snakemake.wildcards["sample_id"]) - snap_filename = "--".join(snap_filename) + suffix + ".png" - - lines = generate_igv_batch_per_row( - sleep_interval = sleep_timer, - preset = preset, - options = igv_options, - coordinates = row.snapshot_coordinates, - directory = snapshot_dir, - child_dir = tissue_status, - seq_build = seq_type_build, - chrom_directory = chrom_dir, - snapshot_filename = snap_filename - ) - all_lines.extend(lines) + # Only need to create one snapshot since only one allele will be expected from the ref + + snap_filename = filename.copy() + ref_allele = variants["ref_allele"].unique()[0] + snap_filename.append(f"{ref_allele}_{ref_allele}") + snap_filename.append(snakemake.wildcards["sample_id"]) + snap_filename = "--".join(snap_filename) + suffix + ".png" + + lines = generate_igv_batch_per_row( + sleep_interval = sleep_timer, + preset = preset, + options = igv_options, + coordinates = variants["snapshot_coordinates"].unique()[0], + directory = snapshot_dir, + child_dir = tissue_status, + seq_build = seq_type_build, + chrom_directory = chrom_dir, + snapshot_filename = snap_filename + ) + + all_lines.extend(lines) # Make subdirectories if necessary because snakemake won't make them since rule is a checkpoint os.makedirs(os.path.join(output_dir, "single_batch_scripts", seq_type_build, preset), exist_ok=True) From 50bfa6cda2e963bccb5cd826baf6b74c43a7cecc Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 29 Feb 2024 23:16:05 -0800 Subject: [PATCH 113/132] Update local rules --- modules/igv/1.0/igv.smk | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 5a258017..bf150e52 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -75,7 +75,6 @@ localrules: _igv_symlink_snapshot, _igv_check_snapshots, _igv_check_samples, - _igv_touch_summary, _igv_setup_estimates, _igv_estimate_snapshots, _igv_check_estimates From 746b073c1422befe610d27c7004aa3b5c761ce7a Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Sat, 9 Mar 2024 02:48:14 -0800 Subject: [PATCH 114/132] Update oncodriveclustl results reformatting using new module outputs (expanded genomic coordinate output) --- modules/igv/1.0/config/default.yaml | 6 ----- modules/igv/1.0/etc/format_regions.py | 38 +++------------------------ modules/igv/1.0/igv.smk | 1 - 3 files changed, 4 insertions(+), 41 deletions(-) diff --git 
a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index f76b51cc..6c24c79c 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -26,12 +26,6 @@ lcr-modules: estimate_only: False # Stop after MAF filtering step to estimate total number of snapshots options: - filter_maf: - oncodriveclustl_options: # These parameters will filter the OncodriveCLUSTL cluster results file. - p_value: 0.001 # Maximum p-value of OncodriveCLUSTL clusters - score: # Minimum score of OncodriveCLUSTL clusters - n_samples: 5 # Desired number of samples in OncodriveCLUSTL clusters - genome_map: # Map builds between metadata and MAFs grch37: ["__UPDATE__"] hg38: ["__UPDATE__"] diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 56acc0c1..77245745 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -40,10 +40,6 @@ def main(): touch_output.close() exit() - if regions_format == "oncodriveclustl": - global CLUSTL_PARAMS - CLUSTL_PARAMS = snakemake.params["oncodriveclustl_params"] - if regions_format == "mutation_id": global REGIONS_BUILD REGIONS_BUILD = snakemake.params["regions_build"] @@ -109,39 +105,13 @@ def format_clustl(clustl_regions): # Read regions into dataframe clustl_regions = pd.read_table(clustl_regions, comment="#", sep="\t") - p_filter = CLUSTL_PARAMS["p_value"] - score_filter = CLUSTL_PARAMS["score"] - n_samples_filter = CLUSTL_PARAMS["n_samples"] - - for key, filter_value in {"P": p_filter, "SCORE": score_filter, "N_SAMPLES": n_samples_filter}.items(): - if filter_value is not None: - if key != "P": - clustl_regions = clustl_regions[clustl_regions[key] >= float(filter_value)] - if key == "P": - clustl_regions = clustl_regions[clustl_regions[key] <= float(filter_value)] - - # Reformat CLUSTL coordinates to handle clusters that cross introns (when CLUSTL concatenated mode is used) - clustl_regions = clustl_regions.assign(COORDINATES = clustl_regions.COORDINATES.str.split(";")).explode("COORDINATES") - - # Convert OncodriveCLUSTL cluster coordinates to BED format - clustl_regions["COORDINATES"] = clustl_regions.apply( - lambda x: list( - range( - int(str(x["COORDINATES"]).split(",")[0]), int(str(x["COORDINATES"]).split(",")[1]) + 1 - ) - ) - if str(x["COORDINATES"]).split(",")[0] != str(x["COORDINATES"]).split(",")[1] else int(str(x["COORDINATES"]).split(",")[0]), - axis = 1 - ) - clustl_regions = clustl_regions.explode("COORDINATES") - - # Create columnsn required for BED format - chr_std = "chr" + clustl_regions["CHROMOSOME"].map(str) + # Create columnns required for BED format + chr_std = "chr" + clustl_regions["Chromosome"].map(str) clustl_reformatted = pd.DataFrame( { "chrom": chr_std, - "start": clustl_regions["COORDINATES"], - "end": clustl_regions["COORDINATES"] + "start": clustl_regions["Start_Position"], + "end": clustl_regions["Start_Position"] } ) return clustl_reformatted diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index bf150e52..21d4cdb9 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -158,7 +158,6 @@ rule _igv_format_regions: regions = CFG["dirs"]["inputs"] + "regions/{tool_type}_formatted.{tool_build}.tsv" params: regions_format = lambda w: w.tool_type, - oncodriveclustl_params = CFG["options"]["filter_maf"]["oncodriveclustl_options"], regions_build = lambda w: w.tool_build log: stdout = CFG["logs"]["inputs"] + "regions/format_regions_{tool_type}.{tool_build}.stdout.log", From 
7c536f17324901f82bbb49db1cf6964005a7b3ce Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Mar 2024 23:12:42 -0700 Subject: [PATCH 115/132] Add blank line to the end of scripts --- modules/igv/1.0/etc/filter_maf.py | 2 +- modules/igv/1.0/etc/format_regions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py index 1f85571b..6010e251 100644 --- a/modules/igv/1.0/etc/filter_maf.py +++ b/modules/igv/1.0/etc/filter_maf.py @@ -132,4 +132,4 @@ def write_output(maf, outfile): filemode='w' ) - main() \ No newline at end of file + main() diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 77245745..0206f7ad 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -149,4 +149,4 @@ def format_regions(regions, regions_format): filename=snakemake.log[1], filemode='w' ) - main() \ No newline at end of file + main() From 27363e9de924708d7b7eb83b6194ac306c49f494 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Mar 2024 23:22:26 -0700 Subject: [PATCH 116/132] Update CHANGELOG --- modules/igv/CHANGELOG.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/igv/CHANGELOG.md b/modules/igv/CHANGELOG.md index de3c8200..aade97ae 100644 --- a/modules/igv/CHANGELOG.md +++ b/modules/igv/CHANGELOG.md @@ -10,13 +10,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 This release was authored by Manuela Cruz. - This module requires four file types: - * Regions file containing desired regions to be snapshot in BED format, MAF format, mutation_id format in which mutations are in "{chromosome}:{start_position}:{end_position}" format or OncodriveCLUSTL / HotMAPS results files. HotMAPS results must be preformatted using a script that is executed in the last step of the HotMAPS lcr-module, `rule _hotmaps_detailed_hotspots`. Regions file format must be specified in CFG["inputs"]["regions_format"] (Can/should probably include the HotMAPS script in here) + * Regions file containing desired regions to be snapshot in BED format, MAF format, mutation_id format in which mutations are in "{chromosome}:{start_position}:{end_position}" format or OncodriveCLUSTL / HotMAPS results files. HotMAPS and OncodriveCLUSTL results are preformatted using scripts that is executed in the last steps of their lcr-modules. * BAM and * BAI files for each sample to generate IGV screenshots * MAF files for each sample to determine which variants are included in desired regions to be snapshot and extract the corresponding Chromosome, Start_Position and Hugo_Symbol values - BAM and BAI file locations are sourced based on the sample_id and the corresponding genome_build and seq_type metadata values set in CFG["samples"] +- As multiple genome build values other than `grch37` and `hg38` can exist in the metadata genome_build column, add the genome build values to their respective `grch37` or `hg38` lists in the config at ["options"]["genome_map"] to ensure the correct MAF paths are resolved for each sample. + - The regions files must be entered in the config under the correct key denoting the genome build of the regions file. This is required to perform proper liftover if necessary, which allows the workflow to correctly filter sample MAFs that are in builds opposite what is provided in the regions file. 
- IGV batch scripts are created for each individual variant of interest. This is a checkpoint rule as it depends on the file contents of the filtered MAF files. @@ -27,6 +29,8 @@ Creation of IGV snapshots: - IGV is then run on each sample's merged batch script. This is also a checkpoint rule as the specific snapshots that will be created depend on the contents of the merged batch scripts. +- IGV snapshot presets can be defined in the config at ["options"]["generate_batch_script"]["igv_options"] parameters. Two presets have already been defined, `default` and `pairs`. To specify which presets you want to create images for in the analysis, define them in list format in the config at ["options"]["igv_presets"] + Blank or truncated snapshots: - Truncated or blank snapshots can occur during the IGV run. The quality control rule checks for blank snapshots based on the kurtosis and skewness values of the snapshot, and checks for truncated snapshots based on the height and width. Multiple attempts are performed in order to resolve the affected snapshots, and if they remain unresolved the quality control rule will fail. Note that the thresholds for blank and truncated snapshots have been determined based on IGV image dimensions 1920x1080x24, and modifying image dimensions may result in blank or truncated snapshots being missed during quality control. @@ -37,6 +41,6 @@ Blank or truncated snapshots: Estimating snapshots: -- To estimate the number of IGV snapshots that will be created, the config parameter "estimate_only" can be set to True. Summary files are created in `99-outputs/snapshot_summaries/estimates/` based on the individual batch scripts that have been created and do not have pre-existing "dispatch" files. +- To estimate the number of IGV snapshots that will be created, the config parameter ["estimate_only"] can be set to True. Summary files are created in `99-outputs/snapshot_summaries/estimates/` based on the individual batch scripts that have been created and do not have pre-existing "dispatch" files. 
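The estimate-only bookkeeping can be pictured as a simple scan over each filtered MAF: every candidate variant is written to the summary, and only variants whose per-variant batch script has no pre-existing "dispatched" copy count toward the estimate. The sketch below simplifies `rule _igv_estimate_snapshots`; the paths, the default `.pad100` suffix, and the filename pattern are illustrative assumptions.

```python
# Sketch of the estimate-only count: a snapshot is "pending" only if its
# per-variant batch script has not yet been dispatched in a previous run.
# Paths, suffix, and filename pattern are illustrative.
import os
import pandas as pd

def estimate_pending(filtered_maf, sample_id, dispatch_dir, suffix=".pad100"):
    maf = pd.read_table(filtered_maf, comment="#")
    pending = 0
    for _, row in maf.iterrows():
        batch_name = (
            f'{row["Chromosome"]}:{row["Start_Position"]}'
            f'--{row["Hugo_Symbol"]}--{sample_id}{suffix}.batch'
        )
        if not os.path.exists(os.path.join(dispatch_dir, batch_name)):
            pending += 1                     # not snapshotted by any earlier run
    return pending

# Hypothetical usage:
# estimate_pending("filtered_maf/genome--grch37/T1.maf", "T1",
#                  "batch_scripts/dispatched_batch_scripts/genome--grch37/default/")
```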
From 24ea395c14599ae2fdfbbd8f852a018edb33b8d1 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 14 Mar 2024 23:29:14 -0700 Subject: [PATCH 117/132] Add slms 3 pairing config so that slms_3 can be set in the config["tool_names"] --- modules/igv/1.0/config/default.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 6c24c79c..4599a96c 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -96,3 +96,14 @@ lcr-modules: run_paired_tumours: True run_unpaired_tumours_with: "unmatched_normal" run_paired_tumours_as_unpaired: False + + slms_3: + pairing_config: + genome: + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" + run_paired_tumours_as_unpaired: False + capture: + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" + run_paired_tumours_as_unpaired: False From 0ee08cbca9c9d5b0eccffcc341257c9a6af4bb7f Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Tue, 19 Mar 2024 00:57:18 -0700 Subject: [PATCH 118/132] Add more info to default config --- modules/igv/1.0/config/default.yaml | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 4599a96c..1c7c239a 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -6,7 +6,8 @@ lcr-modules: # Available wildcards: {seq_type} {tumour_sample_id} {normal_sample_id} {pair_status} {genome_build} maf: "__UPDATE__" - regions: + regions: + # Provide regions files as lists in their respective genome builds so that liftover of coordinates occurs properly oncodriveclustl: grch37: [] hg38: [] @@ -23,38 +24,44 @@ lcr-modules: grch37: [] hg38: [] - estimate_only: False # Stop after MAF filtering step to estimate total number of snapshots + # Stop snakefile after MAF filtering step to estimate total number of snapshots that will be taken without running IGV + estimate_only: False options: - genome_map: # Map builds between metadata and MAFs + genome_map: + # Map builds between metadata and sample_ids so that MAF file locations are determined properly grch37: ["__UPDATE__"] hg38: ["__UPDATE__"] liftover_regions: reference_chain_file: - grch37: "__UPDATE__" - hg38: "__UPDATE__" + grch37: "genomes/grch37/chains/grch37/hg19ToHg38.over.chain" + hg38: "genomes/hg38/chains/grch38/hg38ToHg19.over.chain" target_reference: grch37: "__UPDATE__" hg38: "__UPDATE__" generate_batch_script: padding: 100 # Base pairs upstream and downstream of variant position - max_height: 1000 + max_height: 1000 # Maximum height of snapshot sleep_timer: 2000 # Batch scripts with more options may require longer sleep intervals - # Available igv options: https://github.com/igvteam/igv/wiki/Batch-commands - igv_options: + igv_options: + # Presets for IGV snapshots + # Available igv options: https://github.com/igvteam/igv/wiki/Batch-commands default: ["preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort"] pairs: ["viewaspairs", "preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort 
QUALITY"] - igv_presets: ["default"] #available options: default and pairs + igv_presets: ["default"] # Available options: "default" "pairs" xvfb_parameters: + # Server options for running xvfb server_number: "99" server_args: "" quality_control: + # Truncated heights that have been previously observed for dimensions 1920x1080x24 truncated: [506,533,545,547,559,570] + # Kurtosis and skewness values observed in blank snapshots at different height values blank: "547": kurtosis: 18.5 @@ -62,6 +69,7 @@ lcr-modules: "559": kurtosis: 18.2 skewness: -4 + # Previously observed heights of snapshots that fail IGV failed: [506,533] scripts: From e044266fb93fd3166de26bd583c195719135f43e Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 27 Mar 2024 05:21:53 -0700 Subject: [PATCH 119/132] Fix typo --- modules/igv/1.0/etc/quality_control.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/etc/quality_control.py b/modules/igv/1.0/etc/quality_control.py index f086449d..861369ac 100644 --- a/modules/igv/1.0/etc/quality_control.py +++ b/modules/igv/1.0/etc/quality_control.py @@ -115,7 +115,7 @@ def handleBlank(snapshot, img_values, thresholds, batch_file, igv, truncated_att while blank and attempts < 3: attempts += 1 - message = f'Current snapshot values are: {img_values["height"]} height, {img_values["kurtosis"]} kurtosis, and {img_values["skewness"]} skewness. Snapshots with these values may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors, or other errors during that occur during the IGV run. Rerunning with increased sleep interval.' + message = f'Current snapshot values are: {img_values["height"]} height, {img_values["kurtosis"]} kurtosis, and {img_values["skewness"]} skewness. Snapshots with these values may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors, or other errors that occur during the IGV run. Rerunning with increased sleep interval.' 
# Rerun IGV runIGV(batch_file, igv, status, message, attempts) From 0a7b028b270ebbf5b22ba381eedcfa2cd5e3f63d Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 27 Mar 2024 12:12:38 -0700 Subject: [PATCH 120/132] Remove outdated commented --- modules/igv/1.0/etc/generate_batch_script_per_variant.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py index 4723c9f2..4514ddd3 100644 --- a/modules/igv/1.0/etc/generate_batch_script_per_variant.py +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -22,7 +22,6 @@ def main(): sys.stdout = stdout try: - # Handle matched samples with matched normal BAMs input_bam = snakemake.input["bam_file"] input_bai = snakemake.input["bai_file"] From 0f21edcb801ba2d8504eed874675dbe03f59748a Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 27 Mar 2024 13:29:11 -0700 Subject: [PATCH 121/132] Add more descriptions to config, reduce timelimit of IGV run --- modules/igv/1.0/config/default.yaml | 13 +++++++------ modules/igv/1.0/igv.smk | 6 +++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 1c7c239a..f64d6129 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -3,7 +3,7 @@ lcr-modules: igv: inputs: - # Available wildcards: {seq_type} {tumour_sample_id} {normal_sample_id} {pair_status} {genome_build} + # Available wildcards: {unix_group} {seq_type} {tumour_sample_id} {normal_sample_id} {pair_status} {genome_build} maf: "__UPDATE__" regions: @@ -29,15 +29,16 @@ lcr-modules: options: genome_map: - # Map builds between metadata and sample_ids so that MAF file locations are determined properly - grch37: ["__UPDATE__"] - hg38: ["__UPDATE__"] + # Map metadata builds to grch37 and hg38 so that MAF file locations are determined correctly + grch37: ["__UPDATE__"] # e.g ["grch37","hg19","hs37d5"] + hg38: ["__UPDATE__"] # e.g ["hg38","grch38"] liftover_regions: reference_chain_file: grch37: "genomes/grch37/chains/grch37/hg19ToHg38.over.chain" hg38: "genomes/hg38/chains/grch38/hg38ToHg19.over.chain" target_reference: + # fasta of target build grch37: "__UPDATE__" hg38: "__UPDATE__" @@ -82,10 +83,10 @@ lcr-modules: scratch_subdirectories: [] conda_envs: - liftover_regions: "{MODSDIR}/envs/crossmap.yaml" + crossmap: "{MODSDIR}/envs/crossmap.yaml" wget: "{REPODIR}/envs/wget/wget-1.20.1.yaml" - threads: + threads: 4 resources: _igv_liftover_regions: diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 21d4cdb9..89afb700 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -120,7 +120,7 @@ rule _igv_symlink_maf: run: op.absolute_symlink(input.maf, output.maf) -# Reduce MAF columns to prevent parsing errors in Pandas +# Reduce MAF columns to prevent parsing errors in Pandas and reduce file size rule _igv_reduce_maf_cols: input: maf = str(rules._igv_symlink_maf.output.maf) @@ -186,7 +186,7 @@ rule _igv_liftover_regions: regions_build = lambda w: (w.tool_build).replace("grch37","GRCh37").replace("hg38","GRCh38"), target_build = lambda w: (w.genome_build).replace("grch37","GRCh37").replace("hg38","GRCh38") conda: - CFG["conda_envs"]["liftover_regions"] + CFG["conda_envs"]["crossmap"] resources: **CFG["resources"]["_igv_liftover_regions"] log: @@ -423,7 +423,7 @@ checkpoint _igv_run: then echo 'exit' 
>> {params.merged_batch} ; fi ; - maxtime=$(($(wc -l < {params.merged_batch}) * 60 + 15 + {params.sleep_time})) ; + maxtime=$(($(wc -l < {params.merged_batch}) * 10 + 15 + {params.sleep_time})) ; timeout $maxtime xvfb-run -s "-screen 0 1920x1080x24" {params.server_number} {params.server_args} {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; exit=$? ; if [ $exit -ne 0 ] ; From 4e01bac572d8332ae091a409a29217eef06aa6a4 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 27 Mar 2024 14:22:28 -0700 Subject: [PATCH 122/132] Update changelog, add empty line to end of script --- modules/igv/1.0/etc/quality_control.py | 1 - modules/igv/CHANGELOG.md | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/igv/1.0/etc/quality_control.py b/modules/igv/1.0/etc/quality_control.py index 861369ac..fb4b389f 100644 --- a/modules/igv/1.0/etc/quality_control.py +++ b/modules/igv/1.0/etc/quality_control.py @@ -247,4 +247,3 @@ def main(): if __name__ == "__main__": main() - \ No newline at end of file diff --git a/modules/igv/CHANGELOG.md b/modules/igv/CHANGELOG.md index aade97ae..0e9cd0c3 100644 --- a/modules/igv/CHANGELOG.md +++ b/modules/igv/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 This release was authored by Manuela Cruz. - This module requires four file types: - * Regions file containing desired regions to be snapshot in BED format, MAF format, mutation_id format in which mutations are in "{chromosome}:{start_position}:{end_position}" format or OncodriveCLUSTL / HotMAPS results files. HotMAPS and OncodriveCLUSTL results are preformatted using scripts that is executed in the last steps of their lcr-modules. + * Regions file containing desired regions to be snapshot in BED format, MAF format, mutation_id format in which mutations are in "{chromosome}:{start_position}:{end_position}" format or OncodriveCLUSTL / HotMAPS results files. HotMAPS and OncodriveCLUSTL results are preformatted using scripts that are executed in the last steps of their lcr-modules. * BAM and * BAI files for each sample to generate IGV screenshots * MAF files for each sample to determine which variants are included in desired regions to be snapshot and extract the corresponding Chromosome, Start_Position and Hugo_Symbol values @@ -29,7 +29,7 @@ Creation of IGV snapshots: - IGV is then run on each sample's merged batch script. This is also a checkpoint rule as the specific snapshots that will be created depend on the contents of the merged batch scripts. -- IGV snapshot presets can be defined in the config at ["options"]["generate_batch_script"]["igv_options"] parameters. Two presets have already been defined, `default` and `pairs`. To specify which presets you want to create images for in the analysis, define them in list format in the config at ["options"]["igv_presets"] +- IGV snapshot presets can be defined in the config at ["options"]["generate_batch_script"]["igv_options"] parameters. Two presets have already been defined, `default` and `pairs`. To specify which presets you want to create images for in the analysis, define them in list format in the config at ["options"]["igv_presets"]. 
More information on preset options here: https://github.com/igvteam/igv/wiki/Batch-commands Blank or truncated snapshots: From a8210325676a995807935d1344c2010bf6716293 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 4 Apr 2024 12:39:09 -0700 Subject: [PATCH 123/132] Switch to using liftover and add resources option to symlink rule --- modules/igv/1.0/config/default.yaml | 36 ++++++----------- modules/igv/1.0/etc/format_regions.py | 16 ++++++-- modules/igv/1.0/igv.smk | 56 ++++++++++++++++----------- 3 files changed, 56 insertions(+), 52 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index f64d6129..7abe4aa6 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -3,13 +3,14 @@ lcr-modules: igv: inputs: - # Available wildcards: {unix_group} {seq_type} {tumour_sample_id} {normal_sample_id} {pair_status} {genome_build} + # Available wildcards: {seq_type} {tumour_sample_id} {normal_sample_id} {pair_status} {genome_build} maf: "__UPDATE__" regions: # Provide regions files as lists in their respective genome builds so that liftover of coordinates occurs properly + # Please provide at least one regions file to filter MAF variants oncodriveclustl: - grch37: [] + grch37: ["__UPDATE__"] hg38: [] hotmaps: grch37: [] @@ -21,8 +22,9 @@ lcr-modules: grch37: [] hg38: [] mutation_id: - grch37: [] - hg38: [] + # mutation_id format: minimum requirements are header containing "mutation_id_{regions_build}" column and values in {chr}:{pos} format + grch37: [] # e.g at minimum requires column mutation_id_grch37 + hg38: [] # e.g at minimum requires columm mutation_id_grch37 # Stop snakefile after MAF filtering step to estimate total number of snapshots that will be taken without running IGV estimate_only: False @@ -30,17 +32,11 @@ lcr-modules: options: genome_map: # Map metadata builds to grch37 and hg38 so that MAF file locations are determined correctly - grch37: ["__UPDATE__"] # e.g ["grch37","hg19","hs37d5"] - hg38: ["__UPDATE__"] # e.g ["hg38","grch38"] + grch37: ["grch37","hg19","hs37d5"] + hg38: ["hg38","grch38"] liftover_regions: - reference_chain_file: - grch37: "genomes/grch37/chains/grch37/hg19ToHg38.over.chain" - hg38: "genomes/hg38/chains/grch38/hg38ToHg19.over.chain" - target_reference: - # fasta of target build - grch37: "__UPDATE__" - hg38: "__UPDATE__" + liftover_minMatch: "0.95" # Float number from 0 to 1 indicating minimal mapping when converting to a different genome build generate_batch_script: padding: 100 # Base pairs upstream and downstream of variant position @@ -76,14 +72,14 @@ lcr-modules: scripts: format_regions: "etc/format_regions.py" filter_script: "etc/filter_maf.py" - region_liftover_script: "{MODSDIR}/etc/liftover_regions.sh" + region_liftover_script: "{SCRIPTSDIR}/liftover/1.0/liftover.sh" batch_script_per_variant: "etc/generate_batch_script_per_variant.py" quality_control: "etc/quality_control.py" scratch_subdirectories: [] conda_envs: - crossmap: "{MODSDIR}/envs/crossmap.yaml" + liftover: "{SCRIPTSDIR}/liftover/1.0/liftover.yaml" wget: "{REPODIR}/envs/wget/wget-1.20.1.yaml" threads: 4 @@ -106,13 +102,3 @@ lcr-modules: run_unpaired_tumours_with: "unmatched_normal" run_paired_tumours_as_unpaired: False - slms_3: - pairing_config: - genome: - run_paired_tumours: True - run_unpaired_tumours_with: "unmatched_normal" - run_paired_tumours_as_unpaired: False - capture: - run_paired_tumours: True - run_unpaired_tumours_with: "unmatched_normal" - 
run_paired_tumours_as_unpaired: False diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 0206f7ad..1ae9f52c 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -52,6 +52,10 @@ def main(): # Reformat for liftover based on regions format regions_formatted = format_regions(regions_file, regions_format) + regions_formatted = regions_formatted.drop_duplicates() + + # Add empty column to end for liftover + regions_formatted.insert(3, '', '') # Output regions file regions_formatted.to_csv(output_file, sep="\t", index=False) @@ -69,12 +73,13 @@ def format_mutation_id(mutation_id): for col, idx in {"chr_std": 0, "start": 1, "end": 2}.items(): mutation_id[col] = mutation_id.apply(lambda x: str(x[genomic_pos_col]).split(":")[idx].replace("chr",""), axis=1) + end = mutation_id["end"].apply(lambda x: x + 1) mutation_id_reformatted = pd.DataFrame( { "chrom": "chr" + mutation_id["chr_std"], "start": mutation_id["start"], - "end": mutation_id["end"] + "end": end } ) @@ -91,12 +96,13 @@ def format_hotmaps(hotmaps_regions): hotmaps_regions["chr_std"] = hotmaps_regions.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) chr_std = "chr" + hotmaps_regions["chr_std"].map(str) + end = hotmaps_regions["Start_Position"].apply(lambda x: x + 1) hotmaps_reformatted = pd.DataFrame( { "chrom": chr_std, "start": hotmaps_regions["Start_Position"], - "end": hotmaps_regions["Start_Position"] + "end": end } ) return hotmaps_reformatted @@ -107,11 +113,12 @@ def format_clustl(clustl_regions): # Create columnns required for BED format chr_std = "chr" + clustl_regions["Chromosome"].map(str) + end = clustl_regions["Start_Position"].apply(lambda x: x + 1) clustl_reformatted = pd.DataFrame( { "chrom": chr_std, "start": clustl_regions["Start_Position"], - "end": clustl_regions["Start_Position"] + "end": end } ) return clustl_reformatted @@ -122,12 +129,13 @@ def format_maf(maf): # Create dataframe in BED format chr_std = "chr" + maf_regions["Chromosome"].map(str).replace("chr","") + end = maf_regions["Start_Position"].apply(lambda x: x + 1) maf_reformatted = pd.DataFrame( { "chrom": chr_std, "start": maf_regions["Start_Position"], - "end": maf_regions["End_Position"] + "end": end } ) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 89afb700..760b3840 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -91,11 +91,6 @@ def get_bai(wildcards): metadata = config["lcr-modules"]["igv"]["samples"] return expand("data/{{seq_type}}_bams/{{sample_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) -def get_maf(wildcards): - unix_group = config["unix_group"] - return expand(config["lcr-modules"]["igv"]["inputs"]["maf"], allow_missing=True, unix_group=unix_group) - - ##### RULES ##### @@ -114,7 +109,7 @@ rule _igv_symlink_bams: rule _igv_symlink_maf: input: - maf = get_maf + maf = CFG["inputs"]["maf"] output: maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf" run: @@ -173,31 +168,44 @@ REGIONS_FORMAT = { "mutation_id": "bed" } +def _igv_get_chain(wildcards): + if "38" in str({wildcards.tool_build}): + return reference_files("genomes/{tool_build}/chains/grch38/hg38ToHg19.over.chain") + else: + return reference_files("genomes/{tool_build}/chains/grch37/hg19ToHg38.over.chain") + rule _igv_liftover_regions: input: regions = 
str(rules._igv_format_regions.output.regions), - liftover_script = CFG["scripts"]["region_liftover_script"] + igv_chain = _igv_get_chain output: - regions = CFG["dirs"]["inputs"] + "regions/{tool_type}.{tool_build}To{genome_build}.crossmap.txt" + regions = CFG["dirs"]["inputs"] + "regions/{tool_type}.{tool_build}To{genome_build}.lifted.txt" params: - chain_file = lambda w: reference_files(config["lcr-modules"]["igv"]["options"]["liftover_regions"]["reference_chain_file"][w.tool_build]), - target_reference = lambda w: config["lcr-modules"]["igv"]["options"]["liftover_regions"]["target_reference"][w.genome_build], - regions_type = lambda w: REGIONS_FORMAT[(w.tool_type).lower()], - regions_build = lambda w: (w.tool_build).replace("grch37","GRCh37").replace("hg38","GRCh38"), - target_build = lambda w: (w.genome_build).replace("grch37","GRCh37").replace("hg38","GRCh38") + liftover_script = CFG["scripts"]["region_liftover_script"], + liftover_minmatch = CFG["options"]["liftover_regions"]["liftover_minMatch"] conda: - CFG["conda_envs"]["crossmap"] + CFG["conda_envs"]["liftover"] resources: **CFG["resources"]["_igv_liftover_regions"] log: - stdout = CFG["logs"]["inputs"] + "regions/liftover_regions_{tool_type}.{tool_build}To{genome_build}.stdout.log", stderr = CFG["logs"]["inputs"] + "regions/liftover_regions_{tool_type}.{tool_build}To{genome_build}.stderr.log" shell: op.as_one_line(""" - {input.liftover_script} - {input.regions} {params.regions_type} {params.regions_build} - {params.target_build} {output.regions} {params.chain_file} - {params.target_reference} > {log.stdout} 2> {log.stderr} + echo "Running {rule} for {wildcards.tool_type} {wildcards.tool_build}" > {log.stderr} ; + if [ {wildcards.tool_build} == {wildcards.genome_build} ] ; then + echo "{wildcards.tool_type} build {wildcards.tool_build} is already target {wildcards.genome_build}. Nothing to be done, copying {input.regions} to {output.regions}..." >> {log.stderr} ; + cat {input.regions} > {output.regions} ; + else + echo "Lifting over {wildcards.tool_type} {wildcards.tool_build} regions to {wildcards.genome_build}..." 
>> {log.stderr} ; + bash {params.liftover_script} + BED + {input.regions} + {output.regions} + {input.igv_chain} + YES + {params.liftover_minmatch} + 2>> {log.stderr} ; + fi """) def _get_lifted_regions(wildcards): @@ -218,7 +226,7 @@ rule _igv_merge_lifted_regions: merged_df = pd.DataFrame() for region in input.regions: try: - df = pd.read_table(region, comment = "#", sep = "\t", header=None) + df = pd.read_table(region, comment = "#", sep = "\t", header=None, usecols=[0,1,2]) df.drop(df[df[0] == "chrom"].index, inplace = True) merged_df = pd.concat([merged_df, df]) except: @@ -307,7 +315,7 @@ rule _igv_batches_to_merge: params: merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}" + SUFFIX + ".batch", igv_options = lambda w: config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"][w.preset] - threads: (workflow.cores / 30) + threads: (workflow.cores / 10) run: batch_script_path = os.path.abspath(input.batch_script) output_file = os.path.abspath(params.merged_batch) @@ -452,7 +460,7 @@ rule _igv_track_failed: rule _igv_quality_control: input: igv = str(rules._igv_run.output.complete), - failed_summary = str(rules._igv_track_failed.output.failed_summary), + failed_summary = ancient(str(rules._igv_track_failed.output.failed_summary)), snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png" output: snapshot_qc = CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".qc" @@ -478,7 +486,9 @@ rule _igv_symlink_snapshot: snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png", snapshot_qc = str(rules._igv_quality_control.output.snapshot_qc) output: - snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png" + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png" + resources: + **CFG["resources"]["_igv_symlink"] run: op.relative_symlink(input.snapshot, output.snapshot) From 7d462f75cac4c2439b1c54b06115abd17eaeeb78 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 8 Aug 2024 13:46:00 -0700 Subject: [PATCH 124/132] Fix typo in config --- modules/igv/1.0/config/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 7abe4aa6..4fd2d934 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -24,7 +24,7 @@ lcr-modules: mutation_id: # mutation_id format: minimum requirements are header containing "mutation_id_{regions_build}" column and values in {chr}:{pos} format grch37: [] # e.g at minimum requires column mutation_id_grch37 - hg38: [] # e.g at minimum requires columm mutation_id_grch37 + hg38: [] # e.g at minimum requires column mutation_id_hg38 # Stop snakefile after MAF filtering step to estimate total number of snapshots that will be taken 
without running IGV estimate_only: False From 931a8b5ee8680965484587c0cca3e19e9cd995e6 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Thu, 8 Aug 2024 13:56:06 -0700 Subject: [PATCH 125/132] Add more information for mutation_id file format --- modules/igv/1.0/config/default.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 4fd2d934..e19fb676 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -22,7 +22,10 @@ lcr-modules: grch37: [] hg38: [] mutation_id: - # mutation_id format: minimum requirements are header containing "mutation_id_{regions_build}" column and values in {chr}:{pos} format + # mutation_id format: minimum requirements are header containing "mutation_id_{regions_build}" column with values in {chr}:{pos} format + # e.g + # mutation_id_grch37 + # chr22:23230361 grch37: [] # e.g at minimum requires column mutation_id_grch37 hg38: [] # e.g at minimum requires column mutation_id_hg38 From e922df5105750b474d5dd58248980d104ac589e1 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Mon, 26 Aug 2024 11:52:59 -0700 Subject: [PATCH 126/132] Clean up conda envs --- modules/igv/1.0/config/default.yaml | 4 +- modules/igv/1.0/envs/crossmap.yaml | 56 ------------------------ modules/igv/1.0/envs/format_regions.yaml | 38 ---------------- modules/igv/1.0/envs/wget-1.20.1.yaml | 1 + 4 files changed, 3 insertions(+), 96 deletions(-) delete mode 100644 modules/igv/1.0/envs/crossmap.yaml delete mode 100644 modules/igv/1.0/envs/format_regions.yaml create mode 120000 modules/igv/1.0/envs/wget-1.20.1.yaml diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index e19fb676..20e10690 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -34,7 +34,7 @@ lcr-modules: options: genome_map: - # Map metadata builds to grch37 and hg38 so that MAF file locations are determined correctly + # Maps metadata builds to either grch37 or hg38 so that MAF file locations are determined correctly. Additional genome builds can be added as necessary. 
grch37: ["grch37","hg19","hs37d5"] hg38: ["hg38","grch38"] @@ -83,7 +83,7 @@ lcr-modules: conda_envs: liftover: "{SCRIPTSDIR}/liftover/1.0/liftover.yaml" - wget: "{REPODIR}/envs/wget/wget-1.20.1.yaml" + wget: "{MODSDIR}/envs/wget/wget-1.20.1.yaml" threads: 4 diff --git a/modules/igv/1.0/envs/crossmap.yaml b/modules/igv/1.0/envs/crossmap.yaml deleted file mode 100644 index cdbfbed3..00000000 --- a/modules/igv/1.0/envs/crossmap.yaml +++ /dev/null @@ -1,56 +0,0 @@ -name: crossmap -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1 - - _openmp_mutex=4.5 - - bx-python=0.9.0 - - bzip2=1.0.8 - - ca-certificates=2022.12.7 - - crossmap=0.6.5 - - curl=7.71.1 - - cython=0.29.32 - - krb5=1.17.2 - - ld_impl_linux-64=2.40 - - libblas=3.9.0 - - libcblas=3.9.0 - - libcurl=7.71.1 - - libdeflate=1.0 - - libedit=3.1.20191231 - - libffi=3.4.2 - - libgcc=7.2.0 - - libgcc-ng=12.2.0 - - libgfortran-ng=12.2.0 - - libgfortran5=12.2.0 - - libgomp=12.2.0 - - liblapack=3.9.0 - - libnsl=2.0.0 - - libopenblas=0.3.21 - - libpng=1.6.39 - - libsqlite=3.40.0 - - libssh2=1.10.0 - - libstdcxx-ng=12.2.0 - - libuuid=2.32.1 - - libzlib=1.2.13 - - lzo=2.10 - - mysql-connector-c=6.1.6 - - ncurses=6.3 - - numpy=1.21.6 - - openssl=1.1.1s - - pip=22.3.1 - - pybigwig=0.3.18 - - pysam=0.15.3 - - python=3.7.12 - - python-lzo=1.14 - - python_abi=3.7 - - readline=8.1.2 - - setuptools=66.1.1 - - sqlite=3.40.0 - - tk=8.6.12 - - ucsc-wigtobigwig=357 - - wheel=0.38.4 - - xz=5.2.6 - - zlib=1.2.13 -prefix: /home/mcruz/miniconda3/envs/crossmap diff --git a/modules/igv/1.0/envs/format_regions.yaml b/modules/igv/1.0/envs/format_regions.yaml deleted file mode 100644 index cd9e0993..00000000 --- a/modules/igv/1.0/envs/format_regions.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: test-generate_igv_batch -channels: - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1 - - _openmp_mutex=4.5 - - ca-certificates=2020.4.5.1 - - certifi=2020.4.5.1 - - ld_impl_linux-64=2.34 - - libblas=3.8.0 - - libcblas=3.8.0 - - libffi=3.2.1 - - libgcc-ng=9.2.0 - - libgfortran-ng=7.3.0 - - liblapack=3.8.0 - - libopenblas=0.3.9 - - libstdcxx-ng=9.2.0 - - llvm-openmp=10.0.0 - - ncurses=6.1 - - numpy=1.18.1 - - openssl=1.1.1f - - pandas=1.0.3 - - pip=20.0.2 - - python=3.8.2 - - python-dateutil=2.8.1 - - python_abi=3.8 - - pytz=2019.3 - - pyvcf=0.6.8 - - readline=8.0 - - setuptools=46.1.3 - - six=1.14.0 - - sqlite=3.30.1 - - tk=8.6.10 - - wheel=0.34.2 - - xz=5.2.5 - - zlib=1.2.11 -prefix: /home/mcruz/miniconda3/envs/test-generate_igv_batch diff --git a/modules/igv/1.0/envs/wget-1.20.1.yaml b/modules/igv/1.0/envs/wget-1.20.1.yaml new file mode 120000 index 00000000..86501e72 --- /dev/null +++ b/modules/igv/1.0/envs/wget-1.20.1.yaml @@ -0,0 +1 @@ +../../../../envs/wget/wget-1.20.1.yaml \ No newline at end of file From 2cdf67519212a4ae2a1f155386e299b6adc7b897 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Mon, 26 Aug 2024 12:06:49 -0700 Subject: [PATCH 127/132] Remove scripy that runs crossmap since switched to liftover --- modules/igv/1.0/etc/liftover_regions.sh | 73 ------------------------- 1 file changed, 73 deletions(-) delete mode 100755 modules/igv/1.0/etc/liftover_regions.sh diff --git a/modules/igv/1.0/etc/liftover_regions.sh b/modules/igv/1.0/etc/liftover_regions.sh deleted file mode 100755 index 05c3cb61..00000000 --- a/modules/igv/1.0/etc/liftover_regions.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Use CrossMap.py to convert genomic coordinates between GRCh37 and 
GRCh38 for MAF, VCF, BED or BEDPE files. -# -# Usage: liftover_regions.sh \ -# \ -# \ -# \ -# \ -# \ -# \ -# - -input_regions=$1 -input_type=$2 -regions_build=$3 -target_build=$4 -output_file=$5 -chain_file=$6 -target_ref=$7 - -echo "Input regions file: $input_regions" -echo "Input regions type: $input_type" -echo "Input regions build: $regions_build" -echo "Target genome build: $target_build" -echo "Output file: $output_file" -echo "Chain file: $chain_file" -echo "Target reference: $target_ref" - -intermediate_output_file=$(echo $output_file)_int - -# Skip empty files -lines=$(wc -l < $input_regions) -if [ ! $lines -gt 0 ] ; -then - touch $output_file -fi - -# MAFs -# Check genome build of incoming MAF file to determine what build it needs to be changed to -if [ "$input_type" == "maf" ] ; -then - echo "Proceeding with MAF input..." - if [ $regions_build == $target_build ] && [ $lines -gt 0 ] ; - then - echo "WARNING: Input regions file $input_regions is already $target_build. Copying contents of $input_regions to $output_file"; - cut -f 1,5,6,7,9,10,11,13,16 $input_regions > $output_file - else - echo "Input regions file $input_regions does not appear to be $target_build. Proceeding with conversion to $target_build" - echo "CrossMap.py maf $chain_file $input_regions $target_ref $target_build $output_file" - CrossMap.py maf $chain_file $input_regions $target_ref $target_build $intermediate_output_file - cut -f 1,5,6,7,9,10,11,13,16 $intermediate_output_file > $output_file - rm $intermediate_output_file - fi - echo "Finished MAF block." -fi - -if [ "$input_type" == "bed" ] && [ $lines -gt 0 ] ; - then - echo "Proceeding with BED input..." - if [ $regions_build == $target_build ] ; - then - echo "WARNING: Input regions file $input_regions is already $target_build. Copying contents of $input_regions to $output_file"; - cat $input_regions > $output_file - else - echo "Input regions file $input_regions does not appear to be $target_build. Proceeding with conversion to $target_build" - echo "CrossMap.py bed $chain_file $input_regions $output_file" - CrossMap.py bed $chain_file $input_regions $output_file - fi - echo "Finished BED block." 
-fi - -echo "End of bash script" From 729019182a4da198d27dfdaac8a805d0fe174673 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Mon, 26 Aug 2024 12:26:26 -0700 Subject: [PATCH 128/132] Remove unnecessary REGIONS_FORMAT dict --- modules/igv/1.0/igv.smk | 8 -------- 1 file changed, 8 deletions(-) diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 760b3840..2facba78 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -160,14 +160,6 @@ rule _igv_format_regions: script: config["lcr-modules"]["igv"]["scripts"]["format_regions"] -REGIONS_FORMAT = { - "bed": "bed", - "maf": "bed", - "oncodriveclustl": "bed", - "hotmaps": "bed", - "mutation_id": "bed" -} - def _igv_get_chain(wildcards): if "38" in str({wildcards.tool_build}): return reference_files("genomes/{tool_build}/chains/grch38/hg38ToHg19.over.chain") From 5b9551c86cde5be9830b6ed034f6d7d2ff52f58b Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Sun, 1 Sep 2024 15:53:55 -0700 Subject: [PATCH 129/132] Allow ability to specify version of IGV to download --- modules/igv/1.0/config/default.yaml | 3 +++ modules/igv/1.0/igv.smk | 19 ++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 20e10690..7a45f50d 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -33,6 +33,9 @@ lcr-modules: estimate_only: False options: + + igv_version: "https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip" + genome_map: # Maps metadata builds to either grch37 or hg38 so that MAF file locations are determined correctly. Additional genome builds can be added as necessary. 
grch37: ["grch37","hg19","hs37d5"] diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index 2facba78..c16e8100 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -379,32 +379,37 @@ def _evaluate_batches(wildcards): allow_missing = True ) +IGV_VERSION = CFG["options"]["igv_download_path"] +IGV_VERSION = IGV_VERSION.split("/")[-1].replace(".zip","") + rule _igv_download_igv: output: - igv_zip = CFG["dirs"]["igv"] + "IGV_2.7.2.zip", - igv_installed = CFG["dirs"]["igv"] + "igv_2.7.2.installed" + igv_zip = CFG["dirs"]["igv"] + IGV_VERSION + ".zip", + igv_installed = CFG["dirs"]["igv"] + IGV_VERSION + ".installed" + params: + igv = CFG["options"]["igv_download_path"] conda: CFG["conda_envs"]["wget"] log: - stdout = CFG["logs"]["igv"] + "download/igv_download.stdout.log", - stderr = CFG["logs"]["igv"] + "download/igv_download.stderr.log" + stdout = CFG["logs"]["igv"] + "download/igv_" + IGV_VERSION + "_download.stdout.log", + stderr = CFG["logs"]["igv"] + "download/igv_" + IGV_VERSION + "_download.stderr.log" shell: op.as_one_line(""" - wget -O {output.igv_zip} https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip && + wget -O {output.igv_zip} {params.igv} && unzip {output.igv_zip} -d $(dirname {output.igv_zip}) > {log.stdout} 2> {log.stderr} && touch {output.igv_installed} """) checkpoint _igv_run: input: - igv = str(rules._igv_download_igv.output.igv_installed), + igv = ancient(str(rules._igv_download_igv.output.igv_installed)), finished_batches = str(rules._igv_create_batch_script_per_variant.output.finished), batch_script = _evaluate_batches output: complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}/{preset}/{sample_id}.completed" params: merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}" + SUFFIX + ".batch", - igv = CFG["dirs"]["igv"] + "IGV_Linux_2.7.2/igv.sh", + igv = CFG["dirs"]["igv"] + IGV_VERSION + "/igv.sh", sleep_time = CFG["options"]["generate_batch_script"]["sleep_timer"], server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", server_args = CFG["options"]["xvfb_parameters"]["server_args"] From f864881be6d41fa57076a8c6844732158d0737af Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:49:22 -0700 Subject: [PATCH 130/132] Fix typo in list of config wildcards so tumour_id matches wildcard in snakefile --- modules/igv/1.0/config/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index 7a45f50d..fb6a0ef0 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -3,7 +3,7 @@ lcr-modules: igv: inputs: - # Available wildcards: {seq_type} {tumour_sample_id} {normal_sample_id} {pair_status} {genome_build} + # Available wildcards: {seq_type} {tumour_id} {normal_sample_id} {pair_status} {genome_build} maf: "__UPDATE__" regions: From 7df9f19bac99535cc8533e47633d29f8eb48eff4 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Sun, 15 Sep 2024 13:09:07 -0700 Subject: [PATCH 131/132] Add ability to specify bam path in config --- modules/igv/1.0/config/default.yaml | 5 +++++ modules/igv/1.0/igv.smk | 24 ++++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git 
a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml index fb6a0ef0..0dde0dd3 100644 --- a/modules/igv/1.0/config/default.yaml +++ b/modules/igv/1.0/config/default.yaml @@ -6,6 +6,11 @@ lcr-modules: # Available wildcards: {seq_type} {tumour_id} {normal_sample_id} {pair_status} {genome_build} maf: "__UPDATE__" + # Available wildcards: {seq_type} {sample_id} {genome_build} + bam_path: "__UPDATE__" + bai_path: "__UPDATE__" + + regions: # Provide regions files as lists in their respective genome builds so that liftover of coordinates occurs properly # Please provide at least one regions file to filter MAF variants diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk index c16e8100..16f206f1 100644 --- a/modules/igv/1.0/igv.smk +++ b/modules/igv/1.0/igv.smk @@ -84,12 +84,28 @@ localrules: def get_bam(wildcards): - metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{sample_id}}.{genome_build}.bam", genome_build=metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + CFG = config["lcr-modules"]["igv"] + metadata = CFG["samples"] + genome_build = metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"] + + return expand( + CFG["inputs"]["bam_path"], + seq_type = wildcards.seq_type, + sample_id = wildcards.sample_id, + genome_build = genome_build + ) def get_bai(wildcards): - metadata = config["lcr-modules"]["igv"]["samples"] - return expand("data/{{seq_type}}_bams/{{sample_id}}.{genome_build}.bam.bai", genome_build=metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"]) + CFG = config["lcr-modules"]["igv"] + metadata = CFG["samples"] + genome_build = metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"] + + return expand( + CFG["inputs"]["bai_path"], + seq_type = wildcards.seq_type, + sample_id = wildcards.sample_id, + genome_build = genome_build + ) ##### RULES ##### From ef47a179e81f01608eb520714bcefde7d6359462 Mon Sep 17 00:00:00 2001 From: mannycruz <55067099+mannycruz@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:58:00 -0800 Subject: [PATCH 132/132] Fix string indexing for mutation_id formatted files --- modules/igv/1.0/etc/format_regions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py index 1ae9f52c..09b3d9e7 100755 --- a/modules/igv/1.0/etc/format_regions.py +++ b/modules/igv/1.0/etc/format_regions.py @@ -71,9 +71,9 @@ def format_mutation_id(mutation_id): # Create columns required for liftover in BED format genomic_pos_col = f"mutation_id_{REGIONS_BUILD}" - for col, idx in {"chr_std": 0, "start": 1, "end": 2}.items(): + for col, idx in {"chr_std": 0, "start": 1, "end": 1}.items(): mutation_id[col] = mutation_id.apply(lambda x: str(x[genomic_pos_col]).split(":")[idx].replace("chr",""), axis=1) - end = mutation_id["end"].apply(lambda x: x + 1) + end = mutation_id["end"].apply(lambda x: int(x) + 1) mutation_id_reformatted = pd.DataFrame( {