diff --git a/modules/igv/1.0/config/default.yaml b/modules/igv/1.0/config/default.yaml new file mode 100644 index 000000000..0dde0dd31 --- /dev/null +++ b/modules/igv/1.0/config/default.yaml @@ -0,0 +1,117 @@ +lcr-modules: + + igv: + + inputs: + # Available wildcards: {seq_type} {tumour_id} {normal_sample_id} {pair_status} {genome_build} + maf: "__UPDATE__" + + # Available wildcards: {seq_type} {sample_id} {genome_build} + bam_path: "__UPDATE__" + bai_path: "__UPDATE__" + + + regions: + # Provide regions files as lists in their respective genome builds so that liftover of coordinates occurs properly + # Please provide at least one regions file to filter MAF variants + oncodriveclustl: + grch37: ["__UPDATE__"] + hg38: [] + hotmaps: + grch37: [] + hg38: [] + bed: + grch37: [] + hg38: [] + maf: + grch37: [] + hg38: [] + mutation_id: + # mutation_id format: the minimum requirement is a header containing a "mutation_id_{regions_build}" column with values in {chr}:{pos} format + # e.g. + # mutation_id_grch37 + # chr22:23230361 + grch37: [] # e.g. at minimum requires column mutation_id_grch37 + hg38: [] # e.g. at minimum requires column mutation_id_hg38 + + # Stop the snakefile after the MAF filtering step to estimate the total number of snapshots that will be taken, without running IGV + estimate_only: False + + options: + + igv_download_path: "https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip" + + genome_map: + # Maps metadata builds to either grch37 or hg38 so that MAF file locations are determined correctly. Additional genome builds can be added as necessary. + grch37: ["grch37","hg19","hs37d5"] + hg38: ["hg38","grch38"] + + liftover_regions: + liftover_minMatch: "0.95" # Float from 0 to 1 indicating the minimum proportion of bases that must remap when converting to a different genome build + + generate_batch_script: + padding: 100 # Base pairs upstream and downstream of variant position + max_height: 1000 # Maximum height of snapshot + sleep_timer: 2000 # Batch scripts with more options may require longer sleep intervals + igv_options: + # Presets for IGV snapshots + # Available igv options: https://github.com/igvteam/igv/wiki/Batch-commands + default: ["preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort"] + pairs: ["viewaspairs", "preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort QUALITY"] + + igv_presets: ["default"] # Available options: "default" "pairs" + + xvfb_parameters: + # Server options for running xvfb + server_number: "99" + server_args: "" + + quality_control: + # Truncated heights that have been previously observed for dimensions 1920x1080x24 + truncated: [506,533,545,547,559,570] + # Kurtosis and skewness values observed in blank snapshots at different height values + blank: + "547": + kurtosis: 18.5 + skewness: -4 + "559": + kurtosis: 18.2 + skewness: -4 + # Previously observed heights of snapshots that fail IGV + failed: [506,533] + + scripts: + format_regions: "etc/format_regions.py" + filter_script: "etc/filter_maf.py" + region_liftover_script: "{SCRIPTSDIR}/liftover/1.0/liftover.sh" + batch_script_per_variant: "etc/generate_batch_script_per_variant.py" + quality_control: "etc/quality_control.py" + + scratch_subdirectories: [] + + conda_envs: + liftover: 
"{SCRIPTSDIR}/liftover/1.0/liftover.yaml" + wget: "{MODSDIR}/envs/wget/wget-1.20.1.yaml" + + threads: 4 + + resources: + _igv_liftover_regions: + mem_mb: 2000 + _igv_run: + mem_mb: 2500 + _igv_quality_control: + mem_mb: 2500 + + pairing_config: + genome: + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" + run_paired_tumours_as_unpaired: False + capture: + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" + run_paired_tumours_as_unpaired: False + diff --git a/modules/igv/1.0/envs/wget-1.20.1.yaml b/modules/igv/1.0/envs/wget-1.20.1.yaml new file mode 120000 index 000000000..86501e72a --- /dev/null +++ b/modules/igv/1.0/envs/wget-1.20.1.yaml @@ -0,0 +1 @@ +../../../../envs/wget/wget-1.20.1.yaml \ No newline at end of file diff --git a/modules/igv/1.0/etc/filter_maf.py b/modules/igv/1.0/etc/filter_maf.py new file mode 100644 index 000000000..6010e251f --- /dev/null +++ b/modules/igv/1.0/etc/filter_maf.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python + +import os +import sys +import logging +import traceback +import pandas as pd +import oncopipe as op + + +def log_exceptions(exctype, value, tb): + logging.critical(''.join(traceback.format_tb(tb))) + logging.critical('{0}: {1}'.format(exctype, value)) + +sys.excepthook = log_exceptions + +def main(): + + with open(snakemake.log[0], "w") as stdout: + # Set up logging + sys.stdout = stdout + + try: + + maf_file = snakemake.input[0] + + regions_file = snakemake.input[1] + regions_format = snakemake.params[0] + + metadata = snakemake.params[1] + + output_file = snakemake.output[0] + + # Return empty dataframe if no lines in MAF + line_count = count_lines(maf_file) + if line_count == 1: + empty_maf = pd.read_table(maf_file, comment="#", sep="\t") + # Add columns required by workflow + required_columns = ["seq_type","genome_build","chr_std"] + empty_maf = empty_maf.assign(**{col:None for col in required_columns if col not in empty_maf.columns}) + write_output(empty_maf, output_file) + exit() + + maf = maf_add_columns(maf=maf_file, metadata=metadata, wildcards=snakemake.wildcards) + + # Perform filtering + filtered_maf = maf_filter( + maf=maf, + regions=regions_file, + regions_format=regions_format + ) + + write_output(filtered_maf, output_file) + + except Exception as e: + logging.error(e, exc_info=1) + raise + +def count_lines(maf): + with open(maf, "r") as handle: + total_lines = len(handle.readlines()) + return total_lines + +def filter_by_bed(maf, regions): + + # Remove row containing column names + regions = regions[regions[0].str.contains("chrom")==False] + + # Create common columns between BED and MAF + regions["chr_std"] = regions.apply(lambda x: "chr" + str(x[0]).replace("chr",""), axis=1) + regions["genomic_pos_std"] = regions["chr_std"] + ":" + regions[1].map(str) + + maf["chr_std"] = maf.apply(lambda x: "chr" + str(x["Chromosome"]).replace("chr",""), axis=1) + maf["genomic_pos_std"] = maf["chr_std"] + ":" + maf["Start_Position"].map(str) + + filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])] + return filtered_maf + +def filter_by_maf(maf, regions): + + # Create common column by which to subset MAF + for df in [maf, regions]: + df["chr_std"] = df.apply(lambda x: "chr" + str(x["Chromosome"]).replace("chr",""), axis=1) + df["genomic_pos_std"] = df["chr_std"] + ":" + df["Start_Position"].map(str) + + # Subset the MAF + filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])] + return filtered_maf + +def maf_filter(maf, regions, regions_format): + + if 
regions_format != "bed": + regions_df = pd.read_table(regions, comment="#", sep="\t") + else: + regions_df = pd.read_table(regions, comment="#", sep="\t", header=None) + + # Return empty dataframe without filtering if df is empty + if len(maf)==0: + return maf + + filter_functions = { + "maf": filter_by_maf, + "bed": filter_by_bed + } + + return filter_functions[regions_format](maf, regions_df) + +def maf_add_columns(maf, metadata, wildcards): + # Read input MAF as df + maf = pd.read_table(maf, comment="#", sep="\t") + + sample_id = snakemake.wildcards["tumour_id"] + seq_type = snakemake.wildcards["seq_type"] + genome_build = snakemake.wildcards["genome_build"] + normal_sample_id = snakemake.wildcards["normal_sample_id"] + pair_status = snakemake.wildcards["pair_status"] + + maf["seq_type"] = seq_type + maf["genome_build"] = genome_build + maf["normal_sample_id"] = normal_sample_id + maf["pair_status"] = pair_status + + return maf + +def write_output(maf, outfile): + maf.to_csv(outfile, sep="\t", na_rep="NA", index=False) + +if __name__ == "__main__": + logging.basicConfig( + level=logging.DEBUG, + filename=snakemake.log[1], + filemode='w' + ) + + main() diff --git a/modules/igv/1.0/etc/format_regions.py b/modules/igv/1.0/etc/format_regions.py new file mode 100755 index 000000000..09b3d9e73 --- /dev/null +++ b/modules/igv/1.0/etc/format_regions.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python + +""" +This script reformats MAF, HotMAPS, OncodriveCLUSTL or GENOMIC_POSITION files into BED files +""" + +import os +import sys +import pandas as pd +import oncopipe as op +import shutil +import logging +import traceback + +def log_exceptions(exctype, value, tb): + logging.critical(''.join(traceback.format_tb(tb))) + logging.critical('{0}: {1}'.format(exctype, value)) + +sys.excepthook = log_exceptions + +def main(): + + with open(snakemake.log[0], "w") as stdout: + sys.stdout = stdout + + try: + regions_file = snakemake.input["regions"] + regions_format = snakemake.params["regions_format"] + + output_file = snakemake.output["regions"] + + line_count = 0 + with open(regions_file, "r") as handle: + for line in handle: + line_count += 1 + if line_count > 1: + break + if line_count < 2: + touch_output = open(output_file, "w") + touch_output.close() + exit() + + if regions_format == "mutation_id": + global REGIONS_BUILD + REGIONS_BUILD = snakemake.params["regions_build"] + REGIONS_BUILD = REGIONS_BUILD.lower() + + if regions_format == "bed": + # Do not need to reformat for liftover + shutil.copy(regions_file, output_file) + exit() + + # Reformat for liftover based on regions format + regions_formatted = format_regions(regions_file, regions_format) + regions_formatted = regions_formatted.drop_duplicates() + + # Add empty column to end for liftover + regions_formatted.insert(3, '', '') + + # Output regions file + regions_formatted.to_csv(output_file, sep="\t", index=False) + + except Exception as e: + logging.error(e, exc_info=1) + raise + +def format_mutation_id(mutation_id): + # Read regions into dataframe + mutation_id = pd.read_table(mutation_id, comment="#", sep="\t") + + # Create columns required for liftover in BED format + genomic_pos_col = f"mutation_id_{REGIONS_BUILD}" + + for col, idx in {"chr_std": 0, "start": 1, "end": 1}.items(): + mutation_id[col] = mutation_id.apply(lambda x: str(x[genomic_pos_col]).split(":")[idx].replace("chr",""), axis=1) + end = mutation_id["end"].apply(lambda x: int(x) + 1) + + mutation_id_reformatted = pd.DataFrame( + { + "chrom": "chr" + mutation_id["chr_std"], + 
"start": mutation_id["start"], + "end": end + } + ) + + # Remove duplicate rows + mutation_id_reformatted = mutation_id_reformatted.drop_duplicates(keep='first') + + return mutation_id_reformatted + +def format_hotmaps(hotmaps_regions): + # Read regions into dataframe + hotmaps_regions = pd.read_table(hotmaps_regions, comment="#", sep="\t") + + # Convert HotMAPS coordinates to BED format + + hotmaps_regions["chr_std"] = hotmaps_regions.apply(lambda x: str(x["Chromosome"]).replace("chr",""), axis=1) + chr_std = "chr" + hotmaps_regions["chr_std"].map(str) + end = hotmaps_regions["Start_Position"].apply(lambda x: x + 1) + + hotmaps_reformatted = pd.DataFrame( + { + "chrom": chr_std, + "start": hotmaps_regions["Start_Position"], + "end": end + } + ) + return hotmaps_reformatted + +def format_clustl(clustl_regions): + # Read regions into dataframe + clustl_regions = pd.read_table(clustl_regions, comment="#", sep="\t") + + # Create columnns required for BED format + chr_std = "chr" + clustl_regions["Chromosome"].map(str) + end = clustl_regions["Start_Position"].apply(lambda x: x + 1) + clustl_reformatted = pd.DataFrame( + { + "chrom": chr_std, + "start": clustl_regions["Start_Position"], + "end": end + } + ) + return clustl_reformatted + +def format_maf(maf): + # Read regions into dataframe + maf_regions = pd.read_table(maf, comment="#", sep="\t") + + # Create dataframe in BED format + chr_std = "chr" + maf_regions["Chromosome"].map(str).replace("chr","") + end = maf_regions["Start_Position"].apply(lambda x: x + 1) + + maf_reformatted = pd.DataFrame( + { + "chrom": chr_std, + "start": maf_regions["Start_Position"], + "end": end + } + ) + + return maf_reformatted + +def format_regions(regions, regions_format): + format_functions = { + "oncodriveclustl": format_clustl, + "hotmaps": format_hotmaps, + "mutation_id": format_mutation_id, + "maf": format_maf + } + + return format_functions[regions_format](regions) + +if __name__ == "__main__": + logging.basicConfig( + level=logging.DEBUG, + filename=snakemake.log[1], + filemode='w' + ) + main() diff --git a/modules/igv/1.0/etc/generate_batch_script_per_variant.py b/modules/igv/1.0/etc/generate_batch_script_per_variant.py new file mode 100644 index 000000000..4514ddd37 --- /dev/null +++ b/modules/igv/1.0/etc/generate_batch_script_per_variant.py @@ -0,0 +1,279 @@ +#!/usr/bin/env python3 + +import os +import warnings +import numpy as np +import pandas as pd +import oncopipe as op +import copy +import sys +import logging +import traceback + +def log_exceptions(exctype, value, tb): + logging.critical(''.join(traceback.format_tb(tb))) + logging.critical('{0}: {1}'.format(exctype, value)) + +sys.excepthook = log_exceptions + +def main(): + with open(snakemake.log[0], "w") as stdout: + # Set up logging + sys.stdout = stdout + + try: + input_bam = snakemake.input["bam_file"] + input_bai = snakemake.input["bai_file"] + + maf = snakemake.input["filter_maf"] + + batch_options = snakemake.params["batch_options"] + + # Print run info for logging + print(f"Setting up batch scripts using the following inputs:\n\ + Bam files:\t{input_bam}\n\ + Bai files:\t{input_bai}\n\ + Filtered maf:\t{maf}\n\ + Batch options:\t{batch_options}") + + if not isinstance(maf, list): + maf = [maf] + + empty_mafs = [] + + for m in maf: + # Skip if no variants in outfile + input_maf = open(m, "r") + + line_count = 0 + for line in input_maf: + line_count += 1 + if line_count > 1: + # Return to top of MAF + input_maf.seek(0) + break + if line_count < 2: + input_maf.close() + 
empty_mafs.append(m) + + if len(empty_mafs) != 0: + if all(m in empty_mafs for m in maf): + touch_outputs( + output_dir = snakemake.params["batch_dir"], + seq_type = snakemake.wildcards["seq_type"], + genome_build = snakemake.wildcards["genome_build"], + presets = snakemake.params["igv_presets"], + sample_id = snakemake.wildcards["sample_id"], + suffix = snakemake.params["suffix"], + finished_file = snakemake.output["finished"] + ) + exit() + for e in empty_mafs: + maf.remove(e) + + # Read MAF file and create dataframe + regions = get_regions_df( + maf, + padding=batch_options["padding"] + ) + + # Create the batch scripts + generate_igv_batches( + regions = regions, + bam = input_bam, + bai = input_bai, + output_dir = snakemake.params["batch_dir"], + snapshot_dir = snakemake.params["snapshot_dir"], + genome_build = snakemake.params["genome_build"], + seq_type = snakemake.params["seq_type"], + suffix = snakemake.params["suffix"], + igv_presets = snakemake.params["igv_presets"], + igv_options = batch_options["igv_options"], + max_height = batch_options["max_height"], + tissue_status = snakemake.params["tissue_status"], + sleep_timer = batch_options["sleep_timer"] + ) + + touch_output = open(snakemake.output[0], "w") + touch_output.close() + + except Exception as e: + logging.error(e, exc_info=1) + raise + +def touch_outputs(output_dir, seq_type, genome_build, presets, sample_id, suffix, finished_file): + sample_suffix = sample_id + suffix + ".batch" + for preset in presets: + os.makedirs(os.path.join(output_dir, "merged_batch_scripts", "--".join([seq_type, genome_build]), preset), exist_ok = True) + merged_batch = os.path.join(output_dir, "merged_batch_scripts", "--".join([seq_type, genome_build]), preset, sample_suffix) + merged_file = open(merged_batch, "w") + merged_file.close() + touch_finished = open(finished_file, "w") + touch_finished.close() + +def get_regions_df(input_maf, padding): + # Read MAF as dataframe + if len(input_maf) > 1: + maf = pd.concat([pd.read_table(file, comment="#", sep="\t") for file in input_maf]) + else: + maf = pd.read_table(input_maf[0], comment="#", sep="\t") + + chrom = (maf["Chromosome"].astype(str)).apply(lambda x: x.replace("chr","")) + + # Specify regions that will be captured by IGV based on variant positions + region_start = (maf["Start_Position"]).astype(str) + snapshot_start = (maf["Start_Position"] - padding).astype(str) + snapshot_end = (maf["End_Position"] + padding).astype(str) + snapshot_coordinates = "chr" + chrom + ":" + snapshot_start + "-" + snapshot_end + regions = "chr" + chrom + ":" + region_start + + regions_df = pd.DataFrame( + {"chromosome": "chr" + chrom, + "region": regions, + "region_name": maf.Hugo_Symbol, + "tumour_id": maf.Tumor_Sample_Barcode, + "normal_id": maf.Matched_Norm_Sample_Barcode, + "ref_allele": maf.Reference_Allele, + "alt_allele": maf.Tumor_Seq_Allele2, + "snapshot_coordinates": snapshot_coordinates, + "padding": padding, + "pair_status": maf.pair_status + } + ) + + samp_id = snakemake.wildcards["sample_id"] + + assert len(regions_df["normal_id"].drop_duplicates()) == 1, f"More than one normal ID found within the MAF files' `Matched_Norm_Sample_Barcode` column for this sample: {samp_id}. 
Please double check MAF files: {input_maf}" + + return regions_df + +def output_lines(lines, batch_output): + output = open(batch_output, "w") + lines.append("") + text = "\n".join(lines) + output.write(text) + output.close() + +def generate_igv_batch_per_row(sleep_interval, preset, options, coordinates, directory, child_dir, seq_build, chrom_directory, snapshot_filename): + lines = [] + + lines.append(f"goto {coordinates}") + + snapshot_regions_dir = os.path.join(directory, seq_build, child_dir, preset, chrom_directory, "") + + lines.append(f"snapshotDirectory {snapshot_regions_dir}") + lines.append("collapse") + for igv_option in options[preset]: + lines.append(igv_option) + lines.append(f"setSleepInterval {sleep_interval}") + lines.append(f"snapshot {snapshot_filename}") + + return lines + +def generate_igv_batch_header(bam, index, max_height, genome_build): + lines = [] + + genome_build = genome_build.replace("grch37","hg19") + + lines.append(f"load {bam} index={index}") + lines.append(f"maxPanelHeight {max_height}") + lines.append(f"genome {genome_build}") + + return lines + +def generate_igv_batches(regions, bam, bai, output_dir, snapshot_dir, genome_build, seq_type, suffix, igv_presets, igv_options, max_height, tissue_status, sleep_timer=2000): + for preset in igv_presets: + + merged_batch_suffix = snakemake.wildcards["sample_id"] + suffix + ".batch" + + grouped_regions = regions.groupby("region") + + # Group by genomic coordinate + for coordinate, variants in grouped_regions: + all_lines = [] + + header = generate_igv_batch_header(bam=bam, index=bai, max_height=max_height, genome_build=genome_build) + all_lines.extend(header) + + seq_type_build = f"{seq_type}--{genome_build}" + chrom_dir = variants["chromosome"].unique()[0] + + filename = [] + filename.append(variants["region"].unique()[0]) + filename.append(variants["region_name"].unique()[0]) + + batch_filename = filename.copy() + batch_filename.append(snakemake.wildcards["sample_id"]) + batch_filename = "--".join(batch_filename) + suffix + ".batch" + + if tissue_status == "tumour": + # Iterate over variants at same position so that instructions to create multiple snapshots + # with different alleles in their filenames are added to the batch script. 
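+                # Illustrative example with hypothetical values: a variant at
+                # chr22:23230361 in GENE with REF=C/ALT=T for sample TUMOUR01,
+                # padding=100 and suffix ".pad100" yields batch lines roughly like:
+                #
+                #   goto chr22:23230261-23230461
+                #   snapshotDirectory <snapshot_dir>/<seq_type>--<build>/tumour/<preset>/chr22/
+                #   snapshot chr22:23230361--GENE--C_T--TUMOUR01.pad100.png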
+ + for _, row in variants.iterrows(): + + snap_filename = filename.copy() + snap_filename.append(f"{row.ref_allele}_{row.alt_allele}") + snap_filename.append(snakemake.wildcards["sample_id"]) + snap_filename = "--".join(snap_filename) + suffix + ".png" + + lines = generate_igv_batch_per_row( + sleep_interval = sleep_timer, + preset = preset, + options = igv_options, + coordinates = row.snapshot_coordinates, + directory = snapshot_dir, + child_dir = tissue_status, + seq_build = seq_type_build, + chrom_directory = chrom_dir, + snapshot_filename = snap_filename + ) + + all_lines.extend(lines) + + elif tissue_status == "normal": + + # Only need to create one snapshot since only one allele will be expected from the ref + + snap_filename = filename.copy() + ref_allele = variants["ref_allele"].unique()[0] + snap_filename.append(f"{ref_allele}_{ref_allele}") + snap_filename.append(snakemake.wildcards["sample_id"]) + snap_filename = "--".join(snap_filename) + suffix + ".png" + + lines = generate_igv_batch_per_row( + sleep_interval = sleep_timer, + preset = preset, + options = igv_options, + coordinates = variants["snapshot_coordinates"].unique()[0], + directory = snapshot_dir, + child_dir = tissue_status, + seq_build = seq_type_build, + chrom_directory = chrom_dir, + snapshot_filename = snap_filename + ) + + all_lines.extend(lines) + + # Make subdirectories if necessary because snakemake won't make them since rule is a checkpoint + os.makedirs(os.path.join(output_dir, "single_batch_scripts", seq_type_build, preset), exist_ok=True) + + batch_file_path = os.path.join(output_dir, "single_batch_scripts", seq_type_build, preset, batch_filename) + + output_lines(all_lines, batch_file_path) + + os.makedirs(os.path.join(output_dir, "merged_batch_scripts", seq_type_build, preset), exist_ok=True) + + merged_preset_path = os.path.join(output_dir, "merged_batch_scripts", seq_type_build, preset, merged_batch_suffix) + + merged_preset_touch = open(merged_preset_path, "w") + merged_preset_touch.close() + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.DEBUG, + filename=snakemake.log[1], + filemode='w' + ) + main() diff --git a/modules/igv/1.0/etc/quality_control.py b/modules/igv/1.0/etc/quality_control.py new file mode 100644 index 000000000..fb4b389fe --- /dev/null +++ b/modules/igv/1.0/etc/quality_control.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python + +import subprocess +import sys +import os + +## Logging files +global stdout +global stderr +global stdout_f +global stderr_f + +stdout = snakemake.log["stdout"] +stderr = snakemake.log["stderr"] + +def increaseSleepInterval(batch_file): + """ + Increase sleep interval between batch commands by 5 seconds + """ + os.system(f'sleep=$(grep "Sleep" {batch_file} | cut -d " " -f2) && new_sleep=$(($sleep + 5000)) && sed -i "s/setSleepInterval $sleep/setSleepInterval $new_sleep/g" {batch_file}') + +def runIGV(batch_file, igv, status, message, attempt): + os.system(f'echo "Snapshot may be {status}. {message}... Rerunning IGV... 
Attempt {str(attempt)}:" >> {stdout} 2>> {stderr}') + os.system(f'maxtime=$(($(wc -l < {batch_file}) * 60 + 15)) && timeout --foreground $maxtime xvfb-run -s "-screen 0 1920x1080x24" {igv["server_num"]} {igv["server_args"]} {igv["igv"]} -b {batch_file} >> {stdout} 2>> {stderr}') + +def getImageQualities(snapshot, batch_file, igv, summary_file, attempts = 0): + height = None + width = None + kurtosis = None + skewness = None + corrupt = True + while attempts < 4 and corrupt: + attempts += 1 + try: + height = str(subprocess.check_output(f"identify -format '%h' {snapshot}", shell=True)).split("'")[1].split("\\n")[0] + width = str(subprocess.check_output(f"identify -format '%w' {snapshot}", shell=True)).split("'")[1].split("\\n")[0] + kurtosis, skewness = [float(value.split(": ")[1]) for value in str(subprocess.check_output(f"identify -verbose {snapshot} | grep -E 'kurtosis|skewness' | tail -n 2", shell=True)).replace("\\n'","").split("\\n ")] + corrupt = False + except Exception: + status = "corrupt" + message = "" + if attempts < 3: + runIGV(batch_file, igv, status, message, attempts) + if attempts == 4 and corrupt: + # Quit because snapshot is corrupt and can't run quality control + qc_status = "failed" + logFailedSnapshots(snapshot, summary_file, qc_status) + sys.exit() + + quality_dict = { + "height": height, + "width": width, + "kurtosis": kurtosis, + "skewness": skewness + } + + return quality_dict + +def handleIncorrectDimensions(snapshot, img_values, thresholds, batch_file, igv, failed_summary): + status = "in incorrect dimensions" + attempts = 0 + + # Increase sleep interval + increaseSleepInterval(batch_file) + + while float(img_values["width"]) == 640 and attempts < 5: + attempts += 1 + previous_server_arg = igv["server_num"] + + if igv["server_num"] == "--auto-servernum" or int(igv["server_num"].replace("-n ","")) >= 99: + new_server_arg = "-n 1" + else: + new_server_arg = f'-n {int(igv["server_num"].replace("-n ","")) + 1}' + + message = f'Current snapshot width is {img_values["width"]}, but at least 1020px is expected. This might occur if xvfb-run is unable to connect to the current server ({previous_server_arg}) due to a server lock. Switching server numbers... Attempting new server argument: {new_server_arg}' + + igv["server_num"] = new_server_arg + + # Rerun IGV + runIGV(batch_file, igv, status, message, attempts) + + # Update image values + img_values = getImageQualities(snapshot, batch_file, igv, failed_summary) + + return attempts, img_values + +def handleTruncated(snapshot, img_values, thresholds, batch_file, igv, dim_attempts, failed_summary): + status = "truncated" + attempts = 0 + + # Increase sleep interval + if dim_attempts == 0: + increaseSleepInterval(batch_file) + + # Rerun IGV until height is no longer truncated + while float(img_values["height"]) in thresholds["truncated"] and attempts < 3: + attempts += 1 + message = f'Current snapshot height is {img_values["height"]}.' 
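+        # Worked example of the quality thresholds (the numbers here are
+        # illustrative; the authoritative values live in config/default.yaml):
+        # a snapshot of height 547 is in `truncated`, so it is re-taken here;
+        # if it then shows kurtosis 18.6 > 18.5 and skewness -4.2 < -4, it is
+        # also flagged by is_blank() below and re-taken by handleBlank().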
+ + # Rerun IGV + runIGV(batch_file, igv, status, message, attempts) + + # Update image values + img_values = getImageQualities(snapshot, batch_file, igv, failed_summary) + + return attempts, img_values + +def handleBlank(snapshot, img_values, thresholds, batch_file, igv, truncated_attempts, dim_attempts, failed_summary): + status = "blank" + blank = True + attempts = 0 + + if dim_attempts == 0 and truncated_attempts == 0: + increaseSleepInterval(batch_file) + + while blank and attempts < 3: + attempts += 1 + message = f'Current snapshot values are: {img_values["height"]} height, {img_values["kurtosis"]} kurtosis, and {img_values["skewness"]} skewness. Snapshots with these values may be blank. Blank snapshots may be due to errors reading BAM file headers, Java address bind errors, or other errors that occur during the IGV run. Rerunning with increased sleep interval.' + + # Rerun IGV + runIGV(batch_file, igv, status, message, attempts) + + # Get updated values + img_values = getImageQualities(snapshot, batch_file, igv, failed_summary) + + # Check if still blank + blank = is_blank(img_values, thresholds) + + return attempts, img_values + +def is_blank(img_values, thresholds): + blank_check = any( + ( + ( + float(img_values["height"]) == float(height_threshold) or + ("<" in height_threshold and float(img_values["height"]) < float(height_threshold.replace("<",""))) or + (">" in height_threshold and float(img_values["height"]) > float(height_threshold.replace(">",""))) + ) and + (float(img_values["kurtosis"]) > float(thresholds[height_threshold]["kurtosis"])) and + (float(img_values["skewness"]) < float(thresholds[height_threshold]["skewness"])) + ) + for height_threshold in list(thresholds) + ) + + return blank_check + +def qualityControl(snapshot, batch_file, igv, img_values, thresholds, failed_summary, attempts=0): + # Set default values for attempts so sleep value is not perpetually increased + dimension_attempts = 0 + truncated_attempts = 0 + blank_attempts = 0 + + # Set default qc status + qc_status = "pass" + + # Check width + if float(img_values["width"]) == 640: + dimension_attempts, img_values = handleIncorrectDimensions(snapshot, img_values, thresholds, batch_file, igv, failed_summary) + + # Handle truncated attempts + if float(img_values["height"]) in thresholds["truncated"]: + truncated_attempts, img_values = handleTruncated(snapshot, img_values, thresholds, batch_file, igv, dimension_attempts, failed_summary) + + # Check if blank + blank_thresholds = thresholds["blank"] + + blank = is_blank(img_values, blank_thresholds) + + if blank: + blank_attempts, img_values = handleBlank(snapshot, img_values, blank_thresholds, batch_file, igv, truncated_attempts, dimension_attempts, failed_summary) + + # Check final values and log failed/suspicious + if float(img_values["width"]) == 640: + os.system(f'echo "Snapshot width is {img_values["width"]}. Improper dimensions should be fixed and rerun. Check snapshot {snapshot}" >> {stdout}') + qc_status = "fail" + + if float(img_values["height"]) in thresholds["failed"]: + os.system(f'echo "Snapshot height is {img_values["height"]} and may still be truncated or improperly loaded. 
Check snapshot {snapshot}" >> {stdout}') + qc_status = "fail" + + if qc_status == "pass" and (dimension_attempts >= 5 or truncated_attempts >= 3 or blank_attempts >= 3): + qc_status = "suspicious" + + return qc_status + +def logFailedSnapshots(snapshot, summary_file, qc_status): + outline = "\t".join([snakemake.wildcards["sample_id"], + snakemake.wildcards["seq_type"], + snakemake.wildcards["genome_build"], + snakemake.wildcards["gene"], + snakemake.wildcards["chromosome"], + snakemake.wildcards["start_position"], + snakemake.wildcards["preset"], + qc_status, + snapshot]) + with open(summary_file, "a") as handle: + handle.write(outline + "\n") + +def main(): + stdout_f = open(snakemake.log["stdout"], "a") + stderr_f = open(snakemake.log["stderr"], "a") + + # Output file + outfile = snakemake.output["snapshot_qc"] + + ## Quality control variables + snapshot = snakemake.input["snapshot"] + qc_thresholds = snakemake.params["thresholds"] + + ## Batch scripts + + batch_script = snakemake.params["batch_script"] + merged_batch = snakemake.params["merged_batch"] + batch_temp = snakemake.params["batch_temp"] + # Set up the temporary batch script file + os.system(f'cat {batch_script} > {batch_temp} && echo "exit" >> {batch_temp}') + + ## Variables for running IGV + igv_exec = { + "igv": snakemake.params["igv"], + "server_num": snakemake.params["server_number"], + "server_args": snakemake.params["server_args"] + } + + ## Summary file to append failed snapshots to + f_summary = snakemake.input["failed_summary"] + + # Get image qualities + img_values = getImageQualities(snapshot, batch_temp, igv_exec, f_summary) + + description_line = f'Initial image values are:\nHeight: {img_values["height"]}\nWidth: {img_values["width"]}\nKurtosis: {img_values["kurtosis"]}\nSkewness:{img_values["skewness"]}\n' + stdout_f.write(description_line) + + # Control the qualities + results = qualityControl(snapshot, batch_temp, igv_exec, img_values, qc_thresholds, f_summary) + + if results != "pass": + logFailedSnapshots(snapshot, f_summary, results) + + if results != "fail": + os.system(f'touch {outfile}') + + # Cleanup + os.remove(batch_temp) + stdout_f.close() + stderr_f.close() + +if __name__ == "__main__": + main() diff --git a/modules/igv/1.0/igv.smk b/modules/igv/1.0/igv.smk new file mode 100644 index 000000000..16f206f1d --- /dev/null +++ b/modules/igv/1.0/igv.smk @@ -0,0 +1,847 @@ +#!/usr/bin/env snakemake + + +##### ATTRIBUTION ##### + + +# Original Author: N/A +# Module Author: Manuela Cruz +# Contributors: N/A + + +##### SETUP ##### + +# Import package with useful functions for developing analysis modules +import oncopipe as op +import pandas as pd +from datetime import datetime +import os + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. 
Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + print('\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + '\x1b[0m') + print('\x1b[0;31;40m' + f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m') + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + +# Setup module and store module-specific configuration in `CFG` +# `CFG` is a shortcut to `config["lcr-modules"]["igv"]` +CFG = op.setup_module( + name = "igv", + version = "1.0", + subdirectories = ["inputs", "batch_scripts", "igv", "snapshots", "outputs"], +) + +# Rename genome_build values in sample metadata to correlate with MAF values +CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["grch37"]), "grch37", inplace=True) +CFG["runs"]["tumour_genome_build"].mask(CFG["runs"]["tumour_genome_build"].isin(CFG["options"]["genome_map"]["hg38"]), "hg38", inplace=True) +CFG["runs"]["normal_genome_build"].mask(CFG["runs"]["normal_genome_build"].isin(CFG["options"]["genome_map"]["grch37"]), "grch37", inplace=True) +CFG["runs"]["normal_genome_build"].mask(CFG["runs"]["normal_genome_build"].isin(CFG["options"]["genome_map"]["hg38"]), "hg38", inplace=True) + +# Setup variables +SUFFIX = ".pad" + str(CFG["options"]["generate_batch_script"]["padding"]) +if "launch_date" in CFG: + LAUNCH_DATE = CFG["launch_date"] +else: + LAUNCH_DATE = datetime.today().strftime('%Y-%m-%d') + +# Define rules to be run locally when using a compute cluster +localrules: + _igv_symlink_bams, + _igv_symlink_maf, + _igv_reduce_maf_cols, + _igv_merge_regions, + _igv_format_regions, + _igv_liftover_regions, + _igv_merge_lifted_regions, + _igv_filter_maf, + _igv_create_batch_script_per_variant, + _igv_batches_to_merge, + _igv_download_igv, + _igv_run, + _igv_track_failed, + _igv_quality_control, + _igv_symlink_snapshot, + _igv_check_snapshots, + _igv_check_samples, + _igv_setup_estimates, + _igv_estimate_snapshots, + _igv_check_estimates + + +##### FUNCTIONS ##### + + +def get_bam(wildcards): + CFG = config["lcr-modules"]["igv"] + metadata = CFG["samples"] + genome_build = metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"] + + return expand( + CFG["inputs"]["bam_path"], + seq_type = wildcards.seq_type, + sample_id = wildcards.sample_id, + genome_build = genome_build + ) + +def get_bai(wildcards): + CFG = config["lcr-modules"]["igv"] + metadata = CFG["samples"] + genome_build = metadata[(metadata.sample_id == wildcards.sample_id) & (metadata.seq_type == wildcards.seq_type)]["genome_build"] + + return expand( + CFG["inputs"]["bai_path"], + seq_type = wildcards.seq_type, + sample_id = wildcards.sample_id, + genome_build = genome_build + ) + +##### RULES ##### + + + +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _igv_symlink_bams: + input: + bam = get_bam, + bai = get_bai + output: + bam = CFG["dirs"]["inputs"] + "bams/{seq_type}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + 
"bams/{seq_type}/{sample_id}.bam.bai" + run: + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) + +rule _igv_symlink_maf: + input: + maf = CFG["inputs"]["maf"] + output: + maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf" + run: + op.absolute_symlink(input.maf, output.maf) + +# Reduce MAF columns to prevent parsing errors in Pandas and reduce file size +rule _igv_reduce_maf_cols: + input: + maf = str(rules._igv_symlink_maf.output.maf) + output: + maf = temp(CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf.temp") + shell: + op.as_one_line(""" + cut -f 1,5,6,7,9,10,11,13,16,17 {input.maf} > {output.maf} + """) + +rule _igv_merge_regions: + input: + input_regions = lambda w: config["lcr-modules"]["igv"]["regions"][w.tool_type][w.tool_build] + output: + merged_regions = CFG["dirs"]["inputs"] + "regions/{tool_type}_merged.{tool_build}.tsv" + log: + stdout = CFG["logs"]["inputs"] + "regions/merge_{tool_type}_regions.{tool_build}.stdout.log", + stderr = CFG["logs"]["inputs"] + "regions/merge_{tool_type}_regions.{tool_build}.stderr.log" + run: + merged_df = pd.DataFrame() + for result in input.input_regions: + try: + df = pd.read_table(result, comment="#", sep="\t") + merged_df = pd.concat([merged_df, df]) + except: + with open(log.stdout, "a") as header: + header.write(f"Error reading or merging file {result}\n") + merged_df.to_csv(output.merged_regions, sep="\t", na_rep="NA", index=False) + +# Convert input regions file into BED format +rule _igv_format_regions: + input: + regions = str(rules._igv_merge_regions.output.merged_regions) + output: + regions = CFG["dirs"]["inputs"] + "regions/{tool_type}_formatted.{tool_build}.tsv" + params: + regions_format = lambda w: w.tool_type, + regions_build = lambda w: w.tool_build + log: + stdout = CFG["logs"]["inputs"] + "regions/format_regions_{tool_type}.{tool_build}.stdout.log", + stderr = CFG["logs"]["inputs"] + "regions/format_regions_{tool_type}.{tool_build}.stderr.log" + script: + config["lcr-modules"]["igv"]["scripts"]["format_regions"] + +def _igv_get_chain(wildcards): + if "38" in str({wildcards.tool_build}): + return reference_files("genomes/{tool_build}/chains/grch38/hg38ToHg19.over.chain") + else: + return reference_files("genomes/{tool_build}/chains/grch37/hg19ToHg38.over.chain") + +rule _igv_liftover_regions: + input: + regions = str(rules._igv_format_regions.output.regions), + igv_chain = _igv_get_chain + output: + regions = CFG["dirs"]["inputs"] + "regions/{tool_type}.{tool_build}To{genome_build}.lifted.txt" + params: + liftover_script = CFG["scripts"]["region_liftover_script"], + liftover_minmatch = CFG["options"]["liftover_regions"]["liftover_minMatch"] + conda: + CFG["conda_envs"]["liftover"] + resources: + **CFG["resources"]["_igv_liftover_regions"] + log: + stderr = CFG["logs"]["inputs"] + "regions/liftover_regions_{tool_type}.{tool_build}To{genome_build}.stderr.log" + shell: + op.as_one_line(""" + echo "Running {rule} for {wildcards.tool_type} {wildcards.tool_build}" > {log.stderr} ; + if [ {wildcards.tool_build} == {wildcards.genome_build} ] ; then + echo "{wildcards.tool_type} build {wildcards.tool_build} is already target {wildcards.genome_build}. Nothing to be done, copying {input.regions} to {output.regions}..." 
>> {log.stderr} ; + cat {input.regions} > {output.regions} ; + else + echo "Lifting over {wildcards.tool_type} {wildcards.tool_build} regions to {wildcards.genome_build}..." >> {log.stderr} ; + bash {params.liftover_script} + BED + {input.regions} + {output.regions} + {input.igv_chain} + YES + {params.liftover_minmatch} + 2>> {log.stderr} ; + fi + """) + +def _get_lifted_regions(wildcards): + CFG = config["lcr-modules"]["igv"] + return expand( + str(rules._igv_liftover_regions.output.regions), + tool_type = list(CFG["regions"]), + tool_build = ["grch37","hg38"], + allow_missing = True + ) + +rule _igv_merge_lifted_regions: + input: + regions = _get_lifted_regions + output: + regions = CFG["dirs"]["inputs"] + "regions/regions.{genome_build}.txt" + run: + merged_df = pd.DataFrame() + for region in input.regions: + try: + df = pd.read_table(region, comment = "#", sep = "\t", header=None, usecols=[0,1,2]) + df.drop(df[df[0] == "chrom"].index, inplace = True) + merged_df = pd.concat([merged_df, df]) + except: + print(f"Lifted regions file is empty: {region}") + merged_df = merged_df.drop_duplicates() + merged_df.to_csv(output.regions, sep="\t", na_rep="NA", index=False, header=False) + +# Filter MAF to lines containing positions of interest +rule _igv_filter_maf: + input: + maf = str(rules._igv_reduce_maf_cols.output.maf), + regions = str(rules._igv_merge_lifted_regions.output.regions) + output: + maf = CFG["dirs"]["inputs"] + "maf/filtered_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.maf" + params: + regions_format = "bed", + metadata = CFG["runs"] + log: + stdout = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.stdout.log", + stderr = CFG["logs"]["inputs"] + "filter_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.stderr.log" + wildcard_constraints: + seq_type = "[a-zA-Z]+" + script: + config["lcr-modules"]["igv"]["scripts"]["filter_script"] + +def _get_maf(wildcards): + CFG = config["lcr-modules"]["igv"] + + if wildcards.sample_id in list(CFG["runs"]["tumour_sample_id"]): + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + return expand( + str(rules._igv_filter_maf.output.maf), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + tumour_id = wildcards.sample_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status + ) + + if wildcards.sample_id in list(CFG["runs"]["normal_sample_id"]): + these_samples = op.filter_samples(CFG["runs"], normal_sample_id = wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build, pair_status = "matched") + + assert sorted(these_samples["tumour_genome_build"].drop_duplicates()) == sorted(these_samples["normal_genome_build"].drop_duplicates()), f"Different genome builds between normal ID and tumour ID for {wildcards.sample_id}" + + return expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = these_samples["tumour_seq_type"], + genome_build = these_samples["tumour_genome_build"], + tumour_id = these_samples["tumour_sample_id"], + normal_sample_id = these_samples["normal_sample_id"], + pair_status = these_samples["pair_status"] + ) + +checkpoint _igv_create_batch_script_per_variant: + input: + bam_file = 
str(rules._igv_symlink_bams.output.bam), + bai_file = str(rules._igv_symlink_bams.output.bai), + filter_maf = _get_maf + output: + finished = CFG["dirs"]["batch_scripts"] + "completed/{seq_type}--{genome_build}/{sample_id}.finished", + variant_batch = expand(CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{{seq_type}}--{{genome_build}}/{preset}/{{sample_id}}" + SUFFIX + ".batch", preset = CFG["options"]["igv_presets"], allow_missing=True) + params: + tissue_status = lambda w: "normal" if w.sample_id in list(config["lcr-modules"]["igv"]["runs"]["normal_sample_id"]) else "tumour" if w.sample_id in list(config["lcr-modules"]["igv"]["runs"]["tumour_sample_id"]) else "unknown", + batch_dir = CFG["dirs"]["batch_scripts"], + snapshot_dir = CFG["dirs"]["snapshots"], + genome_build = "{genome_build}", + seq_type = "{seq_type}", + batch_options = CFG["options"]["generate_batch_script"], + suffix = SUFFIX, + igv_presets = CFG["options"]["igv_presets"] + log: + stdout = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}/{sample_id}" + SUFFIX + ".stdout.log", + stderr = CFG["logs"]["batch_scripts"] + "{seq_type}--{genome_build}/{sample_id}" + SUFFIX + ".stderr.log" + script: + config["lcr-modules"]["igv"]["scripts"]["batch_script_per_variant"] + +rule _igv_batches_to_merge: + input: + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch" + output: + dispatched_batch_script = CFG["dirs"]["batch_scripts"] + "dispatched_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch" + params: + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}" + SUFFIX + ".batch", + igv_options = lambda w: config["lcr-modules"]["igv"]["options"]["generate_batch_script"]["igv_options"][w.preset] + threads: (workflow.cores / 10) + run: + batch_script_path = os.path.abspath(input.batch_script) + output_file = os.path.abspath(params.merged_batch) + + batch_script = open(batch_script_path, "r") + + with open(output_file, "r") as f: + merged_lines = len(f.readlines()) + + with open(output_file, "a") as handle: + for line in batch_script: + if merged_lines > 0: + if line.startswith(("load", "maxPanelHeight", "genome","setSleepInterval", "collapse")): + continue + if line.startswith(tuple(params.igv_options)) and not line.startswith("sort"): + continue + handle.write(line) + + batch_script.close() + + output_touch = open(output.dispatched_batch_script, "w") + output_touch.close() + +def _evaluate_batches(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_output = checkpoints._igv_create_batch_script_per_variant.get(**wildcards).output.variant_batch + + if wildcards.sample_id in list(CFG["runs"]["tumour_sample_id"]): + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + maf = expand( + str(rules._igv_filter_maf.output.maf), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + tumour_id = wildcards.sample_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status + ) + + maf_table = pd.read_table(maf[0], comment = "#", sep="\t") + + if wildcards.sample_id in list(CFG["runs"]["normal_sample_id"]): + these_samples = 
op.filter_samples(CFG["runs"], normal_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build, pair_status="matched") # only get MAFs from matched tumour + normal combos + + mafs = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = these_samples["tumour_seq_type"], + genome_build = these_samples["tumour_genome_build"], + tumour_id = these_samples["tumour_sample_id"], + normal_sample_id = these_samples["normal_sample_id"], + pair_status = these_samples["pair_status"] + ) + + maf_table = pd.concat([pd.read_table(m, comment="#", sep="\t") for m in mafs]) + + return expand( + expand( + str(rules._igv_batches_to_merge.output.dispatched_batch_script), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + allow_missing = True + ), + preset = wildcards.preset, + allow_missing = True + ) + +IGV_VERSION = CFG["options"]["igv_download_path"] +IGV_VERSION = IGV_VERSION.split("/")[-1].replace(".zip","") + +rule _igv_download_igv: + output: + igv_zip = CFG["dirs"]["igv"] + IGV_VERSION + ".zip", + igv_installed = CFG["dirs"]["igv"] + IGV_VERSION + ".installed" + params: + igv = CFG["options"]["igv_download_path"] + conda: + CFG["conda_envs"]["wget"] + log: + stdout = CFG["logs"]["igv"] + "download/igv_" + IGV_VERSION + "_download.stdout.log", + stderr = CFG["logs"]["igv"] + "download/igv_" + IGV_VERSION + "_download.stderr.log" + shell: + op.as_one_line(""" + wget -O {output.igv_zip} {params.igv} && + unzip {output.igv_zip} -d $(dirname {output.igv_zip}) > {log.stdout} 2> {log.stderr} && + touch {output.igv_installed} + """) + +checkpoint _igv_run: + input: + igv = ancient(str(rules._igv_download_igv.output.igv_installed)), + finished_batches = str(rules._igv_create_batch_script_per_variant.output.finished), + batch_script = _evaluate_batches + output: + complete = CFG["dirs"]["snapshots"] + "completed/{seq_type}--{genome_build}/{preset}/{sample_id}.completed" + params: + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}" + SUFFIX + ".batch", + igv = CFG["dirs"]["igv"] + IGV_VERSION + "/igv.sh", + sleep_time = CFG["options"]["generate_batch_script"]["sleep_timer"], + server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", + server_args = CFG["options"]["xvfb_parameters"]["server_args"] + resources: + **CFG["resources"]["_igv_run"] + threads: (workflow.cores) + log: + stdout = CFG["logs"]["igv"] + "igv_run/{seq_type}--{genome_build}/{preset}/{sample_id}.stdout.log", + stderr = CFG["logs"]["igv"] + "igv_run/{seq_type}--{genome_build}/{preset}/{sample_id}.stderr.log" + shell: + op.as_one_line(""" + lines=$(wc -l < {params.merged_batch}) ; + if [ $lines -gt 0 ] ; + then + if ! grep -q -e "exit" {params.merged_batch} ; + then + echo 'exit' >> {params.merged_batch} ; + fi ; + maxtime=$(($(wc -l < {params.merged_batch}) * 10 + 15 + {params.sleep_time})) ; + timeout $maxtime xvfb-run -s "-screen 0 1920x1080x24" {params.server_number} {params.server_args} {params.igv} -b {params.merged_batch} > {log.stdout} 2> {log.stderr} && touch {output.complete} ; + exit=$? 
; + if [ $exit -ne 0 ] ; + then + if grep -q -e "No such process" {log.stderr} && grep -q -e "Executing Command: exit" {log.stdout} ; + then + echo "All IGV batch script commands have completed successfully, but an Xvfb-run kill error has occurred." >> {log.stdout} && touch {output.complete} ; + else + false ; + fi ; + fi ; + else + echo 'Skipping sample {wildcards.sample_id} because it either has no variants to snapshot or all variants have already been snapshot.' >> {log.stdout} ; + touch {output.complete} ; + fi + """) + +rule _igv_track_failed: + output: + failed_summary = CFG["dirs"]["outputs"] + "snapshot_summaries/quality_control/qc_summary_" + LAUNCH_DATE + ".txt" + run: + header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position","preset","status","snapshot_path"]) + with open(output.failed_summary, "w") as handle: + handle.write(header + "\n") + +rule _igv_quality_control: + input: + igv = str(rules._igv_run.output.complete), + failed_summary = ancient(str(rules._igv_track_failed.output.failed_summary)), + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png" + output: + snapshot_qc = CFG["dirs"]["snapshots"] + "qc/{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".qc" + params: + batch_script = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch", + merged_batch = CFG["dirs"]["batch_scripts"] + "merged_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}" + SUFFIX + ".batch", + igv = CFG["dirs"]["igv"] + IGV_VERSION + "/igv.sh", + server_number = "-n " + CFG["options"]["xvfb_parameters"]["server_number"] if CFG["options"]["xvfb_parameters"]["server_number"] is not None else "--auto-servernum", + server_args = CFG["options"]["xvfb_parameters"]["server_args"], + batch_temp = CFG["dirs"]["batch_scripts"] + "single_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch.temp", + thresholds = CFG["options"]["quality_control"] + resources: + **CFG["resources"]["_igv_quality_control"] + threads: (workflow.cores) + log: + stdout = CFG["logs"]["snapshots"] + "quality_control/{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".stdout.log", + stderr = CFG["logs"]["snapshots"] + "quality_control/{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".stderr.log" + script: + config["lcr-modules"]["igv"]["scripts"]["quality_control"] + +rule _igv_symlink_snapshot: + input: + snapshot = CFG["dirs"]["snapshots"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png", + snapshot_qc = str(rules._igv_quality_control.output.snapshot_qc) + output: + snapshot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{tissue_status}/{preset}/{chromosome}/{chromosome}:{start_position}--{gene}--{ref_allele}_{alt_allele}--{sample_id}" + SUFFIX + ".png" + resources: + **CFG["resources"]["_igv_symlink"] + run: + op.relative_symlink(input.snapshot, output.snapshot) + +def 
_symlink_snapshot(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + + if wildcards.sample_id in list(CFG["runs"]["tumour_sample_id"]): + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + maf = expand( + str(rules._igv_filter_maf.output.maf), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + tumour_id = wildcards.sample_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status + ) + + maf_table = pd.read_table(maf[0], comment = "#", sep="\t") + + tumour_snaps = expand( + expand( + str(rules._igv_symlink_snapshot.output.snapshot), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + ref_allele = maf_table["Reference_Allele"], + alt_allele = maf_table["Tumor_Seq_Allele2"], + sample_id = maf_table["Tumor_Sample_Barcode"], + allow_missing = True + ), + genome_build = wildcards.genome_build, + seq_type = wildcards.seq_type, + preset = wildcards.preset, + tissue_status = "tumour" + ) + + return tumour_snaps + + if wildcards.sample_id in list(CFG["runs"]["normal_sample_id"]): + these_samples = op.filter_samples(CFG["runs"], normal_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build, pair_status="matched") + + mafs = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = these_samples["tumour_seq_type"], + genome_build = these_samples["tumour_genome_build"], + tumour_id = these_samples["tumour_sample_id"], + normal_sample_id = these_samples["normal_sample_id"], + pair_status = these_samples["pair_status"] + ) + + maf_table = pd.concat([pd.read_table(m, comment="#", sep="\t") for m in mafs]) + + normal_snaps = expand( + expand( + str(rules._igv_symlink_snapshot.output.snapshot), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + ref_allele = maf_table["Reference_Allele"], + alt_allele = maf_table["Reference_Allele"], + sample_id = maf_table["Matched_Norm_Sample_Barcode"], + allow_missing = True + ), + genome_build = wildcards.genome_build, + seq_type = wildcards.seq_type, + preset = wildcards.preset, + tissue_status = "normal" + ) + + return normal_snaps + +def _quality_control(wildcards): + CFG = config["lcr-modules"]["igv"] + checkpoint_outputs = checkpoints._igv_run.get(**wildcards).output.complete + + if wildcards.sample_id in list(CFG["runs"]["tumour_sample_id"]): + this_sample = op.filter_samples(CFG["runs"], tumour_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build) + normal_sample_id = this_sample["normal_sample_id"] + pair_status = this_sample["pair_status"] + + maf = expand( + str(rules._igv_filter_maf.output.maf), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + tumour_id = wildcards.sample_id, + normal_sample_id = normal_sample_id, + pair_status = pair_status + ) + + maf_table = pd.read_table(maf[0], comment = "#", sep="\t") + + tumour_snaps = expand( + expand( + str(rules._igv_quality_control.output.snapshot_qc), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + 
ref_allele = maf_table["Reference_Allele"], + alt_allele = maf_table["Tumor_Seq_Allele2"], + sample_id = maf_table["Tumor_Sample_Barcode"], + allow_missing = True + ), + genome_build = wildcards.genome_build, + seq_type = wildcards.seq_type, + preset = wildcards.preset, + tissue_status = "tumour" + ) + + return tumour_snaps + + if wildcards.sample_id in list(CFG["runs"]["normal_sample_id"]): + these_samples = op.filter_samples(CFG["runs"], normal_sample_id=wildcards.sample_id, tumour_seq_type = wildcards.seq_type, tumour_genome_build = wildcards.genome_build, pair_status="matched") + + mafs = expand( + str(rules._igv_filter_maf.output.maf), + zip, + seq_type = these_samples["tumour_seq_type"], + genome_build = these_samples["tumour_genome_build"], + tumour_id = these_samples["tumour_sample_id"], + normal_sample_id = these_samples["normal_sample_id"], + pair_status = these_samples["pair_status"] + ) + + maf_table = pd.concat([pd.read_table(m, comment="#", sep="\t") for m in mafs]) + + normal_snaps = expand( + expand( + str(rules._igv_quality_control.output.snapshot_qc), + zip, + chromosome = maf_table["chr_std"], + start_position = maf_table["Start_Position"], + gene = maf_table["Hugo_Symbol"], + ref_allele = maf_table["Reference_Allele"], + alt_allele = maf_table["Reference_Allele"], + sample_id = maf_table["Matched_Norm_Sample_Barcode"], + allow_missing = True + ), + genome_build = wildcards.genome_build, + seq_type = wildcards.seq_type, + preset = wildcards.preset, + tissue_status = "normal" + ) + + return normal_snaps + +rule _igv_check_snapshots: + input: + snapshots = _symlink_snapshot, + quality_control = _quality_control + output: + complete = CFG["dirs"]["outputs"] + "completed/check_snapshots/{preset}/{seq_type}--{genome_build}/{sample_id}.checked" + shell: + "touch {output.complete}" + +def _get_finished_samples(wildcards): + if wildcards.pair_status == "matched": + tumour_complete = ancient( + expand( + str(rules._igv_check_snapshots.output.complete), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + sample_id = wildcards.tumour_id, + preset = wildcards.preset + ) + ) + + normal_complete = ancient( + expand( + str(rules._igv_check_snapshots.output.complete), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + sample_id = wildcards.normal_sample_id, + preset = wildcards.preset + ) + ) + + return (tumour_complete + normal_complete) + + else: + return ancient( + expand( + str(rules._igv_check_snapshots.output.complete), + seq_type = wildcards.seq_type, + genome_build = wildcards.genome_build, + sample_id = wildcards.tumour_id, + preset = wildcards.preset + ) + ) + +rule _igv_check_samples: + input: + igv_completed = _get_finished_samples, + output: + checked = CFG["dirs"]["outputs"] + "completed/check_samples/{preset}/{seq_type}--{genome_build}/{tumour_id}--{normal_sample_id}--{pair_status}.completed" + shell: + "touch {output.checked}" + +##### Rules below will only run if CFG["estimate_only"] set to True + +rule _igv_setup_estimates: + output: + snapshot_summary = CFG["dirs"]["outputs"] + "snapshot_summaries/estimates/snapshot_summary_" + LAUNCH_DATE + ".txt", + snapshot_estimate = CFG["dirs"]["outputs"] + "snapshot_summaries/estimates/snapshot_estimate_" + LAUNCH_DATE + ".txt" + run: + header = "\t".join(["sample_id","seq_type","genome_build","gene","chromosome","position", "igv_preset"]) + with open(output.snapshot_summary,"w") as handle: + handle.write(header + "\n") + with open(output.snapshot_estimate, "w") as handle: + 
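+            # Both summary files share the tab-separated header written below:
+            # snapshot_summary_<date>.txt lists every snapshot the run would take,
+            # while snapshot_estimate_<date>.txt keeps only variants whose
+            # dispatched batch script does not yet exist. A quick way to count the
+            # remaining snapshots afterwards (sketch; the date suffix is illustrative):
+            #
+            #   pd.read_table("snapshot_estimate_2024-01-01.txt").shape[0]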
+##### Rules below will only run if CFG["estimate_only"] is set to True
+
+rule _igv_setup_estimates:
+    output:
+        snapshot_summary = CFG["dirs"]["outputs"] + "snapshot_summaries/estimates/snapshot_summary_" + LAUNCH_DATE + ".txt",
+        snapshot_estimate = CFG["dirs"]["outputs"] + "snapshot_summaries/estimates/snapshot_estimate_" + LAUNCH_DATE + ".txt"
+    run:
+        header = "\t".join(["sample_id", "seq_type", "genome_build", "gene", "chromosome", "position", "igv_preset"])
+        with open(output.snapshot_summary, "w") as handle:
+            handle.write(header + "\n")
+        with open(output.snapshot_estimate, "w") as handle:
+            handle.write(header + "\n")
+
+rule _igv_estimate_snapshots:
+    input:
+        maf = _get_maf,
+        snapshot_summary = str(rules._igv_setup_estimates.output.snapshot_summary),
+        snapshot_estimate = str(rules._igv_setup_estimates.output.snapshot_estimate)
+    output:
+        complete = temp(CFG["dirs"]["batch_scripts"] + "estimate_batch_scripts/{seq_type}--{genome_build}/{preset}/{sample_id}.temp")
+    threads: workflow.cores  # claim all cores so these jobs run one at a time and appends to the shared summary files do not interleave
+    run:
+        CFG = config["lcr-modules"]["igv"]
+        snapshot_summary = open(input.snapshot_summary, "a")
+        snapshot_estimate = open(input.snapshot_estimate, "a")
+
+        maf_table = pd.concat([pd.read_table(file, sep="\t", comment="#") for file in input.maf])
+
+        seq_type = wildcards.seq_type
+        genome_build = wildcards.genome_build
+        sample_id = wildcards.sample_id
+        preset = wildcards.preset
+
+        for index, row in maf_table.iterrows():
+            gene = row["Hugo_Symbol"]
+            chromosome = row["chr_std"]
+            start_position = str(row["Start_Position"])
+
+            dispatch_path = CFG["dirs"]["batch_scripts"] + f"dispatched_batch_scripts/{seq_type}--{genome_build}/{preset}/{chromosome}:{start_position}--{gene}--{sample_id}" + SUFFIX + ".batch"
+
+            outline = "\t".join([sample_id, seq_type, genome_build, gene, chromosome, start_position, preset])
+            snapshot_summary.write(outline + "\n")
+
+            # Only variants without an existing dispatched batch script still need a snapshot
+            if not os.path.exists(dispatch_path):
+                snapshot_estimate.write(outline + "\n")
+
+        finished = open(output.complete, "w")
+        finished.close()
+        snapshot_summary.close()
+        snapshot_estimate.close()
+
+def _check_estimates(wildcards):
+    if wildcards.pair_status == "matched":
+        tumour_complete = ancient(
+            expand(
+                str(rules._igv_estimate_snapshots.output.complete),
+                seq_type=wildcards.seq_type,
+                genome_build=wildcards.genome_build,
+                sample_id=wildcards.tumour_id,
+                preset=wildcards.preset
+            )
+        )
+
+        normal_complete = ancient(
+            expand(
+                str(rules._igv_estimate_snapshots.output.complete),
+                seq_type=wildcards.seq_type,
+                genome_build=wildcards.genome_build,
+                sample_id=wildcards.normal_sample_id,
+                preset=wildcards.preset
+            )
+        )
+
+        return (tumour_complete + normal_complete)
+
+    else:
+        return ancient(
+            expand(
+                str(rules._igv_estimate_snapshots.output.complete),
+                seq_type=wildcards.seq_type,
+                genome_build=wildcards.genome_build,
+                sample_id=wildcards.tumour_id,
+                preset=wildcards.preset
+            )
+        )
+
+rule _igv_check_estimates:
+    input:
+        estimates_completed = _check_estimates
+    output:
+        sample_estimated = temp(CFG["dirs"]["outputs"] + "snapshot_summaries/temp/{seq_type}--{genome_build}/{preset}/{tumour_id}--{normal_sample_id}--{pair_status}.temp")
+    shell:
+        "touch {output.sample_estimated}"
+
+if CFG["estimate_only"] is False:
+    rule _igv_all:
+        input:
+            expand(
+                expand(
+                    str(rules._igv_check_samples.output.checked),
+                    zip,
+                    tumour_id=CFG["runs"]["tumour_sample_id"],
+                    normal_sample_id=CFG["runs"]["normal_sample_id"],
+                    seq_type=CFG["runs"]["tumour_seq_type"],
+                    genome_build=CFG["runs"]["tumour_genome_build"],
+                    pair_status=CFG["runs"]["pair_status"],
+                    allow_missing=True
+                ),
+                preset=CFG["options"]["igv_presets"]
+            )
+
+if CFG["estimate_only"] is True:
+    rule _igv_all:
+        input:
+            expand(
+                expand(
+                    str(rules._igv_check_estimates.output.sample_estimated),
+                    zip,
+                    tumour_id=CFG["runs"]["tumour_sample_id"],
+                    normal_sample_id=CFG["runs"]["normal_sample_id"],
+                    seq_type=CFG["runs"]["tumour_seq_type"],
+                    genome_build=CFG["runs"]["tumour_genome_build"],
+                    pair_status=CFG["runs"]["pair_status"],
+                    allow_missing=True
+                ),
+                preset=CFG["options"]["igv_presets"]
+            )
+
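+# The nested expand() calls above resolve in two passes: the inner expand zips
+# the per-run wildcards so each tumour/normal pair stays matched, and the outer
+# expand crosses those targets with every requested preset. For example
+# (hypothetical IDs), runs (T1, N1) and (T2, N2) with presets
+# ["default", "pairs"] yield four targets: T1--N1 and T2--N2, each under both
+# presets.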
+
+##### CLEANUP #####
+
+# Perform some clean-up tasks, including storing the module-specific
+# configuration on disk and deleting the `CFG` variable
+op.cleanup_module(CFG)
diff --git a/modules/igv/1.0/schemas/base-1.0.yaml b/modules/igv/1.0/schemas/base-1.0.yaml
new file mode 120000
index 000000000..0a69d1ceb
--- /dev/null
+++ b/modules/igv/1.0/schemas/base-1.0.yaml
@@ -0,0 +1 @@
+../../../../schemas/base/base-1.0.yaml
\ No newline at end of file
diff --git a/modules/igv/CHANGELOG.md b/modules/igv/CHANGELOG.md
new file mode 100644
index 000000000..0e9cd0c38
--- /dev/null
+++ b/modules/igv/CHANGELOG.md
@@ -0,0 +1,46 @@
+# Changelog
+
+All notable changes to the `igv` module will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.0] - 2023-01-10
+
+This release was authored by Manuela Cruz.
+
+- This module requires four file types:
+  * A regions file containing the regions to snapshot, in BED format, MAF format, mutation_id format (mutations given as "{chromosome}:{start_position}:{end_position}"), or as OncodriveCLUSTL / HotMAPS results files. HotMAPS and OncodriveCLUSTL results are preformatted by scripts executed in the last steps of their respective lcr-modules.
+  * BAM files and
+  * BAI files for each sample, used to generate the IGV snapshots
+  * MAF files for each sample, used to determine which variants fall within the regions of interest and to extract the corresponding Chromosome, Start_Position and Hugo_Symbol values
+
+- BAM and BAI file locations are resolved from the sample_id and the corresponding genome_build and seq_type metadata values set in CFG["samples"]
+
+- Because genome build values other than `grch37` and `hg38` can occur in the metadata genome_build column, add any such values to the appropriate `grch37` or `hg38` list in the config at ["options"]["genome_map"] so that the correct MAF paths are resolved for each sample.
+
+- Regions files must be entered in the config under the key matching their genome build. This allows liftover to be performed when necessary, so that the workflow can correctly filter sample MAFs whose genome build differs from that of the regions file.
+
+- IGV batch scripts are created for each individual variant of interest. This is a checkpoint rule because its outputs depend on the contents of the filtered MAF files.
+
+Creation of IGV snapshots:
+
+- The individual IGV batch scripts are appended to a single large "merged" batch script for each sample_id, and an empty "dispatched" file is created for each individual batch script. Importantly, this rule only runs if the dispatched file does not yet exist, which prevents variants that have already been snapshotted from being appended and rerun.
+
+- IGV is then run on each sample's merged batch script. This is also a checkpoint rule, as the snapshots that will be created depend on the contents of the merged batch scripts.
+
+- IGV snapshot presets can be defined in the config under ["options"]["generate_batch_script"]["igv_options"]. Two presets are already defined, `default` and `pairs`. To specify which presets to create images for in the analysis, define them in list format in the config at ["options"]["igv_presets"]. More information on preset options is available here: https://github.com/igvteam/igv/wiki/Batch-commands
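+
+As a rough illustration only (this is not the literal output of `etc/generate_batch_script_per_variant.py`; the paths, IDs, coordinates and option order are hypothetical), a per-variant batch script under the `default` preset might look like:
+
+```
+new
+genome hg38
+maxPanelHeight 1000
+snapshotDirectory /path/to/snapshots
+load /path/to/TUMOUR_SAMPLE.bam
+preference SAM.COLOR_BY READ_STRAND
+preference SAM.SHOW_CENTER_LINE TRUE
+goto chr3:1000100-1000300
+sort
+snapshot chr3:1000200--GENE--TUMOUR_SAMPLE.png
+exit
+```
+
+Here the `goto` window corresponds to the configured padding of 100 bp on either side of the variant position.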
+
+Blank or truncated snapshots:
+
+- Truncated or blank snapshots can occur during the IGV run. The quality control rule checks for blank snapshots based on the kurtosis and skewness values of the snapshot, and for truncated snapshots based on the height and width. Multiple attempts are made to resolve affected snapshots; if they remain unresolved, the quality control rule fails. Note that the thresholds for blank and truncated snapshots were determined for IGV image dimensions of 1920x1080x24, so modifying the image dimensions may cause blank or truncated snapshots to be missed during quality control.
+
+- Increasing the milliseconds set in CFG["options"]["generate_batch_script"]["sleep_timer"] typically reduces the number of snapshots with issues but increases workflow run time.
+
+- A summary file of failed or suspicious snapshots is created for each run in `99-outputs/snapshot_summaries/quality_control/`. Failed snapshots match specific height thresholds known to belong to blank or truncated snapshots. Suspicious snapshots are those that reached the limit of rerun attempts during quality control. These may be blank or truncated, or they may be regular snapshots that display only a few reads and entered the quality control process because of their short heights.
+
+Estimating snapshots:
+
+- To estimate the number of IGV snapshots that will be created, the config parameter ["estimate_only"] can be set to True. Summary files are created in `99-outputs/snapshot_summaries/estimates/` based on the individual batch scripts that have been created and do not have pre-existing "dispatched" files.
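+
+To make the blank/truncated heuristics above concrete, the following is a minimal sketch of the kind of check performed, assuming the thresholds from the default config. It is not the actual `etc/quality_control.py` implementation, and the function and variable names are hypothetical:
+
+```python
+import numpy as np
+from PIL import Image
+from scipy.stats import kurtosis, skew
+
+TRUNCATED_HEIGHTS = {506, 533, 545, 547, 559, 570}  # previously observed truncated heights
+BLANK_STATS = {547: (18.5, -4), 559: (18.2, -4)}    # height: (kurtosis, skewness) of blank snapshots
+FAILED_HEIGHTS = {506, 533}                         # heights of snapshots that fail IGV
+
+def classify_snapshot(png_path):
+    """Return "failed", "suspicious" or "ok" for one snapshot PNG."""
+    img = np.asarray(Image.open(png_path).convert("L"), dtype=float)
+    height = img.shape[0]
+    if height in FAILED_HEIGHTS:
+        return "failed"
+    if height in BLANK_STATS:
+        k_ref, s_ref = BLANK_STATS[height]
+        # A blank snapshot is almost uniformly white, so its pixel distribution
+        # shows extreme kurtosis and a strongly negative skew
+        if kurtosis(img, axis=None) >= k_ref and skew(img, axis=None) <= s_ref:
+            return "suspicious"
+    if height in TRUNCATED_HEIGHTS:
+        return "suspicious"
+    return "ok"
+```
+
+In the module, snapshots flagged in this way are retried several times before being reported in the quality control summary file described above.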