From fda4b0a1aa1f3dec63ce3aab78851a747f9de8e9 Mon Sep 17 00:00:00 2001
From: lkhilton
Date: Mon, 12 Sep 2022 10:28:06 -0700
Subject: [PATCH 01/14] Improve maf subsetting; use pre-filled Battenberg results

---
 modules/phylowgs/1.0/phylowgs.smk | 547 ++++++++++++++++++------------
 1 file changed, 323 insertions(+), 224 deletions(-)

diff --git a/modules/phylowgs/1.0/phylowgs.smk b/modules/phylowgs/1.0/phylowgs.smk
index 44028ef3..1cdba641 100644
--- a/modules/phylowgs/1.0/phylowgs.smk
+++ b/modules/phylowgs/1.0/phylowgs.smk
@@ -15,269 +15,368 @@
 # Import package with useful functions for developing analysis modules
 import oncopipe as op
 import hashlib
+import glob
 
 # Setup module and store module-specific configuration in `CFG`
 # `CFG` is a shortcut to `config["lcr-modules"]["phylowgs"]`
 CFG = op.setup_module(
-    name = "phylowgs",
-    version = "1.0",
-    subdirectories = ["inputs", "preprocess_battenberg", "preprocess_inputs", "multievolve", "results", "outputs"],
+    name = "phylowgs",
+    version = "1.0",
+    subdirectories = ["inputs", "maf_to_vcf", "preprocess_battenberg", "preprocess_inputs", "multievolve", "results", "outputs"],
 )
 
 # Define rules to be run locally when using a compute cluster
 localrules:
-    _phylowgs_input_vcf,
-    _phylowgs_input_battenberg,
-    _phylowgs_output_html,
-    _phylowgs_all
+    _phylowgs_input_maf,
+    _phylowgs_input_battenberg,
+    _phylowgs_process_output,
+    _phylowgs_output_plots,
+    _phylowgs_all,
+    _phylowgs_priority_ssms
 
-# Generate a de-duplicated table of patient_ids etc.
+# Generate a de-duplicated table of patient_ids etc.
 PATIENTS = CFG["runs"][["tumour_patient_id", "normal_patient_id", "tumour_genome_build", "tumour_seq_type", "tumour_sex"]].drop_duplicates(subset = None, ignore_index = True)
 
 # Obtain the path to the phylowgs conda environment
 md5hash = hashlib.md5()
-if workflow.conda_prefix:
-    conda_prefix = workflow.conda_prefix
-else:
-    conda_prefix = os.path.abspath(".snakemake/conda")
+if workflow.conda_prefix:
+    conda_prefix = workflow.conda_prefix
+else:
+    conda_prefix = os.path.abspath(".snakemake/conda")
md5hash.update(conda_prefix.encode())
 f = open(CFG['conda_envs']['phylowgs'], 'rb')
 md5hash.update(f.read())
 f.close()
 h = md5hash.hexdigest()
-PHYLO = conda_prefix + "/" + h[:8] + "/share/phylowgs/"
+PHYLO = "".join(glob.glob(conda_prefix + "/" + h[:8] + "*/share/phylowgs/"))
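# The assignment above reproduces Snakemake's conda-environment naming scheme:
# the environment directory name begins with the first 8 hex characters of an
# MD5 digest of the conda prefix plus the environment YAML contents. A minimal
# standalone sketch of the same lookup (the function name and arguments are
# illustrative, not part of the module):
import glob
import hashlib
import os

def locate_phylowgs_share(conda_prefix, env_yaml):
    digest = hashlib.md5()
    digest.update(conda_prefix.encode())
    with open(env_yaml, "rb") as env:
        digest.update(env.read())
    # The trailing "*" mirrors the glob above: newer Snakemake releases append
    # a suffix after the 8-character hash, so an exact match cannot be assumed.
    hits = glob.glob(os.path.join(conda_prefix, digest.hexdigest()[:8] + "*", "share", "phylowgs", ""))
    return hits[0] if hits else None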
+
+##### FUNCTIONS #####
+
+# Input function to get all MAFs per patient
+def get_input_mafs(wildcards):
+    CFG = config["lcr-modules"]["phylowgs"]
+    PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"])
+    inputs = expand(
+        [
+            str(rules._phylowgs_input_maf.output.maf)
+        ],
+        zip,
+        tumour_id = PATIENT["tumour_sample_id"],
+        normal_id = PATIENT["normal_sample_id"],
+        pair_status = PATIENT["pair_status"],
+        allow_missing = True
+    )
+    return(inputs)
+
+def get_maf_cli(wildcards):
+    CFG = config["lcr-modules"]["phylowgs"]
+    PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"])
+    inputs = expand(
+        [
+            str(rules._phylowgs_input_maf.output.maf)
+        ],
+        zip,
+        tumour_id = PATIENT["tumour_sample_id"],
+        normal_id = PATIENT["normal_sample_id"],
+        pair_status = PATIENT["pair_status"],
+        genome_build = PATIENT["tumour_genome_build"],
+        seq_type = PATIENT["tumour_seq_type"]
+    )
+    cli = ",".join([str(elem) for elem in inputs])
+    return(cli)
+
+# Define the order of sample labels by time point
+def order_samples(wildcards):
+    CFG = config["lcr-modules"]["phylowgs"]
+    PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id)
+    samples = str(",".join(PATIENT.sort_values(by = ["tumour_time_point"]).tumour_sample_id.tolist()))
+    return(samples)
+# Expand the input files to create a command-line argument for create_phylowgs_inputs.py
+def create_phylowgs_inputs_cli(wildcards):
+    CFG = config["lcr-modules"]["phylowgs"]
+    PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"])
+    cnvs = expand(
+        "--cnvs {time_point}=" + CFG['dirs']['preprocess_battenberg'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.cnvs.txt",
+        zip,
+        time_point = PATIENT["tumour_time_point"],
+        tumour_id = PATIENT["tumour_sample_id"],
+        normal_id = PATIENT["normal_sample_id"],
+        pair_status = PATIENT["pair_status"],
+        seq_type = PATIENT["tumour_seq_type"],
+        genome_build = PATIENT["tumour_genome_build"],
+        patient_id = PATIENT["tumour_patient_id"]
+    )
+    vcf_types = expand(
+        "--vcf-type {time_point}=mutect_smchet",
+        zip,
+        time_point = PATIENT["tumour_time_point"]
+    )
+    vcfs = expand(
+        "{time_point}=" + CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf.gz",
+        zip,
+        time_point = PATIENT["tumour_time_point"],
+        tumour_id = PATIENT["tumour_sample_id"],
+        normal_id = PATIENT["normal_sample_id"],
+        pair_status = PATIENT["pair_status"],
+        seq_type = PATIENT["tumour_seq_type"],
+        genome_build = PATIENT["tumour_genome_build"],
+        patient_id = PATIENT["tumour_patient_id"]
+    )
+    cli = cnvs + vcf_types + vcfs
+    cli = " ".join([str(elem) for elem in cli])
+    return(cli)
+
+# Input function to pull in input VCF and preprocessed CNV data
+def create_phylowgs_inputs(wildcards):
+    CFG = config["lcr-modules"]["phylowgs"]
+    PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"])
+    inputs = expand(
+        [
+            CFG["dirs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.cnvs.txt",
+            CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf.gz"
+        ],
+        zip,
+        tumour_id = PATIENT["tumour_sample_id"],
+        normal_id = PATIENT["normal_sample_id"],
+        pair_status = PATIENT["pair_status"],
+        allow_missing = True
+    )
+    return(inputs)
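# A toy, self-contained illustration of the argument string the helpers above
# assemble for create_phylowgs_inputs.py: one "--cnvs" flag, one "--vcf-type"
# flag, and one positional VCF argument per time point. The patient, sample,
# and path names below are invented; real values come from CFG["runs"] and
# CFG["dirs"]:
time_points = ["T1", "T2"]
tumours = ["PT01_T1", "PT01_T2"]
cnvs = [f"--cnvs {tp}={t}--PT01_normal--matched.cnvs.txt" for tp, t in zip(time_points, tumours)]
vcf_types = [f"--vcf-type {tp}=mutect_smchet" for tp in time_points]
vcfs = [f"{tp}={t}--PT01_normal--matched.maf_to.vcf.gz" for tp, t in zip(time_points, tumours)]
cli = " ".join(cnvs + vcf_types + vcfs)
# cli == "--cnvs T1=... --cnvs T2=... --vcf-type T1=mutect_smchet --vcf-type T2=mutect_smchet T1=... T2=..."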
"battenberg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--matched.subclones.txt" - run: - op.absolute_symlink(input.cellularity, output.cellularity) - op.absolute_symlink(input.subclones, output.subclones) - - -# Preprocess the battenberg file to match requirements -rule _phylowgs_preprocess_battenberg: - input: - cellularity = str(rules._phylowgs_input_battenberg.output.cellularity), - subclones = str(rules._phylowgs_input_battenberg.output.subclones) - output: - txt = CFG["dirs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--matched.cnvs.txt" - log: - stderr = CFG["logs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--matched.preprocess_battenberg.stderr.log", - stdout = CFG["logs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--matched.preprocess_battenberg.stdout.log" - params: - script = PHYLO + "parser/parse_cnvs.py" - conda: - CFG["conda_envs"]["phylowgs"] - threads: - CFG["threads"]["create_inputs"] - resources: - **CFG["resources"]["create_inputs"] - shell: - op.as_one_line(""" - cellularity=$(tail -n +2 {input.cellularity} | cut -f 1); - python2 {params.script} -f battenberg-smchet -c $cellularity --cnv-output {output.txt} {input.subclones} - 2> {log.stderr} > {log.stdout} - """) - -# Expand the input files to create a command-line argument for create_phylowgs_inputs.py -def create_phylowgs_inputs_cli(wildcards): - CFG = config["lcr-modules"]["phylowgs"] - PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id) - cnvs = expand( - "--cnvs {time_point}=" + CFG['dirs']['preprocess_battenberg'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--matched.cnvs.txt", - zip, - time_point = PATIENT["tumour_time_point"], - tumour_id = PATIENT["tumour_sample_id"], - normal_id = PATIENT["normal_sample_id"], - seq_type = PATIENT["tumour_seq_type"], - genome_build = PATIENT["tumour_genome_build"], - patient_id = PATIENT["tumour_patient_id"] - ) - vcf_types = expand( - "--vcf-type {time_point}=" + CFG['options']['create_inputs']['vcf_type'], - zip, - time_point = PATIENT["tumour_time_point"] - ) - vcfs = expand( - "{time_point}=" + CFG['dirs']['inputs'] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--matched.vcf.gz", - zip, - time_point = PATIENT["tumour_time_point"], - tumour_id = PATIENT["tumour_sample_id"], - normal_id = PATIENT["normal_sample_id"], - seq_type = PATIENT["tumour_seq_type"], - genome_build = PATIENT["tumour_genome_build"], - patient_id = PATIENT["tumour_patient_id"] - ) - cli = cnvs + vcf_types + vcfs - cli = " ".join([str(elem) for elem in cli]) - return(cli) - -# Input function to pull in input VCF and preprocessed CNV data -def create_phylowgs_inputs(wildcards): - CFG = config["lcr-modules"]["phylowgs"] - PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id) - inputs = expand( - [ - CFG["dirs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--matched.cnvs.txt", - CFG['dirs']['inputs'] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--matched.vcf.gz" - ], - zip, - tumour_id = PATIENT["tumour_sample_id"], - normal_id = PATIENT["normal_sample_id"], - allow_missing = True - ) - return(inputs) + input: + cellularity = CFG["inputs"]["cellularity"], + subclones = CFG["inputs"]["subclones"] + output: + cellularity = CFG["dirs"]["inputs"] + 
"battenberg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.cellularity_ploidy.txt", + subclones = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.subclones.txt" + run: + op.absolute_symlink(input.cellularity, output.cellularity) + op.absolute_symlink(input.subclones, output.subclones) + + +rule _phylowgs_parse_battenberg: + input: + cellularity = str(rules._phylowgs_input_battenberg.output.cellularity), + subclones = str(rules._phylowgs_input_battenberg.output.subclones) + output: + txt = CFG["dirs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.cnvs.txt" + log: + stderr = CFG["logs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.preprocess_battenberg.stderr.log", + stdout = CFG["logs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.preprocess_battenberg.stdout.log" + params: + script = PHYLO + "parser/parse_cnvs.py" + conda: + CFG["conda_envs"]["phylowgs"] + threads: + CFG["threads"]["create_inputs"] + resources: + **CFG["resources"]["create_inputs"] + shell: + op.as_one_line(""" + cellularity=$(tail -n +2 {input.cellularity} | cut -f 1); + python2 {params.script} -f battenberg-smchet -c $cellularity --cnv-output {output.txt} {input.subclones} + 2> {log.stderr} > {log.stdout} + """) + +# Convert the input maf file to a vcf file +rule _phylowgs_maf_to_vcf: + input: + maf = str(rules._phylowgs_input_maf.output.maf), + fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") + output: + vcf = temp(CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf") + conda: + CFG["conda_envs"]["vcf2maf"] + shell: + op.as_one_line(""" + maf2vcf.pl --input-maf {input.maf} --output-dir $(dirname {output.vcf}) --output-vcf {output.vcf} --ref-fasta {input.fasta} + """) + +rule _phylowgs_priority_ssms: + input: + mafs = get_input_mafs + output: + ssms = CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/coding_ssms.txt" + params: + noncoding = CFG["scripts"]["noncoding"] + conda: + CFG["conda_envs"]["coreutils"] + shell: + op.as_one_line(""" + grep -hvf {params.noncoding} {input.mafs} | awk '{{FS="\\t"}} {{OFS="_"}} {{print $5, $6}}' | sed 's/chr//g' > {output.ssms} + """) + +rule _phylowgs_bgzip_vcf: + input: + vcf = str(rules._phylowgs_maf_to_vcf.output.vcf) + output: + vcf = temp(CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf.gz"), + tbi = CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf.gz.tbi", + conda: + CFG["conda_envs"]["bcftools"] + shell: + op.as_one_line(""" + bcftools sort {input.vcf} | bcftools view -s "{wildcards.normal_id},{wildcards.tumour_id}" -i 'FMT/DP[0] > 0 && FMT/AD[0:1] > 1' -Oz -o {output.vcf} && tabix -p vcf {output.vcf} + """) # Preprocess vcf and battenberg inputs together rule _phylowgs_create_inputs: - input: - create_phylowgs_inputs - output: - ssms = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/ssm_data.txt", - cnvs = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/cnv_data.txt", - params = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/params.json" - log: - stderr = CFG["logs"]["preprocess_inputs"] 
+ "{seq_type}--{genome_build}/{patient_id}/create_phylowgs_inputs.stderr.log", - stdout = CFG["logs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/create_phylowgs_inputs.stdout.log" - params: - cli = create_phylowgs_inputs_cli, - opts = CFG["options"]["create_inputs"]["opts"], - sex = lambda w: config["lcr-modules"]["phylowgs"]["switches"]["sex"][op.filter_samples(PATIENTS, tumour_patient_id = w.patient_id)["tumour_sex"].values[0]] if op.filter_samples(PATIENTS, tumour_patient_id = w.patient_id)["tumour_sex"].values[0] in config["lcr-modules"]["phylowgs"]["switches"]["sex"].keys() else "auto", - script = PHYLO + "parser/create_phylowgs_inputs.py" - conda: - CFG["conda_envs"]["phylowgs"] - threads: - CFG["threads"]["create_inputs"] - resources: - **CFG["resources"]["create_inputs"] - shell: - op.as_one_line(""" - python2 {params.script} - --output-cnvs {output.cnvs} - --output-variants {output.ssms} - --output-params {output.params} - --sex {params.sex} - {params.opts} - {params.cli} - 2> {log.stderr} > {log.stdout} - """) + input: + create_phylowgs_inputs, + priority = str(rules._phylowgs_priority_ssms.output.ssms) + output: + ssms = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/ssm_data.txt", + cnvs = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/cnv_data.txt", + params = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/params.json" + log: + stderr = CFG["logs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/create_phylowgs_inputs.stderr.log", + stdout = CFG["logs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/create_phylowgs_inputs.stdout.log" + params: + cli = create_phylowgs_inputs_cli, + opts = CFG["options"]["create_inputs"]["opts"], + sex = lambda w: config["lcr-modules"]["phylowgs"]["switches"]["sex"][op.filter_samples(PATIENTS, tumour_patient_id = w.patient_id)["tumour_sex"].values[0]] if op.filter_samples(PATIENTS, tumour_patient_id = w.patient_id)["tumour_sex"].values[0] in config["lcr-modules"]["phylowgs"]["switches"]["sex"].keys() else "auto", + script = PHYLO + "parser/create_phylowgs_inputs.py" + conda: + CFG["conda_envs"]["phylowgs"] + threads: + CFG["threads"]["create_inputs"] + resources: + **CFG["resources"]["create_inputs"] + shell: + op.as_one_line(""" + python2 {params.script} + --output-cnvs {output.cnvs} + --output-variants {output.ssms} + --output-params {output.params} + --priority-ssms {input.priority} + --sex {params.sex} + {params.opts} + {params.cli} + 2> {log.stderr} > {log.stdout} + """) # Run multievolve to sample trees and reconstruct phylogeny -rule _phylowgs_multievolve: - input: - **rules._phylowgs_create_inputs.output - output: - trees = CFG["dirs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/trees.zip" - log: - stderr = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stderr.log", - stdout = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stdout.log" - params: - script = PHYLO + "multievolve.py", - opts = CFG["options"]["multievolve"] - conda: - CFG["conda_envs"]["phylowgs"] - threads: - CFG["threads"]["multievolve"] - resources: - **CFG["resources"]["multievolve"] - shell: - op.as_one_line(""" - python2 {params.script} - {params.opts} - -n {threads} - -O $(dirname {output.trees}) - --ssms {input.ssms} - --cnvs {input.cnvs} - 2> {log.stderr} > {log.stdout} - """) - -# Write the results -rule _phylowgs_write_results: - input: - 
 
 # Run multievolve to sample trees and reconstruct phylogeny
-rule _phylowgs_multievolve:
-    input:
-        **rules._phylowgs_create_inputs.output
-    output:
-        trees = CFG["dirs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/trees.zip"
-    log:
-        stderr = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stderr.log",
-        stdout = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stdout.log"
-    params:
-        script = PHYLO + "multievolve.py",
-        opts = CFG["options"]["multievolve"]
-    conda:
-        CFG["conda_envs"]["phylowgs"]
-    threads:
-        CFG["threads"]["multievolve"]
-    resources:
-        **CFG["resources"]["multievolve"]
-    shell:
-        op.as_one_line("""
-        python2 {params.script}
-        {params.opts}
-        -n {threads}
-        -O $(dirname {output.trees})
-        --ssms {input.ssms}
-        --cnvs {input.cnvs}
-        2> {log.stderr} > {log.stdout}
-        """)
-
-# Write the results
-rule _phylowgs_write_results:
-    input:
-        str(rules._phylowgs_multievolve.output.trees)
-    output:
-        muts = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.muts.json",
-        summ = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.summ.json",
-        mutass = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.mutass.zip"
-    params:
-        script = PHYLO + "write_results.py",
-        opts = CFG["options"]["write_results"]
-    conda:
-        CFG["conda_envs"]["phylowgs"]
-    threads:
-        CFG["threads"]["write_results"]
-    resources:
-        **CFG["resources"]["write_results"]
-    shell:
-        op.as_one_line("""
-        python2 {params.script}
-        {params.opts}
-        {wildcards.patient_id}
-        {input}
-        {output.summ}.gz
-        {output.muts}.gz
-        {output.mutass} &&
-        gunzip -f $(dirname {output.mutass})/*.gz
-        """)
-
+rule _phylowgs_multievolve:
+    input:
+        **rules._phylowgs_create_inputs.output
+    output:
+        trees = CFG["dirs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/trees.zip"
+    log:
+        stderr = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stderr.log",
+        stdout = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stdout.log"
+    params:
+        script = PHYLO + "multievolve.py",
+        opts = CFG["options"]["multievolve"]
+    conda:
+        CFG["conda_envs"]["phylowgs"]
+    threads:
+        CFG["threads"]["multievolve"]
+    resources:
+        **CFG["resources"]["multievolve"]
+    shell:
+        op.as_one_line("""
+        python2 {params.script}
+        {params.opts}
+        -n {threads}
+        -O $(dirname {output.trees})
+        --ssms {input.ssms}
+        --cnvs {input.cnvs}
+        2> {log.stderr} > {log.stdout}
+        """)
+
+# Write the results
+rule _phylowgs_write_results:
+    input:
+        trees = str(rules._phylowgs_multievolve.output.trees)
+    output:
+        muts = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.muts.json",
+        summ = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.summ.json",
+        mutass = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.mutass.zip"
+    params:
+        script = PHYLO + "write_results.py",
+        opts = CFG["options"]["write_results"]
+    conda:
+        CFG["conda_envs"]["phylowgs"]
+    threads:
+        CFG["threads"]["write_results"]
+    resources:
+        **CFG["resources"]["write_results"]
+    shell:
+        op.as_one_line("""
+        python2 {params.script}
+        {params.opts}
+        {wildcards.patient_id}
+        {input.trees}
+        {output.summ}.gz
+        {output.muts}.gz
+        {output.mutass} &&
+        gunzip -f $(dirname {output.mutass})/*.gz &&
+        rm -rf $(dirname {input.trees})/chain*
+        """)
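# Downstream steps start from the summary JSON written above. A hedged sketch
# of selecting the best tree, assuming the usual PhyloWGS summ.json layout
# ({"trees": {"<index>": {"llh": <log-likelihood>, ...}, ...}}):
import json

def best_tree_index(summ_json):
    with open(summ_json) as f:
        trees = json.load(f)["trees"]
    return max(trees, key = lambda idx: trees[idx]["llh"])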
+
+
+
 # Symlinks the final output files to the witness directory in preparation for HTTP browsing
-rule _phylowgs_output_html:
-    input:
-        mutass = str(rules._phylowgs_write_results.output.mutass)
-    output:
-        complete = CFG["dirs"]["outputs"] + "txt/{seq_type}--{genome_build}/{patient_id}.phylo_complete"
-    params:
-        witness = PHYLO + "witness/data/{patient_id}"
-    run:
-        op.absolute_symlink(os.path.split(input.mutass)[0], params.witness)
-        f = open(output.complete, "a")
-        f.write("To view PhyloWGS results, navigate to " + PHYLO + "witness\n")
-        f.write("Run the following commands: \n")
-        f.write("python2 index_data.py\n")
-        f.write("python2 -m SimpleHTTPServer\n")
-        f.write("On a local machine you will be able to view your results in a browser at http://localhost:8000\n")
-        f.write("For a remote machine, launch the following command in a terminal: \n")
-        f.write("ssh -N -L localhost:8000:localhost:8000 \n")
-        f.write("Now you can view your results in a browser at http://localhost:8000\n")
-        f.close()
+rule _phylowgs_process_output:
+    input:
+        mafs = get_input_mafs,
+        **rules._phylowgs_create_inputs.output,
+        **rules._phylowgs_write_results.output,
+    output:
+        tree_summary = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/results/tree_summary.tsv",
+        maf = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/results/merged_ssm_cluster_assignments.maf",
+        cnvs = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/results/merged_cnvs_cluster_assignments.tsv",
+        CCF = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/results/CCF.tsv",
+        plots = directory(CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/results/plots/")
+    params:
+        sample_order = order_samples,
+        maf_list = get_maf_cli,
+        drivers = CFG['inputs']['drivers'],
+        script = CFG["scripts"]["process_outputs"]
+    conda:
+        CFG["conda_envs"]["phylowgs_results"]
+    script:
+        "{params.script}"
+
+rule _phylowgs_output_plots:
+    input:
+        plots = str(rules._phylowgs_process_output.output.plots)
+    output:
+        plots = directory(CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/plots/{patient_id}")
+    run:
+        op.relative_symlink(input.plots, output.plots)
+
 # Generates the target sentinels for each run, which generate the symlinks
 rule _phylowgs_all:
-    input:
-        expand(
-            [
-                str(rules._phylowgs_output_html.output.complete),
-            ],
-            zip, # Run expand() with zip(), not product()
-            seq_type=PATIENTS["tumour_seq_type"],
-            genome_build=PATIENTS["tumour_genome_build"],
-            patient_id=PATIENTS["tumour_patient_id"]
-        )
+    input:
+        expand(
+            [
+                str(rules._phylowgs_output_plots.output.plots),
+            ],
+            zip, # Run expand() with zip(), not product()
+            seq_type=PATIENTS["tumour_seq_type"],
+            genome_build=PATIENTS["tumour_genome_build"],
+            patient_id=PATIENTS["tumour_patient_id"]
+        )
 
 ##### CLEANUP #####
 

From 8db3edede2afee1df2b1513ce9f77d3b88417259 Mon Sep 17 00:00:00 2001
From: lkhilton
Date: Mon, 12 Sep 2022 10:29:08 -0700
Subject: [PATCH 02/14] Add accessory files for PhyloWGS module

---
 envs/phylowgs/fill_battenberg.yaml            |   41 +
 envs/phylowgs/phylowgs_results.yaml           |  204 +++
 modules/phylowgs/1.0/config/default.yaml      |   28 +-
 .../phylowgs/1.0/envs/bcftools-1.10.2.yaml    |    1 +
 modules/phylowgs/1.0/envs/coreutils-8.31.yaml |    1 +
 .../phylowgs/1.0/envs/fill_battenberg.yaml    |    1 +
 .../phylowgs/1.0/envs/phylowgs_results.yaml   |    1 +
 modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml |    1 +
 .../1.0/etc/chromArmFiles/chromArm.grch37.tsv |   49 +
 .../1.0/etc/chromArmFiles/chromArm.grch38.tsv |   49 +
 .../1.0/etc/chromArmFiles/chromArm.hg19.tsv   |   49 +
 .../1.0/etc/chromArmFiles/chromArm.hg38.tsv   |   49 +
 .../1.0/etc/chromArmFiles/chromArm.hs37d5.tsv |    1 +
 modules/phylowgs/1.0/etc/noncoding.txt        |    8 +
 .../1.0/src/create_phylowgs_inputs.py         | 1356 -----------------
 modules/phylowgs/1.0/src/fill_battenberg.py   |  379 +++++
 .../1.0/src/process_phyloWGS_outputs.R        |  536 +++++++
 .../src/process_phyloWGS_outputs_updated.R    |  417 -----
 18 files changed, 1388 insertions(+), 1783 deletions(-)
 create mode 100644 envs/phylowgs/fill_battenberg.yaml
 create mode 100644 envs/phylowgs/phylowgs_results.yaml
 create mode 120000 modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml
 create mode 120000 modules/phylowgs/1.0/envs/coreutils-8.31.yaml
 create mode 120000 modules/phylowgs/1.0/envs/fill_battenberg.yaml
 create mode 120000 modules/phylowgs/1.0/envs/phylowgs_results.yaml
 create mode 120000 modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml
 create mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv
 create mode 100644
modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv create mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv create mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv create mode 120000 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv create mode 100644 modules/phylowgs/1.0/etc/noncoding.txt delete mode 100755 modules/phylowgs/1.0/src/create_phylowgs_inputs.py create mode 100644 modules/phylowgs/1.0/src/fill_battenberg.py create mode 100644 modules/phylowgs/1.0/src/process_phyloWGS_outputs.R delete mode 100644 modules/phylowgs/1.0/src/process_phyloWGS_outputs_updated.R diff --git a/envs/phylowgs/fill_battenberg.yaml b/envs/phylowgs/fill_battenberg.yaml new file mode 100644 index 00000000..52741e52 --- /dev/null +++ b/envs/phylowgs/fill_battenberg.yaml @@ -0,0 +1,41 @@ +name: fill_segments +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - ca-certificates=2020.12.5 + - certifi=2020.12.5 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libffi=3.3 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libgomp=9.3.0 + - liblapack=3.9.0 + - libopenblas=0.3.12 + - libstdcxx-ng=9.3.0 + - ncurses=6.2 + - numpy=1.19.4 + - openssl=1.1.1i + - pandas=1.2.0 + - pip=20.3.3 + - python=3.9.1 + - python-dateutil=2.8.1 + - python_abi=3.9 + - pytz=2020.5 + - readline=8.0 + - setuptools=49.6.0 + - simplejson=3.17.2 + - six=1.15.0 + - sqlite=3.34.0 + - tk=8.6.10 + - tzdata=2020e + - wheel=0.36.2 + - xz=5.2.5 + - zlib=1.2.11 +prefix: /home/dreval/miniconda3/envs/fill_segments diff --git a/envs/phylowgs/phylowgs_results.yaml b/envs/phylowgs/phylowgs_results.yaml new file mode 100644 index 00000000..f64126d7 --- /dev/null +++ b/envs/phylowgs/phylowgs_results.yaml @@ -0,0 +1,204 @@ +name: phylowgs_outputs +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.5.30 + - cairo=1.16.0 + - curl=7.77.0 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.19.1 + - ld_impl_linux-64=2.35.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.77.0 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.3 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1k + - pandoc=2.14.0.3 + - pango=1.48.6 + - pcre=8.45 + - pcre2=10.36 + - pixman=0.40.0 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.2.1 + - r-base=4.1.0 + - 
r-base64enc=0.1_3 + - r-blob=1.2.1 + - r-brio=1.1.2 + - r-broom=0.7.8 + - r-callr=3.7.0 + - r-cellranger=1.1.0 + - r-cli=3.0.0 + - r-clipr=0.7.1 + - r-colorspace=2.0_2 + - r-cpp11=0.3.1 + - r-crayon=1.4.1 + - r-curl=4.3.2 + - r-data.table=1.14.0 + - r-dbi=1.1.1 + - r-dbplyr=2.1.1 + - r-desc=1.3.0 + - r-diffobj=0.3.4 + - r-digest=0.6.27 + - r-dplyr=1.0.7 + - r-dtplyr=1.1.0 + - r-ellipsis=0.3.2 + - r-evaluate=0.14 + - r-fansi=0.4.2 + - r-farver=2.1.0 + - r-forcats=0.5.1 + - r-fs=1.5.0 + - r-gargle=1.1.0 + - r-generics=0.1.0 + - r-ggplot2=3.3.5 + - r-ggrepel=0.9.1 + - r-glue=1.4.2 + - r-googledrive=1.0.1 + - r-googlesheets4=0.3.0 + - r-gtable=0.3.0 + - r-haven=2.4.1 + - r-highr=0.9 + - r-hms=1.1.0 + - r-htmltools=0.5.1.1 + - r-httr=1.4.2 + - r-ids=1.0.1 + - r-isoband=0.2.4 + - r-jsonlite=1.7.2 + - r-knitr=1.33 + - r-labeling=0.4.2 + - r-lattice=0.20_44 + - r-lifecycle=1.0.0 + - r-lubridate=1.7.10 + - r-magrittr=2.0.1 + - r-markdown=1.1 + - r-mass=7.3_54 + - r-matrix=1.3_4 + - r-mgcv=1.8_36 + - r-mime=0.11 + - r-modelr=0.1.8 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-openssl=1.4.4 + - r-pillar=1.6.1 + - r-pkgconfig=2.0.3 + - r-pkgload=1.2.1 + - r-plyr=1.8.6 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.5.2 + - r-progress=1.2.2 + - r-ps=1.6.0 + - r-purrr=0.3.4 + - r-r6=2.5.0 + - r-rappdirs=0.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.6 + - r-readr=1.4.0 + - r-readxl=1.3.1 + - r-rematch=1.0.1 + - r-rematch2=2.1.2 + - r-reprex=2.0.0 + - r-reshape2=1.4.4 + - r-rjson=0.2.20 + - r-rlang=0.4.11 + - r-rmarkdown=2.9 + - r-rprojroot=2.0.2 + - r-rstudioapi=0.13 + - r-rvest=1.0.0 + - r-scales=1.1.1 + - r-selectr=0.4_2 + - r-stringi=1.6.2 + - r-stringr=1.4.0 + - r-sys=3.4 + - r-testthat=3.0.3 + - r-tibble=3.1.2 + - r-tidyr=1.1.3 + - r-tidyselect=1.1.1 + - r-tidyverse=1.3.1 + - r-tinytex=0.32 + - r-utf8=1.2.1 + - r-uuid=0.1_4 + - r-vctrs=0.3.8 + - r-viridislite=0.4.0 + - r-waldo=0.2.5 + - r-withr=2.4.2 + - r-xfun=0.24 + - r-xml2=1.3.2 + - r-yaml=2.2.1 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/phylowgs_outputs diff --git a/modules/phylowgs/1.0/config/default.yaml b/modules/phylowgs/1.0/config/default.yaml index bf4705b6..e8e0c1e4 100644 --- a/modules/phylowgs/1.0/config/default.yaml +++ b/modules/phylowgs/1.0/config/default.yaml @@ -4,25 +4,22 @@ lcr-modules: inputs: # Available wildcards: {tumour_id} {normal_id} {pair_status} {genome_build} {sample_id} - # IMPORTANT: PhyloWGS assumes the second sample in the VCF is the tumour. - # If this assumption is wrong, fix your VCF file. 
- vcf: "__UPDATE__" # Must be strelka- or mutect-formatted VCF file - tbi: "__UPDATE__" + maf: "__UPDATE__" cellularity: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt subclones: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt + drivers: "__UPDATE__" # newline-separated list of driver gene HUGO symbols to be included on plots scratch_subdirectories: [] options: create_inputs: opts: - "-s 5000 --verbose" - vcf_type: "__UPDATE__" # Usually either strelka or mutect_smchet + "-s 5000 --verbose --regions all" # -s controls how many variants should be sub-sampled. multievolve: "" write_results: "--include-ssm-names" switches: - # NOTE: You must include a "sex" column in the input samples table. + # NOTE: You must include a "sex" column in the input samples table, formatted with "M" and "F". # If patient sex is unknown, you can leave it empty and phyloWGS will run in "auto" mode. sex: M: "male" @@ -30,20 +27,31 @@ lcr-modules: conda_envs: phylowgs: "{MODSDIR}/envs/phylowgs.yaml" + phylowgs_results: "{MODSDIR}/envs/phylowgs_results.yaml" + fill_battenberg: "{MODSDIR}/envs/fill_battenberg.yaml" + vcf2maf: "{MODSDIR}/envs/vcf2maf-1.6.18.yaml" + bcftools: "{MODSDIR}/envs/bcftools-1.10.2.yaml" + coreutils: "{MODSDIR}/envs/coreutils-8.31.yaml" + + scripts: + fill_battenberg: "{MODSDIR}/src/fill_battenberg.py" + arm_file: "{MODSDIR}/etc/chromArmFiles/chromArm.{genome_build}.tsv" + process_outputs: "src/process_phyloWGS_outputs.R" + noncoding: "{MODSDIR}/etc/noncoding.txt" threads: create_inputs: 1 - multievolve: 24 + multievolve: 4 write_results: 1 resources: create_inputs: - mem_mb: 2000 + mem_mb: 10000 multievolve: mem_mb: 40000 evolve: 1 write_results: - mem_mb: 2000 + mem_mb: 20000 pairing_config: genome: diff --git a/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml new file mode 120000 index 00000000..72959e7b --- /dev/null +++ b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/coreutils-8.31.yaml b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml new file mode 120000 index 00000000..050452f7 --- /dev/null +++ b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml @@ -0,0 +1 @@ +../../../../envs/coreutils/coreutils-8.31.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/fill_battenberg.yaml b/modules/phylowgs/1.0/envs/fill_battenberg.yaml new file mode 120000 index 00000000..e667a8b1 --- /dev/null +++ b/modules/phylowgs/1.0/envs/fill_battenberg.yaml @@ -0,0 +1 @@ +../../../../envs/phylowgs/fill_battenberg.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/phylowgs_results.yaml b/modules/phylowgs/1.0/envs/phylowgs_results.yaml new file mode 120000 index 00000000..926ec438 --- /dev/null +++ b/modules/phylowgs/1.0/envs/phylowgs_results.yaml @@ -0,0 +1 @@ +../../../../envs/phylowgs/phylowgs_results.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml new file mode 120000 index 00000000..829077c7 --- /dev/null +++ b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml @@ -0,0 +1 @@ +../../../../envs/vcf2maf/vcf2maf-1.6.18.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv new file 
mode 100644 index 00000000..91da51a7 --- /dev/null +++ b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv @@ -0,0 +1,49 @@ +chromosome start end arm +1 10000 121500000 p +1 142600000 249250621 q +2 10000 90500000 p +2 96800000 243199373 q +3 10000 87900000 p +3 98300000 198022430 q +4 10000 48200000 p +4 52700000 191154276 q +5 10000 46100000 p +5 50700000 180915260 q +6 10000 58700000 p +6 63300000 171115067 q +7 10000 58000000 p +7 61700000 159138663 q +8 10000 43100000 p +8 48100000 146364022 q +9 10000 47300000 p +9 65900000 141213431 q +10 10000 38000000 p +10 42300000 135534747 q +11 10000 51600000 p +11 55700000 135006516 q +12 10000 33300000 p +12 38200000 133851895 q +13 10000 16000000 p +13 19500000 115169878 q +14 10000 14000000 p +14 19100000 107349540 q +15 10000 14000000 p +15 20700000 102531392 q +16 10000 34600000 p +16 47000000 90354753 q +17 10000 22200000 p +17 25800000 81195210 q +18 10000 15400000 p +18 19000000 78077248 q +19 10000 20000000 p +19 32400000 59128983 q +20 10000 25600000 p +20 29400000 63025520 q +21 10000 10000000 p +21 14300000 48129895 q +22 10000 11900000 p +22 17900000 51304566 q +X 10000 58100000 p +X 63000000 155270560 q +Y 10000 11600000 p +Y 13400000 28800000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv new file mode 100644 index 00000000..58b866e2 --- /dev/null +++ b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv @@ -0,0 +1,49 @@ +chromosome start end arm +1 10000 121700000 p +1 143200000 248956422 q +2 0 91800000 p +2 96000000 242193529 q +3 0 87800000 p +3 98600000 198295559 q +4 0 48200000 p +4 51800000 190214555 q +5 0 46100000 p +5 51400000 181538259 q +6 0 58500000 p +6 62600000 170805979 q +7 0 58100000 p +7 62100000 159345973 q +8 0 43200000 p +8 47200000 145138636 q +9 0 42200000 p +9 61500000 138394717 q +10 10000 38000000 p +10 41600000 133797422 q +11 10000 51000000 p +11 55800000 135086622 q +12 10000 33200000 p +12 37800000 133275309 q +13 10000 16000000 p +13 18900000 114364328 q +14 10000 16000000 p +14 18200000 107043718 q +15 10000 16000000 p +15 20500000 101991189 q +16 0 35300000 p +16 47000000 90338345 q +17 0 22700000 p +17 27400000 83257441 q +18 0 15400000 p +18 21500000 80373285 q +19 0 19900000 p +19 31900000 58617616 q +20 0 25700000 p +20 30400000 64444167 q +21 0 10500000 p +21 13000000 46709983 q +22 10000 14000000 p +22 17400000 50818468 q +X 0 58100000 p +X 63800000 156040895 q +Y 0 10300000 p +Y 10600000 26600000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv new file mode 100644 index 00000000..a3c8be28 --- /dev/null +++ b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv @@ -0,0 +1,49 @@ +chromosome start end arm +chr1 10000 121500000 p +chr1 142600000 249250621 q +chr2 10000 90500000 p +chr2 96800000 243199373 q +chr3 10000 87900000 p +chr3 98300000 198022430 q +chr4 10000 48200000 p +chr4 52700000 191154276 q +chr5 10000 46100000 p +chr5 50700000 180915260 q +chr6 10000 58700000 p +chr6 63300000 171115067 q +chr7 10000 58000000 p +chr7 61700000 159138663 q +chr8 10000 43100000 p +chr8 48100000 146364022 q +chr9 10000 47300000 p +chr9 65900000 141213431 q +chr10 10000 38000000 p +chr10 42300000 135534747 q +chr11 10000 51600000 p +chr11 55700000 135006516 q +chr12 10000 33300000 p +chr12 38200000 133851895 q +chr13 10000 16000000 p +chr13 19500000 115169878 q +chr14 10000 14000000 p +chr14 19100000 107349540 q 
+chr15 10000 14000000 p +chr15 20700000 102531392 q +chr16 10000 34600000 p +chr16 47000000 90354753 q +chr17 10000 22200000 p +chr17 25800000 81195210 q +chr18 10000 15400000 p +chr18 19000000 78077248 q +chr19 10000 20000000 p +chr19 32400000 59128983 q +chr20 10000 25600000 p +chr20 29400000 63025520 q +chr21 10000 10000000 p +chr21 14300000 48129895 q +chr22 10000 11900000 p +chr22 17900000 51304566 q +chrX 10000 58100000 p +chrX 63000000 155270560 q +chrY 10000 11600000 p +chrY 13400000 28800000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv new file mode 100644 index 00000000..4b5d7b6a --- /dev/null +++ b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv @@ -0,0 +1,49 @@ +chromosome start end arm +chr1 10000 121700000 p +chr1 143200000 248956422 q +chr2 0 91800000 p +chr2 96000000 242193529 q +chr3 0 87800000 p +chr3 98600000 198295559 q +chr4 0 48200000 p +chr4 51800000 190214555 q +chr5 0 46100000 p +chr5 51400000 181538259 q +chr6 0 58500000 p +chr6 62600000 170805979 q +chr7 0 58100000 p +chr7 62100000 159345973 q +chr8 0 43200000 p +chr8 47200000 145138636 q +chr9 0 42200000 p +chr9 61500000 138394717 q +chr10 10000 38000000 p +chr10 41600000 133797422 q +chr11 10000 51000000 p +chr11 55800000 135086622 q +chr12 10000 33200000 p +chr12 37800000 133275309 q +chr13 10000 16000000 p +chr13 18900000 114364328 q +chr14 10000 16000000 p +chr14 18200000 107043718 q +chr15 10000 16000000 p +chr15 20500000 101991189 q +chr16 0 35300000 p +chr16 47000000 90338345 q +chr17 0 22700000 p +chr17 27400000 83257441 q +chr18 0 15400000 p +chr18 21500000 80373285 q +chr19 0 19900000 p +chr19 31900000 58617616 q +chr20 0 25700000 p +chr20 30400000 64444167 q +chr21 0 10500000 p +chr21 13000000 46709983 q +chr22 10000 14000000 p +chr22 17400000 50818468 q +chrX 0 58100000 p +chrX 63800000 156040895 q +chrY 0 10300000 p +chrY 10600000 26600000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv new file mode 120000 index 00000000..c8477855 --- /dev/null +++ b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv @@ -0,0 +1 @@ +chromArm.grch37.tsv \ No newline at end of file diff --git a/modules/phylowgs/1.0/etc/noncoding.txt b/modules/phylowgs/1.0/etc/noncoding.txt new file mode 100644 index 00000000..258f27f0 --- /dev/null +++ b/modules/phylowgs/1.0/etc/noncoding.txt @@ -0,0 +1,8 @@ +Hugo_Symbol +Silent +RNA +IGR +Intron +5'Flank +3'Flank +5'UTR diff --git a/modules/phylowgs/1.0/src/create_phylowgs_inputs.py b/modules/phylowgs/1.0/src/create_phylowgs_inputs.py deleted file mode 100755 index f9ee1474..00000000 --- a/modules/phylowgs/1.0/src/create_phylowgs_inputs.py +++ /dev/null @@ -1,1356 +0,0 @@ -#!/usr/bin/env python2 -from __future__ import print_function - -# Requires PyVCF. To install: pip2 install pyvcf -import vcf -import argparse -import csv -from collections import defaultdict, namedtuple, OrderedDict -import random -import sys -import numpy as np -import numpy.ma as ma -import json -from scipy.stats.mstats import gmean - -VariantId = namedtuple('VariantId', ['CHROM', 'POS']) - -class ReadCountsUnavailableError(Exception): - pass - -class VariantParser(object): - def __init__(self): - # Child classes must give the following variables sensible values in - # constructor so that list_variants() works subsequently. 
- self._cnvs = None - self._vcf_filename = None - - def list_variants(self): - variants = self._filter(self._vcf_filename) - variants_and_reads = [] - for variant in variants: - try: - ref_reads, total_reads = self._calc_read_counts(variant) - except ReadCountsUnavailableError as exc: - log('Read counts unavailable for %s_%s' % (variant.CHROM, variant.POS)) - continue - variants_and_reads.append((variant, ref_reads, total_reads)) - return variants_and_reads - - def _calc_read_counts(self, variant): - raise Exception('Not implemented -- use child class') - - def _parse_vcf(self, vcf_filename): - vcfr = vcf.Reader(filename=vcf_filename) - records = [] - for variant in vcfr: - variant.CHROM = variant.CHROM.upper() - # Some VCF dialects prepend "chr", some don't. Remove the prefix to - # standardize. - if variant.CHROM.startswith('CHR'): - variant.CHROM = variant.CHROM[3:] - records.append(variant) - return records - - def _does_variant_pass_filters(self, variant): - if variant.FILTER is None: - return True - if len(variant.FILTER) > 0: - # Variant failed one or more filters. - return False - return True - - def _filter(self, vcf_filename): - variants = [] - - all_variants = self._parse_vcf(vcf_filename) - - for variant in all_variants: - if not is_good_chrom(variant.CHROM): - continue - if not self._does_variant_pass_filters(variant): - continue - variants.append(variant) - return variants - - def _get_tumor_index(self, variant, tumor_sample=None): - """Find the index of the tumor sample. - - Currently hardcodes tumour sample as the last column if name not specified. - Might not always be true - """ - if self._tumor_sample: - tumor_is = [i for i, s in enumerate(variant.samples) if s.sample == tumor_sample] - assert len(tumor_is) == 1, "Did not find tumor name %s in samples" % tumor_sample - return tumor_is[0] - else: - # Don't make this -1, as some code assumes it will be >= 0. - return len(variant.samples) - 1 - -class SangerParser(VariantParser): - ''' - Works with PCAWG variant calls from the Sanger. - ''' - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _find_ref_and_variant_nt(self, variant): - assert len(variant.REF) == len(variant.ALT) == 1 - return (str(variant.REF[0]), str(variant.ALT[0])) - - def _calc_read_counts(self, variant): - normal = variant.genotype('NORMAL') - tumor = variant.genotype('TUMOUR') - - reference_nt, variant_nt = self._find_ref_and_variant_nt(variant) - tumor_reads = { - 'forward': { - 'A': int(tumor['FAZ']), - 'C': int(tumor['FCZ']), - 'G': int(tumor['FGZ']), - 'T': int(tumor['FTZ']), - }, - 'reverse': { - 'A': int(tumor['RAZ']), - 'C': int(tumor['RCZ']), - 'G': int(tumor['RGZ']), - 'T': int(tumor['RTZ']), - }, - } - - ref_reads = tumor_reads['forward'][reference_nt] + tumor_reads['reverse'][reference_nt] - # For now, variant reads are defined as only the non-reference nucleotide in - # the inferred tumor SNP. We ignore reads of a third or fourth base. 
- variant_reads = tumor_reads['forward'][variant_nt] + tumor_reads['reverse'][variant_nt] - total_reads = ref_reads + variant_reads - - return (ref_reads, total_reads) - -class PcawgConsensusParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _find_ref_and_variant_nt(self, variant): - assert len(variant.REF) == len(variant.ALT) == 1 - return (str(variant.REF[0]), str(variant.ALT[0])) - - def _calc_read_counts(self, variant): - if not ('t_alt_count' in variant.INFO and 't_ref_count' in variant.INFO): - raise ReadCountsUnavailableError() - assert len(variant.INFO['t_alt_count']) == len(variant.INFO['t_ref_count']) == 1 - - alt_reads = int(variant.INFO['t_alt_count'][0]) - ref_reads = int(variant.INFO['t_ref_count'][0]) - total_reads = alt_reads + ref_reads - # Some variants havezero alt and ref reads. - if total_reads == 0: - raise ReadCountsUnavailableError() - return (ref_reads, total_reads) - -class MuseParser(VariantParser): - def __init__(self, vcf_filename, tier=0, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tier = tier - self._tumor_sample = tumor_sample - - def _get_normal_genotype(self, variant): - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - assert tumor_i in (0, 1), 'Tumor index %s is not 0 or 1' % tumor_i - normal_i = 1 - tumor_i - return set([int(t) for t in variant.samples[normal_i]['GT'].split('/')]) - - def _calc_read_counts(self, variant): - normal_gt = self._get_normal_genotype(variant) - assert len(normal_gt) == 1 - normal_gt = normal_gt.pop() - - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - total_reads = int(variant.samples[tumor_i]['DP']) - ref_reads = int(variant.samples[tumor_i]['AD'][normal_gt]) - - return (ref_reads, total_reads) - - def _does_variant_pass_filters(self, variant): - # Ignore heterozygous normal variants. - if len(self._get_normal_genotype(variant)) != 1: - return False - if variant.FILTER is None or len(variant.FILTER) == 0: - return True - if int(variant.FILTER[0][-1]) <= self._tier: - # Variant failed one or more filters, but we still accept it. 
- return True - return False - -class StrelkaParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _does_variant_pass_filters(self, variant): - # Strelka outputs two files one for SNPs, the other for InDels - # For now only deal with SNP file from Strelka - if variant.is_snp: - if variant.FILTER is None or len(variant.FILTER) == 0: - return True - return False - - def _calc_read_counts(self, variant): - alt = variant.ALT[0] - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - total_reads = int(variant.samples[tumor_i]['DP']) - - if alt is None: - total_reads = 0 - variant_reads = 0 - else: - variant_reads = int(getattr(variant.samples[tumor_i].data, str(alt)+'U')[0]) - - ref_reads = total_reads - variant_reads - return (ref_reads, total_reads) - -class SomSnipParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _calc_read_counts(self, variant): - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - highqual_reads = (variant.samples[tumor_i]['DP4']) - assert len(highqual_reads) == 4 - - ref_reads = int(highqual_reads[0]) + int(highqual_reads[1]) - variant_reads = int(highqual_reads[2]) + int(highqual_reads[3]) - - return (ref_reads, ref_reads + variant_reads) - -class MutectTcgaParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _calc_read_counts(self, variant): - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - # TD: Tumor allelic depths for the ref and alt alleles in the order listed - ref_reads, variant_reads = variant.samples[tumor_i]['TD'] - total_reads = ref_reads + variant_reads - return (ref_reads, total_reads) - -class MutectPcawgParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _calc_read_counts(self, variant): - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - ref_reads = int(variant.samples[tumor_i].data.ref_count) - variant_reads = int(variant.samples[tumor_i].data.alt_count) - total_reads = ref_reads + variant_reads - - return (ref_reads, total_reads) - -class MutectSmchetParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _calc_read_counts(self, variant): - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - ref_reads = int(variant.samples[tumor_i]['AD'][0]) - variant_reads = int(variant.samples[tumor_i]['AD'][1]) - total_reads = ref_reads + variant_reads - - return (ref_reads, total_reads) - -class VarDictParser(MutectSmchetParser): - """Support VarDict somatic variant caller. - - https://github.com/AstraZeneca-NGS/VarDictJava - https://github.com/AstraZeneca-NGS/VarDict - - Uses the same read-extraction logic as MuTect (SMC-Het). - """ - pass - -class DKFZParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _calc_read_counts(self, variant): - # This doesn't handle multisample correctly, as I don't know how to get the - # DP4 attribute on multiple DKFZ samples currently. 
- for_ref_reads = int(variant.INFO['DP4'][0]) - back_ref_reads = int(variant.INFO['DP4'][1]) - for_variant_reads = int(variant.INFO['DP4'][2]) - back_variant_reads = int(variant.INFO['DP4'][3]) - ref_reads = for_ref_reads + back_ref_reads - var_reads = for_variant_reads + back_variant_reads - total_reads = ref_reads + var_reads - - return (ref_reads, total_reads) - -class CnvFormatter(object): - def __init__(self, read_depth, sampidxs, hetsnp_rate): - self._read_depth = read_depth - self._sampidxs = sampidxs - self._hetsnp_rate = hetsnp_rate - - def _find_overlapping_variants(self, chrom, cnv, variants): - overlapping = [] - - start = cnv['start'] - end = cnv['end'] - for variant in variants: - if chrom.upper() == variant['chrom'].upper(): - if start <= variant['pos'] <= end: - overlapping.append(variant['ssm_id']) - return overlapping - - def _calc_ref_reads(self, cellular_prev, total_reads): - ref_reads = np.zeros(len(self._sampidxs), dtype=np.int64) - for sampidx in self._sampidxs: - vaf = cellular_prev[sampidx] / 2 - ref_reads[sampidx] = int((1 - vaf) * total_reads[sampidx]) - return ref_reads - - def _calc_total_reads(self, locus_start, locus_end): - def _calc(samp_read_depth): - # We estimate 7 heterozygous SNPs per 10 kb, which goes as input to CNA - # algorithms. Thus, we determine how many SNPs are equivalent to a region - # of the given size, then weight accordingly. - assert locus_start < locus_end - # Figure out approximately equivalent number of SSMs to this region. - equiv_ssms = (locus_end - locus_start) * self._hetsnp_rate - return int(np.round(equiv_ssms * samp_read_depth)) - - D = [_calc(self._read_depth[sampidx]) for sampidx in self._sampidxs] - return self._cap_cnv_D(D) - - def _format_overlapping_variants(self, variants, maj_cn, min_cn): - assert len(set(maj_cn)) == len(set(min_cn)) == 1 - variants = [(ssm_id, str(min_cn[0]), str(maj_cn[0])) for ssm_id in variants] - return variants - - def _cap_cnv_D(self, D): - # Average tumour has ~3k SSMs, so say that a CNA region should be - # equivalent to no more than this. - avg_ssms_in_tumour = 3000 - D_max = np.round(avg_ssms_in_tumour * self._read_depth).astype(np.int) - D_min = 1 - - D = np.minimum(D_max, D) - D = np.maximum(D_min, D) - return D - - def _format_cnvs(self, cnvs, variants): - log('Estimated read depth: %s' % self._read_depth) - - for chrom, chrom_cnvs in cnvs.items(): - for cnv in chrom_cnvs: - overlapping_variants = self._find_overlapping_variants(chrom, cnv, variants) - total_reads = self._calc_total_reads(cnv['start'], cnv['end']) - ref_reads = self._calc_ref_reads(cnv['cell_prev'], total_reads) - yield { - 'chrom': chrom, - 'start': cnv['start'], - 'end': cnv['end'], - 'major_cn': cnv['major_cn'], - 'minor_cn': cnv['minor_cn'], - 'cellular_prevalence': cnv['cell_prev'], - 'ref_reads': ref_reads, - 'total_reads': total_reads, - 'overlapping_variants': self._format_overlapping_variants(overlapping_variants, cnv['major_cn'], cnv['minor_cn']) - } - - def _merge_variants(self, cnv1, cnv2): - cnv1_variant_names = set([v[0] for v in cnv1['overlapping_variants']]) - for variant in cnv2['overlapping_variants']: - variant_name = variant[0] - if variant_name not in cnv1_variant_names: - cnv1['overlapping_variants'].append(variant) - else: - # If variant already in cnv1's list, ignore it. This should only occur - # if two subclonal CNVs have close to 0.5 frequency each. In this case, - # we lose information about major/minor status of the cnv2 relative to - # its SSMs. 
- log('%s already in %s' % (variant, cnv1['cnv_id'])) - - # CNVs with similar a/d values should not be free to move around the - # phylogeny independently, and so we merge them into a single entity. We may - # do the same with SNVs bearing similar frequencies later on. - def format_and_merge_cnvs(self, cnvs, variants, cellularity): - formatted = list(self._format_cnvs(cnvs, variants)) - formatted.sort(key = lambda f: f['cellular_prevalence'][0], reverse = True) - if len(formatted) == 0: - return [] - - for cnv in formatted: - physical_cnvs = OrderedDict() - for K in ('chrom', 'start', 'end', 'major_cn', 'minor_cn'): - physical_cnvs[K] = cnv[K] - - assert len(set(physical_cnvs['major_cn'])) == len(set(physical_cnvs['major_cn'])) == 1 - physical_cnvs['major_cn'] = physical_cnvs['major_cn'][0] - physical_cnvs['minor_cn'] = physical_cnvs['minor_cn'][0] - - physical_cnvs['cell_prev'] = '|'.join([str(C) for C in cnv['cellular_prevalence']]) - cnv['physical_cnvs'] = ','.join(['%s=%s' % (K, physical_cnvs[K]) for K in physical_cnvs.keys()]) - - merged, formatted = formatted[:1], formatted[1:] - merged[0]['cnv_id'] = 'c0' - counter = 1 - - for current in formatted: - last = merged[-1] - assert np.all(current['cellular_prevalence'] <= cellularity) and np.all(last['cellular_prevalence'] <= cellularity) - - # Only merge CNVs if they're clonal. If they're subclonal, leave them - # free to move around the tree. - if np.array_equal(current['cellular_prevalence'], last['cellular_prevalence']) \ - and np.array_equal(last['cellular_prevalence'], cellularity): - # Merge the CNVs. - log('Merging %s_%s and %s_%s' % (current['chrom'], current['start'], last['chrom'], last['start'])) - last['total_reads'] = self._cap_cnv_D(current['total_reads'] + last['total_reads']) - last['ref_reads'] = self._calc_ref_reads(last['cellular_prevalence'], last['total_reads']) - last['physical_cnvs'] += ';' + current['physical_cnvs'] - self._merge_variants(last, current) - else: - # Do not merge the CNVs. - current['cnv_id'] = 'c%s' % counter - merged.append(current) - counter += 1 - - return merged - -class VariantFormatter(object): - def __init__(self): - self._counter = 0 - - def _split_types(self, genotype): - types = [int(e) for e in genotype.split('/')] - if len(types) != 2: - raise Exception('Not diploid: %s' % types) - return types - - def _calc_ref_freq(self, ref_genotype, error_rate): - types = self._split_types(ref_genotype) - num_ref = len([t for t in types if t == 0]) - freq = (num_ref / 2) - error_rate - if freq < 0: - freq = 0.0 - if freq > 1: - raise Exception('Nonsensical frequency: %s' % freq) - return freq - - def format_variants(self, variants, ref_read_counts, total_read_counts, error_rate, sex): - for variant_idx, variant in enumerate(variants): - ssm_id = 's%s' % self._counter - if hasattr(variant, 'ID') and variant.ID is not None: - # This field will be defined by PyVCF, but not by our VariantId named - # tuple that we have switched to, so this code will never actually run. - # TODO: fix that. - variant_name = variant.ID - else: - variant_name = '%s_%s' % (variant.CHROM, variant.POS) - - # TODO: switch back to using calc_ref_freq() when we no longer want mu_r - # and mu_v fixed. - # This is mu_r in PhyloWGS. - expected_ref_freq = 1 - error_rate - if variant.CHROM in ('Y', 'M') or (variant.CHROM == 'X' and sex == 'male'): - # Haploid, so should only see non-variants when sequencing error - # occurred. 
Note that chrY and chrM are always haploid; chrX is haploid - # only in men, so script must know sex of patient to choose correct - # value. Currently, I just assume that all data comes from men. - # - # This is mu_v in PhyloWGS. - expected_var_freq = error_rate - else: - # Diploid, so should see variants in (0.5 - error_rate) proportion of - # reads. - # - # This is mu_v in PhyloWGS. - expected_var_freq = 0.5 - error_rate - - yield { - 'ssm_id': ssm_id, - 'chrom': variant.CHROM, - 'pos': variant.POS, - 'variant_name': variant_name, - 'ref_reads': list(ref_read_counts[variant_idx,:]), - 'total_reads': list(total_read_counts[variant_idx,:]), - 'expected_ref_freq': expected_ref_freq, - 'expected_var_freq': expected_var_freq, - } - self._counter += 1 - -def restricted_float(x): - x = float(x) - if x < 0.0 or x > 1.0: - raise argparse.ArgumentTypeError('%r not in range [0.0, 1.0]' % x) - return x - -def chrom_key(chrom): - if chrom.isdigit(): - return int(chrom) - elif chrom == 'X': - return 100 - elif chrom == 'Y': - return 101 - else: - raise Exception('Unknown chrom: %s' % chrom) - -def variant_key(var): - chrom = chrom_key(var.CHROM) - return (chrom, var.POS) - -class Segmenter(object): - def _organize_cnvs(self, cnv_set): - organized = defaultdict(list) - - for sampidx, cnvs in enumerate(cnv_set): - for chrom, chrom_cnvs in cnvs.items(): - for cnv in chrom_cnvs: - organized[chrom].append({ - 'sample': sampidx, - 'start': cnv['start'], - 'end': cnv['end'], - 'major_cn': cnv['major_cn'], - 'minor_cn': cnv['minor_cn'], - 'cell_prev': cnv['cellular_prevalence'] - }) - - for chrom, cnvs in organized.items(): - # Intervals may not be sorted in input file. - cnvs.sort(key = lambda c: c['start']) - - return organized - - def _create_intervals(self, cnv_set): - # intervals[chrom][(major, minor)] - intervals = defaultdict(list) - min_size_for_inclusion = 1 - - for chrom, cnvs in cnv_set.items(): - for cnv in cnvs: - # We sorted above to place start coordinates after end coordinates. But - # if a CNV was listed with *the same* start and end position (meaning a - # zero-length record, assuming intervals that are left-closed but - # right-open), we will encounter the end for that record before its - # start. As such, the "open_samples.remove()" call below will fail, as - # the given intervals will not have been opened when we encounter its - # end. - # - # Note the above assumes a half-open interpretation of intervals. I - # don't think I implemented this -- if I recall, the code dealing with - # CNVs (such as determining SSM overlap) assumes fully-closed intervals - # (i.e., it doesn't check if cnv.start <= ssm.locus <= (cnv.end + 1)). - # Normally this doesn't matter, given the low resolution of CNV calls - # -- we should never encounter such small intervals. But a pathological - # case in which CNV inputs had the same start & end coordinates for - # some intervals revealed that the code crashes on this input. We - # should provide a more informative error in such cases, which the - # following assertion does. - assert cnv['start'] < cnv['end'], ('In CNV %s, start position occurs at or after the end position' % cnv) - - start_pos = [(c['start'], 'start', (c['sample'], c['cell_prev'], c['major_cn'], c['minor_cn'])) for c in cnvs] - end_pos = [(c['end'], 'end', (c['sample'], c['cell_prev'], c['major_cn'], c['minor_cn'])) for c in cnvs] - - # True > False, so this sorting will place start positions after end - # positions if both have same coordinate. 
- positions = sorted(start_pos + end_pos, key = lambda e: (e[0], e[1] == 'start')) - assert len(positions) >= 2, 'Fewer than two positions in %s' % positions - - # prev_pos is updated each time we move to a new coordinate on the - # chromosome. Multiple start or end points may be associated with any - # given coordinate. - prev_pos = None - open_samples = [] - idx = 0 - - while idx < len(positions): - points_at_locus = [positions[idx]] - locus = points_at_locus[0][0] - - # Gather all interval breakpoints at this locus. - while True: - assert positions[idx][0] >= locus - idx += 1 - if idx == len(positions) or positions[idx][0] > locus: - break - points_at_locus.append(positions[idx]) - - if prev_pos is None: - assert len(open_samples) == 0 - - if len(open_samples) > 0: - # If some samples are already open from previous loci (such that - # last_pos will not be None), add this interval. - assert locus > prev_pos - interval = (prev_pos, locus) - if interval[1] - interval[0] > min_size_for_inclusion: - intervals[chrom].append((interval[0], interval[1], sorted(open_samples))) - else: - # All points should be start points. - assert set([i[1] for i in points_at_locus]) == set(['start']) - - prev_pos = locus - - # Update open_samples in accordance with whether each breakpoint at - # this locus starts or ends an interval. - for pos, pt_type, (sampidx, cell_prev, major_cn, minor_cn) in points_at_locus: - if pt_type == 'start': - log('Adding ', (pos, pt_type, sampidx, cell_prev, major_cn, minor_cn)) - open_samples.append((sampidx, cell_prev, major_cn, minor_cn)) - elif pt_type == 'end': - log('Removing ', (pos, pt_type, sampidx, cell_prev, major_cn, minor_cn)) - open_samples.remove((sampidx, cell_prev, major_cn, minor_cn)) - else: - raise Exception('Unknown point type: %s' % pt_type) - - assert len(open_samples) == 0 - - return intervals - - def _merge_adjacent(self, cncalls, allowed_gap = 0): - cncalls.sort(key = lambda c: (Util.chrom_key(c['chrom']), c['start'])) - merged = [] - idx = 0 - while idx < len(cncalls): - adjacent = [cncalls[idx]] - idx += 1 - - while idx < len(cncalls) and \ - cncalls[idx]['chrom'] == adjacent[-1]['chrom'] and \ - cncalls[idx]['major_cn'] == adjacent[-1]['major_cn'] and \ - cncalls[idx]['minor_cn'] == adjacent[-1]['minor_cn'] and \ - 0 <= cncalls[idx]['start'] - adjacent[-1]['end'] <= allowed_gap: - adjacent.append(cncalls[idx]) - idx += 1 - - if len(adjacent) > 1: - log('Merging ', adjacent) - copy = dict(adjacent[0]) - copy['end'] = adjacent[-1]['end'] - merged.append(copy) - else: - merged.append(adjacent[0]) - - return merged - - def segment(self, cn_calls): - # Merge adjacent CNVs here rather than when data loaded, as what can be - # merged will be determined by what tetraploidy correction, if any, is - # applied to the data. 
- #for sampidx, cnvs in enumerate(cn_calls): - #cn_calls[sampidx] = self._merge_adjacent(cnvs) - organized = self._organize_cnvs(cn_calls) - return self._create_intervals(organized) - -class MultisampleCnvCombiner(object): - def __init__(self, cn_regions, cellularity, sex): - self.sampidxs = set(range(len(cn_regions))) - segments = Segmenter().segment(cn_regions) - self._cnvs = self._reformat_segments_as_cnvs(segments) - self._cellularity = cellularity - self._sex = sex - - def _reformat_segments_as_cnvs(self, segments): - reformatted = defaultdict(list) - _retrieve_val = lambda idx: np.array(zip(*open_samples)[idx]) - - for chrom, chrom_cnvs in segments.items(): - for start, end, open_samples in chrom_cnvs: - sampidx = _retrieve_val(0) - cell_prev = _retrieve_val(1) - major_cn = _retrieve_val(2) - minor_cn = _retrieve_val(3) - cnv = { - 'start': start, - 'end': end, - 'cell_prev': cell_prev, - 'major_cn': major_cn, - 'minor_cn': minor_cn, - 'sampidx': sampidx, - } - reformatted[chrom].append(cnv) - - return reformatted - - def _ensure_no_overlap(self, cnvs): - for chrom, chrom_cnvs in cnvs.items(): - for idx in range(len(chrom_cnvs) - 1): - current, next = chrom_cnvs[idx], chrom_cnvs[idx + 1] - assert current['start'] < current['end'] <= next['start'] < next['end'] - - def _is_region_normal_cn(self, chrom, major, minor): - return self._is_multisample_region_normal_cn(chrom, [major], [minor]) - - def _is_multisample_region_normal_cn(self, chrom, major, minor): - normal_major = set([1]) - if self._sex == 'male' and chrom in (('X', 'Y')): - normal_minor = set([0]) - else: - normal_minor = set([1]) - return set(major) == normal_major and set(minor) == normal_minor - - def _get_abnormal_state_for_all_samples(self, chrom, cnv): - '''On a per-sample basis, record which samples report the CNA is abnormal - CN, and which report it is normal CN. If multiple different abnormal states - occur in different samples, return None.''' - # All samples must have at least one record for this region, or don't - # include it. - if set(cnv['sampidx']) != self.sampidxs: - return None - - abnormal_state = None - filtered = [] - - for sampidx, cell_prev, major, minor in zip(cnv['sampidx'], cnv['cell_prev'], cnv['major_cn'], cnv['minor_cn']): - # Region may be (clonal or subclonal) normal in a sample, so ignore such records. - if self._is_region_normal_cn(chrom, major, minor): - continue - - # Either we haven't observed an abnormal CN state in this region before, - # or the observed abnormal state matches what we've already seen. - if abnormal_state is None or abnormal_state == (major, minor): - abnormal_state = (major, minor) - filtered.append({'sampidx': sampidx, 'cell_prev': cell_prev, 'major_cn': major, 'minor_cn': minor}) - continue - # The abnormal state (i.e., major & minor alleles) is *different* from - # what we've seen before. The PWGS model doesn't currently account for - # such cases, so ignore the region. - else: - return None - - # None of the observed records were abnormal -- i.e., all samples report - # the region is normal. Reject the region. - if abnormal_state is None: - return None - - retained_sampidxs = [F['sampidx'] for F in filtered] - # Sanity check: when we originally parsed the CNVs, the samples should have - # been added in order, and that ought not to have changed. - assert retained_sampidxs == sorted(retained_sampidxs) - # Sanity check: we should have no duplicate samples. 
While a given sample - # may report any number of records for a region, above we discarded normal - # regions, and ensured that only one abnormal state exists in all samples. - # Thus, we should have no more than one record per sample for this region. - assert len(retained_sampidxs) == len(set(retained_sampidxs)) - - # Add a record for all samples that reported this region as clonal normal. - cell_prev_when_absent = 0 - for missing_sampidx in self.sampidxs - set(retained_sampidxs): - filtered.append({ - 'sampidx': missing_sampidx, - 'cell_prev': cell_prev_when_absent, - 'major_cn': abnormal_state[0], - 'minor_cn': abnormal_state[1] - }) - # Sort by sampidx. - filtered.sort(key = lambda F: F['sampidx']) - # Ensure all samples have one record. - assert len(filtered) == len(self.sampidxs) - - return filtered - - def load_single_abnormal_state_cnvs(self): - ''' - Return all regions that possess at most one abnormal state across samples. - E.g., given three samples, S_1 and S_3 report the region as (2, 1) (with - potentially different cellular prevalences), while S_2 lists it as clonal - (1, 1). In such an instance, the record for S_2 will *not* indicate the - region is normal. Instead, the S_2 record will show a state of (2, 1) with - a cellular prevalence of zero. This is done so that we can calculate - sensible `a` and `d` values for cnv_data.txt. - ''' - # In Battenberg, either one region is normal and the other abnormal, - # or both are abnormal. - # In TITAN, only one abnormal region will be listed, without a - # corresponding normal region. - abnormal_cnvs = defaultdict(list) - - for chrom, chrom_cnvs in self._cnvs.items(): - if not is_good_chrom(chrom): - continue - for cnv in chrom_cnvs: - states_for_all_samples = self._get_abnormal_state_for_all_samples(chrom, cnv) - if states_for_all_samples is None: - continue - - combined_states = { K: np.array([S[K] for S in states_for_all_samples]) for K in states_for_all_samples[0].keys() } - cnv.update(combined_states) - abnormal_cnvs[chrom].append(cnv) - abnormal_cnvs[chrom].sort(key = lambda C: C['start']) - - self._ensure_no_overlap(abnormal_cnvs) - return abnormal_cnvs - - def load_normal_cnvs(self): - ''' - Return all regions that are clonal normal across all samples. - ''' - normal_cnvs = defaultdict(list) - - for chrom, chrom_cnvs in self._cnvs.items(): - if not is_good_chrom(chrom): - continue - for cnv in chrom_cnvs: - if not self._is_multisample_region_normal_cn(chrom, cnv['major_cn'], cnv['minor_cn']): - continue - if not set(cnv['sampidx']) == self.sampidxs: - continue - if not np.array_equal(cnv['cell_prev'], self._cellularity): - # The region must be clonal normal to be retained. This check - # shouldn't be necessary, as we've already ensured all calls have - # major = minor = 1, but we perform it just to be thorough. - continue - normal_cnvs[chrom].append(cnv) - normal_cnvs[chrom].sort(key = lambda C: C['start']) - - self._ensure_no_overlap(normal_cnvs) - return normal_cnvs - - def load_cnvs(self): - ''' - Return both normal and abnormal regions. 
- ''' - combined = defaultdict(list) - - normal_cnvs = self.load_normal_cnvs() - abnormal_cnvs = self.load_single_abnormal_state_cnvs() - for chrom in set(normal_cnvs.keys()) | set(abnormal_cnvs.keys()): - combined[chrom] = normal_cnvs[chrom] + abnormal_cnvs[chrom] - combined[chrom].sort(key = lambda C: C['start']) - self._ensure_no_overlap(combined) - - return combined - -class VariantAndCnvGroup(object): - def __init__(self, hetsnp_rate): - self._multisamp_cnv = None - self._cellularity = None - self._hetsnp_rate = hetsnp_rate - - def add_variants(self, variants, ref_read_counts, total_read_counts): - self._variants = variants - # Ensure no duoplicates. - assert len(variants) == len(set(variants)) - # Note that self._variant_idxs will change as we filter out variants, - # reflecting only the remaining valid variants. self._variants, however, - # will not change. - self._variant_idxs = list(range(len(variants))) - self._ref_read_counts = ref_read_counts - self._total_read_counts = total_read_counts - # Estimate read depth before any filtering of variants is performed, in - # case no SSMs remain afterward. - self._estimated_read_depth = self._estimate_read_depth() - - def _find_cellularity(self, cnvs): - max_cellular_prevs = np.zeros(len(cnvs)) - - for sampidx, sample_cnvs in enumerate(cnvs): - for chrom_regions in sample_cnvs.values(): - for cnr in chrom_regions: - if cnr['cellular_prevalence'] > max_cellular_prevs[sampidx]: - max_cellular_prevs[sampidx] = cnr['cellular_prevalence'] - - return max_cellular_prevs - - def add_cnvs(self, cn_regions, sex): - self._cellularity = self._find_cellularity(cn_regions) - self._multisamp_cnv = MultisampleCnvCombiner(cn_regions, self._cellularity, sex) - self._sampidxs = self._multisamp_cnv.sampidxs - - def has_cnvs(self): - return self._multisamp_cnv is not None - - def _filter_variants_outside_regions(self, regions, before_label, after_label): - def _is_pos_in_regions(chrom, pos): - for cnv in regions[chrom]: - if cnv['start'] <= pos <= cnv['end']: - return True - return False - - filtered = [] - - for vidx in self._variant_idxs: - variant = self._variants[vidx] - if _is_pos_in_regions(variant.CHROM, variant.POS): - filtered.append(vidx) - - self._print_variant_differences( - [self._variants[idx] for idx in self._variant_idxs], - [self._variants[idx] for idx in filtered], - before_label, - after_label - ) - self._variant_idxs = filtered - - def _print_variant_differences(self, before, after, before_label, after_label): - before = set(before) - after = set(after) - log('%s=%s %s=%s delta=%s' % (before_label, len(before), after_label, len(after), len(before) - len(after))) - - assert after.issubset(before) - removed = list(before - after) - removed.sort(key = variant_key) - - def _print_region(var): - var_name = '%s_%s' % (var.CHROM, var.POS) - region_type = None - containing_cnv = None - - for cnv in self._multisamp_cnv.load_normal_cnvs()[var.CHROM]: - if cnv['start'] <= var.POS <= cnv['end']: - region_type = 'normal' - containing_cnv = cnv - break - for cnv in self._multisamp_cnv.load_single_abnormal_state_cnvs()[var.CHROM]: - if cnv['start'] <= var.POS <= cnv['end']: - assert region_type is None and containing_cnv is None - region_type = 'abnormal' - containing_cnv = cnv - break - - if containing_cnv is not None: - log('%s\t[in %s-CN region chr%s(%s, %s)]' % (var_name, region_type, var.CHROM, containing_cnv['start'], containing_cnv['end'])) - else: - log('%s\t[outside all regions]' % var_name) - - for var in removed: - _print_region(var) - - def 
retain_only_variants_in_normal_cn_regions(self): - if not self.has_cnvs(): - raise Exception('CN regions not yet provided') - - normal_cn = self._multisamp_cnv.load_normal_cnvs() - filtered = self._filter_variants_outside_regions(normal_cn, 'all_variants', 'only_normal_cn') - - def exclude_variants_in_multiple_abnormal_or_unlisted_regions(self): - # Battenberg: - # Five possible placements for variant in Battenberg according to CN records: - # 1 record: - # That record has normal CN: include - # That record has abnormal CN: include - # 2 records: - # One record is normal CN, one record is abnormal CN: include - # Both records are abnormal CN: exclude (as we don't know what order the CN events occurred in) - # TITAN: - # In output seen to date, TITAN will only list one record per region. If - # the CN state is abnormal and clonal_frac < 1, this implies the - # remainder of the region will be normal CN. Multiple abnormal records - # for the same region are likely possible, but I haven't yet seen any. - # Regardless, when they occur, they should be properly handled by the - # code. - if not self.has_cnvs(): - raise Exception('CN regions not yet provided') - - # If variant isn't listed in *any* region: exclude (as we suspect CNV - # caller didn't know what to do with the region). - self._filter_variants_outside_regions(self._multisamp_cnv.load_cnvs(), 'all_variants', 'within_cn_regions') - - def format_variants(self, sample_size, error_rate, priority_ssms, only_priority, sex): - if sample_size is None: - sample_size = len(self._variant_idxs) - random.shuffle(self._variant_idxs) - - subsampled, nonsubsampled = [], [] - variant_idx_map = {self._variants[idx]: idx for idx in self._variant_idxs} - used_variant_idxs = set() # Use a set for O(1) testing of membership. 
- - for prissm in priority_ssms: - if prissm not in variant_idx_map: - continue - if len(subsampled) >= sample_size: - break - log('%s_%s in priority' % (prissm.CHROM, prissm.POS)) - varidx = variant_idx_map[prissm] - used_variant_idxs.add(varidx) - subsampled.append(varidx) - - for variant_idx in self._variant_idxs: - if variant_idx in used_variant_idxs: - continue - used_variant_idxs.add(variant_idx) - variant = self._variants[variant_idx] - if (not only_priority) and len(subsampled) < sample_size: - subsampled.append(variant_idx) - else: - nonsubsampled.append(variant_idx) - - assert len(used_variant_idxs) == len(self._variant_idxs) == len(subsampled) + len(nonsubsampled) - - subsampled.sort(key = lambda idx: variant_key(self._variants[idx])) - subsampled_variants = get_elements_at_indices(self._variants, subsampled) - subsampled_ref_counts = self._ref_read_counts[subsampled,:] - subsampled_total_counts = self._total_read_counts[subsampled,:] - - nonsubsampled.sort(key = lambda idx: variant_key(self._variants[idx])) - nonsubsampled_variants = get_elements_at_indices(self._variants, nonsubsampled) - nonsubsampled_ref_counts = self._ref_read_counts[nonsubsampled,:] - nonsubsampled_total_counts = self._total_read_counts[nonsubsampled,:] - - formatter = VariantFormatter() - subsampled_formatted = list(formatter.format_variants(subsampled_variants, subsampled_ref_counts, subsampled_total_counts, error_rate, sex)) - nonsubsampled_formatted = list(formatter.format_variants(nonsubsampled_variants, nonsubsampled_ref_counts, nonsubsampled_total_counts, error_rate, sex)) - - return (subsampled_formatted, nonsubsampled_formatted) - - def write_variants(self, variants, outfn): - with open(outfn, 'w') as outf: - print('\t'.join(('id', 'gene', 'a', 'd', 'mu_r', 'mu_v')), file=outf) - for variant in variants: - variant['ref_reads'] = ','.join([str(v) for v in variant['ref_reads']]) - variant['total_reads'] = ','.join([str(v) for v in variant['total_reads']]) - vals = ( - 'ssm_id', - 'variant_name', - 'ref_reads', - 'total_reads', - 'expected_ref_freq', - 'expected_var_freq', - ) - vals = [variant[k] for k in vals] - print('\t'.join([str(v) for v in vals]), file=outf) - - def _estimate_read_depth(self): - read_sum = 0 - if len(self._variants) == 0: - default_read_depth = 50 - log('No variants available, so fixing read depth at %s.' 
% default_read_depth) - return default_read_depth - else: - return np.nanmedian(self._total_read_counts, axis=0) - - def write_cnvs(self, variants, outfn): - with open(outfn, 'w') as outf: - print('\t'.join(('cnv', 'a', 'd', 'ssms', 'physical_cnvs')), file=outf) - formatter = CnvFormatter(self._estimated_read_depth, self._sampidxs, self._hetsnp_rate) - for cnv in formatter.format_and_merge_cnvs(self._multisamp_cnv.load_single_abnormal_state_cnvs(), variants, self._cellularity): - overlapping = [','.join(o) for o in cnv['overlapping_variants']] - vals = ( - cnv['cnv_id'], - ','.join([str(V) for V in cnv['ref_reads']]), - ','.join([str(V) for V in cnv['total_reads']]), - ';'.join(overlapping), - cnv['physical_cnvs'] - ) - print('\t'.join(vals), file=outf) - -def log(*msgs): - if log.verbose: - print(*msgs, file=sys.stderr) -log.verbose = False - -class CnvParser(object): - def __init__(self, cn_filename): - self._cn_filename = cn_filename - - def parse(self): - cn_regions = defaultdict(list) - - with open(self._cn_filename) as cnf: - reader = csv.DictReader(cnf, delimiter='\t') - for record in reader: - chrom = record['chromosome'].upper() - del record['chromosome'] - for key in ('start', 'end', 'major_cn', 'minor_cn'): - # Some records from Battenberg have major and minor listed as, e.g., - # "1.0", so cast to float before int. - assert float(record[key]) == int(float(record[key])) - record[key] = int(float(record[key])) - record['cellular_prevalence'] = float(record['cellular_prevalence']) - cn_regions[chrom].append(record) - - # Ensure CN regions are properly sorted, which we later rely on when - # filtering out regions with multiple abnormal CN states. - for chrom, regions in cn_regions.items(): - cn_regions[chrom] = sorted(regions, key = lambda r: r['start']) - - return cn_regions - -def get_elements_at_indices(L, indices): - elem = [] - for idx in indices: - elem.append(L[idx]) - return elem - -def parse_priority_ssms(priority_ssm_filename): - if priority_ssm_filename is None: - return [] - priority_ssms = [] - already_seen = set() - - with open(priority_ssm_filename) as priof: - for line in priof: - chrom, pos = line.strip().split('_', 1) - variant = VariantId(CHROM=chrom.upper(), POS=int(pos)) - # Prevent duplicates -- otherwise, we'll add the variant to our - # subsampled list of variants twice. This manifested as a problem in the - # PCAWG 6cfce053-bfd6-4ca0-b74b-b2e4549e4f1f sample. - if variant in already_seen: - continue - priority_ssms.append(variant) - already_seen.add(variant) - - return priority_ssms - -def impute_missing_total_reads(total_reads, missing_variant_confidence): - # Change NaNs to masked values via SciPy. - masked_total_reads = ma.fix_invalid(total_reads) - - # Going forward, suppose you have v variants and s samples in a v*s matrix of - # read counts. Missing values are masked. - - # Calculate geometric mean of variant read depth in each sample. Result: s*1 - sample_means = gmean(masked_total_reads, axis=0) - assert np.sum(sample_means <= 0) == np.sum(np.isnan(sample_means)) == 0 - # Divide every variant's read count by its mean sample read depth to get read - # depth enrichment relative to other variants in sample. Result: v*s - normalized_to_sample = np.dot(masked_total_reads, np.diag(1./sample_means)) - # For each variant, calculate geometric mean of its read depth enrichment - # across samples. 
Result: v*1 - variant_mean_reads = gmean(normalized_to_sample, axis=1) - assert np.sum(variant_mean_reads <= 0) == np.sum(np.isnan(variant_mean_reads)) == 0 - - # Convert 1D arrays to vectors to permit matrix multiplication. - imputed_counts = np.dot(variant_mean_reads.reshape((-1, 1)), sample_means.reshape((1, -1))) - nan_coords = np.where(np.isnan(total_reads)) - total_reads[nan_coords] = imputed_counts[nan_coords] - assert np.sum(total_reads <= 0) == np.sum(np.isnan(total_reads)) == 0 - - total_reads[nan_coords] *= missing_variant_confidence - return np.floor(total_reads).astype(np.int) - -def impute_missing_ref_reads(ref_reads, total_reads): - ref_reads = np.copy(ref_reads) - - assert np.sum(np.isnan(total_reads)) == 0 - nan_coords = np.where(np.isnan(ref_reads)) - ref_reads[nan_coords] = total_reads[nan_coords] - assert np.sum(np.isnan(ref_reads)) == 0 - - return ref_reads.astype(np.int) - -def is_good_chrom(chrom): - # Ignore the following: - # * Variants unmapped ('chrUn') or mapped to fragmented chromosome ('_random') - # * Weird chromosomes from Mutect (e.g., "chr17_ctg5_hap1"). - # * Mitochondrial ("mt" or "m"), which are weird - # * Sex chromosomes difficult to deal with, as expected frequency depends on - # whether patient is male or female, so ignore them for now. TODO: fix this. - if chrom in [str(i) for i in range(1, 23)] + ['X', 'Y']: - return True - else: - return False - -def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence): - parsed_variants = [] - all_variant_ids = [] - num_samples = len(samples) - - for sample in samples: - vcf_fn, vcf_type = vcf_files[sample], vcf_types[sample] - - if vcf_type == 'sanger': - variant_parser = SangerParser(vcf_fn, tumor_sample) - elif vcf_type == 'mutect_pcawg': - variant_parser = MutectPcawgParser(vcf_fn, tumor_sample) - elif vcf_type == 'mutect_smchet': - variant_parser = MutectSmchetParser(vcf_fn, tumor_sample) - elif vcf_type == 'mutect_tcga': - variant_parser = MutectTcgaParser(vcf_fn, tumor_sample) - elif vcf_type == 'muse': - variant_parser = MuseParser(vcf_fn, muse_tier, tumor_sample) - elif vcf_type == 'dkfz': - variant_parser = DKFZParser(vcf_fn, tumor_sample) - elif vcf_type == 'strelka': - variant_parser = StrelkaParser(vcf_fn, tumor_sample) - elif vcf_type == 'vardict': - variant_parser = VarDictParser(vcf_fn, tumor_sample) - elif vcf_type == 'pcawg_consensus': - variant_parser = PcawgConsensusParser(vcf_fn, tumor_sample) - elif vcf_type == 'somsnip': - variant_parser = SomSnipParser(vcf_fn, tumor_sample) - else: - raise Exception('Unknowon variant type: %s' % vcf_type) - - parsed_variants.append(variant_parser.list_variants()) - variant_ids = [VariantId(str(v[0].CHROM), int(v[0].POS)) for v in parsed_variants[-1]] - all_variant_ids += variant_ids - - all_variant_ids = list(set(all_variant_ids)) # Eliminate duplicates. 
- all_variant_ids.sort(key = variant_key) - num_variants = len(all_variant_ids) - variant_positions = dict(zip(all_variant_ids, range(num_variants))) - - total_read_counts = np.zeros((num_variants, num_samples)) - total_read_counts.fill(np.nan) - ref_read_counts = np.copy(total_read_counts) - - for sample_idx, parsed in enumerate(parsed_variants): - for variant, ref_reads, total_reads in parsed: - variant_id = VariantId(str(variant.CHROM), int(variant.POS)) - variant_idx = variant_positions[variant_id] - ref_read_counts[variant_idx, sample_idx] = ref_reads - total_read_counts[variant_idx, sample_idx] = total_reads - - total_read_counts = impute_missing_total_reads(total_read_counts, missing_variant_confidence) - ref_read_counts = impute_missing_ref_reads(ref_read_counts, total_read_counts) - return (all_variant_ids, ref_read_counts, total_read_counts) - -def infer_sex(variant_ids): - num_y_variants = len([V for V in variant_ids if V.CHROM == 'Y']) - if num_y_variants > 0: - return 'male' - else: - return 'female' - -def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples): - vcf_files = {} - vcf_types = {} - cnv_files = {} - - assert len(vcf_files_and_samples) == len(vcf_types_and_samples), 'Must specify same number of VCF files and VCF types' - srcs_and_dsts = [(vcf_files_and_samples, vcf_files), (vcf_types_and_samples, vcf_types)] - - should_use_cnvs = cnv_files_and_samples is not None - if should_use_cnvs: - assert len(cnv_files_and_samples) == len(vcf_files_and_samples), 'Must specify same number of VCF and CNV files' - srcs_and_dsts.append( (cnv_files_and_samples, cnv_files) ) - - for (src, dst) in srcs_and_dsts: - for combined in src: - assert '=' in combined, ('%s should be in format =' % combined) - sample, val = combined.split('=', 1) - dst[sample] = val - - # Sample order will dictate eventual output order. - common_samps = reduce(lambda s1, s2: s1 & s2, [set(D[1].keys()) for D in srcs_and_dsts]) - ordered_samps = [S.split('=', 1)[0] for S in vcf_files_and_samples] - assert len(ordered_samps) == len(common_samps) # Ensure no duplicates. - - assert set(vcf_files.keys()) == common_samps, \ - ('VCF file samples (%s) differ from common samples (%s)' % (vcf_files.keys(), common_samps)) - assert set(vcf_types.keys()) == common_samps, \ - ('VCF type samples (%s) differ from common samples (%s)' % (vcf_types.keys(), common_samps)) - if should_use_cnvs: - assert set(cnv_files.keys()) == common_samps, \ - ('CNV file samples (%s) differ from CNV file samples (%s)' % (cnv_files.keys(), common_samps)) - - return (ordered_samps, vcf_files, vcf_types, cnv_files) - -def main(): - all_vcf_types = set(('sanger', 'mutect_pcawg', 'mutect_smchet', 'mutect_tcga', 'muse','dkfz', 'strelka', 'vardict', 'pcawg_consensus')) - - parser = argparse.ArgumentParser( - description='Create ssm_data.txt and cnv_data.txt input files for PhyloWGS from VCF and CNV data.', - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument('--vcf-type', dest='vcf_types', action='append', required=True, - help='Type of VCF file for each sample, specified as =. Valid VCF types are %s.' 
% ','.join(all_vcf_types)) - parser.add_argument('-e', '--error-rate', dest='error_rate', type=restricted_float, default=0.001, - help='Expected error rate of sequencing platform') - parser.add_argument('--missing-variant-confidence', dest='missing_variant_confidence', type=restricted_float, default=1., - help='Confidence in range [0, 1] that SSMs missing from a sample are indeed not present in that sample') - parser.add_argument('-s', '--sample-size', dest='sample_size', type=int, - help='Subsample SSMs to reduce PhyloWGS runtime') - parser.add_argument('-P', '--priority-ssms', dest='priority_ssm_filename', - help='File containing newline-separated list of SSMs in "_" format to prioritize for inclusion') - parser.add_argument('--only-priority', dest='only_priority', action='store_true', - help='Only sample variants provided on priority list') - parser.add_argument('--cnvs', dest='cnv_files', action='append', - help='Path to CNV file created with parse_cnvs.py for each sample. Specified as =.') - parser.add_argument('--regions', dest='regions', choices=('normal_cn', 'normal_and_abnormal_cn', 'all'), default='normal_and_abnormal_cn', - help='Which regions to use variants from. Refer to the parser README for more details.') - parser.add_argument('--output-cnvs', dest='output_cnvs', default='cnv_data.txt', - help='Output destination for CNVs') - parser.add_argument('--output-variants', dest='output_variants', default='ssm_data.txt', - help='Output destination for variants') - parser.add_argument('--output-params', dest='output_params', default='params.json', - help='Output destination for run parameters') - parser.add_argument('--tumor-sample', dest='tumor_sample', - help='Name of the tumor sample in the input VCF file. Defaults to last sample if not specified.') - parser.add_argument('--muse-tier', dest='muse_tier', type=int, default=0, - help='Maximum MuSE tier to include') - parser.add_argument('--nonsubsampled-variants', dest='output_nonsubsampled_variants', - help='If subsampling, write nonsubsampled variants to separate file, in addition to subsampled variants') - parser.add_argument('--nonsubsampled-variants-cnvs', dest='output_nonsubsampled_variants_cnvs', - help='If subsampling, write CNVs for nonsubsampled variants to separate file') - parser.add_argument('--sex', dest='sex', default='auto', choices=('auto', 'male', 'female'), - help='Sex of patient. Used to adjust expected variant frequencies on sex chromosomes. ' + - 'If auto, patient is set to male if any variants are provided on the Y chromosome, and female otherwise.') - parser.add_argument('--het-snp-rate', dest='hetsnp_rate', default=7e-4, type=float, - help='Average number of heterozygous SNPs per base used to call copy ' + - 'number. This determines how heavily we weight somatic CNAs relative to ' + - 'SNVs. Defaults to 7 SNPs per 10 kb, as per Battenberg.') - parser.add_argument('--verbose', dest='verbose', action='store_true') - parser.add_argument('vcf_files', nargs='+', help='Path to VCF file for each sample. 
Specified as =.')
-    args = parser.parse_args()
-
-    log.verbose = args.verbose
-    params = {}
-
-    samples, vcf_files, vcf_types, cnv_files = extract_sample_data(args.vcf_files, args.vcf_types, args.cnv_files)
-    params['samples'], params['vcf_files'], params['vcf_types'], params['cnv_files'] = samples, vcf_files, vcf_types, cnv_files
-    num_samples = len(samples)
-    variant_ids, ref_read_counts, total_read_counts = parse_variants(samples, vcf_files, vcf_types, args.tumor_sample, args.missing_variant_confidence)
-
-    # Fix random seed to ensure same set of SSMs chosen when subsampling on each
-    # invocation.
-    random.seed(1)
-
-    if args.sex == 'auto':
-        sex = infer_sex(variant_ids)
-    else:
-        sex = args.sex
-
-    grouper = VariantAndCnvGroup(args.hetsnp_rate)
-    grouper.add_variants(variant_ids, ref_read_counts, total_read_counts)
-
-    if len(cnv_files) > 0:
-        # Load CNV files in same order as sample order given for VCFs.
-        cn_regions = [CnvParser(cnv_files[S]).parse() for S in samples]
-        grouper.add_cnvs(cn_regions, sex)
-
-    if not grouper.has_cnvs():
-        assert args.regions == 'all', 'If you do not provide CNA data, you must specify --regions=all'
-
-    if args.regions == 'normal_cn':
-        grouper.retain_only_variants_in_normal_cn_regions()
-    elif args.regions == 'normal_and_abnormal_cn':
-        grouper.exclude_variants_in_multiple_abnormal_or_unlisted_regions()
-    elif args.regions == 'all':
-        pass
-    else:
-        raise Exception('Unknown --regions value: %s' % args.regions)
-
-    priority_ssms = parse_priority_ssms(args.priority_ssm_filename)
-
-    subsampled_vars, nonsubsampled_vars = grouper.format_variants(args.sample_size, args.error_rate, priority_ssms, args.only_priority, sex)
-    if len(subsampled_vars) == 0:
-        print('No variants to write', file=sys.stderr)
-        sys.exit(0)
-    grouper.write_variants(subsampled_vars, args.output_variants)
-    if args.output_nonsubsampled_variants:
-        grouper.write_variants(nonsubsampled_vars, args.output_nonsubsampled_variants)
-
-    if grouper.has_cnvs() and args.regions != 'normal_cn':
-        # Write CNVs.
-        grouper.write_cnvs(subsampled_vars, args.output_cnvs)
-        if args.output_nonsubsampled_variants and args.output_nonsubsampled_variants_cnvs:
-            grouper.write_cnvs(nonsubsampled_vars, args.output_nonsubsampled_variants_cnvs)
-    else:
-        # Write empty CNV file.
-        with open(args.output_cnvs, 'w'):
-            pass
-
-    with open(args.output_params, 'w') as F:
-        json.dump(params, F)
-
-if __name__ == '__main__':
-    main()
diff --git a/modules/phylowgs/1.0/src/fill_battenberg.py b/modules/phylowgs/1.0/src/fill_battenberg.py
new file mode 100644
index 00000000..ec6d3cd5
--- /dev/null
+++ b/modules/phylowgs/1.0/src/fill_battenberg.py
@@ -0,0 +1,379 @@
+#!/usr/bin/python3
+
+"""
+This script fills empty segments in Battenberg subclones.txt files. It is adapted from Kostiantyn Dreval's fill_segments.py script.
+It requires a seg file and a chromosome arms file as mandatory inputs; the path to the output file must also be specified.
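+
+It assumes the standard Battenberg subclones.txt column layout, where the columns
+after chr, startpos and endpos are BAF, pval, LogR, ntot, nMaj1_A, nMin1_A, frac1_A,
+nMaj2_A, nMin2_A and frac2_A; gaps are filled with a clonal normal state (BAF 0.5,
+LogR 0, copy number 1+1) built from exactly these ten values.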
+
+Example:
+
+python3 fill_battenberg.py --input .subclones.txt --chromArm .tsv --output .subclones.txt
+"""
+
+# import required modules
+import pandas as pd
+import argparse
+
+
+def main():
+    # initiate the parser and handle arguments from the command line
+    args = parse_args()
+    input_file = args.input
+    output_file = args.output
+    chrom_file = args.chromArm
+
+    # determine the format of the input file
+    input_format = input_file[-3:]
+
+    # check arguments given in command line
+    # check_arguments(args, input_format)
+
+    # create a dictionary containing coordinates of chromosome arms
+    arm_chrom = load_chrom_arm(chrom_file)
+    # get the order of chromosomes
+    chrom_order = list(arm_chrom.keys()) + ["buffer"]
+
+    # initialize empty variables for the new segments
+    columns_new = []
+    columns_edges = []
+
+    # initialize a list to store all segments, since it is faster than concatenating a pd df with a large number of segments
+    seg_filled = []
+
+    # assign values to be used to fill normal CN segments
+    empty_baf = float(0.5)
+    empty_pval = int(1)
+    empty_logr = int(0)
+    empty_ntot = float(2.0)
+    empty_nMaj1_A = int(1)
+    empty_nMin1_A = int(1)
+    empty_frac1_A = int(1)
+    empty_nMaj2_A = int(1)
+    empty_nMin2_A = int(1)
+    empty_frac2_A = int(1)
+
+    columns_fill = [empty_baf, empty_pval, empty_logr, empty_ntot, empty_nMaj1_A, empty_nMin1_A, empty_frac1_A, empty_nMaj2_A, empty_nMin2_A, empty_frac2_A]
+
+    # fill segments
+    seg = open(input_file, 'r+')
+    lines=seg.readlines()
+
+    # remove segments with NA values for nMaj1_A or nMin1_A
+    print("Removing segments with NA values for nMaj1_A or nMin1_A...")
+    # nMaj1_A and nMin1_A are the 8th and 9th columns (0-based indices 7 and 8)
+    to_remove = [i for i, line in enumerate(lines) if "NA" in line.split("\t")[7:9]]
+    if len(to_remove) > 0:
+        for index in reversed(to_remove): # start at the end to avoid recomputing offsets
+            del lines[index]
+
+
+    # first, get the header of the file
+    header=lines[0].rstrip("\n").rstrip("\r").split("\t")
+
+    print("Filling missing segments and smoothing centromeres...")
+    # next, go through each segment, skipping the header
+    for i in range(1,len(lines)-1):
+
+        # read 2 segments at a time to compare the coordinates of the end of the previous segment and the start of the next segment
+        columns_first = (lines[i].rstrip("\n").rstrip("\r")).split("\t")
+        columns_second = (lines[i+1].rstrip("\n").rstrip("\r")).split("\t")
+
+        # insert an empty segment from the beginning of the chromosome of the first segment in the file to complete the telomeric region of the first chromosome
+        if i==1:
+            columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_first[1])-1)] + columns_fill
+            seg_filled.append(columns_new)
+            # seg_filled.append(columns_first) I think this is a duplicate of the one below after fixing the possible centromeric end of the segment
+            # deal with fencepost problem
+            if (int(columns_first[2]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[2]) < arm_chrom[columns_first[0]]['q']['start']):
+                columns_first[2] = str(arm_chrom[columns_first[0]]['p']['end'])
+            seg_filled.append(columns_first)
+
+            if (chrom_order[chrom_order.index(columns_second[0])] == chrom_order[chrom_order.index(columns_first[0])+1]):
+                missing_arm = chrom_order[chrom_order.index(columns_first[0])]
+                columns_edges = [columns_first[0], str(arm_chrom[missing_arm]['q']['start']), str(arm_chrom[missing_arm]['q']['end'])] + columns_fill
+                seg_filled.append(columns_edges)
+                seg_filled.append(columns_second)
+                continue
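+
+        # Illustrative sketch of the fill logic (hypothetical coordinates): if one
+        # call ends at 61,000,000 and the next starts at 61,500,000 on the same
+        # arm, the gap 61,000,001-61,499,999 is emitted as a new clonal normal
+        # segment built from columns_fill (BAF 0.5, LogR 0, copy number 1+1).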
+
+        # scenario 1: segments on the same chromosome
+        if (columns_first[0]==columns_second[0]):
+
+            # handle very rare overlapping segments (occurs in ~ 0.008% of segments)
+            if (int(columns_first[2]) > int(columns_second[1])):
+                columns_first[2] = int(columns_second[1])-1
+                seg_filled.append(columns_first)
+                pass
+
+            # for segments in the p arm
+            if (int(columns_second[1]) < arm_chrom[columns_second[0]]['p']['end']):
+                # create an empty segment to fill in
+                columns_new = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill
+                seg_filled.append(columns_new)
+                next_segment = (lines[i+1].rstrip("\n").rstrip("\r")).split("\t")
+                if (int(columns_second[2]) < arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) < int(next_segment[1])):
+                    seg_filled.append(columns_second)
+                seg_filled.append(columns_second)
+
+            # deal with centromeres
+            # I already know that this is the same sample, and the same chromosome
+            elif (int(columns_first[1]) < arm_chrom[columns_first[0]]['p']['end'] and int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end']):
+
+                # first, let's deal with the end of the p arm: segment 1 might end before the centromere, or within the centromere
+                if int(columns_first[2]) < arm_chrom[columns_first[0]]['p']['end']:
+                    columns_new = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill
+                    seg_filled.append(columns_new)
+                # if it extends into the centromere, cut segment 1 at the end of the p arm
+                else:
+                    columns_first[2] = str(arm_chrom[columns_first[0]]['p']['end'])
+                    seg_filled.append(columns_first)
+
+                # now, let's deal with the start of the q arm: it might start within or after the centromere
+                if int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']:
+                    columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start'])
+                    seg_filled.append(columns_second)
+                    next_segment = (lines[i+1].rstrip("\n").rstrip("\r")).split("\t")
+                    if (int(next_segment[1]) < arm_chrom[next_segment[0]]['q']['start'] and int(next_segment[2]) > arm_chrom[next_segment[0]]['q']['start']):
+                        next_segment[1] = str(arm_chrom[next_segment[0]]['q']['start'])
+                        seg_filled.append(next_segment)
+
+                # possible edge cases around the centromere
+                else:
+                    columns_new = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[2])-1)] + columns_fill
+                    previous_segment = (lines[i].rstrip("\n").rstrip("\r")).split("\t")
+                    if (int(previous_segment[2])>arm_chrom[columns_second[0]]['q']['start']):
+                        columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(previous_segment[2]))] + columns_first[3:13]
+                        seg_filled.append(columns_edges)
+                        columns_new = [columns_edges[0], str(int(columns_edges[2])+1), str(int(columns_second[1])-1)] + columns_fill
+                    seg_filled.append(columns_new)
+                    seg_filled.append(columns_second)
+
+            # for segments in the q arm
+            elif (int(columns_first[1]) > arm_chrom[columns_second[0]]['q']['start']):
+                # create an empty segment to fill in
+                columns_new = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill
+                seg_filled.append(columns_new)
+                seg_filled.append(columns_second)
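+
+            # Example of the centromere smoothing above (hypothetical coordinates):
+            # with the p arm ending at 48.1 Mb and the q arm starting at 52.0 Mb,
+            # a call ending at 50.2 Mb is trimmed back to 48.1 Mb and a call
+            # starting at 50.5 Mb is pushed forward to 52.0 Mb, so no output
+            # segment overlaps the centromere.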
+
+            # some segments fall completely within the centromere; drop them
+            elif (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']):
+                if (int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']):
+                    columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), str(int(columns_first[2]))] + columns_first[3:13]
+                    columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill
+                    seg_filled.append(columns_new)
+                else:
+                    columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[1])-1)] + columns_fill
+                    if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']):
+                        columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start'])
+                        seg_filled.append(columns_second)
+                seg_filled.append(columns_edges)
+                if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) < arm_chrom[columns_second[0]]['q']['start']):
+                    pass # this just drops the segment from the output if it is within the centromere
+                else:
+                    if (int(columns_second[1])>arm_chrom[columns_second[0]]['q']['start']):
+                        seg_filled.append(columns_second)
+                    elif (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']):
+                        pass # this is handled later
+
+            # did I miss anything? it is possible some edge cases were not considered at the time of script development
+            else:
+                print(columns_first[0], columns_second[0], columns_first[1], columns_second[1], columns_first[2], columns_second[2])
+                raise ValueError ("Unhandled segment arrangement. This is an edge case that needs debugging!")
+
+        # scenario 2: same sample, but going over to the next chromosome
+        elif (columns_first[0]!=columns_second[0]):
+            # very rare cases when a whole chromosome is missing; identify them here
+            if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]):
+                missing_chrom = chrom_order[chrom_order.index(columns_first[0])+1]
+                missing_p = [missing_chrom, str(arm_chrom[missing_chrom]['p']['start']), str(arm_chrom[missing_chrom]['p']['end'])] + columns_fill
+                missing_q = [missing_chrom, str(arm_chrom[missing_chrom]['q']['start']), str(arm_chrom[missing_chrom]['q']['end'])] + columns_fill
+                seg_filled.append(missing_p)
+                seg_filled.append(missing_q)
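+
+            # For instance (hypothetical): if the calls jump from chr7 straight to
+            # chr9, chr8 is reconstructed as two clonal normal segments, one
+            # spanning its whole p arm and one spanning its whole q arm.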
+
+            # first, are there any segments in the p arm? that means the second segment starts all the way in the centromere or the q arm
+            if (int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']): #TRUE
+                if (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']): #FALSE
+                    previous_segment = (lines[i-1].rstrip("\n").rstrip("\r")).split("\t")
+                    if (chrom_order[chrom_order.index(previous_segment[0])] != chrom_order[chrom_order.index(columns_first[0])]):
+                        columns_edges = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill
+                        seg_filled.append(columns_edges)
+                    # pass
+                elif (int(columns_first[1]) < arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']): #TRUE
+                    previous_segment = (lines[i-1].rstrip("\n").rstrip("\r")).split("\t")
+                    if (chrom_order[chrom_order.index(previous_segment[0])] != chrom_order[chrom_order.index(columns_first[0])-1]):
+                        columns_edges = [columns_first[0], columns_first[1], str(arm_chrom[columns_first[0]]['p']['end'])] + columns_first[3:13]
+                        columns_first[1] = arm_chrom[columns_first[0]]['q']['start']
+                        seg_filled.append(columns_edges)
+                        seg_filled.append(columns_first)
+                    # Case when a chromosome has a single event that spans the centromere: split it into four parts (p-start to seg-start, seg-start to p-end, q-start to seg-end, seg-end to q-end)
+                    else:
+                        columns_pedge = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), int(columns_first[1])-1] + columns_fill
+                        columns_segp = [columns_first[0], columns_first[1], str(arm_chrom[columns_first[0]]['p']['end'])] + columns_first[3:13]
+                        columns_segq = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), columns_first[2]] + columns_first[3:13]
+                        columns_qedge = [columns_first[0], int(columns_first[2]) + 1, str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill
+                        seg_filled.append(columns_pedge)
+                        seg_filled.append(columns_segp)
+                        seg_filled.append(columns_segq)
+                        seg_filled.append(columns_qedge)
+
+                else:
+                    columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill
+                    seg_filled.append(columns_edges)
+                if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end']):
+                    if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]):
+                        seg_filled.append(missing_p)
+                        seg_filled.append(missing_q)
+                    if (int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start'] and int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end']):
+                        columns_first[1] = arm_chrom[columns_first[0]]['q']['start']
+                        seg_filled.append(columns_first)
+                    columns_new = [columns_second[0], str(arm_chrom[columns_second[0]]['p']['start']), str(arm_chrom[columns_second[0]]['p']['end'])] + columns_fill
+                    seg_filled.append(columns_new)
+                    if (int(columns_second[1]) > arm_chrom[columns_second[0]]['q']['start']):
+                        columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[1])-1)] + columns_fill
+                        seg_filled.append(columns_edges)
+                    else:
+                        columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start'])
+                        seg_filled.append(columns_second)
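+                    # i.e. the first call on the new chromosome is anchored to its q
+                    # arm once its p arm has been back-filled with a normal CN segment.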
+
+            # are there any segments in the q arm? that means the first segment ends before the start of the q arm
+            elif (int(columns_first[2]) < arm_chrom[columns_first[0]]['q']['start']):
+                columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill
+                seg_filled.append(columns_first)
+                if (int(columns_first[2]) < arm_chrom[columns_first[0]]['p']['end']):
+                    columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill
+                    seg_filled.append(columns_edges)
+                seg_filled.append(columns_new)
+                seg_filled.append(columns_second)
+
+            # are there any segments that start in the p arm and span the centromere? if so, maintain the LOH flag and LogR, but cut out the centromere
+            elif (int(columns_second[1]) < arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) > arm_chrom[columns_second[0]]['q']['start']):
+                previous_segment = (lines[i].rstrip("\n").rstrip("\r")).split("\t")
+                if "X" not in str(columns_second[0]):
+                    next_segment = (lines[i+2].rstrip("\n").rstrip("\r")).split("\t")
+                columns_new = [columns_second[0], str(int(columns_second[1])+1), str(arm_chrom[columns_second[0]]['p']['end'])] + columns_first[3:13]
+                columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[2]))] + columns_first[3:13]
+                if (columns_new[0]!=previous_segment[0]):
+                    columns_new[1]=str(arm_chrom[columns_new[0]]['p']['start'])
+                if (columns_second[0]==next_segment[0]):
+                    seg_filled.append(columns_new)
+                    seg_filled.append(columns_edges)
+
+            # in other cases, there are segments in both the p and q arms
+            else:
+                columns_edges = [columns_second[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_second[1])-1)] + columns_fill
+                if (int(columns_first[1]) > arm_chrom[columns_second[0]]['p']['end']):
+                    if (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']):
+                        columns_first[1] = arm_chrom[columns_first[0]]['q']['start']
+                        seg_filled.append(columns_first)
+                    columns_new = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill
+                    seg_filled.append(columns_new)
+                    if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]):
+                        seg_filled.append(missing_p)
+                        seg_filled.append(missing_q)
+                seg_filled.append(columns_edges)
+                if (int(columns_second[2]) < arm_chrom[columns_second[0]]['p']['end']):
+                    seg_filled.append(columns_second)
+
+
+        # scenario 3: new sample, obviously a new chromosome
+        else:
+            previous_segment = (lines[i].rstrip("\n").rstrip("\r")).split("\t")
+            columns_edges = [columns_first[0], str(int(previous_segment[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill
+            columns_new = [columns_second[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_second[1])-1)] + columns_fill
+            seg_filled.append(columns_edges)
+            seg_filled.append(columns_new)
+            seg_filled.append(columns_second)
+
+
+    seg.close()
+
+    # make a df from the list of lists and convert chromosome coordinates to integers
+    seg_filled_df = pd.DataFrame(seg_filled, columns = header)
+    seg_filled_df["startpos"] = seg_filled_df["startpos"].astype(int)
+    seg_filled_df["endpos"] = seg_filled_df["endpos"].astype(int)
+
+    # remove any inverted segments, if there are any
+    print("Checking and removing inverted segments...")
+    seg_filled_df = seg_filled_df[(seg_filled_df["endpos"]>seg_filled_df["startpos"])]
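+    # Sanity-check sketch: a row whose endpos is not strictly greater than its
+    # startpos (which can arise when a filled gap collapses to zero width
+    # between two contiguous calls) is dropped here rather than written out.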
+
+    # remove any duplicated segments, if any remain
+    print("Checking and removing duplicated segments...")
+    seg_filled_df = seg_filled_df.drop_duplicates()
+    # seg_filled_df = seg_filled_df.groupby((seg_filled_df["endpos"] != seg_filled_df["endpos"].shift(-2)).cumsum().values).first()
+    # seg_filled_df = seg_filled_df.groupby((seg_filled_df["endpos"] != seg_filled_df["endpos"].shift(-1)).cumsum().values).first()
+    # seg_filled_df = seg_filled_df.groupby((seg_filled_df["startpos"] != seg_filled_df["startpos"].shift(-1)).cumsum().values).first()
+
+    # save to the output file specified by the user
+    print("Saving to file...")
+    seg_filled_df.to_csv(output_file, header=True, index=False, sep="\t")
+    print("Done!")
+
+
+# Create a nested dictionary to store chromosome arm coordinates. It is adapted from Chris's implementation in another script that summarizes CNVs
+def load_chrom_arm(chrom_file):
+    arm_chrom = {}
+    required_cols = ["chromosome", "start", "end", "arm"]
+    header_cols = {}
+
+    i = 0
+    with open(chrom_file) as f:
+        for line in f:
+            i += 1
+            line = line.rstrip("\n").rstrip("\r")  # Remove line endings
+            cols = line.split("\t")
+
+            # Skip empty lines
+            if not line:
+                continue
+
+            # If we haven't parsed the header yet, assume this is the first line of the file (aka the header)
+            if not header_cols:
+                j = 0
+                for col in cols:
+                    if col in required_cols:
+                        header_cols[col] = j
+                    j += 1
+
+                # Check to make sure all required columns are found
+                for col in required_cols:
+                    if col not in header_cols:
+                        raise AttributeError("Unable to locate column %s in the chromosome arm positions file \'%s\'" % (col, chrom_file))
+                # If we get this far, the header is valid
+                continue
+
+            if cols[0] not in arm_chrom:
+                arm_chrom[cols[0]] = {}
+            if cols[3]:
+                if cols[3] not in arm_chrom[cols[0]]:
+                    arm_chrom[cols[0]][cols[3]]={}
+                arm_chrom[cols[0]][cols[3]]['start'] = int(cols[1])
+                arm_chrom[cols[0]][cols[3]]['end'] = int(cols[2])
+    return arm_chrom
+
+
+# Check that required arguments are provided, and that the input is in .seg format
+def check_arguments(args, input_format):
+    if input_format == 'seg' and not all([args.input, args.output, args.chromArm]):
+        raise ValueError ('Must specify input .seg file, output file, and file listing coordinates of chromosome arms.')
+    elif input_format != 'seg':
+        raise ValueError ('Input file must be in .seg format')
+    else:
+        pass
+
+
+# Parse arguments from the command line
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--input",
+                        help="Input file in .seg format to fill segments", required=True)
+    parser.add_argument("--output",
+                        help="Resulting file after filling missing segments", required=True)
+    parser.add_argument("--chromArm",
+                        help="File with coordinates of chromosome arms for a given genome build", required=True)
+
+    # ignore everything else that is not required by this script
+    args, unknown = parser.parse_known_args()
+    # return arguments provided by user
+    return args
+
+
+if __name__ == '__main__':
+    main()
diff --git a/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R
new file mode 100644
index 00000000..587cdb05
--- /dev/null
+++ b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R
@@ -0,0 +1,536 @@
+
+
+#'
+#' Processing pipeline for phyloWGS outputs that takes the output json files and the preprocessing output files.
+#' The SAMPLE_ID.mutass.zip file must be unzipped before running the script.
+
+# Example: how to run
+#mkdir -p output
+#Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unzipped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out
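+# (If needed, something like `unzip SAMPLE_ID.mutass.zip -d unzipped.mutass/`
+# would produce the unzipped tree directory assumed above; the paths in this
+# example are illustrative.)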
+
+##################################################
+# load required libraries
+##################################################
+
+# library("optparse")
+library("rjson")
+library("tidyverse")
+library("ggrepel")
+library("data.table")
+
+
+##########################
+#### Snakemake Input #####
+##########################
+
+samplename = snakemake@wildcards[["patient_id"]]
+json_file = snakemake@input[["summ"]]
+trees_out= snakemake@input[["mutass"]]
+ssm_file = snakemake@input[["ssms"]]
+cnv_file = snakemake@input[["cnvs"]]
+mafs = unlist(strsplit(snakemake@params[["maf_list"]], ","))
+mut_file = snakemake@input[["muts"]]
+driver_genes = snakemake@params[["drivers"]]
+sample_order = unlist(strsplit(snakemake@params[["sample_order"]], ","))
+genome_build = snakemake@wildcards[["genome_build"]]
+
+# Define the chr_prefix parameter based on the genome_build
+chr_prefixed = str_detect(genome_build, "hg")
+
+
+# option_list = list(
+#   make_option(c("-n", "--samplename"), type="character", default=NULL, help="Samplename of the sample to run", metavar="character"),
+#   make_option(c("-j", "--json_summ"), type="character", default=NULL, help="SAMPLE_ID.summ.json file generated by phyloWGS", metavar="character"),
+#   make_option(c("-t", "--trees_out"), type="character", default=NULL, help="Directory containing unzipped XX.mutass.zip trees", metavar="character"),
+#   make_option(c("-s", "--ssm"), type="character", default=NULL, help="Preprocessing ssm_data.txt output file", metavar="character"),
+#   make_option(c("-c", "--copynumber"), type="character", default=NULL, help="Preprocessing cnv_data.txt output file", metavar="character"),
+#   make_option(c("-a", "--tumourA_maf"), type="character", default=NULL, help="Augmented maf file of tumour A", metavar="character"),
+#   make_option(c("-b", "--tumourB_maf"), type="character", default=NULL, help="Augmented maf file of tumour B", metavar="character"),
+#   make_option(c("-m", "--json_muts"), type="character", default=NULL, help="SAMPLE_ID.muts.json file", metavar="character"),
+#   make_option(c("-o", "--output"), type="character", default=NULL, help="Output directory", metavar="character")
+# )
+#
+# opt_parser = OptionParser(option_list=option_list)
+# opt = parse_args(opt_parser)
+#
+# samplename = opt$samplename
+# json_file = opt$json_summ
+# trees_out= opt$trees_out ### directory where the unzipped SAMPLE_ID.mutass.zip trees are
+# ssm_file = opt$ssm
+# cnv_file = opt$copynumber
+# mafA = opt$tumourA_maf
+# mafB = opt$tumourB_maf
+# mut_file = opt$json_muts
+# output_dir = opt$output
+#
+#
+#
+# .checkfile = function(infile) {
+#
+#   if (!file.exists(infile)) {
+#
+#     stop(paste("File", infile, "does not exist", sep=""))
+#
+#   }
+#
+# }
+#
+#
+# .checkfile(json_file)
+# .checkfile(ssm_file)
+# .checkfile(cnv_file)
+# .checkfile(mafA)
+# .checkfile(mafB)
+# .checkfile(mut_file)
+
+##################################################
+# Process input files
+##################################################
+# Parse the input files and obtain the required data for this run
+result1 <- fromJSON(file = json_file)
+result_mut<-fromJSON(file = mut_file)
+ssm_pre<-read.table(file = ssm_file, header = TRUE)
+cnv_pre<-read.delim(file = cnv_file, header = TRUE)[,c("cnv","a","d")]
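+
+# For reference: ssm_data.txt holds one row per SSM (id, gene = chrom_pos,
+# a = ref read counts, d = total read counts, mu_r, mu_v) and cnv_data.txt one
+# row per CNA pseudo-SSM (cnv, a, d, ssms, physical_cnvs), as produced by the
+# PhyloWGS preprocessing step; only cnv, a and d are used here.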
+##################################################
+out_json_to_Rtable= snakemake@output[["tree_summary"]]
+ssm_to_trees= snakemake@output[["maf"]]
+cnv_to_trees= snakemake@output[["cnvs"]]
+cellular_prevalence_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cellular_prevalence.pdf"))
+CCF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cancer_cell_fraction_.pdf"))
+VAF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_.pdf"))
+VAF_coding_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_coding.pdf"))
+tree_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_tree.pdf"))
+CCF_table = snakemake@output[["CCF"]]
+
+if(!dir.exists(snakemake@output[["plots"]])){dir.create(snakemake@output[["plots"]])}
+
+###################################################
+# open summ.json file and convert it into human-readable format
+###################################################
+
+# this function opens SAMPLE_ID.summ.json and converts it into an R table
+open_tree = function(json_summ_file,out_json_to_Rtable){
+
+  out_res<-NULL
+  for (j in 1:length(json_summ_file[["trees"]])){
+
+    tree_focal<-json_summ_file[["trees"]][j]
+    tree_focal_statA<-as.data.frame(t(unlist(sapply(tree_focal,function(x)x[c("clustering_index","branching_index","llh","linearity_index")]))))
+    colnames(tree_focal_statA)<-c("clustering_index","branching_index","llh","linearity_index")
+    tree_focal_statA$tree_id<-j-1
+    rownames(tree_focal_statA)<-NULL
+
+    # drop the cellular_prevalence columns (every third column); see the grepl() alternative below
+    tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,-c(3,6,9,12,15,18,21,24,27,30)]
+    #tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))]
+    colnames(tree_focal_statB)<-sub("^[^.]*.", "", colnames(tree_focal_statB))
+    stat_both<-cbind(tree_focal_statA,tree_focal_statB)
+    out_res<-bind_rows(stat_both,out_res)
+    out_res_ordered<-out_res[order(out_res$tree_id),]
+  } # for j loop
+
+density<-json_summ_file["tree_densities"]
+density_unlist<-data.frame("density"=unlist(density))
+row.names(density_unlist)<-sub("^[^.]*.", "", row.names(density_unlist))
+
+density_unlist$tree_id<-row.names(density_unlist)
+row.names(density_unlist)<-NULL
+
+final_table=merge(out_res_ordered,density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to the all-trees table
+write.table(final_table, file =out_json_to_Rtable ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE)
+return(final_table)
+}
+
+
+result_tree<-open_tree(result1,out_json_to_Rtable)
+
+
+###################################################
+# extracts the best tree
+###################################################
+# the best tree is the tree with the highest density
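+# Note: which.max() returns the first tree when several trees share the maximum density.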
+
+best_tree_id = function(R_table) {
+  best=R_table[which.max(R_table$density),]
+  best_tree_id<-paste(best$tree_id,"json",sep = ".")
+  return(best_tree_id)
+}
+best_tree_fileID<-best_tree_id(result_tree)
+
+
+#######################################################################
+# extract the stats (SNVs and CNVs assigned to each population) from the best tree
+#######################################################################
+
+open_best_tree = function(trees_out,best_tree_id){
+  unzip(trees_out, files = best_tree_id, exdir = dirname(trees_out), overwrite = TRUE)
+  best_tree_path = paste0(dirname(trees_out), "/", best_tree_id)
+  rr <- fromJSON(file = best_tree_path)
+  return(rr)
+}
+rr= open_best_tree(trees_out,best_tree_fileID)
+
+
+#######################################################################
+# annotate point mutations and CNVs in the best tree
+#######################################################################
+best_focal<-result1[["trees"]][as.numeric(gsub(".json","",best_tree_fileID))+1]
+tree_structure<-as.data.frame(sapply(best_focal,function(x)x["structure"])) ##[6]
+# Populations that are direct children of root node 0; their cellular prevalences
+# are summed to estimate tumour purity in merge_both() below.
+tree_roots <- best_focal[[1]]$structure$`0`
+
+
+merge_both<-function(result1,best_tree_fileID,tree_structure){
+  best_tree<-as.numeric(gsub(".json","",best_tree_fileID))
+  best_focal<-result1[["trees"]][best_tree+1]
+  tree_focal_statB<-as.data.frame(sapply(best_focal,function(x)x["populations"])) ##[3]
+  qq<-tree_focal_statB[,grep("cellular_prevalence",colnames(tree_focal_statB))] %>%
+    rownames_to_column("sample") %>%
+    pivot_longer(-sample,
+                 names_to = "population",
+                 values_to = "cellular_prevalence") %>%
+    mutate(population = str_remove(str_remove(population, ".*populations[.]"), "[.]cellular_prevalence")) %>%
+    mutate(is_root = ifelse(population %in% tree_roots, TRUE, FALSE)) %>%
+    group_by(sample) %>%
+    mutate(purity = sum(cellular_prevalence[is_root]),
+           CCF = cellular_prevalence / purity)
+
+  return(qq)
+
+}
+
+both_samples<-merge_both(result1,best_tree_fileID,tree_structure)
+
+
+write_tsv(both_samples, CCF_table)
+
+
+ssm = function(stat_best_tree, ssm_pre,ssm_to_trees,tree_structure, maf_list){
+
+  out_res_ssm<-NULL
+  for ( i in 1:length(stat_best_tree$mut_assignments)){
+
+    focal<-(stat_best_tree$mut_assignments)[i]
+
+    focal_ssms<-data.frame(sapply(focal, function(x) x[1]))
+
+    colnames(focal_ssms)<-sub("^[^.]*.", "", colnames(focal_ssms))
+    focal_ssms$phyloWGS_population<-i
+    ssm_assign<-merge(ssm_pre,focal_ssms, by.x = "id", by.y = "ssms")[,c("id", "gene","phyloWGS_population")]
+    ssm_assign_spi<-separate(ssm_assign, col = gene, into = c("Chromosome","Start_Position"), sep = "_", convert = FALSE) %>%
+      mutate(Start_Position = as.numeric(Start_Position))
+    if(chr_prefixed) {
+      ssm_assign_spi$Chromosome = str_c("chr", as.character(ssm_assign_spi$Chromosome))
+    }
+
+    out_res_ssm<-rbind(ssm_assign_spi,out_res_ssm)
+
+  } ## i loop
+
+  ssm_assign_with_maf <- lapply(maf_list, function(x){
+    maf <- read_tsv(x,
+                    col_types = cols(Chromosome = col_character())) %>%
+      # PhyloWGS changes the start position of deletions. This makes the maf start position match that in the PhyloWGS SSM table.
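+      # For example (hypothetical coordinates): a deletion at MAF Start_Position 1000
+      # is stored at position 999 in the PhyloWGS SSM table, so the MAF position is
+      # shifted down by 1 for the join and restored afterwards.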
+      mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position - 1, Start_Position))
+    maf <- out_res_ssm %>%
+      left_join(maf, by = c("Chromosome", "Start_Position")) %>%
+      # Restore the true MAF start position after the hack above
+      mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position + 1, Start_Position)) %>%
+      select(colnames(maf), everything())
+  })
+  out_res_ssm <- rbindlist(ssm_assign_with_maf) %>%
+    mutate(clonal_status = case_when(
+      phyloWGS_population %in% tree_roots & length(tree_roots) > 1 ~ "polyclonal",
+      phyloWGS_population %in% tree_roots & length(tree_roots) == 1 ~ "clonal",
+      TRUE ~ "subclonal"
+    ))
+
+  return(out_res_ssm)
+
+}
+
+ss<-ssm(rr,ssm_pre,ssm_to_trees,tree_structure, mafs)
+
+write_tsv(ss, ssm_to_trees, na = "")
+
+
+###########################################################
+## load mut file to extract CNV start and end positions
+###########################################################
+
+cnv = function(stat_best_tree, cnv_pre,mutation_file,cnv_to_trees){
+  out_res_cnv <-
+    bind_rows(lapply(1:length(stat_best_tree$mut_assignments), function(x)
+      data.frame(cnvs = stat_best_tree$mut_assignments[[x]]$cnvs) %>% mutate(phyloWGS_population = x)))
+
+  out_res_mut<-NULL
+  for (cn in 1:length(result_mut$cnvs)){
+    focal_mut_cnv<-(result_mut$cnvs)[cn]
+
+    focal_mut<-data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1,]
+    colnames(focal_mut)<-sub("^[^.]*.", "", colnames(focal_mut))
+    focal_mut$cnv_id<-names(focal_mut_cnv)
+    out_res_mut<-bind_rows(focal_mut,out_res_mut)
+  } ## cn loop
+
+  both_cnvs<-merge(out_res_cnv, out_res_mut, by.x = "cnvs",by.y = "cnv_id") %>%
+    select(cnvs, phyloWGS_population, physical_cnvs.chrom,
+           physical_cnvs.start, physical_cnvs.end,
+           physical_cnvs.major_cn, physical_cnvs.minor_cn, physical_cnvs.cell_prev)
+
+  return(both_cnvs)
+}
+
+
+cnv<-cnv(rr, cnv_pre,result_mut,cnv_to_trees)
+write.table(cnv, file =cnv_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE)
+
+##################### plot the results ####################
+###########################################################
+#### Slope chart the best tree, cellular prevalence #######
+
+plot_cp<-function(both_samples,cellular_prevalence_plot){
+pdf(cellular_prevalence_plot, width = 8, height =8 )
+plotA<-ggplot(data = both_samples, aes(x = sample, y = cellular_prevalence, group = population)) +
+  geom_line(aes(color = population), size = 2) +
+  labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+
+  geom_point(aes(color = population), size = 4) +
+  # Labelling as desired
+  xlab("Sample") + ylab("Cellular prevalence")+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+        panel.background = element_blank(), axis.line = element_line(colour = "black"),
+        legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18),
+        plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)))
+print(plotA)
+dev.off()
+}
+
+plot_cp(both_samples,cellular_prevalence_plot)
+
+###########################################################
+#### Slope chart the best tree, CCF #######
+
+plot_ccf<-function(both_samples,CCF_plot){
+pdf(CCF_plot, width = 8, height =8 )
+plotB<-ggplot(data = both_samples[both_samples$population != 0, ], aes(x = sample, y = CCF, group = population)) +
+  geom_line(aes(color = population), size = 2) +
+  labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+
+  geom_point(aes(color = population), size = 4) +
+  # Labelling as desired
+  xlab("Sample") + ylab("CCF")+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+        panel.background = element_blank(), axis.line = element_line(colour = "black"),
+        legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18),
+        plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)))
+print(plotB)
+dev.off()
+}
+
+plot_ccf(both_samples,CCF_plot)
+
+
+#############################################
+##### Slope chart the best tree (VAF) #######
+
+
+plot_vaf<-function(ss,VAF_plot){
+  pdf(VAF_plot, width = 8, height =8 )
+  plotC <- ss %>%
+    select(Hugo_Symbol, Chromosome, Tumor_Sample_Barcode, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>%
+    mutate(VAF = t_alt_count/t_depth,
+           populations = as.factor(populations)) %>%
+    filter(!is.na(Tumor_Sample_Barcode)) %>%
+    ggplot(aes(x = Tumor_Sample_Barcode,
+               y = VAF,
+               group = interaction(populations, Start_Position),
+               color = populations)) +
+    geom_line(aes(color = populations), size=0.2, alpha=0.4)+
+    labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+
+    xlab("Sample") + ylab("VAF") +
+    guides(colour = guide_legend(override.aes = list(alpha = 1)))+
+    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+          panel.background = element_blank(), axis.line = element_line(colour = "black"),
+          legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18),
+          plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)))
+print(plotC)
+dev.off()
+}
+
+
+plot_vaf(ss,VAF_plot)
+
+#############################################################
+##### Slope chart the best tree (VAF, coding regions: nonsense, missense and splice sites) #######
+
+drivers <- read_tsv(driver_genes, col_names = "gene") %>% pull(gene)
+
+plot_vaf_coding<-function(ss,VAF_coding_plot){
+
+  coding <- ss %>%
+    select(Hugo_Symbol, HGVSp_Short, Chromosome, Tumor_Sample_Barcode, Variant_Classification, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>%
+    mutate(VAF = t_alt_count/t_depth,
+           populations = as.factor(populations),
+           Tumor_Sample_Barcode = factor(Tumor_Sample_Barcode, levels = sample_order)) %>%
+    filter(!is.na(Tumor_Sample_Barcode),
+           !Variant_Classification %in% c("Silent", "RNA", "IGR", "Intron", "5'Flank", "3'Flank", "5'UTR")) %>%
+    mutate(label = ifelse(!is.na(HGVSp_Short), str_c(Hugo_Symbol, "_", HGVSp_Short), str_c(Hugo_Symbol, "_", Variant_Classification)))
+pdf(VAF_coding_plot, width = 8, height =8 )
+plotD<-coding %>%
+  ggplot(aes(x = Tumor_Sample_Barcode,
+             y = VAF,
+             group = interaction(populations, Start_Position),
+             color = populations)) +
+  geom_line(aes(color = populations), size=0.5, alpha=0.4)+
+  geom_text_repel(
+    data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[1], Hugo_Symbol %in% drivers),
+    aes(label = label,
+        x = Tumor_Sample_Barcode,
+        y = VAF),
+    nudge_x = -0.2,
+    size = 4
+  ) +
+  geom_text_repel(
+    data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[length(sample_order)], Hugo_Symbol %in% drivers),
+    aes(label = label,
+        x = Tumor_Sample_Barcode,
+        y = VAF),
+    nudge_x = 0.2,
+    size = 4
+  ) +
+  labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+
+  xlab("Sample") + ylab("VAF") +
+  guides(colour = guide_legend(override.aes = list(alpha = 1)))+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+        panel.background = element_blank(), axis.line = element_line(colour = "black"),
+        legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18),
+        plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)))
+
+print(plotD)
+dev.off()
+}
+
+
+plot_vaf_coding(ss,VAF_coding_plot)
+
+#############################################
+##### Draw the best tree #######
+#############################################
+
+
+tree_structure_long <- tree_structure %>%
+  pivot_longer(everything(),
+               names_to = "parent",
+               values_to = "node") %>%
+  mutate(parent = str_remove_all(parent, ".*[.]")) %>%
+  distinct()
+
+
+positions_x <- function(parents){
+  x <- 1:length(unique(parents))
+  names(x) <- unique(parents)
+  col_vals <- unname(x[parents])
+  return(col_vals)
+}
+
+tree_structure_long$x <- positions_x(tree_structure_long$parent)
+
+positions_y <- function(tree_df){
+  y = c("0" = 0.5)
+  for(parent in unique(tree_df$parent)){
+    # parent = "1"
+    child_index = 1
+    num_children <- nrow(tree_df[tree_df$parent == parent,])
+    if(num_children == 1){
+      child <- tree_df[tree_df$parent == parent,]$node
+      child_y <- unname(y[parent])
+      names(child_y) <- child
+      y = c(y, child_y)
+
+    } else {
+      children <- tree_df[tree_df$parent == parent,]$node
+      y_max <- unname(y[parent]) + (0.25 / child_index)
+      y_min <- unname(y[parent]) - (0.25 / child_index)
+      y_range <- seq(y_min, y_max, length.out = length(children))
+      names(y_range) <- children
+      y = c(y, y_range)
+    }
+    child_index = child_index + 1
+  }
+  return(y)
+}
+
+tree_structure_long$y <- unname(positions_y(tree_structure_long)[as.character(tree_structure_long$node)])
+
+tree_structure_long <- add_row(tree_structure_long, parent = "0", node = 0, x = 0, y = 0.5)
+
+get_ssms <- function(tree_df, best_focal, best_tree_fileID){
+  data <- best_focal[[str_remove_all(best_tree_fileID, "[.].*")]]$populations
+  ssm_vec <- c()
+  for(node in tree_df$node){
+    # node = "1"
+    num_ssms <- data[[as.character(node)]]$num_ssms
+    names(num_ssms) <- as.character(node)
+    ssm_vec <- c(ssm_vec, num_ssms)
+  }
+  return(ssm_vec)
+}
+
+tree_structure_long$num_ssms <- get_ssms(tree_structure_long, best_focal, best_tree_fileID)[as.character(tree_structure_long$node)]
+
+tree_structure_long <- tree_structure_long %>%
+  mutate(parent = as.numeric(parent)) %>%
+  left_join(select(tree_structure_long, node, xstart = x, ystart = y),
+            by = c("parent" = "node"))
+
+
+ggplot(tree_structure_long,
+       aes(x = x,
+           y = y,
+           label = node)) +
+  geom_segment(inherit.aes = FALSE,
+               aes(x = xstart,
+                   xend = x,
+                   y = ystart,
+                   yend = y)) +
+  geom_point(aes(size = num_ssms),
+             fill = "white",
+             colour = "black",
+             pch = 21) +
+  geom_text() +
+  scale_size(range = c(5,20)) +
+  ylim(0,1) +
+  theme_void() +
+  ggtitle(samplename) +
+  theme(legend.position = "none")
+
+ggsave(tree_plot, height = 6, width = 6)
+
+############
+##### END ##
+############
+
+
+
+
diff --git a/modules/phylowgs/1.0/src/process_phyloWGS_outputs_updated.R b/modules/phylowgs/1.0/src/process_phyloWGS_outputs_updated.R
deleted file mode 100644
index 27a79dcc..00000000
---
a/modules/phylowgs/1.0/src/process_phyloWGS_outputs_updated.R +++ /dev/null @@ -1,417 +0,0 @@ - - -#' -#' processing phyloWGS outputs pipeline that takes output json files and preprocessing output files -#' SAMPLE_ID.mutass.zip file must be unzipped before runing the script - -#E example: how to run -#mkdir -p output -#Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unziped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out - -################################################## -# load required libraries -################################################## - -library("optparse") -library("rjson") -library("plyr") -require("dplyr") -require("ggplot2") -library("tidyr") - - -################################################## -# Command line options -################################################## - -option_list = list( - make_option(c("-n", "--samplename"), type="character", default=NULL, help="Samplename of the sample to run", metavar="character"), - make_option(c("-j", "--json_summ"), type="character", default=NULL, help="SAMPLE_ID.summ.json file generated by phyloWGS", metavar="character"), - make_option(c("-t", "--trees_out"), type="character", default=NULL, help="Directory containing unzipped XX.mutass.zip trees", metavar="character"), - make_option(c("-s", "--ssm"), type="character", default=NULL, help="Preprocessing ssm_data.txt output file", metavar="character"), - make_option(c("-c", "--copynumber"), type="character", default=NULL, help="Preprocessing cnv_data.txt output file", metavar="character"), - make_option(c("-a", "--tumourA_maf"), type="character", default=NULL, help="Agument maf file of tumour A", metavar="character"), - make_option(c("-b", "--tumourB_maf"), type="character", default=NULL, help="Agument maf file of tumour B", metavar="character"), - make_option(c("-m", "--json_muts"), type="character", default=NULL, help="SAMPLE_ID.muts.json file", metavar="character"), - make_option(c("-o", "--output"), type="character", default=NULL, help="Output directory", metavar="character") -) - -opt_parser = OptionParser(option_list=option_list) -opt = parse_args(opt_parser) - -samplename = opt$samplename -json_file = opt$json_summ -trees_out= opt$trees_out ### directory where unziped SAMPLE_ID.mutass.zip trees are -ssm_file = opt$ssm -cnv_file = opt$copynumber -mafA = opt$tumourA_maf -mafB = opt$tumourB_maf -mut_file = opt$json_muts -output_dir = opt$output - - - -.checkfile = function(infile) { - - if (!file.exists(infile)) { - - stop(paste("File", infile, "does not exist", sep="")) - - } - -} - - -.checkfile(json_file) -.checkfile(ssm_file) -.checkfile(cnv_file) -.checkfile(mafA) -.checkfile(mafB) -.checkfile(mut_file) - -################################################## -# Process input files -################################################### -# Parse the input file and obtain the required data for this run -result1 <- fromJSON(file = json_file) -result_mut<-fromJSON(file = mut_file) -ssm_pre<-read.table(file = ssm_file, header = TRUE) -cnv_pre<-read.delim(file = cnv_file, header = TRUE)[,c("cnv","a","d")] -maf_TA<-read.delim(file = mafA, header = TRUE) -maf_TB<-read.delim(file = mafB, header = TRUE) - - -################################################## -# define output files -################################################## -out_json_to_Rtable= file.path(output_dir, 
paste("out_res_",samplename,"_json_converted_toR.table", sep = "")) -ssm_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_ssms_to_best_tree_maf_format.table", sep = "")) -cnv_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_cnvs_to_best_tree_maf_format.table", sep = "")) -cellular_prevalence_plot= file.path(output_dir, paste("cellular_prevalence_",samplename,".pdf", sep = "")) -CCF_plot= file.path(output_dir, paste("cancer_cell_fraction_",samplename,".pdf", sep = "")) -VAF_plot= file.path(output_dir, paste("vaf_",samplename,".pdf", sep = "")) -VAF_coding_plot= file.path(output_dir, paste("vaf_ccoding",samplename,".pdf", sep = "")) - - -################################################### -# open summ.json file and convert it into humam readable format -################################################### - -#this function opens SAMPLE_ID_summ.jason and converts it into R table -open_tree = function(json_summ_file,out_json_to_Rtable){ - - out_res<-NULL - for (j in 1:length(json_summ_file[["trees"]])){ - - tree_focal<-json_summ_file[["trees"]][j] - tree_focal_statA<-as.data.frame(t(unlist(sapply(tree_focal,function(x)x[c("clustering_index","branching_index","llh","linearity_index")])))) - colnames(tree_focal_statA)<-c("clustering_index","branching_index","llh","linearity_index") - tree_focal_statA$tree_id<-j-1 - rownames(tree_focal_statA)<-NULL - - - tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,-c(3,6,9,12,15,18,21,24,27,30)] - #tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))] - colnames(tree_focal_statB)<-sub("^[^.]*.", "", colnames(tree_focal_statB)) - stat_both<-cbind(tree_focal_statA,tree_focal_statB) - out_res<-rbind.fill(stat_both,out_res) - out_res_ordered<-out_res[order(out_res$tree_id),] - } # for j loop - -density<-json_summ_file["tree_densities"] -density_unlist<-data.frame("density"=unlist(density)) -row.names(density_unlist)<-sub("^[^.]*.", "", row.names(density_unlist)) - -density_unlist$tree_id<-row.names(density_unlist) -row.names(density_unlist)<-NULL - -final_table=merge(out_res_ordered,density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to all tress table -write.table(final_table, file =out_json_to_Rtable ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) -return(final_table) -} - - -result_tree<-open_tree(result1,out_json_to_Rtable) - - -################################################### -# extrcats the best tree -################################################### -#the best tree is the tree with the highest density - -best_tree_id = function(R_table, density) { - best=R_table[which.max(R_table$density),] - best_tree_focal_name<-best$tree_id - best_tree_id<-paste(best$tree_id,"json",sep = ".") - return(best_tree_id) - return(best_tree_focal_name) -} -best_tree_fileID<-best_tree_id(result_tree, density) - - -####################################################################### -# extract the stats (SNvs and CNVs assigned to each population) from the best tree -####################################################################### - -open_best_tree = function(trees_out,best_tree_id){ - rr <- fromJSON(file = paste(trees_out,best_tree_id, sep = "/")) - return(rr) -} -rr= open_best_tree(trees_out,best_tree_fileID) - - -####################################################################### -# annotate point mutations and CNVs in the best tree 
-####################################################################### -best_focal<-result1[["trees"]][as.numeric(gsub(".json","",best_tree_fileID))+1] -tree_structure<-as.data.frame(sapply(best_focal,function(x)x["structure"])) ##[6] - - -merge_both<-function(result1,best_tree_fileID,tree_structure){ -best_tree<-as.numeric(gsub(".json","",best_tree_fileID)) -best_focal<-result1[["trees"]][best_tree+1] -tree_focal_statB<-as.data.frame(sapply(best_focal,function(x)x["populations"])) ##[3] -qq<-tree_focal_statB[,grep("cellular_prevalence",colnames(tree_focal_statB))] - -sample1<-data.frame(t(qq[1,])) -sample1$sample_id<-rep("TumourA") -sample1$population<-row.names(sample1) -row.names(sample1)<-NULL -colnames(sample1)<-c("cellular_prevalence","sample_id","population") -sample1$population<-sub(".*?\\.(.*?\\..*?)\\..*", "\\1", sample1$population) -sample1<-sample1[!(sample1$population=="populations.0"),] - - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==1){ - sample1$CCF<-(sample1$cellular_prevalence/(max(sample1$cellular_prevalence))) - } - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==2){ - sample1$CCF<-(sample1$cellular_prevalence/((max(sample1$cellular_prevalence))+(sort(sample1$cellular_prevalence)[length(sample1$cellular_prevalence)-1]))) - } - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==3){ - sample1$CCF<-(sample1$cellular_prevalence/((max(sample1$cellular_prevalence))+(sort(sample1$cellular_prevalence)[length(sample1$cellular_prevalence)-1])+(sort(sample1$cellular_prevalence) [length(sample1$cellular_prevalence)-2]))) - } - - -sample2<-data.frame(t(qq[2,])) -sample2$sample_id<-rep("TumourB") -sample2$population<-row.names(sample2) -row.names(sample2)<-NULL -colnames(sample2)<-c("cellular_prevalence","sample_id","population") -sample2$population<-sub(".*?\\.(.*?\\..*?)\\..*", "\\1", sample2$population) -sample2<-sample2[!(sample2$population=="populations.0"),] - - - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==1){ - sample2$CCF<-(sample2$cellular_prevalence/(max(sample2$cellular_prevalence))) - } - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==2){ - sample2$CCF<-(sample2$cellular_prevalence/((max(sample2$cellular_prevalence))+(sort(sample2$cellular_prevalence)[length(sample2$cellular_prevalence)-1]))) - } - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==3){ - sample2$CCF<-(sample2$cellular_prevalence/((max(sample2$cellular_prevalence))+(sort(sample2$cellular_prevalence)[length(sample2$cellular_prevalence)-1])+(sort(sample2$cellular_prevalence) [length(sample2$cellular_prevalence)-2]))) - } - - -both_samples<-rbind(sample1,sample2) -both_samples$best_tree_ID<-rep(best_tree) -both_samples<-both_samples[!(both_samples$population=="populations.0"),] - -} - -both_samples<-merge_both(result1,best_tree_fileID,tree_structure) - - - -ssm = function(stat_best_tree, ssm_pre,ssm_to_trees,tree_structure){ - - out_res_ssm<-NULL - for ( i in 1:length(stat_best_tree$mut_assignments)){ - focal<-(stat_best_tree$mut_assignments)[i] - - focal_ssms<-data.frame(sapply(focal, function(x) x[1])) - - colnames(focal_ssms)<-sub("^[^.]*.", "", colnames(focal_ssms)) - focal_ssms$populations_ssms<-paste("population",i, sep = "_") - ssm_assign<-merge(ssm_pre,focal_ssms, by.x = "id", by.y = "ssms")[,c("id", "gene","populations_ssms")] - ssm_assign_spi<-separate(ssm_assign, col = gene, into = 
c("Chromosome","Start_Position"), sep = "_") - ssm_assign_with_maf<- merge(ssm_assign_spi, maf_TA, by.x=c("Chromosome","Start_Position"), by.y=c("Chromosome","Start_Position")) - out_res_ssm<-rbind(ssm_assign_with_maf,out_res_ssm) - - } ## i loop - - - #return(out_res_ssm) - - out_res_clonality<-NULL - for (pp in 1:nrow(out_res_ssm)){ - focal_structure<-data.frame("yy"=unique(tree_structure[,grep("structure.0",names(tree_structure))])) - focal_row<-out_res_ssm[pp,] - if (gsub("population_","",focal_row$populations)%in%(focal_structure$yy)){ - focal_row$clonal_status<-"clonal" - - } else { - focal_row$clonal_status<-"subclonal" - } - - out_res_clonality<-rbind(out_res_clonality,focal_row) - - } ## pp loop - write.table(out_res_clonality, file =ssm_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) - - return(out_res_clonality) -} - -ss<-ssm(rr,ssm_pre,ssm_to_trees,tree_structure) - - -########################################################### -## load mut file to extrcat CNVs start and end positions -########################################################### - -cnv = function(stat_best_tree, cnv_pre,mutation_file,cnv_to_trees){ - out_res_cnv<-NULL - for ( ii in 1:length(stat_best_tree$mut_assignments)){ ## ii loop assigns populations to CNVs - focal_cnvs_l<-(stat_best_tree$mut_assignments)[ii] - - focal_cnvs<-data.frame(sapply(focal_cnvs_l, function(x) x[2])) - colnames(focal_cnvs)<-sub("^[^.]*.", "", colnames(focal_cnvs)) - focal_cnvs$populations_cnvs<-paste("population",ii, sep = "_") - out_res_cnv<-rbind.fill(focal_cnvs,out_res_cnv) - - } ## ii loop - - #return(out_res_cnv) - out_res_mut<-NULL - for (cn in 1:length(result_mut$cnvs)){ - focal_mut_cnv<-(result_mut$cnvs)[cn] - - focal_mut<-data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1,] - colnames(focal_mut)<-sub("^[^.]*.", "", colnames(focal_mut)) - focal_mut$cnv_id<-names(focal_mut_cnv) - out_res_mut<-rbind.fill(focal_mut,out_res_mut) - } ## cn loop - - both_cnvs<-merge(out_res_cnv, out_res_mut, by.x = "cnvs",by.y = "cnv_id") - write.table(both_cnvs, file =cnv_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) - -} - - -cnv<-cnv(rr, cnv_pre,result_mut,cnv_to_trees) - - -##################### plot the results #################### -########################################################### -#### Slope chart the best tree, cellular prevalence ####### - -plot_cp<-function(both_samples,cellular_prevalence_plot){ -pdf(cellular_prevalence_plot, width = 8, height =8 ) -plotA<-ggplot(data = both_samples, aes(x = sample_id, y = cellular_prevalence, group = population)) + - geom_line(aes(color = population), size = 2) + - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - geom_point(aes(color = population), size = 4) + - # Labelling as desired - xlab("Sample") + ylab("Cellular prevalence")+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotA) -dev.off() -} - -plot_cp(both_samples,cellular_prevalence_plot) - -########################################################### -#### Slope chart the best tree, CCF ####### - -plot_cp<-function(both_samples,CCF_plot){ -pdf(CCF_plot, width = 8, height =8 ) 
-plotB<-ggplot(data = both_samples, aes(x = sample_id, y = CCF, group = population)) + - geom_line(aes(color = population), size = 2) + - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - geom_point(aes(color = population), size = 4) + - # Labelling as desired - xlab("Sample") + ylab("CCF")+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotB) -dev.off() -} - -plot_cp(both_samples,CCF_plot) - - -############################################# -##### Slope chart the best tree (VAF) ####### - -maf_TA$VAF<-(maf_TA$t_alt_count/maf_TA$t_depth) -maf_TA$tumour_type<-rep("TumourA") -TA = merge(ss[,c("Chromosome","Start_Position","id", "populations_ssms")], maf_TA, by.x = c("Chromosome","Start_Position"),by.y=c("Chromosome","Start_Position")) - - -maf_TB$VAF<-(maf_TB$t_alt_count/maf_TB$t_depth) -maf_TB$tumour_type<-rep("TumourB") -TB = merge(ss[,c("Chromosome","Start_Position","id", "populations_ssms")], maf_TB, by.x = c("Chromosome","Start_Position"),by.y=c("Chromosome","Start_Position")) - -both_Ts<-rbind(TA,TB) -both_Ts$populations_ssms<-gsub("_","",both_Ts$populations_ssms) -colnames(both_Ts)[4]<-"populations" - - -plot_vaf<-function(both_Ts,VAF_plot){ -pdf(VAF_plot, width = 8, height =8 ) -plotC<-ggplot (data =both_Ts , aes(x = tumour_type, y = VAF, group = interaction(populations, Start_Position) ,color = populations)) + - geom_line(aes(color = populations), size=0.2, alpha=0.4)+ - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - xlab("Sample") + ylab("VAF") + - guides(colour = guide_legend(override.aes = list(alpha = 3)))+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotC) -dev.off() -} - -plot_vaf(both_Ts,VAF_plot) - -############################################################# -##### Slope chart the best tree (VAF, coding regions, nonsense, missense and splicing sites) ####### - -both_Ts_coding<-both_Ts[(both_Ts$Variant_Classification == "Missense_Mutation"| both_Ts$Variant_Classification =="Nonsense_Mutation"|both_Ts$Variant_Classification =="Splice_Site"),] - -plot_vaf_coding<-function(vaf_coding,VAF_coding_plot){ - -pdf(VAF_coding_plot, width = 8, height =8 ) -plotD<-ggplot (data =both_Ts_coding , aes(x = tumour_type, y = VAF, group = interaction(populations, Start_Position) ,color = populations)) + - geom_line(aes(color = populations), size=0.7, alpha=0.8)+ - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - geom_text(data = both_Ts_coding %>% filter(tumour_type == "TumourB"), - aes(label = Hugo_Symbol) , - hjust = -0.3, - size = 2) + - xlab("Sample") + ylab("VAF") + - guides(colour = guide_legend(override.aes = list(alpha = 3)))+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), 
axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) - -print(plotD) -dev.off() -} - - -plot_vaf_coding(both_Ts_coding,VAF_coding_plot) - - -############ -##### END ## -############ - - - - From c5d47948ce3f086c8246e9ba26bb8f3ccb0f0416 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 10:31:05 -0700 Subject: [PATCH 03/14] Remove unneeded fill_battenberg files --- .../phylowgs/1.0/envs/fill_battenberg.yaml | 1 - .../1.0/etc/chromArmFiles/chromArm.grch37.tsv | 49 --- .../1.0/etc/chromArmFiles/chromArm.grch38.tsv | 49 --- .../1.0/etc/chromArmFiles/chromArm.hg19.tsv | 49 --- .../1.0/etc/chromArmFiles/chromArm.hg38.tsv | 49 --- .../1.0/etc/chromArmFiles/chromArm.hs37d5.tsv | 1 - modules/phylowgs/1.0/src/fill_battenberg.py | 379 ------------------ 7 files changed, 577 deletions(-) delete mode 120000 modules/phylowgs/1.0/envs/fill_battenberg.yaml delete mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv delete mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv delete mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv delete mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv delete mode 120000 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv delete mode 100644 modules/phylowgs/1.0/src/fill_battenberg.py diff --git a/modules/phylowgs/1.0/envs/fill_battenberg.yaml b/modules/phylowgs/1.0/envs/fill_battenberg.yaml deleted file mode 120000 index e667a8b1..00000000 --- a/modules/phylowgs/1.0/envs/fill_battenberg.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../envs/phylowgs/fill_battenberg.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv deleted file mode 100644 index 91da51a7..00000000 --- a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv +++ /dev/null @@ -1,49 +0,0 @@ -chromosome start end arm -1 10000 121500000 p -1 142600000 249250621 q -2 10000 90500000 p -2 96800000 243199373 q -3 10000 87900000 p -3 98300000 198022430 q -4 10000 48200000 p -4 52700000 191154276 q -5 10000 46100000 p -5 50700000 180915260 q -6 10000 58700000 p -6 63300000 171115067 q -7 10000 58000000 p -7 61700000 159138663 q -8 10000 43100000 p -8 48100000 146364022 q -9 10000 47300000 p -9 65900000 141213431 q -10 10000 38000000 p -10 42300000 135534747 q -11 10000 51600000 p -11 55700000 135006516 q -12 10000 33300000 p -12 38200000 133851895 q -13 10000 16000000 p -13 19500000 115169878 q -14 10000 14000000 p -14 19100000 107349540 q -15 10000 14000000 p -15 20700000 102531392 q -16 10000 34600000 p -16 47000000 90354753 q -17 10000 22200000 p -17 25800000 81195210 q -18 10000 15400000 p -18 19000000 78077248 q -19 10000 20000000 p -19 32400000 59128983 q -20 10000 25600000 p -20 29400000 63025520 q -21 10000 10000000 p -21 14300000 48129895 q -22 10000 11900000 p -22 17900000 51304566 q -X 10000 58100000 p -X 63000000 155270560 q -Y 10000 11600000 p -Y 13400000 28800000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv deleted file mode 100644 index 58b866e2..00000000 --- a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv +++ /dev/null @@ -1,49 +0,0 @@ -chromosome start end 
arm -1 10000 121700000 p -1 143200000 248956422 q -2 0 91800000 p -2 96000000 242193529 q -3 0 87800000 p -3 98600000 198295559 q -4 0 48200000 p -4 51800000 190214555 q -5 0 46100000 p -5 51400000 181538259 q -6 0 58500000 p -6 62600000 170805979 q -7 0 58100000 p -7 62100000 159345973 q -8 0 43200000 p -8 47200000 145138636 q -9 0 42200000 p -9 61500000 138394717 q -10 10000 38000000 p -10 41600000 133797422 q -11 10000 51000000 p -11 55800000 135086622 q -12 10000 33200000 p -12 37800000 133275309 q -13 10000 16000000 p -13 18900000 114364328 q -14 10000 16000000 p -14 18200000 107043718 q -15 10000 16000000 p -15 20500000 101991189 q -16 0 35300000 p -16 47000000 90338345 q -17 0 22700000 p -17 27400000 83257441 q -18 0 15400000 p -18 21500000 80373285 q -19 0 19900000 p -19 31900000 58617616 q -20 0 25700000 p -20 30400000 64444167 q -21 0 10500000 p -21 13000000 46709983 q -22 10000 14000000 p -22 17400000 50818468 q -X 0 58100000 p -X 63800000 156040895 q -Y 0 10300000 p -Y 10600000 26600000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv deleted file mode 100644 index a3c8be28..00000000 --- a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv +++ /dev/null @@ -1,49 +0,0 @@ -chromosome start end arm -chr1 10000 121500000 p -chr1 142600000 249250621 q -chr2 10000 90500000 p -chr2 96800000 243199373 q -chr3 10000 87900000 p -chr3 98300000 198022430 q -chr4 10000 48200000 p -chr4 52700000 191154276 q -chr5 10000 46100000 p -chr5 50700000 180915260 q -chr6 10000 58700000 p -chr6 63300000 171115067 q -chr7 10000 58000000 p -chr7 61700000 159138663 q -chr8 10000 43100000 p -chr8 48100000 146364022 q -chr9 10000 47300000 p -chr9 65900000 141213431 q -chr10 10000 38000000 p -chr10 42300000 135534747 q -chr11 10000 51600000 p -chr11 55700000 135006516 q -chr12 10000 33300000 p -chr12 38200000 133851895 q -chr13 10000 16000000 p -chr13 19500000 115169878 q -chr14 10000 14000000 p -chr14 19100000 107349540 q -chr15 10000 14000000 p -chr15 20700000 102531392 q -chr16 10000 34600000 p -chr16 47000000 90354753 q -chr17 10000 22200000 p -chr17 25800000 81195210 q -chr18 10000 15400000 p -chr18 19000000 78077248 q -chr19 10000 20000000 p -chr19 32400000 59128983 q -chr20 10000 25600000 p -chr20 29400000 63025520 q -chr21 10000 10000000 p -chr21 14300000 48129895 q -chr22 10000 11900000 p -chr22 17900000 51304566 q -chrX 10000 58100000 p -chrX 63000000 155270560 q -chrY 10000 11600000 p -chrY 13400000 28800000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv deleted file mode 100644 index 4b5d7b6a..00000000 --- a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv +++ /dev/null @@ -1,49 +0,0 @@ -chromosome start end arm -chr1 10000 121700000 p -chr1 143200000 248956422 q -chr2 0 91800000 p -chr2 96000000 242193529 q -chr3 0 87800000 p -chr3 98600000 198295559 q -chr4 0 48200000 p -chr4 51800000 190214555 q -chr5 0 46100000 p -chr5 51400000 181538259 q -chr6 0 58500000 p -chr6 62600000 170805979 q -chr7 0 58100000 p -chr7 62100000 159345973 q -chr8 0 43200000 p -chr8 47200000 145138636 q -chr9 0 42200000 p -chr9 61500000 138394717 q -chr10 10000 38000000 p -chr10 41600000 133797422 q -chr11 10000 51000000 p -chr11 55800000 135086622 q -chr12 10000 33200000 p -chr12 37800000 133275309 q -chr13 10000 16000000 p -chr13 18900000 114364328 q -chr14 10000 16000000 p -chr14 18200000 107043718 q -chr15 10000 16000000 p -chr15 20500000 
101991189 q -chr16 0 35300000 p -chr16 47000000 90338345 q -chr17 0 22700000 p -chr17 27400000 83257441 q -chr18 0 15400000 p -chr18 21500000 80373285 q -chr19 0 19900000 p -chr19 31900000 58617616 q -chr20 0 25700000 p -chr20 30400000 64444167 q -chr21 0 10500000 p -chr21 13000000 46709983 q -chr22 10000 14000000 p -chr22 17400000 50818468 q -chrX 0 58100000 p -chrX 63800000 156040895 q -chrY 0 10300000 p -chrY 10600000 26600000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv deleted file mode 120000 index c8477855..00000000 --- a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv +++ /dev/null @@ -1 +0,0 @@ -chromArm.grch37.tsv \ No newline at end of file diff --git a/modules/phylowgs/1.0/src/fill_battenberg.py b/modules/phylowgs/1.0/src/fill_battenberg.py deleted file mode 100644 index ec6d3cd5..00000000 --- a/modules/phylowgs/1.0/src/fill_battenberg.py +++ /dev/null @@ -1,379 +0,0 @@ -#!/usr/bin/python3 - -""" -This script will fill empty segments in Battenberg subclones.txt files. It is adapted from Kostiantyn Dreval's fill_segments.py script. -It requires seg file and chromosome arms file as mandatory inputs. The path to output file also must be specified. - -Example: - -python3 fill_battenberg.py --input .subclones.txt --chromArm .tsv --output .subclones.txt -""" - -# import required modules -import pandas as pd -import argparse - - -def main(): - # initiate the parser and handle arguments from command line - args = parse_args() - input_file = args.input - output_file = args.output - chrom_file = args.chromArm - - # determine the format of input file - input_format = input_file[-3:] - - # check arguments given in command line - # check_arguments(args, input_format) - - # create a dictionary containing coordinates of chromosome arms - arm_chrom = load_chrom_arm(chrom_file) - # get the order of chromosomes - chrom_order = list(arm_chrom.keys()) + ["buffer"] - - # initialize empty variable for the new segments - columns_new = [] - columns_edges = [] - - # initialize list to store all segments, since it is faster than concatenating pd df with large number of segments - seg_filled = [] - - # assign values to be used to fill normal CN segments - empty_baf = float(0.5) - empty_pval = int(1) - empty_logr = int(0) - empty_ntot = float(2.0) - empty_nMaj1_A = int(1) - empty_nMin1_A = int(1) - empty_frac1_A = int(1) - empty_nMaj2_A = int(1) - empty_nMin2_A = int(1) - empty_frac2_A = int(1) - - columns_fill = [empty_baf, empty_pval, empty_logr, empty_ntot, empty_nMaj1_A, empty_nMin1_A, empty_frac1_A, empty_nMaj2_A, empty_nMin2_A, empty_frac2_A] - - # fill segments - seg = open(input_file, 'r+') - lines=seg.readlines() - - # remove segments with NA values for nMaj1_A or nMin1_B - print("Removing segments with NA values for nMaj1_A or nMin1_A...") - to_remove = [i for i, line in enumerate(lines) if line[7]=="NA" or line[8] == "NA"] - if len(to_remove) > 0: - for index in reversed(to_remove): # start at the end to avoid recomputing offsets - del lines[index] - - - # first, get header of the file - header=lines[0].rstrip("\n").rstrip("\r").split("\t") - - print("Filling missing segments and smoothing centromeres...") - # next, go through each segment, skipping the header - for i in range(1,len(lines)-1): - - # read 2 segments at a time to compare coordinates of end of previous sefment, and start of the next segments - columns_first = (lines[i].rstrip("\n").rstrip("\r")).split("\t") - columns_second = 
(lines[i+1].rstrip("\n").rstrip("\r")).split("\t") - - # insert empty segment from the beginning of chromosome of the first segment in file to complete the telomeric region of first chromosome - if i==1: - columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_first[1])-1)] + columns_fill - seg_filled.append(columns_new) - # seg_filled.append(columns_first) I think this is a duplicate of the one below after fixing the possible centromeric end of the segment - # deal with fencepost problem - if (int(columns_first[2]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[2]) < arm_chrom[columns_first[0]]['q']['start']): - columns_first[2] = str(arm_chrom[columns_first[0]]['p']['end']) - seg_filled.append(columns_first) - - if (chrom_order[chrom_order.index(columns_second[0])] == chrom_order[chrom_order.index(columns_first[0])+1]): - missing_arm = chrom_order[chrom_order.index(columns_first[0])] - columns_edges = [columns_first[0], str(arm_chrom[missing_arm]['q']['start']), str(arm_chrom[missing_arm]['q']['end'])] + columns_fill - seg_filled.append(columns_edges) - seg_filled.append(columns_second) - continue - - # scenario 1: segments on the same chromosome - if (columns_first[0]==columns_second[0]): - - # handle very rare overlapping segments (occurs ~ 0.008%) - if (int(columns_first[2]) > int(columns_second[1])): - columns_first[2] = int(columns_second[1])-1 - seg_filled.append(columns_first) - pass - - # for segments in p arm - if (int(columns_second[1]) < arm_chrom[columns_second[0]]['p']['end']): - # create empty segment to fill in - columns_new = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_new) - next_segment = (lines[i+1].rstrip("\n").rstrip("\r")).split("\t") - if (int(columns_second[2]) < arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) < int(next_segment[1])): - seg_filled.append(columns_second) - seg_filled.append(columns_second) - - # deal with centromeres - # I already know that this is same sample, and same chromosome - elif (int(columns_first[1]) < arm_chrom[columns_first[0]]['p']['end'] and int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end']): - - # first lets deal with end of p arm: segment 1 might end before centromere, or within centromere - if int(columns_first[2]) < arm_chrom[columns_first[0]]['p']['end']: - columns_new = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill - seg_filled.append(columns_new) - # if it extends into centromere, cut segment 1 at the end of p arm - else: - columns_first[2] = str(arm_chrom[columns_first[0]]['p']['end']) - seg_filled.append(columns_first) - - # now lets deal with start of q arm: it might start within or after centromere - if int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']: - columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start']) - seg_filled.append(columns_second) - next_segment = (lines[i+1].rstrip("\n").rstrip("\r")).split("\t") - if (int(next_segment[1]) < arm_chrom[next_segment[0]]['q']['start'] and int(next_segment[2]) > arm_chrom[next_segment[0]]['q']['start']): - next_segment[1] = str(arm_chrom[next_segment[0]]['q']['start']) - seg_filled.append(next_segment) - - # possible edge cases around centromere - else: - columns_new = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[2])-1)] + columns_fill - previous_segment = 
(lines[i].rstrip("\n").rstrip("\r")).split("\t") - if (int(previous_segment[2])>arm_chrom[columns_second[0]]['q']['start']): - columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(previous_segment[2]))] + columns_first[3:13] - seg_filled.append(columns_edges) - columns_new = [columns_edges[0], str(int(columns_edges[2])+1), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_new) - seg_filled.append(columns_second) - - # for segments in q arm - elif (int(columns_first[1]) > arm_chrom[columns_second[0]]['q']['start']): - # create empty segment to fill in - columns_new = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_new) - seg_filled.append(columns_second) - - # some segments are completely within centromere. drop them - elif (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']): - if (int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']): - columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), str(int(columns_first[2]))] + columns_first[3:13] - columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_new) - else: - columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[1])-1)] + columns_fill - if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']): - columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start']) - seg_filled.append(columns_second) - seg_filled.append(columns_edges) - if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) < arm_chrom[columns_second[0]]['q']['start']): - pass # this just drops the segment from output if it is within centromere - else: - if (int(columns_second[1])>arm_chrom[columns_second[0]]['q']['start']): - seg_filled.append(columns_second) - elif (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']): - pass # this is handled later - - # did I miss anything? it is possible some edge cases were not considered at time of script development - else: - print(columns_first[0], columns_second[0], columns_first[1], columns_second[1], columns_first[2], columns_second[2]) - raise ValueError ("Other sort of way. This is an edge case that needs debugging!") - - # scenario 2: same sample, but going over to the new chromosome - elif (columns_first[0]!=columns_second[0]): - # very rare cases when whole chromosome is missing, identify them here - if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]): - missing_chrom = chrom_order[chrom_order.index(columns_first[0])+1] - missing_p = [missing_chrom, str(arm_chrom[missing_chrom]['p']['start']), str(arm_chrom[missing_chrom]['p']['end'])] + columns_fill - missing_q = [missing_chrom, str(arm_chrom[missing_chrom]['q']['start']), str(arm_chrom[missing_chrom]['q']['end'])] + columns_fill - seg_filled.append(missing_p) - seg_filled.append(missing_q) - - # first, are there any segments in the p arm? 
that means second segments starts all the way in centromere or q arm - if (int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']): #TRUE - if (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']): #FALSE - previous_segment = (lines[i-1].rstrip("\n").rstrip("\r")).split("\t") - if (chrom_order[chrom_order.index(previous_segment[0])] != chrom_order[chrom_order.index(columns_first[0])]): - columns_edges = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill - seg_filled.append(columns_edges) - # pass - elif (int(columns_first[1]) < arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']): #TRUE - previous_segment = (lines[i-1].rstrip("\n").rstrip("\r")).split("\t") - if (chrom_order[chrom_order.index(previous_segment[0])] != chrom_order[chrom_order.index(columns_first[0])-1]): - columns_edges = [columns_first[0], columns_first[1], str(arm_chrom[columns_first[0]]['p']['end'])] + columns_first[3:13] - columns_first[1] = arm_chrom[columns_first[0]]['q']['start'] - seg_filled.append(columns_edges) - seg_filled.append(columns_first) - # Case when a chromosome has a single event that spans the centromere, split into four parts (p-start to seg-start, seg-start to p-end, q-start to seg-end, seg-end to q-end) - else: - columns_pedge = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), int(columns_first[1])-1] + columns_fill - columns_segp = [columns_first[0], columns_first[1], str(arm_chrom[columns_first[0]]['p']['end'])] + columns_first[3:13] - columns_segq = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), columns_first[2]] + columns_first[3:13] - columns_qedge = [columns_first[0], int(columns_first[2]) + 1, str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill - seg_filled.append(columns_pedge) - seg_filled.append(columns_segp) - seg_filled.append(columns_segq) - seg_filled.append(columns_qedge) - - else: - columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill - seg_filled.append(columns_edges) - if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end']): - if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]): - seg_filled.append(missing_p) - seg_filled.append(missing_q) - if (int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start'] and int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end']): - columns_first[1] = arm_chrom[columns_first[0]]['q']['start'] - seg_filled.append(columns_first) - columns_new = [columns_second[0], str(arm_chrom[columns_second[0]]['p']['start']), str(arm_chrom[columns_second[0]]['p']['end'])] + columns_fill - seg_filled.append(columns_new) - if (int(columns_second[1]) > arm_chrom[columns_second[0]]['q']['start']): - columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_edges) - else: - columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start']) - seg_filled.append(columns_second) - - # are there any segments in the q arm? 
that means first segment ends before start of q arm - elif (int(columns_first[2]) < arm_chrom[columns_first[0]]['q']['start']): - columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill - seg_filled.append(columns_first) - if (int(columns_first[2]) < arm_chrom[columns_first[0]]['p']['end']): - columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill - seg_filled.append(columns_edges) - seg_filled.append(columns_new) - seg_filled.append(columns_second) - - # are there any segments that starts in p arm and span centromere? if so, maintain loh flag and logr, but cut out centromere - elif (int(columns_second[1]) < arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) > arm_chrom[columns_second[0]]['q']['start']): - previous_segment = (lines[i].rstrip("\n").rstrip("\r")).split("\t") - if "X" not in str(columns_second[0]): - next_segment = (lines[i+2].rstrip("\n").rstrip("\r")).split("\t") - columns_new = [columns_second[0], str(int(columns_second[1])+1), str(arm_chrom[columns_second[0]]['p']['end'])] + columns_first[3:13] - columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[2]))] + columns_first[3:13] - if (columns_new[0]!=previous_segment[0]): - columns_new[1]=str(arm_chrom[columns_new[0]]['p']['start']) - if (columns_second[0]==next_segment[0]): - seg_filled.append(columns_new) - seg_filled.append(columns_edges) - - # in other cases, there are segments both in p and q arms - else: - columns_edges = [columns_second[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_second[1])-1)] + columns_fill - if (int(columns_first[1]) > arm_chrom[columns_second[0]]['p']['end']): - if (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']): - columns_first[1] = arm_chrom[columns_first[0]]['q']['start'] - seg_filled.append(columns_first) - columns_new = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill - seg_filled.append(columns_new) - if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]): - seg_filled.append(missing_p) - seg_filled.append(missing_q) - seg_filled.append(columns_edges) - if (int(columns_second[2]) < arm_chrom[columns_second[0]]['p']['end']): - seg_filled.append(columns_second) - - - # scenario 3: new sample, obviously new chromosome - else: - previous_segment = (lines[i].rstrip("\n").rstrip("\r")).split("\t") - columns_edges = [columns_first[0], str(int(previous_segment[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill - columns_new = [columns_second[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_edges) - seg_filled.append(columns_new) - seg_filled.append(columns_second) - - - seg.close() - - # make df from list of lists and convert chromosome coordinates to integers - seg_filled_df = pd.DataFrame(seg_filled, columns = header) - seg_filled_df["startpos"] = seg_filled_df["startpos"].astype(int) - seg_filled_df["endpos"] = seg_filled_df["endpos"].astype(int) - - # remove any inverted segments, if there are - print("Checking and removing inverted segments...") - seg_filled_df = seg_filled_df[(seg_filled_df["endpos"]>seg_filled_df["startpos"])] - - # remove any 
duplicated segments, if there are - print("Checking and removing duplicated segments...") - seg_filled_df = seg_filled_df.drop_duplicates() - # seg_filled_df = seg_filled_df.groupby((seg_filled_df["endpos"] != seg_filled_df["endpos"].shift(-2)).cumsum().values).first() - # seg_filled_df = seg_filled_df.groupby((seg_filled_df["endpos"] != seg_filled_df["endpos"].shift(-1)).cumsum().values).first() - # seg_filled_df = seg_filled_df.groupby((seg_filled_df["startpos"] != seg_filled_df["startpos"].shift(-1)).cumsum().values).first() - - # save to the output file specified by user - print("Saving to file...") - seg_filled_df.to_csv(output_file, header=True, index=False, sep="\t") - print("Done!") - - -# Create nested dictionary to store shromosome arms coordinates. It is adopted from Chris's implementation in other script that summarizes CNVs -def load_chrom_arm(chrom_file): - arm_chrom = {} - required_cols = ["chromosome", "start", "end", "arm"] - header_cols = {} - - i = 0 - with open(chrom_file) as f: - for line in f: - i += 1 - line = line.rstrip("\n").rstrip("\r") # Remove line endings - cols = line.split("\t") - - # Skip empty lines - if not line: - continue - - # If we haven't parsed the header yet, assume this is the first line of the file (aka the header) - if not header_cols: - j = 0 - for col in cols: - if col in required_cols: - header_cols[col] = j - j += 1 - - # Check to make sure all required columns are found - for col in required_cols: - if col not in header_cols: - raise AttributeError("Unable to locate column %s in the chromosome arm positions file \'%s\'" % (col, chrom_file)) - # If we get this far, the header is valid - continue - - if cols[0] not in arm_chrom: - arm_chrom[cols[0]] = {} - if cols[3]: - if cols[3] not in arm_chrom[cols[0]]: - arm_chrom[cols[0]][cols[3]]={} - arm_chrom[cols[0]][cols[3]]['start'] = int(cols[1]) - arm_chrom[cols[0]][cols[3]]['end'] = int(cols[2]) - return arm_chrom - - -# Check that required arguments are provided, and the input is in .seg format -def check_arguments(args, input_format): - if input_format == 'seg' and not all([args.input, args.output, args.chromArm]): - raise ValueError ('Must specify input .seg file, output file, and file listing coordinates of chromosome arms.') - elif input_format != 'seg': - raise ValueError ('Input file must be in .seg format') - else: - pass - - -# Parse arguments from command line -def parse_args(): - parser = argparse.ArgumentParser() - - parser.add_argument("--input", - help="Imput file in .seg format to fill segments", required=True) - parser.add_argument("--output", - help="Resulting file after filling missing segments", required=True) - parser.add_argument("--chromArm", - help="File with coordinates of chromosme arms for a given genome build", required=True) - - # ignore everything else that is not required by this script - args, unknown = parser.parse_known_args() - # return arguments provided by user - return args - - -if __name__ == '__main__': - main() From 8a48e7319be0310e89425be627611891f869fcae Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 17:08:51 -0700 Subject: [PATCH 04/14] Working Py/PhyClone module --- modules/pyclone_vi/1.0/config/default.yaml | 64 +++ modules/pyclone_vi/1.0/envs/gamblr.yaml | 1 + modules/pyclone_vi/1.0/envs/phyclone.yaml | 17 + modules/pyclone_vi/1.0/envs/pyclone-vi.yaml | 5 + modules/pyclone_vi/1.0/envs/pyclone.yaml | 5 + modules/pyclone_vi/1.0/envs/python.yaml | 6 + modules/pyclone_vi/1.0/pyclone_vi.smk | 397 ++++++++++++++++++ 
modules/pyclone_vi/1.0/schemas/base-1.0.yaml | 1 + .../1.0/schemas/time_point-1.0.yaml | 1 + modules/pyclone_vi/1.0/src/build_input.py | 213 ++++++++++ .../1.0/src/build_pyclone_summary_file.py | 33 ++ .../1.0/src/compute_expected_statistics.py | 126 ++++++ modules/pyclone_vi/1.0/src/merge_files.py | 26 ++ .../1.0/src/subset_maf_for_pyclone.R | 56 +++ modules/pyclone_vi/CHANGELOG.md | 16 + 15 files changed, 967 insertions(+) create mode 100644 modules/pyclone_vi/1.0/config/default.yaml create mode 120000 modules/pyclone_vi/1.0/envs/gamblr.yaml create mode 100644 modules/pyclone_vi/1.0/envs/phyclone.yaml create mode 100644 modules/pyclone_vi/1.0/envs/pyclone-vi.yaml create mode 100644 modules/pyclone_vi/1.0/envs/pyclone.yaml create mode 100644 modules/pyclone_vi/1.0/envs/python.yaml create mode 100644 modules/pyclone_vi/1.0/pyclone_vi.smk create mode 120000 modules/pyclone_vi/1.0/schemas/base-1.0.yaml create mode 120000 modules/pyclone_vi/1.0/schemas/time_point-1.0.yaml create mode 100644 modules/pyclone_vi/1.0/src/build_input.py create mode 100644 modules/pyclone_vi/1.0/src/build_pyclone_summary_file.py create mode 100644 modules/pyclone_vi/1.0/src/compute_expected_statistics.py create mode 100644 modules/pyclone_vi/1.0/src/merge_files.py create mode 100644 modules/pyclone_vi/1.0/src/subset_maf_for_pyclone.R create mode 100644 modules/pyclone_vi/CHANGELOG.md diff --git a/modules/pyclone_vi/1.0/config/default.yaml b/modules/pyclone_vi/1.0/config/default.yaml new file mode 100644 index 00000000..2e7ee60b --- /dev/null +++ b/modules/pyclone_vi/1.0/config/default.yaml @@ -0,0 +1,64 @@ +lcr-modules: + + pyclone_vi: + + # TODO: Update the list of available wildcards, if applicable + inputs: + # Available wildcards: {seq_type} {genome_build} {tumour_id} {normal_id} {pair_status} + sample_maf: "__UPDATE__" + sample_subclones: "__UPDATE__" + sample_cellularity: "__UPDATE__" + sample_sex: "__UPDATE__" # Only {normal_id} available + + scratch_subdirectories: [] + + options: + build_input: + subset_maf: True # Whether to use the full maf or subset to only coding and aSHM mutations. 
+ gamblr_branch: "" + gamblr_config_url: "https://raw.githubusercontent.com/morinlab/GAMBLR/master/config.yml" + fit: + num_clusters: 30 + num_restarts: 100 + opts: "--mix-weight-prior 10" + phyclone: + burnin: 100 + num_iters: 350 + density: "binomial" # Options: "binomial" or "beta-binomial" + + conda_envs: + python: "{MODSDIR}/envs/python.yaml" + pyclone-vi: "{MODSDIR}/envs/pyclone-vi.yaml" + pyclone: "{MODSDIR}/envs/pyclone.yaml" + phyclone: "{MODSDIR}/envs/phyclone.yaml" + fill_battenberg: "{MODSDIR}/envs/fill_battenberg.yaml" + gamblr: "{MODSDIR}/envs/gamblr.yaml" + + scripts: + subset_maf: "{MODSDIR}/src/subset_maf_for_pyclone.R" + build_input: "{MODSDIR}/src/build_input.py" + compute_stats: "{MODSDIR}/src/compute_expected_statistics.py" + + threads: + build_input: 1 + fit: 4 + phyclone: 4 + + resources: + build_input: + mem_mb: 2000 + fit: + mem_mb: 50000 + phyclone: + mem_mb: 50000 + + + pairing_config: + genome: + run_paired_tumours: True + run_unpaired_tumours_with: null + run_paired_tumours_as_unpaired: False + capture: + run_paired_tumours: True + run_unpaired_tumours_with: null + run_paired_tumours_as_unpaired: False diff --git a/modules/pyclone_vi/1.0/envs/gamblr.yaml b/modules/pyclone_vi/1.0/envs/gamblr.yaml new file mode 120000 index 00000000..5b0e0256 --- /dev/null +++ b/modules/pyclone_vi/1.0/envs/gamblr.yaml @@ -0,0 +1 @@ +/home/lhilton/repos/lcr-modules-starfish/envs/GAMBLR/gamblr.yaml \ No newline at end of file diff --git a/modules/pyclone_vi/1.0/envs/phyclone.yaml b/modules/pyclone_vi/1.0/envs/phyclone.yaml new file mode 100644 index 00000000..bc365861 --- /dev/null +++ b/modules/pyclone_vi/1.0/envs/phyclone.yaml @@ -0,0 +1,17 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python =3.9 + - biopython + - click + - matplotlib + - networkx + - numba + - numpy + - pandas + - scikit-learn + - scipy + - pip + - pip: + - "git+ssh://git@github.com/aroth85/phyclone.git@7c717f79e62535de952defc15f9907a82deda520" diff --git a/modules/pyclone_vi/1.0/envs/pyclone-vi.yaml b/modules/pyclone_vi/1.0/envs/pyclone-vi.yaml new file mode 100644 index 00000000..69f82281 --- /dev/null +++ b/modules/pyclone_vi/1.0/envs/pyclone-vi.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - pyclone-vi diff --git a/modules/pyclone_vi/1.0/envs/pyclone.yaml b/modules/pyclone_vi/1.0/envs/pyclone.yaml new file mode 100644 index 00000000..339a0081 --- /dev/null +++ b/modules/pyclone_vi/1.0/envs/pyclone.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - pyclone diff --git a/modules/pyclone_vi/1.0/envs/python.yaml b/modules/pyclone_vi/1.0/envs/python.yaml new file mode 100644 index 00000000..d26c3e48 --- /dev/null +++ b/modules/pyclone_vi/1.0/envs/python.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge +dependencies: + - python =3.9 + - networkx + - pandas diff --git a/modules/pyclone_vi/1.0/pyclone_vi.smk b/modules/pyclone_vi/1.0/pyclone_vi.smk new file mode 100644 index 00000000..3a038b61 --- /dev/null +++ b/modules/pyclone_vi/1.0/pyclone_vi.smk @@ -0,0 +1,397 @@ +#!/usr/bin/env snakemake + + +##### ATTRIBUTION ##### + + +# Original Author: Andrew Roth +# Module Author: Laura Hilton +# Contributors: N/A + + +##### SETUP ##### + + +# Import package with useful functions for developing analysis modules +import oncopipe as op +import hashlib +import glob + +# Setup module and store module-specific configuration in `CFG` +# `CFG` is a shortcut to `config["lcr-modules"]["pyclone_vi"]` +CFG = op.setup_module( + name = "pyclone_vi", + version = 
"1.0", + subdirectories = ["inputs", "build_inputs", "fit", "og-pyclone", "phyclone", "outputs"], +) + +# Define rules to be run locally when using a compute cluster +# TODO: Replace with actual rules once you change the rule names +localrules: + _pyclone_vi_write_results, + _pyclone_vi_all + +# Install GAMBLR + +# Obtain the path to the GAMBLR conda environment +md5hash = hashlib.md5() +if workflow.conda_prefix: + conda_prefix = workflow.conda_prefix +else: + conda_prefix = os.path.abspath(".snakemake/conda") + +md5hash.update(conda_prefix.encode()) +f = open("config/envs/GAMBLR.yaml", 'rb') +md5hash.update(f.read()) +f.close() +h = md5hash.hexdigest() +GAMBLR = glob.glob(conda_prefix + "/" + h[:8] + "*")[0] + +rule _pyclone_vi_install_GAMBLR: + params: + branch = ", ref = \"" + CFG["options"]["build_input"]['gamblr_branch'] + "\"" if CFG["options"]["build_input"]['gamblr_branch'] != "" else "", + config_url = CFG["options"]["build_input"]["gamblr_config_url"] + output: + installed = directory(GAMBLR + "/lib/R/library/GAMBLR"), + config = "gamblr.yaml" + conda: + CFG['conda_envs']['gamblr'] + shell: + op.as_one_line(""" + wget -qO {output.config} {params.config_url} && + R -q -e 'options(timeout=9999999); devtools::install_github("morinlab/GAMBLR"{params.branch})' + """) + + +##### RULES ##### + + +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _pyclone_vi_input_maf: + input: + maf = CFG["inputs"]["sample_maf"] + output: + maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf" + group: "input_and_build" + run: + op.relative_symlink(input.maf, output.maf) + +rule _pyclone_vi_input_battenberg: + input: + subclones = CFG["inputs"]["sample_subclones"], + cellularity = CFG["inputs"]["sample_cellularity"], + sex = CFG["inputs"]["sample_sex"] + output: + subclones = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.battenberg.subclones.txt", + cellularity = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.battenberg.cellularity_ploidy.txt", + sex = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.battenberg.inferred_sex.txt" + group: "input_and_build" + wildcard_constraints: + seq_type = "genome" + run: + op.absolute_symlink(input.subclones, output.subclones) + op.absolute_symlink(input.cellularity, output.cellularity) + op.absolute_symlink(input.sex, output.sex) + + +# Prepare Pyclone inputs + + +rule _pyclone_vi_subset_maf: + input: + maf = str(rules._pyclone_vi_input_maf.output.maf), + GAMBLR = ancient(rules._pyclone_vi_install_GAMBLR.output.installed) + output: + maf = CFG["dirs"]["build_inputs"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.subset.maf" + params: + script = CFG["scripts"]["subset_maf"] + conda: + CFG["conda_envs"]["gamblr"] + script: + "{params.script}" + +subset_maf = CFG["options"]["build_input"]["subset_maf"] + +rule _pyclone_vi_build_input: + input: + maf = str(rules._pyclone_vi_subset_maf.output.maf) if subset_maf else str(rules._pyclone_vi_input_maf.output.maf), + cnv = expand(str(rules._pyclone_vi_input_battenberg.output.subclones), seq_type = "genome", allow_missing = True), + cellularity = expand(str(rules._pyclone_vi_input_battenberg.output.cellularity), seq_type = "genome", allow_missing = True), + sex = 
expand(str(rules._pyclone_vi_input_battenberg.output.sex), seq_type = "genome", allow_missing = True) + output: + inputs = CFG["dirs"]["build_inputs"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.inputs.tsv" + params: + script = CFG["scripts"]["build_input"], + version = lambda w: {"genome": "pyclone-vi", "capture": "pyclone"}[w.seq_type] + log: + CFG["logs"]["build_inputs"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}/build_input.log" + conda: + CFG["conda_envs"]["python"] + threads: + CFG["threads"]["build_input"] + resources: + **CFG["resources"]["build_input"] + group: "input_and_build" + shell: + op.as_one_line(""" + cellularity=$(tail -n +2 {input.cellularity} | cut -f 1); + if [[ $(tail -n+2 {input.sex} | cut -f4) == "female" ]]; then sex="F"; else sex="M"; fi; + echo "Prepping PyClone-vi inputs with cellularity $cellularity and sex $sex. "; + python {params.script} -c battenberg -s $sex -t $cellularity -ic {input.cnv} -is {input.maf} -o {output.inputs} -id {wildcards.tumour_id} -p {params.version} + > {log} 2>&1 + """) + +# Merge all built inputs into a single tsv file per patient +def get_built_inputs(wildcards): + CFG = config["lcr-modules"]["pyclone_vi"] + PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"]) + inputs = expand( + [ + str(rules._pyclone_vi_build_input.output.inputs) + ], + zip, + tumour_id = PATIENT["tumour_sample_id"], + normal_id = PATIENT["normal_sample_id"], + pair_status = PATIENT["pair_status"], + allow_missing = True + ) + return(inputs) + +def get_cellularity(wildcards): + CFG = config["lcr-modules"]["pyclone_vi"] + PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"]) + inputs = expand( + rules._pyclone_vi_input_battenberg.input.cellularity, + zip, + seq_type = ["genome"]*len(PATIENT["tumour_genome_build"]), + genome_build = PATIENT["tumour_genome_build"], + tumour_id = PATIENT["tumour_sample_id"], + normal_id = PATIENT["normal_sample_id"], + pair_status = PATIENT["pair_status"] + ) + cell_list = [] + for file in inputs: + cellularity = pd.read_csv(file, sep = "\t") + cellularity = cellularity["cellularity"].tolist() + cell_list = cell_list + cellularity + cell_list = [str(i) for i in cell_list] + cellularities = " ".join(cell_list) + sample_ids = " ".join(PATIENT.tumour_sample_id.tolist()) + return dict(cellularity = cellularities, sample_ids = sample_ids) + +rule _pyclone_vi_merge_input: + input: + get_built_inputs + output: + merged = CFG["dirs"]["build_inputs"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.merged_inputs.tsv" + group: "input_and_build" + run: + df_list = [] + for tsv in input: + df = pd.read_csv(tsv, sep = "\t", header = 0, index_col = None) + df_list.append(df) + df_merged = pd.concat(df_list, ignore_index = True, axis = 0) + df_merged.sort_values("mutation_id", inplace=True) + df_merged.rename(columns = {"var_counts": "alt_counts"}, inplace=True) + df_merged.to_csv(output.merged, sep = "\t", index=False) + +# PyClone for capture data +rule _pyclone_run_analysis_pipeline: + input: + tsv = get_built_inputs + output: + loci = CFG["dirs"]["og-pyclone"] + "{seq_type}--{genome_build}/{patient_id}/tables/loci.tsv" + params: + workdir = CFG["dirs"]["og-pyclone"] + "{seq_type}--{genome_build}/{patient_id}", + cellularity = lambda w: get_cellularity(w)["cellularity"], + sample_ids = lambda w: 
get_cellularity(w)["sample_ids"] + conda: CFG["conda_envs"]["pyclone"] + wildcard_constraints: seq_type = "capture" + shell: + op.as_one_line(""" + PyClone run_analysis_pipeline + --in_files {input.tsv} + --working_dir {params.workdir} + --tumour_contents {params.cellularity} + --samples {params.sample_ids} + --plot_file_format pdf + """) + + +# Fit pyclone-vi +rule _pyclone_vi_fit: + input: + tsv = str(rules._pyclone_vi_merge_input.output.merged) + output: + trace = CFG["dirs"]["fit"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.h5" + log: + CFG["logs"]["fit"] + "{seq_type}--{genome_build}/{patient_id}/fit.log" + params: + **CFG["options"]["fit"] + threads: + CFG["threads"]["fit"] + resources: + **CFG["resources"]["fit"] + conda: + CFG["conda_envs"]["pyclone-vi"] + group: "fit_and_write" + wildcard_constraints: seq_type = "genome" + shell: + op.as_one_line(""" + pyclone-vi fit + -i {input.tsv} + -o {output.trace} + -c {params.num_clusters} + -d binomial + -r {params.num_restarts} + {params.opts} + > {log} 2>&1 + """) + +# Fit pyclone-vi +rule _pyclone_vi_write_results: + input: + trace = str(rules._pyclone_vi_fit.output.trace) + output: + results = CFG["dirs"]["fit"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.pyclone_vi.tsv" + log: + CFG["logs"]["fit"] + "{seq_type}--{genome_build}/{patient_id}/write.log" + conda: + CFG["conda_envs"]["pyclone-vi"] + group: "fit_and_write" + wildcard_constraints: seq_type = "genome" + shell: + op.as_one_line(""" + pyclone-vi write-results-file -i {input.trace} -o {output.results} > {log} 2>&1 + """) + +# Run PhyClone to get phylogenetic tree +rule _pyclone_vi_run_phyclone: + input: + merged = str(rules._pyclone_vi_merge_input.output.merged), + pyclone = lambda w: str(rules._pyclone_vi_write_results.output.results) if w.seq_type == "genome" else str(rules._pyclone_run_analysis_pipeline.output.loci) + output: + trace = CFG["dirs"]["phyclone"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.phyclone.pkl.gz" + params: + **CFG["options"]["phyclone"] + threads: + CFG["threads"]["phyclone"] + resources: + **CFG["resources"]["phyclone"] + conda: + CFG["conda_envs"]["phyclone"] + shell: + op.as_one_line(""" + phyclone run + -i {input.merged} + -c {input.pyclone} + -o {output.trace} + -b {params.burnin} + -d {params.density} + -n {params.num_iters} + """) + +rule _pyclone_vi_phyclone_consensus: + input: + merged = str(rules._pyclone_vi_merge_input.output.merged), + pyclone = str(rules._pyclone_vi_run_phyclone.output.trace) + output: + tree = CFG["dirs"]["phyclone"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.phyclone.tree.nwk", + clusters = CFG["dirs"]["phyclone"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.phyclone.clusters.tsv" + params: + **CFG["options"]["phyclone"] + threads: + CFG["threads"]["phyclone"] + resources: + **CFG["resources"]["phyclone"] + conda: + CFG["conda_envs"]["phyclone"] + shell: + op.as_one_line(""" + phyclone consensus + -i {input.pyclone} + -t {output.tree} + -o {output.clusters} + """) + +rule _pyclone_vi_compute_tree_stats: + input: + trace = str(rules._pyclone_vi_run_phyclone.output.trace) + output: + stats = CFG["dirs"]["phyclone"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.phyclone.stats.tsv" + params: + **CFG["options"]["phyclone"], + script = CFG["scripts"]["compute_stats"] + threads: + CFG["threads"]["phyclone"] + resources: + **CFG["resources"]["phyclone"] + conda: + CFG["conda_envs"]["phyclone"] + shell: + op.as_one_line(""" + python {params.script} + -i 
{input.trace} + -o {output.stats} + -p {wildcards.patient_id} + -b {params.burnin} + """) + + +# Symlinks the final output files into the module results directory (under '99-outputs/') +rule _pyclone_vi_output_tsv: + input: + pyclone = lambda w: str(rules._pyclone_vi_write_results.output.results) if w.seq_type == "genome" else str(rules._pyclone_run_analysis_pipeline.output.loci), + phyclone = str(rules._pyclone_vi_compute_tree_stats.output.stats), + tree = str(rules._pyclone_vi_phyclone_consensus.output.tree), + clusters = str(rules._pyclone_vi_phyclone_consensus.output.clusters), + output: + pyclone = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.pyclone.results.tsv", + phyclone = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.phyclone.stats.tsv", + tree = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.phyclone.tree.nwk", + clusters = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.phyclone.clusters.tsv" + run: + op.relative_symlink(input.pyclone, output.pyclone) + op.relative_symlink(input.phyclone, output.phyclone) + op.relative_symlink(input.tree, output.tree) + op.relative_symlink(input.clusters, output.clusters) + + +# Generates the target sentinels for each run, which generate the symlinks + +# Generate a de-duplicated table of patient_ids etc. +PATIENTS_GENOMES = op.filter_samples(CFG["runs"][["tumour_patient_id", "normal_patient_id", "tumour_genome_build", "tumour_seq_type"]], tumour_seq_type = "genome")\ + .drop_duplicates(subset = None, ignore_index = True) +PATIENTS_CAPTURE = op.filter_samples(CFG["runs"][["tumour_patient_id", "normal_patient_id", "tumour_genome_build", "tumour_seq_type"]], tumour_seq_type = "capture")\ + .drop_duplicates(subset = None, ignore_index = True) +if isinstance(PATIENTS_GENOMES, pd.DataFrame) and isinstance(PATIENTS_CAPTURE, pd.DataFrame): + PATIENTS = pd.concat([PATIENTS_GENOMES, PATIENTS_CAPTURE]) + +rule _pyclone_vi_all: + input: + expand( + [ + str(rules._pyclone_vi_output_tsv.output.phyclone), + str(rules._pyclone_vi_output_tsv.output.pyclone), + str(rules._pyclone_vi_output_tsv.output.tree), + str(rules._pyclone_vi_output_tsv.output.clusters) + ], + zip, # Run expand() with zip(), not product() + seq_type=PATIENTS["tumour_seq_type"], + genome_build=PATIENTS["tumour_genome_build"], + patient_id=PATIENTS["tumour_patient_id"]) + # expand( + # str(rules._pyclone_run_analysis_pipeline.output.workdir), + # zip, + # seq_type=PATIENTS_CAPTURE["tumour_seq_type"], + # genome_build=PATIENTS_CAPTURE["tumour_genome_build"], + # patient_id=PATIENTS_CAPTURE["tumour_patient_id"] + # ) + + +##### CLEANUP ##### + + +# Perform some clean-up tasks, including storing the module-specific +# configuration on disk and deleting the `CFG` variable +op.cleanup_module(CFG) diff --git a/modules/pyclone_vi/1.0/schemas/base-1.0.yaml b/modules/pyclone_vi/1.0/schemas/base-1.0.yaml new file mode 120000 index 00000000..0a69d1ce --- /dev/null +++ b/modules/pyclone_vi/1.0/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/pyclone_vi/1.0/schemas/time_point-1.0.yaml b/modules/pyclone_vi/1.0/schemas/time_point-1.0.yaml new file mode 120000 index 00000000..c163d396 --- /dev/null +++ b/modules/pyclone_vi/1.0/schemas/time_point-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/time_point/time_point-1.0.yaml \ No newline at end of file diff --git a/modules/pyclone_vi/1.0/src/build_input.py b/modules/pyclone_vi/1.0/src/build_input.py new file 
mode 100644 index 00000000..f412fd6c --- /dev/null +++ b/modules/pyclone_vi/1.0/src/build_input.py @@ -0,0 +1,213 @@ +from operator import index +import numpy as np +import pandas as pd + + +def main(args): + df = build_input_df( + args.cnv_file, + args.snv_file, + sample_id=args.sample_id, + cnv_caller=args.cnv_caller, + sex=args.sex, + tumour_content=args.tumour_content, + pyclone_tool=args.pyclone_tool + ) + + assert np.all(df["major_cn"] >= df["minor_cn"]) + + df = df[df["major_cn"] > 0] + + df.to_csv(args.out_file, index=False, sep="\t") + + +def build_input_df(cnv_file, snv_file, sample_id="tumour", cnv_caller="battenberg", sex=None, tumour_content=1.0, pyclone_tool="pyclone-vi"): + if cnv_caller == "battenberg": + cnv_df = load_battenberg_cnv_df(cnv_file, sample_id=sample_id, sex=sex) + else: + cnv_df = load_sequenza_cnv_df(cnv_file, sex=sex) + snv_df = load_snv_df(snv_file, sample_id=sample_id) + df = merge_files(cnv_df, snv_df) + df["tumour_content"] = tumour_content + if pyclone_tool == "pyclone": + df = df[["mutation_id", "ref_counts", "alt_counts", "normal_cn", "major_cn", "minor_cn", "tumour_content", "sample_id"]].rename(columns = {"alt_counts": "var_counts"}) + return df + + +def load_battenberg_cnv_df(file_name, sample_id="tumour", sex=None, solution="A"): + def get_dominant_clone(x, solution="A"): + frac_1_key = "frac1_{}".format(solution) + frac_2_key = "frac2_{}".format(solution) + if np.isnan(x[frac_2_key]): + clone = 1 + else: + if x[frac_1_key] >= x[frac_2_key]: + clone = 1 + else: + clone = 2 + return clone + + def get_major_cn(x, solution="A"): + clone = get_dominant_clone(x, solution=solution) + return x["nMaj{clone}_{solution}".format(clone=clone, solution=solution)] + + def get_minor_cn(x, solution="A"): + clone = get_dominant_clone(x, solution=solution) + return x["nMin{clone}_{solution}".format(clone=clone, solution=solution)] + + df = pd.read_csv(file_name, sep="\t") + df = df.rename(columns={"chr": "chrom", "startpos": "start", "endpos": "end"}) + df["major_cn"] = df.apply(lambda row: get_major_cn(row, solution=solution), axis=1) + df["minor_cn"] = df.apply(lambda row: get_minor_cn(row, solution=solution), axis=1) + df["normal_cn"] = df["chrom"].apply(lambda row: get_normal_cn(row, sex=sex)) + df["sample_id"] = sample_id + df = df[["sample_id", "chrom", "start", "end", "normal_cn", "major_cn", "minor_cn"]] + if sex is None: + df = df[~df["chrom"].isin(["X", "Y"])] + df.to_csv("loaded_battenberg.tsv", sep="\t", index=False) + return df + + +def load_sequenza_cnv_df(file_name, sex=None): + df = pd.read_csv(file_name, sep="\t") + df = df.rename(columns={"chromosome": "chrom", "start.pos": "start", "end.pos": "end"}) + df["major_cn"] = df.apply(lambda row: max(row["A"], row["B"]), axis=1) + df["minor_cn"] = df.apply(lambda row: min(row["A"], row["B"]), axis=1) + df["normal_cn"] = df["chrom"].apply(lambda row: get_normal_cn(row, sex=sex)) + df = df[["chrom", "start", "end", "normal_cn", "major_cn", "minor_cn"]] + if sex is None: + df = df[~df["chrom"].isin(["X", "Y"])] + df["sample_id"] = "tumour" + df = df.dropna() + return df + + +def get_normal_cn(chrom, sex): + if sex == "M": + if chrom in ["X", "Y"]: + cn = 1 + else: + cn = 2 + else: + cn = 2 + return cn + + +def load_snv_df(file_name, sample_id="tumour"): + df = pd.read_csv(file_name, sep="\t") + # PyClone only works on SNPs, not InDels + df = df[df["Variant_Type"].isin(["SNP"])] + # Ignore intergenic mutations (IGR) + df = df[~df["Variant_Classification"].isin(["IGR"])] + # Acutally I can't do this 
sub-sampling here because the mutations in all files for all tumours + # per patient need to be the same! Have to do this at the merging step. + # Except that by the time this file is generated, info about coding status of + # mutations is lost. Need to think about this. + # Separate the df into coding and non-coding mutations first + # df_coding = df[~df["Variant_Classification"].isin(["3'Flank", "5'Flank", "Intron"])] + # df_noncoding = df[df["Variant_Classification"].isin(["3'Flank", "5'Flank", "Intron"])] + # # To get up to 5000 total mutations, first check how many coding mutations there are: + # if len(df_coding.index) >= 5000: + # df = df_coding.sample(n = 5000) + # else: + # to_add = min([(5000 - len(df_coding.index)), len(df_noncoding.index)]) + # df_noncoding = df_noncoding.sample(n = to_add) + # df = pd.concat([df_coding, df_noncoding]) + df = df.rename(columns={ + "Chromosome": "chrom", + "Start_Position": "coord", + "Reference_Allele": "ref", + "Tumor_Seq_Allele2": "alt", + "t_ref_count": "ref_counts", + "t_alt_count": "alt_counts" + }) + df = df[["chrom", "coord", "ref", "alt", "ref_counts", "alt_counts"]] + df["mutation_id"] = df.apply(lambda row: "{chrom}:{coord}:{ref}:{alt}".format(**row.to_dict()), axis=1) + df["sample_id"] = sample_id + return df + + +def position_segment_merge(positions, segments): + """ + Merge positions with segments that contain them + + Args: + positions (pandas.DataFrame): ['chrom', 'coord'] columns required + segments (pandas.DataFrame): ['chrom', 'start', 'end'] columns required + + Returns: + pandas.DataFrame: merged table with ['chrom', 'coord', 'start', 'end'] columns + + + Assuming a set of non-overlapping segments, merge a set of positions so that + each entry in the new table provides coord and containing segment start/end + """ + + positions = positions[['chrom', 'coord']].copy().sort_values(by=['chrom','coord']) + positions["chrom"] = positions["chrom"].astype(str) + positions["chrom"] = positions["chrom"].str.strip() + segments = segments[['chrom', 'start', 'end']].copy().sort_values(by=['chrom','start']) + segments["chrom"] = segments["chrom"].astype(str) + segments["chrom"] = segments["chrom"].str.strip() + + merged = positions.merge(segments, left_on='chrom', right_on='chrom', how="left")\ + .sort_values(by=['chrom', 'coord']) + + + merged['start'] = merged['start'].fillna(method='ffill') + merged['end'] = merged['end'].fillna(method='ffill') + + + merged = merged[(merged['coord'] >= merged['start']) & + (merged['coord'] <= merged['end'])] + + return merged + + +def merge_files(cnv_df, snv_df): + df = [] + for sample_id in cnv_df['sample_id'].unique(): + df_1 = snv_df[snv_df['sample_id'] == sample_id] + df_2 = cnv_df[cnv_df['sample_id'] == sample_id] + df_2["chrom"] = df_2["chrom"].astype(str).str.strip() + merged = position_segment_merge(df_1, df_2) + merged["chrom"] = merged["chrom"].astype(str).str.strip() + merged = pd.merge(merged, df_2, on=['chrom', 'start', 'end']) + merged = merged[['chrom', 'coord', 'major_cn', 'minor_cn', 'normal_cn']] + # df_1.to_csv("snv_df.tsv", sep="\t", index=False) + df_1["chrom"] = df_1["chrom"].astype(str).str.strip() + merged["chrom"] = merged["chrom"].astype(str).str.strip() + merged = pd.merge(merged, df_1, on=['chrom', 'coord']) + # merged.to_csv("merged.tsv", sep="\t", index=False) + merged = merged[['mutation_id', 'sample_id', 'ref_counts', 'alt_counts', 'normal_cn', 'major_cn', 'minor_cn']] + df.append(merged) + df = pd.concat(df) + df.drop_duplicates(subset=['mutation_id', 'sample_id'], 
inplace=True) + df['major_cn'], df['minor_cn'] = df['major_cn'].astype('Int64'), df['minor_cn'].astype('Int64') + return df + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("-ic", "--cnv-file", required=True) + + parser.add_argument("-is", "--snv-file", required=True) + + parser.add_argument("-id", "--sample-id", required=True) + + parser.add_argument("-o", "--out-file", required=True) + + parser.add_argument("-c", "--cnv-caller", default="battenberg", choices=["battenberg", "sequenza"]) + + parser.add_argument("-s", "--sex", default=None, choices=["M", "F"]) + + parser.add_argument("-t", "--tumour-content", default=1.0, type=float) + + parser.add_argument("-p", "--pyclone_tool", default = "pyclone-vi", choices = ["pyclone-vi", "pyclone"]) + + cli_args = parser.parse_args() + + main(cli_args) diff --git a/modules/pyclone_vi/1.0/src/build_pyclone_summary_file.py b/modules/pyclone_vi/1.0/src/build_pyclone_summary_file.py new file mode 100644 index 00000000..d48e9b56 --- /dev/null +++ b/modules/pyclone_vi/1.0/src/build_pyclone_summary_file.py @@ -0,0 +1,33 @@ +import pandas as pd + + +def main(args): + df = pd.read_csv(args.in_file, sep="\t") + + out_df = pd.merge( + df.groupby("cluster_id")["mutation_id"].nunique().reset_index(), + df[["cluster_id", "cellular_prevalence"]].drop_duplicates(), + on="cluster_id" + ) + + out_df.insert(0, "patient_id", args.patient_id) + + out_df = out_df.rename(columns={"mutation_id": "num_snvs"}) + + out_df.to_csv(args.out_file, index=False, sep="\t") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("-i", "--in-file", required=True) + + parser.add_argument("-o", "--out-file", required=True) + + parser.add_argument("-p", "--patient-id", required=True) + + cli_args = parser.parse_args() + + main(cli_args) diff --git a/modules/pyclone_vi/1.0/src/compute_expected_statistics.py b/modules/pyclone_vi/1.0/src/compute_expected_statistics.py new file mode 100644 index 00000000..a98a01a7 --- /dev/null +++ b/modules/pyclone_vi/1.0/src/compute_expected_statistics.py @@ -0,0 +1,126 @@ +from phyclone.map import get_map_node_ccfs +from phyclone.run import get_labels_table +from phyclone.tree import Tree + +import gzip +import networkx as nx +import numpy as np +import pandas as pd +import pickle + + +def main(args): + with gzip.GzipFile(args.in_file, "rb") as fh: + results = pickle.load(fh) + + out_df = [] + + for state in results["trace"][args.burnin:]: + phyclone_tree = Tree.from_dict(results["data"], state["tree"]) + + tree = get_tree_from_phyclone_tree(phyclone_tree) + + prevs = np.array([tree.nodes[n]["clonal_prev"][0] for n in tree.nodes]) + + prevs += 1e-6 + + prevs = prevs / np.sum(prevs) + + entropy = -np.sum(prevs * np.log2(prevs)) + + num_clones = len(prevs) + + labels = get_labels_table(results["data"], phyclone_tree, clusters=results["clusters"]) + + num_snvs = labels.groupby("cluster_id")["mutation_id"].nunique().values + + min_num_snvs = min(num_snvs) + + max_num_snvs = max(num_snvs) + + mean_num_snvs = np.mean(num_snvs) + + median_num_snvs = np.median(num_snvs) + + out_df.append({ + "entropy": entropy, + "num_clones": num_clones, + "min_num_snvs": min_num_snvs, + "max_num_snvs": max_num_snvs, + "mean_num_snvs": mean_num_snvs, + "median_num_snvs": median_num_snvs + }) + + out_df = pd.DataFrame(out_df) + + out_df = pd.DataFrame([out_df.mean()]) + + out_df.insert(0, "patient_id", args.patient_id) + + out_df.to_csv(args.out_file, 
index=False, sep="\t")
+
+
+def get_tree_from_phyclone_tree(tree):
+    """ Convert a PhyClone tree object to a graph for benchmarking.
+
+    Parameters
+    ----------
+    tree: (phyclone.tree.Tree)
+
+    Returns
+    -------
+    nx.DiGraph representing clone phylogeny with nodes "snvs", "cellular_prev" and "clonal_prev" set for each node
+    """
+    ccfs = get_map_node_ccfs(tree)
+
+    G = nx.DiGraph()
+
+    for n in tree.graph.nodes:
+        G.add_node(n)
+
+        G.nodes[n]["cellular_prev"] = ccfs[n]
+
+        G.nodes[n]["snvs"] = [x.name for x in tree.node_data[n]]
+
+    for u, v in tree.graph.edges:
+        G.add_edge(u, v)
+
+    roots = []
+
+    for n in G.nodes:
+        if G.in_degree(n) == 0:
+            roots.append(n)
+
+    for r in roots:
+        set_clonal_prev(G, r)
+
+    return G
+
+
+def set_clonal_prev(G, node):
+    clonal_prev = G.nodes[node]["cellular_prev"].copy()
+
+    for child in G.successors(node):
+        clonal_prev -= G.nodes[child]["cellular_prev"]
+
+        set_clonal_prev(G, child)
+
+    G.nodes[node]["clonal_prev"] = clonal_prev
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("-i", "--in-file", required=True)
+
+    parser.add_argument("-o", "--out-file", required=True)
+
+    parser.add_argument("-p", "--patient-id", required=True)
+
+    parser.add_argument("-b", "--burnin", default=0, type=int)
+
+    cli_args = parser.parse_args()
+
+    main(cli_args)
diff --git a/modules/pyclone_vi/1.0/src/merge_files.py b/modules/pyclone_vi/1.0/src/merge_files.py
new file mode 100644
index 00000000..b8e22b8e
--- /dev/null
+++ b/modules/pyclone_vi/1.0/src/merge_files.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+
+def main(args):
+    out_df = []
+
+    for file_name in args.in_files:
+        out_df.append(pd.read_csv(file_name, sep="\t"))
+
+    out_df = pd.concat(out_df)
+
+    out_df.to_csv(args.out_file, index=False, sep="\t")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("-i", "--in-files", nargs="+", required=True)
+
+    parser.add_argument("-o", "--out-file", required=True)
+
+    cli_args = parser.parse_args()
+
+    main(cli_args)
diff --git a/modules/pyclone_vi/1.0/src/subset_maf_for_pyclone.R b/modules/pyclone_vi/1.0/src/subset_maf_for_pyclone.R
new file mode 100644
index 00000000..b4d1ca55
--- /dev/null
+++ b/modules/pyclone_vi/1.0/src/subset_maf_for_pyclone.R
@@ -0,0 +1,56 @@
+library(GAMBLR)
+library(tidyverse)
+library(data.table)
+
+genome_build <- snakemake@wildcards[["genome_build"]]
+in_maf <- snakemake@input[["maf"]]
+outfile <- snakemake@output[["maf"]]
+
+# Load the input maf file for this sample
+maf <- fread_maf(in_maf)
+
+coding_types <- c(
+  "Frame_Shift_Del",
+  "Frame_Shift_Ins",
+  "In_Frame_Del",
+  "In_Frame_Ins",
+  "Missense_Mutation",
+  "Nonsense_Mutation",
+  "Nonstop_Mutation",
+  "Silent",
+  "Splice_Region",
+  "Splice_Site",
+  "Translation_Start_Site"
+)
+
+# Load the correct aSHM regions file for the current genome_build
+if (str_detect(genome_build, "grch37|hg19|hs37d5")){
+  ashm_regions <- grch37_ashm_regions[1:3]
+} else {
+  ashm_regions <- hg38_ashm_regions[1:3]
+}
+
+# Remove the chr prefix if necessary
+if (!str_detect(maf$Chromosome[1], "chr")){
+  ashm_regions <- mutate(ashm_regions,
+    chr_name = str_remove(chr_name, "chr"))
+}
+
+# Rename the columns and set the keys for foverlaps
+colnames(ashm_regions) <- c("Chromosome", "Start_Position", "End_Position")
+ashm_regions <- data.table(ashm_regions)
+setkey(ashm_regions, Chromosome, Start_Position, End_Position)
+
+# Subset the maf to aSHM regions using foverlaps
+subset_maf <- 
foverlaps(maf, ashm_regions) %>%
+  filter(!is.na(Start_Position) |
+    Variant_Classification %in% coding_types) %>%
+  select(-Start_Position, -End_Position) %>%
+  select(Start_Position = i.Start_Position,
+    End_Position = i.End_Position,
+    everything()) %>%
+  select(all_of(colnames(maf)))
+
+# Write to file
+write_tsv(subset_maf, outfile)
+
diff --git a/modules/pyclone_vi/CHANGELOG.md b/modules/pyclone_vi/CHANGELOG.md
new file mode 100644
index 00000000..34537789
--- /dev/null
+++ b/modules/pyclone_vi/CHANGELOG.md
@@ -0,0 +1,16 @@
+# Changelog
+
+All notable changes to the `pyclone_vi` module will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.0] - 2022-02-22
+
+This release was authored by Laura Hilton.
+See details about [PyClone](https://github.com/Roth-Lab/pyclone) and [PyClone-VI](https://github.com/Roth-Lab/pyclone-vi).
+The PhyClone GitHub repository is currently private but the tool is available on Conda. Several scripts were adapted from those written by Andrew Roth.
+
+This module runs either the original PyClone (meant for capture data) or PyClone-VI (newer, meant for genome data), followed by PhyClone to refine clusters and assign a phylogeny. It runs on patients with multiple time points and requires a time_point column in the input metadata. The original PyClone includes some plots in its outputs; PhyClone and PyClone-VI do not currently generate any plots, but this will be added as a feature in the near future.
+
+PyClone-VI seems to offer superior performance when given only coding and aSHM mutations, rather than all mutations in the genome. This is toggleable in the config with the `subset_maf` option.
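To make the expected PyClone-VI input concrete, here is a minimal sketch (Python with pandas; all variant and copy-number values below are hypothetical) of the per-SNV row that `build_input.py` assembles: the mutation ID is formatted as `chrom:coord:ref:alt`, allele counts come from the MAF, the major/minor copy numbers come from the overlapping Battenberg segment, and the normal copy number depends on sex and chromosome.

```python
# Minimal sketch of one PyClone-VI input row, mirroring build_input.py.
# All variant and copy-number values here are hypothetical examples.
import pandas as pd


def get_normal_cn(chrom, sex):
    # Male sex chromosomes are haploid; everything else is diploid.
    return 1 if (sex == "M" and chrom in ["X", "Y"]) else 2


def make_input_row(chrom, coord, ref, alt, ref_counts, alt_counts,
                   major_cn, minor_cn, sex="F", sample_id="tumour",
                   tumour_content=1.0):
    return {
        "mutation_id": "{}:{}:{}:{}".format(chrom, coord, ref, alt),
        "sample_id": sample_id,
        "ref_counts": ref_counts,
        "alt_counts": alt_counts,
        "normal_cn": get_normal_cn(chrom, sex),
        "major_cn": major_cn,
        "minor_cn": minor_cn,
        "tumour_content": tumour_content,
    }


# One hypothetical SNV falling in a segment with major_cn = 2, minor_cn = 1
df = pd.DataFrame([make_input_row("1", 1234567, "C", "T", 40, 25, 2, 1)])
print(df.to_csv(sep="\t", index=False))
```

As in `build_input.py`, rows with `major_cn == 0` are dropped and `major_cn >= minor_cn` is asserted before the table is written; for capture samples run through the original PyClone, the `alt_counts` column is renamed to `var_counts`.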
From 07989c1bdbfdc4aa84fe0aa87b41754622d7a291 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 22:21:03 -0700 Subject: [PATCH 05/14] Restore files messed up by git --- envs/phylowgs/fill_battenberg.yaml | 41 ++ envs/phylowgs/phylowgs_results.yaml | 204 +++++++ .../phylowgs/1.0/envs/bcftools-1.10.2.yaml | 1 + modules/phylowgs/1.0/envs/coreutils-8.31.yaml | 1 + modules/phylowgs/1.0/envs/phylowgs.yaml | 1 + .../phylowgs/1.0/envs/phylowgs_results.yaml | 1 + modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml | 1 + modules/phylowgs/1.0/etc/noncoding.txt | 8 + modules/phylowgs/1.0/schemas/base-1.0.yaml | 1 + .../phylowgs/1.0/schemas/time_point-1.0.yaml | 1 + .../1.0/src/process_phyloWGS_outputs.R | 536 ++++++++++++++++++ 11 files changed, 796 insertions(+) create mode 100644 envs/phylowgs/fill_battenberg.yaml create mode 100644 envs/phylowgs/phylowgs_results.yaml create mode 120000 modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml create mode 120000 modules/phylowgs/1.0/envs/coreutils-8.31.yaml create mode 120000 modules/phylowgs/1.0/envs/phylowgs.yaml create mode 120000 modules/phylowgs/1.0/envs/phylowgs_results.yaml create mode 120000 modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml create mode 100644 modules/phylowgs/1.0/etc/noncoding.txt create mode 120000 modules/phylowgs/1.0/schemas/base-1.0.yaml create mode 120000 modules/phylowgs/1.0/schemas/time_point-1.0.yaml create mode 100644 modules/phylowgs/1.0/src/process_phyloWGS_outputs.R diff --git a/envs/phylowgs/fill_battenberg.yaml b/envs/phylowgs/fill_battenberg.yaml new file mode 100644 index 00000000..52741e52 --- /dev/null +++ b/envs/phylowgs/fill_battenberg.yaml @@ -0,0 +1,41 @@ +name: fill_segments +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - ca-certificates=2020.12.5 + - certifi=2020.12.5 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libffi=3.3 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libgomp=9.3.0 + - liblapack=3.9.0 + - libopenblas=0.3.12 + - libstdcxx-ng=9.3.0 + - ncurses=6.2 + - numpy=1.19.4 + - openssl=1.1.1i + - pandas=1.2.0 + - pip=20.3.3 + - python=3.9.1 + - python-dateutil=2.8.1 + - python_abi=3.9 + - pytz=2020.5 + - readline=8.0 + - setuptools=49.6.0 + - simplejson=3.17.2 + - six=1.15.0 + - sqlite=3.34.0 + - tk=8.6.10 + - tzdata=2020e + - wheel=0.36.2 + - xz=5.2.5 + - zlib=1.2.11 +prefix: /home/dreval/miniconda3/envs/fill_segments diff --git a/envs/phylowgs/phylowgs_results.yaml b/envs/phylowgs/phylowgs_results.yaml new file mode 100644 index 00000000..f64126d7 --- /dev/null +++ b/envs/phylowgs/phylowgs_results.yaml @@ -0,0 +1,204 @@ +name: phylowgs_outputs +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.5.30 + - cairo=1.16.0 + - curl=7.77.0 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - 
krb5=1.19.1 + - ld_impl_linux-64=2.35.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.77.0 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.3 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1k + - pandoc=2.14.0.3 + - pango=1.48.6 + - pcre=8.45 + - pcre2=10.36 + - pixman=0.40.0 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.2.1 + - r-base=4.1.0 + - r-base64enc=0.1_3 + - r-blob=1.2.1 + - r-brio=1.1.2 + - r-broom=0.7.8 + - r-callr=3.7.0 + - r-cellranger=1.1.0 + - r-cli=3.0.0 + - r-clipr=0.7.1 + - r-colorspace=2.0_2 + - r-cpp11=0.3.1 + - r-crayon=1.4.1 + - r-curl=4.3.2 + - r-data.table=1.14.0 + - r-dbi=1.1.1 + - r-dbplyr=2.1.1 + - r-desc=1.3.0 + - r-diffobj=0.3.4 + - r-digest=0.6.27 + - r-dplyr=1.0.7 + - r-dtplyr=1.1.0 + - r-ellipsis=0.3.2 + - r-evaluate=0.14 + - r-fansi=0.4.2 + - r-farver=2.1.0 + - r-forcats=0.5.1 + - r-fs=1.5.0 + - r-gargle=1.1.0 + - r-generics=0.1.0 + - r-ggplot2=3.3.5 + - r-ggrepel=0.9.1 + - r-glue=1.4.2 + - r-googledrive=1.0.1 + - r-googlesheets4=0.3.0 + - r-gtable=0.3.0 + - r-haven=2.4.1 + - r-highr=0.9 + - r-hms=1.1.0 + - r-htmltools=0.5.1.1 + - r-httr=1.4.2 + - r-ids=1.0.1 + - r-isoband=0.2.4 + - r-jsonlite=1.7.2 + - r-knitr=1.33 + - r-labeling=0.4.2 + - r-lattice=0.20_44 + - r-lifecycle=1.0.0 + - r-lubridate=1.7.10 + - r-magrittr=2.0.1 + - r-markdown=1.1 + - r-mass=7.3_54 + - r-matrix=1.3_4 + - r-mgcv=1.8_36 + - r-mime=0.11 + - r-modelr=0.1.8 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-openssl=1.4.4 + - r-pillar=1.6.1 + - r-pkgconfig=2.0.3 + - r-pkgload=1.2.1 + - r-plyr=1.8.6 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.5.2 + - r-progress=1.2.2 + - r-ps=1.6.0 + - r-purrr=0.3.4 + - r-r6=2.5.0 + - r-rappdirs=0.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.6 + - r-readr=1.4.0 + - r-readxl=1.3.1 + - r-rematch=1.0.1 + - r-rematch2=2.1.2 + - r-reprex=2.0.0 + - r-reshape2=1.4.4 + - r-rjson=0.2.20 + - r-rlang=0.4.11 + - r-rmarkdown=2.9 + - r-rprojroot=2.0.2 + - r-rstudioapi=0.13 + - r-rvest=1.0.0 + - r-scales=1.1.1 + - r-selectr=0.4_2 + - r-stringi=1.6.2 + - r-stringr=1.4.0 + - r-sys=3.4 + - r-testthat=3.0.3 + - r-tibble=3.1.2 + - r-tidyr=1.1.3 + - r-tidyselect=1.1.1 + - r-tidyverse=1.3.1 + - r-tinytex=0.32 + - r-utf8=1.2.1 + - r-uuid=0.1_4 + - r-vctrs=0.3.8 + - r-viridislite=0.4.0 + - r-waldo=0.2.5 + - r-withr=2.4.2 + - r-xfun=0.24 + - r-xml2=1.3.2 + - r-yaml=2.2.1 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/phylowgs_outputs diff --git a/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml new file mode 120000 index 00000000..72959e7b --- /dev/null +++ b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff 
--git a/modules/phylowgs/1.0/envs/coreutils-8.31.yaml b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml
new file mode 120000
index 00000000..050452f7
--- /dev/null
+++ b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml
@@ -0,0 +1 @@
+../../../../envs/coreutils/coreutils-8.31.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/envs/phylowgs.yaml b/modules/phylowgs/1.0/envs/phylowgs.yaml
new file mode 120000
index 00000000..6e962c7f
--- /dev/null
+++ b/modules/phylowgs/1.0/envs/phylowgs.yaml
@@ -0,0 +1 @@
+../../../../envs/phylowgs/phylowgs.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/envs/phylowgs_results.yaml b/modules/phylowgs/1.0/envs/phylowgs_results.yaml
new file mode 120000
index 00000000..926ec438
--- /dev/null
+++ b/modules/phylowgs/1.0/envs/phylowgs_results.yaml
@@ -0,0 +1 @@
+../../../../envs/phylowgs/phylowgs_results.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml
new file mode 120000
index 00000000..829077c7
--- /dev/null
+++ b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml
@@ -0,0 +1 @@
+../../../../envs/vcf2maf/vcf2maf-1.6.18.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/etc/noncoding.txt b/modules/phylowgs/1.0/etc/noncoding.txt
new file mode 100644
index 00000000..258f27f0
--- /dev/null
+++ b/modules/phylowgs/1.0/etc/noncoding.txt
@@ -0,0 +1,8 @@
+Hugo_Symbol
+Silent
+RNA
+IGR
+Intron
+5'Flank
+3'Flank
+5'UTR
diff --git a/modules/phylowgs/1.0/schemas/base-1.0.yaml b/modules/phylowgs/1.0/schemas/base-1.0.yaml
new file mode 120000
index 00000000..0a69d1ce
--- /dev/null
+++ b/modules/phylowgs/1.0/schemas/base-1.0.yaml
@@ -0,0 +1 @@
+../../../../schemas/base/base-1.0.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/schemas/time_point-1.0.yaml b/modules/phylowgs/1.0/schemas/time_point-1.0.yaml
new file mode 120000
index 00000000..c163d396
--- /dev/null
+++ b/modules/phylowgs/1.0/schemas/time_point-1.0.yaml
@@ -0,0 +1 @@
+../../../../schemas/time_point/time_point-1.0.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R
new file mode 100644
index 00000000..587cdb05
--- /dev/null
+++ b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R
@@ -0,0 +1,536 @@
+
+
+#'
+#' Pipeline for processing phyloWGS outputs: takes the output JSON files and the preprocessing output files.
+#' The SAMPLE_ID.mutass.zip file must be unzipped before running the script.
+
+# Example: how to run
+#mkdir -p output
+#Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unziped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out
+
+##################################################
+# load required libraries
+##################################################
+
+# library("optparse")
+library("rjson")
+library("tidyverse")
+library("ggrepel")
+library("data.table")
+
+
+##########################
+#### Snakemake Input #####
+##########################
+
+samplename = snakemake@wildcards[["patient_id"]]
+json_file = snakemake@input[["summ"]]
+trees_out= snakemake@input[["mutass"]]
+ssm_file = snakemake@input[["ssms"]]
+cnv_file = snakemake@input[["cnvs"]]
+mafs = unlist(strsplit(snakemake@params[["maf_list"]], ","))
+mut_file = snakemake@input[["muts"]]
+driver_genes = snakemake@params[["drivers"]]
+sample_order = 
unlist(strsplit(snakemake@params[["sample_order"]], ",")) +genome_build = snakemake@wildcards[["genome_build"]] + +# Define the chr_prefix parameter based on the genome_build +chr_prefixed = str_detect(genome_build, "hg") + + +# option_list = list( +# make_option(c("-n", "--samplename"), type="character", default=NULL, help="Samplename of the sample to run", metavar="character"), +# make_option(c("-j", "--json_summ"), type="character", default=NULL, help="SAMPLE_ID.summ.json file generated by phyloWGS", metavar="character"), +# make_option(c("-t", "--trees_out"), type="character", default=NULL, help="Directory containing unzipped XX.mutass.zip trees", metavar="character"), +# make_option(c("-s", "--ssm"), type="character", default=NULL, help="Preprocessing ssm_data.txt output file", metavar="character"), +# make_option(c("-c", "--copynumber"), type="character", default=NULL, help="Preprocessing cnv_data.txt output file", metavar="character"), +# make_option(c("-a", "--tumourA_maf"), type="character", default=NULL, help="Agument maf file of tumour A", metavar="character"), +# make_option(c("-b", "--tumourB_maf"), type="character", default=NULL, help="Agument maf file of tumour B", metavar="character"), +# make_option(c("-m", "--json_muts"), type="character", default=NULL, help="SAMPLE_ID.muts.json file", metavar="character"), +# make_option(c("-o", "--output"), type="character", default=NULL, help="Output directory", metavar="character") +# ) +# +# opt_parser = OptionParser(option_list=option_list) +# opt = parse_args(opt_parser) +# +# samplename = opt$samplename +# json_file = opt$json_summ +# trees_out= opt$trees_out ### directory where unziped SAMPLE_ID.mutass.zip trees are +# ssm_file = opt$ssm +# cnv_file = opt$copynumber +# mafA = opt$tumourA_maf +# mafB = opt$tumourB_maf +# mut_file = opt$json_muts +# output_dir = opt$output +# +# +# +# .checkfile = function(infile) { +# +# if (!file.exists(infile)) { +# +# stop(paste("File", infile, "does not exist", sep="")) +# +# } +# +# } +# +# +# .checkfile(json_file) +# .checkfile(ssm_file) +# .checkfile(cnv_file) +# .checkfile(mafA) +# .checkfile(mafB) +# .checkfile(mut_file) + +################################################## +# Process input files +################################################### +# Parse the input file and obtain the required data for this run +result1 <- fromJSON(file = json_file) +result_mut<-fromJSON(file = mut_file) +ssm_pre<-read.table(file = ssm_file, header = TRUE) +cnv_pre<-read.delim(file = cnv_file, header = TRUE)[,c("cnv","a","d")] + + + +################################################## +# define output files +################################################## +out_json_to_Rtable= snakemake@output[["tree_summary"]] +ssm_to_trees= snakemake@output[["maf"]] +cnv_to_trees= snakemake@output[["cnvs"]] +cellular_prevalence_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cellular_prevalence.pdf")) +CCF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cancer_cell_fraction_.pdf")) +VAF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_.pdf")) +VAF_coding_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_coding.pdf")) +tree_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_tree.pdf")) +CCF_table = snakemake@output[["CCF"]] + +if(!dir.exists(snakemake@output[["plots"]])){dir.create(snakemake@output[["plots"]])} + +# out_json_to_Rtable= file.path(output_dir, paste("out_res_",samplename,"_json_converted_toR.table", sep = "")) +# 
ssm_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_ssms_to_best_tree_maf_format.table", sep = ""))
+# cnv_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_cnvs_to_best_tree_maf_format.table", sep = ""))
+# cellular_prevalence_plot= file.path(output_dir, paste("cellular_prevalence_",samplename,".pdf", sep = ""))
+# CCF_plot= file.path(output_dir, paste("cancer_cell_fraction_",samplename,".pdf", sep = ""))
+# VAF_plot= file.path(output_dir, paste("vaf_",samplename,".pdf", sep = ""))
+# VAF_coding_plot= file.path(output_dir, paste("vaf_coding",samplename,".pdf", sep = ""))
+
+
+###################################################
+# open summ.json file and convert it into human-readable format
+###################################################
+
+# this function opens SAMPLE_ID.summ.json and converts it into an R table
+open_tree = function(json_summ_file,out_json_to_Rtable){
+
+  out_res<-NULL
+  for (j in 1:length(json_summ_file[["trees"]])){
+
+    tree_focal<-json_summ_file[["trees"]][j]
+    tree_focal_statA<-as.data.frame(t(unlist(sapply(tree_focal,function(x)x[c("clustering_index","branching_index","llh","linearity_index")]))))
+    colnames(tree_focal_statA)<-c("clustering_index","branching_index","llh","linearity_index")
+    tree_focal_statA$tree_id<-j-1
+    rownames(tree_focal_statA)<-NULL
+
+
+    tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,-c(3,6,9,12,15,18,21,24,27,30)]
+    #tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))]
+    colnames(tree_focal_statB)<-sub("^[^.]*.", "", colnames(tree_focal_statB))
+    stat_both<-cbind(tree_focal_statA,tree_focal_statB)
+    out_res<-bind_rows(stat_both,out_res)
+    out_res_ordered<-out_res[order(out_res$tree_id),]
+  } # for j loop
+
+  density<-json_summ_file["tree_densities"]
+  density_unlist<-data.frame("density"=unlist(density))
+  row.names(density_unlist)<-sub("^[^.]*.", "", row.names(density_unlist))
+
+  density_unlist$tree_id<-row.names(density_unlist)
+  row.names(density_unlist)<-NULL
+
+  final_table=merge(out_res_ordered,density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to the all-trees table
+  write.table(final_table, file =out_json_to_Rtable ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE)
+  return(final_table)
+}
+
+
+result_tree<-open_tree(result1,out_json_to_Rtable)
+
+
+###################################################
+# extract the best tree
+###################################################
+# the best tree is the tree with the highest density
+
+best_tree_id = function(R_table, density) {
+  best=R_table[which.max(R_table$density),]
+  best_tree_id<-paste(best$tree_id,"json",sep = ".")
+  return(best_tree_id)
+}
+best_tree_fileID<-best_tree_id(result_tree, density)
+
+
+#######################################################################
+# extract the stats (SNVs and CNVs assigned to each population) from the best tree
+#######################################################################
+
+open_best_tree = function(trees_out,best_tree_id){
+  unzip(trees_out, files = best_tree_id, exdir = dirname(trees_out), overwrite = TRUE)
+  best_tree_path = paste0(dirname(trees_out), "/", best_tree_id)
+  rr <- fromJSON(file = best_tree_path)
+  return(rr)
+}
+rr= open_best_tree(trees_out,best_tree_fileID)
+
+
+#######################################################################
+# annotate point mutations and CNVs 
in the best tree +####################################################################### +best_focal<-result1[["trees"]][as.numeric(gsub(".json","",best_tree_fileID))+1] +tree_structure<-as.data.frame(sapply(best_focal,function(x)x["structure"])) ##[6] +tree_roots <- best_focal[[1]]$structure$`0` + + +merge_both<-function(result1,best_tree_fileID,tree_structure){ + best_tree<-as.numeric(gsub(".json","",best_tree_fileID)) + best_focal<-result1[["trees"]][best_tree+1] + tree_focal_statB<-as.data.frame(sapply(best_focal,function(x)x["populations"])) ##[3] + qq<-tree_focal_statB[,grep("cellular_prevalence",colnames(tree_focal_statB))] %>% + rownames_to_column("sample") %>% + pivot_longer(-sample, + names_to = "population", + values_to = "cellular_prevalence") %>% + mutate(population = str_remove(str_remove(population, ".*populations[.]"), "[.]cellular_prevalence")) %>% + mutate(is_root = ifelse(population %in% tree_roots, TRUE, FALSE)) %>% + group_by(sample) %>% + mutate(purity = sum(cellular_prevalence[is_root]), + CCF = cellular_prevalence / purity) + + return(qq) + +} + +both_samples<-merge_both(result1,best_tree_fileID,tree_structure) + + +write_tsv(both_samples, CCF_table) + + +ssm = function(stat_best_tree, ssm_pre,ssm_to_trees,tree_structure, maf_list){ + + out_res_ssm<-NULL + for ( i in 1:length(stat_best_tree$mut_assignments)){ + + focal<-(stat_best_tree$mut_assignments)[i] + + focal_ssms<-data.frame(sapply(focal, function(x) x[1])) + + colnames(focal_ssms)<-sub("^[^.]*.", "", colnames(focal_ssms)) + focal_ssms$phyloWGS_population<-i + ssm_assign<-merge(ssm_pre,focal_ssms, by.x = "id", by.y = "ssms")[,c("id", "gene","phyloWGS_population")] + ssm_assign_spi<-separate(ssm_assign, col = gene, into = c("Chromosome","Start_Position"), sep = "_", convert = FALSE) %>% + mutate(Start_Position = as.numeric(Start_Position)) + if(chr_prefixed) { + ssm_assign_spi$Chromosome = str_c("chr", as.character(ssm_assign_spi$Chromosome)) + } + + + out_res_ssm<-rbind(ssm_assign_spi,out_res_ssm) + + } ## i loop + + ssm_assign_with_maf <- lapply(maf_list, function(x){ + maf <- read_tsv(x, + col_types = cols(Chromosome = col_character())) %>% + # PhyloWGS changes the start position of deletions. This makes the maf start position match that in the PhyloWGS SSM table. 
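+      # Worked example (hypothetical coordinates, added for illustration): a DEL the
+      # MAF records at Start_Position 1001 is keyed as chr1_1000 in the SSM table, so
+      # the mutate() below shifts the MAF back by 1 for the join and a later step restores it.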
+ mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position - 1, Start_Position)) + maf <- out_res_ssm %>% + left_join(maf, by = c("Chromosome", "Start_Position")) %>% + # Restore the true MAF start postion after the hack above + mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position + 1, Start_Position)) %>% + select(colnames(maf), everything()) + }) + out_res_ssm <- rbindlist(ssm_assign_with_maf) %>% + mutate(clonal_status = case_when( + phyloWGS_population %in% tree_roots & length(tree_roots) > 1 ~ "polyclonal", + phyloWGS_population %in% tree_roots & length(tree_roots) == 1 ~ "clonal", + TRUE ~ "subclonal" + )) + + return(out_res_ssm) + + +} + +ss<-ssm(rr,ssm_pre,ssm_to_trees,tree_structure, mafs) + +write_tsv(ss, ssm_to_trees, na = "") + + +########################################################### +## load mut file to extrcat CNVs start and end positions +########################################################### + +cnv = function(stat_best_tree, cnv_pre,mutation_file,cnv_to_trees){ + out_res_cnv <- + bind_rows(lapply(1:length(stat_best_tree$mut_assignments), function(x) + data.frame(cnvs = stat_best_tree$mut_assignments[[x]]$cnvs) %>% mutate(phyloWGS_population = x))) + + + + #return(out_res_cnv) + out_res_mut<-NULL + for (cn in 1:length(result_mut$cnvs)){ + focal_mut_cnv<-(result_mut$cnvs)[cn] + + focal_mut<-data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1,] + colnames(focal_mut)<-sub("^[^.]*.", "", colnames(focal_mut)) + focal_mut$cnv_id<-names(focal_mut_cnv) + out_res_mut<-bind_rows(focal_mut,out_res_mut) + } ## cn loop + + both_cnvs<-merge(out_res_cnv, out_res_mut, by.x = "cnvs",by.y = "cnv_id") %>% + select(cnvs, phyloWGS_population, physical_cnvs.chrom, + physical_cnvs.start, physical_cnvs.end, + physical_cnvs.major_cn, physical_cnvs.minor_cn, physical_cnvs.cell_prev) + + +} + + +cnv<-cnv(rr, cnv_pre,result_mut,cnv_to_trees) +write.table(cnv, file =cnv_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) + +##################### plot the results #################### +########################################################### +#### Slope chart the best tree, cellular prevalence ####### + +plot_cp<-function(both_samples,cellular_prevalence_plot){ +pdf(cellular_prevalence_plot, width = 8, height =8 ) +plotA<-ggplot(data = both_samples, aes(x = sample, y = cellular_prevalence, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + ylab("Cellular prevalence")+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotA) +dev.off() +} + +plot_cp(both_samples,cellular_prevalence_plot) + +########################################################### +#### Slope chart the best tree, CCF ####### + +plot_cp<-function(both_samples,CCF_plot){ +pdf(CCF_plot, width = 8, height =8 ) +plotB<-ggplot(data = both_samples[both_samples$population != 0, ], aes(x = sample, y = CCF, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree",gsub(".json", 
"",best_tree_fileID), sep = " "))+ + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + ylab("CCF")+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotB) +dev.off() +} + +plot_cp(both_samples,CCF_plot) + + +############################################# +##### Slope chart the best tree (VAF) ####### + + +plot_vaf<-function(ss,VAF_plot){ + pdf(VAF_plot, width = 8, height =8 ) + plotC <- ss %>% + select(Hugo_Symbol, Chromosome, Tumor_Sample_Barcode, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate(VAF = t_alt_count/t_depth, + populations = as.factor(populations)) %>% + filter(!is.na(Tumor_Sample_Barcode)) %>% + ggplot(aes(x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations)) + + geom_line(aes(color = populations), size=0.2, alpha=0.4)+ + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + xlab("Sample") + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3)))+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotC) +dev.off() +} + + + +plot_vaf(ss,VAF_plot) + +############################################################# +##### Slope chart the best tree (VAF, coding regions, nonsense, missense and splicing sites) ####### + +drivers <- read_tsv(driver_genes, col_names = "gene") %>% pull(gene) + +plot_vaf_coding<-function(maf,VAF_coding_plot){ + + coding <- ss %>% + select(Hugo_Symbol, HGVSp_Short, Chromosome, Tumor_Sample_Barcode, Variant_Classification, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate(VAF = t_alt_count/t_depth, + populations = as.factor(populations), + Tumor_Sample_Barcode = factor(Tumor_Sample_Barcode, levels = sample_order)) %>% + filter(!is.na(Tumor_Sample_Barcode), + !Variant_Classification %in% c("Silent", "RNA", "IGR", "Intron", "5'Flank", "3'Flank", "5'UTR")) %>% + mutate(label = ifelse(!is.na(HGVSp_Short), str_c(Hugo_Symbol, "_", HGVSp_Short), str_c(Hugo_Symbol, "_", Variant_Classification))) +pdf(VAF_coding_plot, width = 8, height =8 ) +plotD<-coding %>% + ggplot(aes(x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations)) + + geom_line(aes(color = populations), size=0.5, alpha=0.4)+ + geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[1], Hugo_Symbol %in% drivers), + aes(label = label, + x = Tumor_Sample_Barcode, + y = VAF), + nudge_x = -0.2, + size = 4 + ) + + geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[length(sample_order)], Hugo_Symbol %in% drivers), + aes(label = label, + x = 
Tumor_Sample_Barcode, + y = VAF), + nudge_x = 0.2, + size = 4 + ) + + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + xlab("Sample") + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3)))+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) + +print(plotD) +dev.off() +} + + +plot_vaf_coding(ss,VAF_coding_plot) + +############################################# +##### Draw the best tree ####### +############################################# + + +tree_structure_long <- tree_structure %>% + pivot_longer(everything(), + names_to = "parent", + values_to = "node") %>% + mutate(parent = str_remove_all(parent, ".*[.]")) %>% + distinct() + + +positions_x <- function(parents){ + x <- 1:length(unique(parents)) + names(x) <- unique(parents) + col_vals <- unname(x[parents]) + return(col_vals) +} + +tree_structure_long$x <- positions_x(tree_structure_long$parent) + +positions_y <- function(tree_df){ + y = c("0" = 0.5) + for(parent in unique(tree_df$parent)){ + # parent = "1" + child_index = 1 + num_children <- nrow(tree_df[tree_df$parent == parent,]) + if(num_children == 1){ + child <- tree_df[tree_df$parent == parent,]$node + child_y <- unname(y[parent]) + names(child_y) <- child + y = c(y, child_y) + + } else { + children <- tree_df[tree_df$parent == parent,]$node + y_max <- unname(y[parent]) + (0.25 / child_index) + y_min <- unname(y[parent]) - (0.25 / child_index) + y_range <- seq(y_min, y_max, length.out = length(children)) + names(y_range) <- children + y = c(y, y_range) + } + child_index = child_index + 1 + } + return(y) +} + +tree_structure_long$y <- unname(positions_y(tree_structure_long)[as.character(tree_structure_long$node)]) + +tree_structure_long <- add_row(tree_structure_long, parent = "0", node = 0, x = 0, y = 0.5) + +get_ssms <- function(tree_df, best_focal, best_tree_fileID){ + data <- best_focal[[str_remove_all(best_tree_fileID, "[.].*")]]$populations + ssm_vec <- c() + for(node in tree_df$node){ + # node = "1" + num_ssms <- data[[as.character(node)]]$num_ssms + names(num_ssms) <- as.character(node) + ssm_vec <- c(ssm_vec, num_ssms) + } + return(ssm_vec) +} + +tree_structure_long$num_ssms <- get_ssms(tree_structure_long, best_focal, best_tree_fileID)[as.character(tree_structure_long$node)] + +tree_structure_long <- tree_structure_long %>% + mutate(parent = as.numeric(parent)) %>% + left_join(select(tree_structure_long, node, xstart = x, ystart = y), + by = c("parent" = "node")) + + + +ggplot(tree_structure_long, + aes(x = x, + y = y, + label = node)) + + geom_segment(inherit.aes = FALSE, + aes(x = xstart, + xend = x, + y = ystart, + yend = y)) + + geom_point(aes(size = num_ssms), + fill = "white", + colour = "black", + pch = 21) + + geom_text() + + scale_size(range = c(5,20)) + + ylim(0,1) + + theme_void() + + ggtitle(samplename) + + theme(legend.position = "none") + +ggsave(tree_plot, height = 6, width = 6) + +############ +##### END ## +############ + + + + From ada0c037c1c4d7edfc5267ea4789f47294c4c411 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 22:21:03 -0700 Subject: [PATCH 06/14] Restore files messed up by git --- envs/phylowgs/fill_battenberg.yaml | 
41 ++ envs/phylowgs/phylowgs.yaml | 82 +++ envs/phylowgs/phylowgs_results.yaml | 204 +++++++ .../phylowgs/1.0/envs/bcftools-1.10.2.yaml | 1 + modules/phylowgs/1.0/envs/coreutils-8.31.yaml | 1 + modules/phylowgs/1.0/envs/phylowgs.yaml | 1 + .../phylowgs/1.0/envs/phylowgs_results.yaml | 1 + modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml | 1 + modules/phylowgs/1.0/etc/noncoding.txt | 8 + modules/phylowgs/1.0/schemas/base-1.0.yaml | 1 + .../phylowgs/1.0/schemas/time_point-1.0.yaml | 1 + .../1.0/src/process_phyloWGS_outputs.R | 536 ++++++++++++++++++ 12 files changed, 878 insertions(+) create mode 100644 envs/phylowgs/fill_battenberg.yaml create mode 100644 envs/phylowgs/phylowgs.yaml create mode 100644 envs/phylowgs/phylowgs_results.yaml create mode 120000 modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml create mode 120000 modules/phylowgs/1.0/envs/coreutils-8.31.yaml create mode 120000 modules/phylowgs/1.0/envs/phylowgs.yaml create mode 120000 modules/phylowgs/1.0/envs/phylowgs_results.yaml create mode 120000 modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml create mode 100644 modules/phylowgs/1.0/etc/noncoding.txt create mode 120000 modules/phylowgs/1.0/schemas/base-1.0.yaml create mode 120000 modules/phylowgs/1.0/schemas/time_point-1.0.yaml create mode 100644 modules/phylowgs/1.0/src/process_phyloWGS_outputs.R diff --git a/envs/phylowgs/fill_battenberg.yaml b/envs/phylowgs/fill_battenberg.yaml new file mode 100644 index 00000000..52741e52 --- /dev/null +++ b/envs/phylowgs/fill_battenberg.yaml @@ -0,0 +1,41 @@ +name: fill_segments +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - ca-certificates=2020.12.5 + - certifi=2020.12.5 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libffi=3.3 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libgomp=9.3.0 + - liblapack=3.9.0 + - libopenblas=0.3.12 + - libstdcxx-ng=9.3.0 + - ncurses=6.2 + - numpy=1.19.4 + - openssl=1.1.1i + - pandas=1.2.0 + - pip=20.3.3 + - python=3.9.1 + - python-dateutil=2.8.1 + - python_abi=3.9 + - pytz=2020.5 + - readline=8.0 + - setuptools=49.6.0 + - simplejson=3.17.2 + - six=1.15.0 + - sqlite=3.34.0 + - tk=8.6.10 + - tzdata=2020e + - wheel=0.36.2 + - xz=5.2.5 + - zlib=1.2.11 +prefix: /home/dreval/miniconda3/envs/fill_segments diff --git a/envs/phylowgs/phylowgs.yaml b/envs/phylowgs/phylowgs.yaml new file mode 100644 index 00000000..dc531881 --- /dev/null +++ b/envs/phylowgs/phylowgs.yaml @@ -0,0 +1,82 @@ +name: phylo +channels: + - conda-forge + - defaults + - bioconda + - r +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - blas=1.1 + - ca-certificates=2020.12.5 + - certifi=2019.11.28 + - dbus=1.13.6 + - expat=2.2.10 + - fontconfig=2.13.1 + - freetype=2.10.4 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - glib=2.58.3 + - gsl=2.6 + - gst-plugins-base=1.14.5 + - gstreamer=1.14.5 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - icu=64.2 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libffi=3.2.1 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran=3.0.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libpng=1.6.37 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libuuid=2.32.1 + - libxcb=1.13 + - libxml2=2.9.10 
+ - libxslt=1.1.33 + - lxml=3.8.0 + - mysql-connector-c=6.1.11 + - mysql-python=1.2.5 + - ncurses=6.2 + - numpy=1.15.4 + - openblas=0.3.3 + - openssl=1.1.1j + - pcre=8.44 + - phylowgs=20181105 + - pip=20.1.1 + - pthread-stubs=0.4 + - pyqt=5.9.2 + - python=2.7.15 + - python_abi=2.7 + - pyvcf=0.6.8 + - qt=5.9.7 + - readline=8.0 + - setuptools=44.0.0 + - sip=4.19.8 + - sqlite=3.34.0 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - wheel=0.36.2 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - ete2==2.3.10 + - scipy==0.16.0 +prefix: /home/ssoudi/miniconda3/envs/phylo diff --git a/envs/phylowgs/phylowgs_results.yaml b/envs/phylowgs/phylowgs_results.yaml new file mode 100644 index 00000000..f64126d7 --- /dev/null +++ b/envs/phylowgs/phylowgs_results.yaml @@ -0,0 +1,204 @@ +name: phylowgs_outputs +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.5.30 + - cairo=1.16.0 + - curl=7.77.0 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.19.1 + - ld_impl_linux-64=2.35.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.77.0 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.3 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1k + - pandoc=2.14.0.3 + - pango=1.48.6 + - pcre=8.45 + - pcre2=10.36 + - pixman=0.40.0 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.2.1 + - r-base=4.1.0 + - r-base64enc=0.1_3 + - r-blob=1.2.1 + - r-brio=1.1.2 + - r-broom=0.7.8 + - r-callr=3.7.0 + - r-cellranger=1.1.0 + - r-cli=3.0.0 + - r-clipr=0.7.1 + - r-colorspace=2.0_2 + - r-cpp11=0.3.1 + - r-crayon=1.4.1 + - r-curl=4.3.2 + - r-data.table=1.14.0 + - r-dbi=1.1.1 + - r-dbplyr=2.1.1 + - r-desc=1.3.0 + - r-diffobj=0.3.4 + - r-digest=0.6.27 + - r-dplyr=1.0.7 + - r-dtplyr=1.1.0 + - r-ellipsis=0.3.2 + - r-evaluate=0.14 + - r-fansi=0.4.2 + - r-farver=2.1.0 + - r-forcats=0.5.1 + - r-fs=1.5.0 + - r-gargle=1.1.0 + - r-generics=0.1.0 + - r-ggplot2=3.3.5 + - r-ggrepel=0.9.1 + - r-glue=1.4.2 + - r-googledrive=1.0.1 + - r-googlesheets4=0.3.0 + - r-gtable=0.3.0 + - r-haven=2.4.1 + - r-highr=0.9 + - r-hms=1.1.0 + - r-htmltools=0.5.1.1 + - r-httr=1.4.2 + - r-ids=1.0.1 + - r-isoband=0.2.4 + - r-jsonlite=1.7.2 + - r-knitr=1.33 + - r-labeling=0.4.2 + - r-lattice=0.20_44 + - r-lifecycle=1.0.0 + - r-lubridate=1.7.10 + - r-magrittr=2.0.1 + - r-markdown=1.1 + - r-mass=7.3_54 + - r-matrix=1.3_4 + - r-mgcv=1.8_36 + - r-mime=0.11 + - 
r-modelr=0.1.8 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-openssl=1.4.4 + - r-pillar=1.6.1 + - r-pkgconfig=2.0.3 + - r-pkgload=1.2.1 + - r-plyr=1.8.6 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.5.2 + - r-progress=1.2.2 + - r-ps=1.6.0 + - r-purrr=0.3.4 + - r-r6=2.5.0 + - r-rappdirs=0.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.6 + - r-readr=1.4.0 + - r-readxl=1.3.1 + - r-rematch=1.0.1 + - r-rematch2=2.1.2 + - r-reprex=2.0.0 + - r-reshape2=1.4.4 + - r-rjson=0.2.20 + - r-rlang=0.4.11 + - r-rmarkdown=2.9 + - r-rprojroot=2.0.2 + - r-rstudioapi=0.13 + - r-rvest=1.0.0 + - r-scales=1.1.1 + - r-selectr=0.4_2 + - r-stringi=1.6.2 + - r-stringr=1.4.0 + - r-sys=3.4 + - r-testthat=3.0.3 + - r-tibble=3.1.2 + - r-tidyr=1.1.3 + - r-tidyselect=1.1.1 + - r-tidyverse=1.3.1 + - r-tinytex=0.32 + - r-utf8=1.2.1 + - r-uuid=0.1_4 + - r-vctrs=0.3.8 + - r-viridislite=0.4.0 + - r-waldo=0.2.5 + - r-withr=2.4.2 + - r-xfun=0.24 + - r-xml2=1.3.2 + - r-yaml=2.2.1 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/phylowgs_outputs diff --git a/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml new file mode 120000 index 00000000..72959e7b --- /dev/null +++ b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/coreutils-8.31.yaml b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml new file mode 120000 index 00000000..050452f7 --- /dev/null +++ b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml @@ -0,0 +1 @@ +../../../../envs/coreutils/coreutils-8.31.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/phylowgs.yaml b/modules/phylowgs/1.0/envs/phylowgs.yaml new file mode 120000 index 00000000..6e962c7f --- /dev/null +++ b/modules/phylowgs/1.0/envs/phylowgs.yaml @@ -0,0 +1 @@ +../../../../envs/phylowgs/phylowgs.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/phylowgs_results.yaml b/modules/phylowgs/1.0/envs/phylowgs_results.yaml new file mode 120000 index 00000000..926ec438 --- /dev/null +++ b/modules/phylowgs/1.0/envs/phylowgs_results.yaml @@ -0,0 +1 @@ +../../../../envs/phylowgs/phylowgs_results.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml new file mode 120000 index 00000000..829077c7 --- /dev/null +++ b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml @@ -0,0 +1 @@ +../../../../envs/vcf2maf/vcf2maf-1.6.18.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/etc/noncoding.txt b/modules/phylowgs/1.0/etc/noncoding.txt new file mode 100644 index 00000000..258f27f0 --- /dev/null +++ b/modules/phylowgs/1.0/etc/noncoding.txt @@ -0,0 +1,8 @@ +Hugo_Symbol +Silent +RNA +IGR +Intron +5'Flank +3'Flank +5'UTR diff --git a/modules/phylowgs/1.0/schemas/base-1.0.yaml b/modules/phylowgs/1.0/schemas/base-1.0.yaml new file mode 120000 index 00000000..0a69d1ce --- /dev/null +++ b/modules/phylowgs/1.0/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/schemas/time_point-1.0.yaml 
b/modules/phylowgs/1.0/schemas/time_point-1.0.yaml new file mode 120000 index 00000000..c163d396 --- /dev/null +++ b/modules/phylowgs/1.0/schemas/time_point-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/time_point/time_point-1.0.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R new file mode 100644 index 00000000..587cdb05 --- /dev/null +++ b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R @@ -0,0 +1,536 @@ + + +#' +#' processing phyloWGS outputs pipeline that takes output json files and preprocessing output files +#' SAMPLE_ID.mutass.zip file must be unzipped before runing the script + +#E example: how to run +#mkdir -p output +#Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unziped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out + +################################################## +# load required libraries +################################################## + +# library("optparse") +library("rjson") +library("tidyverse") +library("ggrepel") +library("data.table") + + +########################## +#### Snakemake Input ##### +########################## + +samplename = snakemake@wildcards[["patient_id"]] +json_file = snakemake@input[["summ"]] +trees_out= snakemake@input[["mutass"]] +ssm_file = snakemake@input[["ssms"]] +cnv_file = snakemake@input[["cnvs"]] +mafs = unlist(strsplit(snakemake@params[["maf_list"]], ",")) +mut_file = snakemake@input[["muts"]] +driver_genes = snakemake@params[["drivers"]] +sample_order = unlist(strsplit(snakemake@params[["sample_order"]], ",")) +genome_build = snakemake@wildcards[["genome_build"]] + +# Define the chr_prefix parameter based on the genome_build +chr_prefixed = str_detect(genome_build, "hg") + + +# option_list = list( +# make_option(c("-n", "--samplename"), type="character", default=NULL, help="Samplename of the sample to run", metavar="character"), +# make_option(c("-j", "--json_summ"), type="character", default=NULL, help="SAMPLE_ID.summ.json file generated by phyloWGS", metavar="character"), +# make_option(c("-t", "--trees_out"), type="character", default=NULL, help="Directory containing unzipped XX.mutass.zip trees", metavar="character"), +# make_option(c("-s", "--ssm"), type="character", default=NULL, help="Preprocessing ssm_data.txt output file", metavar="character"), +# make_option(c("-c", "--copynumber"), type="character", default=NULL, help="Preprocessing cnv_data.txt output file", metavar="character"), +# make_option(c("-a", "--tumourA_maf"), type="character", default=NULL, help="Agument maf file of tumour A", metavar="character"), +# make_option(c("-b", "--tumourB_maf"), type="character", default=NULL, help="Agument maf file of tumour B", metavar="character"), +# make_option(c("-m", "--json_muts"), type="character", default=NULL, help="SAMPLE_ID.muts.json file", metavar="character"), +# make_option(c("-o", "--output"), type="character", default=NULL, help="Output directory", metavar="character") +# ) +# +# opt_parser = OptionParser(option_list=option_list) +# opt = parse_args(opt_parser) +# +# samplename = opt$samplename +# json_file = opt$json_summ +# trees_out= opt$trees_out ### directory where unziped SAMPLE_ID.mutass.zip trees are +# ssm_file = opt$ssm +# cnv_file = opt$copynumber +# mafA = opt$tumourA_maf +# mafB = opt$tumourB_maf +# mut_file = opt$json_muts +# 
output_dir = opt$output +# +# +# +# .checkfile = function(infile) { +# +# if (!file.exists(infile)) { +# +# stop(paste("File", infile, "does not exist", sep="")) +# +# } +# +# } +# +# +# .checkfile(json_file) +# .checkfile(ssm_file) +# .checkfile(cnv_file) +# .checkfile(mafA) +# .checkfile(mafB) +# .checkfile(mut_file) + +################################################## +# Process input files +################################################### +# Parse the input file and obtain the required data for this run +result1 <- fromJSON(file = json_file) +result_mut<-fromJSON(file = mut_file) +ssm_pre<-read.table(file = ssm_file, header = TRUE) +cnv_pre<-read.delim(file = cnv_file, header = TRUE)[,c("cnv","a","d")] + + + +################################################## +# define output files +################################################## +out_json_to_Rtable= snakemake@output[["tree_summary"]] +ssm_to_trees= snakemake@output[["maf"]] +cnv_to_trees= snakemake@output[["cnvs"]] +cellular_prevalence_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cellular_prevalence.pdf")) +CCF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cancer_cell_fraction_.pdf")) +VAF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_.pdf")) +VAF_coding_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_coding.pdf")) +tree_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_tree.pdf")) +CCF_table = snakemake@output[["CCF"]] + +if(!dir.exists(snakemake@output[["plots"]])){dir.create(snakemake@output[["plots"]])} + +# out_json_to_Rtable= file.path(output_dir, paste("out_res_",samplename,"_json_converted_toR.table", sep = "")) +# ssm_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_ssms_to_best_tree_maf_format.table", sep = "")) +# cnv_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_cnvs_to_best_tree_maf_format.table", sep = "")) +# cellular_prevalence_plot= file.path(output_dir, paste("cellular_prevalence_",samplename,".pdf", sep = "")) +# CCF_plot= file.path(output_dir, paste("cancer_cell_fraction_",samplename,".pdf", sep = "")) +# VAF_plot= file.path(output_dir, paste("vaf_",samplename,".pdf", sep = "")) +# VAF_coding_plot= file.path(output_dir, paste("vaf_ccoding",samplename,".pdf", sep = "")) + + +################################################### +# open summ.json file and convert it into humam readable format +################################################### + +#this function opens SAMPLE_ID_summ.jason and converts it into R table +open_tree = function(json_summ_file,out_json_to_Rtable){ + + out_res<-NULL + for (j in 1:length(json_summ_file[["trees"]])){ + + tree_focal<-json_summ_file[["trees"]][j] + tree_focal_statA<-as.data.frame(t(unlist(sapply(tree_focal,function(x)x[c("clustering_index","branching_index","llh","linearity_index")])))) + colnames(tree_focal_statA)<-c("clustering_index","branching_index","llh","linearity_index") + tree_focal_statA$tree_id<-j-1 + rownames(tree_focal_statA)<-NULL + + + tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,-c(3,6,9,12,15,18,21,24,27,30)] + #tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))] + colnames(tree_focal_statB)<-sub("^[^.]*.", "", colnames(tree_focal_statB)) + stat_both<-cbind(tree_focal_statA,tree_focal_statB) + out_res<-bind_rows(stat_both,out_res) + out_res_ordered<-out_res[order(out_res$tree_id),] + } # for j loop + 
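+# Assumed summ.json shape (a sketch, not checked against the PhyloWGS spec):
+# "tree_densities" maps tree index to posterior density, e.g. list("0" = -1234.5,
+# "1" = -1240.2); unlist() flattens it so the index can be recovered from the names.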
+density<-json_summ_file["tree_densities"] +density_unlist<-data.frame("density"=unlist(density)) +row.names(density_unlist)<-sub("^[^.]*.", "", row.names(density_unlist)) + +density_unlist$tree_id<-row.names(density_unlist) +row.names(density_unlist)<-NULL + +final_table=merge(out_res_ordered,density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to all tress table +write.table(final_table, file =out_json_to_Rtable ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) +return(final_table) +} + + +result_tree<-open_tree(result1,out_json_to_Rtable) + + +################################################### +# extrcats the best tree +################################################### +#the best tree is the tree with the highest density + +best_tree_id = function(R_table, density) { + best=R_table[which.max(R_table$density),] + best_tree_focal_name<-best$tree_id + best_tree_id<-paste(best$tree_id,"json",sep = ".") + return(best_tree_id) + return(best_tree_focal_name) +} +best_tree_fileID<-best_tree_id(result_tree, density) + + +####################################################################### +# extract the stats (SNvs and CNVs assigned to each population) from the best tree +####################################################################### + +open_best_tree = function(trees_out,best_tree_id){ + unzip(trees_out, files = best_tree_id, exdir = dirname(trees_out), overwrite = TRUE) + best_tree_path = paste0(dirname(trees_out), "/", best_tree_id) + rr <- fromJSON(file = best_tree_path) + return(rr) +} +rr= open_best_tree(trees_out,best_tree_fileID) + + +####################################################################### +# annotate point mutations and CNVs in the best tree +####################################################################### +best_focal<-result1[["trees"]][as.numeric(gsub(".json","",best_tree_fileID))+1] +tree_structure<-as.data.frame(sapply(best_focal,function(x)x["structure"])) ##[6] +tree_roots <- best_focal[[1]]$structure$`0` + + +merge_both<-function(result1,best_tree_fileID,tree_structure){ + best_tree<-as.numeric(gsub(".json","",best_tree_fileID)) + best_focal<-result1[["trees"]][best_tree+1] + tree_focal_statB<-as.data.frame(sapply(best_focal,function(x)x["populations"])) ##[3] + qq<-tree_focal_statB[,grep("cellular_prevalence",colnames(tree_focal_statB))] %>% + rownames_to_column("sample") %>% + pivot_longer(-sample, + names_to = "population", + values_to = "cellular_prevalence") %>% + mutate(population = str_remove(str_remove(population, ".*populations[.]"), "[.]cellular_prevalence")) %>% + mutate(is_root = ifelse(population %in% tree_roots, TRUE, FALSE)) %>% + group_by(sample) %>% + mutate(purity = sum(cellular_prevalence[is_root]), + CCF = cellular_prevalence / purity) + + return(qq) + +} + +both_samples<-merge_both(result1,best_tree_fileID,tree_structure) + + +write_tsv(both_samples, CCF_table) + + +ssm = function(stat_best_tree, ssm_pre,ssm_to_trees,tree_structure, maf_list){ + + out_res_ssm<-NULL + for ( i in 1:length(stat_best_tree$mut_assignments)){ + + focal<-(stat_best_tree$mut_assignments)[i] + + focal_ssms<-data.frame(sapply(focal, function(x) x[1])) + + colnames(focal_ssms)<-sub("^[^.]*.", "", colnames(focal_ssms)) + focal_ssms$phyloWGS_population<-i + ssm_assign<-merge(ssm_pre,focal_ssms, by.x = "id", by.y = "ssms")[,c("id", "gene","phyloWGS_population")] + ssm_assign_spi<-separate(ssm_assign, col = gene, into = c("Chromosome","Start_Position"), sep = "_", convert = FALSE) %>% + mutate(Start_Position = 
as.numeric(Start_Position)) + if(chr_prefixed) { + ssm_assign_spi$Chromosome = str_c("chr", as.character(ssm_assign_spi$Chromosome)) + } + + + out_res_ssm<-rbind(ssm_assign_spi,out_res_ssm) + + } ## i loop + + ssm_assign_with_maf <- lapply(maf_list, function(x){ + maf <- read_tsv(x, + col_types = cols(Chromosome = col_character())) %>% + # PhyloWGS changes the start position of deletions. This makes the maf start position match that in the PhyloWGS SSM table. + mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position - 1, Start_Position)) + maf <- out_res_ssm %>% + left_join(maf, by = c("Chromosome", "Start_Position")) %>% + # Restore the true MAF start postion after the hack above + mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position + 1, Start_Position)) %>% + select(colnames(maf), everything()) + }) + out_res_ssm <- rbindlist(ssm_assign_with_maf) %>% + mutate(clonal_status = case_when( + phyloWGS_population %in% tree_roots & length(tree_roots) > 1 ~ "polyclonal", + phyloWGS_population %in% tree_roots & length(tree_roots) == 1 ~ "clonal", + TRUE ~ "subclonal" + )) + + return(out_res_ssm) + + +} + +ss<-ssm(rr,ssm_pre,ssm_to_trees,tree_structure, mafs) + +write_tsv(ss, ssm_to_trees, na = "") + + +########################################################### +## load mut file to extrcat CNVs start and end positions +########################################################### + +cnv = function(stat_best_tree, cnv_pre,mutation_file,cnv_to_trees){ + out_res_cnv <- + bind_rows(lapply(1:length(stat_best_tree$mut_assignments), function(x) + data.frame(cnvs = stat_best_tree$mut_assignments[[x]]$cnvs) %>% mutate(phyloWGS_population = x))) + + + + #return(out_res_cnv) + out_res_mut<-NULL + for (cn in 1:length(result_mut$cnvs)){ + focal_mut_cnv<-(result_mut$cnvs)[cn] + + focal_mut<-data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1,] + colnames(focal_mut)<-sub("^[^.]*.", "", colnames(focal_mut)) + focal_mut$cnv_id<-names(focal_mut_cnv) + out_res_mut<-bind_rows(focal_mut,out_res_mut) + } ## cn loop + + both_cnvs<-merge(out_res_cnv, out_res_mut, by.x = "cnvs",by.y = "cnv_id") %>% + select(cnvs, phyloWGS_population, physical_cnvs.chrom, + physical_cnvs.start, physical_cnvs.end, + physical_cnvs.major_cn, physical_cnvs.minor_cn, physical_cnvs.cell_prev) + + +} + + +cnv<-cnv(rr, cnv_pre,result_mut,cnv_to_trees) +write.table(cnv, file =cnv_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) + +##################### plot the results #################### +########################################################### +#### Slope chart the best tree, cellular prevalence ####### + +plot_cp<-function(both_samples,cellular_prevalence_plot){ +pdf(cellular_prevalence_plot, width = 8, height =8 ) +plotA<-ggplot(data = both_samples, aes(x = sample, y = cellular_prevalence, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + ylab("Cellular prevalence")+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotA) 
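+# ggplot objects are not auto-printed inside a function, hence the explicit print();
+# dev.off() then closes the PDF device opened by pdf() above and flushes the file.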
+dev.off() +} + +plot_cp(both_samples,cellular_prevalence_plot) + +########################################################### +#### Slope chart the best tree, CCF ####### + +plot_cp<-function(both_samples,CCF_plot){ +pdf(CCF_plot, width = 8, height =8 ) +plotB<-ggplot(data = both_samples[both_samples$population != 0, ], aes(x = sample, y = CCF, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + ylab("CCF")+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotB) +dev.off() +} + +plot_cp(both_samples,CCF_plot) + + +############################################# +##### Slope chart the best tree (VAF) ####### + + +plot_vaf<-function(ss,VAF_plot){ + pdf(VAF_plot, width = 8, height =8 ) + plotC <- ss %>% + select(Hugo_Symbol, Chromosome, Tumor_Sample_Barcode, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate(VAF = t_alt_count/t_depth, + populations = as.factor(populations)) %>% + filter(!is.na(Tumor_Sample_Barcode)) %>% + ggplot(aes(x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations)) + + geom_line(aes(color = populations), size=0.2, alpha=0.4)+ + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + xlab("Sample") + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3)))+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotC) +dev.off() +} + + + +plot_vaf(ss,VAF_plot) + +############################################################# +##### Slope chart the best tree (VAF, coding regions, nonsense, missense and splicing sites) ####### + +drivers <- read_tsv(driver_genes, col_names = "gene") %>% pull(gene) + +plot_vaf_coding<-function(maf,VAF_coding_plot){ + + coding <- ss %>% + select(Hugo_Symbol, HGVSp_Short, Chromosome, Tumor_Sample_Barcode, Variant_Classification, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate(VAF = t_alt_count/t_depth, + populations = as.factor(populations), + Tumor_Sample_Barcode = factor(Tumor_Sample_Barcode, levels = sample_order)) %>% + filter(!is.na(Tumor_Sample_Barcode), + !Variant_Classification %in% c("Silent", "RNA", "IGR", "Intron", "5'Flank", "3'Flank", "5'UTR")) %>% + mutate(label = ifelse(!is.na(HGVSp_Short), str_c(Hugo_Symbol, "_", HGVSp_Short), str_c(Hugo_Symbol, "_", Variant_Classification))) +pdf(VAF_coding_plot, width = 8, height =8 ) +plotD<-coding %>% + ggplot(aes(x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations)) + + geom_line(aes(color = populations), size=0.5, alpha=0.4)+ + 
geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[1], Hugo_Symbol %in% drivers), + aes(label = label, + x = Tumor_Sample_Barcode, + y = VAF), + nudge_x = -0.2, + size = 4 + ) + + geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[length(sample_order)], Hugo_Symbol %in% drivers), + aes(label = label, + x = Tumor_Sample_Barcode, + y = VAF), + nudge_x = 0.2, + size = 4 + ) + + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + xlab("Sample") + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3)))+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) + +print(plotD) +dev.off() +} + + +plot_vaf_coding(ss,VAF_coding_plot) + +############################################# +##### Draw the best tree ####### +############################################# + + +tree_structure_long <- tree_structure %>% + pivot_longer(everything(), + names_to = "parent", + values_to = "node") %>% + mutate(parent = str_remove_all(parent, ".*[.]")) %>% + distinct() + + +positions_x <- function(parents){ + x <- 1:length(unique(parents)) + names(x) <- unique(parents) + col_vals <- unname(x[parents]) + return(col_vals) +} + +tree_structure_long$x <- positions_x(tree_structure_long$parent) + +positions_y <- function(tree_df){ + y = c("0" = 0.5) + for(parent in unique(tree_df$parent)){ + # parent = "1" + child_index = 1 + num_children <- nrow(tree_df[tree_df$parent == parent,]) + if(num_children == 1){ + child <- tree_df[tree_df$parent == parent,]$node + child_y <- unname(y[parent]) + names(child_y) <- child + y = c(y, child_y) + + } else { + children <- tree_df[tree_df$parent == parent,]$node + y_max <- unname(y[parent]) + (0.25 / child_index) + y_min <- unname(y[parent]) - (0.25 / child_index) + y_range <- seq(y_min, y_max, length.out = length(children)) + names(y_range) <- children + y = c(y, y_range) + } + child_index = child_index + 1 + } + return(y) +} + +tree_structure_long$y <- unname(positions_y(tree_structure_long)[as.character(tree_structure_long$node)]) + +tree_structure_long <- add_row(tree_structure_long, parent = "0", node = 0, x = 0, y = 0.5) + +get_ssms <- function(tree_df, best_focal, best_tree_fileID){ + data <- best_focal[[str_remove_all(best_tree_fileID, "[.].*")]]$populations + ssm_vec <- c() + for(node in tree_df$node){ + # node = "1" + num_ssms <- data[[as.character(node)]]$num_ssms + names(num_ssms) <- as.character(node) + ssm_vec <- c(ssm_vec, num_ssms) + } + return(ssm_vec) +} + +tree_structure_long$num_ssms <- get_ssms(tree_structure_long, best_focal, best_tree_fileID)[as.character(tree_structure_long$node)] + +tree_structure_long <- tree_structure_long %>% + mutate(parent = as.numeric(parent)) %>% + left_join(select(tree_structure_long, node, xstart = x, ystart = y), + by = c("parent" = "node")) + + + +ggplot(tree_structure_long, + aes(x = x, + y = y, + label = node)) + + geom_segment(inherit.aes = FALSE, + aes(x = xstart, + xend = x, + y = ystart, + yend = y)) + + geom_point(aes(size = num_ssms), + fill = "white", + colour = 
"black", + pch = 21) + + geom_text() + + scale_size(range = c(5,20)) + + ylim(0,1) + + theme_void() + + ggtitle(samplename) + + theme(legend.position = "none") + +ggsave(tree_plot, height = 6, width = 6) + +############ +##### END ## +############ + + + + From d5af90f6bfe97556a2199122cc47ea9687fa0e77 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 22:51:32 -0700 Subject: [PATCH 07/14] Add input job grouping --- modules/phylowgs/1.0/phylowgs.smk | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/phylowgs/1.0/phylowgs.smk b/modules/phylowgs/1.0/phylowgs.smk index 1cdba641..1ceb3110 100644 --- a/modules/phylowgs/1.0/phylowgs.smk +++ b/modules/phylowgs/1.0/phylowgs.smk @@ -154,6 +154,7 @@ rule _phylowgs_input_maf: maf = CFG["inputs"]["maf"], output: maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.maf", + group: "input_maf" run: op.absolute_symlink(input.maf, output.maf) @@ -165,6 +166,7 @@ rule _phylowgs_input_battenberg: output: cellularity = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.cellularity_ploidy.txt", subclones = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.subclones.txt" + group: "input_battenberg" run: op.absolute_symlink(input.cellularity, output.cellularity) op.absolute_symlink(input.subclones, output.subclones) @@ -187,6 +189,7 @@ rule _phylowgs_parse_battenberg: CFG["threads"]["create_inputs"] resources: **CFG["resources"]["create_inputs"] + group: "input_battenberg" shell: op.as_one_line(""" cellularity=$(tail -n +2 {input.cellularity} | cut -f 1); @@ -203,6 +206,7 @@ rule _phylowgs_maf_to_vcf: vcf = temp(CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf") conda: CFG["conda_envs"]["vcf2maf"] + group: "input_maf" shell: op.as_one_line(""" maf2vcf.pl --input-maf {input.maf} --output-dir $(dirname {output.vcf}) --output-vcf {output.vcf} --ref-fasta {input.fasta} From 7030305ad3492bd9fb13537d5ddcc086a8c64fc4 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 22:53:17 -0700 Subject: [PATCH 08/14] Note about scratch subdirectories --- modules/phylowgs/1.0/config/default.yaml | 31 ++++++++++++------------ 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/modules/phylowgs/1.0/config/default.yaml b/modules/phylowgs/1.0/config/default.yaml index e8e0c1e4..f1557fa1 100644 --- a/modules/phylowgs/1.0/config/default.yaml +++ b/modules/phylowgs/1.0/config/default.yaml @@ -1,27 +1,27 @@ lcr-modules: - + phylowgs: inputs: # Available wildcards: {tumour_id} {normal_id} {pair_status} {genome_build} {sample_id} - maf: "__UPDATE__" + maf: "__UPDATE__" cellularity: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt subclones: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt drivers: "__UPDATE__" # newline-separated list of driver gene HUGO symbols to be included on plots - scratch_subdirectories: [] + scratch_subdirectories: [] # Recommended: "04-multievolve" options: - create_inputs: - opts: - "-s 5000 --verbose --regions all" # -s controls how many variants should be sub-sampled. + create_inputs: + opts: + "-s 5000 --verbose --regions all" # -s controls how many variants should be sub-sampled. 
multievolve: "" write_results: "--include-ssm-names" - switches: - # NOTE: You must include a "sex" column in the input samples table, formatted with "M" and "F". - # If patient sex is unknown, you can leave it empty and phyloWGS will run in "auto" mode. - sex: + switches: + # NOTE: You must include a "sex" column in the input samples table, formatted with "M" and "F". + # If patient sex is unknown, you can leave it empty and phyloWGS will run in "auto" mode. + sex: M: "male" F: "female" @@ -33,24 +33,24 @@ lcr-modules: bcftools: "{MODSDIR}/envs/bcftools-1.10.2.yaml" coreutils: "{MODSDIR}/envs/coreutils-8.31.yaml" - scripts: + scripts: fill_battenberg: "{MODSDIR}/src/fill_battenberg.py" arm_file: "{MODSDIR}/etc/chromArmFiles/chromArm.{genome_build}.tsv" process_outputs: "src/process_phyloWGS_outputs.R" noncoding: "{MODSDIR}/etc/noncoding.txt" - + threads: create_inputs: 1 multievolve: 4 write_results: 1 resources: - create_inputs: + create_inputs: mem_mb: 10000 - multievolve: + multievolve: mem_mb: 40000 evolve: 1 - write_results: + write_results: mem_mb: 20000 pairing_config: @@ -58,4 +58,3 @@ lcr-modules: run_paired_tumours: True run_unpaired_tumours_with: None run_paired_tumours_as_unpaired: False - \ No newline at end of file From 59159c0efa6f4d0247f4e6eb909f36a56d3e6c91 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Fri, 3 Mar 2023 15:36:15 -0800 Subject: [PATCH 09/14] Unneeded env yaml file --- envs/phylowgs/fill_battenberg.yaml | 41 ------------------------------ 1 file changed, 41 deletions(-) delete mode 100644 envs/phylowgs/fill_battenberg.yaml diff --git a/envs/phylowgs/fill_battenberg.yaml b/envs/phylowgs/fill_battenberg.yaml deleted file mode 100644 index 52741e52..00000000 --- a/envs/phylowgs/fill_battenberg.yaml +++ /dev/null @@ -1,41 +0,0 @@ -name: fill_segments -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1 - - _openmp_mutex=4.5 - - ca-certificates=2020.12.5 - - certifi=2020.12.5 - - ld_impl_linux-64=2.35.1 - - libblas=3.9.0 - - libcblas=3.9.0 - - libffi=3.3 - - libgcc-ng=9.3.0 - - libgfortran-ng=9.3.0 - - libgfortran5=9.3.0 - - libgomp=9.3.0 - - liblapack=3.9.0 - - libopenblas=0.3.12 - - libstdcxx-ng=9.3.0 - - ncurses=6.2 - - numpy=1.19.4 - - openssl=1.1.1i - - pandas=1.2.0 - - pip=20.3.3 - - python=3.9.1 - - python-dateutil=2.8.1 - - python_abi=3.9 - - pytz=2020.5 - - readline=8.0 - - setuptools=49.6.0 - - simplejson=3.17.2 - - six=1.15.0 - - sqlite=3.34.0 - - tk=8.6.10 - - tzdata=2020e - - wheel=0.36.2 - - xz=5.2.5 - - zlib=1.2.11 -prefix: /home/dreval/miniconda3/envs/fill_segments From 869a266ef4670d1e312f66208ba3faf5cdeec192 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Fri, 3 Mar 2023 15:39:45 -0800 Subject: [PATCH 10/14] Add dependency checking --- modules/phylowgs/1.0/phylowgs.smk | 42 ++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/modules/phylowgs/1.0/phylowgs.smk b/modules/phylowgs/1.0/phylowgs.smk index 1ceb3110..39a6b990 100644 --- a/modules/phylowgs/1.0/phylowgs.smk +++ b/modules/phylowgs/1.0/phylowgs.smk @@ -16,7 +16,47 @@ import oncopipe as op import hashlib import glob - +import inspect + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. 
Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version")
+
+# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe
+
+current_version = pkg_resources.get_distribution("oncopipe").version
+if version.parse(current_version) < version.parse(min_oncopipe_version):
+    logger.warning(
+                '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m'
+                )
+    sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)")
+
+# End of dependency checking section

 # Setup module and store module-specific configuration in `CFG`
 # `CFG` is a shortcut to `config["lcr-modules"]["phylowgs"]`

From 9e3161607ec147040474d81f738138612b59a21e Mon Sep 17 00:00:00 2001
From: lkhilton
Date: Wed, 12 Apr 2023 15:58:26 -0700
Subject: [PATCH 11/14] Add 3'UTR

---
 modules/phylowgs/1.0/etc/noncoding.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/phylowgs/1.0/etc/noncoding.txt b/modules/phylowgs/1.0/etc/noncoding.txt
index 258f27f0..c5fd789c 100644
--- a/modules/phylowgs/1.0/etc/noncoding.txt
+++ b/modules/phylowgs/1.0/etc/noncoding.txt
@@ -6,3 +6,4 @@ Intron
 5'Flank
 3'Flank
 5'UTR
+3'UTR

From e9803de4ce9ef717b54272231fe035f924e08331 Mon Sep 17 00:00:00 2001
From: lkhilton
Date: Wed, 12 Apr 2023 15:58:46 -0700
Subject: [PATCH 12/14] Patch for GAMBLR installation

---
 modules/pyclone_vi/1.0/pyclone_vi.smk | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/modules/pyclone_vi/1.0/pyclone_vi.smk b/modules/pyclone_vi/1.0/pyclone_vi.smk
index 3a038b61..a882ebd1 100644
--- a/modules/pyclone_vi/1.0/pyclone_vi.smk
+++ b/modules/pyclone_vi/1.0/pyclone_vi.smk
@@ -45,7 +45,10 @@ f = open("config/envs/GAMBLR.yaml", 'rb')
 md5hash.update(f.read())
 f.close()
 h = md5hash.hexdigest()
-GAMBLR = glob.glob(conda_prefix + "/" + h[:8] + "*")[0]
+GAMBLR = glob.glob(conda_prefix + "/" + h[:8] + "*")
+for file in GAMBLR:
+    if os.path.isdir(file):
+        GAMBLR = file

 rule _pyclone_vi_install_GAMBLR:
     params:
@@ -369,6 +372,16 @@ if isinstance(PATIENTS_GENOMES, pd.DataFrame) and isinstance(PATIENTS_CAPTURE, p
 rule _pyclone_vi_all:
     input:
+        expand(
+            rules._pyclone_vi_input_maf.output.maf,
+            zip,
+            tumour_id = 
CFG["runs"]["tumour_sample_id"], + normal_id = CFG["runs"]["normal_sample_id"], + pair_status = CFG["runs"]["pair_status"], + seq_type = CFG["runs"]["tumour_seq_type"], + genome_build = CFG["runs"]["tumour_genome_build"], + patient_id = CFG["runs"]["tumour_patient_id"] + ), expand( [ str(rules._pyclone_vi_output_tsv.output.phyclone), From 0cec80e90f26dc3fba7d9e44771ce1c73ab6c3ed Mon Sep 17 00:00:00 2001 From: lkhilton Date: Wed, 12 Apr 2023 15:59:07 -0700 Subject: [PATCH 13/14] Include indels in maf --- modules/pyclone_vi/1.0/src/build_input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/pyclone_vi/1.0/src/build_input.py b/modules/pyclone_vi/1.0/src/build_input.py index f412fd6c..48170bab 100644 --- a/modules/pyclone_vi/1.0/src/build_input.py +++ b/modules/pyclone_vi/1.0/src/build_input.py @@ -96,7 +96,7 @@ def get_normal_cn(chrom, sex): def load_snv_df(file_name, sample_id="tumour"): df = pd.read_csv(file_name, sep="\t") # PyClone only works on SNPs, not InDels - df = df[df["Variant_Type"].isin(["SNP"])] + # df = df[df["Variant_Type"].isin(["SNP"])] # Ignore intergenic mutations (IGR) df = df[~df["Variant_Classification"].isin(["IGR"])] # Acutally I can't do this sub-sampling here because the mutations in all files for all tumours From d484a11772706825d65081b356563c508c08ad84 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Wed, 12 Jun 2024 11:11:55 -0700 Subject: [PATCH 14/14] Implement suggested changes from PR --- modules/phylowgs/1.0/config/default.yaml | 4 +- .../1.0/src/process_phyloWGS_outputs.R | 679 +++++++++--------- modules/pyclone_vi/1.0/config/default.yaml | 6 +- modules/pyclone_vi/1.0/pyclone_vi.smk | 10 +- 4 files changed, 337 insertions(+), 362 deletions(-) diff --git a/modules/phylowgs/1.0/config/default.yaml b/modules/phylowgs/1.0/config/default.yaml index f1557fa1..ff6a3332 100644 --- a/modules/phylowgs/1.0/config/default.yaml +++ b/modules/phylowgs/1.0/config/default.yaml @@ -5,8 +5,8 @@ lcr-modules: inputs: # Available wildcards: {tumour_id} {normal_id} {pair_status} {genome_build} {sample_id} maf: "__UPDATE__" - cellularity: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt - subclones: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt + cellularity: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt + subclones: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt drivers: "__UPDATE__" # newline-separated list of driver gene HUGO symbols to be included on plots scratch_subdirectories: [] # Recommended: "04-multievolve" diff --git a/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R index 587cdb05..ae57c9ab 100644 --- a/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R +++ b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R @@ -1,12 +1,10 @@ - - #' #' processing phyloWGS outputs pipeline that takes output json files and preprocessing output files #' SAMPLE_ID.mutass.zip file must be unzipped before runing the script -#E example: how to run -#mkdir -p output -#Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unziped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out +# E example: how 
to run +# mkdir -p output +# Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unziped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out ################################################## # load required libraries @@ -23,91 +21,48 @@ library("data.table") #### Snakemake Input ##### ########################## -samplename = snakemake@wildcards[["patient_id"]] -json_file = snakemake@input[["summ"]] -trees_out= snakemake@input[["mutass"]] -ssm_file = snakemake@input[["ssms"]] -cnv_file = snakemake@input[["cnvs"]] -mafs = unlist(strsplit(snakemake@params[["maf_list"]], ",")) -mut_file = snakemake@input[["muts"]] -driver_genes = snakemake@params[["drivers"]] -sample_order = unlist(strsplit(snakemake@params[["sample_order"]], ",")) -genome_build = snakemake@wildcards[["genome_build"]] +samplename <- snakemake@wildcards[["patient_id"]] +json_file <- snakemake@input[["summ"]] +trees_out <- snakemake@input[["mutass"]] +ssm_file <- snakemake@input[["ssms"]] +cnv_file <- snakemake@input[["cnvs"]] +mafs <- unlist(strsplit(snakemake@params[["maf_list"]], ",")) +mut_file <- snakemake@input[["muts"]] +driver_genes <- snakemake@params[["drivers"]] +sample_order <- unlist(strsplit(snakemake@params[["sample_order"]], ",")) +genome_build <- snakemake@wildcards[["genome_build"]] # Define the chr_prefix parameter based on the genome_build -chr_prefixed = str_detect(genome_build, "hg") - - -# option_list = list( -# make_option(c("-n", "--samplename"), type="character", default=NULL, help="Samplename of the sample to run", metavar="character"), -# make_option(c("-j", "--json_summ"), type="character", default=NULL, help="SAMPLE_ID.summ.json file generated by phyloWGS", metavar="character"), -# make_option(c("-t", "--trees_out"), type="character", default=NULL, help="Directory containing unzipped XX.mutass.zip trees", metavar="character"), -# make_option(c("-s", "--ssm"), type="character", default=NULL, help="Preprocessing ssm_data.txt output file", metavar="character"), -# make_option(c("-c", "--copynumber"), type="character", default=NULL, help="Preprocessing cnv_data.txt output file", metavar="character"), -# make_option(c("-a", "--tumourA_maf"), type="character", default=NULL, help="Agument maf file of tumour A", metavar="character"), -# make_option(c("-b", "--tumourB_maf"), type="character", default=NULL, help="Agument maf file of tumour B", metavar="character"), -# make_option(c("-m", "--json_muts"), type="character", default=NULL, help="SAMPLE_ID.muts.json file", metavar="character"), -# make_option(c("-o", "--output"), type="character", default=NULL, help="Output directory", metavar="character") -# ) -# -# opt_parser = OptionParser(option_list=option_list) -# opt = parse_args(opt_parser) -# -# samplename = opt$samplename -# json_file = opt$json_summ -# trees_out= opt$trees_out ### directory where unziped SAMPLE_ID.mutass.zip trees are -# ssm_file = opt$ssm -# cnv_file = opt$copynumber -# mafA = opt$tumourA_maf -# mafB = opt$tumourB_maf -# mut_file = opt$json_muts -# output_dir = opt$output -# -# -# -# .checkfile = function(infile) { -# -# if (!file.exists(infile)) { -# -# stop(paste("File", infile, "does not exist", sep="")) -# -# } -# -# } -# -# -# .checkfile(json_file) -# .checkfile(ssm_file) -# .checkfile(cnv_file) -# .checkfile(mafA) -# .checkfile(mafB) -# .checkfile(mut_file) +chr_prefixed <- str_detect(genome_build, "hg") + 
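+# e.g. str_detect("hg38", "hg") is TRUE, so SSM chromosome names are later given
+# a "chr" prefix in ssm(), while a build like "grch37" yields FALSE and the
+# chromosome names are left unprefixed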
##################################################
# Process input files
###################################################

# Parse the input file and obtain the required data for this run
 result1 <- fromJSON(file = json_file)
-result_mut<-fromJSON(file = mut_file)
-ssm_pre<-read.table(file = ssm_file, header = TRUE)
-cnv_pre<-read.delim(file = cnv_file, header = TRUE)[,c("cnv","a","d")]
+result_mut <- fromJSON(file = mut_file)
+ssm_pre <- read.table(file = ssm_file, header = TRUE)
+cnv_pre <- read.delim(file = cnv_file, header = TRUE)[, c("cnv", "a", "d")]
 
 ##################################################
 # define output files
 ##################################################
 
-out_json_to_Rtable= snakemake@output[["tree_summary"]]
-ssm_to_trees= snakemake@output[["maf"]]
-cnv_to_trees= snakemake@output[["cnvs"]]
-cellular_prevalence_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cellular_prevalence.pdf"))
-CCF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cancer_cell_fraction_.pdf"))
-VAF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_.pdf"))
-VAF_coding_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_coding.pdf"))
-tree_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_tree.pdf"))
-CCF_table = snakemake@output[["CCF"]]
-
-if(!dir.exists(snakemake@output[["plots"]])){dir.create(snakemake@output[["plots"]])}
+out_json_to_Rtable <- snakemake@output[["tree_summary"]]
+ssm_to_trees <- snakemake@output[["maf"]]
+cnv_to_trees <- snakemake@output[["cnvs"]]
+cellular_prevalence_plot <- file.path(snakemake@output[["plots"]], paste0(samplename, "_cellular_prevalence.pdf"))
+CCF_plot <- file.path(snakemake@output[["plots"]], paste0(samplename, "_cancer_cell_fraction_.pdf"))
+VAF_plot <- file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_.pdf"))
+VAF_coding_plot <- file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_coding.pdf"))
+tree_plot <- file.path(snakemake@output[["plots"]], paste0(samplename, "_tree.pdf"))
+CCF_table <- snakemake@output[["CCF"]]
+
+if (!dir.exists(snakemake@output[["plots"]])) {
+  dir.create(snakemake@output[["plots"]])
+}
 
 # out_json_to_Rtable= file.path(output_dir, paste("out_res_",samplename,"_json_converted_toR.table", sep = ""))
 # ssm_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_ssms_to_best_tree_maf_format.table", sep = ""))
@@ -122,151 +77,147 @@
 # open summ.json file and convert it into human readable format
 ###################################################
 
-#this function opens SAMPLE_ID_summ.jason and converts it into R table
-open_tree = function(json_summ_file,out_json_to_Rtable){
-
-  out_res<-NULL
-  for (j in 1:length(json_summ_file[["trees"]])){
-
-    tree_focal<-json_summ_file[["trees"]][j]
-    tree_focal_statA<-as.data.frame(t(unlist(sapply(tree_focal,function(x)x[c("clustering_index","branching_index","llh","linearity_index")]))))
-    colnames(tree_focal_statA)<-c("clustering_index","branching_index","llh","linearity_index")
-    tree_focal_statA$tree_id<-j-1
-    rownames(tree_focal_statA)<-NULL
-
-
-    tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,-c(3,6,9,12,15,18,21,24,27,30)]
-    #tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))]
-    colnames(tree_focal_statB)<-sub("^[^.]*.", "", colnames(tree_focal_statB))
-    
stat_both<-cbind(tree_focal_statA,tree_focal_statB)
-    out_res<-bind_rows(stat_both,out_res)
-    out_res_ordered<-out_res[order(out_res$tree_id),]
-  } # for j loop
-
-density<-json_summ_file["tree_densities"]
-density_unlist<-data.frame("density"=unlist(density))
-row.names(density_unlist)<-sub("^[^.]*.", "", row.names(density_unlist))
-
-density_unlist$tree_id<-row.names(density_unlist)
-row.names(density_unlist)<-NULL
-
-final_table=merge(out_res_ordered,density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to all tress table
-write.table(final_table, file =out_json_to_Rtable ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE)
-return(final_table)
+# this function opens SAMPLE_ID.summ.json and converts it into an R table
+open_tree <- function(json_summ_file, out_json_to_Rtable) {
+  out_res <- NULL
+  for (j in 1:length(json_summ_file[["trees"]])) {
+    tree_focal <- json_summ_file[["trees"]][j]
+    tree_focal_statA <- as.data.frame(t(unlist(sapply(tree_focal, function(x) x[c("clustering_index", "branching_index", "llh", "linearity_index")]))))
+    colnames(tree_focal_statA) <- c("clustering_index", "branching_index", "llh", "linearity_index")
+    tree_focal_statA$tree_id <- j - 1
+    rownames(tree_focal_statA) <- NULL
+
+
+    tree_focal_statB <- as.data.frame(sapply(tree_focal, function(x) x[3]))[1, -c(3, 6, 9, 12, 15, 18, 21, 24, 27, 30)]
+    # tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))]
+    colnames(tree_focal_statB) <- sub("^[^.]*.", "", colnames(tree_focal_statB))
+    stat_both <- cbind(tree_focal_statA, tree_focal_statB)
+    out_res <- bind_rows(stat_both, out_res)
+    out_res_ordered <- out_res[order(out_res$tree_id), ]
+  } # for j loop
+
+  density <- json_summ_file["tree_densities"]
+  density_unlist <- data.frame("density" = unlist(density))
+  row.names(density_unlist) <- sub("^[^.]*.", "", row.names(density_unlist))
+
+  density_unlist$tree_id <- row.names(density_unlist)
+  row.names(density_unlist) <- NULL
+
+  final_table <- merge(out_res_ordered, density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to all trees table
+  write.table(final_table, file = out_json_to_Rtable, col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)
+  return(final_table)
 }

-result_tree<-open_tree(result1,out_json_to_Rtable)
+result_tree <- open_tree(result1, out_json_to_Rtable)

###################################################
# extracts the best tree
###################################################

-#the best tree is the tree with the highest density
+# the best tree is the tree with the highest density

-best_tree_id = function(R_table, density) {
-  best=R_table[which.max(R_table$density),]
-  best_tree_focal_name<-best$tree_id
-  best_tree_id<-paste(best$tree_id,"json",sep = ".")
+best_tree_id <- function(R_table, density) {
+  best <- R_table[which.max(R_table$density), ]
+  best_tree_focal_name <- best$tree_id
+  best_tree_id <- paste(best$tree_id, "json", sep = ".")
   return(best_tree_id)
   return(best_tree_focal_name)
 }

-best_tree_fileID<-best_tree_id(result_tree, density)
+best_tree_fileID <- best_tree_id(result_tree, density)

#######################################################################
# extract the stats (SNVs and CNVs assigned to each population) from the best tree
#######################################################################

-open_best_tree = 
function(trees_out,best_tree_id){ +open_best_tree <- function(trees_out, best_tree_id) { unzip(trees_out, files = best_tree_id, exdir = dirname(trees_out), overwrite = TRUE) - best_tree_path = paste0(dirname(trees_out), "/", best_tree_id) - rr <- fromJSON(file = best_tree_path) - return(rr) + best_tree_path <- paste0(dirname(trees_out), "/", best_tree_id) + rr <- fromJSON(file = best_tree_path) + return(rr) } -rr= open_best_tree(trees_out,best_tree_fileID) +rr <- open_best_tree(trees_out, best_tree_fileID) ####################################################################### # annotate point mutations and CNVs in the best tree ####################################################################### -best_focal<-result1[["trees"]][as.numeric(gsub(".json","",best_tree_fileID))+1] -tree_structure<-as.data.frame(sapply(best_focal,function(x)x["structure"])) ##[6] +best_focal <- result1[["trees"]][as.numeric(gsub(".json", "", best_tree_fileID)) + 1] +tree_structure <- as.data.frame(sapply(best_focal, function(x) x["structure"])) ## [6] tree_roots <- best_focal[[1]]$structure$`0` -merge_both<-function(result1,best_tree_fileID,tree_structure){ - best_tree<-as.numeric(gsub(".json","",best_tree_fileID)) - best_focal<-result1[["trees"]][best_tree+1] - tree_focal_statB<-as.data.frame(sapply(best_focal,function(x)x["populations"])) ##[3] - qq<-tree_focal_statB[,grep("cellular_prevalence",colnames(tree_focal_statB))] %>% - rownames_to_column("sample") %>% - pivot_longer(-sample, - names_to = "population", - values_to = "cellular_prevalence") %>% - mutate(population = str_remove(str_remove(population, ".*populations[.]"), "[.]cellular_prevalence")) %>% - mutate(is_root = ifelse(population %in% tree_roots, TRUE, FALSE)) %>% - group_by(sample) %>% - mutate(purity = sum(cellular_prevalence[is_root]), - CCF = cellular_prevalence / purity) - - return(qq) +merge_both <- function(result1, best_tree_fileID, tree_structure) { + best_tree <- as.numeric(gsub(".json", "", best_tree_fileID)) + best_focal <- result1[["trees"]][best_tree + 1] + tree_focal_statB <- as.data.frame(sapply(best_focal, function(x) x["populations"])) ## [3] + qq <- tree_focal_statB[, grep("cellular_prevalence", colnames(tree_focal_statB))] %>% + rownames_to_column("sample") %>% + pivot_longer(-sample, + names_to = "population", + values_to = "cellular_prevalence" + ) %>% + mutate(population = str_remove(str_remove(population, ".*populations[.]"), "[.]cellular_prevalence")) %>% + mutate(is_root = ifelse(population %in% tree_roots, TRUE, FALSE)) %>% + group_by(sample) %>% + mutate( + purity = sum(cellular_prevalence[is_root]), + CCF = cellular_prevalence / purity + ) + return(qq) } -both_samples<-merge_both(result1,best_tree_fileID,tree_structure) +both_samples <- merge_both(result1, best_tree_fileID, tree_structure) write_tsv(both_samples, CCF_table) -ssm = function(stat_best_tree, ssm_pre,ssm_to_trees,tree_structure, maf_list){ - - out_res_ssm<-NULL - for ( i in 1:length(stat_best_tree$mut_assignments)){ - - focal<-(stat_best_tree$mut_assignments)[i] - - focal_ssms<-data.frame(sapply(focal, function(x) x[1])) - - colnames(focal_ssms)<-sub("^[^.]*.", "", colnames(focal_ssms)) - focal_ssms$phyloWGS_population<-i - ssm_assign<-merge(ssm_pre,focal_ssms, by.x = "id", by.y = "ssms")[,c("id", "gene","phyloWGS_population")] - ssm_assign_spi<-separate(ssm_assign, col = gene, into = c("Chromosome","Start_Position"), sep = "_", convert = FALSE) %>% - mutate(Start_Position = as.numeric(Start_Position)) - if(chr_prefixed) { - ssm_assign_spi$Chromosome = 
str_c("chr", as.character(ssm_assign_spi$Chromosome))
-    }
-
-
-    out_res_ssm<-rbind(ssm_assign_spi,out_res_ssm)
-
-  } ## i loop
-
-  ssm_assign_with_maf <- lapply(maf_list, function(x){
-    maf <- read_tsv(x, 
-                    col_types = cols(Chromosome = col_character())) %>% 
-      # PhyloWGS changes the start position of deletions. This makes the maf start position match that in the PhyloWGS SSM table. 
       mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position - 1, Start_Position))
-    maf <- out_res_ssm %>% 
-      left_join(maf, by = c("Chromosome", "Start_Position")) %>% 
       # Restore the true MAF start position after the hack above
       mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position + 1, Start_Position)) %>%
       select(colnames(maf), everything())
   })
-  out_res_ssm <- rbindlist(ssm_assign_with_maf) %>% 
     mutate(clonal_status = case_when(
-      phyloWGS_population %in% tree_roots & length(tree_roots) > 1 ~ "polyclonal", 
       phyloWGS_population %in% tree_roots & length(tree_roots) == 1 ~ "clonal",
       TRUE ~ "subclonal"
     ))
-
-  return(out_res_ssm)
-
-
+ssm <- function(stat_best_tree, ssm_pre, ssm_to_trees, tree_structure, maf_list) {
+  out_res_ssm <- NULL
+  for (i in 1:length(stat_best_tree$mut_assignments)) {
+    focal <- (stat_best_tree$mut_assignments)[i]
+
+    focal_ssms <- data.frame(sapply(focal, function(x) x[1]))
+
+    colnames(focal_ssms) <- sub("^[^.]*.", "", colnames(focal_ssms))
+    focal_ssms$phyloWGS_population <- i
+    ssm_assign <- merge(ssm_pre, focal_ssms, by.x = "id", by.y = "ssms")[, c("id", "gene", "phyloWGS_population")]
+    ssm_assign_spi <- separate(ssm_assign, col = gene, into = c("Chromosome", "Start_Position"), sep = "_", convert = FALSE) %>%
+      mutate(Start_Position = as.numeric(Start_Position))
+    if (chr_prefixed) {
+      ssm_assign_spi$Chromosome <- str_c("chr", as.character(ssm_assign_spi$Chromosome))
+    }
+
+
+    out_res_ssm <- rbind(ssm_assign_spi, out_res_ssm)
+  } ## i loop
+
+  ssm_assign_with_maf <- lapply(maf_list, function(x) {
+    maf <- read_tsv(x,
+      col_types = cols(Chromosome = col_character())
+    ) %>%
+      # PhyloWGS changes the start position of deletions. This makes the maf start position match that in the PhyloWGS SSM table.
       mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position - 1, Start_Position))
+    maf <- out_res_ssm %>%
+      left_join(maf, by = c("Chromosome", "Start_Position")) %>%
       # Restore the true MAF start position after the hack above
       mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position + 1, Start_Position)) %>%
       select(colnames(maf), everything())
   })

+  out_res_ssm <- rbindlist(ssm_assign_with_maf) %>%
     mutate(clonal_status = case_when(
+      phyloWGS_population %in% tree_roots & length(tree_roots) > 1 ~ "polyclonal",
       phyloWGS_population %in% tree_roots & length(tree_roots) == 1 ~ "clonal",
       TRUE ~ "subclonal"
     ))
+
+  return(out_res_ssm)
 }

-ss<-ssm(rr,ssm_pre,ssm_to_trees,tree_structure, mafs)
+ss <- ssm(rr, ssm_pre, ssm_to_trees, tree_structure, mafs)

 write_tsv(ss, ssm_to_trees, na = "")


@@ -275,178 +226,205 @@ write_tsv(ss, ssm_to_trees, na = "")
 ## load mut file to extract CNVs start and end positions
###########################################################

-cnv = function(stat_best_tree, cnv_pre,mutation_file,cnv_to_trees){
+cnv <- function(stat_best_tree, cnv_pre, mutation_file, cnv_to_trees) {
   out_res_cnv <-
-    bind_rows(lapply(1:length(stat_best_tree$mut_assignments), function(x)
-      data.frame(cnvs = stat_best_tree$mut_assignments[[x]]$cnvs) %>% mutate(phyloWGS_population = x)))
-
-
-
-  #return(out_res_cnv)
-  out_res_mut<-NULL
-  for (cn in 1:length(result_mut$cnvs)){
-    focal_mut_cnv<-(result_mut$cnvs)[cn]
-
-    focal_mut<-data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1,]
-    colnames(focal_mut)<-sub("^[^.]*.", "", colnames(focal_mut))
-    focal_mut$cnv_id<-names(focal_mut_cnv)
-    out_res_mut<-bind_rows(focal_mut,out_res_mut)
-  } ## cn loop
-
-  both_cnvs<-merge(out_res_cnv, out_res_mut, by.x = "cnvs",by.y = "cnv_id") %>%
-    select(cnvs, phyloWGS_population, 
physical_cnvs.chrom, - physical_cnvs.start, physical_cnvs.end, - physical_cnvs.major_cn, physical_cnvs.minor_cn, physical_cnvs.cell_prev) - - + bind_rows(lapply(1:length(stat_best_tree$mut_assignments), function(x) { + data.frame(cnvs = stat_best_tree$mut_assignments[[x]]$cnvs) %>% mutate(phyloWGS_population = x) + })) + + + + # return(out_res_cnv) + out_res_mut <- NULL + for (cn in 1:length(result_mut$cnvs)) { + focal_mut_cnv <- (result_mut$cnvs)[cn] + + focal_mut <- data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1, ] + colnames(focal_mut) <- sub("^[^.]*.", "", colnames(focal_mut)) + focal_mut$cnv_id <- names(focal_mut_cnv) + out_res_mut <- bind_rows(focal_mut, out_res_mut) + } ## cn loop + + both_cnvs <- merge(out_res_cnv, out_res_mut, by.x = "cnvs", by.y = "cnv_id") %>% + select( + cnvs, phyloWGS_population, physical_cnvs.chrom, + physical_cnvs.start, physical_cnvs.end, + physical_cnvs.major_cn, physical_cnvs.minor_cn, physical_cnvs.cell_prev + ) } -cnv<-cnv(rr, cnv_pre,result_mut,cnv_to_trees) -write.table(cnv, file =cnv_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) +cnv <- cnv(rr, cnv_pre, result_mut, cnv_to_trees) +write.table(cnv, file = cnv_to_trees, col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE) ##################### plot the results #################### ########################################################### #### Slope chart the best tree, cellular prevalence ####### -plot_cp<-function(both_samples,cellular_prevalence_plot){ -pdf(cellular_prevalence_plot, width = 8, height =8 ) -plotA<-ggplot(data = both_samples, aes(x = sample, y = cellular_prevalence, group = population)) + - geom_line(aes(color = population), size = 2) + - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - geom_point(aes(color = population), size = 4) + - # Labelling as desired - xlab("Sample") + ylab("Cellular prevalence")+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotA) -dev.off() +plot_cp <- function(both_samples, cellular_prevalence_plot) { + pdf(cellular_prevalence_plot, width = 8, height = 8) + plotA <- ggplot(data = both_samples, aes(x = sample, y = cellular_prevalence, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree", gsub(".json", "", best_tree_fileID), sep = " ")) + + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + + ylab("Cellular prevalence") + + theme( + panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25), axis.text = element_text(size = 16), axis.title = element_text(size = 18), + plot.margin = margin(1, 1., 1, 1.5, "cm"), axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)) + ) + print(plotA) + dev.off() } -plot_cp(both_samples,cellular_prevalence_plot) +plot_cp(both_samples, cellular_prevalence_plot) ########################################################### #### Slope chart the best tree, CCF ####### -plot_cp<-function(both_samples,CCF_plot){ 
-pdf(CCF_plot, width = 8, height =8 ) -plotB<-ggplot(data = both_samples[both_samples$population != 0, ], aes(x = sample, y = CCF, group = population)) + - geom_line(aes(color = population), size = 2) + - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - geom_point(aes(color = population), size = 4) + - # Labelling as desired - xlab("Sample") + ylab("CCF")+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotB) -dev.off() +plot_cp <- function(both_samples, CCF_plot) { + pdf(CCF_plot, width = 8, height = 8) + plotB <- ggplot(data = both_samples[both_samples$population != 0, ], aes(x = sample, y = CCF, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree", gsub(".json", "", best_tree_fileID), sep = " ")) + + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + + ylab("CCF") + + theme( + panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25), axis.text = element_text(size = 16), axis.title = element_text(size = 18), + plot.margin = margin(1, 1., 1, 1.5, "cm"), axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)) + ) + print(plotB) + dev.off() } -plot_cp(both_samples,CCF_plot) +plot_cp(both_samples, CCF_plot) ############################################# ##### Slope chart the best tree (VAF) ####### -plot_vaf<-function(ss,VAF_plot){ - pdf(VAF_plot, width = 8, height =8 ) - plotC <- ss %>% - select(Hugo_Symbol, Chromosome, Tumor_Sample_Barcode, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% - mutate(VAF = t_alt_count/t_depth, - populations = as.factor(populations)) %>% +plot_vaf <- function(ss, VAF_plot) { + pdf(VAF_plot, width = 8, height = 8) + plotC <- ss %>% + select(Hugo_Symbol, Chromosome, Tumor_Sample_Barcode, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate( + VAF = t_alt_count / t_depth, + populations = as.factor(populations) + ) %>% filter(!is.na(Tumor_Sample_Barcode)) %>% - ggplot(aes(x = Tumor_Sample_Barcode, - y = VAF, - group = interaction(populations, Start_Position), - color = populations)) + - geom_line(aes(color = populations), size=0.2, alpha=0.4)+ - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - xlab("Sample") + ylab("VAF") + - guides(colour = guide_legend(override.aes = list(alpha = 3)))+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotC) -dev.off() + ggplot(aes( + x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations + )) + + geom_line(aes(color = populations), size = 0.2, alpha 
= 0.4) + + labs(title = paste("Best Tree", gsub(".json", "", best_tree_fileID), sep = " ")) + + xlab("Sample") + + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3))) + + theme( + panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2), axis.text = element_text(size = 16), axis.title = element_text(size = 18), + plot.margin = margin(1, 1., 1, 1.5, "cm"), axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)) + ) + print(plotC) + dev.off() } -plot_vaf(ss,VAF_plot) +plot_vaf(ss, VAF_plot) ############################################################# ##### Slope chart the best tree (VAF, coding regions, nonsense, missense and splicing sites) ####### -drivers <- read_tsv(driver_genes, col_names = "gene") %>% pull(gene) - -plot_vaf_coding<-function(maf,VAF_coding_plot){ - - coding <- ss %>% - select(Hugo_Symbol, HGVSp_Short, Chromosome, Tumor_Sample_Barcode, Variant_Classification, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% - mutate(VAF = t_alt_count/t_depth, - populations = as.factor(populations), - Tumor_Sample_Barcode = factor(Tumor_Sample_Barcode, levels = sample_order)) %>% - filter(!is.na(Tumor_Sample_Barcode), - !Variant_Classification %in% c("Silent", "RNA", "IGR", "Intron", "5'Flank", "3'Flank", "5'UTR")) %>% +drivers <- read_tsv(driver_genes, col_names = "gene") %>% pull(gene) + +plot_vaf_coding <- function(maf, VAF_coding_plot) { + coding <- ss %>% + select(Hugo_Symbol, HGVSp_Short, Chromosome, Tumor_Sample_Barcode, Variant_Classification, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate( + VAF = t_alt_count / t_depth, + populations = as.factor(populations), + Tumor_Sample_Barcode = factor(Tumor_Sample_Barcode, levels = sample_order) + ) %>% + filter( + !is.na(Tumor_Sample_Barcode), + !Variant_Classification %in% c("Silent", "RNA", "IGR", "Intron", "5'Flank", "3'Flank", "5'UTR") + ) %>% mutate(label = ifelse(!is.na(HGVSp_Short), str_c(Hugo_Symbol, "_", HGVSp_Short), str_c(Hugo_Symbol, "_", Variant_Classification))) -pdf(VAF_coding_plot, width = 8, height =8 ) -plotD<-coding %>% - ggplot(aes(x = Tumor_Sample_Barcode, - y = VAF, - group = interaction(populations, Start_Position), - color = populations)) + - geom_line(aes(color = populations), size=0.5, alpha=0.4)+ - geom_text_repel( - data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[1], Hugo_Symbol %in% drivers), - aes(label = label, - x = Tumor_Sample_Barcode, - y = VAF), - nudge_x = -0.2, - size = 4 + pdf(VAF_coding_plot, width = 8, height = 8) + plotD <- coding %>% + ggplot(aes( + x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations + )) + + geom_line(aes(color = populations), size = 0.5, alpha = 0.4) + + geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[1], Hugo_Symbol %in% drivers), + aes( + label = label, + x = Tumor_Sample_Barcode, + y = VAF + ), + nudge_x = -0.2, + size = 4 ) + - geom_text_repel( - data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[length(sample_order)], Hugo_Symbol %in% drivers), - aes(label = label, - x = Tumor_Sample_Barcode, - y = VAF), - nudge_x = 0.2, - size = 4 - 
) + - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - xlab("Sample") + ylab("VAF") + - guides(colour = guide_legend(override.aes = list(alpha = 3)))+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) - -print(plotD) -dev.off() -} + geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[length(sample_order)], Hugo_Symbol %in% drivers), + aes( + label = label, + x = Tumor_Sample_Barcode, + y = VAF + ), + nudge_x = 0.2, + size = 4 + ) + + labs(title = paste("Best Tree", gsub(".json", "", best_tree_fileID), sep = " ")) + + xlab("Sample") + + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3))) + + theme( + panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2), axis.text = element_text(size = 16), axis.title = element_text(size = 18), + plot.margin = margin(1, 1., 1, 1.5, "cm"), axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)) + ) + + print(plotD) + dev.off() +} -plot_vaf_coding(ss,VAF_coding_plot) +plot_vaf_coding(ss, VAF_coding_plot) ############################################# ##### Draw the best tree ####### ############################################# -tree_structure_long <- tree_structure %>% - pivot_longer(everything(), - names_to = "parent", - values_to = "node") %>% - mutate(parent = str_remove_all(parent, ".*[.]")) %>% - distinct() +tree_structure_long <- tree_structure %>% + pivot_longer(everything(), + names_to = "parent", + values_to = "node" + ) %>% + mutate(parent = str_remove_all(parent, ".*[.]")) %>% + distinct() -positions_x <- function(parents){ +positions_x <- function(parents) { x <- 1:length(unique(parents)) names(x) <- unique(parents) col_vals <- unname(x[parents]) @@ -455,39 +433,38 @@ positions_x <- function(parents){ tree_structure_long$x <- positions_x(tree_structure_long$parent) -positions_y <- function(tree_df){ - y = c("0" = 0.5) - for(parent in unique(tree_df$parent)){ +positions_y <- function(tree_df) { + y <- c("0" = 0.5) + for (parent in unique(tree_df$parent)) { # parent = "1" - child_index = 1 - num_children <- nrow(tree_df[tree_df$parent == parent,]) - if(num_children == 1){ - child <- tree_df[tree_df$parent == parent,]$node + child_index <- 1 + num_children <- nrow(tree_df[tree_df$parent == parent, ]) + if (num_children == 1) { + child <- tree_df[tree_df$parent == parent, ]$node child_y <- unname(y[parent]) names(child_y) <- child - y = c(y, child_y) - + y <- c(y, child_y) } else { - children <- tree_df[tree_df$parent == parent,]$node + children <- tree_df[tree_df$parent == parent, ]$node y_max <- unname(y[parent]) + (0.25 / child_index) y_min <- unname(y[parent]) - (0.25 / child_index) y_range <- seq(y_min, y_max, length.out = length(children)) names(y_range) <- children - y = c(y, y_range) + y <- c(y, y_range) } - child_index = child_index + 1 + child_index <- child_index + 1 } return(y) } tree_structure_long$y <- 
unname(positions_y(tree_structure_long)[as.character(tree_structure_long$node)]) -tree_structure_long <- add_row(tree_structure_long, parent = "0", node = 0, x = 0, y = 0.5) +tree_structure_long <- add_row(tree_structure_long, parent = "0", node = 0, x = 0, y = 0.5) -get_ssms <- function(tree_df, best_focal, best_tree_fileID){ +get_ssms <- function(tree_df, best_focal, best_tree_fileID) { data <- best_focal[[str_remove_all(best_tree_fileID, "[.].*")]]$populations ssm_vec <- c() - for(node in tree_df$node){ + for (node in tree_df$node) { # node = "1" num_ssms <- data[[as.character(node)]]$num_ssms names(num_ssms) <- as.character(node) @@ -498,39 +475,45 @@ get_ssms <- function(tree_df, best_focal, best_tree_fileID){ tree_structure_long$num_ssms <- get_ssms(tree_structure_long, best_focal, best_tree_fileID)[as.character(tree_structure_long$node)] -tree_structure_long <- tree_structure_long %>% +tree_structure_long <- tree_structure_long %>% mutate(parent = as.numeric(parent)) %>% - left_join(select(tree_structure_long, node, xstart = x, ystart = y), - by = c("parent" = "node")) - - - -ggplot(tree_structure_long, - aes(x = x, - y = y, - label = node)) + - geom_segment(inherit.aes = FALSE, - aes(x = xstart, - xend = x, - y = ystart, - yend = y)) + - geom_point(aes(size = num_ssms), - fill = "white", - colour = "black", - pch = 21) + - geom_text() + - scale_size(range = c(5,20)) + - ylim(0,1) + - theme_void() + - ggtitle(samplename) + + left_join(select(tree_structure_long, node, xstart = x, ystart = y), + by = c("parent" = "node") + ) + + + +plot_tree <- ggplot( + tree_structure_long, + aes( + x = x, + y = y, + label = node + ) +) + + geom_segment( + inherit.aes = FALSE, + aes( + x = xstart, + xend = x, + y = ystart, + yend = y + ) + ) + + geom_point(aes(size = num_ssms), + fill = "white", + colour = "black", + pch = 21 + ) + + geom_text() + + scale_size(range = c(5, 20)) + + ylim(0, 1) + + theme_void() + + ggtitle(samplename) + theme(legend.position = "none") - -ggsave(tree_plot, height = 6, width = 6) + +ggsave(tree_plot, plot_tree, height = 6, width = 6) ############ ##### END ## ############ - - - - diff --git a/modules/pyclone_vi/1.0/config/default.yaml b/modules/pyclone_vi/1.0/config/default.yaml index 2e7ee60b..c6638372 100644 --- a/modules/pyclone_vi/1.0/config/default.yaml +++ b/modules/pyclone_vi/1.0/config/default.yaml @@ -5,9 +5,9 @@ lcr-modules: # TODO: Update the list of available wildcards, if applicable inputs: # Available wildcards: {seq_type} {genome_build} {tumour_id} {normal_id} {pair_status} - sample_maf: "__UPDATE__" - sample_subclones: "__UPDATE__" - sample_cellularity: "__UPDATE__" + sample_maf: "__UPDATE__" # slms_3-1.0_vcf2maf-1.3/99-outputs/deblacklisted/augmented_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.slms-3.final.maf + sample_subclones: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt + sample_cellularity: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt sample_sex: "__UPDATE__" # Only {normal_id} available scratch_subdirectories: [] diff --git a/modules/pyclone_vi/1.0/pyclone_vi.smk b/modules/pyclone_vi/1.0/pyclone_vi.smk index a882ebd1..7321ea39 100644 --- a/modules/pyclone_vi/1.0/pyclone_vi.smk +++ b/modules/pyclone_vi/1.0/pyclone_vi.smk @@ -26,7 +26,6 @@ CFG = op.setup_module( ) # Define rules to be run locally when using a compute cluster -# TODO: Replace with actual rules once you change the rule names 
localrules:
     _pyclone_vi_write_results,
     _pyclone_vi_all
 
@@ -354,7 +353,7 @@ rule _pyclone_vi_output_tsv:
         tree = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.phyclone.tree.nwk",
         clusters = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.phyclone.clusters.tsv"
     run:
-        op.relative_symlink(input.pyclone, output.pyclone)
+        op.relative_symlink(input.pyclone, output.pyclone, in_module = True)
         op.relative_symlink(input.phyclone, output.phyclone)
         op.relative_symlink(input.tree, output.tree)
         op.relative_symlink(input.clusters, output.clusters)
@@ -393,13 +392,6 @@ rule _pyclone_vi_all:
             seq_type=PATIENTS["tumour_seq_type"],
             genome_build=PATIENTS["tumour_genome_build"],
             patient_id=PATIENTS["tumour_patient_id"])
-        # expand(
-        #     str(rules._pyclone_run_analysis_pipeline.output.workdir),
-        #     zip,
-        #     seq_type=PATIENTS_CAPTURE["tumour_seq_type"],
-        #     genome_build=PATIENTS_CAPTURE["tumour_genome_build"],
-        #     patient_id=PATIENTS_CAPTURE["tumour_patient_id"]
-        # )
 
 ##### CLEANUP #####
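A note on the environment-resolution pattern used in PATCH 12: the modules locate a
pre-built conda environment by recomputing the md5 that Snakemake derives from the
conda prefix plus the environment YAML, then globbing on the first eight hex digits.
Because the glob can match stray files as well as the environment directory, the
patch keeps only the directory hit. A minimal standalone sketch of the same idea
(the helper name and return convention are illustrative, not part of the module):

    import glob
    import hashlib
    import os

    def find_env_dir(conda_prefix, env_yaml):
        # Hash the conda prefix plus the raw YAML bytes, mirroring the module code
        md5hash = hashlib.md5()
        md5hash.update(conda_prefix.encode())
        with open(env_yaml, "rb") as f:
            md5hash.update(f.read())
        # Managed environment directories begin with the first 8 hex digits of the hash
        hits = glob.glob(os.path.join(conda_prefix, md5hash.hexdigest()[:8] + "*"))
        dirs = [p for p in hits if os.path.isdir(p)]
        return dirs[0] if dirs else None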