From fda4b0a1aa1f3dec63ce3aab78851a747f9de8e9 Mon Sep 17 00:00:00 2001
From: lkhilton
Date: Mon, 12 Sep 2022 10:28:06 -0700
Subject: [PATCH 01/14] Improve maf subsetting; use pre-filled Battenberg results

---
 modules/phylowgs/1.0/phylowgs.smk | 547 ++++++++++++++++++------------
 1 file changed, 323 insertions(+), 224 deletions(-)

diff --git a/modules/phylowgs/1.0/phylowgs.smk b/modules/phylowgs/1.0/phylowgs.smk
index 44028ef3..1cdba641 100644
--- a/modules/phylowgs/1.0/phylowgs.smk
+++ b/modules/phylowgs/1.0/phylowgs.smk
@@ -15,269 +15,368 @@
 # Import package with useful functions for developing analysis modules
 import oncopipe as op
 import hashlib
+import glob
 
 # Setup module and store module-specific configuration in `CFG`
 # `CFG` is a shortcut to `config["lcr-modules"]["phylowgs"]`
 CFG = op.setup_module(
-    name = "phylowgs",
-    version = "1.0",
-    subdirectories = ["inputs", "preprocess_battenberg", "preprocess_inputs", "multievolve", "results", "outputs"],
+    name = "phylowgs",
+    version = "1.0",
+    subdirectories = ["inputs", "maf_to_vcf", "preprocess_battenberg", "preprocess_inputs", "multievolve", "results", "outputs"],
 )
 
 # Define rules to be run locally when using a compute cluster
 localrules:
-    _phylowgs_input_vcf,
-    _phylowgs_input_battenberg,
-    _phylowgs_output_html,
-    _phylowgs_all
+    _phylowgs_input_maf,
+    _phylowgs_input_battenberg,
+    _phylowgs_process_output,
+    _phylowgs_output_plots,
+    _phylowgs_all,
+    _phylowgs_priority_ssms
 
-# Generate a de-duplicated table of patient_ids etc.
+# Generate a de-duplicated table of patient_ids etc.
 PATIENTS = CFG["runs"][["tumour_patient_id", "normal_patient_id", "tumour_genome_build", "tumour_seq_type", "tumour_sex"]].drop_duplicates(subset = None, ignore_index = True)
 
 # Obtain the path to the phylowgs conda environment
 md5hash = hashlib.md5()
-if workflow.conda_prefix:
-    conda_prefix = workflow.conda_prefix
-else:
-    conda_prefix = os.path.abspath(".snakemake/conda")
+if workflow.conda_prefix:
+    conda_prefix = workflow.conda_prefix
+else:
+    conda_prefix = os.path.abspath(".snakemake/conda")
md5hash.update(conda_prefix.encode())
 f = open(CFG['conda_envs']['phylowgs'], 'rb')
 md5hash.update(f.read())
 f.close()
 h = md5hash.hexdigest()
-PHYLO = conda_prefix + "/" + h[:8] + "/share/phylowgs/"
+PHYLO = "".join(glob.glob(conda_prefix + "/" + h[:8] + "*/share/phylowgs/"))
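# The assignment above reproduces Snakemake's conda-environment naming scheme:
# the environment directory name begins with the first 8 hex characters of an
# MD5 digest of the conda prefix plus the environment YAML contents. A minimal
# standalone sketch of the same lookup (the function name and arguments are
# illustrative, not part of the module):
import glob
import hashlib
import os

def locate_phylowgs_share(conda_prefix, env_yaml):
    digest = hashlib.md5()
    digest.update(conda_prefix.encode())
    with open(env_yaml, "rb") as env:
        digest.update(env.read())
    # The trailing "*" mirrors the glob above: newer Snakemake releases append
    # a suffix after the 8-character hash, so an exact match cannot be assumed.
    hits = glob.glob(os.path.join(conda_prefix, digest.hexdigest()[:8] + "*", "share", "phylowgs", ""))
    return hits[0] if hits else None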
+
+##### FUNCTIONS #####
+
+# Input function to get all MAFs per patient
+def get_input_mafs(wildcards):
+    CFG = config["lcr-modules"]["phylowgs"]
+    PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"])
+    inputs = expand(
+        [
+            str(rules._phylowgs_input_maf.output.maf)
+        ],
+        zip,
+        tumour_id = PATIENT["tumour_sample_id"],
+        normal_id = PATIENT["normal_sample_id"],
+        pair_status = PATIENT["pair_status"],
+        allow_missing = True
+    )
+    return(inputs)
+
+def get_maf_cli(wildcards):
+    CFG = config["lcr-modules"]["phylowgs"]
+    PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"])
+    inputs = expand(
+        [
+            str(rules._phylowgs_input_maf.output.maf)
+        ],
+        zip,
+        tumour_id = PATIENT["tumour_sample_id"],
+        normal_id = PATIENT["normal_sample_id"],
+        pair_status = PATIENT["pair_status"],
+        genome_build = PATIENT["tumour_genome_build"],
+        seq_type = PATIENT["tumour_seq_type"]
+    )
+    cli = ",".join([str(elem) for elem in inputs])
+    return(cli)
+
+# Define the order of sample labels by time point
+def order_samples(wildcards):
+    CFG = config["lcr-modules"]["phylowgs"]
+    PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id)
+    samples = str(",".join(PATIENT.sort_values(by = ["tumour_time_point"]).tumour_sample_id.tolist()))
+    return(samples)
+# Expand the input files to create a command-line argument for create_phylowgs_inputs.py
+def create_phylowgs_inputs_cli(wildcards):
+    CFG = config["lcr-modules"]["phylowgs"]
+    PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"])
+    cnvs = expand(
+        "--cnvs {time_point}=" + CFG['dirs']['preprocess_battenberg'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.cnvs.txt",
+        zip,
+        time_point = PATIENT["tumour_time_point"],
+        tumour_id = PATIENT["tumour_sample_id"],
+        normal_id = PATIENT["normal_sample_id"],
+        pair_status = PATIENT["pair_status"],
+        seq_type = PATIENT["tumour_seq_type"],
+        genome_build = PATIENT["tumour_genome_build"],
+        patient_id = PATIENT["tumour_patient_id"]
+    )
+    vcf_types = expand(
+        "--vcf-type {time_point}=mutect_smchet",
+        zip,
+        time_point = PATIENT["tumour_time_point"]
+    )
+    vcfs = expand(
+        "{time_point}=" + CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf.gz",
+        zip,
+        time_point = PATIENT["tumour_time_point"],
+        tumour_id = PATIENT["tumour_sample_id"],
+        normal_id = PATIENT["normal_sample_id"],
+        pair_status = PATIENT["pair_status"],
+        seq_type = PATIENT["tumour_seq_type"],
+        genome_build = PATIENT["tumour_genome_build"],
+        patient_id = PATIENT["tumour_patient_id"]
+    )
+    cli = cnvs + vcf_types + vcfs
+    cli = " ".join([str(elem) for elem in cli])
+    return(cli)
+
+# Input function to pull in input VCF and preprocessed CNV data
+def create_phylowgs_inputs(wildcards):
+    CFG = config["lcr-modules"]["phylowgs"]
+    PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"])
+    inputs = expand(
+        [
+            CFG["dirs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.cnvs.txt",
+            CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf.gz"
+        ],
+        zip,
+        tumour_id = PATIENT["tumour_sample_id"],
+        normal_id = PATIENT["normal_sample_id"],
+        pair_status = PATIENT["pair_status"],
+        allow_missing = True
+    )
+    return(inputs)
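# A toy, self-contained illustration of the argument string the helpers above
# assemble for create_phylowgs_inputs.py: one "--cnvs" flag, one "--vcf-type"
# flag, and one positional VCF argument per time point. The patient, sample,
# and path names below are invented; real values come from CFG["runs"] and
# CFG["dirs"]:
time_points = ["T1", "T2"]
tumours = ["PT01_T1", "PT01_T2"]
cnvs = [f"--cnvs {tp}={t}--PT01_normal--matched.cnvs.txt" for tp, t in zip(time_points, tumours)]
vcf_types = [f"--vcf-type {tp}=mutect_smchet" for tp in time_points]
vcfs = [f"{tp}={t}--PT01_normal--matched.maf_to.vcf.gz" for tp, t in zip(time_points, tumours)]
cli = " ".join(cnvs + vcf_types + vcfs)
# cli == "--cnvs T1=... --cnvs T2=... --vcf-type T1=mutect_smchet --vcf-type T2=mutect_smchet T1=... T2=..."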
"battenberg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--matched.subclones.txt" - run: - op.absolute_symlink(input.cellularity, output.cellularity) - op.absolute_symlink(input.subclones, output.subclones) - - -# Preprocess the battenberg file to match requirements -rule _phylowgs_preprocess_battenberg: - input: - cellularity = str(rules._phylowgs_input_battenberg.output.cellularity), - subclones = str(rules._phylowgs_input_battenberg.output.subclones) - output: - txt = CFG["dirs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--matched.cnvs.txt" - log: - stderr = CFG["logs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--matched.preprocess_battenberg.stderr.log", - stdout = CFG["logs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--matched.preprocess_battenberg.stdout.log" - params: - script = PHYLO + "parser/parse_cnvs.py" - conda: - CFG["conda_envs"]["phylowgs"] - threads: - CFG["threads"]["create_inputs"] - resources: - **CFG["resources"]["create_inputs"] - shell: - op.as_one_line(""" - cellularity=$(tail -n +2 {input.cellularity} | cut -f 1); - python2 {params.script} -f battenberg-smchet -c $cellularity --cnv-output {output.txt} {input.subclones} - 2> {log.stderr} > {log.stdout} - """) - -# Expand the input files to create a command-line argument for create_phylowgs_inputs.py -def create_phylowgs_inputs_cli(wildcards): - CFG = config["lcr-modules"]["phylowgs"] - PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id) - cnvs = expand( - "--cnvs {time_point}=" + CFG['dirs']['preprocess_battenberg'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--matched.cnvs.txt", - zip, - time_point = PATIENT["tumour_time_point"], - tumour_id = PATIENT["tumour_sample_id"], - normal_id = PATIENT["normal_sample_id"], - seq_type = PATIENT["tumour_seq_type"], - genome_build = PATIENT["tumour_genome_build"], - patient_id = PATIENT["tumour_patient_id"] - ) - vcf_types = expand( - "--vcf-type {time_point}=" + CFG['options']['create_inputs']['vcf_type'], - zip, - time_point = PATIENT["tumour_time_point"] - ) - vcfs = expand( - "{time_point}=" + CFG['dirs']['inputs'] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--matched.vcf.gz", - zip, - time_point = PATIENT["tumour_time_point"], - tumour_id = PATIENT["tumour_sample_id"], - normal_id = PATIENT["normal_sample_id"], - seq_type = PATIENT["tumour_seq_type"], - genome_build = PATIENT["tumour_genome_build"], - patient_id = PATIENT["tumour_patient_id"] - ) - cli = cnvs + vcf_types + vcfs - cli = " ".join([str(elem) for elem in cli]) - return(cli) - -# Input function to pull in input VCF and preprocessed CNV data -def create_phylowgs_inputs(wildcards): - CFG = config["lcr-modules"]["phylowgs"] - PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id) - inputs = expand( - [ - CFG["dirs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--matched.cnvs.txt", - CFG['dirs']['inputs'] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--matched.vcf.gz" - ], - zip, - tumour_id = PATIENT["tumour_sample_id"], - normal_id = PATIENT["normal_sample_id"], - allow_missing = True - ) - return(inputs) + input: + cellularity = CFG["inputs"]["cellularity"], + subclones = CFG["inputs"]["subclones"] + output: + cellularity = CFG["dirs"]["inputs"] + 
"battenberg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.cellularity_ploidy.txt", + subclones = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.subclones.txt" + run: + op.absolute_symlink(input.cellularity, output.cellularity) + op.absolute_symlink(input.subclones, output.subclones) + + +rule _phylowgs_parse_battenberg: + input: + cellularity = str(rules._phylowgs_input_battenberg.output.cellularity), + subclones = str(rules._phylowgs_input_battenberg.output.subclones) + output: + txt = CFG["dirs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.cnvs.txt" + log: + stderr = CFG["logs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.preprocess_battenberg.stderr.log", + stdout = CFG["logs"]["preprocess_battenberg"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.preprocess_battenberg.stdout.log" + params: + script = PHYLO + "parser/parse_cnvs.py" + conda: + CFG["conda_envs"]["phylowgs"] + threads: + CFG["threads"]["create_inputs"] + resources: + **CFG["resources"]["create_inputs"] + shell: + op.as_one_line(""" + cellularity=$(tail -n +2 {input.cellularity} | cut -f 1); + python2 {params.script} -f battenberg-smchet -c $cellularity --cnv-output {output.txt} {input.subclones} + 2> {log.stderr} > {log.stdout} + """) + +# Convert the input maf file to a vcf file +rule _phylowgs_maf_to_vcf: + input: + maf = str(rules._phylowgs_input_maf.output.maf), + fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") + output: + vcf = temp(CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf") + conda: + CFG["conda_envs"]["vcf2maf"] + shell: + op.as_one_line(""" + maf2vcf.pl --input-maf {input.maf} --output-dir $(dirname {output.vcf}) --output-vcf {output.vcf} --ref-fasta {input.fasta} + """) + +rule _phylowgs_priority_ssms: + input: + mafs = get_input_mafs + output: + ssms = CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/coding_ssms.txt" + params: + noncoding = CFG["scripts"]["noncoding"] + conda: + CFG["conda_envs"]["coreutils"] + shell: + op.as_one_line(""" + grep -hvf {params.noncoding} {input.mafs} | awk '{{FS="\\t"}} {{OFS="_"}} {{print $5, $6}}' | sed 's/chr//g' > {output.ssms} + """) + +rule _phylowgs_bgzip_vcf: + input: + vcf = str(rules._phylowgs_maf_to_vcf.output.vcf) + output: + vcf = temp(CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf.gz"), + tbi = CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf.gz.tbi", + conda: + CFG["conda_envs"]["bcftools"] + shell: + op.as_one_line(""" + bcftools sort {input.vcf} | bcftools view -s "{wildcards.normal_id},{wildcards.tumour_id}" -i 'FMT/DP[0] > 0 && FMT/AD[0:1] > 1' -Oz -o {output.vcf} && tabix -p vcf {output.vcf} + """) # Preprocess vcf and battenberg inputs together rule _phylowgs_create_inputs: - input: - create_phylowgs_inputs - output: - ssms = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/ssm_data.txt", - cnvs = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/cnv_data.txt", - params = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/params.json" - log: - stderr = CFG["logs"]["preprocess_inputs"] 
+ "{seq_type}--{genome_build}/{patient_id}/create_phylowgs_inputs.stderr.log", - stdout = CFG["logs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/create_phylowgs_inputs.stdout.log" - params: - cli = create_phylowgs_inputs_cli, - opts = CFG["options"]["create_inputs"]["opts"], - sex = lambda w: config["lcr-modules"]["phylowgs"]["switches"]["sex"][op.filter_samples(PATIENTS, tumour_patient_id = w.patient_id)["tumour_sex"].values[0]] if op.filter_samples(PATIENTS, tumour_patient_id = w.patient_id)["tumour_sex"].values[0] in config["lcr-modules"]["phylowgs"]["switches"]["sex"].keys() else "auto", - script = PHYLO + "parser/create_phylowgs_inputs.py" - conda: - CFG["conda_envs"]["phylowgs"] - threads: - CFG["threads"]["create_inputs"] - resources: - **CFG["resources"]["create_inputs"] - shell: - op.as_one_line(""" - python2 {params.script} - --output-cnvs {output.cnvs} - --output-variants {output.ssms} - --output-params {output.params} - --sex {params.sex} - {params.opts} - {params.cli} - 2> {log.stderr} > {log.stdout} - """) + input: + create_phylowgs_inputs, + priority = str(rules._phylowgs_priority_ssms.output.ssms) + output: + ssms = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/ssm_data.txt", + cnvs = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/cnv_data.txt", + params = CFG["dirs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/params.json" + log: + stderr = CFG["logs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/create_phylowgs_inputs.stderr.log", + stdout = CFG["logs"]["preprocess_inputs"] + "{seq_type}--{genome_build}/{patient_id}/create_phylowgs_inputs.stdout.log" + params: + cli = create_phylowgs_inputs_cli, + opts = CFG["options"]["create_inputs"]["opts"], + sex = lambda w: config["lcr-modules"]["phylowgs"]["switches"]["sex"][op.filter_samples(PATIENTS, tumour_patient_id = w.patient_id)["tumour_sex"].values[0]] if op.filter_samples(PATIENTS, tumour_patient_id = w.patient_id)["tumour_sex"].values[0] in config["lcr-modules"]["phylowgs"]["switches"]["sex"].keys() else "auto", + script = PHYLO + "parser/create_phylowgs_inputs.py" + conda: + CFG["conda_envs"]["phylowgs"] + threads: + CFG["threads"]["create_inputs"] + resources: + **CFG["resources"]["create_inputs"] + shell: + op.as_one_line(""" + python2 {params.script} + --output-cnvs {output.cnvs} + --output-variants {output.ssms} + --output-params {output.params} + --priority-ssms {input.priority} + --sex {params.sex} + {params.opts} + {params.cli} + 2> {log.stderr} > {log.stdout} + """) # Run multievolve to sample trees and reconstruct phylogeny -rule _phylowgs_multievolve: - input: - **rules._phylowgs_create_inputs.output - output: - trees = CFG["dirs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/trees.zip" - log: - stderr = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stderr.log", - stdout = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stdout.log" - params: - script = PHYLO + "multievolve.py", - opts = CFG["options"]["multievolve"] - conda: - CFG["conda_envs"]["phylowgs"] - threads: - CFG["threads"]["multievolve"] - resources: - **CFG["resources"]["multievolve"] - shell: - op.as_one_line(""" - python2 {params.script} - {params.opts} - -n {threads} - -O $(dirname {output.trees}) - --ssms {input.ssms} - --cnvs {input.cnvs} - 2> {log.stderr} > {log.stdout} - """) - -# Write the results -rule _phylowgs_write_results: - input: - 
 
 # Run multievolve to sample trees and reconstruct phylogeny
-rule _phylowgs_multievolve:
-    input:
-        **rules._phylowgs_create_inputs.output
-    output:
-        trees = CFG["dirs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/trees.zip"
-    log:
-        stderr = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stderr.log",
-        stdout = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stdout.log"
-    params:
-        script = PHYLO + "multievolve.py",
-        opts = CFG["options"]["multievolve"]
-    conda:
-        CFG["conda_envs"]["phylowgs"]
-    threads:
-        CFG["threads"]["multievolve"]
-    resources:
-        **CFG["resources"]["multievolve"]
-    shell:
-        op.as_one_line("""
-        python2 {params.script}
-        {params.opts}
-        -n {threads}
-        -O $(dirname {output.trees})
-        --ssms {input.ssms}
-        --cnvs {input.cnvs}
-        2> {log.stderr} > {log.stdout}
-        """)
-
-# Write the results
-rule _phylowgs_write_results:
-    input:
-        str(rules._phylowgs_multievolve.output.trees)
-    output:
-        muts = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.muts.json",
-        summ = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.summ.json",
-        mutass = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.mutass.zip"
-    params:
-        script = PHYLO + "write_results.py",
-        opts = CFG["options"]["write_results"]
-    conda:
-        CFG["conda_envs"]["phylowgs"]
-    threads:
-        CFG["threads"]["write_results"]
-    resources:
-        **CFG["resources"]["write_results"]
-    shell:
-        op.as_one_line("""
-        python2 {params.script}
-        {params.opts}
-        {wildcards.patient_id}
-        {input}
-        {output.summ}.gz
-        {output.muts}.gz
-        {output.mutass} &&
-        gunzip -f $(dirname {output.mutass})/*.gz
-        """)
-
+rule _phylowgs_multievolve:
+    input:
+        **rules._phylowgs_create_inputs.output
+    output:
+        trees = CFG["dirs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/trees.zip"
+    log:
+        stderr = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stderr.log",
+        stdout = CFG["logs"]["multievolve"] + "{seq_type}--{genome_build}/{patient_id}/multievolve.stdout.log"
+    params:
+        script = PHYLO + "multievolve.py",
+        opts = CFG["options"]["multievolve"]
+    conda:
+        CFG["conda_envs"]["phylowgs"]
+    threads:
+        CFG["threads"]["multievolve"]
+    resources:
+        **CFG["resources"]["multievolve"]
+    shell:
+        op.as_one_line("""
+        python2 {params.script}
+        {params.opts}
+        -n {threads}
+        -O $(dirname {output.trees})
+        --ssms {input.ssms}
+        --cnvs {input.cnvs}
+        2> {log.stderr} > {log.stdout}
+        """)
+
+# Write the results
+rule _phylowgs_write_results:
+    input:
+        trees = str(rules._phylowgs_multievolve.output.trees)
+    output:
+        muts = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.muts.json",
+        summ = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.summ.json",
+        mutass = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.mutass.zip"
+    params:
+        script = PHYLO + "write_results.py",
+        opts = CFG["options"]["write_results"]
+    conda:
+        CFG["conda_envs"]["phylowgs"]
+    threads:
+        CFG["threads"]["write_results"]
+    resources:
+        **CFG["resources"]["write_results"]
+    shell:
+        op.as_one_line("""
+        python2 {params.script}
+        {params.opts}
+        {wildcards.patient_id}
+        {input.trees}
+        {output.summ}.gz
+        {output.muts}.gz
+        {output.mutass} &&
+        gunzip -f $(dirname {output.mutass})/*.gz &&
+        rm -rf $(dirname {input.trees})/chain*
+        """)
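# Downstream steps start from the summary JSON written above. A hedged sketch
# of selecting the best tree, assuming the usual PhyloWGS summ.json layout
# ({"trees": {"<index>": {"llh": <log-likelihood>, ...}, ...}}):
import json

def best_tree_index(summ_json):
    with open(summ_json) as f:
        trees = json.load(f)["trees"]
    return max(trees, key = lambda idx: trees[idx]["llh"])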
+
+
+
 # Symlinks the final output files to the witness directory in preparation for HTTP browsing
-rule _phylowgs_output_html:
-    input:
-        mutass = str(rules._phylowgs_write_results.output.mutass)
-    output:
-        complete = CFG["dirs"]["outputs"] + "txt/{seq_type}--{genome_build}/{patient_id}.phylo_complete"
-    params:
-        witness = PHYLO + "witness/data/{patient_id}"
-    run:
-        op.absolute_symlink(os.path.split(input.mutass)[0], params.witness)
-        f = open(output.complete, "a")
-        f.write("To view PhyloWGS results, navigate to " + PHYLO + "witness\n")
-        f.write("Run the following commands: \n")
-        f.write("python2 index_data.py\n")
-        f.write("python2 -m SimpleHTTPServer\n")
-        f.write("On a local machine you will be able to view your results in a browser at http://localhost:8000\n")
-        f.write("For a remote machine, launch the following command in a terminal: \n")
-        f.write("ssh -N -L localhost:8000:localhost:8000 \n")
-        f.write("Now you can view your results in a browser at http://localhost:8000\n")
-        f.close()
+rule _phylowgs_process_output:
+    input:
+        mafs = get_input_mafs,
+        **rules._phylowgs_create_inputs.output,
+        **rules._phylowgs_write_results.output,
+    output:
+        tree_summary = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/results/tree_summary.tsv",
+        maf = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/results/merged_ssm_cluster_assignments.maf",
+        cnvs = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/results/merged_cnvs_cluster_assignments.tsv",
+        CCF = CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/results/CCF.tsv",
+        plots = directory(CFG["dirs"]["results"] + "{seq_type}--{genome_build}/{patient_id}/results/plots/")
+    params:
+        sample_order = order_samples,
+        maf_list = get_maf_cli,
+        drivers = CFG['inputs']['drivers'],
+        script = CFG["scripts"]["process_outputs"]
+    conda:
+        CFG["conda_envs"]["phylowgs_results"]
+    script:
+        "{params.script}"
+
+rule _phylowgs_output_plots:
+    input:
+        plots = str(rules._phylowgs_process_output.output.plots)
+    output:
+        plots = directory(CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/plots/{patient_id}")
+    run:
+        op.relative_symlink(input.plots, output.plots)
+
 # Generates the target sentinels for each run, which generate the symlinks
 rule _phylowgs_all:
-    input:
-        expand(
-            [
-                str(rules._phylowgs_output_html.output.complete),
-            ],
-            zip, # Run expand() with zip(), not product()
-            seq_type=PATIENTS["tumour_seq_type"],
-            genome_build=PATIENTS["tumour_genome_build"],
-            patient_id=PATIENTS["tumour_patient_id"]
-        )
+    input:
+        expand(
+            [
+                str(rules._phylowgs_output_plots.output.plots),
+            ],
+            zip, # Run expand() with zip(), not product()
+            seq_type=PATIENTS["tumour_seq_type"],
+            genome_build=PATIENTS["tumour_genome_build"],
+            patient_id=PATIENTS["tumour_patient_id"]
+        )
 
 ##### CLEANUP #####
 

From 8db3edede2afee1df2b1513ce9f77d3b88417259 Mon Sep 17 00:00:00 2001
From: lkhilton
Date: Mon, 12 Sep 2022 10:29:08 -0700
Subject: [PATCH 02/14] Add accessory files for PhyloWGS module

---
 envs/phylowgs/fill_battenberg.yaml            |   41 +
 envs/phylowgs/phylowgs_results.yaml           |  204 +++
 modules/phylowgs/1.0/config/default.yaml      |   28 +-
 .../phylowgs/1.0/envs/bcftools-1.10.2.yaml    |    1 +
 modules/phylowgs/1.0/envs/coreutils-8.31.yaml |    1 +
 .../phylowgs/1.0/envs/fill_battenberg.yaml    |    1 +
 .../phylowgs/1.0/envs/phylowgs_results.yaml   |    1 +
 modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml |    1 +
 .../1.0/etc/chromArmFiles/chromArm.grch37.tsv |   49 +
 .../1.0/etc/chromArmFiles/chromArm.grch38.tsv |   49 +
 .../1.0/etc/chromArmFiles/chromArm.hg19.tsv   |   49 +
 .../1.0/etc/chromArmFiles/chromArm.hg38.tsv   |   49 +
 .../1.0/etc/chromArmFiles/chromArm.hs37d5.tsv |    1 +
 modules/phylowgs/1.0/etc/noncoding.txt        |    8 +
 .../1.0/src/create_phylowgs_inputs.py         | 1356 -----------------
 modules/phylowgs/1.0/src/fill_battenberg.py   |  379 +++++
 .../1.0/src/process_phyloWGS_outputs.R        |  536 +++++++
 .../src/process_phyloWGS_outputs_updated.R    |  417 -----
 18 files changed, 1388 insertions(+), 1783 deletions(-)
 create mode 100644 envs/phylowgs/fill_battenberg.yaml
 create mode 100644 envs/phylowgs/phylowgs_results.yaml
 create mode 120000 modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml
 create mode 120000 modules/phylowgs/1.0/envs/coreutils-8.31.yaml
 create mode 120000 modules/phylowgs/1.0/envs/fill_battenberg.yaml
 create mode 120000 modules/phylowgs/1.0/envs/phylowgs_results.yaml
 create mode 120000 modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml
 create mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv
 create mode 100644
modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv create mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv create mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv create mode 120000 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv create mode 100644 modules/phylowgs/1.0/etc/noncoding.txt delete mode 100755 modules/phylowgs/1.0/src/create_phylowgs_inputs.py create mode 100644 modules/phylowgs/1.0/src/fill_battenberg.py create mode 100644 modules/phylowgs/1.0/src/process_phyloWGS_outputs.R delete mode 100644 modules/phylowgs/1.0/src/process_phyloWGS_outputs_updated.R diff --git a/envs/phylowgs/fill_battenberg.yaml b/envs/phylowgs/fill_battenberg.yaml new file mode 100644 index 00000000..52741e52 --- /dev/null +++ b/envs/phylowgs/fill_battenberg.yaml @@ -0,0 +1,41 @@ +name: fill_segments +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - ca-certificates=2020.12.5 + - certifi=2020.12.5 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libffi=3.3 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libgomp=9.3.0 + - liblapack=3.9.0 + - libopenblas=0.3.12 + - libstdcxx-ng=9.3.0 + - ncurses=6.2 + - numpy=1.19.4 + - openssl=1.1.1i + - pandas=1.2.0 + - pip=20.3.3 + - python=3.9.1 + - python-dateutil=2.8.1 + - python_abi=3.9 + - pytz=2020.5 + - readline=8.0 + - setuptools=49.6.0 + - simplejson=3.17.2 + - six=1.15.0 + - sqlite=3.34.0 + - tk=8.6.10 + - tzdata=2020e + - wheel=0.36.2 + - xz=5.2.5 + - zlib=1.2.11 +prefix: /home/dreval/miniconda3/envs/fill_segments diff --git a/envs/phylowgs/phylowgs_results.yaml b/envs/phylowgs/phylowgs_results.yaml new file mode 100644 index 00000000..f64126d7 --- /dev/null +++ b/envs/phylowgs/phylowgs_results.yaml @@ -0,0 +1,204 @@ +name: phylowgs_outputs +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.5.30 + - cairo=1.16.0 + - curl=7.77.0 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.19.1 + - ld_impl_linux-64=2.35.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.77.0 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.3 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1k + - pandoc=2.14.0.3 + - pango=1.48.6 + - pcre=8.45 + - pcre2=10.36 + - pixman=0.40.0 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.2.1 + - r-base=4.1.0 + - 
r-base64enc=0.1_3 + - r-blob=1.2.1 + - r-brio=1.1.2 + - r-broom=0.7.8 + - r-callr=3.7.0 + - r-cellranger=1.1.0 + - r-cli=3.0.0 + - r-clipr=0.7.1 + - r-colorspace=2.0_2 + - r-cpp11=0.3.1 + - r-crayon=1.4.1 + - r-curl=4.3.2 + - r-data.table=1.14.0 + - r-dbi=1.1.1 + - r-dbplyr=2.1.1 + - r-desc=1.3.0 + - r-diffobj=0.3.4 + - r-digest=0.6.27 + - r-dplyr=1.0.7 + - r-dtplyr=1.1.0 + - r-ellipsis=0.3.2 + - r-evaluate=0.14 + - r-fansi=0.4.2 + - r-farver=2.1.0 + - r-forcats=0.5.1 + - r-fs=1.5.0 + - r-gargle=1.1.0 + - r-generics=0.1.0 + - r-ggplot2=3.3.5 + - r-ggrepel=0.9.1 + - r-glue=1.4.2 + - r-googledrive=1.0.1 + - r-googlesheets4=0.3.0 + - r-gtable=0.3.0 + - r-haven=2.4.1 + - r-highr=0.9 + - r-hms=1.1.0 + - r-htmltools=0.5.1.1 + - r-httr=1.4.2 + - r-ids=1.0.1 + - r-isoband=0.2.4 + - r-jsonlite=1.7.2 + - r-knitr=1.33 + - r-labeling=0.4.2 + - r-lattice=0.20_44 + - r-lifecycle=1.0.0 + - r-lubridate=1.7.10 + - r-magrittr=2.0.1 + - r-markdown=1.1 + - r-mass=7.3_54 + - r-matrix=1.3_4 + - r-mgcv=1.8_36 + - r-mime=0.11 + - r-modelr=0.1.8 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-openssl=1.4.4 + - r-pillar=1.6.1 + - r-pkgconfig=2.0.3 + - r-pkgload=1.2.1 + - r-plyr=1.8.6 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.5.2 + - r-progress=1.2.2 + - r-ps=1.6.0 + - r-purrr=0.3.4 + - r-r6=2.5.0 + - r-rappdirs=0.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.6 + - r-readr=1.4.0 + - r-readxl=1.3.1 + - r-rematch=1.0.1 + - r-rematch2=2.1.2 + - r-reprex=2.0.0 + - r-reshape2=1.4.4 + - r-rjson=0.2.20 + - r-rlang=0.4.11 + - r-rmarkdown=2.9 + - r-rprojroot=2.0.2 + - r-rstudioapi=0.13 + - r-rvest=1.0.0 + - r-scales=1.1.1 + - r-selectr=0.4_2 + - r-stringi=1.6.2 + - r-stringr=1.4.0 + - r-sys=3.4 + - r-testthat=3.0.3 + - r-tibble=3.1.2 + - r-tidyr=1.1.3 + - r-tidyselect=1.1.1 + - r-tidyverse=1.3.1 + - r-tinytex=0.32 + - r-utf8=1.2.1 + - r-uuid=0.1_4 + - r-vctrs=0.3.8 + - r-viridislite=0.4.0 + - r-waldo=0.2.5 + - r-withr=2.4.2 + - r-xfun=0.24 + - r-xml2=1.3.2 + - r-yaml=2.2.1 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/phylowgs_outputs diff --git a/modules/phylowgs/1.0/config/default.yaml b/modules/phylowgs/1.0/config/default.yaml index bf4705b6..e8e0c1e4 100644 --- a/modules/phylowgs/1.0/config/default.yaml +++ b/modules/phylowgs/1.0/config/default.yaml @@ -4,25 +4,22 @@ lcr-modules: inputs: # Available wildcards: {tumour_id} {normal_id} {pair_status} {genome_build} {sample_id} - # IMPORTANT: PhyloWGS assumes the second sample in the VCF is the tumour. - # If this assumption is wrong, fix your VCF file. 
- vcf: "__UPDATE__" # Must be strelka- or mutect-formatted VCF file - tbi: "__UPDATE__" + maf: "__UPDATE__" cellularity: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt subclones: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt + drivers: "__UPDATE__" # newline-separated list of driver gene HUGO symbols to be included on plots scratch_subdirectories: [] options: create_inputs: opts: - "-s 5000 --verbose" - vcf_type: "__UPDATE__" # Usually either strelka or mutect_smchet + "-s 5000 --verbose --regions all" # -s controls how many variants should be sub-sampled. multievolve: "" write_results: "--include-ssm-names" switches: - # NOTE: You must include a "sex" column in the input samples table. + # NOTE: You must include a "sex" column in the input samples table, formatted with "M" and "F". # If patient sex is unknown, you can leave it empty and phyloWGS will run in "auto" mode. sex: M: "male" @@ -30,20 +27,31 @@ lcr-modules: conda_envs: phylowgs: "{MODSDIR}/envs/phylowgs.yaml" + phylowgs_results: "{MODSDIR}/envs/phylowgs_results.yaml" + fill_battenberg: "{MODSDIR}/envs/fill_battenberg.yaml" + vcf2maf: "{MODSDIR}/envs/vcf2maf-1.6.18.yaml" + bcftools: "{MODSDIR}/envs/bcftools-1.10.2.yaml" + coreutils: "{MODSDIR}/envs/coreutils-8.31.yaml" + + scripts: + fill_battenberg: "{MODSDIR}/src/fill_battenberg.py" + arm_file: "{MODSDIR}/etc/chromArmFiles/chromArm.{genome_build}.tsv" + process_outputs: "src/process_phyloWGS_outputs.R" + noncoding: "{MODSDIR}/etc/noncoding.txt" threads: create_inputs: 1 - multievolve: 24 + multievolve: 4 write_results: 1 resources: create_inputs: - mem_mb: 2000 + mem_mb: 10000 multievolve: mem_mb: 40000 evolve: 1 write_results: - mem_mb: 2000 + mem_mb: 20000 pairing_config: genome: diff --git a/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml new file mode 120000 index 00000000..72959e7b --- /dev/null +++ b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/coreutils-8.31.yaml b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml new file mode 120000 index 00000000..050452f7 --- /dev/null +++ b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml @@ -0,0 +1 @@ +../../../../envs/coreutils/coreutils-8.31.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/fill_battenberg.yaml b/modules/phylowgs/1.0/envs/fill_battenberg.yaml new file mode 120000 index 00000000..e667a8b1 --- /dev/null +++ b/modules/phylowgs/1.0/envs/fill_battenberg.yaml @@ -0,0 +1 @@ +../../../../envs/phylowgs/fill_battenberg.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/phylowgs_results.yaml b/modules/phylowgs/1.0/envs/phylowgs_results.yaml new file mode 120000 index 00000000..926ec438 --- /dev/null +++ b/modules/phylowgs/1.0/envs/phylowgs_results.yaml @@ -0,0 +1 @@ +../../../../envs/phylowgs/phylowgs_results.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml new file mode 120000 index 00000000..829077c7 --- /dev/null +++ b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml @@ -0,0 +1 @@ +../../../../envs/vcf2maf/vcf2maf-1.6.18.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv new file 
mode 100644 index 00000000..91da51a7 --- /dev/null +++ b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv @@ -0,0 +1,49 @@ +chromosome start end arm +1 10000 121500000 p +1 142600000 249250621 q +2 10000 90500000 p +2 96800000 243199373 q +3 10000 87900000 p +3 98300000 198022430 q +4 10000 48200000 p +4 52700000 191154276 q +5 10000 46100000 p +5 50700000 180915260 q +6 10000 58700000 p +6 63300000 171115067 q +7 10000 58000000 p +7 61700000 159138663 q +8 10000 43100000 p +8 48100000 146364022 q +9 10000 47300000 p +9 65900000 141213431 q +10 10000 38000000 p +10 42300000 135534747 q +11 10000 51600000 p +11 55700000 135006516 q +12 10000 33300000 p +12 38200000 133851895 q +13 10000 16000000 p +13 19500000 115169878 q +14 10000 14000000 p +14 19100000 107349540 q +15 10000 14000000 p +15 20700000 102531392 q +16 10000 34600000 p +16 47000000 90354753 q +17 10000 22200000 p +17 25800000 81195210 q +18 10000 15400000 p +18 19000000 78077248 q +19 10000 20000000 p +19 32400000 59128983 q +20 10000 25600000 p +20 29400000 63025520 q +21 10000 10000000 p +21 14300000 48129895 q +22 10000 11900000 p +22 17900000 51304566 q +X 10000 58100000 p +X 63000000 155270560 q +Y 10000 11600000 p +Y 13400000 28800000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv new file mode 100644 index 00000000..58b866e2 --- /dev/null +++ b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv @@ -0,0 +1,49 @@ +chromosome start end arm +1 10000 121700000 p +1 143200000 248956422 q +2 0 91800000 p +2 96000000 242193529 q +3 0 87800000 p +3 98600000 198295559 q +4 0 48200000 p +4 51800000 190214555 q +5 0 46100000 p +5 51400000 181538259 q +6 0 58500000 p +6 62600000 170805979 q +7 0 58100000 p +7 62100000 159345973 q +8 0 43200000 p +8 47200000 145138636 q +9 0 42200000 p +9 61500000 138394717 q +10 10000 38000000 p +10 41600000 133797422 q +11 10000 51000000 p +11 55800000 135086622 q +12 10000 33200000 p +12 37800000 133275309 q +13 10000 16000000 p +13 18900000 114364328 q +14 10000 16000000 p +14 18200000 107043718 q +15 10000 16000000 p +15 20500000 101991189 q +16 0 35300000 p +16 47000000 90338345 q +17 0 22700000 p +17 27400000 83257441 q +18 0 15400000 p +18 21500000 80373285 q +19 0 19900000 p +19 31900000 58617616 q +20 0 25700000 p +20 30400000 64444167 q +21 0 10500000 p +21 13000000 46709983 q +22 10000 14000000 p +22 17400000 50818468 q +X 0 58100000 p +X 63800000 156040895 q +Y 0 10300000 p +Y 10600000 26600000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv new file mode 100644 index 00000000..a3c8be28 --- /dev/null +++ b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv @@ -0,0 +1,49 @@ +chromosome start end arm +chr1 10000 121500000 p +chr1 142600000 249250621 q +chr2 10000 90500000 p +chr2 96800000 243199373 q +chr3 10000 87900000 p +chr3 98300000 198022430 q +chr4 10000 48200000 p +chr4 52700000 191154276 q +chr5 10000 46100000 p +chr5 50700000 180915260 q +chr6 10000 58700000 p +chr6 63300000 171115067 q +chr7 10000 58000000 p +chr7 61700000 159138663 q +chr8 10000 43100000 p +chr8 48100000 146364022 q +chr9 10000 47300000 p +chr9 65900000 141213431 q +chr10 10000 38000000 p +chr10 42300000 135534747 q +chr11 10000 51600000 p +chr11 55700000 135006516 q +chr12 10000 33300000 p +chr12 38200000 133851895 q +chr13 10000 16000000 p +chr13 19500000 115169878 q +chr14 10000 14000000 p +chr14 19100000 107349540 q 
+chr15 10000 14000000 p +chr15 20700000 102531392 q +chr16 10000 34600000 p +chr16 47000000 90354753 q +chr17 10000 22200000 p +chr17 25800000 81195210 q +chr18 10000 15400000 p +chr18 19000000 78077248 q +chr19 10000 20000000 p +chr19 32400000 59128983 q +chr20 10000 25600000 p +chr20 29400000 63025520 q +chr21 10000 10000000 p +chr21 14300000 48129895 q +chr22 10000 11900000 p +chr22 17900000 51304566 q +chrX 10000 58100000 p +chrX 63000000 155270560 q +chrY 10000 11600000 p +chrY 13400000 28800000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv new file mode 100644 index 00000000..4b5d7b6a --- /dev/null +++ b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv @@ -0,0 +1,49 @@ +chromosome start end arm +chr1 10000 121700000 p +chr1 143200000 248956422 q +chr2 0 91800000 p +chr2 96000000 242193529 q +chr3 0 87800000 p +chr3 98600000 198295559 q +chr4 0 48200000 p +chr4 51800000 190214555 q +chr5 0 46100000 p +chr5 51400000 181538259 q +chr6 0 58500000 p +chr6 62600000 170805979 q +chr7 0 58100000 p +chr7 62100000 159345973 q +chr8 0 43200000 p +chr8 47200000 145138636 q +chr9 0 42200000 p +chr9 61500000 138394717 q +chr10 10000 38000000 p +chr10 41600000 133797422 q +chr11 10000 51000000 p +chr11 55800000 135086622 q +chr12 10000 33200000 p +chr12 37800000 133275309 q +chr13 10000 16000000 p +chr13 18900000 114364328 q +chr14 10000 16000000 p +chr14 18200000 107043718 q +chr15 10000 16000000 p +chr15 20500000 101991189 q +chr16 0 35300000 p +chr16 47000000 90338345 q +chr17 0 22700000 p +chr17 27400000 83257441 q +chr18 0 15400000 p +chr18 21500000 80373285 q +chr19 0 19900000 p +chr19 31900000 58617616 q +chr20 0 25700000 p +chr20 30400000 64444167 q +chr21 0 10500000 p +chr21 13000000 46709983 q +chr22 10000 14000000 p +chr22 17400000 50818468 q +chrX 0 58100000 p +chrX 63800000 156040895 q +chrY 0 10300000 p +chrY 10600000 26600000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv new file mode 120000 index 00000000..c8477855 --- /dev/null +++ b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv @@ -0,0 +1 @@ +chromArm.grch37.tsv \ No newline at end of file diff --git a/modules/phylowgs/1.0/etc/noncoding.txt b/modules/phylowgs/1.0/etc/noncoding.txt new file mode 100644 index 00000000..258f27f0 --- /dev/null +++ b/modules/phylowgs/1.0/etc/noncoding.txt @@ -0,0 +1,8 @@ +Hugo_Symbol +Silent +RNA +IGR +Intron +5'Flank +3'Flank +5'UTR diff --git a/modules/phylowgs/1.0/src/create_phylowgs_inputs.py b/modules/phylowgs/1.0/src/create_phylowgs_inputs.py deleted file mode 100755 index f9ee1474..00000000 --- a/modules/phylowgs/1.0/src/create_phylowgs_inputs.py +++ /dev/null @@ -1,1356 +0,0 @@ -#!/usr/bin/env python2 -from __future__ import print_function - -# Requires PyVCF. To install: pip2 install pyvcf -import vcf -import argparse -import csv -from collections import defaultdict, namedtuple, OrderedDict -import random -import sys -import numpy as np -import numpy.ma as ma -import json -from scipy.stats.mstats import gmean - -VariantId = namedtuple('VariantId', ['CHROM', 'POS']) - -class ReadCountsUnavailableError(Exception): - pass - -class VariantParser(object): - def __init__(self): - # Child classes must give the following variables sensible values in - # constructor so that list_variants() works subsequently. 
- self._cnvs = None - self._vcf_filename = None - - def list_variants(self): - variants = self._filter(self._vcf_filename) - variants_and_reads = [] - for variant in variants: - try: - ref_reads, total_reads = self._calc_read_counts(variant) - except ReadCountsUnavailableError as exc: - log('Read counts unavailable for %s_%s' % (variant.CHROM, variant.POS)) - continue - variants_and_reads.append((variant, ref_reads, total_reads)) - return variants_and_reads - - def _calc_read_counts(self, variant): - raise Exception('Not implemented -- use child class') - - def _parse_vcf(self, vcf_filename): - vcfr = vcf.Reader(filename=vcf_filename) - records = [] - for variant in vcfr: - variant.CHROM = variant.CHROM.upper() - # Some VCF dialects prepend "chr", some don't. Remove the prefix to - # standardize. - if variant.CHROM.startswith('CHR'): - variant.CHROM = variant.CHROM[3:] - records.append(variant) - return records - - def _does_variant_pass_filters(self, variant): - if variant.FILTER is None: - return True - if len(variant.FILTER) > 0: - # Variant failed one or more filters. - return False - return True - - def _filter(self, vcf_filename): - variants = [] - - all_variants = self._parse_vcf(vcf_filename) - - for variant in all_variants: - if not is_good_chrom(variant.CHROM): - continue - if not self._does_variant_pass_filters(variant): - continue - variants.append(variant) - return variants - - def _get_tumor_index(self, variant, tumor_sample=None): - """Find the index of the tumor sample. - - Currently hardcodes tumour sample as the last column if name not specified. - Might not always be true - """ - if self._tumor_sample: - tumor_is = [i for i, s in enumerate(variant.samples) if s.sample == tumor_sample] - assert len(tumor_is) == 1, "Did not find tumor name %s in samples" % tumor_sample - return tumor_is[0] - else: - # Don't make this -1, as some code assumes it will be >= 0. - return len(variant.samples) - 1 - -class SangerParser(VariantParser): - ''' - Works with PCAWG variant calls from the Sanger. - ''' - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _find_ref_and_variant_nt(self, variant): - assert len(variant.REF) == len(variant.ALT) == 1 - return (str(variant.REF[0]), str(variant.ALT[0])) - - def _calc_read_counts(self, variant): - normal = variant.genotype('NORMAL') - tumor = variant.genotype('TUMOUR') - - reference_nt, variant_nt = self._find_ref_and_variant_nt(variant) - tumor_reads = { - 'forward': { - 'A': int(tumor['FAZ']), - 'C': int(tumor['FCZ']), - 'G': int(tumor['FGZ']), - 'T': int(tumor['FTZ']), - }, - 'reverse': { - 'A': int(tumor['RAZ']), - 'C': int(tumor['RCZ']), - 'G': int(tumor['RGZ']), - 'T': int(tumor['RTZ']), - }, - } - - ref_reads = tumor_reads['forward'][reference_nt] + tumor_reads['reverse'][reference_nt] - # For now, variant reads are defined as only the non-reference nucleotide in - # the inferred tumor SNP. We ignore reads of a third or fourth base. 
- variant_reads = tumor_reads['forward'][variant_nt] + tumor_reads['reverse'][variant_nt] - total_reads = ref_reads + variant_reads - - return (ref_reads, total_reads) - -class PcawgConsensusParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _find_ref_and_variant_nt(self, variant): - assert len(variant.REF) == len(variant.ALT) == 1 - return (str(variant.REF[0]), str(variant.ALT[0])) - - def _calc_read_counts(self, variant): - if not ('t_alt_count' in variant.INFO and 't_ref_count' in variant.INFO): - raise ReadCountsUnavailableError() - assert len(variant.INFO['t_alt_count']) == len(variant.INFO['t_ref_count']) == 1 - - alt_reads = int(variant.INFO['t_alt_count'][0]) - ref_reads = int(variant.INFO['t_ref_count'][0]) - total_reads = alt_reads + ref_reads - # Some variants havezero alt and ref reads. - if total_reads == 0: - raise ReadCountsUnavailableError() - return (ref_reads, total_reads) - -class MuseParser(VariantParser): - def __init__(self, vcf_filename, tier=0, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tier = tier - self._tumor_sample = tumor_sample - - def _get_normal_genotype(self, variant): - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - assert tumor_i in (0, 1), 'Tumor index %s is not 0 or 1' % tumor_i - normal_i = 1 - tumor_i - return set([int(t) for t in variant.samples[normal_i]['GT'].split('/')]) - - def _calc_read_counts(self, variant): - normal_gt = self._get_normal_genotype(variant) - assert len(normal_gt) == 1 - normal_gt = normal_gt.pop() - - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - total_reads = int(variant.samples[tumor_i]['DP']) - ref_reads = int(variant.samples[tumor_i]['AD'][normal_gt]) - - return (ref_reads, total_reads) - - def _does_variant_pass_filters(self, variant): - # Ignore heterozygous normal variants. - if len(self._get_normal_genotype(variant)) != 1: - return False - if variant.FILTER is None or len(variant.FILTER) == 0: - return True - if int(variant.FILTER[0][-1]) <= self._tier: - # Variant failed one or more filters, but we still accept it. 
- return True - return False - -class StrelkaParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _does_variant_pass_filters(self, variant): - # Strelka outputs two files one for SNPs, the other for InDels - # For now only deal with SNP file from Strelka - if variant.is_snp: - if variant.FILTER is None or len(variant.FILTER) == 0: - return True - return False - - def _calc_read_counts(self, variant): - alt = variant.ALT[0] - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - total_reads = int(variant.samples[tumor_i]['DP']) - - if alt is None: - total_reads = 0 - variant_reads = 0 - else: - variant_reads = int(getattr(variant.samples[tumor_i].data, str(alt)+'U')[0]) - - ref_reads = total_reads - variant_reads - return (ref_reads, total_reads) - -class SomSnipParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _calc_read_counts(self, variant): - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - highqual_reads = (variant.samples[tumor_i]['DP4']) - assert len(highqual_reads) == 4 - - ref_reads = int(highqual_reads[0]) + int(highqual_reads[1]) - variant_reads = int(highqual_reads[2]) + int(highqual_reads[3]) - - return (ref_reads, ref_reads + variant_reads) - -class MutectTcgaParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _calc_read_counts(self, variant): - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - # TD: Tumor allelic depths for the ref and alt alleles in the order listed - ref_reads, variant_reads = variant.samples[tumor_i]['TD'] - total_reads = ref_reads + variant_reads - return (ref_reads, total_reads) - -class MutectPcawgParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _calc_read_counts(self, variant): - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - ref_reads = int(variant.samples[tumor_i].data.ref_count) - variant_reads = int(variant.samples[tumor_i].data.alt_count) - total_reads = ref_reads + variant_reads - - return (ref_reads, total_reads) - -class MutectSmchetParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _calc_read_counts(self, variant): - tumor_i = self._get_tumor_index(variant, self._tumor_sample) - ref_reads = int(variant.samples[tumor_i]['AD'][0]) - variant_reads = int(variant.samples[tumor_i]['AD'][1]) - total_reads = ref_reads + variant_reads - - return (ref_reads, total_reads) - -class VarDictParser(MutectSmchetParser): - """Support VarDict somatic variant caller. - - https://github.com/AstraZeneca-NGS/VarDictJava - https://github.com/AstraZeneca-NGS/VarDict - - Uses the same read-extraction logic as MuTect (SMC-Het). - """ - pass - -class DKFZParser(VariantParser): - def __init__(self, vcf_filename, tumor_sample=None): - self._vcf_filename = vcf_filename - self._tumor_sample = tumor_sample - - def _calc_read_counts(self, variant): - # This doesn't handle multisample correctly, as I don't know how to get the - # DP4 attribute on multiple DKFZ samples currently. 
- for_ref_reads = int(variant.INFO['DP4'][0]) - back_ref_reads = int(variant.INFO['DP4'][1]) - for_variant_reads = int(variant.INFO['DP4'][2]) - back_variant_reads = int(variant.INFO['DP4'][3]) - ref_reads = for_ref_reads + back_ref_reads - var_reads = for_variant_reads + back_variant_reads - total_reads = ref_reads + var_reads - - return (ref_reads, total_reads) - -class CnvFormatter(object): - def __init__(self, read_depth, sampidxs, hetsnp_rate): - self._read_depth = read_depth - self._sampidxs = sampidxs - self._hetsnp_rate = hetsnp_rate - - def _find_overlapping_variants(self, chrom, cnv, variants): - overlapping = [] - - start = cnv['start'] - end = cnv['end'] - for variant in variants: - if chrom.upper() == variant['chrom'].upper(): - if start <= variant['pos'] <= end: - overlapping.append(variant['ssm_id']) - return overlapping - - def _calc_ref_reads(self, cellular_prev, total_reads): - ref_reads = np.zeros(len(self._sampidxs), dtype=np.int64) - for sampidx in self._sampidxs: - vaf = cellular_prev[sampidx] / 2 - ref_reads[sampidx] = int((1 - vaf) * total_reads[sampidx]) - return ref_reads - - def _calc_total_reads(self, locus_start, locus_end): - def _calc(samp_read_depth): - # We estimate 7 heterozygous SNPs per 10 kb, which goes as input to CNA - # algorithms. Thus, we determine how many SNPs are equivalent to a region - # of the given size, then weight accordingly. - assert locus_start < locus_end - # Figure out approximately equivalent number of SSMs to this region. - equiv_ssms = (locus_end - locus_start) * self._hetsnp_rate - return int(np.round(equiv_ssms * samp_read_depth)) - - D = [_calc(self._read_depth[sampidx]) for sampidx in self._sampidxs] - return self._cap_cnv_D(D) - - def _format_overlapping_variants(self, variants, maj_cn, min_cn): - assert len(set(maj_cn)) == len(set(min_cn)) == 1 - variants = [(ssm_id, str(min_cn[0]), str(maj_cn[0])) for ssm_id in variants] - return variants - - def _cap_cnv_D(self, D): - # Average tumour has ~3k SSMs, so say that a CNA region should be - # equivalent to no more than this. - avg_ssms_in_tumour = 3000 - D_max = np.round(avg_ssms_in_tumour * self._read_depth).astype(np.int) - D_min = 1 - - D = np.minimum(D_max, D) - D = np.maximum(D_min, D) - return D - - def _format_cnvs(self, cnvs, variants): - log('Estimated read depth: %s' % self._read_depth) - - for chrom, chrom_cnvs in cnvs.items(): - for cnv in chrom_cnvs: - overlapping_variants = self._find_overlapping_variants(chrom, cnv, variants) - total_reads = self._calc_total_reads(cnv['start'], cnv['end']) - ref_reads = self._calc_ref_reads(cnv['cell_prev'], total_reads) - yield { - 'chrom': chrom, - 'start': cnv['start'], - 'end': cnv['end'], - 'major_cn': cnv['major_cn'], - 'minor_cn': cnv['minor_cn'], - 'cellular_prevalence': cnv['cell_prev'], - 'ref_reads': ref_reads, - 'total_reads': total_reads, - 'overlapping_variants': self._format_overlapping_variants(overlapping_variants, cnv['major_cn'], cnv['minor_cn']) - } - - def _merge_variants(self, cnv1, cnv2): - cnv1_variant_names = set([v[0] for v in cnv1['overlapping_variants']]) - for variant in cnv2['overlapping_variants']: - variant_name = variant[0] - if variant_name not in cnv1_variant_names: - cnv1['overlapping_variants'].append(variant) - else: - # If variant already in cnv1's list, ignore it. This should only occur - # if two subclonal CNVs have close to 0.5 frequency each. In this case, - # we lose information about major/minor status of the cnv2 relative to - # its SSMs. 
- log('%s already in %s' % (variant, cnv1['cnv_id'])) - - # CNVs with similar a/d values should not be free to move around the - # phylogeny independently, and so we merge them into a single entity. We may - # do the same with SNVs bearing similar frequencies later on. - def format_and_merge_cnvs(self, cnvs, variants, cellularity): - formatted = list(self._format_cnvs(cnvs, variants)) - formatted.sort(key = lambda f: f['cellular_prevalence'][0], reverse = True) - if len(formatted) == 0: - return [] - - for cnv in formatted: - physical_cnvs = OrderedDict() - for K in ('chrom', 'start', 'end', 'major_cn', 'minor_cn'): - physical_cnvs[K] = cnv[K] - - assert len(set(physical_cnvs['major_cn'])) == len(set(physical_cnvs['major_cn'])) == 1 - physical_cnvs['major_cn'] = physical_cnvs['major_cn'][0] - physical_cnvs['minor_cn'] = physical_cnvs['minor_cn'][0] - - physical_cnvs['cell_prev'] = '|'.join([str(C) for C in cnv['cellular_prevalence']]) - cnv['physical_cnvs'] = ','.join(['%s=%s' % (K, physical_cnvs[K]) for K in physical_cnvs.keys()]) - - merged, formatted = formatted[:1], formatted[1:] - merged[0]['cnv_id'] = 'c0' - counter = 1 - - for current in formatted: - last = merged[-1] - assert np.all(current['cellular_prevalence'] <= cellularity) and np.all(last['cellular_prevalence'] <= cellularity) - - # Only merge CNVs if they're clonal. If they're subclonal, leave them - # free to move around the tree. - if np.array_equal(current['cellular_prevalence'], last['cellular_prevalence']) \ - and np.array_equal(last['cellular_prevalence'], cellularity): - # Merge the CNVs. - log('Merging %s_%s and %s_%s' % (current['chrom'], current['start'], last['chrom'], last['start'])) - last['total_reads'] = self._cap_cnv_D(current['total_reads'] + last['total_reads']) - last['ref_reads'] = self._calc_ref_reads(last['cellular_prevalence'], last['total_reads']) - last['physical_cnvs'] += ';' + current['physical_cnvs'] - self._merge_variants(last, current) - else: - # Do not merge the CNVs. - current['cnv_id'] = 'c%s' % counter - merged.append(current) - counter += 1 - - return merged - -class VariantFormatter(object): - def __init__(self): - self._counter = 0 - - def _split_types(self, genotype): - types = [int(e) for e in genotype.split('/')] - if len(types) != 2: - raise Exception('Not diploid: %s' % types) - return types - - def _calc_ref_freq(self, ref_genotype, error_rate): - types = self._split_types(ref_genotype) - num_ref = len([t for t in types if t == 0]) - freq = (num_ref / 2) - error_rate - if freq < 0: - freq = 0.0 - if freq > 1: - raise Exception('Nonsensical frequency: %s' % freq) - return freq - - def format_variants(self, variants, ref_read_counts, total_read_counts, error_rate, sex): - for variant_idx, variant in enumerate(variants): - ssm_id = 's%s' % self._counter - if hasattr(variant, 'ID') and variant.ID is not None: - # This field will be defined by PyVCF, but not by our VariantId named - # tuple that we have switched to, so this code will never actually run. - # TODO: fix that. - variant_name = variant.ID - else: - variant_name = '%s_%s' % (variant.CHROM, variant.POS) - - # TODO: switch back to using calc_ref_freq() when we no longer want mu_r - # and mu_v fixed. - # This is mu_r in PhyloWGS. - expected_ref_freq = 1 - error_rate - if variant.CHROM in ('Y', 'M') or (variant.CHROM == 'X' and sex == 'male'): - # Haploid, so should only see non-variants when sequencing error - # occurred. 
Note that chrY and chrM are always haploid; chrX is haploid - # only in men, so script must know sex of patient to choose correct - # value. Currently, I just assume that all data comes from men. - # - # This is mu_v in PhyloWGS. - expected_var_freq = error_rate - else: - # Diploid, so should see variants in (0.5 - error_rate) proportion of - # reads. - # - # This is mu_v in PhyloWGS. - expected_var_freq = 0.5 - error_rate - - yield { - 'ssm_id': ssm_id, - 'chrom': variant.CHROM, - 'pos': variant.POS, - 'variant_name': variant_name, - 'ref_reads': list(ref_read_counts[variant_idx,:]), - 'total_reads': list(total_read_counts[variant_idx,:]), - 'expected_ref_freq': expected_ref_freq, - 'expected_var_freq': expected_var_freq, - } - self._counter += 1 - -def restricted_float(x): - x = float(x) - if x < 0.0 or x > 1.0: - raise argparse.ArgumentTypeError('%r not in range [0.0, 1.0]' % x) - return x - -def chrom_key(chrom): - if chrom.isdigit(): - return int(chrom) - elif chrom == 'X': - return 100 - elif chrom == 'Y': - return 101 - else: - raise Exception('Unknown chrom: %s' % chrom) - -def variant_key(var): - chrom = chrom_key(var.CHROM) - return (chrom, var.POS) - -class Segmenter(object): - def _organize_cnvs(self, cnv_set): - organized = defaultdict(list) - - for sampidx, cnvs in enumerate(cnv_set): - for chrom, chrom_cnvs in cnvs.items(): - for cnv in chrom_cnvs: - organized[chrom].append({ - 'sample': sampidx, - 'start': cnv['start'], - 'end': cnv['end'], - 'major_cn': cnv['major_cn'], - 'minor_cn': cnv['minor_cn'], - 'cell_prev': cnv['cellular_prevalence'] - }) - - for chrom, cnvs in organized.items(): - # Intervals may not be sorted in input file. - cnvs.sort(key = lambda c: c['start']) - - return organized - - def _create_intervals(self, cnv_set): - # intervals[chrom][(major, minor)] - intervals = defaultdict(list) - min_size_for_inclusion = 1 - - for chrom, cnvs in cnv_set.items(): - for cnv in cnvs: - # We sorted above to place start coordinates after end coordinates. But - # if a CNV was listed with *the same* start and end position (meaning a - # zero-length record, assuming intervals that are left-closed but - # right-open), we will encounter the end for that record before its - # start. As such, the "open_samples.remove()" call below will fail, as - # the given intervals will not have been opened when we encounter its - # end. - # - # Note the above assumes a half-open interpretation of intervals. I - # don't think I implemented this -- if I recall, the code dealing with - # CNVs (such as determining SSM overlap) assumes fully-closed intervals - # (i.e., it doesn't check if cnv.start <= ssm.locus <= (cnv.end + 1)). - # Normally this doesn't matter, given the low resolution of CNV calls - # -- we should never encounter such small intervals. But a pathological - # case in which CNV inputs had the same start & end coordinates for - # some intervals revealed that the code crashes on this input. We - # should provide a more informative error in such cases, which the - # following assertion does. - assert cnv['start'] < cnv['end'], ('In CNV %s, start position occurs at or after the end position' % cnv) - - start_pos = [(c['start'], 'start', (c['sample'], c['cell_prev'], c['major_cn'], c['minor_cn'])) for c in cnvs] - end_pos = [(c['end'], 'end', (c['sample'], c['cell_prev'], c['major_cn'], c['minor_cn'])) for c in cnvs] - - # True > False, so this sorting will place start positions after end - # positions if both have same coordinate. 
- positions = sorted(start_pos + end_pos, key = lambda e: (e[0], e[1] == 'start')) - assert len(positions) >= 2, 'Fewer than two positions in %s' % positions - - # prev_pos is updated each time we move to a new coordinate on the - # chromosome. Multiple start or end points may be associated with any - # given coordinate. - prev_pos = None - open_samples = [] - idx = 0 - - while idx < len(positions): - points_at_locus = [positions[idx]] - locus = points_at_locus[0][0] - - # Gather all interval breakpoints at this locus. - while True: - assert positions[idx][0] >= locus - idx += 1 - if idx == len(positions) or positions[idx][0] > locus: - break - points_at_locus.append(positions[idx]) - - if prev_pos is None: - assert len(open_samples) == 0 - - if len(open_samples) > 0: - # If some samples are already open from previous loci (such that - # last_pos will not be None), add this interval. - assert locus > prev_pos - interval = (prev_pos, locus) - if interval[1] - interval[0] > min_size_for_inclusion: - intervals[chrom].append((interval[0], interval[1], sorted(open_samples))) - else: - # All points should be start points. - assert set([i[1] for i in points_at_locus]) == set(['start']) - - prev_pos = locus - - # Update open_samples in accordance with whether each breakpoint at - # this locus starts or ends an interval. - for pos, pt_type, (sampidx, cell_prev, major_cn, minor_cn) in points_at_locus: - if pt_type == 'start': - log('Adding ', (pos, pt_type, sampidx, cell_prev, major_cn, minor_cn)) - open_samples.append((sampidx, cell_prev, major_cn, minor_cn)) - elif pt_type == 'end': - log('Removing ', (pos, pt_type, sampidx, cell_prev, major_cn, minor_cn)) - open_samples.remove((sampidx, cell_prev, major_cn, minor_cn)) - else: - raise Exception('Unknown point type: %s' % pt_type) - - assert len(open_samples) == 0 - - return intervals - - def _merge_adjacent(self, cncalls, allowed_gap = 0): - cncalls.sort(key = lambda c: (Util.chrom_key(c['chrom']), c['start'])) - merged = [] - idx = 0 - while idx < len(cncalls): - adjacent = [cncalls[idx]] - idx += 1 - - while idx < len(cncalls) and \ - cncalls[idx]['chrom'] == adjacent[-1]['chrom'] and \ - cncalls[idx]['major_cn'] == adjacent[-1]['major_cn'] and \ - cncalls[idx]['minor_cn'] == adjacent[-1]['minor_cn'] and \ - 0 <= cncalls[idx]['start'] - adjacent[-1]['end'] <= allowed_gap: - adjacent.append(cncalls[idx]) - idx += 1 - - if len(adjacent) > 1: - log('Merging ', adjacent) - copy = dict(adjacent[0]) - copy['end'] = adjacent[-1]['end'] - merged.append(copy) - else: - merged.append(adjacent[0]) - - return merged - - def segment(self, cn_calls): - # Merge adjacent CNVs here rather than when data loaded, as what can be - # merged will be determined by what tetraploidy correction, if any, is - # applied to the data. 
- #for sampidx, cnvs in enumerate(cn_calls): - #cn_calls[sampidx] = self._merge_adjacent(cnvs) - organized = self._organize_cnvs(cn_calls) - return self._create_intervals(organized) - -class MultisampleCnvCombiner(object): - def __init__(self, cn_regions, cellularity, sex): - self.sampidxs = set(range(len(cn_regions))) - segments = Segmenter().segment(cn_regions) - self._cnvs = self._reformat_segments_as_cnvs(segments) - self._cellularity = cellularity - self._sex = sex - - def _reformat_segments_as_cnvs(self, segments): - reformatted = defaultdict(list) - _retrieve_val = lambda idx: np.array(zip(*open_samples)[idx]) - - for chrom, chrom_cnvs in segments.items(): - for start, end, open_samples in chrom_cnvs: - sampidx = _retrieve_val(0) - cell_prev = _retrieve_val(1) - major_cn = _retrieve_val(2) - minor_cn = _retrieve_val(3) - cnv = { - 'start': start, - 'end': end, - 'cell_prev': cell_prev, - 'major_cn': major_cn, - 'minor_cn': minor_cn, - 'sampidx': sampidx, - } - reformatted[chrom].append(cnv) - - return reformatted - - def _ensure_no_overlap(self, cnvs): - for chrom, chrom_cnvs in cnvs.items(): - for idx in range(len(chrom_cnvs) - 1): - current, next = chrom_cnvs[idx], chrom_cnvs[idx + 1] - assert current['start'] < current['end'] <= next['start'] < next['end'] - - def _is_region_normal_cn(self, chrom, major, minor): - return self._is_multisample_region_normal_cn(chrom, [major], [minor]) - - def _is_multisample_region_normal_cn(self, chrom, major, minor): - normal_major = set([1]) - if self._sex == 'male' and chrom in (('X', 'Y')): - normal_minor = set([0]) - else: - normal_minor = set([1]) - return set(major) == normal_major and set(minor) == normal_minor - - def _get_abnormal_state_for_all_samples(self, chrom, cnv): - '''On a per-sample basis, record which samples report the CNA is abnormal - CN, and which report it is normal CN. If multiple different abnormal states - occur in different samples, return None.''' - # All samples must have at least one record for this region, or don't - # include it. - if set(cnv['sampidx']) != self.sampidxs: - return None - - abnormal_state = None - filtered = [] - - for sampidx, cell_prev, major, minor in zip(cnv['sampidx'], cnv['cell_prev'], cnv['major_cn'], cnv['minor_cn']): - # Region may be (clonal or subclonal) normal in a sample, so ignore such records. - if self._is_region_normal_cn(chrom, major, minor): - continue - - # Either we haven't observed an abnormal CN state in this region before, - # or the observed abnormal state matches what we've already seen. - if abnormal_state is None or abnormal_state == (major, minor): - abnormal_state = (major, minor) - filtered.append({'sampidx': sampidx, 'cell_prev': cell_prev, 'major_cn': major, 'minor_cn': minor}) - continue - # The abnormal state (i.e., major & minor alleles) is *different* from - # what we've seen before. The PWGS model doesn't currently account for - # such cases, so ignore the region. - else: - return None - - # None of the observed records were abnormal -- i.e., all samples report - # the region is normal. Reject the region. - if abnormal_state is None: - return None - - retained_sampidxs = [F['sampidx'] for F in filtered] - # Sanity check: when we originally parsed the CNVs, the samples should have - # been added in order, and that ought not to have changed. - assert retained_sampidxs == sorted(retained_sampidxs) - # Sanity check: we should have no duplicate samples. 
While a given sample - # may report any number of records for a region, above we discarded normal - # regions, and ensured that only one abnormal state exists in all samples. - # Thus, we should have no more than one record per sample for this region. - assert len(retained_sampidxs) == len(set(retained_sampidxs)) - - # Add a record for all samples that reported this region as clonal normal. - cell_prev_when_absent = 0 - for missing_sampidx in self.sampidxs - set(retained_sampidxs): - filtered.append({ - 'sampidx': missing_sampidx, - 'cell_prev': cell_prev_when_absent, - 'major_cn': abnormal_state[0], - 'minor_cn': abnormal_state[1] - }) - # Sort by sampidx. - filtered.sort(key = lambda F: F['sampidx']) - # Ensure all samples have one record. - assert len(filtered) == len(self.sampidxs) - - return filtered - - def load_single_abnormal_state_cnvs(self): - ''' - Return all regions that possess at most one abnormal state across samples. - E.g., given three samples, S_1 and S_3 report the region as (2, 1) (with - potentially different cellular prevalences), while S_2 lists it as clonal - (1, 1). In such an instance, the record for S_2 will *not* indicate the - region is normal. Instead, the S_2 record will show a state of (2, 1) with - a cellular prevalence of zero. This is done so that we can calculate - sensible `a` and `d` values for cnv_data.txt. - ''' - # In Battenberg, either one region is normal and the other abnormal, - # or both are abnormal. - # In TITAN, only one abnormal region will be listed, without a - # corresponding normal region. - abnormal_cnvs = defaultdict(list) - - for chrom, chrom_cnvs in self._cnvs.items(): - if not is_good_chrom(chrom): - continue - for cnv in chrom_cnvs: - states_for_all_samples = self._get_abnormal_state_for_all_samples(chrom, cnv) - if states_for_all_samples is None: - continue - - combined_states = { K: np.array([S[K] for S in states_for_all_samples]) for K in states_for_all_samples[0].keys() } - cnv.update(combined_states) - abnormal_cnvs[chrom].append(cnv) - abnormal_cnvs[chrom].sort(key = lambda C: C['start']) - - self._ensure_no_overlap(abnormal_cnvs) - return abnormal_cnvs - - def load_normal_cnvs(self): - ''' - Return all regions that are clonal normal across all samples. - ''' - normal_cnvs = defaultdict(list) - - for chrom, chrom_cnvs in self._cnvs.items(): - if not is_good_chrom(chrom): - continue - for cnv in chrom_cnvs: - if not self._is_multisample_region_normal_cn(chrom, cnv['major_cn'], cnv['minor_cn']): - continue - if not set(cnv['sampidx']) == self.sampidxs: - continue - if not np.array_equal(cnv['cell_prev'], self._cellularity): - # The region must be clonal normal to be retained. This check - # shouldn't be necessary, as we've already ensured all calls have - # major = minor = 1, but we perform it just to be thorough. - continue - normal_cnvs[chrom].append(cnv) - normal_cnvs[chrom].sort(key = lambda C: C['start']) - - self._ensure_no_overlap(normal_cnvs) - return normal_cnvs - - def load_cnvs(self): - ''' - Return both normal and abnormal regions. 
- ''' - combined = defaultdict(list) - - normal_cnvs = self.load_normal_cnvs() - abnormal_cnvs = self.load_single_abnormal_state_cnvs() - for chrom in set(normal_cnvs.keys()) | set(abnormal_cnvs.keys()): - combined[chrom] = normal_cnvs[chrom] + abnormal_cnvs[chrom] - combined[chrom].sort(key = lambda C: C['start']) - self._ensure_no_overlap(combined) - - return combined - -class VariantAndCnvGroup(object): - def __init__(self, hetsnp_rate): - self._multisamp_cnv = None - self._cellularity = None - self._hetsnp_rate = hetsnp_rate - - def add_variants(self, variants, ref_read_counts, total_read_counts): - self._variants = variants - # Ensure no duoplicates. - assert len(variants) == len(set(variants)) - # Note that self._variant_idxs will change as we filter out variants, - # reflecting only the remaining valid variants. self._variants, however, - # will not change. - self._variant_idxs = list(range(len(variants))) - self._ref_read_counts = ref_read_counts - self._total_read_counts = total_read_counts - # Estimate read depth before any filtering of variants is performed, in - # case no SSMs remain afterward. - self._estimated_read_depth = self._estimate_read_depth() - - def _find_cellularity(self, cnvs): - max_cellular_prevs = np.zeros(len(cnvs)) - - for sampidx, sample_cnvs in enumerate(cnvs): - for chrom_regions in sample_cnvs.values(): - for cnr in chrom_regions: - if cnr['cellular_prevalence'] > max_cellular_prevs[sampidx]: - max_cellular_prevs[sampidx] = cnr['cellular_prevalence'] - - return max_cellular_prevs - - def add_cnvs(self, cn_regions, sex): - self._cellularity = self._find_cellularity(cn_regions) - self._multisamp_cnv = MultisampleCnvCombiner(cn_regions, self._cellularity, sex) - self._sampidxs = self._multisamp_cnv.sampidxs - - def has_cnvs(self): - return self._multisamp_cnv is not None - - def _filter_variants_outside_regions(self, regions, before_label, after_label): - def _is_pos_in_regions(chrom, pos): - for cnv in regions[chrom]: - if cnv['start'] <= pos <= cnv['end']: - return True - return False - - filtered = [] - - for vidx in self._variant_idxs: - variant = self._variants[vidx] - if _is_pos_in_regions(variant.CHROM, variant.POS): - filtered.append(vidx) - - self._print_variant_differences( - [self._variants[idx] for idx in self._variant_idxs], - [self._variants[idx] for idx in filtered], - before_label, - after_label - ) - self._variant_idxs = filtered - - def _print_variant_differences(self, before, after, before_label, after_label): - before = set(before) - after = set(after) - log('%s=%s %s=%s delta=%s' % (before_label, len(before), after_label, len(after), len(before) - len(after))) - - assert after.issubset(before) - removed = list(before - after) - removed.sort(key = variant_key) - - def _print_region(var): - var_name = '%s_%s' % (var.CHROM, var.POS) - region_type = None - containing_cnv = None - - for cnv in self._multisamp_cnv.load_normal_cnvs()[var.CHROM]: - if cnv['start'] <= var.POS <= cnv['end']: - region_type = 'normal' - containing_cnv = cnv - break - for cnv in self._multisamp_cnv.load_single_abnormal_state_cnvs()[var.CHROM]: - if cnv['start'] <= var.POS <= cnv['end']: - assert region_type is None and containing_cnv is None - region_type = 'abnormal' - containing_cnv = cnv - break - - if containing_cnv is not None: - log('%s\t[in %s-CN region chr%s(%s, %s)]' % (var_name, region_type, var.CHROM, containing_cnv['start'], containing_cnv['end'])) - else: - log('%s\t[outside all regions]' % var_name) - - for var in removed: - _print_region(var) - - def 
retain_only_variants_in_normal_cn_regions(self): - if not self.has_cnvs(): - raise Exception('CN regions not yet provided') - - normal_cn = self._multisamp_cnv.load_normal_cnvs() - filtered = self._filter_variants_outside_regions(normal_cn, 'all_variants', 'only_normal_cn') - - def exclude_variants_in_multiple_abnormal_or_unlisted_regions(self): - # Battenberg: - # Five possible placements for variant in Battenberg according to CN records: - # 1 record: - # That record has normal CN: include - # That record has abnormal CN: include - # 2 records: - # One record is normal CN, one record is abnormal CN: include - # Both records are abnormal CN: exclude (as we don't know what order the CN events occurred in) - # TITAN: - # In output seen to date, TITAN will only list one record per region. If - # the CN state is abnormal and clonal_frac < 1, this implies the - # remainder of the region will be normal CN. Multiple abnormal records - # for the same region are likely possible, but I haven't yet seen any. - # Regardless, when they occur, they should be properly handled by the - # code. - if not self.has_cnvs(): - raise Exception('CN regions not yet provided') - - # If variant isn't listed in *any* region: exclude (as we suspect CNV - # caller didn't know what to do with the region). - self._filter_variants_outside_regions(self._multisamp_cnv.load_cnvs(), 'all_variants', 'within_cn_regions') - - def format_variants(self, sample_size, error_rate, priority_ssms, only_priority, sex): - if sample_size is None: - sample_size = len(self._variant_idxs) - random.shuffle(self._variant_idxs) - - subsampled, nonsubsampled = [], [] - variant_idx_map = {self._variants[idx]: idx for idx in self._variant_idxs} - used_variant_idxs = set() # Use a set for O(1) testing of membership. 
- - for prissm in priority_ssms: - if prissm not in variant_idx_map: - continue - if len(subsampled) >= sample_size: - break - log('%s_%s in priority' % (prissm.CHROM, prissm.POS)) - varidx = variant_idx_map[prissm] - used_variant_idxs.add(varidx) - subsampled.append(varidx) - - for variant_idx in self._variant_idxs: - if variant_idx in used_variant_idxs: - continue - used_variant_idxs.add(variant_idx) - variant = self._variants[variant_idx] - if (not only_priority) and len(subsampled) < sample_size: - subsampled.append(variant_idx) - else: - nonsubsampled.append(variant_idx) - - assert len(used_variant_idxs) == len(self._variant_idxs) == len(subsampled) + len(nonsubsampled) - - subsampled.sort(key = lambda idx: variant_key(self._variants[idx])) - subsampled_variants = get_elements_at_indices(self._variants, subsampled) - subsampled_ref_counts = self._ref_read_counts[subsampled,:] - subsampled_total_counts = self._total_read_counts[subsampled,:] - - nonsubsampled.sort(key = lambda idx: variant_key(self._variants[idx])) - nonsubsampled_variants = get_elements_at_indices(self._variants, nonsubsampled) - nonsubsampled_ref_counts = self._ref_read_counts[nonsubsampled,:] - nonsubsampled_total_counts = self._total_read_counts[nonsubsampled,:] - - formatter = VariantFormatter() - subsampled_formatted = list(formatter.format_variants(subsampled_variants, subsampled_ref_counts, subsampled_total_counts, error_rate, sex)) - nonsubsampled_formatted = list(formatter.format_variants(nonsubsampled_variants, nonsubsampled_ref_counts, nonsubsampled_total_counts, error_rate, sex)) - - return (subsampled_formatted, nonsubsampled_formatted) - - def write_variants(self, variants, outfn): - with open(outfn, 'w') as outf: - print('\t'.join(('id', 'gene', 'a', 'd', 'mu_r', 'mu_v')), file=outf) - for variant in variants: - variant['ref_reads'] = ','.join([str(v) for v in variant['ref_reads']]) - variant['total_reads'] = ','.join([str(v) for v in variant['total_reads']]) - vals = ( - 'ssm_id', - 'variant_name', - 'ref_reads', - 'total_reads', - 'expected_ref_freq', - 'expected_var_freq', - ) - vals = [variant[k] for k in vals] - print('\t'.join([str(v) for v in vals]), file=outf) - - def _estimate_read_depth(self): - read_sum = 0 - if len(self._variants) == 0: - default_read_depth = 50 - log('No variants available, so fixing read depth at %s.' 
% default_read_depth) - return default_read_depth - else: - return np.nanmedian(self._total_read_counts, axis=0) - - def write_cnvs(self, variants, outfn): - with open(outfn, 'w') as outf: - print('\t'.join(('cnv', 'a', 'd', 'ssms', 'physical_cnvs')), file=outf) - formatter = CnvFormatter(self._estimated_read_depth, self._sampidxs, self._hetsnp_rate) - for cnv in formatter.format_and_merge_cnvs(self._multisamp_cnv.load_single_abnormal_state_cnvs(), variants, self._cellularity): - overlapping = [','.join(o) for o in cnv['overlapping_variants']] - vals = ( - cnv['cnv_id'], - ','.join([str(V) for V in cnv['ref_reads']]), - ','.join([str(V) for V in cnv['total_reads']]), - ';'.join(overlapping), - cnv['physical_cnvs'] - ) - print('\t'.join(vals), file=outf) - -def log(*msgs): - if log.verbose: - print(*msgs, file=sys.stderr) -log.verbose = False - -class CnvParser(object): - def __init__(self, cn_filename): - self._cn_filename = cn_filename - - def parse(self): - cn_regions = defaultdict(list) - - with open(self._cn_filename) as cnf: - reader = csv.DictReader(cnf, delimiter='\t') - for record in reader: - chrom = record['chromosome'].upper() - del record['chromosome'] - for key in ('start', 'end', 'major_cn', 'minor_cn'): - # Some records from Battenberg have major and minor listed as, e.g., - # "1.0", so cast to float before int. - assert float(record[key]) == int(float(record[key])) - record[key] = int(float(record[key])) - record['cellular_prevalence'] = float(record['cellular_prevalence']) - cn_regions[chrom].append(record) - - # Ensure CN regions are properly sorted, which we later rely on when - # filtering out regions with multiple abnormal CN states. - for chrom, regions in cn_regions.items(): - cn_regions[chrom] = sorted(regions, key = lambda r: r['start']) - - return cn_regions - -def get_elements_at_indices(L, indices): - elem = [] - for idx in indices: - elem.append(L[idx]) - return elem - -def parse_priority_ssms(priority_ssm_filename): - if priority_ssm_filename is None: - return [] - priority_ssms = [] - already_seen = set() - - with open(priority_ssm_filename) as priof: - for line in priof: - chrom, pos = line.strip().split('_', 1) - variant = VariantId(CHROM=chrom.upper(), POS=int(pos)) - # Prevent duplicates -- otherwise, we'll add the variant to our - # subsampled list of variants twice. This manifested as a problem in the - # PCAWG 6cfce053-bfd6-4ca0-b74b-b2e4549e4f1f sample. - if variant in already_seen: - continue - priority_ssms.append(variant) - already_seen.add(variant) - - return priority_ssms - -def impute_missing_total_reads(total_reads, missing_variant_confidence): - # Change NaNs to masked values via SciPy. - masked_total_reads = ma.fix_invalid(total_reads) - - # Going forward, suppose you have v variants and s samples in a v*s matrix of - # read counts. Missing values are masked. - - # Calculate geometric mean of variant read depth in each sample. Result: s*1 - sample_means = gmean(masked_total_reads, axis=0) - assert np.sum(sample_means <= 0) == np.sum(np.isnan(sample_means)) == 0 - # Divide every variant's read count by its mean sample read depth to get read - # depth enrichment relative to other variants in sample. Result: v*s - normalized_to_sample = np.dot(masked_total_reads, np.diag(1./sample_means)) - # For each variant, calculate geometric mean of its read depth enrichment - # across samples. 
Result: v*1 - variant_mean_reads = gmean(normalized_to_sample, axis=1) - assert np.sum(variant_mean_reads <= 0) == np.sum(np.isnan(variant_mean_reads)) == 0 - - # Convert 1D arrays to vectors to permit matrix multiplication. - imputed_counts = np.dot(variant_mean_reads.reshape((-1, 1)), sample_means.reshape((1, -1))) - nan_coords = np.where(np.isnan(total_reads)) - total_reads[nan_coords] = imputed_counts[nan_coords] - assert np.sum(total_reads <= 0) == np.sum(np.isnan(total_reads)) == 0 - - total_reads[nan_coords] *= missing_variant_confidence - return np.floor(total_reads).astype(np.int) - -def impute_missing_ref_reads(ref_reads, total_reads): - ref_reads = np.copy(ref_reads) - - assert np.sum(np.isnan(total_reads)) == 0 - nan_coords = np.where(np.isnan(ref_reads)) - ref_reads[nan_coords] = total_reads[nan_coords] - assert np.sum(np.isnan(ref_reads)) == 0 - - return ref_reads.astype(np.int) - -def is_good_chrom(chrom): - # Ignore the following: - # * Variants unmapped ('chrUn') or mapped to fragmented chromosome ('_random') - # * Weird chromosomes from Mutect (e.g., "chr17_ctg5_hap1"). - # * Mitochondrial ("mt" or "m"), which are weird - # * Sex chromosomes difficult to deal with, as expected frequency depends on - # whether patient is male or female, so ignore them for now. TODO: fix this. - if chrom in [str(i) for i in range(1, 23)] + ['X', 'Y']: - return True - else: - return False - -def parse_variants(samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence): - parsed_variants = [] - all_variant_ids = [] - num_samples = len(samples) - - for sample in samples: - vcf_fn, vcf_type = vcf_files[sample], vcf_types[sample] - - if vcf_type == 'sanger': - variant_parser = SangerParser(vcf_fn, tumor_sample) - elif vcf_type == 'mutect_pcawg': - variant_parser = MutectPcawgParser(vcf_fn, tumor_sample) - elif vcf_type == 'mutect_smchet': - variant_parser = MutectSmchetParser(vcf_fn, tumor_sample) - elif vcf_type == 'mutect_tcga': - variant_parser = MutectTcgaParser(vcf_fn, tumor_sample) - elif vcf_type == 'muse': - variant_parser = MuseParser(vcf_fn, muse_tier, tumor_sample) - elif vcf_type == 'dkfz': - variant_parser = DKFZParser(vcf_fn, tumor_sample) - elif vcf_type == 'strelka': - variant_parser = StrelkaParser(vcf_fn, tumor_sample) - elif vcf_type == 'vardict': - variant_parser = VarDictParser(vcf_fn, tumor_sample) - elif vcf_type == 'pcawg_consensus': - variant_parser = PcawgConsensusParser(vcf_fn, tumor_sample) - elif vcf_type == 'somsnip': - variant_parser = SomSnipParser(vcf_fn, tumor_sample) - else: - raise Exception('Unknowon variant type: %s' % vcf_type) - - parsed_variants.append(variant_parser.list_variants()) - variant_ids = [VariantId(str(v[0].CHROM), int(v[0].POS)) for v in parsed_variants[-1]] - all_variant_ids += variant_ids - - all_variant_ids = list(set(all_variant_ids)) # Eliminate duplicates. 
- all_variant_ids.sort(key = variant_key) - num_variants = len(all_variant_ids) - variant_positions = dict(zip(all_variant_ids, range(num_variants))) - - total_read_counts = np.zeros((num_variants, num_samples)) - total_read_counts.fill(np.nan) - ref_read_counts = np.copy(total_read_counts) - - for sample_idx, parsed in enumerate(parsed_variants): - for variant, ref_reads, total_reads in parsed: - variant_id = VariantId(str(variant.CHROM), int(variant.POS)) - variant_idx = variant_positions[variant_id] - ref_read_counts[variant_idx, sample_idx] = ref_reads - total_read_counts[variant_idx, sample_idx] = total_reads - - total_read_counts = impute_missing_total_reads(total_read_counts, missing_variant_confidence) - ref_read_counts = impute_missing_ref_reads(ref_read_counts, total_read_counts) - return (all_variant_ids, ref_read_counts, total_read_counts) - -def infer_sex(variant_ids): - num_y_variants = len([V for V in variant_ids if V.CHROM == 'Y']) - if num_y_variants > 0: - return 'male' - else: - return 'female' - -def extract_sample_data(vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples): - vcf_files = {} - vcf_types = {} - cnv_files = {} - - assert len(vcf_files_and_samples) == len(vcf_types_and_samples), 'Must specify same number of VCF files and VCF types' - srcs_and_dsts = [(vcf_files_and_samples, vcf_files), (vcf_types_and_samples, vcf_types)] - - should_use_cnvs = cnv_files_and_samples is not None - if should_use_cnvs: - assert len(cnv_files_and_samples) == len(vcf_files_and_samples), 'Must specify same number of VCF and CNV files' - srcs_and_dsts.append( (cnv_files_and_samples, cnv_files) ) - - for (src, dst) in srcs_and_dsts: - for combined in src: - assert '=' in combined, ('%s should be in format =' % combined) - sample, val = combined.split('=', 1) - dst[sample] = val - - # Sample order will dictate eventual output order. - common_samps = reduce(lambda s1, s2: s1 & s2, [set(D[1].keys()) for D in srcs_and_dsts]) - ordered_samps = [S.split('=', 1)[0] for S in vcf_files_and_samples] - assert len(ordered_samps) == len(common_samps) # Ensure no duplicates. - - assert set(vcf_files.keys()) == common_samps, \ - ('VCF file samples (%s) differ from common samples (%s)' % (vcf_files.keys(), common_samps)) - assert set(vcf_types.keys()) == common_samps, \ - ('VCF type samples (%s) differ from common samples (%s)' % (vcf_types.keys(), common_samps)) - if should_use_cnvs: - assert set(cnv_files.keys()) == common_samps, \ - ('CNV file samples (%s) differ from CNV file samples (%s)' % (cnv_files.keys(), common_samps)) - - return (ordered_samps, vcf_files, vcf_types, cnv_files) - -def main(): - all_vcf_types = set(('sanger', 'mutect_pcawg', 'mutect_smchet', 'mutect_tcga', 'muse','dkfz', 'strelka', 'vardict', 'pcawg_consensus')) - - parser = argparse.ArgumentParser( - description='Create ssm_data.txt and cnv_data.txt input files for PhyloWGS from VCF and CNV data.', - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument('--vcf-type', dest='vcf_types', action='append', required=True, - help='Type of VCF file for each sample, specified as =. Valid VCF types are %s.' 
% ','.join(all_vcf_types)) - parser.add_argument('-e', '--error-rate', dest='error_rate', type=restricted_float, default=0.001, - help='Expected error rate of sequencing platform') - parser.add_argument('--missing-variant-confidence', dest='missing_variant_confidence', type=restricted_float, default=1., - help='Confidence in range [0, 1] that SSMs missing from a sample are indeed not present in that sample') - parser.add_argument('-s', '--sample-size', dest='sample_size', type=int, - help='Subsample SSMs to reduce PhyloWGS runtime') - parser.add_argument('-P', '--priority-ssms', dest='priority_ssm_filename', - help='File containing newline-separated list of SSMs in "_" format to prioritize for inclusion') - parser.add_argument('--only-priority', dest='only_priority', action='store_true', - help='Only sample variants provided on priority list') - parser.add_argument('--cnvs', dest='cnv_files', action='append', - help='Path to CNV file created with parse_cnvs.py for each sample. Specified as =.') - parser.add_argument('--regions', dest='regions', choices=('normal_cn', 'normal_and_abnormal_cn', 'all'), default='normal_and_abnormal_cn', - help='Which regions to use variants from. Refer to the parser README for more details.') - parser.add_argument('--output-cnvs', dest='output_cnvs', default='cnv_data.txt', - help='Output destination for CNVs') - parser.add_argument('--output-variants', dest='output_variants', default='ssm_data.txt', - help='Output destination for variants') - parser.add_argument('--output-params', dest='output_params', default='params.json', - help='Output destination for run parameters') - parser.add_argument('--tumor-sample', dest='tumor_sample', - help='Name of the tumor sample in the input VCF file. Defaults to last sample if not specified.') - parser.add_argument('--muse-tier', dest='muse_tier', type=int, default=0, - help='Maximum MuSE tier to include') - parser.add_argument('--nonsubsampled-variants', dest='output_nonsubsampled_variants', - help='If subsampling, write nonsubsampled variants to separate file, in addition to subsampled variants') - parser.add_argument('--nonsubsampled-variants-cnvs', dest='output_nonsubsampled_variants_cnvs', - help='If subsampling, write CNVs for nonsubsampled variants to separate file') - parser.add_argument('--sex', dest='sex', default='auto', choices=('auto', 'male', 'female'), - help='Sex of patient. Used to adjust expected variant frequencies on sex chromosomes. ' + - 'If auto, patient is set to male if any variants are provided on the Y chromosome, and female otherwise.') - parser.add_argument('--het-snp-rate', dest='hetsnp_rate', default=7e-4, type=float, - help='Average number of heterozygous SNPs per base used to call copy ' + - 'number. This determines how heavily we weight somatic CNAs relative to ' + - 'SNVs. Defaults to 7 SNPs per 10 kb, as per Battenberg.') - parser.add_argument('--verbose', dest='verbose', action='store_true') - parser.add_argument('vcf_files', nargs='+', help='Path to VCF file for each sample. 
Specified as =.')
-    args = parser.parse_args()
-
-    log.verbose = args.verbose
-    params = {}
-
-    samples, vcf_files, vcf_types, cnv_files = extract_sample_data(args.vcf_files, args.vcf_types, args.cnv_files)
-    params['samples'], params['vcf_files'], params['vcf_types'], params['cnv_files'] = samples, vcf_files, vcf_types, cnv_files
-    num_samples = len(samples)
-    variant_ids, ref_read_counts, total_read_counts = parse_variants(samples, vcf_files, vcf_types, args.tumor_sample, args.missing_variant_confidence)
-
-    # Fix random seed to ensure same set of SSMs chosen when subsampling on each
-    # invocation.
-    random.seed(1)
-
-    if args.sex == 'auto':
-        sex = infer_sex(variant_ids)
-    else:
-        sex = args.sex
-
-    grouper = VariantAndCnvGroup(args.hetsnp_rate)
-    grouper.add_variants(variant_ids, ref_read_counts, total_read_counts)
-
-    if len(cnv_files) > 0:
-        # Load CNV files in same order as sample order given for VCFs.
-        cn_regions = [CnvParser(cnv_files[S]).parse() for S in samples]
-        grouper.add_cnvs(cn_regions, sex)
-
-    if not grouper.has_cnvs():
-        assert args.regions == 'all', 'If you do not provide CNA data, you must specify --regions=all'
-
-    if args.regions == 'normal_cn':
-        grouper.retain_only_variants_in_normal_cn_regions()
-    elif args.regions == 'normal_and_abnormal_cn':
-        grouper.exclude_variants_in_multiple_abnormal_or_unlisted_regions()
-    elif args.regions == 'all':
-        pass
-    else:
-        raise Exception('Unknown --regions value: %s' % args.regions)
-
-    priority_ssms = parse_priority_ssms(args.priority_ssm_filename)
-
-    subsampled_vars, nonsubsampled_vars = grouper.format_variants(args.sample_size, args.error_rate, priority_ssms, args.only_priority, sex)
-    if len(subsampled_vars) == 0:
-        print('No variants to write', file=sys.stderr)
-        sys.exit(0)
-    grouper.write_variants(subsampled_vars, args.output_variants)
-    if args.output_nonsubsampled_variants:
-        grouper.write_variants(nonsubsampled_vars, args.output_nonsubsampled_variants)
-
-    if grouper.has_cnvs() and args.regions != 'normal_cn':
-        # Write CNVs.
-        grouper.write_cnvs(subsampled_vars, args.output_cnvs)
-        if args.output_nonsubsampled_variants and args.output_nonsubsampled_variants_cnvs:
-            grouper.write_cnvs(nonsubsampled_vars, args.output_nonsubsampled_variants_cnvs)
-    else:
-        # Write empty CNV file.
-        with open(args.output_cnvs, 'w'):
-            pass
-
-    with open(args.output_params, 'w') as F:
-        json.dump(params, F)
-
-if __name__ == '__main__':
-    main()
diff --git a/modules/phylowgs/1.0/src/fill_battenberg.py b/modules/phylowgs/1.0/src/fill_battenberg.py
new file mode 100644
index 00000000..ec6d3cd5
--- /dev/null
+++ b/modules/phylowgs/1.0/src/fill_battenberg.py
@@ -0,0 +1,379 @@
+#!/usr/bin/python3
+
+"""
+This script fills empty segments in Battenberg subclones.txt files. It is adapted from Kostiantyn Dreval's fill_segments.py script.
+It requires a seg file and a chromosome arms file as mandatory inputs; the path to the output file must also be specified.
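+
+It assumes the standard Battenberg subclones.txt column layout, where the columns
+after chr, startpos and endpos are BAF, pval, LogR, ntot, nMaj1_A, nMin1_A, frac1_A,
+nMaj2_A, nMin2_A and frac2_A; gaps are filled with a clonal normal state (BAF 0.5,
+LogR 0, copy number 1+1) built from exactly these ten values.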
+
+Example:
+
+python3 fill_battenberg.py --input .subclones.txt --chromArm .tsv --output .subclones.txt
+"""
+
+# import required modules
+import pandas as pd
+import argparse
+
+
+def main():
+    # initiate the parser and handle arguments from the command line
+    args = parse_args()
+    input_file = args.input
+    output_file = args.output
+    chrom_file = args.chromArm
+
+    # determine the format of the input file
+    input_format = input_file[-3:]
+
+    # check arguments given in command line
+    # check_arguments(args, input_format)
+
+    # create a dictionary containing coordinates of chromosome arms
+    arm_chrom = load_chrom_arm(chrom_file)
+    # get the order of chromosomes
+    chrom_order = list(arm_chrom.keys()) + ["buffer"]
+
+    # initialize empty variables for the new segments
+    columns_new = []
+    columns_edges = []
+
+    # initialize a list to store all segments, since it is faster than concatenating a pd df with a large number of segments
+    seg_filled = []
+
+    # assign values to be used to fill normal CN segments
+    empty_baf = float(0.5)
+    empty_pval = int(1)
+    empty_logr = int(0)
+    empty_ntot = float(2.0)
+    empty_nMaj1_A = int(1)
+    empty_nMin1_A = int(1)
+    empty_frac1_A = int(1)
+    empty_nMaj2_A = int(1)
+    empty_nMin2_A = int(1)
+    empty_frac2_A = int(1)
+
+    columns_fill = [empty_baf, empty_pval, empty_logr, empty_ntot, empty_nMaj1_A, empty_nMin1_A, empty_frac1_A, empty_nMaj2_A, empty_nMin2_A, empty_frac2_A]
+
+    # fill segments
+    seg = open(input_file, 'r+')
+    lines=seg.readlines()
+
+    # remove segments with NA values for nMaj1_A or nMin1_A
+    print("Removing segments with NA values for nMaj1_A or nMin1_A...")
+    # nMaj1_A and nMin1_A are the 8th and 9th columns (0-based indices 7 and 8)
+    to_remove = [i for i, line in enumerate(lines) if "NA" in line.split("\t")[7:9]]
+    if len(to_remove) > 0:
+        for index in reversed(to_remove): # start at the end to avoid recomputing offsets
+            del lines[index]
+
+
+    # first, get the header of the file
+    header=lines[0].rstrip("\n").rstrip("\r").split("\t")
+
+    print("Filling missing segments and smoothing centromeres...")
+    # next, go through each segment, skipping the header
+    for i in range(1,len(lines)-1):
+
+        # read 2 segments at a time to compare the coordinates of the end of the previous segment and the start of the next segment
+        columns_first = (lines[i].rstrip("\n").rstrip("\r")).split("\t")
+        columns_second = (lines[i+1].rstrip("\n").rstrip("\r")).split("\t")
+
+        # insert an empty segment from the beginning of the chromosome of the first segment in the file to complete the telomeric region of the first chromosome
+        if i==1:
+            columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_first[1])-1)] + columns_fill
+            seg_filled.append(columns_new)
+            # seg_filled.append(columns_first) I think this is a duplicate of the one below after fixing the possible centromeric end of the segment
+            # deal with fencepost problem
+            if (int(columns_first[2]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[2]) < arm_chrom[columns_first[0]]['q']['start']):
+                columns_first[2] = str(arm_chrom[columns_first[0]]['p']['end'])
+            seg_filled.append(columns_first)
+
+            if (chrom_order[chrom_order.index(columns_second[0])] == chrom_order[chrom_order.index(columns_first[0])+1]):
+                missing_arm = chrom_order[chrom_order.index(columns_first[0])]
+                columns_edges = [columns_first[0], str(arm_chrom[missing_arm]['q']['start']), str(arm_chrom[missing_arm]['q']['end'])] + columns_fill
+                seg_filled.append(columns_edges)
+                seg_filled.append(columns_second)
+                continue
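+
+        # Illustrative sketch of the fill logic (hypothetical coordinates): if one
+        # call ends at 61,000,000 and the next starts at 61,500,000 on the same
+        # arm, the gap 61,000,001-61,499,999 is emitted as a new clonal normal
+        # segment built from columns_fill (BAF 0.5, LogR 0, copy number 1+1).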
+
+        # scenario 1: segments on the same chromosome
+        if (columns_first[0]==columns_second[0]):
+
+            # handle very rare overlapping segments (occurs in ~ 0.008% of segments)
+            if (int(columns_first[2]) > int(columns_second[1])):
+                columns_first[2] = int(columns_second[1])-1
+                seg_filled.append(columns_first)
+                pass
+
+            # for segments in the p arm
+            if (int(columns_second[1]) < arm_chrom[columns_second[0]]['p']['end']):
+                # create an empty segment to fill in
+                columns_new = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill
+                seg_filled.append(columns_new)
+                next_segment = (lines[i+1].rstrip("\n").rstrip("\r")).split("\t")
+                if (int(columns_second[2]) < arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) < int(next_segment[1])):
+                    seg_filled.append(columns_second)
+                seg_filled.append(columns_second)
+
+            # deal with centromeres
+            # I already know that this is the same sample, and the same chromosome
+            elif (int(columns_first[1]) < arm_chrom[columns_first[0]]['p']['end'] and int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end']):
+
+                # first, let's deal with the end of the p arm: segment 1 might end before the centromere, or within the centromere
+                if int(columns_first[2]) < arm_chrom[columns_first[0]]['p']['end']:
+                    columns_new = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill
+                    seg_filled.append(columns_new)
+                # if it extends into the centromere, cut segment 1 at the end of the p arm
+                else:
+                    columns_first[2] = str(arm_chrom[columns_first[0]]['p']['end'])
+                    seg_filled.append(columns_first)
+
+                # now, let's deal with the start of the q arm: it might start within or after the centromere
+                if int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']:
+                    columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start'])
+                    seg_filled.append(columns_second)
+                    next_segment = (lines[i+1].rstrip("\n").rstrip("\r")).split("\t")
+                    if (int(next_segment[1]) < arm_chrom[next_segment[0]]['q']['start'] and int(next_segment[2]) > arm_chrom[next_segment[0]]['q']['start']):
+                        next_segment[1] = str(arm_chrom[next_segment[0]]['q']['start'])
+                        seg_filled.append(next_segment)
+
+                # possible edge cases around the centromere
+                else:
+                    columns_new = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[2])-1)] + columns_fill
+                    previous_segment = (lines[i].rstrip("\n").rstrip("\r")).split("\t")
+                    if (int(previous_segment[2])>arm_chrom[columns_second[0]]['q']['start']):
+                        columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(previous_segment[2]))] + columns_first[3:13]
+                        seg_filled.append(columns_edges)
+                        columns_new = [columns_edges[0], str(int(columns_edges[2])+1), str(int(columns_second[1])-1)] + columns_fill
+                    seg_filled.append(columns_new)
+                    seg_filled.append(columns_second)
+
+            # for segments in the q arm
+            elif (int(columns_first[1]) > arm_chrom[columns_second[0]]['q']['start']):
+                # create an empty segment to fill in
+                columns_new = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill
+                seg_filled.append(columns_new)
+                seg_filled.append(columns_second)
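+
+            # Example of the centromere smoothing above (hypothetical coordinates):
+            # with the p arm ending at 48.1 Mb and the q arm starting at 52.0 Mb,
+            # a call ending at 50.2 Mb is trimmed back to 48.1 Mb and a call
+            # starting at 50.5 Mb is pushed forward to 52.0 Mb, so no output
+            # segment overlaps the centromere.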
+
+            # some segments fall completely within the centromere; drop them
+            elif (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']):
+                if (int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']):
+                    columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), str(int(columns_first[2]))] + columns_first[3:13]
+                    columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill
+                    seg_filled.append(columns_new)
+                else:
+                    columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[1])-1)] + columns_fill
+                    if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']):
+                        columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start'])
+                        seg_filled.append(columns_second)
+                seg_filled.append(columns_edges)
+                if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) < arm_chrom[columns_second[0]]['q']['start']):
+                    pass # this just drops the segment from the output if it is within the centromere
+                else:
+                    if (int(columns_second[1])>arm_chrom[columns_second[0]]['q']['start']):
+                        seg_filled.append(columns_second)
+                    elif (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']):
+                        pass # this is handled later
+
+            # did I miss anything? it is possible some edge cases were not considered at the time of script development
+            else:
+                print(columns_first[0], columns_second[0], columns_first[1], columns_second[1], columns_first[2], columns_second[2])
+                raise ValueError ("Unhandled segment arrangement. This is an edge case that needs debugging!")
+
+        # scenario 2: same sample, but going over to the next chromosome
+        elif (columns_first[0]!=columns_second[0]):
+            # very rare cases when a whole chromosome is missing; identify them here
+            if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]):
+                missing_chrom = chrom_order[chrom_order.index(columns_first[0])+1]
+                missing_p = [missing_chrom, str(arm_chrom[missing_chrom]['p']['start']), str(arm_chrom[missing_chrom]['p']['end'])] + columns_fill
+                missing_q = [missing_chrom, str(arm_chrom[missing_chrom]['q']['start']), str(arm_chrom[missing_chrom]['q']['end'])] + columns_fill
+                seg_filled.append(missing_p)
+                seg_filled.append(missing_q)
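+
+            # For instance (hypothetical): if the calls jump from chr7 straight to
+            # chr9, chr8 is reconstructed as two clonal normal segments, one
+            # spanning its whole p arm and one spanning its whole q arm.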
+
+            # first, are there any segments in the p arm? that means the second segment starts all the way in the centromere or the q arm
+            if (int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']): #TRUE
+                if (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']): #FALSE
+                    previous_segment = (lines[i-1].rstrip("\n").rstrip("\r")).split("\t")
+                    if (chrom_order[chrom_order.index(previous_segment[0])] != chrom_order[chrom_order.index(columns_first[0])]):
+                        columns_edges = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill
+                        seg_filled.append(columns_edges)
+                    # pass
+                elif (int(columns_first[1]) < arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']): #TRUE
+                    previous_segment = (lines[i-1].rstrip("\n").rstrip("\r")).split("\t")
+                    if (chrom_order[chrom_order.index(previous_segment[0])] != chrom_order[chrom_order.index(columns_first[0])-1]):
+                        columns_edges = [columns_first[0], columns_first[1], str(arm_chrom[columns_first[0]]['p']['end'])] + columns_first[3:13]
+                        columns_first[1] = arm_chrom[columns_first[0]]['q']['start']
+                        seg_filled.append(columns_edges)
+                        seg_filled.append(columns_first)
+                    # Case when a chromosome has a single event that spans the centromere: split it into four parts (p-start to seg-start, seg-start to p-end, q-start to seg-end, seg-end to q-end)
+                    else:
+                        columns_pedge = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), int(columns_first[1])-1] + columns_fill
+                        columns_segp = [columns_first[0], columns_first[1], str(arm_chrom[columns_first[0]]['p']['end'])] + columns_first[3:13]
+                        columns_segq = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), columns_first[2]] + columns_first[3:13]
+                        columns_qedge = [columns_first[0], int(columns_first[2]) + 1, str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill
+                        seg_filled.append(columns_pedge)
+                        seg_filled.append(columns_segp)
+                        seg_filled.append(columns_segq)
+                        seg_filled.append(columns_qedge)
+
+                else:
+                    columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill
+                    seg_filled.append(columns_edges)
+                if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end']):
+                    if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]):
+                        seg_filled.append(missing_p)
+                        seg_filled.append(missing_q)
+                    if (int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start'] and int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end']):
+                        columns_first[1] = arm_chrom[columns_first[0]]['q']['start']
+                        seg_filled.append(columns_first)
+                    columns_new = [columns_second[0], str(arm_chrom[columns_second[0]]['p']['start']), str(arm_chrom[columns_second[0]]['p']['end'])] + columns_fill
+                    seg_filled.append(columns_new)
+                    if (int(columns_second[1]) > arm_chrom[columns_second[0]]['q']['start']):
+                        columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[1])-1)] + columns_fill
+                        seg_filled.append(columns_edges)
+                    else:
+                        columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start'])
+                        seg_filled.append(columns_second)
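+                    # i.e. the first call on the new chromosome is anchored to its q
+                    # arm once its p arm has been back-filled with a normal CN segment.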
+
+            # are there any segments in the q arm? that means the first segment ends before the start of the q arm
+            elif (int(columns_first[2]) < arm_chrom[columns_first[0]]['q']['start']):
+                columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill
+                seg_filled.append(columns_first)
+                if (int(columns_first[2]) < arm_chrom[columns_first[0]]['p']['end']):
+                    columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill
+                    seg_filled.append(columns_edges)
+                seg_filled.append(columns_new)
+                seg_filled.append(columns_second)
+
+            # are there any segments that start in the p arm and span the centromere? if so, maintain the LOH flag and LogR, but cut out the centromere
+            elif (int(columns_second[1]) < arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) > arm_chrom[columns_second[0]]['q']['start']):
+                previous_segment = (lines[i].rstrip("\n").rstrip("\r")).split("\t")
+                if "X" not in str(columns_second[0]):
+                    next_segment = (lines[i+2].rstrip("\n").rstrip("\r")).split("\t")
+                columns_new = [columns_second[0], str(int(columns_second[1])+1), str(arm_chrom[columns_second[0]]['p']['end'])] + columns_first[3:13]
+                columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[2]))] + columns_first[3:13]
+                if (columns_new[0]!=previous_segment[0]):
+                    columns_new[1]=str(arm_chrom[columns_new[0]]['p']['start'])
+                if (columns_second[0]==next_segment[0]):
+                    seg_filled.append(columns_new)
+                    seg_filled.append(columns_edges)
+
+            # in other cases, there are segments in both the p and q arms
+            else:
+                columns_edges = [columns_second[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_second[1])-1)] + columns_fill
+                if (int(columns_first[1]) > arm_chrom[columns_second[0]]['p']['end']):
+                    if (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']):
+                        columns_first[1] = arm_chrom[columns_first[0]]['q']['start']
+                        seg_filled.append(columns_first)
+                    columns_new = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill
+                    seg_filled.append(columns_new)
+                    if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]):
+                        seg_filled.append(missing_p)
+                        seg_filled.append(missing_q)
+                seg_filled.append(columns_edges)
+                if (int(columns_second[2]) < arm_chrom[columns_second[0]]['p']['end']):
+                    seg_filled.append(columns_second)
+
+
+        # scenario 3: new sample, obviously a new chromosome
+        else:
+            previous_segment = (lines[i].rstrip("\n").rstrip("\r")).split("\t")
+            columns_edges = [columns_first[0], str(int(previous_segment[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill
+            columns_new = [columns_second[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_second[1])-1)] + columns_fill
+            seg_filled.append(columns_edges)
+            seg_filled.append(columns_new)
+            seg_filled.append(columns_second)
+
+
+    seg.close()
+
+    # make a df from the list of lists and convert chromosome coordinates to integers
+    seg_filled_df = pd.DataFrame(seg_filled, columns = header)
+    seg_filled_df["startpos"] = seg_filled_df["startpos"].astype(int)
+    seg_filled_df["endpos"] = seg_filled_df["endpos"].astype(int)
+
+    # remove any inverted segments, if there are any
+    print("Checking and removing inverted segments...")
+    seg_filled_df = seg_filled_df[(seg_filled_df["endpos"]>seg_filled_df["startpos"])]
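+    # Sanity-check sketch: a row whose endpos is not strictly greater than its
+    # startpos (which can arise when a filled gap collapses to zero width
+    # between two contiguous calls) is dropped here rather than written out.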
+
+    # remove any duplicated segments, if any remain
+    print("Checking and removing duplicated segments...")
+    seg_filled_df = seg_filled_df.drop_duplicates()
+    # seg_filled_df = seg_filled_df.groupby((seg_filled_df["endpos"] != seg_filled_df["endpos"].shift(-2)).cumsum().values).first()
+    # seg_filled_df = seg_filled_df.groupby((seg_filled_df["endpos"] != seg_filled_df["endpos"].shift(-1)).cumsum().values).first()
+    # seg_filled_df = seg_filled_df.groupby((seg_filled_df["startpos"] != seg_filled_df["startpos"].shift(-1)).cumsum().values).first()
+
+    # save to the output file specified by the user
+    print("Saving to file...")
+    seg_filled_df.to_csv(output_file, header=True, index=False, sep="\t")
+    print("Done!")
+
+
+# Create a nested dictionary to store chromosome arm coordinates. It is adapted from Chris's implementation in another script that summarizes CNVs
+def load_chrom_arm(chrom_file):
+    arm_chrom = {}
+    required_cols = ["chromosome", "start", "end", "arm"]
+    header_cols = {}
+
+    i = 0
+    with open(chrom_file) as f:
+        for line in f:
+            i += 1
+            line = line.rstrip("\n").rstrip("\r")  # Remove line endings
+            cols = line.split("\t")
+
+            # Skip empty lines
+            if not line:
+                continue
+
+            # If we haven't parsed the header yet, assume this is the first line of the file (aka the header)
+            if not header_cols:
+                j = 0
+                for col in cols:
+                    if col in required_cols:
+                        header_cols[col] = j
+                    j += 1
+
+                # Check to make sure all required columns are found
+                for col in required_cols:
+                    if col not in header_cols:
+                        raise AttributeError("Unable to locate column %s in the chromosome arm positions file \'%s\'" % (col, chrom_file))
+                # If we get this far, the header is valid
+                continue
+
+            if cols[0] not in arm_chrom:
+                arm_chrom[cols[0]] = {}
+            if cols[3]:
+                if cols[3] not in arm_chrom[cols[0]]:
+                    arm_chrom[cols[0]][cols[3]]={}
+                arm_chrom[cols[0]][cols[3]]['start'] = int(cols[1])
+                arm_chrom[cols[0]][cols[3]]['end'] = int(cols[2])
+    return arm_chrom
+
+
+# Check that required arguments are provided, and that the input is in .seg format
+def check_arguments(args, input_format):
+    if input_format == 'seg' and not all([args.input, args.output, args.chromArm]):
+        raise ValueError ('Must specify input .seg file, output file, and file listing coordinates of chromosome arms.')
+    elif input_format != 'seg':
+        raise ValueError ('Input file must be in .seg format')
+    else:
+        pass
+
+
+# Parse arguments from the command line
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--input",
+                        help="Input file in .seg format to fill segments", required=True)
+    parser.add_argument("--output",
+                        help="Resulting file after filling missing segments", required=True)
+    parser.add_argument("--chromArm",
+                        help="File with coordinates of chromosome arms for a given genome build", required=True)
+
+    # ignore everything else that is not required by this script
+    args, unknown = parser.parse_known_args()
+    # return arguments provided by user
+    return args
+
+
+if __name__ == '__main__':
+    main()
diff --git a/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R
new file mode 100644
index 00000000..587cdb05
--- /dev/null
+++ b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R
@@ -0,0 +1,536 @@
+
+
+#'
+#' Processing pipeline for phyloWGS outputs that takes the output json files and the preprocessing output files.
+#' The SAMPLE_ID.mutass.zip file must be unzipped before running the script.
+
+# Example: how to run
+#mkdir -p output
+#Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unzipped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out
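+# (If needed, something like `unzip SAMPLE_ID.mutass.zip -d unzipped.mutass/`
+# would produce the unzipped tree directory assumed above; the paths in this
+# example are illustrative.)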
+
+##################################################
+# load required libraries
+##################################################
+
+# library("optparse")
+library("rjson")
+library("tidyverse")
+library("ggrepel")
+library("data.table")
+
+
+##########################
+#### Snakemake Input #####
+##########################
+
+samplename = snakemake@wildcards[["patient_id"]]
+json_file = snakemake@input[["summ"]]
+trees_out= snakemake@input[["mutass"]]
+ssm_file = snakemake@input[["ssms"]]
+cnv_file = snakemake@input[["cnvs"]]
+mafs = unlist(strsplit(snakemake@params[["maf_list"]], ","))
+mut_file = snakemake@input[["muts"]]
+driver_genes = snakemake@params[["drivers"]]
+sample_order = unlist(strsplit(snakemake@params[["sample_order"]], ","))
+genome_build = snakemake@wildcards[["genome_build"]]
+
+# Define the chr_prefix parameter based on the genome_build
+chr_prefixed = str_detect(genome_build, "hg")
+
+
+# option_list = list(
+#   make_option(c("-n", "--samplename"), type="character", default=NULL, help="Samplename of the sample to run", metavar="character"),
+#   make_option(c("-j", "--json_summ"), type="character", default=NULL, help="SAMPLE_ID.summ.json file generated by phyloWGS", metavar="character"),
+#   make_option(c("-t", "--trees_out"), type="character", default=NULL, help="Directory containing unzipped XX.mutass.zip trees", metavar="character"),
+#   make_option(c("-s", "--ssm"), type="character", default=NULL, help="Preprocessing ssm_data.txt output file", metavar="character"),
+#   make_option(c("-c", "--copynumber"), type="character", default=NULL, help="Preprocessing cnv_data.txt output file", metavar="character"),
+#   make_option(c("-a", "--tumourA_maf"), type="character", default=NULL, help="Augmented maf file of tumour A", metavar="character"),
+#   make_option(c("-b", "--tumourB_maf"), type="character", default=NULL, help="Augmented maf file of tumour B", metavar="character"),
+#   make_option(c("-m", "--json_muts"), type="character", default=NULL, help="SAMPLE_ID.muts.json file", metavar="character"),
+#   make_option(c("-o", "--output"), type="character", default=NULL, help="Output directory", metavar="character")
+# )
+#
+# opt_parser = OptionParser(option_list=option_list)
+# opt = parse_args(opt_parser)
+#
+# samplename = opt$samplename
+# json_file = opt$json_summ
+# trees_out= opt$trees_out ### directory where the unzipped SAMPLE_ID.mutass.zip trees are
+# ssm_file = opt$ssm
+# cnv_file = opt$copynumber
+# mafA = opt$tumourA_maf
+# mafB = opt$tumourB_maf
+# mut_file = opt$json_muts
+# output_dir = opt$output
+#
+#
+#
+# .checkfile = function(infile) {
+#
+#   if (!file.exists(infile)) {
+#
+#     stop(paste("File", infile, "does not exist", sep=""))
+#
+#   }
+#
+# }
+#
+#
+# .checkfile(json_file)
+# .checkfile(ssm_file)
+# .checkfile(cnv_file)
+# .checkfile(mafA)
+# .checkfile(mafB)
+# .checkfile(mut_file)
+
+##################################################
+# Process input files
+##################################################
+# Parse the input files and obtain the required data for this run
+result1 <- fromJSON(file = json_file)
+result_mut<-fromJSON(file = mut_file)
+ssm_pre<-read.table(file = ssm_file, header = TRUE)
+cnv_pre<-read.delim(file = cnv_file, header = TRUE)[,c("cnv","a","d")]
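+
+# For reference: ssm_data.txt holds one row per SSM (id, gene = chrom_pos,
+# a = ref read counts, d = total read counts, mu_r, mu_v) and cnv_data.txt one
+# row per CNA pseudo-SSM (cnv, a, d, ssms, physical_cnvs), as produced by the
+# PhyloWGS preprocessing step; only cnv, a and d are used here.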
+##################################################
+out_json_to_Rtable= snakemake@output[["tree_summary"]]
+ssm_to_trees= snakemake@output[["maf"]]
+cnv_to_trees= snakemake@output[["cnvs"]]
+cellular_prevalence_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cellular_prevalence.pdf"))
+CCF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cancer_cell_fraction_.pdf"))
+VAF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_.pdf"))
+VAF_coding_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_coding.pdf"))
+tree_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_tree.pdf"))
+CCF_table = snakemake@output[["CCF"]]
+
+if(!dir.exists(snakemake@output[["plots"]])){dir.create(snakemake@output[["plots"]])}
+
+###################################################
+# open summ.json file and convert it into human-readable format
+###################################################
+
+# this function opens SAMPLE_ID.summ.json and converts it into an R table
+open_tree = function(json_summ_file,out_json_to_Rtable){
+
+  out_res<-NULL
+  for (j in 1:length(json_summ_file[["trees"]])){
+
+    tree_focal<-json_summ_file[["trees"]][j]
+    tree_focal_statA<-as.data.frame(t(unlist(sapply(tree_focal,function(x)x[c("clustering_index","branching_index","llh","linearity_index")]))))
+    colnames(tree_focal_statA)<-c("clustering_index","branching_index","llh","linearity_index")
+    tree_focal_statA$tree_id<-j-1
+    rownames(tree_focal_statA)<-NULL
+
+    # drop the cellular_prevalence columns (every third column); see the grepl() alternative below
+    tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,-c(3,6,9,12,15,18,21,24,27,30)]
+    #tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))]
+    colnames(tree_focal_statB)<-sub("^[^.]*.", "", colnames(tree_focal_statB))
+    stat_both<-cbind(tree_focal_statA,tree_focal_statB)
+    out_res<-bind_rows(stat_both,out_res)
+    out_res_ordered<-out_res[order(out_res$tree_id),]
+  } # for j loop
+
+density<-json_summ_file["tree_densities"]
+density_unlist<-data.frame("density"=unlist(density))
+row.names(density_unlist)<-sub("^[^.]*.", "", row.names(density_unlist))
+
+density_unlist$tree_id<-row.names(density_unlist)
+row.names(density_unlist)<-NULL
+
+final_table=merge(out_res_ordered,density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to the all-trees table
+write.table(final_table, file =out_json_to_Rtable ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE)
+return(final_table)
+}
+
+
+result_tree<-open_tree(result1,out_json_to_Rtable)
+
+
+###################################################
+# extracts the best tree
+###################################################
+# the best tree is the tree with the highest density
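+# Note: which.max() returns the first tree when several trees share the maximum density.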
+
+best_tree_id = function(R_table) {
+  best=R_table[which.max(R_table$density),]
+  best_tree_id<-paste(best$tree_id,"json",sep = ".")
+  return(best_tree_id)
+}
+best_tree_fileID<-best_tree_id(result_tree)
+
+
+#######################################################################
+# extract the stats (SNVs and CNVs assigned to each population) from the best tree
+#######################################################################
+
+open_best_tree = function(trees_out,best_tree_id){
+  unzip(trees_out, files = best_tree_id, exdir = dirname(trees_out), overwrite = TRUE)
+  best_tree_path = paste0(dirname(trees_out), "/", best_tree_id)
+  rr <- fromJSON(file = best_tree_path)
+  return(rr)
+}
+rr= open_best_tree(trees_out,best_tree_fileID)
+
+
+#######################################################################
+# annotate point mutations and CNVs in the best tree
+#######################################################################
+best_focal<-result1[["trees"]][as.numeric(gsub(".json","",best_tree_fileID))+1]
+tree_structure<-as.data.frame(sapply(best_focal,function(x)x["structure"])) ##[6]
+# Populations that are direct children of root node 0; their cellular prevalences
+# are summed to estimate tumour purity in merge_both() below.
+tree_roots <- best_focal[[1]]$structure$`0`
+
+
+merge_both<-function(result1,best_tree_fileID,tree_structure){
+  best_tree<-as.numeric(gsub(".json","",best_tree_fileID))
+  best_focal<-result1[["trees"]][best_tree+1]
+  tree_focal_statB<-as.data.frame(sapply(best_focal,function(x)x["populations"])) ##[3]
+  qq<-tree_focal_statB[,grep("cellular_prevalence",colnames(tree_focal_statB))] %>%
+    rownames_to_column("sample") %>%
+    pivot_longer(-sample,
+                 names_to = "population",
+                 values_to = "cellular_prevalence") %>%
+    mutate(population = str_remove(str_remove(population, ".*populations[.]"), "[.]cellular_prevalence")) %>%
+    mutate(is_root = ifelse(population %in% tree_roots, TRUE, FALSE)) %>%
+    group_by(sample) %>%
+    mutate(purity = sum(cellular_prevalence[is_root]),
+           CCF = cellular_prevalence / purity)
+
+  return(qq)
+
+}
+
+both_samples<-merge_both(result1,best_tree_fileID,tree_structure)
+
+
+write_tsv(both_samples, CCF_table)
+
+
+ssm = function(stat_best_tree, ssm_pre,ssm_to_trees,tree_structure, maf_list){
+
+  out_res_ssm<-NULL
+  for ( i in 1:length(stat_best_tree$mut_assignments)){
+
+    focal<-(stat_best_tree$mut_assignments)[i]
+
+    focal_ssms<-data.frame(sapply(focal, function(x) x[1]))
+
+    colnames(focal_ssms)<-sub("^[^.]*.", "", colnames(focal_ssms))
+    focal_ssms$phyloWGS_population<-i
+    ssm_assign<-merge(ssm_pre,focal_ssms, by.x = "id", by.y = "ssms")[,c("id", "gene","phyloWGS_population")]
+    ssm_assign_spi<-separate(ssm_assign, col = gene, into = c("Chromosome","Start_Position"), sep = "_", convert = FALSE) %>%
+      mutate(Start_Position = as.numeric(Start_Position))
+    if(chr_prefixed) {
+      ssm_assign_spi$Chromosome = str_c("chr", as.character(ssm_assign_spi$Chromosome))
+    }
+
+    out_res_ssm<-rbind(ssm_assign_spi,out_res_ssm)
+
+  } ## i loop
+
+  ssm_assign_with_maf <- lapply(maf_list, function(x){
+    maf <- read_tsv(x,
+                    col_types = cols(Chromosome = col_character())) %>%
+      # PhyloWGS changes the start position of deletions. This makes the maf start position match that in the PhyloWGS SSM table.
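+      # For example (hypothetical coordinates): a deletion at MAF Start_Position 1000
+      # is stored at position 999 in the PhyloWGS SSM table, so the MAF position is
+      # shifted down by 1 for the join and restored afterwards.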
+      mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position - 1, Start_Position))
+    maf <- out_res_ssm %>%
+      left_join(maf, by = c("Chromosome", "Start_Position")) %>%
+      # Restore the true MAF start position after the hack above
+      mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position + 1, Start_Position)) %>%
+      select(colnames(maf), everything())
+  })
+  out_res_ssm <- rbindlist(ssm_assign_with_maf) %>%
+    mutate(clonal_status = case_when(
+      phyloWGS_population %in% tree_roots & length(tree_roots) > 1 ~ "polyclonal",
+      phyloWGS_population %in% tree_roots & length(tree_roots) == 1 ~ "clonal",
+      TRUE ~ "subclonal"
+    ))
+
+  return(out_res_ssm)
+
+}
+
+ss<-ssm(rr,ssm_pre,ssm_to_trees,tree_structure, mafs)
+
+write_tsv(ss, ssm_to_trees, na = "")
+
+
+###########################################################
+## load mut file to extract CNV start and end positions
+###########################################################
+
+cnv = function(stat_best_tree, cnv_pre,mutation_file,cnv_to_trees){
+  out_res_cnv <-
+    bind_rows(lapply(1:length(stat_best_tree$mut_assignments), function(x)
+      data.frame(cnvs = stat_best_tree$mut_assignments[[x]]$cnvs) %>% mutate(phyloWGS_population = x)))
+
+  out_res_mut<-NULL
+  for (cn in 1:length(result_mut$cnvs)){
+    focal_mut_cnv<-(result_mut$cnvs)[cn]
+
+    focal_mut<-data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1,]
+    colnames(focal_mut)<-sub("^[^.]*.", "", colnames(focal_mut))
+    focal_mut$cnv_id<-names(focal_mut_cnv)
+    out_res_mut<-bind_rows(focal_mut,out_res_mut)
+  } ## cn loop
+
+  both_cnvs<-merge(out_res_cnv, out_res_mut, by.x = "cnvs",by.y = "cnv_id") %>%
+    select(cnvs, phyloWGS_population, physical_cnvs.chrom,
+           physical_cnvs.start, physical_cnvs.end,
+           physical_cnvs.major_cn, physical_cnvs.minor_cn, physical_cnvs.cell_prev)
+
+  return(both_cnvs)
+}
+
+
+cnv<-cnv(rr, cnv_pre,result_mut,cnv_to_trees)
+write.table(cnv, file =cnv_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE)
+
+##################### plot the results ####################
+###########################################################
+#### Slope chart the best tree, cellular prevalence #######
+
+plot_cp<-function(both_samples,cellular_prevalence_plot){
+pdf(cellular_prevalence_plot, width = 8, height =8 )
+plotA<-ggplot(data = both_samples, aes(x = sample, y = cellular_prevalence, group = population)) +
+  geom_line(aes(color = population), size = 2) +
+  labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+
+  geom_point(aes(color = population), size = 4) +
+  # Labelling as desired
+  xlab("Sample") + ylab("Cellular prevalence")+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+        panel.background = element_blank(), axis.line = element_line(colour = "black"),
+        legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18),
+        plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)))
+print(plotA)
+dev.off()
+}
+
+plot_cp(both_samples,cellular_prevalence_plot)
+
+###########################################################
+#### Slope chart the best tree, CCF #######
+
+plot_ccf<-function(both_samples,CCF_plot){
+pdf(CCF_plot, width = 8, height =8 )
+plotB<-ggplot(data = both_samples[both_samples$population != 0, ], aes(x = sample, y = CCF, group = population)) +
+  geom_line(aes(color = population), size = 2) +
+  labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+
+  geom_point(aes(color = population), size = 4) +
+  # Labelling as desired
+  xlab("Sample") + ylab("CCF")+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+        panel.background = element_blank(), axis.line = element_line(colour = "black"),
+        legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18),
+        plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)))
+print(plotB)
+dev.off()
+}
+
+plot_ccf(both_samples,CCF_plot)
+
+
+#############################################
+##### Slope chart the best tree (VAF) #######
+
+
+plot_vaf<-function(ss,VAF_plot){
+  pdf(VAF_plot, width = 8, height =8 )
+  plotC <- ss %>%
+    select(Hugo_Symbol, Chromosome, Tumor_Sample_Barcode, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>%
+    mutate(VAF = t_alt_count/t_depth,
+           populations = as.factor(populations)) %>%
+    filter(!is.na(Tumor_Sample_Barcode)) %>%
+    ggplot(aes(x = Tumor_Sample_Barcode,
+               y = VAF,
+               group = interaction(populations, Start_Position),
+               color = populations)) +
+    geom_line(aes(color = populations), size=0.2, alpha=0.4)+
+    labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+
+    xlab("Sample") + ylab("VAF") +
+    guides(colour = guide_legend(override.aes = list(alpha = 1)))+
+    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+          panel.background = element_blank(), axis.line = element_line(colour = "black"),
+          legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18),
+          plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)))
+print(plotC)
+dev.off()
+}
+
+
+plot_vaf(ss,VAF_plot)
+
+#############################################################
+##### Slope chart the best tree (VAF, coding regions: nonsense, missense and splice sites) #######
+
+drivers <- read_tsv(driver_genes, col_names = "gene") %>% pull(gene)
+
+plot_vaf_coding<-function(ss,VAF_coding_plot){
+
+  coding <- ss %>%
+    select(Hugo_Symbol, HGVSp_Short, Chromosome, Tumor_Sample_Barcode, Variant_Classification, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>%
+    mutate(VAF = t_alt_count/t_depth,
+           populations = as.factor(populations),
+           Tumor_Sample_Barcode = factor(Tumor_Sample_Barcode, levels = sample_order)) %>%
+    filter(!is.na(Tumor_Sample_Barcode),
+           !Variant_Classification %in% c("Silent", "RNA", "IGR", "Intron", "5'Flank", "3'Flank", "5'UTR")) %>%
+    mutate(label = ifelse(!is.na(HGVSp_Short), str_c(Hugo_Symbol, "_", HGVSp_Short), str_c(Hugo_Symbol, "_", Variant_Classification)))
+pdf(VAF_coding_plot, width = 8, height =8 )
+plotD<-coding %>%
+  ggplot(aes(x = Tumor_Sample_Barcode,
+             y = VAF,
+             group = interaction(populations, Start_Position),
+             color = populations)) +
+  geom_line(aes(color = populations), size=0.5, alpha=0.4)+
+  geom_text_repel(
+    data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[1], Hugo_Symbol %in% drivers),
+    aes(label = label,
+        x = Tumor_Sample_Barcode,
+        y = VAF),
+    nudge_x = -0.2,
+    size = 4
+  ) +
+  geom_text_repel(
+    data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[length(sample_order)], Hugo_Symbol %in% drivers),
+    aes(label = label,
+        x = Tumor_Sample_Barcode,
+        y = VAF),
+    nudge_x = 0.2,
+    size = 4
+  ) +
+  labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+
+  xlab("Sample") + ylab("VAF") +
+  guides(colour = guide_legend(override.aes = list(alpha = 1)))+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+        panel.background = element_blank(), axis.line = element_line(colour = "black"),
+        legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18),
+        plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)))
+
+print(plotD)
+dev.off()
+}
+
+
+plot_vaf_coding(ss,VAF_coding_plot)
+
+#############################################
+##### Draw the best tree #######
+#############################################
+
+
+tree_structure_long <- tree_structure %>%
+  pivot_longer(everything(),
+               names_to = "parent",
+               values_to = "node") %>%
+  mutate(parent = str_remove_all(parent, ".*[.]")) %>%
+  distinct()
+
+
+positions_x <- function(parents){
+  x <- 1:length(unique(parents))
+  names(x) <- unique(parents)
+  col_vals <- unname(x[parents])
+  return(col_vals)
+}
+
+tree_structure_long$x <- positions_x(tree_structure_long$parent)
+
+positions_y <- function(tree_df){
+  y = c("0" = 0.5)
+  for(parent in unique(tree_df$parent)){
+    # parent = "1"
+    child_index = 1
+    num_children <- nrow(tree_df[tree_df$parent == parent,])
+    if(num_children == 1){
+      child <- tree_df[tree_df$parent == parent,]$node
+      child_y <- unname(y[parent])
+      names(child_y) <- child
+      y = c(y, child_y)
+
+    } else {
+      children <- tree_df[tree_df$parent == parent,]$node
+      y_max <- unname(y[parent]) + (0.25 / child_index)
+      y_min <- unname(y[parent]) - (0.25 / child_index)
+      y_range <- seq(y_min, y_max, length.out = length(children))
+      names(y_range) <- children
+      y = c(y, y_range)
+    }
+    child_index = child_index + 1
+  }
+  return(y)
+}
+
+tree_structure_long$y <- unname(positions_y(tree_structure_long)[as.character(tree_structure_long$node)])
+
+tree_structure_long <- add_row(tree_structure_long, parent = "0", node = 0, x = 0, y = 0.5)
+
+get_ssms <- function(tree_df, best_focal, best_tree_fileID){
+  data <- best_focal[[str_remove_all(best_tree_fileID, "[.].*")]]$populations
+  ssm_vec <- c()
+  for(node in tree_df$node){
+    # node = "1"
+    num_ssms <- data[[as.character(node)]]$num_ssms
+    names(num_ssms) <- as.character(node)
+    ssm_vec <- c(ssm_vec, num_ssms)
+  }
+  return(ssm_vec)
+}
+
+tree_structure_long$num_ssms <- get_ssms(tree_structure_long, best_focal, best_tree_fileID)[as.character(tree_structure_long$node)]
+
+tree_structure_long <- tree_structure_long %>%
+  mutate(parent = as.numeric(parent)) %>%
+  left_join(select(tree_structure_long, node, xstart = x, ystart = y),
+            by = c("parent" = "node"))
+
+
+ggplot(tree_structure_long,
+       aes(x = x,
+           y = y,
+           label = node)) +
+  geom_segment(inherit.aes = FALSE,
+               aes(x = xstart,
+                   xend = x,
+                   y = ystart,
+                   yend = y)) +
+  geom_point(aes(size = num_ssms),
+             fill = "white",
+             colour = "black",
+             pch = 21) +
+  geom_text() +
+  scale_size(range = c(5,20)) +
+  ylim(0,1) +
+  theme_void() +
+  ggtitle(samplename) +
+  theme(legend.position = "none")
+
+ggsave(tree_plot, height = 6, width = 6)
+
+############
+##### END ##
+############
+
+
+
+
diff --git a/modules/phylowgs/1.0/src/process_phyloWGS_outputs_updated.R b/modules/phylowgs/1.0/src/process_phyloWGS_outputs_updated.R
deleted file mode 100644
index 27a79dcc..00000000
---
a/modules/phylowgs/1.0/src/process_phyloWGS_outputs_updated.R +++ /dev/null @@ -1,417 +0,0 @@ - - -#' -#' processing phyloWGS outputs pipeline that takes output json files and preprocessing output files -#' SAMPLE_ID.mutass.zip file must be unzipped before runing the script - -#E example: how to run -#mkdir -p output -#Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unziped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out - -################################################## -# load required libraries -################################################## - -library("optparse") -library("rjson") -library("plyr") -require("dplyr") -require("ggplot2") -library("tidyr") - - -################################################## -# Command line options -################################################## - -option_list = list( - make_option(c("-n", "--samplename"), type="character", default=NULL, help="Samplename of the sample to run", metavar="character"), - make_option(c("-j", "--json_summ"), type="character", default=NULL, help="SAMPLE_ID.summ.json file generated by phyloWGS", metavar="character"), - make_option(c("-t", "--trees_out"), type="character", default=NULL, help="Directory containing unzipped XX.mutass.zip trees", metavar="character"), - make_option(c("-s", "--ssm"), type="character", default=NULL, help="Preprocessing ssm_data.txt output file", metavar="character"), - make_option(c("-c", "--copynumber"), type="character", default=NULL, help="Preprocessing cnv_data.txt output file", metavar="character"), - make_option(c("-a", "--tumourA_maf"), type="character", default=NULL, help="Agument maf file of tumour A", metavar="character"), - make_option(c("-b", "--tumourB_maf"), type="character", default=NULL, help="Agument maf file of tumour B", metavar="character"), - make_option(c("-m", "--json_muts"), type="character", default=NULL, help="SAMPLE_ID.muts.json file", metavar="character"), - make_option(c("-o", "--output"), type="character", default=NULL, help="Output directory", metavar="character") -) - -opt_parser = OptionParser(option_list=option_list) -opt = parse_args(opt_parser) - -samplename = opt$samplename -json_file = opt$json_summ -trees_out= opt$trees_out ### directory where unziped SAMPLE_ID.mutass.zip trees are -ssm_file = opt$ssm -cnv_file = opt$copynumber -mafA = opt$tumourA_maf -mafB = opt$tumourB_maf -mut_file = opt$json_muts -output_dir = opt$output - - - -.checkfile = function(infile) { - - if (!file.exists(infile)) { - - stop(paste("File", infile, "does not exist", sep="")) - - } - -} - - -.checkfile(json_file) -.checkfile(ssm_file) -.checkfile(cnv_file) -.checkfile(mafA) -.checkfile(mafB) -.checkfile(mut_file) - -################################################## -# Process input files -################################################### -# Parse the input file and obtain the required data for this run -result1 <- fromJSON(file = json_file) -result_mut<-fromJSON(file = mut_file) -ssm_pre<-read.table(file = ssm_file, header = TRUE) -cnv_pre<-read.delim(file = cnv_file, header = TRUE)[,c("cnv","a","d")] -maf_TA<-read.delim(file = mafA, header = TRUE) -maf_TB<-read.delim(file = mafB, header = TRUE) - - -################################################## -# define output files -################################################## -out_json_to_Rtable= file.path(output_dir, 
paste("out_res_",samplename,"_json_converted_toR.table", sep = "")) -ssm_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_ssms_to_best_tree_maf_format.table", sep = "")) -cnv_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_cnvs_to_best_tree_maf_format.table", sep = "")) -cellular_prevalence_plot= file.path(output_dir, paste("cellular_prevalence_",samplename,".pdf", sep = "")) -CCF_plot= file.path(output_dir, paste("cancer_cell_fraction_",samplename,".pdf", sep = "")) -VAF_plot= file.path(output_dir, paste("vaf_",samplename,".pdf", sep = "")) -VAF_coding_plot= file.path(output_dir, paste("vaf_ccoding",samplename,".pdf", sep = "")) - - -################################################### -# open summ.json file and convert it into humam readable format -################################################### - -#this function opens SAMPLE_ID_summ.jason and converts it into R table -open_tree = function(json_summ_file,out_json_to_Rtable){ - - out_res<-NULL - for (j in 1:length(json_summ_file[["trees"]])){ - - tree_focal<-json_summ_file[["trees"]][j] - tree_focal_statA<-as.data.frame(t(unlist(sapply(tree_focal,function(x)x[c("clustering_index","branching_index","llh","linearity_index")])))) - colnames(tree_focal_statA)<-c("clustering_index","branching_index","llh","linearity_index") - tree_focal_statA$tree_id<-j-1 - rownames(tree_focal_statA)<-NULL - - - tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,-c(3,6,9,12,15,18,21,24,27,30)] - #tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))] - colnames(tree_focal_statB)<-sub("^[^.]*.", "", colnames(tree_focal_statB)) - stat_both<-cbind(tree_focal_statA,tree_focal_statB) - out_res<-rbind.fill(stat_both,out_res) - out_res_ordered<-out_res[order(out_res$tree_id),] - } # for j loop - -density<-json_summ_file["tree_densities"] -density_unlist<-data.frame("density"=unlist(density)) -row.names(density_unlist)<-sub("^[^.]*.", "", row.names(density_unlist)) - -density_unlist$tree_id<-row.names(density_unlist) -row.names(density_unlist)<-NULL - -final_table=merge(out_res_ordered,density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to all tress table -write.table(final_table, file =out_json_to_Rtable ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) -return(final_table) -} - - -result_tree<-open_tree(result1,out_json_to_Rtable) - - -################################################### -# extrcats the best tree -################################################### -#the best tree is the tree with the highest density - -best_tree_id = function(R_table, density) { - best=R_table[which.max(R_table$density),] - best_tree_focal_name<-best$tree_id - best_tree_id<-paste(best$tree_id,"json",sep = ".") - return(best_tree_id) - return(best_tree_focal_name) -} -best_tree_fileID<-best_tree_id(result_tree, density) - - -####################################################################### -# extract the stats (SNvs and CNVs assigned to each population) from the best tree -####################################################################### - -open_best_tree = function(trees_out,best_tree_id){ - rr <- fromJSON(file = paste(trees_out,best_tree_id, sep = "/")) - return(rr) -} -rr= open_best_tree(trees_out,best_tree_fileID) - - -####################################################################### -# annotate point mutations and CNVs in the best tree 
-####################################################################### -best_focal<-result1[["trees"]][as.numeric(gsub(".json","",best_tree_fileID))+1] -tree_structure<-as.data.frame(sapply(best_focal,function(x)x["structure"])) ##[6] - - -merge_both<-function(result1,best_tree_fileID,tree_structure){ -best_tree<-as.numeric(gsub(".json","",best_tree_fileID)) -best_focal<-result1[["trees"]][best_tree+1] -tree_focal_statB<-as.data.frame(sapply(best_focal,function(x)x["populations"])) ##[3] -qq<-tree_focal_statB[,grep("cellular_prevalence",colnames(tree_focal_statB))] - -sample1<-data.frame(t(qq[1,])) -sample1$sample_id<-rep("TumourA") -sample1$population<-row.names(sample1) -row.names(sample1)<-NULL -colnames(sample1)<-c("cellular_prevalence","sample_id","population") -sample1$population<-sub(".*?\\.(.*?\\..*?)\\..*", "\\1", sample1$population) -sample1<-sample1[!(sample1$population=="populations.0"),] - - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==1){ - sample1$CCF<-(sample1$cellular_prevalence/(max(sample1$cellular_prevalence))) - } - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==2){ - sample1$CCF<-(sample1$cellular_prevalence/((max(sample1$cellular_prevalence))+(sort(sample1$cellular_prevalence)[length(sample1$cellular_prevalence)-1]))) - } - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==3){ - sample1$CCF<-(sample1$cellular_prevalence/((max(sample1$cellular_prevalence))+(sort(sample1$cellular_prevalence)[length(sample1$cellular_prevalence)-1])+(sort(sample1$cellular_prevalence) [length(sample1$cellular_prevalence)-2]))) - } - - -sample2<-data.frame(t(qq[2,])) -sample2$sample_id<-rep("TumourB") -sample2$population<-row.names(sample2) -row.names(sample2)<-NULL -colnames(sample2)<-c("cellular_prevalence","sample_id","population") -sample2$population<-sub(".*?\\.(.*?\\..*?)\\..*", "\\1", sample2$population) -sample2<-sample2[!(sample2$population=="populations.0"),] - - - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==1){ - sample2$CCF<-(sample2$cellular_prevalence/(max(sample2$cellular_prevalence))) - } - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==2){ - sample2$CCF<-(sample2$cellular_prevalence/((max(sample2$cellular_prevalence))+(sort(sample2$cellular_prevalence)[length(sample2$cellular_prevalence)-1]))) - } - if (length(unique(tree_structure[,grep("structure.0",names(tree_structure))]))==3){ - sample2$CCF<-(sample2$cellular_prevalence/((max(sample2$cellular_prevalence))+(sort(sample2$cellular_prevalence)[length(sample2$cellular_prevalence)-1])+(sort(sample2$cellular_prevalence) [length(sample2$cellular_prevalence)-2]))) - } - - -both_samples<-rbind(sample1,sample2) -both_samples$best_tree_ID<-rep(best_tree) -both_samples<-both_samples[!(both_samples$population=="populations.0"),] - -} - -both_samples<-merge_both(result1,best_tree_fileID,tree_structure) - - - -ssm = function(stat_best_tree, ssm_pre,ssm_to_trees,tree_structure){ - - out_res_ssm<-NULL - for ( i in 1:length(stat_best_tree$mut_assignments)){ - focal<-(stat_best_tree$mut_assignments)[i] - - focal_ssms<-data.frame(sapply(focal, function(x) x[1])) - - colnames(focal_ssms)<-sub("^[^.]*.", "", colnames(focal_ssms)) - focal_ssms$populations_ssms<-paste("population",i, sep = "_") - ssm_assign<-merge(ssm_pre,focal_ssms, by.x = "id", by.y = "ssms")[,c("id", "gene","populations_ssms")] - ssm_assign_spi<-separate(ssm_assign, col = gene, into = 
c("Chromosome","Start_Position"), sep = "_") - ssm_assign_with_maf<- merge(ssm_assign_spi, maf_TA, by.x=c("Chromosome","Start_Position"), by.y=c("Chromosome","Start_Position")) - out_res_ssm<-rbind(ssm_assign_with_maf,out_res_ssm) - - } ## i loop - - - #return(out_res_ssm) - - out_res_clonality<-NULL - for (pp in 1:nrow(out_res_ssm)){ - focal_structure<-data.frame("yy"=unique(tree_structure[,grep("structure.0",names(tree_structure))])) - focal_row<-out_res_ssm[pp,] - if (gsub("population_","",focal_row$populations)%in%(focal_structure$yy)){ - focal_row$clonal_status<-"clonal" - - } else { - focal_row$clonal_status<-"subclonal" - } - - out_res_clonality<-rbind(out_res_clonality,focal_row) - - } ## pp loop - write.table(out_res_clonality, file =ssm_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) - - return(out_res_clonality) -} - -ss<-ssm(rr,ssm_pre,ssm_to_trees,tree_structure) - - -########################################################### -## load mut file to extrcat CNVs start and end positions -########################################################### - -cnv = function(stat_best_tree, cnv_pre,mutation_file,cnv_to_trees){ - out_res_cnv<-NULL - for ( ii in 1:length(stat_best_tree$mut_assignments)){ ## ii loop assigns populations to CNVs - focal_cnvs_l<-(stat_best_tree$mut_assignments)[ii] - - focal_cnvs<-data.frame(sapply(focal_cnvs_l, function(x) x[2])) - colnames(focal_cnvs)<-sub("^[^.]*.", "", colnames(focal_cnvs)) - focal_cnvs$populations_cnvs<-paste("population",ii, sep = "_") - out_res_cnv<-rbind.fill(focal_cnvs,out_res_cnv) - - } ## ii loop - - #return(out_res_cnv) - out_res_mut<-NULL - for (cn in 1:length(result_mut$cnvs)){ - focal_mut_cnv<-(result_mut$cnvs)[cn] - - focal_mut<-data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1,] - colnames(focal_mut)<-sub("^[^.]*.", "", colnames(focal_mut)) - focal_mut$cnv_id<-names(focal_mut_cnv) - out_res_mut<-rbind.fill(focal_mut,out_res_mut) - } ## cn loop - - both_cnvs<-merge(out_res_cnv, out_res_mut, by.x = "cnvs",by.y = "cnv_id") - write.table(both_cnvs, file =cnv_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) - -} - - -cnv<-cnv(rr, cnv_pre,result_mut,cnv_to_trees) - - -##################### plot the results #################### -########################################################### -#### Slope chart the best tree, cellular prevalence ####### - -plot_cp<-function(both_samples,cellular_prevalence_plot){ -pdf(cellular_prevalence_plot, width = 8, height =8 ) -plotA<-ggplot(data = both_samples, aes(x = sample_id, y = cellular_prevalence, group = population)) + - geom_line(aes(color = population), size = 2) + - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - geom_point(aes(color = population), size = 4) + - # Labelling as desired - xlab("Sample") + ylab("Cellular prevalence")+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotA) -dev.off() -} - -plot_cp(both_samples,cellular_prevalence_plot) - -########################################################### -#### Slope chart the best tree, CCF ####### - -plot_cp<-function(both_samples,CCF_plot){ -pdf(CCF_plot, width = 8, height =8 ) 
-plotB<-ggplot(data = both_samples, aes(x = sample_id, y = CCF, group = population)) + - geom_line(aes(color = population), size = 2) + - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - geom_point(aes(color = population), size = 4) + - # Labelling as desired - xlab("Sample") + ylab("CCF")+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotB) -dev.off() -} - -plot_cp(both_samples,CCF_plot) - - -############################################# -##### Slope chart the best tree (VAF) ####### - -maf_TA$VAF<-(maf_TA$t_alt_count/maf_TA$t_depth) -maf_TA$tumour_type<-rep("TumourA") -TA = merge(ss[,c("Chromosome","Start_Position","id", "populations_ssms")], maf_TA, by.x = c("Chromosome","Start_Position"),by.y=c("Chromosome","Start_Position")) - - -maf_TB$VAF<-(maf_TB$t_alt_count/maf_TB$t_depth) -maf_TB$tumour_type<-rep("TumourB") -TB = merge(ss[,c("Chromosome","Start_Position","id", "populations_ssms")], maf_TB, by.x = c("Chromosome","Start_Position"),by.y=c("Chromosome","Start_Position")) - -both_Ts<-rbind(TA,TB) -both_Ts$populations_ssms<-gsub("_","",both_Ts$populations_ssms) -colnames(both_Ts)[4]<-"populations" - - -plot_vaf<-function(both_Ts,VAF_plot){ -pdf(VAF_plot, width = 8, height =8 ) -plotC<-ggplot (data =both_Ts , aes(x = tumour_type, y = VAF, group = interaction(populations, Start_Position) ,color = populations)) + - geom_line(aes(color = populations), size=0.2, alpha=0.4)+ - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - xlab("Sample") + ylab("VAF") + - guides(colour = guide_legend(override.aes = list(alpha = 3)))+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotC) -dev.off() -} - -plot_vaf(both_Ts,VAF_plot) - -############################################################# -##### Slope chart the best tree (VAF, coding regions, nonsense, missense and splicing sites) ####### - -both_Ts_coding<-both_Ts[(both_Ts$Variant_Classification == "Missense_Mutation"| both_Ts$Variant_Classification =="Nonsense_Mutation"|both_Ts$Variant_Classification =="Splice_Site"),] - -plot_vaf_coding<-function(vaf_coding,VAF_coding_plot){ - -pdf(VAF_coding_plot, width = 8, height =8 ) -plotD<-ggplot (data =both_Ts_coding , aes(x = tumour_type, y = VAF, group = interaction(populations, Start_Position) ,color = populations)) + - geom_line(aes(color = populations), size=0.7, alpha=0.8)+ - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - geom_text(data = both_Ts_coding %>% filter(tumour_type == "TumourB"), - aes(label = Hugo_Symbol) , - hjust = -0.3, - size = 2) + - xlab("Sample") + ylab("VAF") + - guides(colour = guide_legend(override.aes = list(alpha = 3)))+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), 
axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) - -print(plotD) -dev.off() -} - - -plot_vaf_coding(both_Ts_coding,VAF_coding_plot) - - -############ -##### END ## -############ - - - - From c5d47948ce3f086c8246e9ba26bb8f3ccb0f0416 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 10:31:05 -0700 Subject: [PATCH 03/14] Remove unneeded fill_battenberg files --- .../phylowgs/1.0/envs/fill_battenberg.yaml | 1 - .../1.0/etc/chromArmFiles/chromArm.grch37.tsv | 49 --- .../1.0/etc/chromArmFiles/chromArm.grch38.tsv | 49 --- .../1.0/etc/chromArmFiles/chromArm.hg19.tsv | 49 --- .../1.0/etc/chromArmFiles/chromArm.hg38.tsv | 49 --- .../1.0/etc/chromArmFiles/chromArm.hs37d5.tsv | 1 - modules/phylowgs/1.0/src/fill_battenberg.py | 379 ------------------ 7 files changed, 577 deletions(-) delete mode 120000 modules/phylowgs/1.0/envs/fill_battenberg.yaml delete mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv delete mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv delete mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv delete mode 100644 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv delete mode 120000 modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv delete mode 100644 modules/phylowgs/1.0/src/fill_battenberg.py diff --git a/modules/phylowgs/1.0/envs/fill_battenberg.yaml b/modules/phylowgs/1.0/envs/fill_battenberg.yaml deleted file mode 120000 index e667a8b1..00000000 --- a/modules/phylowgs/1.0/envs/fill_battenberg.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../envs/phylowgs/fill_battenberg.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv deleted file mode 100644 index 91da51a7..00000000 --- a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch37.tsv +++ /dev/null @@ -1,49 +0,0 @@ -chromosome start end arm -1 10000 121500000 p -1 142600000 249250621 q -2 10000 90500000 p -2 96800000 243199373 q -3 10000 87900000 p -3 98300000 198022430 q -4 10000 48200000 p -4 52700000 191154276 q -5 10000 46100000 p -5 50700000 180915260 q -6 10000 58700000 p -6 63300000 171115067 q -7 10000 58000000 p -7 61700000 159138663 q -8 10000 43100000 p -8 48100000 146364022 q -9 10000 47300000 p -9 65900000 141213431 q -10 10000 38000000 p -10 42300000 135534747 q -11 10000 51600000 p -11 55700000 135006516 q -12 10000 33300000 p -12 38200000 133851895 q -13 10000 16000000 p -13 19500000 115169878 q -14 10000 14000000 p -14 19100000 107349540 q -15 10000 14000000 p -15 20700000 102531392 q -16 10000 34600000 p -16 47000000 90354753 q -17 10000 22200000 p -17 25800000 81195210 q -18 10000 15400000 p -18 19000000 78077248 q -19 10000 20000000 p -19 32400000 59128983 q -20 10000 25600000 p -20 29400000 63025520 q -21 10000 10000000 p -21 14300000 48129895 q -22 10000 11900000 p -22 17900000 51304566 q -X 10000 58100000 p -X 63000000 155270560 q -Y 10000 11600000 p -Y 13400000 28800000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv deleted file mode 100644 index 58b866e2..00000000 --- a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.grch38.tsv +++ /dev/null @@ -1,49 +0,0 @@ -chromosome start end 
arm -1 10000 121700000 p -1 143200000 248956422 q -2 0 91800000 p -2 96000000 242193529 q -3 0 87800000 p -3 98600000 198295559 q -4 0 48200000 p -4 51800000 190214555 q -5 0 46100000 p -5 51400000 181538259 q -6 0 58500000 p -6 62600000 170805979 q -7 0 58100000 p -7 62100000 159345973 q -8 0 43200000 p -8 47200000 145138636 q -9 0 42200000 p -9 61500000 138394717 q -10 10000 38000000 p -10 41600000 133797422 q -11 10000 51000000 p -11 55800000 135086622 q -12 10000 33200000 p -12 37800000 133275309 q -13 10000 16000000 p -13 18900000 114364328 q -14 10000 16000000 p -14 18200000 107043718 q -15 10000 16000000 p -15 20500000 101991189 q -16 0 35300000 p -16 47000000 90338345 q -17 0 22700000 p -17 27400000 83257441 q -18 0 15400000 p -18 21500000 80373285 q -19 0 19900000 p -19 31900000 58617616 q -20 0 25700000 p -20 30400000 64444167 q -21 0 10500000 p -21 13000000 46709983 q -22 10000 14000000 p -22 17400000 50818468 q -X 0 58100000 p -X 63800000 156040895 q -Y 0 10300000 p -Y 10600000 26600000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv deleted file mode 100644 index a3c8be28..00000000 --- a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg19.tsv +++ /dev/null @@ -1,49 +0,0 @@ -chromosome start end arm -chr1 10000 121500000 p -chr1 142600000 249250621 q -chr2 10000 90500000 p -chr2 96800000 243199373 q -chr3 10000 87900000 p -chr3 98300000 198022430 q -chr4 10000 48200000 p -chr4 52700000 191154276 q -chr5 10000 46100000 p -chr5 50700000 180915260 q -chr6 10000 58700000 p -chr6 63300000 171115067 q -chr7 10000 58000000 p -chr7 61700000 159138663 q -chr8 10000 43100000 p -chr8 48100000 146364022 q -chr9 10000 47300000 p -chr9 65900000 141213431 q -chr10 10000 38000000 p -chr10 42300000 135534747 q -chr11 10000 51600000 p -chr11 55700000 135006516 q -chr12 10000 33300000 p -chr12 38200000 133851895 q -chr13 10000 16000000 p -chr13 19500000 115169878 q -chr14 10000 14000000 p -chr14 19100000 107349540 q -chr15 10000 14000000 p -chr15 20700000 102531392 q -chr16 10000 34600000 p -chr16 47000000 90354753 q -chr17 10000 22200000 p -chr17 25800000 81195210 q -chr18 10000 15400000 p -chr18 19000000 78077248 q -chr19 10000 20000000 p -chr19 32400000 59128983 q -chr20 10000 25600000 p -chr20 29400000 63025520 q -chr21 10000 10000000 p -chr21 14300000 48129895 q -chr22 10000 11900000 p -chr22 17900000 51304566 q -chrX 10000 58100000 p -chrX 63000000 155270560 q -chrY 10000 11600000 p -chrY 13400000 28800000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv deleted file mode 100644 index 4b5d7b6a..00000000 --- a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hg38.tsv +++ /dev/null @@ -1,49 +0,0 @@ -chromosome start end arm -chr1 10000 121700000 p -chr1 143200000 248956422 q -chr2 0 91800000 p -chr2 96000000 242193529 q -chr3 0 87800000 p -chr3 98600000 198295559 q -chr4 0 48200000 p -chr4 51800000 190214555 q -chr5 0 46100000 p -chr5 51400000 181538259 q -chr6 0 58500000 p -chr6 62600000 170805979 q -chr7 0 58100000 p -chr7 62100000 159345973 q -chr8 0 43200000 p -chr8 47200000 145138636 q -chr9 0 42200000 p -chr9 61500000 138394717 q -chr10 10000 38000000 p -chr10 41600000 133797422 q -chr11 10000 51000000 p -chr11 55800000 135086622 q -chr12 10000 33200000 p -chr12 37800000 133275309 q -chr13 10000 16000000 p -chr13 18900000 114364328 q -chr14 10000 16000000 p -chr14 18200000 107043718 q -chr15 10000 16000000 p -chr15 20500000 
101991189 q -chr16 0 35300000 p -chr16 47000000 90338345 q -chr17 0 22700000 p -chr17 27400000 83257441 q -chr18 0 15400000 p -chr18 21500000 80373285 q -chr19 0 19900000 p -chr19 31900000 58617616 q -chr20 0 25700000 p -chr20 30400000 64444167 q -chr21 0 10500000 p -chr21 13000000 46709983 q -chr22 10000 14000000 p -chr22 17400000 50818468 q -chrX 0 58100000 p -chrX 63800000 156040895 q -chrY 0 10300000 p -chrY 10600000 26600000 q diff --git a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv b/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv deleted file mode 120000 index c8477855..00000000 --- a/modules/phylowgs/1.0/etc/chromArmFiles/chromArm.hs37d5.tsv +++ /dev/null @@ -1 +0,0 @@ -chromArm.grch37.tsv \ No newline at end of file diff --git a/modules/phylowgs/1.0/src/fill_battenberg.py b/modules/phylowgs/1.0/src/fill_battenberg.py deleted file mode 100644 index ec6d3cd5..00000000 --- a/modules/phylowgs/1.0/src/fill_battenberg.py +++ /dev/null @@ -1,379 +0,0 @@ -#!/usr/bin/python3 - -""" -This script will fill empty segments in Battenberg subclones.txt files. It is adapted from Kostiantyn Dreval's fill_segments.py script. -It requires seg file and chromosome arms file as mandatory inputs. The path to output file also must be specified. - -Example: - -python3 fill_battenberg.py --input .subclones.txt --chromArm .tsv --output .subclones.txt -""" - -# import required modules -import pandas as pd -import argparse - - -def main(): - # initiate the parser and handle arguments from command line - args = parse_args() - input_file = args.input - output_file = args.output - chrom_file = args.chromArm - - # determine the format of input file - input_format = input_file[-3:] - - # check arguments given in command line - # check_arguments(args, input_format) - - # create a dictionary containing coordinates of chromosome arms - arm_chrom = load_chrom_arm(chrom_file) - # get the order of chromosomes - chrom_order = list(arm_chrom.keys()) + ["buffer"] - - # initialize empty variable for the new segments - columns_new = [] - columns_edges = [] - - # initialize list to store all segments, since it is faster than concatenating pd df with large number of segments - seg_filled = [] - - # assign values to be used to fill normal CN segments - empty_baf = float(0.5) - empty_pval = int(1) - empty_logr = int(0) - empty_ntot = float(2.0) - empty_nMaj1_A = int(1) - empty_nMin1_A = int(1) - empty_frac1_A = int(1) - empty_nMaj2_A = int(1) - empty_nMin2_A = int(1) - empty_frac2_A = int(1) - - columns_fill = [empty_baf, empty_pval, empty_logr, empty_ntot, empty_nMaj1_A, empty_nMin1_A, empty_frac1_A, empty_nMaj2_A, empty_nMin2_A, empty_frac2_A] - - # fill segments - seg = open(input_file, 'r+') - lines=seg.readlines() - - # remove segments with NA values for nMaj1_A or nMin1_B - print("Removing segments with NA values for nMaj1_A or nMin1_A...") - to_remove = [i for i, line in enumerate(lines) if line[7]=="NA" or line[8] == "NA"] - if len(to_remove) > 0: - for index in reversed(to_remove): # start at the end to avoid recomputing offsets - del lines[index] - - - # first, get header of the file - header=lines[0].rstrip("\n").rstrip("\r").split("\t") - - print("Filling missing segments and smoothing centromeres...") - # next, go through each segment, skipping the header - for i in range(1,len(lines)-1): - - # read 2 segments at a time to compare coordinates of end of previous sefment, and start of the next segments - columns_first = (lines[i].rstrip("\n").rstrip("\r")).split("\t") - columns_second = 
(lines[i+1].rstrip("\n").rstrip("\r")).split("\t") - - # insert empty segment from the beginning of chromosome of the first segment in file to complete the telomeric region of first chromosome - if i==1: - columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_first[1])-1)] + columns_fill - seg_filled.append(columns_new) - # seg_filled.append(columns_first) I think this is a duplicate of the one below after fixing the possible centromeric end of the segment - # deal with fencepost problem - if (int(columns_first[2]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[2]) < arm_chrom[columns_first[0]]['q']['start']): - columns_first[2] = str(arm_chrom[columns_first[0]]['p']['end']) - seg_filled.append(columns_first) - - if (chrom_order[chrom_order.index(columns_second[0])] == chrom_order[chrom_order.index(columns_first[0])+1]): - missing_arm = chrom_order[chrom_order.index(columns_first[0])] - columns_edges = [columns_first[0], str(arm_chrom[missing_arm]['q']['start']), str(arm_chrom[missing_arm]['q']['end'])] + columns_fill - seg_filled.append(columns_edges) - seg_filled.append(columns_second) - continue - - # scenario 1: segments on the same chromosome - if (columns_first[0]==columns_second[0]): - - # handle very rare overlapping segments (occurs ~ 0.008%) - if (int(columns_first[2]) > int(columns_second[1])): - columns_first[2] = int(columns_second[1])-1 - seg_filled.append(columns_first) - pass - - # for segments in p arm - if (int(columns_second[1]) < arm_chrom[columns_second[0]]['p']['end']): - # create empty segment to fill in - columns_new = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_new) - next_segment = (lines[i+1].rstrip("\n").rstrip("\r")).split("\t") - if (int(columns_second[2]) < arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) < int(next_segment[1])): - seg_filled.append(columns_second) - seg_filled.append(columns_second) - - # deal with centromeres - # I already know that this is same sample, and same chromosome - elif (int(columns_first[1]) < arm_chrom[columns_first[0]]['p']['end'] and int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end']): - - # first lets deal with end of p arm: segment 1 might end before centromere, or within centromere - if int(columns_first[2]) < arm_chrom[columns_first[0]]['p']['end']: - columns_new = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill - seg_filled.append(columns_new) - # if it extends into centromere, cut segment 1 at the end of p arm - else: - columns_first[2] = str(arm_chrom[columns_first[0]]['p']['end']) - seg_filled.append(columns_first) - - # now lets deal with start of q arm: it might start within or after centromere - if int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']: - columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start']) - seg_filled.append(columns_second) - next_segment = (lines[i+1].rstrip("\n").rstrip("\r")).split("\t") - if (int(next_segment[1]) < arm_chrom[next_segment[0]]['q']['start'] and int(next_segment[2]) > arm_chrom[next_segment[0]]['q']['start']): - next_segment[1] = str(arm_chrom[next_segment[0]]['q']['start']) - seg_filled.append(next_segment) - - # possible edge cases around centromere - else: - columns_new = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[2])-1)] + columns_fill - previous_segment = 
(lines[i].rstrip("\n").rstrip("\r")).split("\t") - if (int(previous_segment[2])>arm_chrom[columns_second[0]]['q']['start']): - columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(previous_segment[2]))] + columns_first[3:13] - seg_filled.append(columns_edges) - columns_new = [columns_edges[0], str(int(columns_edges[2])+1), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_new) - seg_filled.append(columns_second) - - # for segments in q arm - elif (int(columns_first[1]) > arm_chrom[columns_second[0]]['q']['start']): - # create empty segment to fill in - columns_new = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_new) - seg_filled.append(columns_second) - - # some segments are completely within centromere. drop them - elif (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']): - if (int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']): - columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), str(int(columns_first[2]))] + columns_first[3:13] - columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_new) - else: - columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[1])-1)] + columns_fill - if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']): - columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start']) - seg_filled.append(columns_second) - seg_filled.append(columns_edges) - if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) < arm_chrom[columns_second[0]]['q']['start']): - pass # this just drops the segment from output if it is within centromere - else: - if (int(columns_second[1])>arm_chrom[columns_second[0]]['q']['start']): - seg_filled.append(columns_second) - elif (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[1]) < arm_chrom[columns_second[0]]['q']['start']): - pass # this is handled later - - # did I miss anything? it is possible some edge cases were not considered at time of script development - else: - print(columns_first[0], columns_second[0], columns_first[1], columns_second[1], columns_first[2], columns_second[2]) - raise ValueError ("Other sort of way. This is an edge case that needs debugging!") - - # scenario 2: same sample, but going over to the new chromosome - elif (columns_first[0]!=columns_second[0]): - # very rare cases when whole chromosome is missing, identify them here - if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]): - missing_chrom = chrom_order[chrom_order.index(columns_first[0])+1] - missing_p = [missing_chrom, str(arm_chrom[missing_chrom]['p']['start']), str(arm_chrom[missing_chrom]['p']['end'])] + columns_fill - missing_q = [missing_chrom, str(arm_chrom[missing_chrom]['q']['start']), str(arm_chrom[missing_chrom]['q']['end'])] + columns_fill - seg_filled.append(missing_p) - seg_filled.append(missing_q) - - # first, are there any segments in the p arm? 
that means second segments starts all the way in centromere or q arm - if (int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']): #TRUE - if (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']): #FALSE - previous_segment = (lines[i-1].rstrip("\n").rstrip("\r")).split("\t") - if (chrom_order[chrom_order.index(previous_segment[0])] != chrom_order[chrom_order.index(columns_first[0])]): - columns_edges = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill - seg_filled.append(columns_edges) - # pass - elif (int(columns_first[1]) < arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[2]) > arm_chrom[columns_first[0]]['q']['start']): #TRUE - previous_segment = (lines[i-1].rstrip("\n").rstrip("\r")).split("\t") - if (chrom_order[chrom_order.index(previous_segment[0])] != chrom_order[chrom_order.index(columns_first[0])-1]): - columns_edges = [columns_first[0], columns_first[1], str(arm_chrom[columns_first[0]]['p']['end'])] + columns_first[3:13] - columns_first[1] = arm_chrom[columns_first[0]]['q']['start'] - seg_filled.append(columns_edges) - seg_filled.append(columns_first) - # Case when a chromosome has a single event that spans the centromere, split into four parts (p-start to seg-start, seg-start to p-end, q-start to seg-end, seg-end to q-end) - else: - columns_pedge = [columns_first[0], str(arm_chrom[columns_first[0]]['p']['start']), int(columns_first[1])-1] + columns_fill - columns_segp = [columns_first[0], columns_first[1], str(arm_chrom[columns_first[0]]['p']['end'])] + columns_first[3:13] - columns_segq = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), columns_first[2]] + columns_first[3:13] - columns_qedge = [columns_first[0], int(columns_first[2]) + 1, str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill - seg_filled.append(columns_pedge) - seg_filled.append(columns_segp) - seg_filled.append(columns_segq) - seg_filled.append(columns_qedge) - - else: - columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill - seg_filled.append(columns_edges) - if (int(columns_second[1]) > arm_chrom[columns_second[0]]['p']['end']): - if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]): - seg_filled.append(missing_p) - seg_filled.append(missing_q) - if (int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start'] and int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end']): - columns_first[1] = arm_chrom[columns_first[0]]['q']['start'] - seg_filled.append(columns_first) - columns_new = [columns_second[0], str(arm_chrom[columns_second[0]]['p']['start']), str(arm_chrom[columns_second[0]]['p']['end'])] + columns_fill - seg_filled.append(columns_new) - if (int(columns_second[1]) > arm_chrom[columns_second[0]]['q']['start']): - columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_edges) - else: - columns_second[1] = str(arm_chrom[columns_second[0]]['q']['start']) - seg_filled.append(columns_second) - - # are there any segments in the q arm? 
that means first segment ends before start of q arm - elif (int(columns_first[2]) < arm_chrom[columns_first[0]]['q']['start']): - columns_new = [columns_first[0], str(arm_chrom[columns_first[0]]['q']['start']), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill - seg_filled.append(columns_first) - if (int(columns_first[2]) < arm_chrom[columns_first[0]]['p']['end']): - columns_edges = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['p']['end'])] + columns_fill - seg_filled.append(columns_edges) - seg_filled.append(columns_new) - seg_filled.append(columns_second) - - # are there any segments that starts in p arm and span centromere? if so, maintain loh flag and logr, but cut out centromere - elif (int(columns_second[1]) < arm_chrom[columns_second[0]]['p']['end'] and int(columns_second[2]) > arm_chrom[columns_second[0]]['q']['start']): - previous_segment = (lines[i].rstrip("\n").rstrip("\r")).split("\t") - if "X" not in str(columns_second[0]): - next_segment = (lines[i+2].rstrip("\n").rstrip("\r")).split("\t") - columns_new = [columns_second[0], str(int(columns_second[1])+1), str(arm_chrom[columns_second[0]]['p']['end'])] + columns_first[3:13] - columns_edges = [columns_second[0], str(arm_chrom[columns_second[0]]['q']['start']), str(int(columns_second[2]))] + columns_first[3:13] - if (columns_new[0]!=previous_segment[0]): - columns_new[1]=str(arm_chrom[columns_new[0]]['p']['start']) - if (columns_second[0]==next_segment[0]): - seg_filled.append(columns_new) - seg_filled.append(columns_edges) - - # in other cases, there are segments both in p and q arms - else: - columns_edges = [columns_second[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_second[1])-1)] + columns_fill - if (int(columns_first[1]) > arm_chrom[columns_second[0]]['p']['end']): - if (int(columns_first[1]) > arm_chrom[columns_first[0]]['p']['end'] and int(columns_first[1]) < arm_chrom[columns_first[0]]['q']['start']): - columns_first[1] = arm_chrom[columns_first[0]]['q']['start'] - seg_filled.append(columns_first) - columns_new = [columns_first[0], str(int(columns_first[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill - seg_filled.append(columns_new) - if (chrom_order[chrom_order.index(columns_second[0])] != chrom_order[chrom_order.index(columns_first[0])+1]): - seg_filled.append(missing_p) - seg_filled.append(missing_q) - seg_filled.append(columns_edges) - if (int(columns_second[2]) < arm_chrom[columns_second[0]]['p']['end']): - seg_filled.append(columns_second) - - - # scenario 3: new sample, obviously new chromosome - else: - previous_segment = (lines[i].rstrip("\n").rstrip("\r")).split("\t") - columns_edges = [columns_first[0], str(int(previous_segment[2])+1), str(arm_chrom[columns_first[0]]['q']['end'])] + columns_fill - columns_new = [columns_second[0], str(arm_chrom[columns_first[0]]['p']['start']), str(int(columns_second[1])-1)] + columns_fill - seg_filled.append(columns_edges) - seg_filled.append(columns_new) - seg_filled.append(columns_second) - - - seg.close() - - # make df from list of lists and convert chromosome coordinates to integers - seg_filled_df = pd.DataFrame(seg_filled, columns = header) - seg_filled_df["startpos"] = seg_filled_df["startpos"].astype(int) - seg_filled_df["endpos"] = seg_filled_df["endpos"].astype(int) - - # remove any inverted segments, if there are - print("Checking and removing inverted segments...") - seg_filled_df = seg_filled_df[(seg_filled_df["endpos"]>seg_filled_df["startpos"])] - - # remove any 
duplicated segments, if there are - print("Checking and removing duplicated segments...") - seg_filled_df = seg_filled_df.drop_duplicates() - # seg_filled_df = seg_filled_df.groupby((seg_filled_df["endpos"] != seg_filled_df["endpos"].shift(-2)).cumsum().values).first() - # seg_filled_df = seg_filled_df.groupby((seg_filled_df["endpos"] != seg_filled_df["endpos"].shift(-1)).cumsum().values).first() - # seg_filled_df = seg_filled_df.groupby((seg_filled_df["startpos"] != seg_filled_df["startpos"].shift(-1)).cumsum().values).first() - - # save to the output file specified by user - print("Saving to file...") - seg_filled_df.to_csv(output_file, header=True, index=False, sep="\t") - print("Done!") - - -# Create nested dictionary to store shromosome arms coordinates. It is adopted from Chris's implementation in other script that summarizes CNVs -def load_chrom_arm(chrom_file): - arm_chrom = {} - required_cols = ["chromosome", "start", "end", "arm"] - header_cols = {} - - i = 0 - with open(chrom_file) as f: - for line in f: - i += 1 - line = line.rstrip("\n").rstrip("\r") # Remove line endings - cols = line.split("\t") - - # Skip empty lines - if not line: - continue - - # If we haven't parsed the header yet, assume this is the first line of the file (aka the header) - if not header_cols: - j = 0 - for col in cols: - if col in required_cols: - header_cols[col] = j - j += 1 - - # Check to make sure all required columns are found - for col in required_cols: - if col not in header_cols: - raise AttributeError("Unable to locate column %s in the chromosome arm positions file \'%s\'" % (col, chrom_file)) - # If we get this far, the header is valid - continue - - if cols[0] not in arm_chrom: - arm_chrom[cols[0]] = {} - if cols[3]: - if cols[3] not in arm_chrom[cols[0]]: - arm_chrom[cols[0]][cols[3]]={} - arm_chrom[cols[0]][cols[3]]['start'] = int(cols[1]) - arm_chrom[cols[0]][cols[3]]['end'] = int(cols[2]) - return arm_chrom - - -# Check that required arguments are provided, and the input is in .seg format -def check_arguments(args, input_format): - if input_format == 'seg' and not all([args.input, args.output, args.chromArm]): - raise ValueError ('Must specify input .seg file, output file, and file listing coordinates of chromosome arms.') - elif input_format != 'seg': - raise ValueError ('Input file must be in .seg format') - else: - pass - - -# Parse arguments from command line -def parse_args(): - parser = argparse.ArgumentParser() - - parser.add_argument("--input", - help="Imput file in .seg format to fill segments", required=True) - parser.add_argument("--output", - help="Resulting file after filling missing segments", required=True) - parser.add_argument("--chromArm", - help="File with coordinates of chromosme arms for a given genome build", required=True) - - # ignore everything else that is not required by this script - args, unknown = parser.parse_known_args() - # return arguments provided by user - return args - - -if __name__ == '__main__': - main() From 8a48e7319be0310e89425be627611891f869fcae Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 17:08:51 -0700 Subject: [PATCH 04/14] Working Py/PhyClone module --- modules/pyclone_vi/1.0/config/default.yaml | 64 +++ modules/pyclone_vi/1.0/envs/gamblr.yaml | 1 + modules/pyclone_vi/1.0/envs/phyclone.yaml | 17 + modules/pyclone_vi/1.0/envs/pyclone-vi.yaml | 5 + modules/pyclone_vi/1.0/envs/pyclone.yaml | 5 + modules/pyclone_vi/1.0/envs/python.yaml | 6 + modules/pyclone_vi/1.0/pyclone_vi.smk | 397 ++++++++++++++++++ 
modules/pyclone_vi/1.0/schemas/base-1.0.yaml | 1 + .../1.0/schemas/time_point-1.0.yaml | 1 + modules/pyclone_vi/1.0/src/build_input.py | 213 ++++++++++ .../1.0/src/build_pyclone_summary_file.py | 33 ++ .../1.0/src/compute_expected_statistics.py | 126 ++++++ modules/pyclone_vi/1.0/src/merge_files.py | 26 ++ .../1.0/src/subset_maf_for_pyclone.R | 56 +++ modules/pyclone_vi/CHANGELOG.md | 16 + 15 files changed, 967 insertions(+) create mode 100644 modules/pyclone_vi/1.0/config/default.yaml create mode 120000 modules/pyclone_vi/1.0/envs/gamblr.yaml create mode 100644 modules/pyclone_vi/1.0/envs/phyclone.yaml create mode 100644 modules/pyclone_vi/1.0/envs/pyclone-vi.yaml create mode 100644 modules/pyclone_vi/1.0/envs/pyclone.yaml create mode 100644 modules/pyclone_vi/1.0/envs/python.yaml create mode 100644 modules/pyclone_vi/1.0/pyclone_vi.smk create mode 120000 modules/pyclone_vi/1.0/schemas/base-1.0.yaml create mode 120000 modules/pyclone_vi/1.0/schemas/time_point-1.0.yaml create mode 100644 modules/pyclone_vi/1.0/src/build_input.py create mode 100644 modules/pyclone_vi/1.0/src/build_pyclone_summary_file.py create mode 100644 modules/pyclone_vi/1.0/src/compute_expected_statistics.py create mode 100644 modules/pyclone_vi/1.0/src/merge_files.py create mode 100644 modules/pyclone_vi/1.0/src/subset_maf_for_pyclone.R create mode 100644 modules/pyclone_vi/CHANGELOG.md diff --git a/modules/pyclone_vi/1.0/config/default.yaml b/modules/pyclone_vi/1.0/config/default.yaml new file mode 100644 index 00000000..2e7ee60b --- /dev/null +++ b/modules/pyclone_vi/1.0/config/default.yaml @@ -0,0 +1,64 @@ +lcr-modules: + + pyclone_vi: + + # TODO: Update the list of available wildcards, if applicable + inputs: + # Available wildcards: {seq_type} {genome_build} {tumour_id} {normal_id} {pair_status} + sample_maf: "__UPDATE__" + sample_subclones: "__UPDATE__" + sample_cellularity: "__UPDATE__" + sample_sex: "__UPDATE__" # Only {normal_id} available + + scratch_subdirectories: [] + + options: + build_input: + subset_maf: True # Whether to use the full maf or subset to only coding and aSHM mutations. 
+ gamblr_branch: "" + gamblr_config_url: "https://raw.githubusercontent.com/morinlab/GAMBLR/master/config.yml" + fit: + num_clusters: 30 + num_restarts: 100 + opts: "--mix-weight-prior 10" + phyclone: + burnin: 100 + num_iters: 350 + density: "binomial" # Options: "binomial" or "beta-binomial" + + conda_envs: + python: "{MODSDIR}/envs/python.yaml" + pyclone-vi: "{MODSDIR}/envs/pyclone-vi.yaml" + pyclone: "{MODSDIR}/envs/pyclone.yaml" + phyclone: "{MODSDIR}/envs/phyclone.yaml" + fill_battenberg: "{MODSDIR}/envs/fill_battenberg.yaml" + gamblr: "{MODSDIR}/envs/gamblr.yaml" + + scripts: + subset_maf: "{MODSDIR}/src/subset_maf_for_pyclone.R" + build_input: "{MODSDIR}/src/build_input.py" + compute_stats: "{MODSDIR}/src/compute_expected_statistics.py" + + threads: + build_input: 1 + fit: 4 + phyclone: 4 + + resources: + build_input: + mem_mb: 2000 + fit: + mem_mb: 50000 + phyclone: + mem_mb: 50000 + + + pairing_config: + genome: + run_paired_tumours: True + run_unpaired_tumours_with: null + run_paired_tumours_as_unpaired: False + capture: + run_paired_tumours: True + run_unpaired_tumours_with: null + run_paired_tumours_as_unpaired: False diff --git a/modules/pyclone_vi/1.0/envs/gamblr.yaml b/modules/pyclone_vi/1.0/envs/gamblr.yaml new file mode 120000 index 00000000..5b0e0256 --- /dev/null +++ b/modules/pyclone_vi/1.0/envs/gamblr.yaml @@ -0,0 +1 @@ +/home/lhilton/repos/lcr-modules-starfish/envs/GAMBLR/gamblr.yaml \ No newline at end of file diff --git a/modules/pyclone_vi/1.0/envs/phyclone.yaml b/modules/pyclone_vi/1.0/envs/phyclone.yaml new file mode 100644 index 00000000..bc365861 --- /dev/null +++ b/modules/pyclone_vi/1.0/envs/phyclone.yaml @@ -0,0 +1,17 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python =3.9 + - biopython + - click + - matplotlib + - networkx + - numba + - numpy + - pandas + - scikit-learn + - scipy + - pip + - pip: + - "git+ssh://git@github.com/aroth85/phyclone.git@7c717f79e62535de952defc15f9907a82deda520" diff --git a/modules/pyclone_vi/1.0/envs/pyclone-vi.yaml b/modules/pyclone_vi/1.0/envs/pyclone-vi.yaml new file mode 100644 index 00000000..69f82281 --- /dev/null +++ b/modules/pyclone_vi/1.0/envs/pyclone-vi.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - pyclone-vi diff --git a/modules/pyclone_vi/1.0/envs/pyclone.yaml b/modules/pyclone_vi/1.0/envs/pyclone.yaml new file mode 100644 index 00000000..339a0081 --- /dev/null +++ b/modules/pyclone_vi/1.0/envs/pyclone.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - pyclone diff --git a/modules/pyclone_vi/1.0/envs/python.yaml b/modules/pyclone_vi/1.0/envs/python.yaml new file mode 100644 index 00000000..d26c3e48 --- /dev/null +++ b/modules/pyclone_vi/1.0/envs/python.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge +dependencies: + - python =3.9 + - networkx + - pandas diff --git a/modules/pyclone_vi/1.0/pyclone_vi.smk b/modules/pyclone_vi/1.0/pyclone_vi.smk new file mode 100644 index 00000000..3a038b61 --- /dev/null +++ b/modules/pyclone_vi/1.0/pyclone_vi.smk @@ -0,0 +1,397 @@ +#!/usr/bin/env snakemake + + +##### ATTRIBUTION ##### + + +# Original Author: Andrew Roth +# Module Author: Laura Hilton +# Contributors: N/A + + +##### SETUP ##### + + +# Import package with useful functions for developing analysis modules +import oncopipe as op +import hashlib +import glob + +# Setup module and store module-specific configuration in `CFG` +# `CFG` is a shortcut to `config["lcr-modules"]["pyclone_vi"]` +CFG = op.setup_module( + name = "pyclone_vi", + version = 
"1.0", + subdirectories = ["inputs", "build_inputs", "fit", "og-pyclone", "phyclone", "outputs"], +) + +# Define rules to be run locally when using a compute cluster +# TODO: Replace with actual rules once you change the rule names +localrules: + _pyclone_vi_write_results, + _pyclone_vi_all + +# Install GAMBLR + +# Obtain the path to the GAMBLR conda environment +md5hash = hashlib.md5() +if workflow.conda_prefix: + conda_prefix = workflow.conda_prefix +else: + conda_prefix = os.path.abspath(".snakemake/conda") + +md5hash.update(conda_prefix.encode()) +f = open("config/envs/GAMBLR.yaml", 'rb') +md5hash.update(f.read()) +f.close() +h = md5hash.hexdigest() +GAMBLR = glob.glob(conda_prefix + "/" + h[:8] + "*")[0] + +rule _pyclone_vi_install_GAMBLR: + params: + branch = ", ref = \"" + CFG["options"]["build_input"]['gamblr_branch'] + "\"" if CFG["options"]["build_input"]['gamblr_branch'] != "" else "", + config_url = CFG["options"]["build_input"]["gamblr_config_url"] + output: + installed = directory(GAMBLR + "/lib/R/library/GAMBLR"), + config = "gamblr.yaml" + conda: + CFG['conda_envs']['gamblr'] + shell: + op.as_one_line(""" + wget -qO {output.config} {params.config_url} && + R -q -e 'options(timeout=9999999); devtools::install_github("morinlab/GAMBLR"{params.branch})' + """) + + +##### RULES ##### + + +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _pyclone_vi_input_maf: + input: + maf = CFG["inputs"]["sample_maf"] + output: + maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf" + group: "input_and_build" + run: + op.relative_symlink(input.maf, output.maf) + +rule _pyclone_vi_input_battenberg: + input: + subclones = CFG["inputs"]["sample_subclones"], + cellularity = CFG["inputs"]["sample_cellularity"], + sex = CFG["inputs"]["sample_sex"] + output: + subclones = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.battenberg.subclones.txt", + cellularity = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.battenberg.cellularity_ploidy.txt", + sex = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.battenberg.inferred_sex.txt" + group: "input_and_build" + wildcard_constraints: + seq_type = "genome" + run: + op.absolute_symlink(input.subclones, output.subclones) + op.absolute_symlink(input.cellularity, output.cellularity) + op.absolute_symlink(input.sex, output.sex) + + +# Prepare Pyclone inputs + + +rule _pyclone_vi_subset_maf: + input: + maf = str(rules._pyclone_vi_input_maf.output.maf), + GAMBLR = ancient(rules._pyclone_vi_install_GAMBLR.output.installed) + output: + maf = CFG["dirs"]["build_inputs"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.subset.maf" + params: + script = CFG["scripts"]["subset_maf"] + conda: + CFG["conda_envs"]["gamblr"] + script: + "{params.script}" + +subset_maf = CFG["options"]["build_input"]["subset_maf"] + +rule _pyclone_vi_build_input: + input: + maf = str(rules._pyclone_vi_subset_maf.output.maf) if subset_maf else str(rules._pyclone_vi_input_maf.output.maf), + cnv = expand(str(rules._pyclone_vi_input_battenberg.output.subclones), seq_type = "genome", allow_missing = True), + cellularity = expand(str(rules._pyclone_vi_input_battenberg.output.cellularity), seq_type = "genome", allow_missing = True), + sex = 
expand(str(rules._pyclone_vi_input_battenberg.output.sex), seq_type = "genome", allow_missing = True) + output: + inputs = CFG["dirs"]["build_inputs"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.inputs.tsv" + params: + script = CFG["scripts"]["build_input"], + version = lambda w: {"genome": "pyclone-vi", "capture": "pyclone"}[w.seq_type] + log: + CFG["logs"]["build_inputs"] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}/build_input.log" + conda: + CFG["conda_envs"]["python"] + threads: + CFG["threads"]["build_input"] + resources: + **CFG["resources"]["build_input"] + group: "input_and_build" + shell: + op.as_one_line(""" + cellularity=$(tail -n +2 {input.cellularity} | cut -f 1); + if [[ $(tail -n+2 {input.sex} | cut -f4) == "female" ]]; then sex="F"; else sex="M"; fi; + echo "Prepping PyClone-vi inputs with cellularity $cellularity and sex $sex. "; + python {params.script} -c battenberg -s $sex -t $cellularity -ic {input.cnv} -is {input.maf} -o {output.inputs} -id {wildcards.tumour_id} -p {params.version} + > {log} 2>&1 + """) + +# Merge all built inputs into a single tsv file per patient +def get_built_inputs(wildcards): + CFG = config["lcr-modules"]["pyclone_vi"] + PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"]) + inputs = expand( + [ + str(rules._pyclone_vi_build_input.output.inputs) + ], + zip, + tumour_id = PATIENT["tumour_sample_id"], + normal_id = PATIENT["normal_sample_id"], + pair_status = PATIENT["pair_status"], + allow_missing = True + ) + return(inputs) + +def get_cellularity(wildcards): + CFG = config["lcr-modules"]["pyclone_vi"] + PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id).sort_values(by = ["tumour_time_point"]) + inputs = expand( + rules._pyclone_vi_input_battenberg.input.cellularity, + zip, + seq_type = ["genome"]*len(PATIENT["tumour_genome_build"]), + genome_build = PATIENT["tumour_genome_build"], + tumour_id = PATIENT["tumour_sample_id"], + normal_id = PATIENT["normal_sample_id"], + pair_status = PATIENT["pair_status"] + ) + cell_list = [] + for file in inputs: + cellularity = pd.read_csv(file, sep = "\t") + cellularity = cellularity["cellularity"].tolist() + cell_list = cell_list + cellularity + cell_list = [str(i) for i in cell_list] + cellularities = " ".join(cell_list) + sample_ids = " ".join(PATIENT.tumour_sample_id.tolist()) + return dict(cellularity = cellularities, sample_ids = sample_ids) + +rule _pyclone_vi_merge_input: + input: + get_built_inputs + output: + merged = CFG["dirs"]["build_inputs"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.merged_inputs.tsv" + group: "input_and_build" + run: + df_list = [] + for tsv in input: + df = pd.read_csv(tsv, sep = "\t", header = 0, index_col = None) + df_list.append(df) + df_merged = pd.concat(df_list, ignore_index = True, axis = 0) + df_merged.sort_values("mutation_id", inplace=True) + df_merged.rename(columns = {"var_counts": "alt_counts"}, inplace=True) + df_merged.to_csv(output.merged, sep = "\t", index=False) + +# PyClone for capture data +rule _pyclone_run_analysis_pipeline: + input: + tsv = get_built_inputs + output: + loci = CFG["dirs"]["og-pyclone"] + "{seq_type}--{genome_build}/{patient_id}/tables/loci.tsv" + params: + workdir = CFG["dirs"]["og-pyclone"] + "{seq_type}--{genome_build}/{patient_id}", + cellularity = lambda w: get_cellularity(w)["cellularity"], + sample_ids = lambda w: 
get_cellularity(w)["sample_ids"] + conda: CFG["conda_envs"]["pyclone"] + wildcard_constraints: seq_type = "capture" + shell: + op.as_one_line(""" + PyClone run_analysis_pipeline + --in_files {input.tsv} + --working_dir {params.workdir} + --tumour_contents {params.cellularity} + --samples {params.sample_ids} + --plot_file_format pdf + """) + + +# Fit pyclone-vi +rule _pyclone_vi_fit: + input: + tsv = str(rules._pyclone_vi_merge_input.output.merged) + output: + trace = CFG["dirs"]["fit"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.h5" + log: + CFG["logs"]["fit"] + "{seq_type}--{genome_build}/{patient_id}/fit.log" + params: + **CFG["options"]["fit"] + threads: + CFG["threads"]["fit"] + resources: + **CFG["resources"]["fit"] + conda: + CFG["conda_envs"]["pyclone-vi"] + group: "fit_and_write" + wildcard_constraints: seq_type = "genome" + shell: + op.as_one_line(""" + pyclone-vi fit + -i {input.tsv} + -o {output.trace} + -c {params.num_clusters} + -d binomial + -r {params.num_restarts} + {params.opts} + > {log} 2>&1 + """) + +# Fit pyclone-vi +rule _pyclone_vi_write_results: + input: + trace = str(rules._pyclone_vi_fit.output.trace) + output: + results = CFG["dirs"]["fit"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.pyclone_vi.tsv" + log: + CFG["logs"]["fit"] + "{seq_type}--{genome_build}/{patient_id}/write.log" + conda: + CFG["conda_envs"]["pyclone-vi"] + group: "fit_and_write" + wildcard_constraints: seq_type = "genome" + shell: + op.as_one_line(""" + pyclone-vi write-results-file -i {input.trace} -o {output.results} > {log} 2>&1 + """) + +# Run PhyClone to get phylogenetic tree +rule _pyclone_vi_run_phyclone: + input: + merged = str(rules._pyclone_vi_merge_input.output.merged), + pyclone = lambda w: str(rules._pyclone_vi_write_results.output.results) if w.seq_type == "genome" else str(rules._pyclone_run_analysis_pipeline.output.loci) + output: + trace = CFG["dirs"]["phyclone"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.phyclone.pkl.gz" + params: + **CFG["options"]["phyclone"] + threads: + CFG["threads"]["phyclone"] + resources: + **CFG["resources"]["phyclone"] + conda: + CFG["conda_envs"]["phyclone"] + shell: + op.as_one_line(""" + phyclone run + -i {input.merged} + -c {input.pyclone} + -o {output.trace} + -b {params.burnin} + -d {params.density} + -n {params.num_iters} + """) + +rule _pyclone_vi_phyclone_consensus: + input: + merged = str(rules._pyclone_vi_merge_input.output.merged), + pyclone = str(rules._pyclone_vi_run_phyclone.output.trace) + output: + tree = CFG["dirs"]["phyclone"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.phyclone.tree.nwk", + clusters = CFG["dirs"]["phyclone"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.phyclone.clusters.tsv" + params: + **CFG["options"]["phyclone"] + threads: + CFG["threads"]["phyclone"] + resources: + **CFG["resources"]["phyclone"] + conda: + CFG["conda_envs"]["phyclone"] + shell: + op.as_one_line(""" + phyclone consensus + -i {input.pyclone} + -t {output.tree} + -o {output.clusters} + """) + +rule _pyclone_vi_compute_tree_stats: + input: + trace = str(rules._pyclone_vi_run_phyclone.output.trace) + output: + stats = CFG["dirs"]["phyclone"] + "{seq_type}--{genome_build}/{patient_id}/{patient_id}.phyclone.stats.tsv" + params: + **CFG["options"]["phyclone"], + script = CFG["scripts"]["compute_stats"] + threads: + CFG["threads"]["phyclone"] + resources: + **CFG["resources"]["phyclone"] + conda: + CFG["conda_envs"]["phyclone"] + shell: + op.as_one_line(""" + python {params.script} + -i 
{input.trace} + -o {output.stats} + -p {wildcards.patient_id} + -b {params.burnin} + """) + + +# Symlinks the final output files into the module results directory (under '99-outputs/') +rule _pyclone_vi_output_tsv: + input: + pyclone = lambda w: str(rules._pyclone_vi_write_results.output.results) if w.seq_type == "genome" else str(rules._pyclone_run_analysis_pipeline.output.loci), + phyclone = str(rules._pyclone_vi_compute_tree_stats.output.stats), + tree = str(rules._pyclone_vi_phyclone_consensus.output.tree), + clusters = str(rules._pyclone_vi_phyclone_consensus.output.clusters), + output: + pyclone = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.pyclone.results.tsv", + phyclone = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.phyclone.stats.tsv", + tree = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.phyclone.tree.nwk", + clusters = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.phyclone.clusters.tsv" + run: + op.relative_symlink(input.pyclone, output.pyclone) + op.relative_symlink(input.phyclone, output.phyclone) + op.relative_symlink(input.tree, output.tree) + op.relative_symlink(input.clusters, output.clusters) + + +# Generates the target sentinels for each run, which generate the symlinks + +# Generate a de-duplicated table of patient_ids etc. +PATIENTS_GENOMES = op.filter_samples(CFG["runs"][["tumour_patient_id", "normal_patient_id", "tumour_genome_build", "tumour_seq_type"]], tumour_seq_type = "genome")\ + .drop_duplicates(subset = None, ignore_index = True) +PATIENTS_CAPTURE = op.filter_samples(CFG["runs"][["tumour_patient_id", "normal_patient_id", "tumour_genome_build", "tumour_seq_type"]], tumour_seq_type = "capture")\ + .drop_duplicates(subset = None, ignore_index = True) +if isinstance(PATIENTS_GENOMES, pd.DataFrame) and isinstance(PATIENTS_CAPTURE, pd.DataFrame): + PATIENTS = pd.concat([PATIENTS_GENOMES, PATIENTS_CAPTURE]) + +rule _pyclone_vi_all: + input: + expand( + [ + str(rules._pyclone_vi_output_tsv.output.phyclone), + str(rules._pyclone_vi_output_tsv.output.pyclone), + str(rules._pyclone_vi_output_tsv.output.tree), + str(rules._pyclone_vi_output_tsv.output.clusters) + ], + zip, # Run expand() with zip(), not product() + seq_type=PATIENTS["tumour_seq_type"], + genome_build=PATIENTS["tumour_genome_build"], + patient_id=PATIENTS["tumour_patient_id"]) + # expand( + # str(rules._pyclone_run_analysis_pipeline.output.workdir), + # zip, + # seq_type=PATIENTS_CAPTURE["tumour_seq_type"], + # genome_build=PATIENTS_CAPTURE["tumour_genome_build"], + # patient_id=PATIENTS_CAPTURE["tumour_patient_id"] + # ) + + +##### CLEANUP ##### + + +# Perform some clean-up tasks, including storing the module-specific +# configuration on disk and deleting the `CFG` variable +op.cleanup_module(CFG) diff --git a/modules/pyclone_vi/1.0/schemas/base-1.0.yaml b/modules/pyclone_vi/1.0/schemas/base-1.0.yaml new file mode 120000 index 00000000..0a69d1ce --- /dev/null +++ b/modules/pyclone_vi/1.0/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/pyclone_vi/1.0/schemas/time_point-1.0.yaml b/modules/pyclone_vi/1.0/schemas/time_point-1.0.yaml new file mode 120000 index 00000000..c163d396 --- /dev/null +++ b/modules/pyclone_vi/1.0/schemas/time_point-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/time_point/time_point-1.0.yaml \ No newline at end of file diff --git a/modules/pyclone_vi/1.0/src/build_input.py b/modules/pyclone_vi/1.0/src/build_input.py new file 
mode 100644 index 00000000..f412fd6c --- /dev/null +++ b/modules/pyclone_vi/1.0/src/build_input.py @@ -0,0 +1,213 @@ +from operator import index +import numpy as np +import pandas as pd + + +def main(args): + df = build_input_df( + args.cnv_file, + args.snv_file, + sample_id=args.sample_id, + cnv_caller=args.cnv_caller, + sex=args.sex, + tumour_content=args.tumour_content, + pyclone_tool=args.pyclone_tool + ) + + assert np.all(df["major_cn"] >= df["minor_cn"]) + + df = df[df["major_cn"] > 0] + + df.to_csv(args.out_file, index=False, sep="\t") + + +def build_input_df(cnv_file, snv_file, sample_id="tumour", cnv_caller="battenberg", sex=None, tumour_content=1.0, pyclone_tool="pyclone-vi"): + if cnv_caller == "battenberg": + cnv_df = load_battenberg_cnv_df(cnv_file, sample_id=sample_id, sex=sex) + else: + cnv_df = load_sequenza_cnv_df(cnv_file, sex=sex) + snv_df = load_snv_df(snv_file, sample_id=sample_id) + df = merge_files(cnv_df, snv_df) + df["tumour_content"] = tumour_content + if pyclone_tool == "pyclone": + df = df[["mutation_id", "ref_counts", "alt_counts", "normal_cn", "major_cn", "minor_cn", "tumour_content", "sample_id"]].rename(columns = {"alt_counts": "var_counts"}) + return df + + +def load_battenberg_cnv_df(file_name, sample_id="tumour", sex=None, solution="A"): + def get_dominant_clone(x, solution="A"): + frac_1_key = "frac1_{}".format(solution) + frac_2_key = "frac2_{}".format(solution) + if np.isnan(x[frac_2_key]): + clone = 1 + else: + if x[frac_1_key] >= x[frac_2_key]: + clone = 1 + else: + clone = 2 + return clone + + def get_major_cn(x, solution="A"): + clone = get_dominant_clone(x, solution=solution) + return x["nMaj{clone}_{solution}".format(clone=clone, solution=solution)] + + def get_minor_cn(x, solution="A"): + clone = get_dominant_clone(x, solution=solution) + return x["nMin{clone}_{solution}".format(clone=clone, solution=solution)] + + df = pd.read_csv(file_name, sep="\t") + df = df.rename(columns={"chr": "chrom", "startpos": "start", "endpos": "end"}) + df["major_cn"] = df.apply(lambda row: get_major_cn(row, solution=solution), axis=1) + df["minor_cn"] = df.apply(lambda row: get_minor_cn(row, solution=solution), axis=1) + df["normal_cn"] = df["chrom"].apply(lambda row: get_normal_cn(row, sex=sex)) + df["sample_id"] = sample_id + df = df[["sample_id", "chrom", "start", "end", "normal_cn", "major_cn", "minor_cn"]] + if sex is None: + df = df[~df["chrom"].isin(["X", "Y"])] + df.to_csv("loaded_battenberg.tsv", sep="\t", index=False) + return df + + +def load_sequenza_cnv_df(file_name, sex=None): + df = pd.read_csv(file_name, sep="\t") + df = df.rename(columns={"chromosome": "chrom", "start.pos": "start", "end.pos": "end"}) + df["major_cn"] = df.apply(lambda row: max(row["A"], row["B"]), axis=1) + df["minor_cn"] = df.apply(lambda row: min(row["A"], row["B"]), axis=1) + df["normal_cn"] = df["chrom"].apply(lambda row: get_normal_cn(row, sex=sex)) + df = df[["chrom", "start", "end", "normal_cn", "major_cn", "minor_cn"]] + if sex is None: + df = df[~df["chrom"].isin(["X", "Y"])] + df["sample_id"] = "tumour" + df = df.dropna() + return df + + +def get_normal_cn(chrom, sex): + if sex == "M": + if chrom in ["X", "Y"]: + cn = 1 + else: + cn = 2 + else: + cn = 2 + return cn + + +def load_snv_df(file_name, sample_id="tumour"): + df = pd.read_csv(file_name, sep="\t") + # PyClone only works on SNPs, not InDels + df = df[df["Variant_Type"].isin(["SNP"])] + # Ignore intergenic mutations (IGR) + df = df[~df["Variant_Classification"].isin(["IGR"])] + # Acutally I can't do this 
sub-sampling here because the mutations in all files for all tumours + # per patient need to be the same! Have to do this at the merging step. + # Except that by the time this file is generated, info about coding status of + # mutations is lost. Need to think about this. + # Separate the df into coding and non-coding mutations first + # df_coding = df[~df["Variant_Classification"].isin(["3'Flank", "5'Flank", "Intron"])] + # df_noncoding = df[df["Variant_Classification"].isin(["3'Flank", "5'Flank", "Intron"])] + # # To get up to 5000 total mutations, first check how many coding mutations there are: + # if len(df_coding.index) >= 5000: + # df = df_coding.sample(n = 5000) + # else: + # to_add = min([(5000 - len(df_coding.index)), len(df_noncoding.index)]) + # df_noncoding = df_noncoding.sample(n = to_add) + # df = pd.concat([df_coding, df_noncoding]) + df = df.rename(columns={ + "Chromosome": "chrom", + "Start_Position": "coord", + "Reference_Allele": "ref", + "Tumor_Seq_Allele2": "alt", + "t_ref_count": "ref_counts", + "t_alt_count": "alt_counts" + }) + df = df[["chrom", "coord", "ref", "alt", "ref_counts", "alt_counts"]] + df["mutation_id"] = df.apply(lambda row: "{chrom}:{coord}:{ref}:{alt}".format(**row.to_dict()), axis=1) + df["sample_id"] = sample_id + return df + + +def position_segment_merge(positions, segments): + """ + Merge positions with segments that contain them + + Args: + positions (pandas.DataFrame): ['chrom', 'coord'] columns required + segments (pandas.DataFrame): ['chrom', 'start', 'end'] columns required + + Returns: + pandas.DataFrame: merged table with ['chrom', 'coord', 'start', 'end'] columns + + + Assuming a set of non-overlapping segments, merge a set of positions so that + each entry in the new table provides coord and containing segment start/end + """ + + positions = positions[['chrom', 'coord']].copy().sort_values(by=['chrom','coord']) + positions["chrom"] = positions["chrom"].astype(str) + positions["chrom"] = positions["chrom"].str.strip() + segments = segments[['chrom', 'start', 'end']].copy().sort_values(by=['chrom','start']) + segments["chrom"] = segments["chrom"].astype(str) + segments["chrom"] = segments["chrom"].str.strip() + + merged = positions.merge(segments, left_on='chrom', right_on='chrom', how="left")\ + .sort_values(by=['chrom', 'coord']) + + + merged['start'] = merged['start'].fillna(method='ffill') + merged['end'] = merged['end'].fillna(method='ffill') + + + merged = merged[(merged['coord'] >= merged['start']) & + (merged['coord'] <= merged['end'])] + + return merged + + +def merge_files(cnv_df, snv_df): + df = [] + for sample_id in cnv_df['sample_id'].unique(): + df_1 = snv_df[snv_df['sample_id'] == sample_id] + df_2 = cnv_df[cnv_df['sample_id'] == sample_id] + df_2["chrom"] = df_2["chrom"].astype(str).str.strip() + merged = position_segment_merge(df_1, df_2) + merged["chrom"] = merged["chrom"].astype(str).str.strip() + merged = pd.merge(merged, df_2, on=['chrom', 'start', 'end']) + merged = merged[['chrom', 'coord', 'major_cn', 'minor_cn', 'normal_cn']] + # df_1.to_csv("snv_df.tsv", sep="\t", index=False) + df_1["chrom"] = df_1["chrom"].astype(str).str.strip() + merged["chrom"] = merged["chrom"].astype(str).str.strip() + merged = pd.merge(merged, df_1, on=['chrom', 'coord']) + # merged.to_csv("merged.tsv", sep="\t", index=False) + merged = merged[['mutation_id', 'sample_id', 'ref_counts', 'alt_counts', 'normal_cn', 'major_cn', 'minor_cn']] + df.append(merged) + df = pd.concat(df) + df.drop_duplicates(subset=['mutation_id', 'sample_id'], 
inplace=True) + df['major_cn'], df['minor_cn'] = df['major_cn'].astype('Int64'), df['minor_cn'].astype('Int64') + return df + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("-ic", "--cnv-file", required=True) + + parser.add_argument("-is", "--snv-file", required=True) + + parser.add_argument("-id", "--sample-id", required=True) + + parser.add_argument("-o", "--out-file", required=True) + + parser.add_argument("-c", "--cnv-caller", default="battenberg", choices=["battenberg", "sequenza"]) + + parser.add_argument("-s", "--sex", default=None, choices=["M", "F"]) + + parser.add_argument("-t", "--tumour-content", default=1.0, type=float) + + parser.add_argument("-p", "--pyclone_tool", default = "pyclone-vi", choices = ["pyclone-vi", "pyclone"]) + + cli_args = parser.parse_args() + + main(cli_args) diff --git a/modules/pyclone_vi/1.0/src/build_pyclone_summary_file.py b/modules/pyclone_vi/1.0/src/build_pyclone_summary_file.py new file mode 100644 index 00000000..d48e9b56 --- /dev/null +++ b/modules/pyclone_vi/1.0/src/build_pyclone_summary_file.py @@ -0,0 +1,33 @@ +import pandas as pd + + +def main(args): + df = pd.read_csv(args.in_file, sep="\t") + + out_df = pd.merge( + df.groupby("cluster_id")["mutation_id"].nunique().reset_index(), + df[["cluster_id", "cellular_prevalence"]].drop_duplicates(), + on="cluster_id" + ) + + out_df.insert(0, "patient_id", args.patient_id) + + out_df = out_df.rename(columns={"mutation_id": "num_snvs"}) + + out_df.to_csv(args.out_file, index=False, sep="\t") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("-i", "--in-file", required=True) + + parser.add_argument("-o", "--out-file", required=True) + + parser.add_argument("-p", "--patient-id", required=True) + + cli_args = parser.parse_args() + + main(cli_args) diff --git a/modules/pyclone_vi/1.0/src/compute_expected_statistics.py b/modules/pyclone_vi/1.0/src/compute_expected_statistics.py new file mode 100644 index 00000000..a98a01a7 --- /dev/null +++ b/modules/pyclone_vi/1.0/src/compute_expected_statistics.py @@ -0,0 +1,126 @@ +from phyclone.map import get_map_node_ccfs +from phyclone.run import get_labels_table +from phyclone.tree import Tree + +import gzip +import networkx as nx +import numpy as np +import pandas as pd +import pickle + + +def main(args): + with gzip.GzipFile(args.in_file, "rb") as fh: + results = pickle.load(fh) + + out_df = [] + + for state in results["trace"][args.burnin:]: + phyclone_tree = Tree.from_dict(results["data"], state["tree"]) + + tree = get_tree_from_phyclone_tree(phyclone_tree) + + prevs = np.array([tree.nodes[n]["clonal_prev"][0] for n in tree.nodes]) + + prevs += 1e-6 + + prevs = prevs / np.sum(prevs) + + entropy = -np.sum(prevs * np.log2(prevs)) + + num_clones = len(prevs) + + labels = get_labels_table(results["data"], phyclone_tree, clusters=results["clusters"]) + + num_snvs = labels.groupby("cluster_id")["mutation_id"].nunique().values + + min_num_snvs = min(num_snvs) + + max_num_snvs = max(num_snvs) + + mean_num_snvs = np.mean(num_snvs) + + median_num_snvs = np.median(num_snvs) + + out_df.append({ + "entropy": entropy, + "num_clones": num_clones, + "min_num_snvs": min_num_snvs, + "max_num_snvs": max_num_snvs, + "mean_num_snvs": mean_num_snvs, + "median_num_snvs": median_num_snvs + }) + + out_df = pd.DataFrame(out_df) + + out_df = pd.DataFrame([out_df.mean()]) + + out_df.insert(0, "patient_id", args.patient_id) + + out_df.to_csv(args.out_file, 
index=False, sep="\t")
+
+
+def get_tree_from_phyclone_tree(tree):
+    """ Convert a PhyClone tree object to a graph for benchmarking.
+
+    Parameters
+    ----------
+    tree: (phyclone.tree.Tree)
+
+    Returns
+    -------
+    nx.DiGraph representing clone phylogeny with nodes "snvs", "cellular_prev" and "clonal_prev" set for each node
+    """
+    ccfs = get_map_node_ccfs(tree)
+
+    G = nx.DiGraph()
+
+    for n in tree.graph.nodes:
+        G.add_node(n)
+
+        G.nodes[n]["cellular_prev"] = ccfs[n]
+
+        G.nodes[n]["snvs"] = [x.name for x in tree.node_data[n]]
+
+    for u, v in tree.graph.edges:
+        G.add_edge(u, v)
+
+    roots = []
+
+    for n in G.nodes:
+        if G.in_degree(n) == 0:
+            roots.append(n)
+
+    for r in roots:
+        set_clonal_prev(G, r)
+
+    return G
+
+
+def set_clonal_prev(G, node):
+    clonal_prev = G.nodes[node]["cellular_prev"].copy()
+
+    for child in G.successors(node):
+        clonal_prev -= G.nodes[child]["cellular_prev"]
+
+        set_clonal_prev(G, child)
+
+    G.nodes[node]["clonal_prev"] = clonal_prev
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("-i", "--in-file", required=True)
+
+    parser.add_argument("-o", "--out-file", required=True)
+
+    parser.add_argument("-p", "--patient-id", required=True)
+
+    parser.add_argument("-b", "--burnin", default=0, type=int)
+
+    cli_args = parser.parse_args()
+
+    main(cli_args)
diff --git a/modules/pyclone_vi/1.0/src/merge_files.py b/modules/pyclone_vi/1.0/src/merge_files.py
new file mode 100644
index 00000000..b8e22b8e
--- /dev/null
+++ b/modules/pyclone_vi/1.0/src/merge_files.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+
+def main(args):
+    out_df = []
+
+    for file_name in args.in_files:
+        out_df.append(pd.read_csv(file_name, sep="\t"))
+
+    out_df = pd.concat(out_df)
+
+    out_df.to_csv(args.out_file, index=False, sep="\t")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("-i", "--in-files", nargs="+", required=True)
+
+    parser.add_argument("-o", "--out-file", required=True)
+
+    cli_args = parser.parse_args()
+
+    main(cli_args)
diff --git a/modules/pyclone_vi/1.0/src/subset_maf_for_pyclone.R b/modules/pyclone_vi/1.0/src/subset_maf_for_pyclone.R
new file mode 100644
index 00000000..b4d1ca55
--- /dev/null
+++ b/modules/pyclone_vi/1.0/src/subset_maf_for_pyclone.R
@@ -0,0 +1,56 @@
+library(GAMBLR)
+library(tidyverse)
+library(data.table)
+
+genome_build <- snakemake@wildcards[["genome_build"]]
+in_maf <- snakemake@input[["maf"]]
+outfile <- snakemake@output[["maf"]]
+
+# Load the input maf file for this sample
+maf <- fread_maf(in_maf)
+
+coding_types <- c(
+  "Frame_Shift_Del",
+  "Frame_Shift_Ins",
+  "In_Frame_Del",
+  "In_Frame_Ins",
+  "Missense_Mutation",
+  "Nonsense_Mutation",
+  "Nonstop_Mutation",
+  "Silent",
+  "Splice_Region",
+  "Splice_Site",
+  "Translation_Start_Site"
+)
+
+# Load the correct aSHM regions file for the current genome_build
+if (str_detect(genome_build, "grch37|hg19|hs37d5")){
+  ashm_regions <- grch37_ashm_regions[1:3]
+} else {
+  ashm_regions <- hg38_ashm_regions[1:3]
+}
+
+# Remove the chr prefix if necessary
+if (!str_detect(maf$Chromosome[1], "chr")){
+  ashm_regions <- mutate(ashm_regions,
+    chr_name = str_remove(chr_name, "chr"))
+}
+
+# Rename the columns and set the keys for foverlaps
+colnames(ashm_regions) <- c("Chromosome", "Start_Position", "End_Position")
+ashm_regions <- data.table(ashm_regions)
+setkey(ashm_regions, Chromosome, Start_Position, End_Position)
+
+# Subset the maf to aSHM regions using foverlaps
+subset_maf <- 
foverlaps(maf, ashm_regions) %>%
+  filter(!is.na(Start_Position) |
+    Variant_Classification %in% coding_types) %>%
+  select(-Start_Position, -End_Position) %>%
+  select(Start_Position = i.Start_Position,
+    End_Position = i.End_Position,
+    everything()) %>%
+  select(all_of(colnames(maf)))
+
+# Write to file
+write_tsv(subset_maf, outfile)
+
diff --git a/modules/pyclone_vi/CHANGELOG.md b/modules/pyclone_vi/CHANGELOG.md
new file mode 100644
index 00000000..34537789
--- /dev/null
+++ b/modules/pyclone_vi/CHANGELOG.md
@@ -0,0 +1,16 @@
+# Changelog
+
+All notable changes to the `pyclone_vi` module will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.0] - 2022-02-22
+
+This release was authored by Laura Hilton.
+See details about [PyClone](https://github.com/Roth-Lab/pyclone) and [PyClone-VI](https://github.com/Roth-Lab/pyclone-vi).
+The PhyClone GitHub repository is currently private but the tool is available on Conda. Several scripts were adapted from those written by Andrew Roth.
+
+This module runs either the original PyClone (meant for capture data) or PyClone-VI (newer, meant for genome data), followed by PhyClone to refine clusters and assign a phylogeny. It runs on patients with multiple time points and requires a time_point column in the input metadata. The original PyClone includes some plots in its outputs; PhyClone and PyClone-VI do not currently generate any plots, but this will be added as a feature in the near future.
+
+PyClone-VI seems to offer superior performance when given only coding and aSHM mutations, rather than all mutations in the genome. This is toggleable in the config with the `subset_maf` option.
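To make the expected PyClone-VI input concrete, here is a minimal sketch (Python with pandas; all variant and copy-number values below are hypothetical) of the per-SNV row that `build_input.py` assembles: the mutation ID is formatted as `chrom:coord:ref:alt`, allele counts come from the MAF, the major/minor copy numbers come from the overlapping Battenberg segment, and the normal copy number depends on sex and chromosome.

```python
# Minimal sketch of one PyClone-VI input row, mirroring build_input.py.
# All variant and copy-number values here are hypothetical examples.
import pandas as pd


def get_normal_cn(chrom, sex):
    # Male sex chromosomes are haploid; everything else is diploid.
    return 1 if (sex == "M" and chrom in ["X", "Y"]) else 2


def make_input_row(chrom, coord, ref, alt, ref_counts, alt_counts,
                   major_cn, minor_cn, sex="F", sample_id="tumour",
                   tumour_content=1.0):
    return {
        "mutation_id": "{}:{}:{}:{}".format(chrom, coord, ref, alt),
        "sample_id": sample_id,
        "ref_counts": ref_counts,
        "alt_counts": alt_counts,
        "normal_cn": get_normal_cn(chrom, sex),
        "major_cn": major_cn,
        "minor_cn": minor_cn,
        "tumour_content": tumour_content,
    }


# One hypothetical SNV falling in a segment with major_cn = 2, minor_cn = 1
df = pd.DataFrame([make_input_row("1", 1234567, "C", "T", 40, 25, 2, 1)])
print(df.to_csv(sep="\t", index=False))
```

As in `build_input.py`, rows with `major_cn == 0` are dropped and `major_cn >= minor_cn` is asserted before the table is written; for capture samples run through the original PyClone, the `alt_counts` column is renamed to `var_counts`.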
From 07989c1bdbfdc4aa84fe0aa87b41754622d7a291 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 22:21:03 -0700 Subject: [PATCH 05/14] Restore files messed up by git --- envs/phylowgs/fill_battenberg.yaml | 41 ++ envs/phylowgs/phylowgs_results.yaml | 204 +++++++ .../phylowgs/1.0/envs/bcftools-1.10.2.yaml | 1 + modules/phylowgs/1.0/envs/coreutils-8.31.yaml | 1 + modules/phylowgs/1.0/envs/phylowgs.yaml | 1 + .../phylowgs/1.0/envs/phylowgs_results.yaml | 1 + modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml | 1 + modules/phylowgs/1.0/etc/noncoding.txt | 8 + modules/phylowgs/1.0/schemas/base-1.0.yaml | 1 + .../phylowgs/1.0/schemas/time_point-1.0.yaml | 1 + .../1.0/src/process_phyloWGS_outputs.R | 536 ++++++++++++++++++ 11 files changed, 796 insertions(+) create mode 100644 envs/phylowgs/fill_battenberg.yaml create mode 100644 envs/phylowgs/phylowgs_results.yaml create mode 120000 modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml create mode 120000 modules/phylowgs/1.0/envs/coreutils-8.31.yaml create mode 120000 modules/phylowgs/1.0/envs/phylowgs.yaml create mode 120000 modules/phylowgs/1.0/envs/phylowgs_results.yaml create mode 120000 modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml create mode 100644 modules/phylowgs/1.0/etc/noncoding.txt create mode 120000 modules/phylowgs/1.0/schemas/base-1.0.yaml create mode 120000 modules/phylowgs/1.0/schemas/time_point-1.0.yaml create mode 100644 modules/phylowgs/1.0/src/process_phyloWGS_outputs.R diff --git a/envs/phylowgs/fill_battenberg.yaml b/envs/phylowgs/fill_battenberg.yaml new file mode 100644 index 00000000..52741e52 --- /dev/null +++ b/envs/phylowgs/fill_battenberg.yaml @@ -0,0 +1,41 @@ +name: fill_segments +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - ca-certificates=2020.12.5 + - certifi=2020.12.5 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libffi=3.3 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libgomp=9.3.0 + - liblapack=3.9.0 + - libopenblas=0.3.12 + - libstdcxx-ng=9.3.0 + - ncurses=6.2 + - numpy=1.19.4 + - openssl=1.1.1i + - pandas=1.2.0 + - pip=20.3.3 + - python=3.9.1 + - python-dateutil=2.8.1 + - python_abi=3.9 + - pytz=2020.5 + - readline=8.0 + - setuptools=49.6.0 + - simplejson=3.17.2 + - six=1.15.0 + - sqlite=3.34.0 + - tk=8.6.10 + - tzdata=2020e + - wheel=0.36.2 + - xz=5.2.5 + - zlib=1.2.11 +prefix: /home/dreval/miniconda3/envs/fill_segments diff --git a/envs/phylowgs/phylowgs_results.yaml b/envs/phylowgs/phylowgs_results.yaml new file mode 100644 index 00000000..f64126d7 --- /dev/null +++ b/envs/phylowgs/phylowgs_results.yaml @@ -0,0 +1,204 @@ +name: phylowgs_outputs +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.5.30 + - cairo=1.16.0 + - curl=7.77.0 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - 
krb5=1.19.1 + - ld_impl_linux-64=2.35.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.77.0 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.3 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1k + - pandoc=2.14.0.3 + - pango=1.48.6 + - pcre=8.45 + - pcre2=10.36 + - pixman=0.40.0 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.2.1 + - r-base=4.1.0 + - r-base64enc=0.1_3 + - r-blob=1.2.1 + - r-brio=1.1.2 + - r-broom=0.7.8 + - r-callr=3.7.0 + - r-cellranger=1.1.0 + - r-cli=3.0.0 + - r-clipr=0.7.1 + - r-colorspace=2.0_2 + - r-cpp11=0.3.1 + - r-crayon=1.4.1 + - r-curl=4.3.2 + - r-data.table=1.14.0 + - r-dbi=1.1.1 + - r-dbplyr=2.1.1 + - r-desc=1.3.0 + - r-diffobj=0.3.4 + - r-digest=0.6.27 + - r-dplyr=1.0.7 + - r-dtplyr=1.1.0 + - r-ellipsis=0.3.2 + - r-evaluate=0.14 + - r-fansi=0.4.2 + - r-farver=2.1.0 + - r-forcats=0.5.1 + - r-fs=1.5.0 + - r-gargle=1.1.0 + - r-generics=0.1.0 + - r-ggplot2=3.3.5 + - r-ggrepel=0.9.1 + - r-glue=1.4.2 + - r-googledrive=1.0.1 + - r-googlesheets4=0.3.0 + - r-gtable=0.3.0 + - r-haven=2.4.1 + - r-highr=0.9 + - r-hms=1.1.0 + - r-htmltools=0.5.1.1 + - r-httr=1.4.2 + - r-ids=1.0.1 + - r-isoband=0.2.4 + - r-jsonlite=1.7.2 + - r-knitr=1.33 + - r-labeling=0.4.2 + - r-lattice=0.20_44 + - r-lifecycle=1.0.0 + - r-lubridate=1.7.10 + - r-magrittr=2.0.1 + - r-markdown=1.1 + - r-mass=7.3_54 + - r-matrix=1.3_4 + - r-mgcv=1.8_36 + - r-mime=0.11 + - r-modelr=0.1.8 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-openssl=1.4.4 + - r-pillar=1.6.1 + - r-pkgconfig=2.0.3 + - r-pkgload=1.2.1 + - r-plyr=1.8.6 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.5.2 + - r-progress=1.2.2 + - r-ps=1.6.0 + - r-purrr=0.3.4 + - r-r6=2.5.0 + - r-rappdirs=0.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.6 + - r-readr=1.4.0 + - r-readxl=1.3.1 + - r-rematch=1.0.1 + - r-rematch2=2.1.2 + - r-reprex=2.0.0 + - r-reshape2=1.4.4 + - r-rjson=0.2.20 + - r-rlang=0.4.11 + - r-rmarkdown=2.9 + - r-rprojroot=2.0.2 + - r-rstudioapi=0.13 + - r-rvest=1.0.0 + - r-scales=1.1.1 + - r-selectr=0.4_2 + - r-stringi=1.6.2 + - r-stringr=1.4.0 + - r-sys=3.4 + - r-testthat=3.0.3 + - r-tibble=3.1.2 + - r-tidyr=1.1.3 + - r-tidyselect=1.1.1 + - r-tidyverse=1.3.1 + - r-tinytex=0.32 + - r-utf8=1.2.1 + - r-uuid=0.1_4 + - r-vctrs=0.3.8 + - r-viridislite=0.4.0 + - r-waldo=0.2.5 + - r-withr=2.4.2 + - r-xfun=0.24 + - r-xml2=1.3.2 + - r-yaml=2.2.1 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/phylowgs_outputs diff --git a/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml new file mode 120000 index 00000000..72959e7b --- /dev/null +++ b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff 
--git a/modules/phylowgs/1.0/envs/coreutils-8.31.yaml b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml
new file mode 120000
index 00000000..050452f7
--- /dev/null
+++ b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml
@@ -0,0 +1 @@
+../../../../envs/coreutils/coreutils-8.31.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/envs/phylowgs.yaml b/modules/phylowgs/1.0/envs/phylowgs.yaml
new file mode 120000
index 00000000..6e962c7f
--- /dev/null
+++ b/modules/phylowgs/1.0/envs/phylowgs.yaml
@@ -0,0 +1 @@
+../../../../envs/phylowgs/phylowgs.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/envs/phylowgs_results.yaml b/modules/phylowgs/1.0/envs/phylowgs_results.yaml
new file mode 120000
index 00000000..926ec438
--- /dev/null
+++ b/modules/phylowgs/1.0/envs/phylowgs_results.yaml
@@ -0,0 +1 @@
+../../../../envs/phylowgs/phylowgs_results.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml
new file mode 120000
index 00000000..829077c7
--- /dev/null
+++ b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml
@@ -0,0 +1 @@
+../../../../envs/vcf2maf/vcf2maf-1.6.18.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/etc/noncoding.txt b/modules/phylowgs/1.0/etc/noncoding.txt
new file mode 100644
index 00000000..258f27f0
--- /dev/null
+++ b/modules/phylowgs/1.0/etc/noncoding.txt
@@ -0,0 +1,8 @@
+Hugo_Symbol
+Silent
+RNA
+IGR
+Intron
+5'Flank
+3'Flank
+5'UTR
diff --git a/modules/phylowgs/1.0/schemas/base-1.0.yaml b/modules/phylowgs/1.0/schemas/base-1.0.yaml
new file mode 120000
index 00000000..0a69d1ce
--- /dev/null
+++ b/modules/phylowgs/1.0/schemas/base-1.0.yaml
@@ -0,0 +1 @@
+../../../../schemas/base/base-1.0.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/schemas/time_point-1.0.yaml b/modules/phylowgs/1.0/schemas/time_point-1.0.yaml
new file mode 120000
index 00000000..c163d396
--- /dev/null
+++ b/modules/phylowgs/1.0/schemas/time_point-1.0.yaml
@@ -0,0 +1 @@
+../../../../schemas/time_point/time_point-1.0.yaml
\ No newline at end of file
diff --git a/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R
new file mode 100644
index 00000000..587cdb05
--- /dev/null
+++ b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R
@@ -0,0 +1,536 @@
+
+
+#'
+#' Pipeline for processing phyloWGS outputs: takes the output JSON files and the preprocessing output files.
+#' The SAMPLE_ID.mutass.zip file must be unzipped before running the script.
+
+# Example: how to run
+#mkdir -p output
+#Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unziped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out
+
+##################################################
+# load required libraries
+##################################################
+
+# library("optparse")
+library("rjson")
+library("tidyverse")
+library("ggrepel")
+library("data.table")
+
+
+##########################
+#### Snakemake Input #####
+##########################
+
+samplename = snakemake@wildcards[["patient_id"]]
+json_file = snakemake@input[["summ"]]
+trees_out= snakemake@input[["mutass"]]
+ssm_file = snakemake@input[["ssms"]]
+cnv_file = snakemake@input[["cnvs"]]
+mafs = unlist(strsplit(snakemake@params[["maf_list"]], ","))
+mut_file = snakemake@input[["muts"]]
+driver_genes = snakemake@params[["drivers"]]
+sample_order = 
unlist(strsplit(snakemake@params[["sample_order"]], ",")) +genome_build = snakemake@wildcards[["genome_build"]] + +# Define the chr_prefix parameter based on the genome_build +chr_prefixed = str_detect(genome_build, "hg") + + +# option_list = list( +# make_option(c("-n", "--samplename"), type="character", default=NULL, help="Samplename of the sample to run", metavar="character"), +# make_option(c("-j", "--json_summ"), type="character", default=NULL, help="SAMPLE_ID.summ.json file generated by phyloWGS", metavar="character"), +# make_option(c("-t", "--trees_out"), type="character", default=NULL, help="Directory containing unzipped XX.mutass.zip trees", metavar="character"), +# make_option(c("-s", "--ssm"), type="character", default=NULL, help="Preprocessing ssm_data.txt output file", metavar="character"), +# make_option(c("-c", "--copynumber"), type="character", default=NULL, help="Preprocessing cnv_data.txt output file", metavar="character"), +# make_option(c("-a", "--tumourA_maf"), type="character", default=NULL, help="Agument maf file of tumour A", metavar="character"), +# make_option(c("-b", "--tumourB_maf"), type="character", default=NULL, help="Agument maf file of tumour B", metavar="character"), +# make_option(c("-m", "--json_muts"), type="character", default=NULL, help="SAMPLE_ID.muts.json file", metavar="character"), +# make_option(c("-o", "--output"), type="character", default=NULL, help="Output directory", metavar="character") +# ) +# +# opt_parser = OptionParser(option_list=option_list) +# opt = parse_args(opt_parser) +# +# samplename = opt$samplename +# json_file = opt$json_summ +# trees_out= opt$trees_out ### directory where unziped SAMPLE_ID.mutass.zip trees are +# ssm_file = opt$ssm +# cnv_file = opt$copynumber +# mafA = opt$tumourA_maf +# mafB = opt$tumourB_maf +# mut_file = opt$json_muts +# output_dir = opt$output +# +# +# +# .checkfile = function(infile) { +# +# if (!file.exists(infile)) { +# +# stop(paste("File", infile, "does not exist", sep="")) +# +# } +# +# } +# +# +# .checkfile(json_file) +# .checkfile(ssm_file) +# .checkfile(cnv_file) +# .checkfile(mafA) +# .checkfile(mafB) +# .checkfile(mut_file) + +################################################## +# Process input files +################################################### +# Parse the input file and obtain the required data for this run +result1 <- fromJSON(file = json_file) +result_mut<-fromJSON(file = mut_file) +ssm_pre<-read.table(file = ssm_file, header = TRUE) +cnv_pre<-read.delim(file = cnv_file, header = TRUE)[,c("cnv","a","d")] + + + +################################################## +# define output files +################################################## +out_json_to_Rtable= snakemake@output[["tree_summary"]] +ssm_to_trees= snakemake@output[["maf"]] +cnv_to_trees= snakemake@output[["cnvs"]] +cellular_prevalence_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cellular_prevalence.pdf")) +CCF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cancer_cell_fraction_.pdf")) +VAF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_.pdf")) +VAF_coding_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_coding.pdf")) +tree_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_tree.pdf")) +CCF_table = snakemake@output[["CCF"]] + +if(!dir.exists(snakemake@output[["plots"]])){dir.create(snakemake@output[["plots"]])} + +# out_json_to_Rtable= file.path(output_dir, paste("out_res_",samplename,"_json_converted_toR.table", sep = "")) +# 
ssm_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_ssms_to_best_tree_maf_format.table", sep = ""))
+# cnv_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_cnvs_to_best_tree_maf_format.table", sep = ""))
+# cellular_prevalence_plot= file.path(output_dir, paste("cellular_prevalence_",samplename,".pdf", sep = ""))
+# CCF_plot= file.path(output_dir, paste("cancer_cell_fraction_",samplename,".pdf", sep = ""))
+# VAF_plot= file.path(output_dir, paste("vaf_",samplename,".pdf", sep = ""))
+# VAF_coding_plot= file.path(output_dir, paste("vaf_coding",samplename,".pdf", sep = ""))
+
+
+###################################################
+# open summ.json file and convert it into human-readable format
+###################################################
+
+# this function opens SAMPLE_ID.summ.json and converts it into an R table
+open_tree = function(json_summ_file,out_json_to_Rtable){
+
+  out_res<-NULL
+  for (j in 1:length(json_summ_file[["trees"]])){
+
+    tree_focal<-json_summ_file[["trees"]][j]
+    tree_focal_statA<-as.data.frame(t(unlist(sapply(tree_focal,function(x)x[c("clustering_index","branching_index","llh","linearity_index")]))))
+    colnames(tree_focal_statA)<-c("clustering_index","branching_index","llh","linearity_index")
+    tree_focal_statA$tree_id<-j-1
+    rownames(tree_focal_statA)<-NULL
+
+
+    tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,-c(3,6,9,12,15,18,21,24,27,30)]
+    #tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))]
+    colnames(tree_focal_statB)<-sub("^[^.]*.", "", colnames(tree_focal_statB))
+    stat_both<-cbind(tree_focal_statA,tree_focal_statB)
+    out_res<-bind_rows(stat_both,out_res)
+    out_res_ordered<-out_res[order(out_res$tree_id),]
+  } # for j loop
+
+  density<-json_summ_file["tree_densities"]
+  density_unlist<-data.frame("density"=unlist(density))
+  row.names(density_unlist)<-sub("^[^.]*.", "", row.names(density_unlist))
+
+  density_unlist$tree_id<-row.names(density_unlist)
+  row.names(density_unlist)<-NULL
+
+  final_table=merge(out_res_ordered,density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to the all-trees table
+  write.table(final_table, file =out_json_to_Rtable ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE)
+  return(final_table)
+}
+
+
+result_tree<-open_tree(result1,out_json_to_Rtable)
+
+
+###################################################
+# extract the best tree
+###################################################
+# the best tree is the tree with the highest density
+
+best_tree_id = function(R_table, density) {
+  best=R_table[which.max(R_table$density),]
+  best_tree_id<-paste(best$tree_id,"json",sep = ".")
+  return(best_tree_id)
+}
+best_tree_fileID<-best_tree_id(result_tree, density)
+
+
+#######################################################################
+# extract the stats (SNVs and CNVs assigned to each population) from the best tree
+#######################################################################
+
+open_best_tree = function(trees_out,best_tree_id){
+  unzip(trees_out, files = best_tree_id, exdir = dirname(trees_out), overwrite = TRUE)
+  best_tree_path = paste0(dirname(trees_out), "/", best_tree_id)
+  rr <- fromJSON(file = best_tree_path)
+  return(rr)
+}
+rr= open_best_tree(trees_out,best_tree_fileID)
+
+
+#######################################################################
+# annotate point mutations and CNVs 
in the best tree +####################################################################### +best_focal<-result1[["trees"]][as.numeric(gsub(".json","",best_tree_fileID))+1] +tree_structure<-as.data.frame(sapply(best_focal,function(x)x["structure"])) ##[6] +tree_roots <- best_focal[[1]]$structure$`0` + + +merge_both<-function(result1,best_tree_fileID,tree_structure){ + best_tree<-as.numeric(gsub(".json","",best_tree_fileID)) + best_focal<-result1[["trees"]][best_tree+1] + tree_focal_statB<-as.data.frame(sapply(best_focal,function(x)x["populations"])) ##[3] + qq<-tree_focal_statB[,grep("cellular_prevalence",colnames(tree_focal_statB))] %>% + rownames_to_column("sample") %>% + pivot_longer(-sample, + names_to = "population", + values_to = "cellular_prevalence") %>% + mutate(population = str_remove(str_remove(population, ".*populations[.]"), "[.]cellular_prevalence")) %>% + mutate(is_root = ifelse(population %in% tree_roots, TRUE, FALSE)) %>% + group_by(sample) %>% + mutate(purity = sum(cellular_prevalence[is_root]), + CCF = cellular_prevalence / purity) + + return(qq) + +} + +both_samples<-merge_both(result1,best_tree_fileID,tree_structure) + + +write_tsv(both_samples, CCF_table) + + +ssm = function(stat_best_tree, ssm_pre,ssm_to_trees,tree_structure, maf_list){ + + out_res_ssm<-NULL + for ( i in 1:length(stat_best_tree$mut_assignments)){ + + focal<-(stat_best_tree$mut_assignments)[i] + + focal_ssms<-data.frame(sapply(focal, function(x) x[1])) + + colnames(focal_ssms)<-sub("^[^.]*.", "", colnames(focal_ssms)) + focal_ssms$phyloWGS_population<-i + ssm_assign<-merge(ssm_pre,focal_ssms, by.x = "id", by.y = "ssms")[,c("id", "gene","phyloWGS_population")] + ssm_assign_spi<-separate(ssm_assign, col = gene, into = c("Chromosome","Start_Position"), sep = "_", convert = FALSE) %>% + mutate(Start_Position = as.numeric(Start_Position)) + if(chr_prefixed) { + ssm_assign_spi$Chromosome = str_c("chr", as.character(ssm_assign_spi$Chromosome)) + } + + + out_res_ssm<-rbind(ssm_assign_spi,out_res_ssm) + + } ## i loop + + ssm_assign_with_maf <- lapply(maf_list, function(x){ + maf <- read_tsv(x, + col_types = cols(Chromosome = col_character())) %>% + # PhyloWGS changes the start position of deletions. This makes the maf start position match that in the PhyloWGS SSM table. 
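+      # Worked example (hypothetical coordinates, added for illustration): a DEL the
+      # MAF records at Start_Position 1001 is keyed as chr1_1000 in the SSM table, so
+      # the mutate() below shifts the MAF back by 1 for the join and a later step restores it.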
+ mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position - 1, Start_Position)) + maf <- out_res_ssm %>% + left_join(maf, by = c("Chromosome", "Start_Position")) %>% + # Restore the true MAF start postion after the hack above + mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position + 1, Start_Position)) %>% + select(colnames(maf), everything()) + }) + out_res_ssm <- rbindlist(ssm_assign_with_maf) %>% + mutate(clonal_status = case_when( + phyloWGS_population %in% tree_roots & length(tree_roots) > 1 ~ "polyclonal", + phyloWGS_population %in% tree_roots & length(tree_roots) == 1 ~ "clonal", + TRUE ~ "subclonal" + )) + + return(out_res_ssm) + + +} + +ss<-ssm(rr,ssm_pre,ssm_to_trees,tree_structure, mafs) + +write_tsv(ss, ssm_to_trees, na = "") + + +########################################################### +## load mut file to extrcat CNVs start and end positions +########################################################### + +cnv = function(stat_best_tree, cnv_pre,mutation_file,cnv_to_trees){ + out_res_cnv <- + bind_rows(lapply(1:length(stat_best_tree$mut_assignments), function(x) + data.frame(cnvs = stat_best_tree$mut_assignments[[x]]$cnvs) %>% mutate(phyloWGS_population = x))) + + + + #return(out_res_cnv) + out_res_mut<-NULL + for (cn in 1:length(result_mut$cnvs)){ + focal_mut_cnv<-(result_mut$cnvs)[cn] + + focal_mut<-data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1,] + colnames(focal_mut)<-sub("^[^.]*.", "", colnames(focal_mut)) + focal_mut$cnv_id<-names(focal_mut_cnv) + out_res_mut<-bind_rows(focal_mut,out_res_mut) + } ## cn loop + + both_cnvs<-merge(out_res_cnv, out_res_mut, by.x = "cnvs",by.y = "cnv_id") %>% + select(cnvs, phyloWGS_population, physical_cnvs.chrom, + physical_cnvs.start, physical_cnvs.end, + physical_cnvs.major_cn, physical_cnvs.minor_cn, physical_cnvs.cell_prev) + + +} + + +cnv<-cnv(rr, cnv_pre,result_mut,cnv_to_trees) +write.table(cnv, file =cnv_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) + +##################### plot the results #################### +########################################################### +#### Slope chart the best tree, cellular prevalence ####### + +plot_cp<-function(both_samples,cellular_prevalence_plot){ +pdf(cellular_prevalence_plot, width = 8, height =8 ) +plotA<-ggplot(data = both_samples, aes(x = sample, y = cellular_prevalence, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + ylab("Cellular prevalence")+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotA) +dev.off() +} + +plot_cp(both_samples,cellular_prevalence_plot) + +########################################################### +#### Slope chart the best tree, CCF ####### + +plot_cp<-function(both_samples,CCF_plot){ +pdf(CCF_plot, width = 8, height =8 ) +plotB<-ggplot(data = both_samples[both_samples$population != 0, ], aes(x = sample, y = CCF, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree",gsub(".json", 
"",best_tree_fileID), sep = " "))+ + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + ylab("CCF")+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotB) +dev.off() +} + +plot_cp(both_samples,CCF_plot) + + +############################################# +##### Slope chart the best tree (VAF) ####### + + +plot_vaf<-function(ss,VAF_plot){ + pdf(VAF_plot, width = 8, height =8 ) + plotC <- ss %>% + select(Hugo_Symbol, Chromosome, Tumor_Sample_Barcode, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate(VAF = t_alt_count/t_depth, + populations = as.factor(populations)) %>% + filter(!is.na(Tumor_Sample_Barcode)) %>% + ggplot(aes(x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations)) + + geom_line(aes(color = populations), size=0.2, alpha=0.4)+ + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + xlab("Sample") + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3)))+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotC) +dev.off() +} + + + +plot_vaf(ss,VAF_plot) + +############################################################# +##### Slope chart the best tree (VAF, coding regions, nonsense, missense and splicing sites) ####### + +drivers <- read_tsv(driver_genes, col_names = "gene") %>% pull(gene) + +plot_vaf_coding<-function(maf,VAF_coding_plot){ + + coding <- ss %>% + select(Hugo_Symbol, HGVSp_Short, Chromosome, Tumor_Sample_Barcode, Variant_Classification, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate(VAF = t_alt_count/t_depth, + populations = as.factor(populations), + Tumor_Sample_Barcode = factor(Tumor_Sample_Barcode, levels = sample_order)) %>% + filter(!is.na(Tumor_Sample_Barcode), + !Variant_Classification %in% c("Silent", "RNA", "IGR", "Intron", "5'Flank", "3'Flank", "5'UTR")) %>% + mutate(label = ifelse(!is.na(HGVSp_Short), str_c(Hugo_Symbol, "_", HGVSp_Short), str_c(Hugo_Symbol, "_", Variant_Classification))) +pdf(VAF_coding_plot, width = 8, height =8 ) +plotD<-coding %>% + ggplot(aes(x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations)) + + geom_line(aes(color = populations), size=0.5, alpha=0.4)+ + geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[1], Hugo_Symbol %in% drivers), + aes(label = label, + x = Tumor_Sample_Barcode, + y = VAF), + nudge_x = -0.2, + size = 4 + ) + + geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[length(sample_order)], Hugo_Symbol %in% drivers), + aes(label = label, + x = 
Tumor_Sample_Barcode, + y = VAF), + nudge_x = 0.2, + size = 4 + ) + + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + xlab("Sample") + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3)))+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) + +print(plotD) +dev.off() +} + + +plot_vaf_coding(ss,VAF_coding_plot) + +############################################# +##### Draw the best tree ####### +############################################# + + +tree_structure_long <- tree_structure %>% + pivot_longer(everything(), + names_to = "parent", + values_to = "node") %>% + mutate(parent = str_remove_all(parent, ".*[.]")) %>% + distinct() + + +positions_x <- function(parents){ + x <- 1:length(unique(parents)) + names(x) <- unique(parents) + col_vals <- unname(x[parents]) + return(col_vals) +} + +tree_structure_long$x <- positions_x(tree_structure_long$parent) + +positions_y <- function(tree_df){ + y = c("0" = 0.5) + for(parent in unique(tree_df$parent)){ + # parent = "1" + child_index = 1 + num_children <- nrow(tree_df[tree_df$parent == parent,]) + if(num_children == 1){ + child <- tree_df[tree_df$parent == parent,]$node + child_y <- unname(y[parent]) + names(child_y) <- child + y = c(y, child_y) + + } else { + children <- tree_df[tree_df$parent == parent,]$node + y_max <- unname(y[parent]) + (0.25 / child_index) + y_min <- unname(y[parent]) - (0.25 / child_index) + y_range <- seq(y_min, y_max, length.out = length(children)) + names(y_range) <- children + y = c(y, y_range) + } + child_index = child_index + 1 + } + return(y) +} + +tree_structure_long$y <- unname(positions_y(tree_structure_long)[as.character(tree_structure_long$node)]) + +tree_structure_long <- add_row(tree_structure_long, parent = "0", node = 0, x = 0, y = 0.5) + +get_ssms <- function(tree_df, best_focal, best_tree_fileID){ + data <- best_focal[[str_remove_all(best_tree_fileID, "[.].*")]]$populations + ssm_vec <- c() + for(node in tree_df$node){ + # node = "1" + num_ssms <- data[[as.character(node)]]$num_ssms + names(num_ssms) <- as.character(node) + ssm_vec <- c(ssm_vec, num_ssms) + } + return(ssm_vec) +} + +tree_structure_long$num_ssms <- get_ssms(tree_structure_long, best_focal, best_tree_fileID)[as.character(tree_structure_long$node)] + +tree_structure_long <- tree_structure_long %>% + mutate(parent = as.numeric(parent)) %>% + left_join(select(tree_structure_long, node, xstart = x, ystart = y), + by = c("parent" = "node")) + + + +ggplot(tree_structure_long, + aes(x = x, + y = y, + label = node)) + + geom_segment(inherit.aes = FALSE, + aes(x = xstart, + xend = x, + y = ystart, + yend = y)) + + geom_point(aes(size = num_ssms), + fill = "white", + colour = "black", + pch = 21) + + geom_text() + + scale_size(range = c(5,20)) + + ylim(0,1) + + theme_void() + + ggtitle(samplename) + + theme(legend.position = "none") + +ggsave(tree_plot, height = 6, width = 6) + +############ +##### END ## +############ + + + + From ada0c037c1c4d7edfc5267ea4789f47294c4c411 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 22:21:03 -0700 Subject: [PATCH 06/14] Restore files messed up by git --- envs/phylowgs/fill_battenberg.yaml | 
41 ++ envs/phylowgs/phylowgs.yaml | 82 +++ envs/phylowgs/phylowgs_results.yaml | 204 +++++++ .../phylowgs/1.0/envs/bcftools-1.10.2.yaml | 1 + modules/phylowgs/1.0/envs/coreutils-8.31.yaml | 1 + modules/phylowgs/1.0/envs/phylowgs.yaml | 1 + .../phylowgs/1.0/envs/phylowgs_results.yaml | 1 + modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml | 1 + modules/phylowgs/1.0/etc/noncoding.txt | 8 + modules/phylowgs/1.0/schemas/base-1.0.yaml | 1 + .../phylowgs/1.0/schemas/time_point-1.0.yaml | 1 + .../1.0/src/process_phyloWGS_outputs.R | 536 ++++++++++++++++++ 12 files changed, 878 insertions(+) create mode 100644 envs/phylowgs/fill_battenberg.yaml create mode 100644 envs/phylowgs/phylowgs.yaml create mode 100644 envs/phylowgs/phylowgs_results.yaml create mode 120000 modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml create mode 120000 modules/phylowgs/1.0/envs/coreutils-8.31.yaml create mode 120000 modules/phylowgs/1.0/envs/phylowgs.yaml create mode 120000 modules/phylowgs/1.0/envs/phylowgs_results.yaml create mode 120000 modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml create mode 100644 modules/phylowgs/1.0/etc/noncoding.txt create mode 120000 modules/phylowgs/1.0/schemas/base-1.0.yaml create mode 120000 modules/phylowgs/1.0/schemas/time_point-1.0.yaml create mode 100644 modules/phylowgs/1.0/src/process_phyloWGS_outputs.R diff --git a/envs/phylowgs/fill_battenberg.yaml b/envs/phylowgs/fill_battenberg.yaml new file mode 100644 index 00000000..52741e52 --- /dev/null +++ b/envs/phylowgs/fill_battenberg.yaml @@ -0,0 +1,41 @@ +name: fill_segments +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - ca-certificates=2020.12.5 + - certifi=2020.12.5 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libffi=3.3 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libgomp=9.3.0 + - liblapack=3.9.0 + - libopenblas=0.3.12 + - libstdcxx-ng=9.3.0 + - ncurses=6.2 + - numpy=1.19.4 + - openssl=1.1.1i + - pandas=1.2.0 + - pip=20.3.3 + - python=3.9.1 + - python-dateutil=2.8.1 + - python_abi=3.9 + - pytz=2020.5 + - readline=8.0 + - setuptools=49.6.0 + - simplejson=3.17.2 + - six=1.15.0 + - sqlite=3.34.0 + - tk=8.6.10 + - tzdata=2020e + - wheel=0.36.2 + - xz=5.2.5 + - zlib=1.2.11 +prefix: /home/dreval/miniconda3/envs/fill_segments diff --git a/envs/phylowgs/phylowgs.yaml b/envs/phylowgs/phylowgs.yaml new file mode 100644 index 00000000..dc531881 --- /dev/null +++ b/envs/phylowgs/phylowgs.yaml @@ -0,0 +1,82 @@ +name: phylo +channels: + - conda-forge + - defaults + - bioconda + - r +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - blas=1.1 + - ca-certificates=2020.12.5 + - certifi=2019.11.28 + - dbus=1.13.6 + - expat=2.2.10 + - fontconfig=2.13.1 + - freetype=2.10.4 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - glib=2.58.3 + - gsl=2.6 + - gst-plugins-base=1.14.5 + - gstreamer=1.14.5 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - icu=64.2 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libffi=3.2.1 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran=3.0.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libpng=1.6.37 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libuuid=2.32.1 + - libxcb=1.13 + - libxml2=2.9.10 
+ - libxslt=1.1.33 + - lxml=3.8.0 + - mysql-connector-c=6.1.11 + - mysql-python=1.2.5 + - ncurses=6.2 + - numpy=1.15.4 + - openblas=0.3.3 + - openssl=1.1.1j + - pcre=8.44 + - phylowgs=20181105 + - pip=20.1.1 + - pthread-stubs=0.4 + - pyqt=5.9.2 + - python=2.7.15 + - python_abi=2.7 + - pyvcf=0.6.8 + - qt=5.9.7 + - readline=8.0 + - setuptools=44.0.0 + - sip=4.19.8 + - sqlite=3.34.0 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - wheel=0.36.2 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - ete2==2.3.10 + - scipy==0.16.0 +prefix: /home/ssoudi/miniconda3/envs/phylo diff --git a/envs/phylowgs/phylowgs_results.yaml b/envs/phylowgs/phylowgs_results.yaml new file mode 100644 index 00000000..f64126d7 --- /dev/null +++ b/envs/phylowgs/phylowgs_results.yaml @@ -0,0 +1,204 @@ +name: phylowgs_outputs +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.5.30 + - cairo=1.16.0 + - curl=7.77.0 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.19.1 + - ld_impl_linux-64=2.35.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.77.0 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.3 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1k + - pandoc=2.14.0.3 + - pango=1.48.6 + - pcre=8.45 + - pcre2=10.36 + - pixman=0.40.0 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.2.1 + - r-base=4.1.0 + - r-base64enc=0.1_3 + - r-blob=1.2.1 + - r-brio=1.1.2 + - r-broom=0.7.8 + - r-callr=3.7.0 + - r-cellranger=1.1.0 + - r-cli=3.0.0 + - r-clipr=0.7.1 + - r-colorspace=2.0_2 + - r-cpp11=0.3.1 + - r-crayon=1.4.1 + - r-curl=4.3.2 + - r-data.table=1.14.0 + - r-dbi=1.1.1 + - r-dbplyr=2.1.1 + - r-desc=1.3.0 + - r-diffobj=0.3.4 + - r-digest=0.6.27 + - r-dplyr=1.0.7 + - r-dtplyr=1.1.0 + - r-ellipsis=0.3.2 + - r-evaluate=0.14 + - r-fansi=0.4.2 + - r-farver=2.1.0 + - r-forcats=0.5.1 + - r-fs=1.5.0 + - r-gargle=1.1.0 + - r-generics=0.1.0 + - r-ggplot2=3.3.5 + - r-ggrepel=0.9.1 + - r-glue=1.4.2 + - r-googledrive=1.0.1 + - r-googlesheets4=0.3.0 + - r-gtable=0.3.0 + - r-haven=2.4.1 + - r-highr=0.9 + - r-hms=1.1.0 + - r-htmltools=0.5.1.1 + - r-httr=1.4.2 + - r-ids=1.0.1 + - r-isoband=0.2.4 + - r-jsonlite=1.7.2 + - r-knitr=1.33 + - r-labeling=0.4.2 + - r-lattice=0.20_44 + - r-lifecycle=1.0.0 + - r-lubridate=1.7.10 + - r-magrittr=2.0.1 + - r-markdown=1.1 + - r-mass=7.3_54 + - r-matrix=1.3_4 + - r-mgcv=1.8_36 + - r-mime=0.11 + - 
r-modelr=0.1.8 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-openssl=1.4.4 + - r-pillar=1.6.1 + - r-pkgconfig=2.0.3 + - r-pkgload=1.2.1 + - r-plyr=1.8.6 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.5.2 + - r-progress=1.2.2 + - r-ps=1.6.0 + - r-purrr=0.3.4 + - r-r6=2.5.0 + - r-rappdirs=0.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.6 + - r-readr=1.4.0 + - r-readxl=1.3.1 + - r-rematch=1.0.1 + - r-rematch2=2.1.2 + - r-reprex=2.0.0 + - r-reshape2=1.4.4 + - r-rjson=0.2.20 + - r-rlang=0.4.11 + - r-rmarkdown=2.9 + - r-rprojroot=2.0.2 + - r-rstudioapi=0.13 + - r-rvest=1.0.0 + - r-scales=1.1.1 + - r-selectr=0.4_2 + - r-stringi=1.6.2 + - r-stringr=1.4.0 + - r-sys=3.4 + - r-testthat=3.0.3 + - r-tibble=3.1.2 + - r-tidyr=1.1.3 + - r-tidyselect=1.1.1 + - r-tidyverse=1.3.1 + - r-tinytex=0.32 + - r-utf8=1.2.1 + - r-uuid=0.1_4 + - r-vctrs=0.3.8 + - r-viridislite=0.4.0 + - r-waldo=0.2.5 + - r-withr=2.4.2 + - r-xfun=0.24 + - r-xml2=1.3.2 + - r-yaml=2.2.1 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/phylowgs_outputs diff --git a/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml new file mode 120000 index 00000000..72959e7b --- /dev/null +++ b/modules/phylowgs/1.0/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/coreutils-8.31.yaml b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml new file mode 120000 index 00000000..050452f7 --- /dev/null +++ b/modules/phylowgs/1.0/envs/coreutils-8.31.yaml @@ -0,0 +1 @@ +../../../../envs/coreutils/coreutils-8.31.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/phylowgs.yaml b/modules/phylowgs/1.0/envs/phylowgs.yaml new file mode 120000 index 00000000..6e962c7f --- /dev/null +++ b/modules/phylowgs/1.0/envs/phylowgs.yaml @@ -0,0 +1 @@ +../../../../envs/phylowgs/phylowgs.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/phylowgs_results.yaml b/modules/phylowgs/1.0/envs/phylowgs_results.yaml new file mode 120000 index 00000000..926ec438 --- /dev/null +++ b/modules/phylowgs/1.0/envs/phylowgs_results.yaml @@ -0,0 +1 @@ +../../../../envs/phylowgs/phylowgs_results.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml new file mode 120000 index 00000000..829077c7 --- /dev/null +++ b/modules/phylowgs/1.0/envs/vcf2maf-1.6.18.yaml @@ -0,0 +1 @@ +../../../../envs/vcf2maf/vcf2maf-1.6.18.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/etc/noncoding.txt b/modules/phylowgs/1.0/etc/noncoding.txt new file mode 100644 index 00000000..258f27f0 --- /dev/null +++ b/modules/phylowgs/1.0/etc/noncoding.txt @@ -0,0 +1,8 @@ +Hugo_Symbol +Silent +RNA +IGR +Intron +5'Flank +3'Flank +5'UTR diff --git a/modules/phylowgs/1.0/schemas/base-1.0.yaml b/modules/phylowgs/1.0/schemas/base-1.0.yaml new file mode 120000 index 00000000..0a69d1ce --- /dev/null +++ b/modules/phylowgs/1.0/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/schemas/time_point-1.0.yaml 
b/modules/phylowgs/1.0/schemas/time_point-1.0.yaml new file mode 120000 index 00000000..c163d396 --- /dev/null +++ b/modules/phylowgs/1.0/schemas/time_point-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/time_point/time_point-1.0.yaml \ No newline at end of file diff --git a/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R new file mode 100644 index 00000000..587cdb05 --- /dev/null +++ b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R @@ -0,0 +1,536 @@ + + +#' +#' processing phyloWGS outputs pipeline that takes output json files and preprocessing output files +#' SAMPLE_ID.mutass.zip file must be unzipped before runing the script + +#E example: how to run +#mkdir -p output +#Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unziped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out + +################################################## +# load required libraries +################################################## + +# library("optparse") +library("rjson") +library("tidyverse") +library("ggrepel") +library("data.table") + + +########################## +#### Snakemake Input ##### +########################## + +samplename = snakemake@wildcards[["patient_id"]] +json_file = snakemake@input[["summ"]] +trees_out= snakemake@input[["mutass"]] +ssm_file = snakemake@input[["ssms"]] +cnv_file = snakemake@input[["cnvs"]] +mafs = unlist(strsplit(snakemake@params[["maf_list"]], ",")) +mut_file = snakemake@input[["muts"]] +driver_genes = snakemake@params[["drivers"]] +sample_order = unlist(strsplit(snakemake@params[["sample_order"]], ",")) +genome_build = snakemake@wildcards[["genome_build"]] + +# Define the chr_prefix parameter based on the genome_build +chr_prefixed = str_detect(genome_build, "hg") + + +# option_list = list( +# make_option(c("-n", "--samplename"), type="character", default=NULL, help="Samplename of the sample to run", metavar="character"), +# make_option(c("-j", "--json_summ"), type="character", default=NULL, help="SAMPLE_ID.summ.json file generated by phyloWGS", metavar="character"), +# make_option(c("-t", "--trees_out"), type="character", default=NULL, help="Directory containing unzipped XX.mutass.zip trees", metavar="character"), +# make_option(c("-s", "--ssm"), type="character", default=NULL, help="Preprocessing ssm_data.txt output file", metavar="character"), +# make_option(c("-c", "--copynumber"), type="character", default=NULL, help="Preprocessing cnv_data.txt output file", metavar="character"), +# make_option(c("-a", "--tumourA_maf"), type="character", default=NULL, help="Agument maf file of tumour A", metavar="character"), +# make_option(c("-b", "--tumourB_maf"), type="character", default=NULL, help="Agument maf file of tumour B", metavar="character"), +# make_option(c("-m", "--json_muts"), type="character", default=NULL, help="SAMPLE_ID.muts.json file", metavar="character"), +# make_option(c("-o", "--output"), type="character", default=NULL, help="Output directory", metavar="character") +# ) +# +# opt_parser = OptionParser(option_list=option_list) +# opt = parse_args(opt_parser) +# +# samplename = opt$samplename +# json_file = opt$json_summ +# trees_out= opt$trees_out ### directory where unziped SAMPLE_ID.mutass.zip trees are +# ssm_file = opt$ssm +# cnv_file = opt$copynumber +# mafA = opt$tumourA_maf +# mafB = opt$tumourB_maf +# mut_file = opt$json_muts +# 
output_dir = opt$output +# +# +# +# .checkfile = function(infile) { +# +# if (!file.exists(infile)) { +# +# stop(paste("File", infile, "does not exist", sep="")) +# +# } +# +# } +# +# +# .checkfile(json_file) +# .checkfile(ssm_file) +# .checkfile(cnv_file) +# .checkfile(mafA) +# .checkfile(mafB) +# .checkfile(mut_file) + +################################################## +# Process input files +################################################### +# Parse the input file and obtain the required data for this run +result1 <- fromJSON(file = json_file) +result_mut<-fromJSON(file = mut_file) +ssm_pre<-read.table(file = ssm_file, header = TRUE) +cnv_pre<-read.delim(file = cnv_file, header = TRUE)[,c("cnv","a","d")] + + + +################################################## +# define output files +################################################## +out_json_to_Rtable= snakemake@output[["tree_summary"]] +ssm_to_trees= snakemake@output[["maf"]] +cnv_to_trees= snakemake@output[["cnvs"]] +cellular_prevalence_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cellular_prevalence.pdf")) +CCF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cancer_cell_fraction_.pdf")) +VAF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_.pdf")) +VAF_coding_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_coding.pdf")) +tree_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_tree.pdf")) +CCF_table = snakemake@output[["CCF"]] + +if(!dir.exists(snakemake@output[["plots"]])){dir.create(snakemake@output[["plots"]])} + +# out_json_to_Rtable= file.path(output_dir, paste("out_res_",samplename,"_json_converted_toR.table", sep = "")) +# ssm_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_ssms_to_best_tree_maf_format.table", sep = "")) +# cnv_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_cnvs_to_best_tree_maf_format.table", sep = "")) +# cellular_prevalence_plot= file.path(output_dir, paste("cellular_prevalence_",samplename,".pdf", sep = "")) +# CCF_plot= file.path(output_dir, paste("cancer_cell_fraction_",samplename,".pdf", sep = "")) +# VAF_plot= file.path(output_dir, paste("vaf_",samplename,".pdf", sep = "")) +# VAF_coding_plot= file.path(output_dir, paste("vaf_ccoding",samplename,".pdf", sep = "")) + + +################################################### +# open summ.json file and convert it into humam readable format +################################################### + +#this function opens SAMPLE_ID_summ.jason and converts it into R table +open_tree = function(json_summ_file,out_json_to_Rtable){ + + out_res<-NULL + for (j in 1:length(json_summ_file[["trees"]])){ + + tree_focal<-json_summ_file[["trees"]][j] + tree_focal_statA<-as.data.frame(t(unlist(sapply(tree_focal,function(x)x[c("clustering_index","branching_index","llh","linearity_index")])))) + colnames(tree_focal_statA)<-c("clustering_index","branching_index","llh","linearity_index") + tree_focal_statA$tree_id<-j-1 + rownames(tree_focal_statA)<-NULL + + + tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,-c(3,6,9,12,15,18,21,24,27,30)] + #tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))] + colnames(tree_focal_statB)<-sub("^[^.]*.", "", colnames(tree_focal_statB)) + stat_both<-cbind(tree_focal_statA,tree_focal_statB) + out_res<-bind_rows(stat_both,out_res) + out_res_ordered<-out_res[order(out_res$tree_id),] + } # for j loop + 
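+# Assumed summ.json shape (a sketch, not checked against the PhyloWGS spec):
+# "tree_densities" maps tree index to posterior density, e.g. list("0" = -1234.5,
+# "1" = -1240.2); unlist() flattens it so the index can be recovered from the names.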
+density<-json_summ_file["tree_densities"] +density_unlist<-data.frame("density"=unlist(density)) +row.names(density_unlist)<-sub("^[^.]*.", "", row.names(density_unlist)) + +density_unlist$tree_id<-row.names(density_unlist) +row.names(density_unlist)<-NULL + +final_table=merge(out_res_ordered,density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to all tress table +write.table(final_table, file =out_json_to_Rtable ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) +return(final_table) +} + + +result_tree<-open_tree(result1,out_json_to_Rtable) + + +################################################### +# extrcats the best tree +################################################### +#the best tree is the tree with the highest density + +best_tree_id = function(R_table, density) { + best=R_table[which.max(R_table$density),] + best_tree_focal_name<-best$tree_id + best_tree_id<-paste(best$tree_id,"json",sep = ".") + return(best_tree_id) + return(best_tree_focal_name) +} +best_tree_fileID<-best_tree_id(result_tree, density) + + +####################################################################### +# extract the stats (SNvs and CNVs assigned to each population) from the best tree +####################################################################### + +open_best_tree = function(trees_out,best_tree_id){ + unzip(trees_out, files = best_tree_id, exdir = dirname(trees_out), overwrite = TRUE) + best_tree_path = paste0(dirname(trees_out), "/", best_tree_id) + rr <- fromJSON(file = best_tree_path) + return(rr) +} +rr= open_best_tree(trees_out,best_tree_fileID) + + +####################################################################### +# annotate point mutations and CNVs in the best tree +####################################################################### +best_focal<-result1[["trees"]][as.numeric(gsub(".json","",best_tree_fileID))+1] +tree_structure<-as.data.frame(sapply(best_focal,function(x)x["structure"])) ##[6] +tree_roots <- best_focal[[1]]$structure$`0` + + +merge_both<-function(result1,best_tree_fileID,tree_structure){ + best_tree<-as.numeric(gsub(".json","",best_tree_fileID)) + best_focal<-result1[["trees"]][best_tree+1] + tree_focal_statB<-as.data.frame(sapply(best_focal,function(x)x["populations"])) ##[3] + qq<-tree_focal_statB[,grep("cellular_prevalence",colnames(tree_focal_statB))] %>% + rownames_to_column("sample") %>% + pivot_longer(-sample, + names_to = "population", + values_to = "cellular_prevalence") %>% + mutate(population = str_remove(str_remove(population, ".*populations[.]"), "[.]cellular_prevalence")) %>% + mutate(is_root = ifelse(population %in% tree_roots, TRUE, FALSE)) %>% + group_by(sample) %>% + mutate(purity = sum(cellular_prevalence[is_root]), + CCF = cellular_prevalence / purity) + + return(qq) + +} + +both_samples<-merge_both(result1,best_tree_fileID,tree_structure) + + +write_tsv(both_samples, CCF_table) + + +ssm = function(stat_best_tree, ssm_pre,ssm_to_trees,tree_structure, maf_list){ + + out_res_ssm<-NULL + for ( i in 1:length(stat_best_tree$mut_assignments)){ + + focal<-(stat_best_tree$mut_assignments)[i] + + focal_ssms<-data.frame(sapply(focal, function(x) x[1])) + + colnames(focal_ssms)<-sub("^[^.]*.", "", colnames(focal_ssms)) + focal_ssms$phyloWGS_population<-i + ssm_assign<-merge(ssm_pre,focal_ssms, by.x = "id", by.y = "ssms")[,c("id", "gene","phyloWGS_population")] + ssm_assign_spi<-separate(ssm_assign, col = gene, into = c("Chromosome","Start_Position"), sep = "_", convert = FALSE) %>% + mutate(Start_Position = 
as.numeric(Start_Position)) + if(chr_prefixed) { + ssm_assign_spi$Chromosome = str_c("chr", as.character(ssm_assign_spi$Chromosome)) + } + + + out_res_ssm<-rbind(ssm_assign_spi,out_res_ssm) + + } ## i loop + + ssm_assign_with_maf <- lapply(maf_list, function(x){ + maf <- read_tsv(x, + col_types = cols(Chromosome = col_character())) %>% + # PhyloWGS changes the start position of deletions. This makes the maf start position match that in the PhyloWGS SSM table. + mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position - 1, Start_Position)) + maf <- out_res_ssm %>% + left_join(maf, by = c("Chromosome", "Start_Position")) %>% + # Restore the true MAF start postion after the hack above + mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position + 1, Start_Position)) %>% + select(colnames(maf), everything()) + }) + out_res_ssm <- rbindlist(ssm_assign_with_maf) %>% + mutate(clonal_status = case_when( + phyloWGS_population %in% tree_roots & length(tree_roots) > 1 ~ "polyclonal", + phyloWGS_population %in% tree_roots & length(tree_roots) == 1 ~ "clonal", + TRUE ~ "subclonal" + )) + + return(out_res_ssm) + + +} + +ss<-ssm(rr,ssm_pre,ssm_to_trees,tree_structure, mafs) + +write_tsv(ss, ssm_to_trees, na = "") + + +########################################################### +## load mut file to extrcat CNVs start and end positions +########################################################### + +cnv = function(stat_best_tree, cnv_pre,mutation_file,cnv_to_trees){ + out_res_cnv <- + bind_rows(lapply(1:length(stat_best_tree$mut_assignments), function(x) + data.frame(cnvs = stat_best_tree$mut_assignments[[x]]$cnvs) %>% mutate(phyloWGS_population = x))) + + + + #return(out_res_cnv) + out_res_mut<-NULL + for (cn in 1:length(result_mut$cnvs)){ + focal_mut_cnv<-(result_mut$cnvs)[cn] + + focal_mut<-data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1,] + colnames(focal_mut)<-sub("^[^.]*.", "", colnames(focal_mut)) + focal_mut$cnv_id<-names(focal_mut_cnv) + out_res_mut<-bind_rows(focal_mut,out_res_mut) + } ## cn loop + + both_cnvs<-merge(out_res_cnv, out_res_mut, by.x = "cnvs",by.y = "cnv_id") %>% + select(cnvs, phyloWGS_population, physical_cnvs.chrom, + physical_cnvs.start, physical_cnvs.end, + physical_cnvs.major_cn, physical_cnvs.minor_cn, physical_cnvs.cell_prev) + + +} + + +cnv<-cnv(rr, cnv_pre,result_mut,cnv_to_trees) +write.table(cnv, file =cnv_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) + +##################### plot the results #################### +########################################################### +#### Slope chart the best tree, cellular prevalence ####### + +plot_cp<-function(both_samples,cellular_prevalence_plot){ +pdf(cellular_prevalence_plot, width = 8, height =8 ) +plotA<-ggplot(data = both_samples, aes(x = sample, y = cellular_prevalence, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + ylab("Cellular prevalence")+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotA) 
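+# ggplot objects are not auto-printed inside a function, hence the explicit print();
+# dev.off() then closes the PDF device opened by pdf() above and flushes the file.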
+dev.off() +} + +plot_cp(both_samples,cellular_prevalence_plot) + +########################################################### +#### Slope chart the best tree, CCF ####### + +plot_cp<-function(both_samples,CCF_plot){ +pdf(CCF_plot, width = 8, height =8 ) +plotB<-ggplot(data = both_samples[both_samples$population != 0, ], aes(x = sample, y = CCF, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + ylab("CCF")+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotB) +dev.off() +} + +plot_cp(both_samples,CCF_plot) + + +############################################# +##### Slope chart the best tree (VAF) ####### + + +plot_vaf<-function(ss,VAF_plot){ + pdf(VAF_plot, width = 8, height =8 ) + plotC <- ss %>% + select(Hugo_Symbol, Chromosome, Tumor_Sample_Barcode, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate(VAF = t_alt_count/t_depth, + populations = as.factor(populations)) %>% + filter(!is.na(Tumor_Sample_Barcode)) %>% + ggplot(aes(x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations)) + + geom_line(aes(color = populations), size=0.2, alpha=0.4)+ + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + xlab("Sample") + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3)))+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) +print(plotC) +dev.off() +} + + + +plot_vaf(ss,VAF_plot) + +############################################################# +##### Slope chart the best tree (VAF, coding regions, nonsense, missense and splicing sites) ####### + +drivers <- read_tsv(driver_genes, col_names = "gene") %>% pull(gene) + +plot_vaf_coding<-function(maf,VAF_coding_plot){ + + coding <- ss %>% + select(Hugo_Symbol, HGVSp_Short, Chromosome, Tumor_Sample_Barcode, Variant_Classification, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate(VAF = t_alt_count/t_depth, + populations = as.factor(populations), + Tumor_Sample_Barcode = factor(Tumor_Sample_Barcode, levels = sample_order)) %>% + filter(!is.na(Tumor_Sample_Barcode), + !Variant_Classification %in% c("Silent", "RNA", "IGR", "Intron", "5'Flank", "3'Flank", "5'UTR")) %>% + mutate(label = ifelse(!is.na(HGVSp_Short), str_c(Hugo_Symbol, "_", HGVSp_Short), str_c(Hugo_Symbol, "_", Variant_Classification))) +pdf(VAF_coding_plot, width = 8, height =8 ) +plotD<-coding %>% + ggplot(aes(x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations)) + + geom_line(aes(color = populations), size=0.5, alpha=0.4)+ + 
geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[1], Hugo_Symbol %in% drivers), + aes(label = label, + x = Tumor_Sample_Barcode, + y = VAF), + nudge_x = -0.2, + size = 4 + ) + + geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[length(sample_order)], Hugo_Symbol %in% drivers), + aes(label = label, + x = Tumor_Sample_Barcode, + y = VAF), + nudge_x = 0.2, + size = 4 + ) + + labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ + xlab("Sample") + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3)))+ + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), + plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) + +print(plotD) +dev.off() +} + + +plot_vaf_coding(ss,VAF_coding_plot) + +############################################# +##### Draw the best tree ####### +############################################# + + +tree_structure_long <- tree_structure %>% + pivot_longer(everything(), + names_to = "parent", + values_to = "node") %>% + mutate(parent = str_remove_all(parent, ".*[.]")) %>% + distinct() + + +positions_x <- function(parents){ + x <- 1:length(unique(parents)) + names(x) <- unique(parents) + col_vals <- unname(x[parents]) + return(col_vals) +} + +tree_structure_long$x <- positions_x(tree_structure_long$parent) + +positions_y <- function(tree_df){ + y = c("0" = 0.5) + for(parent in unique(tree_df$parent)){ + # parent = "1" + child_index = 1 + num_children <- nrow(tree_df[tree_df$parent == parent,]) + if(num_children == 1){ + child <- tree_df[tree_df$parent == parent,]$node + child_y <- unname(y[parent]) + names(child_y) <- child + y = c(y, child_y) + + } else { + children <- tree_df[tree_df$parent == parent,]$node + y_max <- unname(y[parent]) + (0.25 / child_index) + y_min <- unname(y[parent]) - (0.25 / child_index) + y_range <- seq(y_min, y_max, length.out = length(children)) + names(y_range) <- children + y = c(y, y_range) + } + child_index = child_index + 1 + } + return(y) +} + +tree_structure_long$y <- unname(positions_y(tree_structure_long)[as.character(tree_structure_long$node)]) + +tree_structure_long <- add_row(tree_structure_long, parent = "0", node = 0, x = 0, y = 0.5) + +get_ssms <- function(tree_df, best_focal, best_tree_fileID){ + data <- best_focal[[str_remove_all(best_tree_fileID, "[.].*")]]$populations + ssm_vec <- c() + for(node in tree_df$node){ + # node = "1" + num_ssms <- data[[as.character(node)]]$num_ssms + names(num_ssms) <- as.character(node) + ssm_vec <- c(ssm_vec, num_ssms) + } + return(ssm_vec) +} + +tree_structure_long$num_ssms <- get_ssms(tree_structure_long, best_focal, best_tree_fileID)[as.character(tree_structure_long$node)] + +tree_structure_long <- tree_structure_long %>% + mutate(parent = as.numeric(parent)) %>% + left_join(select(tree_structure_long, node, xstart = x, ystart = y), + by = c("parent" = "node")) + + + +ggplot(tree_structure_long, + aes(x = x, + y = y, + label = node)) + + geom_segment(inherit.aes = FALSE, + aes(x = xstart, + xend = x, + y = ystart, + yend = y)) + + geom_point(aes(size = num_ssms), + fill = "white", + colour = 
"black", + pch = 21) + + geom_text() + + scale_size(range = c(5,20)) + + ylim(0,1) + + theme_void() + + ggtitle(samplename) + + theme(legend.position = "none") + +ggsave(tree_plot, height = 6, width = 6) + +############ +##### END ## +############ + + + + From d5af90f6bfe97556a2199122cc47ea9687fa0e77 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 22:51:32 -0700 Subject: [PATCH 07/14] Add input job grouping --- modules/phylowgs/1.0/phylowgs.smk | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/phylowgs/1.0/phylowgs.smk b/modules/phylowgs/1.0/phylowgs.smk index 1cdba641..1ceb3110 100644 --- a/modules/phylowgs/1.0/phylowgs.smk +++ b/modules/phylowgs/1.0/phylowgs.smk @@ -154,6 +154,7 @@ rule _phylowgs_input_maf: maf = CFG["inputs"]["maf"], output: maf = CFG["dirs"]["inputs"] + "maf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.maf", + group: "input_maf" run: op.absolute_symlink(input.maf, output.maf) @@ -165,6 +166,7 @@ rule _phylowgs_input_battenberg: output: cellularity = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.cellularity_ploidy.txt", subclones = CFG["dirs"]["inputs"] + "battenberg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.subclones.txt" + group: "input_battenberg" run: op.absolute_symlink(input.cellularity, output.cellularity) op.absolute_symlink(input.subclones, output.subclones) @@ -187,6 +189,7 @@ rule _phylowgs_parse_battenberg: CFG["threads"]["create_inputs"] resources: **CFG["resources"]["create_inputs"] + group: "input_battenberg" shell: op.as_one_line(""" cellularity=$(tail -n +2 {input.cellularity} | cut -f 1); @@ -203,6 +206,7 @@ rule _phylowgs_maf_to_vcf: vcf = temp(CFG['dirs']['maf_to_vcf'] + "{seq_type}--{genome_build}/{patient_id}/{tumour_id}--{normal_id}--{pair_status}.maf_to.vcf") conda: CFG["conda_envs"]["vcf2maf"] + group: "input_maf" shell: op.as_one_line(""" maf2vcf.pl --input-maf {input.maf} --output-dir $(dirname {output.vcf}) --output-vcf {output.vcf} --ref-fasta {input.fasta} From 7030305ad3492bd9fb13537d5ddcc086a8c64fc4 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Mon, 12 Sep 2022 22:53:17 -0700 Subject: [PATCH 08/14] Note about scratch subdirectories --- modules/phylowgs/1.0/config/default.yaml | 31 ++++++++++++------------ 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/modules/phylowgs/1.0/config/default.yaml b/modules/phylowgs/1.0/config/default.yaml index e8e0c1e4..f1557fa1 100644 --- a/modules/phylowgs/1.0/config/default.yaml +++ b/modules/phylowgs/1.0/config/default.yaml @@ -1,27 +1,27 @@ lcr-modules: - + phylowgs: inputs: # Available wildcards: {tumour_id} {normal_id} {pair_status} {genome_build} {sample_id} - maf: "__UPDATE__" + maf: "__UPDATE__" cellularity: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt subclones: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt drivers: "__UPDATE__" # newline-separated list of driver gene HUGO symbols to be included on plots - scratch_subdirectories: [] + scratch_subdirectories: [] # Recommended: "04-multievolve" options: - create_inputs: - opts: - "-s 5000 --verbose --regions all" # -s controls how many variants should be sub-sampled. + create_inputs: + opts: + "-s 5000 --verbose --regions all" # -s controls how many variants should be sub-sampled. 
multievolve: "" write_results: "--include-ssm-names" - switches: - # NOTE: You must include a "sex" column in the input samples table, formatted with "M" and "F". - # If patient sex is unknown, you can leave it empty and phyloWGS will run in "auto" mode. - sex: + switches: + # NOTE: You must include a "sex" column in the input samples table, formatted with "M" and "F". + # If patient sex is unknown, you can leave it empty and phyloWGS will run in "auto" mode. + sex: M: "male" F: "female" @@ -33,24 +33,24 @@ lcr-modules: bcftools: "{MODSDIR}/envs/bcftools-1.10.2.yaml" coreutils: "{MODSDIR}/envs/coreutils-8.31.yaml" - scripts: + scripts: fill_battenberg: "{MODSDIR}/src/fill_battenberg.py" arm_file: "{MODSDIR}/etc/chromArmFiles/chromArm.{genome_build}.tsv" process_outputs: "src/process_phyloWGS_outputs.R" noncoding: "{MODSDIR}/etc/noncoding.txt" - + threads: create_inputs: 1 multievolve: 4 write_results: 1 resources: - create_inputs: + create_inputs: mem_mb: 10000 - multievolve: + multievolve: mem_mb: 40000 evolve: 1 - write_results: + write_results: mem_mb: 20000 pairing_config: @@ -58,4 +58,3 @@ lcr-modules: run_paired_tumours: True run_unpaired_tumours_with: None run_paired_tumours_as_unpaired: False - \ No newline at end of file From 59159c0efa6f4d0247f4e6eb909f36a56d3e6c91 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Fri, 3 Mar 2023 15:36:15 -0800 Subject: [PATCH 09/14] Unneeded env yaml file --- envs/phylowgs/fill_battenberg.yaml | 41 ------------------------------ 1 file changed, 41 deletions(-) delete mode 100644 envs/phylowgs/fill_battenberg.yaml diff --git a/envs/phylowgs/fill_battenberg.yaml b/envs/phylowgs/fill_battenberg.yaml deleted file mode 100644 index 52741e52..00000000 --- a/envs/phylowgs/fill_battenberg.yaml +++ /dev/null @@ -1,41 +0,0 @@ -name: fill_segments -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1 - - _openmp_mutex=4.5 - - ca-certificates=2020.12.5 - - certifi=2020.12.5 - - ld_impl_linux-64=2.35.1 - - libblas=3.9.0 - - libcblas=3.9.0 - - libffi=3.3 - - libgcc-ng=9.3.0 - - libgfortran-ng=9.3.0 - - libgfortran5=9.3.0 - - libgomp=9.3.0 - - liblapack=3.9.0 - - libopenblas=0.3.12 - - libstdcxx-ng=9.3.0 - - ncurses=6.2 - - numpy=1.19.4 - - openssl=1.1.1i - - pandas=1.2.0 - - pip=20.3.3 - - python=3.9.1 - - python-dateutil=2.8.1 - - python_abi=3.9 - - pytz=2020.5 - - readline=8.0 - - setuptools=49.6.0 - - simplejson=3.17.2 - - six=1.15.0 - - sqlite=3.34.0 - - tk=8.6.10 - - tzdata=2020e - - wheel=0.36.2 - - xz=5.2.5 - - zlib=1.2.11 -prefix: /home/dreval/miniconda3/envs/fill_segments From 869a266ef4670d1e312f66208ba3faf5cdeec192 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Fri, 3 Mar 2023 15:39:45 -0800 Subject: [PATCH 10/14] Add dependency checking --- modules/phylowgs/1.0/phylowgs.smk | 42 ++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/modules/phylowgs/1.0/phylowgs.smk b/modules/phylowgs/1.0/phylowgs.smk index 1ceb3110..39a6b990 100644 --- a/modules/phylowgs/1.0/phylowgs.smk +++ b/modules/phylowgs/1.0/phylowgs.smk @@ -16,7 +16,47 @@ import oncopipe as op import hashlib import glob - +import inspect + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. 
Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version")
+
+# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe
+
+current_version = pkg_resources.get_distribution("oncopipe").version
+if version.parse(current_version) < version.parse(min_oncopipe_version):
+    logger.warning(
+                '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m'
+                )
+    sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)")
+
+# End of dependency checking section

 # Setup module and store module-specific configuration in `CFG`
 # `CFG` is a shortcut to `config["lcr-modules"]["phylowgs"]`

From 9e3161607ec147040474d81f738138612b59a21e Mon Sep 17 00:00:00 2001
From: lkhilton
Date: Wed, 12 Apr 2023 15:58:26 -0700
Subject: [PATCH 11/14] Add 3'UTR

---
 modules/phylowgs/1.0/etc/noncoding.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/phylowgs/1.0/etc/noncoding.txt b/modules/phylowgs/1.0/etc/noncoding.txt
index 258f27f0..c5fd789c 100644
--- a/modules/phylowgs/1.0/etc/noncoding.txt
+++ b/modules/phylowgs/1.0/etc/noncoding.txt
@@ -6,3 +6,4 @@ Intron
 5'Flank
 3'Flank
 5'UTR
+3'UTR

From e9803de4ce9ef717b54272231fe035f924e08331 Mon Sep 17 00:00:00 2001
From: lkhilton
Date: Wed, 12 Apr 2023 15:58:46 -0700
Subject: [PATCH 12/14] Patch for GAMBLR installation

---
 modules/pyclone_vi/1.0/pyclone_vi.smk | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/modules/pyclone_vi/1.0/pyclone_vi.smk b/modules/pyclone_vi/1.0/pyclone_vi.smk
index 3a038b61..a882ebd1 100644
--- a/modules/pyclone_vi/1.0/pyclone_vi.smk
+++ b/modules/pyclone_vi/1.0/pyclone_vi.smk
@@ -45,7 +45,10 @@ f = open("config/envs/GAMBLR.yaml", 'rb')
 md5hash.update(f.read())
 f.close()
 h = md5hash.hexdigest()
-GAMBLR = glob.glob(conda_prefix + "/" + h[:8] + "*")[0]
+GAMBLR = glob.glob(conda_prefix + "/" + h[:8] + "*")
+for file in GAMBLR:
+    if os.path.isdir(file):
+        GAMBLR = file

 rule _pyclone_vi_install_GAMBLR:
     params:
@@ -369,6 +372,16 @@ if isinstance(PATIENTS_GENOMES, pd.DataFrame) and isinstance(PATIENTS_CAPTURE, p
 rule _pyclone_vi_all:
     input:
+        expand(
+            rules._pyclone_vi_input_maf.output.maf,
+            zip,
+            tumour_id = 
CFG["runs"]["tumour_sample_id"], + normal_id = CFG["runs"]["normal_sample_id"], + pair_status = CFG["runs"]["pair_status"], + seq_type = CFG["runs"]["tumour_seq_type"], + genome_build = CFG["runs"]["tumour_genome_build"], + patient_id = CFG["runs"]["tumour_patient_id"] + ), expand( [ str(rules._pyclone_vi_output_tsv.output.phyclone), From 0cec80e90f26dc3fba7d9e44771ce1c73ab6c3ed Mon Sep 17 00:00:00 2001 From: lkhilton Date: Wed, 12 Apr 2023 15:59:07 -0700 Subject: [PATCH 13/14] Include indels in maf --- modules/pyclone_vi/1.0/src/build_input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/pyclone_vi/1.0/src/build_input.py b/modules/pyclone_vi/1.0/src/build_input.py index f412fd6c..48170bab 100644 --- a/modules/pyclone_vi/1.0/src/build_input.py +++ b/modules/pyclone_vi/1.0/src/build_input.py @@ -96,7 +96,7 @@ def get_normal_cn(chrom, sex): def load_snv_df(file_name, sample_id="tumour"): df = pd.read_csv(file_name, sep="\t") # PyClone only works on SNPs, not InDels - df = df[df["Variant_Type"].isin(["SNP"])] + # df = df[df["Variant_Type"].isin(["SNP"])] # Ignore intergenic mutations (IGR) df = df[~df["Variant_Classification"].isin(["IGR"])] # Acutally I can't do this sub-sampling here because the mutations in all files for all tumours From d484a11772706825d65081b356563c508c08ad84 Mon Sep 17 00:00:00 2001 From: lkhilton Date: Wed, 12 Jun 2024 11:11:55 -0700 Subject: [PATCH 14/14] Implement suggested changes from PR --- modules/phylowgs/1.0/config/default.yaml | 4 +- .../1.0/src/process_phyloWGS_outputs.R | 679 +++++++++--------- modules/pyclone_vi/1.0/config/default.yaml | 6 +- modules/pyclone_vi/1.0/pyclone_vi.smk | 10 +- 4 files changed, 337 insertions(+), 362 deletions(-) diff --git a/modules/phylowgs/1.0/config/default.yaml b/modules/phylowgs/1.0/config/default.yaml index f1557fa1..ff6a3332 100644 --- a/modules/phylowgs/1.0/config/default.yaml +++ b/modules/phylowgs/1.0/config/default.yaml @@ -5,8 +5,8 @@ lcr-modules: inputs: # Available wildcards: {tumour_id} {normal_id} {pair_status} {genome_build} {sample_id} maf: "__UPDATE__" - cellularity: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt - subclones: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt + cellularity: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt + subclones: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt drivers: "__UPDATE__" # newline-separated list of driver gene HUGO symbols to be included on plots scratch_subdirectories: [] # Recommended: "04-multievolve" diff --git a/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R index 587cdb05..ae57c9ab 100644 --- a/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R +++ b/modules/phylowgs/1.0/src/process_phyloWGS_outputs.R @@ -1,12 +1,10 @@ - - #' #' processing phyloWGS outputs pipeline that takes output json files and preprocessing output files #' SAMPLE_ID.mutass.zip file must be unzipped before runing the script -#E example: how to run -#mkdir -p output -#Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unziped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out +# E example: how 
to run +# mkdir -p output +# Rscript ./process.R --samplename SAMPLE_ID -j SAMPLE_ID.summ.json -t unziped.mutass/ -s ssm_data.txt -c cnv_data.txt -a SAMPLE_ID--matched_slms-3.final_deblacklisted_augmented.maf -b SAMPLE_ID_matched_slms-3.final_deblacklisted_augmented.maf -m SAMPLE_ID.muts.json -o out ################################################## # load required libraries @@ -23,91 +21,48 @@ library("data.table") #### Snakemake Input ##### ########################## -samplename = snakemake@wildcards[["patient_id"]] -json_file = snakemake@input[["summ"]] -trees_out= snakemake@input[["mutass"]] -ssm_file = snakemake@input[["ssms"]] -cnv_file = snakemake@input[["cnvs"]] -mafs = unlist(strsplit(snakemake@params[["maf_list"]], ",")) -mut_file = snakemake@input[["muts"]] -driver_genes = snakemake@params[["drivers"]] -sample_order = unlist(strsplit(snakemake@params[["sample_order"]], ",")) -genome_build = snakemake@wildcards[["genome_build"]] +samplename <- snakemake@wildcards[["patient_id"]] +json_file <- snakemake@input[["summ"]] +trees_out <- snakemake@input[["mutass"]] +ssm_file <- snakemake@input[["ssms"]] +cnv_file <- snakemake@input[["cnvs"]] +mafs <- unlist(strsplit(snakemake@params[["maf_list"]], ",")) +mut_file <- snakemake@input[["muts"]] +driver_genes <- snakemake@params[["drivers"]] +sample_order <- unlist(strsplit(snakemake@params[["sample_order"]], ",")) +genome_build <- snakemake@wildcards[["genome_build"]] # Define the chr_prefix parameter based on the genome_build -chr_prefixed = str_detect(genome_build, "hg") - - -# option_list = list( -# make_option(c("-n", "--samplename"), type="character", default=NULL, help="Samplename of the sample to run", metavar="character"), -# make_option(c("-j", "--json_summ"), type="character", default=NULL, help="SAMPLE_ID.summ.json file generated by phyloWGS", metavar="character"), -# make_option(c("-t", "--trees_out"), type="character", default=NULL, help="Directory containing unzipped XX.mutass.zip trees", metavar="character"), -# make_option(c("-s", "--ssm"), type="character", default=NULL, help="Preprocessing ssm_data.txt output file", metavar="character"), -# make_option(c("-c", "--copynumber"), type="character", default=NULL, help="Preprocessing cnv_data.txt output file", metavar="character"), -# make_option(c("-a", "--tumourA_maf"), type="character", default=NULL, help="Agument maf file of tumour A", metavar="character"), -# make_option(c("-b", "--tumourB_maf"), type="character", default=NULL, help="Agument maf file of tumour B", metavar="character"), -# make_option(c("-m", "--json_muts"), type="character", default=NULL, help="SAMPLE_ID.muts.json file", metavar="character"), -# make_option(c("-o", "--output"), type="character", default=NULL, help="Output directory", metavar="character") -# ) -# -# opt_parser = OptionParser(option_list=option_list) -# opt = parse_args(opt_parser) -# -# samplename = opt$samplename -# json_file = opt$json_summ -# trees_out= opt$trees_out ### directory where unziped SAMPLE_ID.mutass.zip trees are -# ssm_file = opt$ssm -# cnv_file = opt$copynumber -# mafA = opt$tumourA_maf -# mafB = opt$tumourB_maf -# mut_file = opt$json_muts -# output_dir = opt$output -# -# -# -# .checkfile = function(infile) { -# -# if (!file.exists(infile)) { -# -# stop(paste("File", infile, "does not exist", sep="")) -# -# } -# -# } -# -# -# .checkfile(json_file) -# .checkfile(ssm_file) -# .checkfile(cnv_file) -# .checkfile(mafA) -# .checkfile(mafB) -# .checkfile(mut_file) +chr_prefixed <- str_detect(genome_build, "hg") + 
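+# e.g. str_detect("hg38", "hg") is TRUE, so SSM chromosome names are later given
+# a "chr" prefix in ssm(), while a build like "grch37" yields FALSE and the
+# chromosome names are left unprefixed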
##################################################
# Process input files
###################################################

# Parse the input file and obtain the required data for this run
 result1 <- fromJSON(file = json_file)
-result_mut<-fromJSON(file = mut_file)
-ssm_pre<-read.table(file = ssm_file, header = TRUE)
-cnv_pre<-read.delim(file = cnv_file, header = TRUE)[,c("cnv","a","d")]
+result_mut <- fromJSON(file = mut_file)
+ssm_pre <- read.table(file = ssm_file, header = TRUE)
+cnv_pre <- read.delim(file = cnv_file, header = TRUE)[, c("cnv", "a", "d")]
 
 ##################################################
 # define output files
 ##################################################
 
-out_json_to_Rtable= snakemake@output[["tree_summary"]]
-ssm_to_trees= snakemake@output[["maf"]]
-cnv_to_trees= snakemake@output[["cnvs"]]
-cellular_prevalence_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cellular_prevalence.pdf"))
-CCF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_cancer_cell_fraction_.pdf"))
-VAF_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_.pdf"))
-VAF_coding_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_coding.pdf"))
-tree_plot= file.path(snakemake@output[["plots"]], paste0(samplename, "_tree.pdf"))
-CCF_table = snakemake@output[["CCF"]]
-
-if(!dir.exists(snakemake@output[["plots"]])){dir.create(snakemake@output[["plots"]])}
+out_json_to_Rtable <- snakemake@output[["tree_summary"]]
+ssm_to_trees <- snakemake@output[["maf"]]
+cnv_to_trees <- snakemake@output[["cnvs"]]
+cellular_prevalence_plot <- file.path(snakemake@output[["plots"]], paste0(samplename, "_cellular_prevalence.pdf"))
+CCF_plot <- file.path(snakemake@output[["plots"]], paste0(samplename, "_cancer_cell_fraction_.pdf"))
+VAF_plot <- file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_.pdf"))
+VAF_coding_plot <- file.path(snakemake@output[["plots"]], paste0(samplename, "_vaf_coding.pdf"))
+tree_plot <- file.path(snakemake@output[["plots"]], paste0(samplename, "_tree.pdf"))
+CCF_table <- snakemake@output[["CCF"]]
+
+if (!dir.exists(snakemake@output[["plots"]])) {
+  dir.create(snakemake@output[["plots"]])
+}
 
 # out_json_to_Rtable= file.path(output_dir, paste("out_res_",samplename,"_json_converted_toR.table", sep = ""))
 # ssm_to_trees= file.path(output_dir, paste("out_res_",samplename,"_assigned_ssms_to_best_tree_maf_format.table", sep = ""))
@@ -122,151 +77,147 @@
 # open summ.json file and convert it into human readable format
 ###################################################
 
-#this function opens SAMPLE_ID_summ.jason and converts it into R table
-open_tree = function(json_summ_file,out_json_to_Rtable){
-
-  out_res<-NULL
-  for (j in 1:length(json_summ_file[["trees"]])){
-
-    tree_focal<-json_summ_file[["trees"]][j]
-    tree_focal_statA<-as.data.frame(t(unlist(sapply(tree_focal,function(x)x[c("clustering_index","branching_index","llh","linearity_index")]))))
-    colnames(tree_focal_statA)<-c("clustering_index","branching_index","llh","linearity_index")
-    tree_focal_statA$tree_id<-j-1
-    rownames(tree_focal_statA)<-NULL
-
-
-    tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,-c(3,6,9,12,15,18,21,24,27,30)]
-    #tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))]
-    colnames(tree_focal_statB)<-sub("^[^.]*.", "", colnames(tree_focal_statB))
-    
stat_both<-cbind(tree_focal_statA,tree_focal_statB)
-    out_res<-bind_rows(stat_both,out_res)
-    out_res_ordered<-out_res[order(out_res$tree_id),]
-  } # for j loop
-
-density<-json_summ_file["tree_densities"]
-density_unlist<-data.frame("density"=unlist(density))
-row.names(density_unlist)<-sub("^[^.]*.", "", row.names(density_unlist))
-
-density_unlist$tree_id<-row.names(density_unlist)
-row.names(density_unlist)<-NULL
-
-final_table=merge(out_res_ordered,density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to all tress table
-write.table(final_table, file =out_json_to_Rtable ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE)
-return(final_table)
+# this function opens SAMPLE_ID.summ.json and converts it into an R table
+open_tree <- function(json_summ_file, out_json_to_Rtable) {
+  out_res <- NULL
+  for (j in 1:length(json_summ_file[["trees"]])) {
+    tree_focal <- json_summ_file[["trees"]][j]
+    tree_focal_statA <- as.data.frame(t(unlist(sapply(tree_focal, function(x) x[c("clustering_index", "branching_index", "llh", "linearity_index")]))))
+    colnames(tree_focal_statA) <- c("clustering_index", "branching_index", "llh", "linearity_index")
+    tree_focal_statA$tree_id <- j - 1
+    rownames(tree_focal_statA) <- NULL
+
+
+    tree_focal_statB <- as.data.frame(sapply(tree_focal, function(x) x[3]))[1, -c(3, 6, 9, 12, 15, 18, 21, 24, 27, 30)]
+    # tree_focal_statB<-as.data.frame(sapply(tree_focal,function(x)x[3]))[1,!(grepl("cellular_prevalence",colnames(tree_focal_statB)))]
+    colnames(tree_focal_statB) <- sub("^[^.]*.", "", colnames(tree_focal_statB))
+    stat_both <- cbind(tree_focal_statA, tree_focal_statB)
+    out_res <- bind_rows(stat_both, out_res)
+    out_res_ordered <- out_res[order(out_res$tree_id), ]
+  } # for j loop
+
+  density <- json_summ_file["tree_densities"]
+  density_unlist <- data.frame("density" = unlist(density))
+  row.names(density_unlist) <- sub("^[^.]*.", "", row.names(density_unlist))
+
+  density_unlist$tree_id <- row.names(density_unlist)
+  row.names(density_unlist) <- NULL
+
+  final_table <- merge(out_res_ordered, density_unlist, by.x = "tree_id", by.y = "tree_id") ## add tree densities to all trees table
+  write.table(final_table, file = out_json_to_Rtable, col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)
+  return(final_table)
 }

-result_tree<-open_tree(result1,out_json_to_Rtable)
+result_tree <- open_tree(result1, out_json_to_Rtable)

###################################################
# extracts the best tree
###################################################

-#the best tree is the tree with the highest density
+# the best tree is the tree with the highest density

-best_tree_id = function(R_table, density) {
-  best=R_table[which.max(R_table$density),]
-  best_tree_focal_name<-best$tree_id
-  best_tree_id<-paste(best$tree_id,"json",sep = ".")
+best_tree_id <- function(R_table, density) {
+  best <- R_table[which.max(R_table$density), ]
+  best_tree_focal_name <- best$tree_id
+  best_tree_id <- paste(best$tree_id, "json", sep = ".")
   return(best_tree_id)
   return(best_tree_focal_name)
 }

-best_tree_fileID<-best_tree_id(result_tree, density)
+best_tree_fileID <- best_tree_id(result_tree, density)

#######################################################################
# extract the stats (SNVs and CNVs assigned to each population) from the best tree
#######################################################################

-open_best_tree = 
function(trees_out,best_tree_id){ +open_best_tree <- function(trees_out, best_tree_id) { unzip(trees_out, files = best_tree_id, exdir = dirname(trees_out), overwrite = TRUE) - best_tree_path = paste0(dirname(trees_out), "/", best_tree_id) - rr <- fromJSON(file = best_tree_path) - return(rr) + best_tree_path <- paste0(dirname(trees_out), "/", best_tree_id) + rr <- fromJSON(file = best_tree_path) + return(rr) } -rr= open_best_tree(trees_out,best_tree_fileID) +rr <- open_best_tree(trees_out, best_tree_fileID) ####################################################################### # annotate point mutations and CNVs in the best tree ####################################################################### -best_focal<-result1[["trees"]][as.numeric(gsub(".json","",best_tree_fileID))+1] -tree_structure<-as.data.frame(sapply(best_focal,function(x)x["structure"])) ##[6] +best_focal <- result1[["trees"]][as.numeric(gsub(".json", "", best_tree_fileID)) + 1] +tree_structure <- as.data.frame(sapply(best_focal, function(x) x["structure"])) ## [6] tree_roots <- best_focal[[1]]$structure$`0` -merge_both<-function(result1,best_tree_fileID,tree_structure){ - best_tree<-as.numeric(gsub(".json","",best_tree_fileID)) - best_focal<-result1[["trees"]][best_tree+1] - tree_focal_statB<-as.data.frame(sapply(best_focal,function(x)x["populations"])) ##[3] - qq<-tree_focal_statB[,grep("cellular_prevalence",colnames(tree_focal_statB))] %>% - rownames_to_column("sample") %>% - pivot_longer(-sample, - names_to = "population", - values_to = "cellular_prevalence") %>% - mutate(population = str_remove(str_remove(population, ".*populations[.]"), "[.]cellular_prevalence")) %>% - mutate(is_root = ifelse(population %in% tree_roots, TRUE, FALSE)) %>% - group_by(sample) %>% - mutate(purity = sum(cellular_prevalence[is_root]), - CCF = cellular_prevalence / purity) - - return(qq) +merge_both <- function(result1, best_tree_fileID, tree_structure) { + best_tree <- as.numeric(gsub(".json", "", best_tree_fileID)) + best_focal <- result1[["trees"]][best_tree + 1] + tree_focal_statB <- as.data.frame(sapply(best_focal, function(x) x["populations"])) ## [3] + qq <- tree_focal_statB[, grep("cellular_prevalence", colnames(tree_focal_statB))] %>% + rownames_to_column("sample") %>% + pivot_longer(-sample, + names_to = "population", + values_to = "cellular_prevalence" + ) %>% + mutate(population = str_remove(str_remove(population, ".*populations[.]"), "[.]cellular_prevalence")) %>% + mutate(is_root = ifelse(population %in% tree_roots, TRUE, FALSE)) %>% + group_by(sample) %>% + mutate( + purity = sum(cellular_prevalence[is_root]), + CCF = cellular_prevalence / purity + ) + return(qq) } -both_samples<-merge_both(result1,best_tree_fileID,tree_structure) +both_samples <- merge_both(result1, best_tree_fileID, tree_structure) write_tsv(both_samples, CCF_table) -ssm = function(stat_best_tree, ssm_pre,ssm_to_trees,tree_structure, maf_list){ - - out_res_ssm<-NULL - for ( i in 1:length(stat_best_tree$mut_assignments)){ - - focal<-(stat_best_tree$mut_assignments)[i] - - focal_ssms<-data.frame(sapply(focal, function(x) x[1])) - - colnames(focal_ssms)<-sub("^[^.]*.", "", colnames(focal_ssms)) - focal_ssms$phyloWGS_population<-i - ssm_assign<-merge(ssm_pre,focal_ssms, by.x = "id", by.y = "ssms")[,c("id", "gene","phyloWGS_population")] - ssm_assign_spi<-separate(ssm_assign, col = gene, into = c("Chromosome","Start_Position"), sep = "_", convert = FALSE) %>% - mutate(Start_Position = as.numeric(Start_Position)) - if(chr_prefixed) { - ssm_assign_spi$Chromosome = 
str_c("chr", as.character(ssm_assign_spi$Chromosome))
-    }
-
-
-    out_res_ssm<-rbind(ssm_assign_spi,out_res_ssm)
-
-  } ## i loop
-
-  ssm_assign_with_maf <- lapply(maf_list, function(x){
-    maf <- read_tsv(x, 
-                    col_types = cols(Chromosome = col_character())) %>% 
-      # PhyloWGS changes the start position of deletions. This makes the maf start position match that in the PhyloWGS SSM table. 
       mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position - 1, Start_Position))
-    maf <- out_res_ssm %>% 
-      left_join(maf, by = c("Chromosome", "Start_Position")) %>% 
       # Restore the true MAF start position after the hack above
       mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position + 1, Start_Position)) %>%
       select(colnames(maf), everything())
   })
-  out_res_ssm <- rbindlist(ssm_assign_with_maf) %>% 
     mutate(clonal_status = case_when(
-      phyloWGS_population %in% tree_roots & length(tree_roots) > 1 ~ "polyclonal", 
       phyloWGS_population %in% tree_roots & length(tree_roots) == 1 ~ "clonal",
       TRUE ~ "subclonal"
     ))
-
-  return(out_res_ssm)
-
-
+ssm <- function(stat_best_tree, ssm_pre, ssm_to_trees, tree_structure, maf_list) {
+  out_res_ssm <- NULL
+  for (i in 1:length(stat_best_tree$mut_assignments)) {
+    focal <- (stat_best_tree$mut_assignments)[i]
+
+    focal_ssms <- data.frame(sapply(focal, function(x) x[1]))
+
+    colnames(focal_ssms) <- sub("^[^.]*.", "", colnames(focal_ssms))
+    focal_ssms$phyloWGS_population <- i
+    ssm_assign <- merge(ssm_pre, focal_ssms, by.x = "id", by.y = "ssms")[, c("id", "gene", "phyloWGS_population")]
+    ssm_assign_spi <- separate(ssm_assign, col = gene, into = c("Chromosome", "Start_Position"), sep = "_", convert = FALSE) %>%
+      mutate(Start_Position = as.numeric(Start_Position))
+    if (chr_prefixed) {
+      ssm_assign_spi$Chromosome <- str_c("chr", as.character(ssm_assign_spi$Chromosome))
+    }
+
+
+    out_res_ssm <- rbind(ssm_assign_spi, out_res_ssm)
+  } ## i loop
+
+  ssm_assign_with_maf <- lapply(maf_list, function(x) {
+    maf <- read_tsv(x,
+      col_types = cols(Chromosome = col_character())
+    ) %>%
+      # PhyloWGS changes the start position of deletions. This makes the maf start position match that in the PhyloWGS SSM table.
       mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position - 1, Start_Position))
+    maf <- out_res_ssm %>%
+      left_join(maf, by = c("Chromosome", "Start_Position")) %>%
       # Restore the true MAF start position after the hack above
       mutate(Start_Position = ifelse(Variant_Type == "DEL", Start_Position + 1, Start_Position)) %>%
       select(colnames(maf), everything())
   })

+  out_res_ssm <- rbindlist(ssm_assign_with_maf) %>%
     mutate(clonal_status = case_when(
+      phyloWGS_population %in% tree_roots & length(tree_roots) > 1 ~ "polyclonal",
       phyloWGS_population %in% tree_roots & length(tree_roots) == 1 ~ "clonal",
       TRUE ~ "subclonal"
     ))
+
+  return(out_res_ssm)
 }

-ss<-ssm(rr,ssm_pre,ssm_to_trees,tree_structure, mafs)
+ss <- ssm(rr, ssm_pre, ssm_to_trees, tree_structure, mafs)

 write_tsv(ss, ssm_to_trees, na = "")


@@ -275,178 +226,205 @@ write_tsv(ss, ssm_to_trees, na = "")
 ## load mut file to extract CNVs start and end positions
###########################################################

-cnv = function(stat_best_tree, cnv_pre,mutation_file,cnv_to_trees){
+cnv <- function(stat_best_tree, cnv_pre, mutation_file, cnv_to_trees) {
   out_res_cnv <-
-    bind_rows(lapply(1:length(stat_best_tree$mut_assignments), function(x)
-      data.frame(cnvs = stat_best_tree$mut_assignments[[x]]$cnvs) %>% mutate(phyloWGS_population = x)))
-
-
-
-  #return(out_res_cnv)
-  out_res_mut<-NULL
-  for (cn in 1:length(result_mut$cnvs)){
-    focal_mut_cnv<-(result_mut$cnvs)[cn]
-
-    focal_mut<-data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1,]
-    colnames(focal_mut)<-sub("^[^.]*.", "", colnames(focal_mut))
-    focal_mut$cnv_id<-names(focal_mut_cnv)
-    out_res_mut<-bind_rows(focal_mut,out_res_mut)
-  } ## cn loop
-
-  both_cnvs<-merge(out_res_cnv, out_res_mut, by.x = "cnvs",by.y = "cnv_id") %>%
-    select(cnvs, phyloWGS_population, 
physical_cnvs.chrom, - physical_cnvs.start, physical_cnvs.end, - physical_cnvs.major_cn, physical_cnvs.minor_cn, physical_cnvs.cell_prev) - - + bind_rows(lapply(1:length(stat_best_tree$mut_assignments), function(x) { + data.frame(cnvs = stat_best_tree$mut_assignments[[x]]$cnvs) %>% mutate(phyloWGS_population = x) + })) + + + + # return(out_res_cnv) + out_res_mut <- NULL + for (cn in 1:length(result_mut$cnvs)) { + focal_mut_cnv <- (result_mut$cnvs)[cn] + + focal_mut <- data.frame(sapply(focal_mut_cnv, function(x) x[1]))[1, ] + colnames(focal_mut) <- sub("^[^.]*.", "", colnames(focal_mut)) + focal_mut$cnv_id <- names(focal_mut_cnv) + out_res_mut <- bind_rows(focal_mut, out_res_mut) + } ## cn loop + + both_cnvs <- merge(out_res_cnv, out_res_mut, by.x = "cnvs", by.y = "cnv_id") %>% + select( + cnvs, phyloWGS_population, physical_cnvs.chrom, + physical_cnvs.start, physical_cnvs.end, + physical_cnvs.major_cn, physical_cnvs.minor_cn, physical_cnvs.cell_prev + ) } -cnv<-cnv(rr, cnv_pre,result_mut,cnv_to_trees) -write.table(cnv, file =cnv_to_trees ,col.names = TRUE, row.names = FALSE, sep = "\t",quote = FALSE) +cnv <- cnv(rr, cnv_pre, result_mut, cnv_to_trees) +write.table(cnv, file = cnv_to_trees, col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE) ##################### plot the results #################### ########################################################### #### Slope chart the best tree, cellular prevalence ####### -plot_cp<-function(both_samples,cellular_prevalence_plot){ -pdf(cellular_prevalence_plot, width = 8, height =8 ) -plotA<-ggplot(data = both_samples, aes(x = sample, y = cellular_prevalence, group = population)) + - geom_line(aes(color = population), size = 2) + - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - geom_point(aes(color = population), size = 4) + - # Labelling as desired - xlab("Sample") + ylab("Cellular prevalence")+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotA) -dev.off() +plot_cp <- function(both_samples, cellular_prevalence_plot) { + pdf(cellular_prevalence_plot, width = 8, height = 8) + plotA <- ggplot(data = both_samples, aes(x = sample, y = cellular_prevalence, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree", gsub(".json", "", best_tree_fileID), sep = " ")) + + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + + ylab("Cellular prevalence") + + theme( + panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25), axis.text = element_text(size = 16), axis.title = element_text(size = 18), + plot.margin = margin(1, 1., 1, 1.5, "cm"), axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)) + ) + print(plotA) + dev.off() } -plot_cp(both_samples,cellular_prevalence_plot) +plot_cp(both_samples, cellular_prevalence_plot) ########################################################### #### Slope chart the best tree, CCF ####### -plot_cp<-function(both_samples,CCF_plot){ 
-pdf(CCF_plot, width = 8, height =8 ) -plotB<-ggplot(data = both_samples[both_samples$population != 0, ], aes(x = sample, y = CCF, group = population)) + - geom_line(aes(color = population), size = 2) + - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - geom_point(aes(color = population), size = 4) + - # Labelling as desired - xlab("Sample") + ylab("CCF")+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 0.25),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotB) -dev.off() +plot_cp <- function(both_samples, CCF_plot) { + pdf(CCF_plot, width = 8, height = 8) + plotB <- ggplot(data = both_samples[both_samples$population != 0, ], aes(x = sample, y = CCF, group = population)) + + geom_line(aes(color = population), size = 2) + + labs(title = paste("Best Tree", gsub(".json", "", best_tree_fileID), sep = " ")) + + geom_point(aes(color = population), size = 4) + + # Labelling as desired + xlab("Sample") + + ylab("CCF") + + theme( + panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 0.25), axis.text = element_text(size = 16), axis.title = element_text(size = 18), + plot.margin = margin(1, 1., 1, 1.5, "cm"), axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)) + ) + print(plotB) + dev.off() } -plot_cp(both_samples,CCF_plot) +plot_cp(both_samples, CCF_plot) ############################################# ##### Slope chart the best tree (VAF) ####### -plot_vaf<-function(ss,VAF_plot){ - pdf(VAF_plot, width = 8, height =8 ) - plotC <- ss %>% - select(Hugo_Symbol, Chromosome, Tumor_Sample_Barcode, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% - mutate(VAF = t_alt_count/t_depth, - populations = as.factor(populations)) %>% +plot_vaf <- function(ss, VAF_plot) { + pdf(VAF_plot, width = 8, height = 8) + plotC <- ss %>% + select(Hugo_Symbol, Chromosome, Tumor_Sample_Barcode, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate( + VAF = t_alt_count / t_depth, + populations = as.factor(populations) + ) %>% filter(!is.na(Tumor_Sample_Barcode)) %>% - ggplot(aes(x = Tumor_Sample_Barcode, - y = VAF, - group = interaction(populations, Start_Position), - color = populations)) + - geom_line(aes(color = populations), size=0.2, alpha=0.4)+ - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - xlab("Sample") + ylab("VAF") + - guides(colour = guide_legend(override.aes = list(alpha = 3)))+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) -print(plotC) -dev.off() + ggplot(aes( + x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations + )) + + geom_line(aes(color = populations), size = 0.2, alpha 
= 0.4) + + labs(title = paste("Best Tree", gsub(".json", "", best_tree_fileID), sep = " ")) + + xlab("Sample") + + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3))) + + theme( + panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2), axis.text = element_text(size = 16), axis.title = element_text(size = 18), + plot.margin = margin(1, 1., 1, 1.5, "cm"), axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)) + ) + print(plotC) + dev.off() } -plot_vaf(ss,VAF_plot) +plot_vaf(ss, VAF_plot) ############################################################# ##### Slope chart the best tree (VAF, coding regions, nonsense, missense and splicing sites) ####### -drivers <- read_tsv(driver_genes, col_names = "gene") %>% pull(gene) - -plot_vaf_coding<-function(maf,VAF_coding_plot){ - - coding <- ss %>% - select(Hugo_Symbol, HGVSp_Short, Chromosome, Tumor_Sample_Barcode, Variant_Classification, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% - mutate(VAF = t_alt_count/t_depth, - populations = as.factor(populations), - Tumor_Sample_Barcode = factor(Tumor_Sample_Barcode, levels = sample_order)) %>% - filter(!is.na(Tumor_Sample_Barcode), - !Variant_Classification %in% c("Silent", "RNA", "IGR", "Intron", "5'Flank", "3'Flank", "5'UTR")) %>% +drivers <- read_tsv(driver_genes, col_names = "gene") %>% pull(gene) + +plot_vaf_coding <- function(maf, VAF_coding_plot) { + coding <- ss %>% + select(Hugo_Symbol, HGVSp_Short, Chromosome, Tumor_Sample_Barcode, Variant_Classification, Start_Position, t_depth, t_alt_count, populations = phyloWGS_population) %>% + mutate( + VAF = t_alt_count / t_depth, + populations = as.factor(populations), + Tumor_Sample_Barcode = factor(Tumor_Sample_Barcode, levels = sample_order) + ) %>% + filter( + !is.na(Tumor_Sample_Barcode), + !Variant_Classification %in% c("Silent", "RNA", "IGR", "Intron", "5'Flank", "3'Flank", "5'UTR") + ) %>% mutate(label = ifelse(!is.na(HGVSp_Short), str_c(Hugo_Symbol, "_", HGVSp_Short), str_c(Hugo_Symbol, "_", Variant_Classification))) -pdf(VAF_coding_plot, width = 8, height =8 ) -plotD<-coding %>% - ggplot(aes(x = Tumor_Sample_Barcode, - y = VAF, - group = interaction(populations, Start_Position), - color = populations)) + - geom_line(aes(color = populations), size=0.5, alpha=0.4)+ - geom_text_repel( - data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[1], Hugo_Symbol %in% drivers), - aes(label = label, - x = Tumor_Sample_Barcode, - y = VAF), - nudge_x = -0.2, - size = 4 + pdf(VAF_coding_plot, width = 8, height = 8) + plotD <- coding %>% + ggplot(aes( + x = Tumor_Sample_Barcode, + y = VAF, + group = interaction(populations, Start_Position), + color = populations + )) + + geom_line(aes(color = populations), size = 0.5, alpha = 0.4) + + geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[1], Hugo_Symbol %in% drivers), + aes( + label = label, + x = Tumor_Sample_Barcode, + y = VAF + ), + nudge_x = -0.2, + size = 4 ) + - geom_text_repel( - data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[length(sample_order)], Hugo_Symbol %in% drivers), - aes(label = label, - x = Tumor_Sample_Barcode, - y = VAF), - nudge_x = 0.2, - size = 4 - 
) + - labs(title = paste("Best Tree",gsub(".json", "",best_tree_fileID), sep = " "))+ - xlab("Sample") + ylab("VAF") + - guides(colour = guide_legend(override.aes = list(alpha = 3)))+ - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"), - legend.key = element_rect(fill = NA, colour = NA, size = 2),axis.text=element_text(size=16),axis.title=element_text(size=18), - plot.margin = margin(1, 1., 1, 1.5, "cm"),axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10))) - -print(plotD) -dev.off() -} + geom_text_repel( + data = filter(group_by(coding, Hugo_Symbol, Start_Position), VAF == max(VAF), Tumor_Sample_Barcode == sample_order[length(sample_order)], Hugo_Symbol %in% drivers), + aes( + label = label, + x = Tumor_Sample_Barcode, + y = VAF + ), + nudge_x = 0.2, + size = 4 + ) + + labs(title = paste("Best Tree", gsub(".json", "", best_tree_fileID), sep = " ")) + + xlab("Sample") + + ylab("VAF") + + guides(colour = guide_legend(override.aes = list(alpha = 3))) + + theme( + panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), + legend.key = element_rect(fill = NA, colour = NA, size = 2), axis.text = element_text(size = 16), axis.title = element_text(size = 18), + plot.margin = margin(1, 1., 1, 1.5, "cm"), axis.title.y = element_text(margin = margin(t = 70, r = 20, b = 50, l = 10)) + ) + + print(plotD) + dev.off() +} -plot_vaf_coding(ss,VAF_coding_plot) +plot_vaf_coding(ss, VAF_coding_plot) ############################################# ##### Draw the best tree ####### ############################################# -tree_structure_long <- tree_structure %>% - pivot_longer(everything(), - names_to = "parent", - values_to = "node") %>% - mutate(parent = str_remove_all(parent, ".*[.]")) %>% - distinct() +tree_structure_long <- tree_structure %>% + pivot_longer(everything(), + names_to = "parent", + values_to = "node" + ) %>% + mutate(parent = str_remove_all(parent, ".*[.]")) %>% + distinct() -positions_x <- function(parents){ +positions_x <- function(parents) { x <- 1:length(unique(parents)) names(x) <- unique(parents) col_vals <- unname(x[parents]) @@ -455,39 +433,38 @@ positions_x <- function(parents){ tree_structure_long$x <- positions_x(tree_structure_long$parent) -positions_y <- function(tree_df){ - y = c("0" = 0.5) - for(parent in unique(tree_df$parent)){ +positions_y <- function(tree_df) { + y <- c("0" = 0.5) + for (parent in unique(tree_df$parent)) { # parent = "1" - child_index = 1 - num_children <- nrow(tree_df[tree_df$parent == parent,]) - if(num_children == 1){ - child <- tree_df[tree_df$parent == parent,]$node + child_index <- 1 + num_children <- nrow(tree_df[tree_df$parent == parent, ]) + if (num_children == 1) { + child <- tree_df[tree_df$parent == parent, ]$node child_y <- unname(y[parent]) names(child_y) <- child - y = c(y, child_y) - + y <- c(y, child_y) } else { - children <- tree_df[tree_df$parent == parent,]$node + children <- tree_df[tree_df$parent == parent, ]$node y_max <- unname(y[parent]) + (0.25 / child_index) y_min <- unname(y[parent]) - (0.25 / child_index) y_range <- seq(y_min, y_max, length.out = length(children)) names(y_range) <- children - y = c(y, y_range) + y <- c(y, y_range) } - child_index = child_index + 1 + child_index <- child_index + 1 } return(y) } tree_structure_long$y <- 
unname(positions_y(tree_structure_long)[as.character(tree_structure_long$node)]) -tree_structure_long <- add_row(tree_structure_long, parent = "0", node = 0, x = 0, y = 0.5) +tree_structure_long <- add_row(tree_structure_long, parent = "0", node = 0, x = 0, y = 0.5) -get_ssms <- function(tree_df, best_focal, best_tree_fileID){ +get_ssms <- function(tree_df, best_focal, best_tree_fileID) { data <- best_focal[[str_remove_all(best_tree_fileID, "[.].*")]]$populations ssm_vec <- c() - for(node in tree_df$node){ + for (node in tree_df$node) { # node = "1" num_ssms <- data[[as.character(node)]]$num_ssms names(num_ssms) <- as.character(node) @@ -498,39 +475,45 @@ get_ssms <- function(tree_df, best_focal, best_tree_fileID){ tree_structure_long$num_ssms <- get_ssms(tree_structure_long, best_focal, best_tree_fileID)[as.character(tree_structure_long$node)] -tree_structure_long <- tree_structure_long %>% +tree_structure_long <- tree_structure_long %>% mutate(parent = as.numeric(parent)) %>% - left_join(select(tree_structure_long, node, xstart = x, ystart = y), - by = c("parent" = "node")) - - - -ggplot(tree_structure_long, - aes(x = x, - y = y, - label = node)) + - geom_segment(inherit.aes = FALSE, - aes(x = xstart, - xend = x, - y = ystart, - yend = y)) + - geom_point(aes(size = num_ssms), - fill = "white", - colour = "black", - pch = 21) + - geom_text() + - scale_size(range = c(5,20)) + - ylim(0,1) + - theme_void() + - ggtitle(samplename) + + left_join(select(tree_structure_long, node, xstart = x, ystart = y), + by = c("parent" = "node") + ) + + + +plot_tree <- ggplot( + tree_structure_long, + aes( + x = x, + y = y, + label = node + ) +) + + geom_segment( + inherit.aes = FALSE, + aes( + x = xstart, + xend = x, + y = ystart, + yend = y + ) + ) + + geom_point(aes(size = num_ssms), + fill = "white", + colour = "black", + pch = 21 + ) + + geom_text() + + scale_size(range = c(5, 20)) + + ylim(0, 1) + + theme_void() + + ggtitle(samplename) + theme(legend.position = "none") - -ggsave(tree_plot, height = 6, width = 6) + +ggsave(tree_plot, plot_tree, height = 6, width = 6) ############ ##### END ## ############ - - - - diff --git a/modules/pyclone_vi/1.0/config/default.yaml b/modules/pyclone_vi/1.0/config/default.yaml index 2e7ee60b..c6638372 100644 --- a/modules/pyclone_vi/1.0/config/default.yaml +++ b/modules/pyclone_vi/1.0/config/default.yaml @@ -5,9 +5,9 @@ lcr-modules: # TODO: Update the list of available wildcards, if applicable inputs: # Available wildcards: {seq_type} {genome_build} {tumour_id} {normal_id} {pair_status} - sample_maf: "__UPDATE__" - sample_subclones: "__UPDATE__" - sample_cellularity: "__UPDATE__" + sample_maf: "__UPDATE__" # slms_3-1.0_vcf2maf-1.3/99-outputs/deblacklisted/augmented_maf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.slms-3.final.maf + sample_subclones: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt + sample_cellularity: "__UPDATE__" # battenberg/99-outputs/txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt sample_sex: "__UPDATE__" # Only {normal_id} available scratch_subdirectories: [] diff --git a/modules/pyclone_vi/1.0/pyclone_vi.smk b/modules/pyclone_vi/1.0/pyclone_vi.smk index a882ebd1..7321ea39 100644 --- a/modules/pyclone_vi/1.0/pyclone_vi.smk +++ b/modules/pyclone_vi/1.0/pyclone_vi.smk @@ -26,7 +26,6 @@ CFG = op.setup_module( ) # Define rules to be run locally when using a compute cluster -# TODO: Replace with actual rules once you change the rule names 
localrules:
     _pyclone_vi_write_results,
     _pyclone_vi_all
 
@@ -354,7 +353,7 @@ rule _pyclone_vi_output_tsv:
         tree = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.phyclone.tree.nwk",
         clusters = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/{patient_id}.phyclone.clusters.tsv"
     run:
-        op.relative_symlink(input.pyclone, output.pyclone)
+        op.relative_symlink(input.pyclone, output.pyclone, in_module = True)
         op.relative_symlink(input.phyclone, output.phyclone)
         op.relative_symlink(input.tree, output.tree)
         op.relative_symlink(input.clusters, output.clusters)
@@ -393,13 +392,6 @@ rule _pyclone_vi_all:
             seq_type=PATIENTS["tumour_seq_type"],
             genome_build=PATIENTS["tumour_genome_build"],
             patient_id=PATIENTS["tumour_patient_id"])
-        # expand(
-        #     str(rules._pyclone_run_analysis_pipeline.output.workdir),
-        #     zip,
-        #     seq_type=PATIENTS_CAPTURE["tumour_seq_type"],
-        #     genome_build=PATIENTS_CAPTURE["tumour_genome_build"],
-        #     patient_id=PATIENTS_CAPTURE["tumour_patient_id"]
-        # )
 
 ##### CLEANUP #####
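A note on the environment-resolution pattern used in PATCH 12: the modules locate a
pre-built conda environment by recomputing the md5 that Snakemake derives from the
conda prefix plus the environment YAML, then globbing on the first eight hex digits.
Because the glob can match stray files as well as the environment directory, the
patch keeps only the directory hit. A minimal standalone sketch of the same idea
(the helper name and return convention are illustrative, not part of the module):

    import glob
    import hashlib
    import os

    def find_env_dir(conda_prefix, env_yaml):
        # Hash the conda prefix plus the raw YAML bytes, mirroring the module code
        md5hash = hashlib.md5()
        md5hash.update(conda_prefix.encode())
        with open(env_yaml, "rb") as f:
            md5hash.update(f.read())
        # Managed environment directories begin with the first 8 hex digits of the hash
        hits = glob.glob(os.path.join(conda_prefix, md5hash.hexdigest()[:8] + "*"))
        dirs = [p for p in hits if os.path.isdir(p)]
        return dirs[0] if dirs else None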