From 4e44ae94e24442cb7c903f68556f1d494bcba266 Mon Sep 17 00:00:00 2001 From: willtyler Date: Wed, 16 Oct 2024 18:43:37 +0000 Subject: [PATCH 1/5] Convert make rules to pattern rules --- performance/data/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/performance/data/Makefile b/performance/data/Makefile index cabfc63..18a4134 100644 --- a/performance/data/Makefile +++ b/performance/data/Makefile @@ -16,14 +16,14 @@ all: sim_10k.vcz sim_10k.ts: stdpopsim HomSap -c chr22 -o sim_10k.ts pop_0:10000 -sim_10k.vcf.gz: sim_10k.ts - tskit vcf sim_10k.ts | bgzip > sim_10k.vcf.gz +%.vcf.gz: %.ts + tskit vcf $< | bgzip > $@ -sim_10k.vcf.gz.csi: sim_10k.vcf.gz - bcftools index sim_10k.vcf.gz +%.vcf.gz.csi: %.vcf.gz + bcftools index $< -sim_10k.vcz: sim_10k.vcf.gz sim_10k.vcf.gz.csi - vcf2zarr convert sim_10k.vcf.gz sim_10k.vcz +%.vcz: %.vcf.gz %.vcf.gz.csi + vcf2zarr convert $< $@ clean: rm -rf sim_10k.* From 5c23c0a275305acfb56630c65903ecce8cb7e2d9 Mon Sep 17 00:00:00 2001 From: willtyler Date: Thu, 17 Oct 2024 00:22:53 +0000 Subject: [PATCH 2/5] Add rules for real genome data --- performance/data/Makefile | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/performance/data/Makefile b/performance/data/Makefile index 18a4134..8b25b85 100644 --- a/performance/data/Makefile +++ b/performance/data/Makefile @@ -9,13 +9,31 @@ # The Python requirements are listed in requirements.txt: # pip install -r requirements.txt -.PHONY: all clean +# Flags / commandline arguments: +CHROMOSOME ?= 22 +WGS ?= 1 -all: sim_10k.vcz +ifeq ($(WGS), 1) + TGP_URL = "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20190425_NYGC_GATK/CCDG_13607_B01_GRM_WGS_2019-02-19_chr$(CHROMOSOME).recalibrated_variants.vcf.gz" +else + # Use URL for genotyping data: + TGP_URL = "http://hgdownload.cse.ucsc.edu/gbdb/hg19/1000Genomes/phase3/ALL.chr$(CHROMOSOME).phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz" +endif + +.PHONY: all simulated real clean + +all: simulated real + +simulated: sim_10k.vcz + +real: chr22.vcz sim_10k.ts: stdpopsim HomSap -c chr22 -o sim_10k.ts pop_0:10000 +chr22.vcf.gz: + bcftools view $(TGP_URL) | head -n 25000 | bcftools view -O z -o chr22.vcf.gz + %.vcf.gz: %.ts tskit vcf $< | bgzip > $@ From ebf668db8a1d195d08b5444f5002b4253ad424fd Mon Sep 17 00:00:00 2001 From: willtyler Date: Thu, 17 Oct 2024 00:25:03 +0000 Subject: [PATCH 3/5] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 53121a2..312f8a7 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,4 @@ cython_debug/ .vscode vcz_test_cache/ +**/.DS_Store From 7fcec5fecbde9b3d488cd88fe5aa6fc1cb00dfe9 Mon Sep 17 00:00:00 2001 From: willtyler Date: Thu, 17 Oct 2024 00:39:16 +0000 Subject: [PATCH 4/5] Refactor performance/compare.py --- performance/compare.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/performance/compare.py b/performance/compare.py index 1ed1984..5415211 100644 --- a/performance/compare.py +++ b/performance/compare.py @@ -27,24 +27,19 @@ def run_vcztools(command: str, dataset_name: str): if __name__ == "__main__": commands = [ - "view", - "view -s tsk_7068,tsk_8769,tsk_8820", - r"query -f '%CHROM %POS %REF %ALT{0}\n'", - r"query -f '%CHROM:%POS\n' -i 'POS=49887394 | POS=50816415'", - "view -s '' --force-samples", + ("view", "sim_10k"), + ("view -s tsk_7068,tsk_8769,tsk_8820", "sim_10k"), + (r"query -f '%CHROM %POS %REF %ALT{0}\n'", "sim_10k"), + (r"query -f '%CHROM:%POS\n' -i 'POS=49887394 | POS=50816415'", "sim_10k"), + ("view -s '' --force-samples", "sim_10k"), ] - dataset = "sim_10k" if len(sys.argv) == 2 and sys.argv[1].isnumeric(): index = int(sys.argv[1]) - command = commands[index] - run_bcftools(command, dataset) - run_vcztools(command, dataset) - elif len(sys.argv) >= 2: - command = " ".join(sys.argv[1:]) + command, dataset = commands[index] run_bcftools(command, dataset) run_vcztools(command, dataset) else: - for command in commands: + for command, dataset in commands: run_bcftools(command, dataset) run_vcztools(command, dataset) From 8461ac257bd1efb43b7e7a06cc923a7f99851fa9 Mon Sep 17 00:00:00 2001 From: willtyler Date: Thu, 17 Oct 2024 00:57:47 +0000 Subject: [PATCH 5/5] Add more performance commands --- performance/compare.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/performance/compare.py b/performance/compare.py index 5415211..3af4680 100644 --- a/performance/compare.py +++ b/performance/compare.py @@ -32,6 +32,9 @@ def run_vcztools(command: str, dataset_name: str): (r"query -f '%CHROM %POS %REF %ALT{0}\n'", "sim_10k"), (r"query -f '%CHROM:%POS\n' -i 'POS=49887394 | POS=50816415'", "sim_10k"), ("view -s '' --force-samples", "sim_10k"), + ("view -i 'FMT/DP>10 & FMT/GQ>10'", "chr22"), + ("view -i 'QUAL>10 || FMT/GQ>10'", "chr22"), + (r"query -f 'GQ:[ %GQ] \t GT:[ %GT]\n'", "chr22"), ] if len(sys.argv) == 2 and sys.argv[1].isnumeric():