diff --git a/.gitignore b/.gitignore index 53121a2..312f8a7 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,4 @@ cython_debug/ .vscode vcz_test_cache/ +**/.DS_Store diff --git a/performance/compare.py b/performance/compare.py index 1ed1984..3af4680 100644 --- a/performance/compare.py +++ b/performance/compare.py @@ -27,24 +27,22 @@ def run_vcztools(command: str, dataset_name: str): if __name__ == "__main__": commands = [ - "view", - "view -s tsk_7068,tsk_8769,tsk_8820", - r"query -f '%CHROM %POS %REF %ALT{0}\n'", - r"query -f '%CHROM:%POS\n' -i 'POS=49887394 | POS=50816415'", - "view -s '' --force-samples", + ("view", "sim_10k"), + ("view -s tsk_7068,tsk_8769,tsk_8820", "sim_10k"), + (r"query -f '%CHROM %POS %REF %ALT{0}\n'", "sim_10k"), + (r"query -f '%CHROM:%POS\n' -i 'POS=49887394 | POS=50816415'", "sim_10k"), + ("view -s '' --force-samples", "sim_10k"), + ("view -i 'FMT/DP>10 & FMT/GQ>10'", "chr22"), + ("view -i 'QUAL>10 || FMT/GQ>10'", "chr22"), + (r"query -f 'GQ:[ %GQ] \t GT:[ %GT]\n'", "chr22"), ] - dataset = "sim_10k" if len(sys.argv) == 2 and sys.argv[1].isnumeric(): index = int(sys.argv[1]) - command = commands[index] - run_bcftools(command, dataset) - run_vcztools(command, dataset) - elif len(sys.argv) >= 2: - command = " ".join(sys.argv[1:]) + command, dataset = commands[index] run_bcftools(command, dataset) run_vcztools(command, dataset) else: - for command in commands: + for command, dataset in commands: run_bcftools(command, dataset) run_vcztools(command, dataset) diff --git a/performance/data/Makefile b/performance/data/Makefile index cabfc63..8b25b85 100644 --- a/performance/data/Makefile +++ b/performance/data/Makefile @@ -9,21 +9,39 @@ # The Python requirements are listed in requirements.txt: # pip install -r requirements.txt -.PHONY: all clean +# Flags / commandline arguments: +CHROMOSOME ?= 22 +WGS ?= 1 -all: sim_10k.vcz +ifeq ($(WGS), 1) + TGP_URL = "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20190425_NYGC_GATK/CCDG_13607_B01_GRM_WGS_2019-02-19_chr$(CHROMOSOME).recalibrated_variants.vcf.gz" +else + # Use URL for genotyping data: + TGP_URL = "http://hgdownload.cse.ucsc.edu/gbdb/hg19/1000Genomes/phase3/ALL.chr$(CHROMOSOME).phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz" +endif + +.PHONY: all simulated real clean + +all: simulated real + +simulated: sim_10k.vcz + +real: chr22.vcz sim_10k.ts: stdpopsim HomSap -c chr22 -o sim_10k.ts pop_0:10000 -sim_10k.vcf.gz: sim_10k.ts - tskit vcf sim_10k.ts | bgzip > sim_10k.vcf.gz +chr22.vcf.gz: + bcftools view $(TGP_URL) | head -n 25000 | bcftools view -O z -o chr22.vcf.gz + +%.vcf.gz: %.ts + tskit vcf $< | bgzip > $@ -sim_10k.vcf.gz.csi: sim_10k.vcf.gz - bcftools index sim_10k.vcf.gz +%.vcf.gz.csi: %.vcf.gz + bcftools index $< -sim_10k.vcz: sim_10k.vcf.gz sim_10k.vcf.gz.csi - vcf2zarr convert sim_10k.vcf.gz sim_10k.vcz +%.vcz: %.vcf.gz %.vcf.gz.csi + vcf2zarr convert $< $@ clean: rm -rf sim_10k.*