From 0132dc37774a728162c614d5742e601c6d181884 Mon Sep 17 00:00:00 2001 From: willtyler Date: Wed, 4 Sep 2024 19:26:48 +0000 Subject: [PATCH] Support empty sample subset semantics --- tests/data/vcf/sample.vcf.gz | Bin 954 -> 984 bytes tests/data/vcf/sample.vcf.gz.csi | Bin 0 -> 189 bytes tests/data/vcf/sample.vcf.gz.tbi | Bin 185 -> 0 bytes tests/test_bcftools_validation.py | 1 + tests/test_query.py | 4 ++-- vcztools/cli.py | 4 ++++ vcztools/vcf_writer.py | 22 ++++++++++++++-------- 7 files changed, 21 insertions(+), 10 deletions(-) create mode 100644 tests/data/vcf/sample.vcf.gz.csi delete mode 100644 tests/data/vcf/sample.vcf.gz.tbi diff --git a/tests/data/vcf/sample.vcf.gz b/tests/data/vcf/sample.vcf.gz index 00f8a725f71596daf33d8ad1a948c47bce28eccf..cb00d955803f9235d19553a626e42ff9a7d4a3b0 100644 GIT binary patch delta 970 zcmV;*12z1*2iOOHABzYC000000RIL6LPG)oI|7Z9QE%EX6osF|uefToH&!`xoz`lZ zloC=Rg+L+go-fITSaR&tb~^Fzr$DG6ojRUuS@(SRT>IE{H(bz0=?bm?GLGK9U9*9^ zKMc{5KVs~T{b08P*EOoqIr-HtuIk33 zf(vE8rsIeB=KH{P>ks12;Jss$PN&20%GTDmjTgE(-8Qcjv6>=)bHf)-$X}C$v2xij!=AvfM+=EjyPaWDM96CET(vZ9Ghu zxf0(9ieu(Z5U*Yj@zkK;b_Z(`>JNDAAeb*_LADn>W*6M!V;7qb;-^bea8yI*rBAsm zOH0DyBuXdx>AQ>3C;BPY;-ifq={!u5``P3^NS@DCAqi@C?d1o_GmxN)q&?G))0cD6 zSk=3K zf{}4d6*#bpYY&Gjt_XtIDfQz6NVf6>8!o_qnQv#v9hey&6(c&I;o;FHyMF}DWK53J zb|X+=Q>Q!bz$}Zbf$0Yo{h$&8M=gZ&B~yx$3K}Y;!?B)DcsLE(AC3dZzWy4>r~Yu) z-wWDg?a>a$n|<6HTVj_OypH?Z7}%m(GVlCCE2;#K2^ESLrgsLf4C34svsXyF(BJ(UJY}rJe z6v)gbM#g(5kRYbX;zkxF(Xu4-h~-Ovx&XfAO3}Qf>@wYIjMv^MkYzUc;2%>ic>4eM z{CxTI%4lfSwK^ivXIR(QI<&q%46(04BAH$p={>`G+5`&A scJRUPeuD|;%k=N^iaMgYIyE_s{yw}N=PON9U;w+-{XW(2%{OEVms%%oZ~OKzkiEj^_J#|EWOwM@WV36 z-Fv|%1sIdxq4H%+CdE^gam4r zCaRc4aa#U8?LUsW12x%J7SZnA%Vb_gGKQ1!bok=^Gah);r7_(nE0ufH2oIU^#=BCz ztK1ln-*t;ZfGmCFNG(LM95Q`35}6 z0>@ENs?DYRV0L?V1C^EAmiuAL{V+~t`AhG^Tz%ietsYtoe#^kJ%;92N6cC1@%)vuV z*6QK0p&*S3*&^7imeUzdl&&haQD)%PK8iP(vhF2+*NFX*Yb3L0nXxmriefmPgini# zq_D{)WTJ`!E=2CIP5#pHD2|!i$`6G%P3F!hzS9=Pd@tjuJm8gz(;t|U8t}4s_QMJg zgg4AFIL77*JijFc4H_T4v{`P;(n872S(MK3qmv8v8$66ZzP173uRT0X#PgHzCtoj7 z14xa3Ug_b(OAOmY?aB*mfBJd}>ZjSE`KCW)9?2EtI9KoRQUm*=#mh6Ay)4mqfEG_f z1Vm{5)t{jHB9TrNsvAU8AO2;89-r*^A=cFK`2IZM1B$hF`)Ko zMG$$Qf*VH|t6%|w^J^6O1&R?wE$$MO7Y?z1EoQrb_7Ur%nZI13>C)r9!OWvp&mVfk zw0ncc-iT@9U@iu;hTiH3Dr{=l6w!LyX|^C_abXM7Fe^2eb8RjJ6^a}oegF_qePVXCbN{Q2Ts%|X8ZCg*n2$AES`eBt8skJ!g1~DDS?fJfc z7eaFv%Je4_j0#6EZZH=yP-X%q*fAJl4@_`dSSMx13FB_nnTag7H9uYnT=iuGUf7P| zkYD_-iT6qz|&liETZ}?$g^z~t7p{~~|fM9q3 O1C*=?O#}#&x&t1T_S8NA diff --git a/tests/data/vcf/sample.vcf.gz.csi b/tests/data/vcf/sample.vcf.gz.csi new file mode 100644 index 0000000000000000000000000000000000000000..e18a8341a88005da2cc1c4c4bcbbfd52956d57dd GIT binary patch literal 189 zcmb2|=3rp}f&Xj_PR>jW3mA&|zNI`#PDpsbkR+;*#+2ly!OL_iKuRS)<77h00fB=D zc&2`z!*VH+IqVci7;EBlr_A-9T0J}g2U=X^(%5-6IGsJ?Y9xGYU$24ixilA{?I}m{ z)P4TPsBxTaOH1d!U^64mO{Lh(`K6}A&$Aw%49w2hL|7d7(c@9X!&5zD<4(@lc@r6} Z85rEdLsaH4GBC)aIZ&E`8SGvV0RU5xJtzPG literal 0 HcmV?d00001 diff --git a/tests/data/vcf/sample.vcf.gz.tbi b/tests/data/vcf/sample.vcf.gz.tbi deleted file mode 100644 index 1a63a1ad1c043db7996911fb629b865210cddeff..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 185 zcmb2|=3rp}f&Xj_PR>jWa~QY_Kc%FkB_t#;CAn$vGMx&T#oWQGE7`*%p>&{Vo|8c7 zv^4@hQv+^g3%Tg;vN&TS;c)GASBjj^UJmIiHY+T8;#q{-Z!rhg_sTkLUU>5|^Hu)K z-`TwU8@mqwF!*a1amT=`|B^z0iz2Ug_KBbD)80u)JZsz}Zxg$?+t_FUQ^;Nc28Lx} T^;t)l85rcz94E~H6odi*0~|b8 diff --git a/tests/test_bcftools_validation.py b/tests/test_bcftools_validation.py index 80d1800..63d8fce 100644 --- a/tests/test_bcftools_validation.py +++ b/tests/test_bcftools_validation.py @@ -56,6 +56,7 @@ def run_vcztools(args: str) -> str: ("view --no-version -s NA00001", "sample.vcf.gz"), ("view --no-version -s NA00001,NA00003", "sample.vcf.gz"), ("view --no-version -s HG00096", "1kg_2020_chrM.vcf.gz"), + ("view --no-version -s '' --force-samples", "sample.vcf.gz") ] ) # fmt: on diff --git a/tests/test_query.py b/tests/test_query.py index 26555fa..c1d9691 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -82,8 +82,8 @@ def root(self): "19:111\n19:112\n20:14370\n20:17330\n20:1110696\n20:1230237\n20:1234567\n20:1235237\nX:10\n", ), (r"%INFO/DP\n", ".\n.\n14\n11\n10\n13\n9\n.\n.\n"), - (r"%AC\n", ".\n.\n.\n.\n.\n.\n3,1\n.\n.\n"), - (r"%AC{0}\n", ".\n.\n.\n.\n.\n.\n3\n.\n.\n"), + (r"%AC\n", ".\n.\n.\n.\n.\n.\n1,1\n.\n.\n"), + (r"%AC{0}\n", ".\n.\n.\n.\n.\n.\n1\n.\n.\n"), ], ) def test(self, root, query_format, expected_result): diff --git a/vcztools/cli.py b/vcztools/cli.py index fe82bf1..740e2bd 100644 --- a/vcztools/cli.py +++ b/vcztools/cli.py @@ -85,6 +85,9 @@ def query(path, list_samples, format): default=None, help="Regions to include.", ) +@click.option( + "--force-samples", is_flag=True, help="Only warn about unknown sample subsets." +) @click.option( "-s", "--samples", @@ -120,6 +123,7 @@ def view( no_version, regions, targets, + force_samples, samples, drop_genotypes, include, diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index f57b67e..a3583fb 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -153,6 +153,8 @@ def write_vcf( else: all_samples = root["sample_id"][:] sample_ids = np.array(samples.split(",")) + if np.all(sample_ids == np.array("")): + sample_ids = np.empty((0,)) samples_selection = search(all_samples, sample_ids) if not no_header and vcf_header is None: @@ -347,6 +349,7 @@ def c_chunk_to_vcf( else: gt = get_vchunk_array(array, v_chunk, v_mask_chunk) + # Recompute INFO/AC and INFO/AN if samples_selection is not None: flatter_gt = gt.reshape((gt.shape[0], gt.shape[1] * gt.shape[2])) @@ -363,7 +366,10 @@ def filter_and_bincount(values: np.ndarray): computed_AN = np.count_nonzero(computed_AN + 1, axis=1).astype(np.int8) info_fields["AC"] = computed_AC info_fields["AN"] = computed_AN - if "call_genotype_phased" in root and not drop_genotypes: + + if num_samples == 0: + gt = None + if "call_genotype_phased" in root and not drop_genotypes and num_samples > 0: array = root["call_genotype_phased"] gt_phased = get_vchunk_array( array, v_chunk, v_mask_chunk, samples_selection @@ -397,13 +403,13 @@ def filter_and_bincount(values: np.ndarray): array = array.reshape((num_variants, 1)) encoder.add_info_field(name, array) - for name, array in format_fields.items(): - assert num_samples > 0 - if array.dtype.kind in ("O", "U"): - array = array.astype("S") - if len(array.shape) == 2: - array = array.reshape((num_variants, num_samples, 1)) - encoder.add_format_field(name, array) + if num_samples != 0: + for name, array in format_fields.items(): + if array.dtype.kind in ("O", "U"): + array = array.astype("S") + if len(array.shape) == 2: + array = array.reshape((num_variants, num_samples, 1)) + encoder.add_format_field(name, array) # TODO: (1) make a guess at this based on number of fields and samples, # and (2) log a DEBUG message when we have to double. buflen = 1024