diff --git a/tests/data/vcf/sample.vcf.gz b/tests/data/vcf/sample.vcf.gz index 00f8a72..cb00d95 100644 Binary files a/tests/data/vcf/sample.vcf.gz and b/tests/data/vcf/sample.vcf.gz differ diff --git a/tests/data/vcf/sample.vcf.gz.csi b/tests/data/vcf/sample.vcf.gz.csi new file mode 100644 index 0000000..e18a834 Binary files /dev/null and b/tests/data/vcf/sample.vcf.gz.csi differ diff --git a/tests/data/vcf/sample.vcf.gz.tbi b/tests/data/vcf/sample.vcf.gz.tbi deleted file mode 100644 index 1a63a1a..0000000 Binary files a/tests/data/vcf/sample.vcf.gz.tbi and /dev/null differ diff --git a/tests/test_bcftools_validation.py b/tests/test_bcftools_validation.py index 80d1800..63d8fce 100644 --- a/tests/test_bcftools_validation.py +++ b/tests/test_bcftools_validation.py @@ -56,6 +56,7 @@ def run_vcztools(args: str) -> str: ("view --no-version -s NA00001", "sample.vcf.gz"), ("view --no-version -s NA00001,NA00003", "sample.vcf.gz"), ("view --no-version -s HG00096", "1kg_2020_chrM.vcf.gz"), + ("view --no-version -s '' --force-samples", "sample.vcf.gz") ] ) # fmt: on diff --git a/tests/test_query.py b/tests/test_query.py index 26555fa..c1d9691 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -82,8 +82,8 @@ def root(self): "19:111\n19:112\n20:14370\n20:17330\n20:1110696\n20:1230237\n20:1234567\n20:1235237\nX:10\n", ), (r"%INFO/DP\n", ".\n.\n14\n11\n10\n13\n9\n.\n.\n"), - (r"%AC\n", ".\n.\n.\n.\n.\n.\n3,1\n.\n.\n"), - (r"%AC{0}\n", ".\n.\n.\n.\n.\n.\n3\n.\n.\n"), + (r"%AC\n", ".\n.\n.\n.\n.\n.\n1,1\n.\n.\n"), + (r"%AC{0}\n", ".\n.\n.\n.\n.\n.\n1\n.\n.\n"), ], ) def test(self, root, query_format, expected_result): diff --git a/vcztools/cli.py b/vcztools/cli.py index fe82bf1..740e2bd 100644 --- a/vcztools/cli.py +++ b/vcztools/cli.py @@ -85,6 +85,9 @@ def query(path, list_samples, format): default=None, help="Regions to include.", ) +@click.option( + "--force-samples", is_flag=True, help="Only warn about unknown sample subsets." +) @click.option( "-s", "--samples", @@ -120,6 +123,7 @@ def view( no_version, regions, targets, + force_samples, samples, drop_genotypes, include, diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index f57b67e..a3583fb 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -153,6 +153,8 @@ def write_vcf( else: all_samples = root["sample_id"][:] sample_ids = np.array(samples.split(",")) + if np.all(sample_ids == np.array("")): + sample_ids = np.empty((0,)) samples_selection = search(all_samples, sample_ids) if not no_header and vcf_header is None: @@ -347,6 +349,7 @@ def c_chunk_to_vcf( else: gt = get_vchunk_array(array, v_chunk, v_mask_chunk) + # Recompute INFO/AC and INFO/AN if samples_selection is not None: flatter_gt = gt.reshape((gt.shape[0], gt.shape[1] * gt.shape[2])) @@ -363,7 +366,10 @@ def filter_and_bincount(values: np.ndarray): computed_AN = np.count_nonzero(computed_AN + 1, axis=1).astype(np.int8) info_fields["AC"] = computed_AC info_fields["AN"] = computed_AN - if "call_genotype_phased" in root and not drop_genotypes: + + if num_samples == 0: + gt = None + if "call_genotype_phased" in root and not drop_genotypes and num_samples > 0: array = root["call_genotype_phased"] gt_phased = get_vchunk_array( array, v_chunk, v_mask_chunk, samples_selection @@ -397,13 +403,13 @@ def filter_and_bincount(values: np.ndarray): array = array.reshape((num_variants, 1)) encoder.add_info_field(name, array) - for name, array in format_fields.items(): - assert num_samples > 0 - if array.dtype.kind in ("O", "U"): - array = array.astype("S") - if len(array.shape) == 2: - array = array.reshape((num_variants, num_samples, 1)) - encoder.add_format_field(name, array) + if num_samples != 0: + for name, array in format_fields.items(): + if array.dtype.kind in ("O", "U"): + array = array.astype("S") + if len(array.shape) == 2: + array = array.reshape((num_variants, num_samples, 1)) + encoder.add_format_field(name, array) # TODO: (1) make a guess at this based on number of fields and samples, # and (2) log a DEBUG message when we have to double. buflen = 1024