Skip to content

Commit

Permalink
Support empty sample subset semantics
Browse files Browse the repository at this point in the history
  • Loading branch information
Will-Tyler committed Sep 4, 2024
1 parent 5d5ee37 commit 0132dc3
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 10 deletions.
Binary file modified tests/data/vcf/sample.vcf.gz
Binary file not shown.
Binary file added tests/data/vcf/sample.vcf.gz.csi
Binary file not shown.
Binary file removed tests/data/vcf/sample.vcf.gz.tbi
Binary file not shown.
1 change: 1 addition & 0 deletions tests/test_bcftools_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def run_vcztools(args: str) -> str:
("view --no-version -s NA00001", "sample.vcf.gz"),
("view --no-version -s NA00001,NA00003", "sample.vcf.gz"),
("view --no-version -s HG00096", "1kg_2020_chrM.vcf.gz"),
("view --no-version -s '' --force-samples", "sample.vcf.gz")
]
)
# fmt: on
Expand Down
4 changes: 2 additions & 2 deletions tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ def root(self):
"19:111\n19:112\n20:14370\n20:17330\n20:1110696\n20:1230237\n20:1234567\n20:1235237\nX:10\n",
),
(r"%INFO/DP\n", ".\n.\n14\n11\n10\n13\n9\n.\n.\n"),
(r"%AC\n", ".\n.\n.\n.\n.\n.\n3,1\n.\n.\n"),
(r"%AC{0}\n", ".\n.\n.\n.\n.\n.\n3\n.\n.\n"),
(r"%AC\n", ".\n.\n.\n.\n.\n.\n1,1\n.\n.\n"),
(r"%AC{0}\n", ".\n.\n.\n.\n.\n.\n1\n.\n.\n"),
],
)
def test(self, root, query_format, expected_result):
Expand Down
4 changes: 4 additions & 0 deletions vcztools/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ def query(path, list_samples, format):
default=None,
help="Regions to include.",
)
@click.option(
"--force-samples", is_flag=True, help="Only warn about unknown sample subsets."
)
@click.option(
"-s",
"--samples",
Expand Down Expand Up @@ -120,6 +123,7 @@ def view(
no_version,
regions,
targets,
force_samples,
samples,
drop_genotypes,
include,
Expand Down
22 changes: 14 additions & 8 deletions vcztools/vcf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ def write_vcf(
else:
all_samples = root["sample_id"][:]
sample_ids = np.array(samples.split(","))
if np.all(sample_ids == np.array("")):
sample_ids = np.empty((0,))
samples_selection = search(all_samples, sample_ids)

if not no_header and vcf_header is None:
Expand Down Expand Up @@ -347,6 +349,7 @@ def c_chunk_to_vcf(
else:
gt = get_vchunk_array(array, v_chunk, v_mask_chunk)

# Recompute INFO/AC and INFO/AN
if samples_selection is not None:
flatter_gt = gt.reshape((gt.shape[0], gt.shape[1] * gt.shape[2]))

Expand All @@ -363,7 +366,10 @@ def filter_and_bincount(values: np.ndarray):
computed_AN = np.count_nonzero(computed_AN + 1, axis=1).astype(np.int8)
info_fields["AC"] = computed_AC
info_fields["AN"] = computed_AN
if "call_genotype_phased" in root and not drop_genotypes:

if num_samples == 0:
gt = None
if "call_genotype_phased" in root and not drop_genotypes and num_samples > 0:
array = root["call_genotype_phased"]
gt_phased = get_vchunk_array(
array, v_chunk, v_mask_chunk, samples_selection
Expand Down Expand Up @@ -397,13 +403,13 @@ def filter_and_bincount(values: np.ndarray):
array = array.reshape((num_variants, 1))
encoder.add_info_field(name, array)

for name, array in format_fields.items():
assert num_samples > 0
if array.dtype.kind in ("O", "U"):
array = array.astype("S")
if len(array.shape) == 2:
array = array.reshape((num_variants, num_samples, 1))
encoder.add_format_field(name, array)
if num_samples != 0:
for name, array in format_fields.items():
if array.dtype.kind in ("O", "U"):
array = array.astype("S")
if len(array.shape) == 2:
array = array.reshape((num_variants, num_samples, 1))
encoder.add_format_field(name, array)
# TODO: (1) make a guess at this based on number of fields and samples,
# and (2) log a DEBUG message when we have to double.
buflen = 1024
Expand Down

0 comments on commit 0132dc3

Please sign in to comment.