diff --git a/lib/vcf_encoder.c b/lib/vcf_encoder.c index b155f9a..de041c3 100644 --- a/lib/vcf_encoder.c +++ b/lib/vcf_encoder.c @@ -663,9 +663,11 @@ vcz_variant_encoder_write_format_fields(const vcz_variant_encoder_t *self, if (all_missing) { for (j = 0; j < num_samples + 1; j++) { - offset = append_string(buf, ".\t", 2, offset, buflen); - if (offset < 0) { - goto out; + if (num_samples > 0) { + offset = append_string(buf, ".\t", 2, offset, buflen); + if (offset < 0) { + goto out; + } } } } else { diff --git a/tests/test_bcftools_validation.py b/tests/test_bcftools_validation.py index 02f9de2..ba423d5 100644 --- a/tests/test_bcftools_validation.py +++ b/tests/test_bcftools_validation.py @@ -49,6 +49,10 @@ def run_vcztools(args: str) -> str: "view --no-version -e '(FMT/DP >= 8 | FMT/GQ>40) && POS > 100000'", "sample.vcf.gz" ), + ( + "view --no-version -G", + "sample.vcf.gz" + ), ] ) # fmt: on diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py index 9c9654d..d1f79ed 100644 --- a/tests/test_vcf_writer.py +++ b/tests/test_vcf_writer.py @@ -174,6 +174,18 @@ def test_write_vcf__samples(tmp_path, samples, expected_genotypes): assert variant.genotypes == expected_genotypes +def test_write_vcf__no_samples(tmp_path): + original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" + vcz = vcz_path_cache(original) + output = tmp_path.joinpath("output.vcf") + + write_vcf(vcz, output, drop_genotypes=True) + + v = VCF(output) + + assert v.samples == [] + + @pytest.mark.parametrize( ("regions", "targets", "samples", "include", "expected_chrom_pos"), [ diff --git a/vcztools/cli.py b/vcztools/cli.py index 508db90..3376ced 100644 --- a/vcztools/cli.py +++ b/vcztools/cli.py @@ -89,6 +89,13 @@ def query(path, list_samples, format): default=None, help="Samples to include.", ) +@click.option( + "-G", + "--drop-genotypes", + type=bool, + is_flag=True, + help="Drop genotypes.", +) @click.option( "-t", "--targets", @@ -110,6 +117,7 @@ def view( regions, targets, samples, + drop_genotypes, include, exclude, ): @@ -121,6 +129,7 @@ def view( variant_regions=regions, variant_targets=targets, samples=samples, + drop_genotypes=drop_genotypes, include=include, exclude=exclude, ) diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index b3df584..871b536 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -85,6 +85,7 @@ def write_vcf( variant_regions=None, variant_targets=None, samples=None, + drop_genotypes: bool = False, include: Optional[str] = None, exclude: Optional[str] = None, ) -> None: @@ -138,7 +139,12 @@ def write_vcf( root = zarr.open(vcz, mode="r") with open_file_like(output) as output: - if samples is None: + if samples and drop_genotypes: + raise ValueError("Cannot select samples and drop genotypes.") + elif drop_genotypes: + sample_ids = [] + samples_selection = np.array([]) + elif samples is None: sample_ids = root["sample_id"][:] samples_selection = None else: @@ -300,7 +306,11 @@ def c_chunk_to_vcf( info_fields = {} num_samples = len(samples_selection) if samples_selection is not None else None for name, array in root.items(): - if name.startswith("call_") and not name.startswith("call_genotype"): + if ( + name.startswith("call_") + and not name.startswith("call_genotype") + and num_samples != 0 + ): vcf_name = name[len("call_") :] format_fields[vcf_name] = get_vchunk_array( array, v_chunk, v_mask_chunk, samples_selection @@ -313,7 +323,7 @@ def c_chunk_to_vcf( gt = None gt_phased = None - if "call_genotype" in root: + if "call_genotype" in root and num_samples != 0: array = root["call_genotype"] gt = get_vchunk_array(array, v_chunk, v_mask_chunk, samples_selection) if "call_genotype_phased" in root: @@ -380,7 +390,7 @@ def _generate_header(ds, original_header, sample_ids): info_fields = [] format_fields = [] - if "call_genotype" in ds: + if "call_genotype" in ds and len(sample_ids) > 0: # GT must be the first field if present, per the spec (section 1.6.2) format_fields.append("GT") @@ -395,7 +405,8 @@ def _generate_header(ds, original_header, sample_ids): key = var[len("variant_") :] info_fields.append(key) elif ( - var.startswith("call_") + len(sample_ids) > 0 + and var.startswith("call_") and not var.endswith("_fill") and not var.endswith("_mask") and dims(arr)[0] == "variants"