From e5ae887bd5a5e0ec8d3f0a6cb72387a1cfce9969 Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 27 Sep 2024 13:16:29 +0100 Subject: [PATCH 1/2] Get INFO/FORMAT description from attrs if present --- vcztools/vcf_writer.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index c7db961..53dae3b 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -489,10 +489,9 @@ def _generate_header(ds, original_header, sample_ids, *, no_version: bool = Fals category = "INFO" vcf_number = _array_to_vcf_number(category, key, arr) vcf_type = _array_to_vcf_type(arr) - if "comment" in arr.attrs: - vcf_description = arr.attrs["comment"] - else: - vcf_description = RESERVED_INFO_KEY_DESCRIPTIONS.get(key, "") + vcf_description = arr.attrs.get( + "description", RESERVED_INFO_KEY_DESCRIPTIONS.get(key, "") + ) print( f'##INFO=', file=output, @@ -514,10 +513,9 @@ def _generate_header(ds, original_header, sample_ids, *, no_version: bool = Fals category = "FORMAT" vcf_number = _array_to_vcf_number(category, key, arr) vcf_type = _array_to_vcf_type(arr) - if "comment" in arr.attrs: - vcf_description = arr.attrs["comment"] - else: - vcf_description = RESERVED_FORMAT_KEY_DESCRIPTIONS.get(key, "") + vcf_description = arr.attrs.get( + "description", RESERVED_FORMAT_KEY_DESCRIPTIONS.get(key, "") + ) print( f'##FORMAT=', file=output, From d590ebdd11a8ab8718deb3fcb88bbf71779aaea8 Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 27 Sep 2024 13:43:21 +0100 Subject: [PATCH 2/2] Test generated header --- tests/test_vcf_writer.py | 44 ++++++++++++++++++++++++++++++++++++++++ vcztools/vcf_writer.py | 2 -- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py index 5e562ae..7f680f4 100644 --- a/tests/test_vcf_writer.py +++ b/tests/test_vcf_writer.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import zarr +from bio2zarr import vcf2zarr from cyvcf2 import VCF from numpy.testing import assert_array_equal @@ -269,6 +271,48 @@ def test_write_vcf__header_flags(tmp_path): assert_vcfs_close(original, output) +def test_write_vcf__generate_header(tmp_path): + original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" + # don't use cache here since we mutate the vcz + vcz = tmp_path.joinpath("intermediate.vcz") + vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False) + + # remove vcf_header + root = zarr.open(vcz, mode="r+") + del root.attrs["vcf_header"] + + output_header = StringIO() + write_vcf(vcz, output_header, header_only=True, no_version=True) + + expected_vcf_header = """##fileformat=VCFv4.3 +##source={} +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +""" # noqa: E501 + + # substitute value of source + expected_vcf_header = expected_vcf_header.format(root.attrs["source"]) + + assert output_header.getvalue() == expected_vcf_header + + def test_compute_info_fields(): gt = np.array([ [[0, 0], [0, 1], [1, 1]], diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index 53dae3b..d4682c3 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -445,8 +445,6 @@ def _generate_header(ds, original_header, sample_ids, *, no_version: bool = Fals # [1.4.1 File format] print("##fileformat=VCFv4.3", file=output) - print('##FILTER=', file=output) - if "source" in ds.attrs: print(f'##source={ds.attrs["source"]}', file=output)