Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get INFO/FORMAT description from attrs if present #87

Merged
merged 2 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions tests/test_vcf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import numpy as np
import pytest
import zarr
from bio2zarr import vcf2zarr
from cyvcf2 import VCF
from numpy.testing import assert_array_equal

Expand Down Expand Up @@ -269,6 +271,48 @@ def test_write_vcf__header_flags(tmp_path):
assert_vcfs_close(original, output)


def test_write_vcf__generate_header(tmp_path):
original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
# don't use cache here since we mutate the vcz
vcz = tmp_path.joinpath("intermediate.vcz")
vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False)

# remove vcf_header
root = zarr.open(vcz, mode="r+")
del root.attrs["vcf_header"]

output_header = StringIO()
write_vcf(vcz, output_header, header_only=True, no_version=True)

expected_vcf_header = """##fileformat=VCFv4.3
##source={}
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
##INFO=<ID=AC,Number=2,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=2,Type=Float,Description="Allele Frequency">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##FILTER=<ID=PASS,Description="">
##FILTER=<ID=s50,Description="">
##FILTER=<ID=q10,Description="">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
##contig=<ID=19>
##contig=<ID=20>
##contig=<ID=X>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
""" # noqa: E501

# substitute value of source
expected_vcf_header = expected_vcf_header.format(root.attrs["source"])

assert output_header.getvalue() == expected_vcf_header


def test_compute_info_fields():
gt = np.array([
[[0, 0], [0, 1], [1, 1]],
Expand Down
16 changes: 6 additions & 10 deletions vcztools/vcf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,8 +445,6 @@ def _generate_header(ds, original_header, sample_ids, *, no_version: bool = Fals
# [1.4.1 File format]
print("##fileformat=VCFv4.3", file=output)

print('##FILTER=<ID=PASS,Description="All filters passed">', file=output)

if "source" in ds.attrs:
print(f'##source={ds.attrs["source"]}', file=output)

Expand Down Expand Up @@ -489,10 +487,9 @@ def _generate_header(ds, original_header, sample_ids, *, no_version: bool = Fals
category = "INFO"
vcf_number = _array_to_vcf_number(category, key, arr)
vcf_type = _array_to_vcf_type(arr)
if "comment" in arr.attrs:
vcf_description = arr.attrs["comment"]
else:
vcf_description = RESERVED_INFO_KEY_DESCRIPTIONS.get(key, "")
vcf_description = arr.attrs.get(
"description", RESERVED_INFO_KEY_DESCRIPTIONS.get(key, "")
)
print(
f'##INFO=<ID={key},Number={vcf_number},Type={vcf_type},Description="{vcf_description}">',
file=output,
Expand All @@ -514,10 +511,9 @@ def _generate_header(ds, original_header, sample_ids, *, no_version: bool = Fals
category = "FORMAT"
vcf_number = _array_to_vcf_number(category, key, arr)
vcf_type = _array_to_vcf_type(arr)
if "comment" in arr.attrs:
vcf_description = arr.attrs["comment"]
else:
vcf_description = RESERVED_FORMAT_KEY_DESCRIPTIONS.get(key, "")
vcf_description = arr.attrs.get(
"description", RESERVED_FORMAT_KEY_DESCRIPTIONS.get(key, "")
)
print(
f'##FORMAT=<ID={key},Number={vcf_number},Type={vcf_type},Description="{vcf_description}">',
file=output,
Expand Down