diff --git a/.github/scripts/test_sgkit_vcf.py b/.github/scripts/test_sgkit_vcf.py deleted file mode 100644 index 949ff7bd2..000000000 --- a/.github/scripts/test_sgkit_vcf.py +++ /dev/null @@ -1,15 +0,0 @@ -import urllib.request - -import xarray as xr - -from sgkit.io.vcf import vcf_to_zarr - -if __name__ == "__main__": - for ext in (".gz", ".gz.tbi"): - urllib.request.urlretrieve( - f"https://github.com/sgkit-dev/sgkit/raw/main/sgkit/tests/io/vcf/data/sample.vcf{ext}", - f"sample.vcf{ext}", - ) - vcf_to_zarr("sample.vcf.gz", "out") - ds = xr.open_zarr("out") # type: ignore[no-untyped-call] - print(ds) diff --git a/.github/workflows/build-numpy-2.yml b/.github/workflows/build-numpy-2.yml index f7a2e49e8..0374905b8 100644 --- a/.github/workflows/build-numpy-2.yml +++ b/.github/workflows/build-numpy-2.yml @@ -23,7 +23,8 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt -r requirements-dev.txt - pip install -U 'numpy<2.1' + # update bio2zarr for NumPy 2, see https://github.com/sgkit-dev/bio2zarr/issues/256 + pip install -U 'numpy<2.1' -U git+https://github.com/sgkit-dev/bio2zarr.git - name: Run pre-commit uses: pre-commit/action@v2.0.0 - name: Test with pytest (numba jit disabled) @@ -32,7 +33,6 @@ jobs: run: | # avoid guvectorized functions #1194 pytest -v sgkit/tests/test_pedigree.py - pytest -v sgkit/tests/io/vcf/test_vcf_writer_utils.py - name: Test with pytest and coverage run: | pytest -v --cov=sgkit --cov-report=term-missing diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fead4649a..fac8f7718 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -34,7 +34,6 @@ jobs: run: | # avoid guvectorized functions #1194 pytest -v sgkit/tests/test_pedigree.py - pytest -v sgkit/tests/io/vcf/test_vcf_writer_utils.py - name: Test with pytest and coverage run: | pytest -v --cov=sgkit --cov-report=term-missing diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 3749ca640..9201fb64e 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -38,14 +38,14 @@ jobs: with: path: dist - unix-test: + test: # This workflow only runs on the origin org if: github.repository_owner == 'sgkit-dev' needs: ['build'] strategy: matrix: # don't use macos-latest as it uses M1 which doesn't work - os: [ubuntu-latest, macos-12] + os: [ubuntu-latest, macos-12, windows-latest] python-version: ["3.9", "3.10", "3.11"] runs-on: ${{ matrix.os }} steps: @@ -64,46 +64,16 @@ jobs: python -VV # Install the local wheel wheel=$(ls artifact/sgkit-*.whl) - pip install ${wheel} ${wheel}[bgen] ${wheel}[plink] ${wheel}[vcf] + pip install ${wheel} ${wheel}[bgen] ${wheel}[plink] python sgkit-copy/.github/scripts/test_sgkit.py python sgkit-copy/.github/scripts/test_sgkit_bgen.py python sgkit-copy/.github/scripts/test_sgkit_plink.py - python sgkit-copy/.github/scripts/test_sgkit_vcf.py - # Windows doesn't support vcf - windows-test: - # This workflow only runs on the origin org - if: github.repository_owner == 'sgkit-dev' - runs-on: windows-latest - needs: ['build'] - strategy: - matrix: - python-version: ["3.9"] - steps: - # checkout repo to subdirectory to get access to scripts - - uses: actions/checkout@v2 - with: - path: sgkit-copy - - name: Download artifacts - uses: actions/download-artifact@v4.1.7 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install wheel and test - run: | - python -VV - # Install the local wheel - $env:wheel = $(ls artifact/sgkit-*.whl) - pip install $env:wheel "$env:wheel[bgen]" "$env:wheel[plink]" - python sgkit-copy/.github/scripts/test_sgkit.py - python sgkit-copy/.github/scripts/test_sgkit_bgen.py - python sgkit-copy/.github/scripts/test_sgkit_plink.py pypi-upload: if: github.repository_owner == 'sgkit-dev' runs-on: ubuntu-latest - needs: ['unix-test', 'windows-test'] + needs: ['test'] steps: - name: Download all uses: actions/download-artifact@v4.1.7 diff --git a/benchmarks/benchmarks_vcf.py b/benchmarks/benchmarks_vcf.py deleted file mode 100644 index 98ac83476..000000000 --- a/benchmarks/benchmarks_vcf.py +++ /dev/null @@ -1,142 +0,0 @@ -"""Benchmark suite for VCF module.""" -import gzip -import os -import shutil -import tempfile -import time -from pathlib import Path - -from numcodecs import FixedScaleOffset - -from sgkit.io.vcf.vcf_reader import vcf_to_zarr, zarr_array_sizes -from sgkit.io.vcf.vcf_writer import zarr_to_vcf - - -class VcfSpeedSuite: - def setup(self) -> None: - asv_env_dir = os.environ["ASV_ENV_DIR"] - path = Path( - asv_env_dir, - "project/sgkit/tests/io/vcf/data/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz", - ) - tmp_path = Path(tempfile.mkdtemp()) - self.input_vcf = tmp_path.joinpath("1000G.in.vcf").as_posix() - self.input_zarr = tmp_path.joinpath("1000G.in.zarr").as_posix() - self.output_zarr = tmp_path.joinpath("1000G.out.zarr").as_posix() - self.output_vcf = tmp_path.joinpath("1000G.out.vcf").as_posix() - - # decompress file into temp dir so we can measure speed of vcf_to_zarr for uncompressed text - _gunzip(path, self.input_vcf) - - # create a zarr input file so we can measure zarr_to_vcf speed - self.field_defs = { - "FORMAT/AD": {"Number": "R"}, - } - vcf_to_zarr( - self.input_vcf, - self.input_zarr, - fields=["INFO/*", "FORMAT/*"], - field_defs=self.field_defs, - chunk_length=1_000, - target_part_size=None, - ) - - # use track_* asv methods since we want to measure speed (MB/s) not time - - def track_vcf_to_zarr_speed(self) -> None: - duration = _time_func( - vcf_to_zarr, - self.input_vcf, - self.output_zarr, - fields=["INFO/*", "FORMAT/*"], - field_defs=self.field_defs, - chunk_length=1_000, - target_part_size=None, - ) - return _to_mb_per_s(os.path.getsize(self.input_vcf), duration) - - def track_zarr_to_vcf_speed(self) -> None: - # throw away first run due to numba jit compilation - for _ in range(2): - duration = _time_func(zarr_to_vcf, self.input_zarr, self.output_vcf) - return _to_mb_per_s(os.path.getsize(self.output_vcf), duration) - - -def _gunzip(input, output): - with gzip.open(input, "rb") as f_in: - with open(output, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - - -def _time_func(func, *args, **kwargs): - start = time.time() - func(*args, **kwargs) - end = time.time() - return end - start - - -def _to_mb_per_s(bytes, duration): - return bytes / (1_000_000 * duration) - - -class VcfCompressionSuite: - def setup(self) -> None: - asv_env_dir = os.environ["ASV_ENV_DIR"] - self.input_vcf = Path( - asv_env_dir, - "project/sgkit/tests/io/vcf/data/1kg_target_chr20_38_imputed_chr20_500000.vcf.bgz", - ) - - tmp_path = Path(tempfile.mkdtemp()) - self.output_zarr = tmp_path.joinpath("1000G.out.zarr") - - # use track_* asv methods since we want to measure compression size not time - - def track_zarr_compression_size(self) -> None: - encoding = { - "variant_AF": { - "filters": [ - FixedScaleOffset(offset=0, scale=10000, dtype="f4", astype="u2") - ], - }, - "call_DS": { - "filters": [ - FixedScaleOffset(offset=0, scale=100, dtype="f4", astype="u1") - ], - }, - "variant_DR2": { - "filters": [ - FixedScaleOffset(offset=0, scale=100, dtype="f4", astype="u1") - ], - }, - } - - kwargs = zarr_array_sizes(self.input_vcf) - - vcf_to_zarr( - self.input_vcf, - self.output_zarr, - fields=["INFO/*", "FORMAT/*"], - chunk_length=500_000, - encoding=encoding, - **kwargs, - ) - - original_size = du(self.input_vcf) - zarr_size = du(self.output_zarr) - - return float(zarr_size) / original_size - - -def get_file_size(file): - return file.stat().st_size - - -def get_dir_size(dir): - return sum(f.stat().st_size for f in dir.glob("**/*") if f.is_file()) - - -def du(file): - if file.is_file(): - return get_file_size(file) - return get_dir_size(file) diff --git a/conftest.py b/conftest.py index 9343393f3..8b2bdcf82 100644 --- a/conftest.py +++ b/conftest.py @@ -1,5 +1,4 @@ -# Ignore VCF files during pytest collection, so it doesn't fail if cyvcf2 isn't installed. -collect_ignore_glob = ["benchmarks/**", "sgkit/io/vcf/*.py", ".github/scripts/*.py"] +collect_ignore_glob = ["benchmarks/**", ".github/scripts/*.py"] def pytest_addoption(parser): diff --git a/docs/api.rst b/docs/api.rst index ba0605482..113a0658e 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -32,47 +32,11 @@ PLINK write_plink zarr_to_plink -VCF (reading) +VCF ------------- -.. deprecated:: 0.9.0 - Functions for reading VCF are deprecated, please use the `bio2zarr `_ package. - -.. currentmodule:: sgkit.io.vcf -.. autosummary:: - :toctree: generated/ - - read_vcf - vcf_to_zarr - -For more low-level control: - -.. currentmodule:: sgkit.io.vcf -.. autosummary:: - :toctree: generated/ - - partition_into_regions - vcf_to_zarrs - concat_zarrs - zarr_array_sizes - -For converting from `scikit-allel's VCF Zarr representation `_ to sgkit's Zarr representation: - -.. currentmodule:: sgkit -.. autosummary:: - :toctree: generated/ - - read_scikit_allel_vcfzarr - -VCF (writing) -------------- - -.. currentmodule:: sgkit.io.vcf -.. autosummary:: - :toctree: generated/ - - write_vcf - zarr_to_vcf +Functions for reading and writing VCF were removed from sgkit, please use the `bio2zarr `_ +and `vcztools `_ packages. Dataset ------- diff --git a/docs/changelog.rst b/docs/changelog.rst index a5263767f..bfe91ee8a 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -14,8 +14,12 @@ New Features - Add 'matching' method to :func:`identity_by_state` function. (:user:`timothymillar`, :pr:`1229`, :issue:`1227`) -.. Breaking changes -.. ~~~~~~~~~~~~~~~~ +Breaking changes +~~~~~~~~~~~~~~~~ + +- Functions for reading and writing VCF were removed from sgkit, please use the `bio2zarr `_ + and `vcztools `_ packages instead. + (:user:`tomwhite`, :pr:`1264`) .. Deprecations .. ~~~~~~~~~~~~ @@ -147,22 +151,22 @@ New Features - Add :func:`sgkit.convert_call_to_index` method. (:user:`timothymillar`, :pr:`1050`, :issue:`1048`) -- Add ``read_chunk_length`` option to :func:`sgkit.io.vcf.vcf_to_zarr` and - :func:`sgkit.io.vcf.vcf_to_zarrs` functions. These are useful to reduce memory usage +- Add ``read_chunk_length`` option to ``sgkit.io.vcf.vcf_to_zarr`` and + ``sgkit.io.vcf.vcf_to_zarrs`` functions. These are useful to reduce memory usage with large sample counts or a large ``chunk_length``. (:user:`benjeffery`, :pr:`1044`, :issue:`1042`) -- Add ``retain_temp_files`` to :func:`sgkit.io.vcf.vcf_to_zarr` function. +- Add ``retain_temp_files`` to ``sgkit.io.vcf.vcf_to_zarr`` function. (:user:`benjeffery`, :pr:`1046`, :issue:`1036`) -- Add :func:`sgkit.io.vcf.read_vcf` convenience function. +- Add ``sgkit.io.vcf.read_vcf`` convenience function. (:user:`tomwhite`, :pr:`1052`, :issue:`1004`) - Add :func:`sgkit.hybrid_relationship`, :func:`sgkit.hybrid_inverse_relationship` and :func:`invert_relationship_matrix` methods. (:user:`timothymillar`, :pr:`1053`, :issue:`993`) -- Add :func:`sgkit.io.vcf.zarr_array_sizes` for determining array sizes for storage in Zarr. +- Add ``sgkit.io.vcf.zarr_array_sizes`` for determining array sizes for storage in Zarr. (:user:`tomwhite`, :pr:`1073`, :issue:`734`) - Add ``skipna`` option to :func:`genomic_relationship` function. @@ -174,7 +178,7 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ -- Generate VCF header by default when writing VCF using :func:`sgkit.io.vcf.write_vcf` or :func:`sgkit.io.vcf.zarr_to_vcf`. +- Generate VCF header by default when writing VCF using ``sgkit.io.vcf.write_vcf`` or ``sgkit.io.vcf.zarr_to_vcf``. Previously, the dataset had to contain a ``vcf_header`` attribute. (:user:`tomwhite`, :pr:`1021`, :issue:`1020`) diff --git a/docs/index.rst b/docs/index.rst index 34892d5fe..ec2e770d6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -19,7 +19,6 @@ both popular Python genetics toolkits with a respective focus on population and getting_started user_guide - vcf examples/index api how_do_i diff --git a/docs/user_guide.rst b/docs/user_guide.rst index f9b84eef7..d5b246980 100644 --- a/docs/user_guide.rst +++ b/docs/user_guide.rst @@ -18,14 +18,10 @@ Reading and writing genetic data Installation ------------ -Sgkit can read standard genetic file formats, including VCF, PLINK, and BGEN. It can also export -to VCF. +Sgkit can read standard genetic file formats, including PLINK and BGEN. For reading VCF, +please use the `bio2zarr `_ package. -If sgkit has been installed using conda, support for reading BGEN and PLINK is included, but -VCF is not because there is no Windows support for cyvcf2, the library we use for reading VCF data. -If you are using Linux or a Mac, please install cyvcf2 using the following to enable VCF support:: - - $ conda install -c bioconda cyvcf2 +If sgkit has been installed using conda, support for reading BGEN and PLINK is included. If sgkit has been installed using pip, then support for reading these formats is not included, and requires additional dependencies, which can be installed @@ -39,10 +35,6 @@ To install sgkit with PLINK support:: $ pip install 'sgkit[plink]' -To install sgkit with VCF support:: - - $ pip install 'sgkit[vcf]' - Converting genetic data to Zarr ------------------------------- @@ -88,22 +80,10 @@ arrays within an :class:`xarray.Dataset` from ``bed``, ``bim``, and ``fam`` file The :func:`sgkit.io.plink.write_plink` and :func:`sgkit.io.plink.zarr_to_plink` functions convert sgkit's Xarray data representation to PLINK. -VCF ---- - -The :func:`sgkit.io.vcf.vcf_to_zarr` function converts one or more VCF files to -Zarr files stored in sgkit's Xarray data representation, which can then be opened -as a :class:`xarray.Dataset`. - -The :func:`sgkit.io.vcf.write_vcf` and :func:`sgkit.io.vcf.zarr_to_vcf` functions -convert sgkit's Xarray data representation to VCF. - -See :ref:`vcf` for installation instructions, and details on using VCF in sgkit. - Working with cloud-native data ------------------------------ -TODO: Show how to read/write Zarr (and VCF?) data in cloud storage +TODO: Show how to read/write Zarr data in cloud storage Datasets diff --git a/docs/vcf.rst b/docs/vcf.rst deleted file mode 100644 index bbeeab10c..000000000 --- a/docs/vcf.rst +++ /dev/null @@ -1,363 +0,0 @@ -.. _vcf: - -Reading VCF -=========== - -.. deprecated:: 0.9.0 - Functions for reading VCF are deprecated, please use the `bio2zarr `_ package. - -.. contents:: Table of contents: - :local: - -The :func:`sgkit.io.vcf.vcf_to_zarr` function converts one or more VCF files to Zarr files stored in -sgkit's Xarray data representation. - -Highlights ----------- - -* Reads bgzip-compressed VCF and BCF files. -* Large VCF files can be partitioned into regions using a Tabix (``.tbi``) or CSI (``.csi``) - index, and each region is processed in parallel using `Dask `_. -* VCF parsing is performed by `cyvcf2 `_, - a Cython wrapper around `htslib `_, - the industry-standard VCF library. -* Control over Zarr chunk sizes allows VCFs with a large number of samples - to be converted efficiently. -* Input and output files can reside on local filesystems, Amazon S3, or - Google Cloud Storage. -* Support for polyploid and mixed-ploidy genotypes. - -Installation ------------- - -VCF support is an "extra" feature within sgkit and requires additional -dependencies, notably ``cyvcf2``. - -To install sgkit with VCF support using pip (there is no conda package):: - - $ pip install 'sgkit[vcf]' - -There are `installation instructions for cyvcf2 `_, -which may be helpful if you encounter errors during installation. - -.. warning:: - Reading VCFs is not supported on Windows, since ``cyvcf2`` and ``htslib`` do - not `currently work on Windows `_. - As a workaround, consider using scikit-allel's ``vcf_to_zarr`` function - to write a VCF in Zarr format, followed by :func:`sgkit.read_scikit_allel_vcfzarr` to - read the VCF as a :class:`xarray.Dataset`. - -Usage ------ - -To convert a single VCF or BCF file to Zarr, just specify the input and output file names:: - - >>> import sgkit as sg - >>> from sgkit.io.vcf import vcf_to_zarr - >>> vcf_to_zarr("CEUTrio.20.21.gatk3.4.g.vcf.bgz", "output.zarr") - >>> ds = sg.load_dataset("output.zarr") - >>> ds - - Dimensions: (alleles: 4, ploidy: 2, samples: 1, variants: 19910) - Dimensions without coordinates: alleles, ploidy, samples, variants - Data variables: - call_genotype (variants, samples, ploidy) int8 dask.array - call_genotype_mask (variants, samples, ploidy) bool dask.array - call_genotype_phased (variants, samples) bool dask.array - sample_id (samples) - variant_allele (variants, alleles) object dask.array - variant_contig (variants) int8 dask.array - variant_id (variants) object dask.array - variant_id_mask (variants) bool dask.array - variant_position (variants) int32 dask.array - Attributes: - contigs: ['20', '21'] - max_variant_allele_length: 48 - max_variant_id_length: 1 - -The :func:`sgkit.io.vcf.vcf_to_zarr` function can accept multiple files, and furthermore, each of these -files can be partitioned to enable parallel processing. - -Multiple files --------------- - -If there are multiple files, then pass a list:: - - >>> from sgkit.io.vcf import vcf_to_zarr - >>> vcf_to_zarr(["CEUTrio.20.gatk3.4.g.vcf.bgz", "CEUTrio.21.gatk3.4.g.vcf.bgz"], "output.zarr") - -Processing multiple inputs is more work than a single file, since behind the scenes each input is -converted to a separate temporary Zarr file on disk, then these files are concatenated and rechunked -to form the final output Zarr file. - -In the single file case, the input VCF is converted to the output Zarr file in a single sequential -pass with no need for intermediate temporary files. For small files this is fine, but for very large -files it's a good idea to partition them so the conversion runs faster. - -Partitioning ------------- - -Partitioning a large VCF file involves breaking it into a number of roughly equal-sized parts that can -be processed in parallel. The parts are specified using genomic regions that follow the regions format -used in `bcftools `_: ``chr:beg-end``, -where positions are 1-based and inclusive. - -All files to be partitioned must have either a Tabix (``.tbi``) or CSI (``.csi``) index. If both are present -for a particular file, then Tabix is used for finding partitions. - -The :func:`sgkit.io.vcf.partition_into_regions` function will create a list of region strings for a VCF -file, given a desired number of parts to split the file into: - - >>> from sgkit.io.vcf import partition_into_regions - >>> partition_into_regions("CEUTrio.20.21.gatk3.4.g.vcf.bgz", num_parts=10) - ['20:1-10108928', '20:10108929-10207232', '20:10207233-', '21:1-10027008', '21:10027009-10043392', '21:10043393-10108928', '21:10108929-10141696', '21:10141697-10174464', '21:10174465-10190848', '21:10190849-10207232', '21:10207233-'] - -It's important to note that the number of regions returned may not be exactly the number of parts -requested: it may be more or less. However, it is guaranteed that the regions will be contiguous and -will cover the whole VCF file. - -The region strings are passed to ``vcf_to_zarr`` so it can process the parts in parallel: - - >>> from sgkit.io.vcf import partition_into_regions, vcf_to_zarr - >>> regions = partition_into_regions("CEUTrio.20.21.gatk3.4.g.vcf.bgz", num_parts=10) - >>> vcf_to_zarr("CEUTrio.20.21.gatk3.4.g.vcf.bgz", "output.zarr", regions=regions) - -It's also possible to produce parts that have an approximate target size (in bytes). This is useful -if you are partitioning multiple files, and want all the parts to be roughly the same size. - - >>> from sgkit.io.vcf import partition_into_regions, vcf_to_zarr - >>> inputs = ["CEUTrio.20.gatk3.4.g.vcf.bgz", "CEUTrio.21.gatk3.4.g.vcf.bgz"] - >>> regions = [partition_into_regions(input, target_part_size=100_000) for input in inputs] - >>> vcf_to_zarr(inputs, "output.zarr", regions=regions) - -The same result can be obtained more simply by specifying ``target_part_size`` in the call to -``vcf_to_zarr``: - - >>> from sgkit.io.vcf import vcf_to_zarr - >>> inputs = ["CEUTrio.20.gatk3.4.g.vcf.bgz", "CEUTrio.21.gatk3.4.g.vcf.bgz"] - >>> vcf_to_zarr(inputs, "output.zarr", target_part_size=100_000) - -As a special case, ``None`` is used to represent a single partition. - - >>> from sgkit.io.vcf import partition_into_regions - >>> partition_into_regions("CEUTrio.20.21.gatk3.4.g.vcf.bgz", num_parts=1) - None - -Chunk sizes ------------ - -One key advantage of using Zarr as a storage format is its ability to store -large files in chunks, making it straightforward to process the data in -parallel. - -You can control the chunk *length* (in the variants dimension) and *width* -(in the samples dimension) by setting the ``chunk_length`` and ``chunk_width`` -parameters to :func:`sgkit.io.vcf.vcf_to_zarr`. - -Due to the way that VCF files are parsed, all of the sample data for a given -chunk of variants are loaded into memory at one time. In other words, -``chunk_length`` is honored at read time, whereas ``chunk_width`` is honored -at write time. For files with very large numbers of samples, this can -exceed working memory. The solution is to also set ``temp_chunk_length`` to be a -smaller number (than ``chunk_length``), so that fewer variants are loaded -into memory at one time, while still having the desired output chunk sizes -(``chunk_length`` and ``chunk_width``). Note that ``temp_chunk_length`` must -divide ``chunk_length`` evenly. - -Cloud storage -------------- - -VCF files can be read from various file systems including cloud stores. However, -since different underlying libraries are used in different functions, there are -slight differences in configuration that are outlined here. - -The :func:`sgkit.io.vcf.partition_into_regions` function uses `fsspec `_ -to read VCF metadata and their indexes. Therefore, to access files stored on Amazon S3 or Google Cloud Storage -install the ``s3fs`` or ``gcsfs`` Python packages, and use ``s3://`` or ``gs://`` URLs. - -You can also pass ``storage_options`` to :func:`sgkit.io.vcf.partition_into_regions` to configure the ``fsspec`` backend. -This provides a way to pass any credentials or other necessary arguments needed to ``s3fs`` or ``gcsfs``. - -The :func:`sgkit.io.vcf.vcf_to_zarr` function does *not* use ``fsspec``, since it -relies on ``htslib`` for file handling, and therefore has its own way of accessing -cloud storage. You can access files stored on Amazon S3 or Google Cloud Storage -using ``s3://`` or ``gs://`` URLs. Setting credentials or other options is -typically achieved using environment variables for the underlying cloud store. - -Compression ------------ - -Zarr offers a lot of flexibility over controlling how data is compressed. Each variable can use -a different `compression algorithm `_, -and its own list of `filters `_. - -The :func:`sgkit.io.vcf.vcf_to_zarr` function tries to choose good defaults for compression, using -information about the variable's dtype, and also the nature of the data being stored. - -For example, ``variant_position`` (from the VCF ``POS`` field) is a monotonically increasing integer -(within a contig) so it benefits from using a delta encoding to store the differences in its values, -since these are smaller integers that compress better. This encoding is specified using the NumCodecs -`Delta `_ codec as a Zarr filter. - -When converting from VCF you can specify the default compression algorithm to use for all variables -by specifying ``compressor`` in the call to :func:`sgkit.io.vcf.vcf_to_zarr`. There are trade-offs -between compression speed and size, which this `benchmark `_ -does a good job of exploring. - -Sometimes you may want to override the compression for a particular variable. A good example of this -is for VCF FORMAT fields that are floats. Floats don't compress well, and since there is a value for -every sample they can take up a lot of space. In many cases full float precision is not needed, -so it is a good idea to use a filter to transform the float to an int, that takes less space. - -For example, the following code creates an encoding that can be passed to :func:`sgkit.io.vcf.vcf_to_zarr` -to store the VCF ``DS`` FORMAT field to 2 decimal places. (``DS`` is a dosage field that is between 0 and 2 -so we know it will fit into an unsigned 8-bit int.):: - - from numcodecs import FixedScaleOffset - - encoding = { - "call_DS": { - "filters": [FixedScaleOffset(offset=0, scale=100, dtype="f4", astype="u1")], - }, - } - -Note that this encoding won't work for floats that may be NaN. Consider using -`Quantize `_ (with ``astype=np.float16``) -or `Bitround `_ in that case. - -.. _vcf_low_level_operation: - -Low-level operation -------------------- - -Calling :func:`sgkit.io.vcf.vcf_to_zarr` runs a two-step operation: - -1. Write the output for each input region to a separate temporary Zarr store -2. Concatenate and rechunk the temporary stores into the final output Zarr store - -Each step is run as a Dask computation, which means you can use any Dask configuration -mechanisms to control aspects of the computation. - -For example, you can set the Dask scheduler to run on a cluster. In this case you -would set the temporary Zarr store to be a cloud storage URL (by setting ``tempdir``) so -that all workers can access the store (both for reading and writing). - -For debugging, or for more control over the steps, consider using -:func:`sgkit.io.vcf.vcf_to_zarrs` followed by :func:`sgkit.io.vcf.concat_zarrs`. - -Polyploid and mixed-ploidy VCF ------------------------------- - -The :func:`sgkit.io.vcf.vcf_to_zarr` function can be used to convert polyploid VCF -data to Zarr files stored in sgkit's Xarray data representation by specifying the -ploidy of the dataset using the ``ploidy`` parameter. - -By default, sgkit expects VCF files to have a consistent ploidy level across all samples -and variants. -Manual specification of ploidy is necessary because, within the VCF standard, -ploidy is indicated by the length of each genotype call and hence it may not be -consistent throughout the entire VCF file. - -If a genotype call of lower than specified ploidy is encountered it will be treated -as an incomplete genotype. -For example, if a VCF is being processed assuming a ploidy of four (i.e. tetraploid) -then the diploid genotype ``0/1`` will be treated as the incomplete tetraploid -genotype ``0/1/./.``. - -If a genotype call of higher than specified ploidy is encountered an exception is raised. -This exception can be avoided using the ``truncate_calls`` parameter in which case the -additional alleles will be skipped. - -Conversion of mixed-ploidy VCF files is also supported by :func:`sgkit.io.vcf.vcf_to_zarr` -by use of the ``mixed_ploidy`` parameter. -In this case ``ploidy`` specifies the maximum allowed ploidy and lower ploidy -genotype calls within the VCF file will be preserved within the resulting dataset. - -Note that many statistical genetics methods available for diploid data are not generalized -to polyploid and or mixed-ploidy data. -Therefore, some methods available in sgkit may only be applicable to diploid or fixed-ploidy -datasets. - -Methods that are generalized to polyploid and mixed-ploidy data may make assumptions -such as polysomic inheritance and hence it is necessary to understand the type of polyploidy -present within any given dataset. - -Example: converting 1000 genomes VCF to Zarr --------------------------------------------- - -This section shows how to convert the `1000 genomes `_ dataset into Zarr format for analysis in sgkit. - -For reference, the conversion (not including downloading the data) took about an hour on a machine with 32 vCPUs and 128GB of memory (GCP e2-standard-32). - -Install sgkit -~~~~~~~~~~~~~ - -Install the main package using conda or pip, and the VCF extra package using pip, as described in :ref:`installation`. - -Download the data -~~~~~~~~~~~~~~~~~ - -Run the following to download the 1000 genomes VCF files over FTP:: - - mkdir -p data/1kg - for contig in {1..22}; do - wget -P data/1kg ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr${contig}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz - wget -P data/1kg ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr${contig}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz.tbi - done - -Run the conversion -~~~~~~~~~~~~~~~~~~ - -Run the following Python code:: - - from sgkit.io.vcf import vcf_to_zarr - from dask.distributed import Client - - if __name__ == "__main__": - client = Client(n_workers=16, threads_per_worker=1) - - vcfs = [f"data/1kg/ALL.chr{contig}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz" for contig in range(1, 23)] - target = "1kg.zarr" - vcf_to_zarr(vcfs, target, tempdir="1kg-tmp") - -A few notes about the code: - -1. Using a Dask distributed cluster, even on a single machine, performs better than the default scheduler (which uses threads), or -the multiprocessing scheduler. Creating a ``Client`` object will start a local cluster. - -2. Making the number of workers less than the number of cores (16 rather than 32 in this case) will improve performance. -It's also important to set ``threads_per_worker`` to 1 to avoid overcommitting threads, as recommended in `the Dask documentation `_. - -3. It is useful to track the progress of the computation using `the Dask dashboard `_. -There are two steps in the conversion operation, described in :ref:`vcf_low_level_operation`, the first of which has coarse-grained, long-running tasks, -and the second which has much shorter-running tasks. There is a considerable delay (around 10 minutes) between the two steps, -so don't worry if it doesn't look like it's progressing. - -4. Only the core VCF fields and genotypes are converted. To import more VCF fields see the documentation -for the ``fields`` and ``field_defs`` parameters for :func:`sgkit.io.vcf.vcf_to_zarr`. - -Inspect the dataset -~~~~~~~~~~~~~~~~~~~ - -When the conversion is complete, have a look at the dataset as follows:: - - >>> import sgkit as sg - >>> ds = sg.load_dataset("1kg.zarr") - >>> ds - - Dimensions: (variants: 81271745, samples: 2504, ploidy: 2, alleles: 4) - Dimensions without coordinates: variants, samples, ploidy, alleles - Data variables: - call_genotype (variants, samples, ploidy) int8 dask.array - call_genotype_mask (variants, samples, ploidy) bool dask.array - call_genotype_phased (variants, samples) bool dask.array - sample_id (samples) object dask.array - variant_allele (variants, alleles) object dask.array - variant_contig (variants) int8 dask.array - variant_id (variants) object dask.array - variant_id_mask (variants) bool dask.array - variant_position (variants) int32 dask.array - Attributes: - contigs: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'... - max_alt_alleles_seen: 12 - diff --git a/requirements-dev.txt b/requirements-dev.txt index a8c7ac006..127bc9bed 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,7 +14,7 @@ bed-reader rechunker cbgen < 1.0.5; platform_system != "Windows" cbgen == 1.0.1; platform_system == "Windows" -cyvcf2; platform_system != "Windows" +bio2zarr; platform_system != "Windows" yarl matplotlib asv diff --git a/requirements-doc.txt b/requirements-doc.txt index 40937bc32..deac3cd43 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -8,4 +8,3 @@ matplotlib seaborn ablog!=0.10.27 pickleshare -bio2zarr diff --git a/setup.cfg b/setup.cfg index 0c35d2151..77bf3cd73 100644 --- a/setup.cfg +++ b/setup.cfg @@ -59,11 +59,6 @@ setup_requires = plink = partd bed-reader -vcf = - aiohttp - cyvcf2 - requests - yarl bgen = rechunker cbgen < 1.0.5; platform_system != "Windows" @@ -105,8 +100,6 @@ line_length = 88 [mypy-callee.*] ignore_missing_imports = True -[mypy-cyvcf2.*] -ignore_missing_imports = True [mypy-dask.*] ignore_missing_imports = True [mypy-fsspec.*] diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 6699ef320..dbe5832a9 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -3,7 +3,6 @@ from .display import display_genotypes, display_pedigree from .distance.api import pairwise_distance from .io.dataset import load_dataset, save_dataset -from .io.vcfzarr_reader import read_scikit_allel_vcfzarr from .model import ( DIM_ALLELE, DIM_PLOIDY, @@ -94,7 +93,6 @@ "genee", "genomic_relationship", "gwas_linear_regression", - "read_scikit_allel_vcfzarr", "regenie", "regenie_loco_regression", "hardy_weinberg_test", diff --git a/sgkit/io/utils.py b/sgkit/io/utils.py index 4742023e8..49f2df8ec 100644 --- a/sgkit/io/utils.py +++ b/sgkit/io/utils.py @@ -1,9 +1,7 @@ -from typing import Any, Dict, Mapping, Optional, Sequence, Tuple +from typing import Mapping, Optional, Tuple -import dask.array as da import dask.dataframe as dd import numpy as np -import zarr from ..typing import ArrayLike, DType from ..utils import encode_array, max_str_len @@ -52,109 +50,3 @@ def encode_contigs(contig: ArrayLike) -> Tuple[ArrayLike, ArrayLike]: else: ids, names = encode_array(np.asarray(contig, dtype=str)) return ids, names - - -def concatenate_and_rechunk( - zarrs: Sequence[zarr.Array], - chunks: Optional[Tuple[int, ...]] = None, - dtype: DType = None, -) -> da.Array: - """Perform a concatenate and rechunk operation on a collection of Zarr arrays - to produce an array with a uniform chunking, suitable for saving as - a single Zarr array. - - In contrast to Dask's ``rechunk`` method, the Dask computation graph - is embarrassingly parallel and will make efficient use of memory, - since no Zarr chunks are cached by the Dask scheduler. - - The Zarr arrays must have matching shapes except in the first - dimension. - - Parameters - ---------- - zarrs - Collection of Zarr arrays to concatenate. - chunks : Optional[Tuple[int, ...]], optional - The chunks to apply to the concatenated arrays. If not specified - the chunks for the first array will be applied to the concatenated - array. - dtype - The dtype of the concatenated array, by default the same as the - first array. - - Returns - ------- - A Dask array, suitable for saving as a single Zarr array. - - Raises - ------ - ValueError - If the Zarr arrays do not have matching shapes (except in the first - dimension). - """ - - if len(set([z.shape[1:] for z in zarrs])) > 1: - shapes = [z.shape for z in zarrs] - raise ValueError( - f"Zarr arrays must have matching shapes (except in the first dimension): {shapes}" - ) - - lengths = np.array([z.shape[0] for z in zarrs]) - lengths0 = np.insert(lengths, 0, 0, axis=0) # type: ignore[no-untyped-call] - offsets = np.cumsum(lengths0) - total_length = offsets[-1] - - shape = (total_length, *zarrs[0].shape[1:]) - chunks = chunks or zarrs[0].chunks - dtype = dtype or zarrs[0].dtype - - ar = da.empty(shape, chunks=chunks) - - def load_chunk( - x: ArrayLike, - zarrs: Sequence[zarr.Array], - offsets: ArrayLike, - block_info: Dict[Any, Any], - ) -> ArrayLike: - return _slice_zarrs(zarrs, offsets, block_info[0]["array-location"]) - - return ar.map_blocks(load_chunk, zarrs=zarrs, offsets=offsets, dtype=dtype) - - -def _zarr_index(offsets: ArrayLike, pos: int) -> int: - """Return the index of the zarr file that pos falls in""" - index: int = np.searchsorted(offsets, pos, side="right") - 1 # type: ignore[assignment] - return index - - -def _slice_zarrs( - zarrs: Sequence[zarr.Array], offsets: ArrayLike, locs: Sequence[Tuple[int, ...]] -) -> ArrayLike: - """Slice concatenated zarrs by locs""" - # convert array locations to slices - locs = [slice(*loc) for loc in locs] # type: ignore[misc] - # determine which zarr files are needed - start, stop = locs[0].start, locs[0].stop # type: ignore[attr-defined] # stack on first axis - i0 = _zarr_index(offsets, start) - i1 = _zarr_index(offsets, stop) - if i0 == i1: # within a single zarr file - sel = slice(start - offsets[i0], stop - offsets[i0]) - return zarrs[i0][(sel, *locs[1:])] - else: # more than one zarr file - slices = [] - slices.append((i0, slice(start - offsets[i0], None))) - for i in range(i0 + 1, i1): # entire zarr - slices.append((i, slice(None))) - if stop > offsets[i1]: - slices.append((i1, slice(0, stop - offsets[i1]))) - parts = [zarrs[i][(sel, *locs[1:])] for (i, sel) in slices] - return np.concatenate(parts) # type: ignore[no-untyped-call] - - -def str_is_int(x: str) -> bool: - """Test if a string can be parsed as an int""" - try: - int(x) - return True - except ValueError: - return False diff --git a/sgkit/io/vcf/__init__.py b/sgkit/io/vcf/__init__.py deleted file mode 100644 index bcb875de3..000000000 --- a/sgkit/io/vcf/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -import platform - -try: - from .vcf_partition import partition_into_regions - from .vcf_reader import ( - FloatFormatFieldWarning, - MaxAltAllelesExceededWarning, - concat_zarrs, - read_vcf, - vcf_to_zarr, - vcf_to_zarrs, - zarr_array_sizes, - ) - from .vcf_writer import write_vcf, zarr_to_vcf - - __all__ = [ - "FloatFormatFieldWarning", - "MaxAltAllelesExceededWarning", - "concat_zarrs", - "partition_into_regions", - "read_vcf", - "vcf_to_zarr", - "vcf_to_zarrs", - "write_vcf", - "zarr_array_sizes", - "zarr_to_vcf", - ] -except ImportError as e: # pragma: no cover - if platform.system() == "Windows": - msg = ( - "sgkit-vcf is not supported on Windows.\n" - "Please see the sgkit documentation for details and workarounds." - ) - else: - msg = ( - "sgkit-vcf requirements are not installed.\n\n" - "Please install them via pip :\n\n" - " pip install 'sgkit[vcf]'" - ) - raise ImportError(str(e) + "\n\n" + msg) from e diff --git a/sgkit/io/vcf/csi.py b/sgkit/io/vcf/csi.py deleted file mode 100644 index 277b429af..000000000 --- a/sgkit/io/vcf/csi.py +++ /dev/null @@ -1,150 +0,0 @@ -"""Functions for parsing CSI files into Python objects so they can be inspected. - -The implementation follows the [CSI index file format](http://samtools.github.io/hts-specs/CSIv1.pdf). - -""" -from dataclasses import dataclass -from typing import Any, Dict, Optional, Sequence - -import numpy as np - -from sgkit.io.vcf.utils import ( - get_file_offset, - open_gzip, - read_bytes_as_tuple, - read_bytes_as_value, -) -from sgkit.typing import PathType - -CSI_EXTENSION = ".csi" - - -@dataclass -class Chunk: - cnk_beg: int - cnk_end: int - - -@dataclass -class Bin: - bin: int - loffset: int - chunks: Sequence[Chunk] - - -@dataclass -class CSIIndex: - min_shift: int - depth: int - aux: str - bins: Sequence[Sequence[Bin]] - record_counts: Sequence[int] - n_no_coor: int - - def offsets(self) -> Any: - pseudo_bin = bin_limit(self.min_shift, self.depth) + 1 - - file_offsets = [] - contig_indexes = [] - positions = [] - for contig_index, bins in enumerate(self.bins): - # bins may be in any order within a contig, so sort by loffset - for bin in sorted(bins, key=lambda b: b.loffset): - if bin.bin == pseudo_bin: - continue # skip pseudo bins - file_offset = get_file_offset(bin.loffset) - position = get_first_locus_in_bin(self, bin.bin) - file_offsets.append(file_offset) - contig_indexes.append(contig_index) - positions.append(position) - - return np.array(file_offsets), np.array(contig_indexes), np.array(positions) - - -def bin_limit(min_shift: int, depth: int) -> int: - """Defined in CSI spec""" - return ((1 << (depth + 1) * 3) - 1) // 7 - - -def get_first_bin_in_level(level: int) -> int: - return ((1 << level * 3) - 1) // 7 - - -def get_level_size(level: int) -> int: - return 1 << level * 3 - - -def get_level_for_bin(csi: CSIIndex, bin: int) -> int: - for i in range(csi.depth, -1, -1): - if bin >= get_first_bin_in_level(i): - return i - raise ValueError(f"Cannot find level for bin {bin}.") # pragma: no cover - - -def get_first_locus_in_bin(csi: CSIIndex, bin: int) -> int: - level = get_level_for_bin(csi, bin) - first_bin_on_level = get_first_bin_in_level(level) - level_size = get_level_size(level) - max_span = 1 << (csi.min_shift + 3 * csi.depth) - return (bin - first_bin_on_level) * (max_span // level_size) + 1 - - -def read_csi( - file: PathType, storage_options: Optional[Dict[str, str]] = None -) -> CSIIndex: - """Parse a CSI file into a `CSIIndex` object. - - Parameters - ---------- - file : PathType - The path to the CSI file. - - Returns - ------- - CSIIndex - An object representing a CSI index. - - Raises - ------ - ValueError - If the file is not a CSI file. - """ - with open_gzip(file, storage_options=storage_options) as f: - magic = read_bytes_as_value(f, "4s") - if magic != b"CSI\x01": - raise ValueError("File not in CSI format.") - - min_shift, depth, l_aux = read_bytes_as_tuple(f, "<3i") - aux = read_bytes_as_value(f, f"{l_aux}s", "") - n_ref = read_bytes_as_value(f, " 0: - for _ in range(n_ref): - n_bin = read_bytes_as_value(f, " Any: - # Combine the linear indexes into one stacked array - linear_indexes = self.linear_indexes - linear_index = np.hstack([np.array(li) for li in linear_indexes]) - - # Create file offsets for each element in the linear index - file_offsets = np.array([get_file_offset(vfp) for vfp in linear_index]) - - # Calculate corresponding contigs and positions or each element in the linear index - contig_indexes = np.hstack( - [np.full(len(li), i) for (i, li) in enumerate(linear_indexes)] - ) - # positions are 1-based and inclusive - positions = np.hstack( - [ - np.arange(len(li)) * TABIX_LINEAR_INDEX_INTERVAL_SIZE + 1 - for li in linear_indexes - ] - ) - assert len(file_offsets) == len(contig_indexes) - assert len(file_offsets) == len(positions) - - return file_offsets, contig_indexes, positions - - -def read_tabix( - file: PathType, storage_options: Optional[Dict[str, str]] = None -) -> TabixIndex: - """Parse a tabix file into a `TabixIndex` object. - - Parameters - ---------- - file : PathType - The path to the tabix file. - - Returns - ------- - TabixIndex - An object representing a tabix index. - - Raises - ------ - ValueError - If the file is not a tabix file. - """ - with open_gzip(file, storage_options=storage_options) as f: - magic = read_bytes_as_value(f, "4s") - if magic != b"TBI\x01": - raise ValueError("File not in Tabix format.") - - header = Header(*read_bytes_as_tuple(f, "<8i")) - - sequence_names = [] - bins = [] - linear_indexes = [] - record_counts = [] - - if header.l_nm > 0: - names = read_bytes_as_value(f, f"<{header.l_nm}s") - # Convert \0-terminated names to strings - sequence_names = [str(name, "utf-8") for name in names.split(b"\x00")[:-1]] - - for _ in range(header.n_ref): - n_bin = read_bytes_as_value(f, " int: - """Safe integer ceil function""" - return -(-a // b) - - -# Based on https://dev.to/orenovadia/solution-chunked-iterator-python-riddle-3ple -def chunks(iterator: Iterator[T], n: int) -> Iterator[Iterator[T]]: - """ - Convert an iterator into an iterator of iterators, where the inner iterators - each return `n` items, except the last, which may return fewer. - - For the special case of an empty iterator, an iterator of an empty iterator is - returned. - """ - - empty_iterator = True - for first in iterator: # take one item out (exits loop if `iterator` is empty) - empty_iterator = False - rest_of_chunk = itertools.islice(iterator, 0, n - 1) - yield itertools.chain([first], rest_of_chunk) # concatenate the first item back - if empty_iterator: - yield iter([]) - - -def get_file_length( - path: PathType, storage_options: Optional[Dict[str, str]] = None -) -> int: - """Get the length of a file in bytes.""" - url = str(path) - storage_options = storage_options or {} - with fsspec.open(url, **storage_options) as openfile: - fs = openfile.fs - size = fs.size(url) - if size is None: - raise IOError(f"Cannot determine size of file {url}") # pragma: no cover - return int(size) - - -def get_file_offset(vfp: int) -> int: - """Convert a block compressed virtual file pointer to a file offset.""" - address_mask = 0xFFFFFFFFFFFF - return vfp >> 16 & address_mask - - -def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Optional[Any] = None) -> Any: - """Read bytes using a `struct` format string and return the unpacked data value. - - Parameters - ---------- - f : IO[Any] - The IO stream to read bytes from. - fmt : str - A Python `struct` format string. - nodata : Optional[Any], optional - The value to return in case there is no further data in the stream, by default None - - Returns - ------- - Any - The unpacked data value read from the stream. - """ - data = f.read(struct.calcsize(fmt)) - if not data: - return nodata - values = struct.Struct(fmt).unpack(data) - assert len(values) == 1 - return values[0] - - -def read_bytes_as_tuple(f: IO[Any], fmt: str) -> Sequence[Any]: - """Read bytes using a `struct` format string and return the unpacked data values. - - Parameters - ---------- - f : IO[Any] - The IO stream to read bytes from. - fmt : str - A Python `struct` format string. - - Returns - ------- - Sequence[Any] - The unpacked data values read from the stream. - """ - data = f.read(struct.calcsize(fmt)) - return struct.Struct(fmt).unpack(data) - - -def open_gzip(path: PathType, storage_options: Optional[Dict[str, str]]) -> IO[Any]: - url = str(path) - storage_options = storage_options or {} - openfile: IO[Any] = fsspec.open(url, compression="gzip", **storage_options) - return openfile - - -def url_filename(url: str) -> str: - """Extract the filename from a URL""" - filename: str = URL(url).name - return filename - - -def build_url(dir_url: PathType, child_path: str) -> str: - """Combine a URL for a directory with a child path""" - url = URL(str(dir_url)) - # the division (/) operator discards query and fragment, so add them back - return str((url / child_path).with_query(url.query).with_fragment(url.fragment)) - - -@contextmanager -def temporary_directory( - suffix: Optional[str] = None, - prefix: Optional[str] = None, - dir: Optional[PathType] = None, - storage_options: Optional[Dict[str, str]] = None, - retain_temp_files: Optional[bool] = None, -) -> Iterator[str]: - """Create a temporary directory in a fsspec filesystem. - - Parameters - ---------- - suffix : Optional[str], optional - If not None, the name of the temporary directory will end with that suffix. - prefix : Optional[str], optional - If not None, the name of the temporary directory will start with that prefix. - dir : Optional[PathType], optional - If not None, the temporary directory will be created in that directory, otherwise - the local filesystem directory returned by `tempfile.gettempdir()` will be used. - The directory may be specified as any fsspec URL. - storage_options : Optional[Dict[str, str]], optional - Any additional parameters for the storage backend (see `fsspec.open`). - retain_temp_files : Optional[bool], optional - If True, the temporary directory will not be removed on exiting the context manager. - Defaults to None, which means the directory will be removed. - Yields - ------- - Generator[str, None, None] - A context manager yielding the fsspec URL to the created directory. - """ - - # Fill in defaults - suffix = suffix or "" - prefix = prefix or "" - dir = dir or tempfile.gettempdir() - storage_options = storage_options or {} - - # Find the filesystem by looking at the URL scheme (protocol), empty means local filesystem - protocol = urlparse(str(dir)).scheme - fs = fsspec.filesystem(protocol, **storage_options) - - # Construct a random directory name - tempdir = build_url(dir, prefix + str(uuid.uuid4()) + suffix) - try: - fs.mkdir(tempdir) - yield tempdir - finally: - # Remove the temporary directory on exiting the context manager - if not retain_temp_files: - fs.rm(tempdir, recursive=True) - - -def get_default_vcf_encoding(ds, chunk_length, chunk_width, compressor): - # Enforce uniform chunks in the variants dimension - # Also chunk in the samples direction - def get_chunk_size(dim: Hashable, size: int) -> int: - if dim == "variants": - return chunk_length - elif dim == "samples": - return chunk_width - else: - # Avoid chunk size of 0 - return max(size, 1) - - default_encoding = {} - for var in ds.data_vars: - var_chunks = tuple( - get_chunk_size(dim, size) - for (dim, size) in zip(ds[var].dims, ds[var].shape) - ) - default_encoding[var] = dict(chunks=var_chunks, compressor=compressor) - - # Enable bit packing by default for boolean arrays - if ds[var].dtype.kind == "b": - default_encoding[var]["filters"] = [PackBits()] - - # Position is monotonically increasing (within a contig) so benefits from delta encoding - if var == "variant_position": - default_encoding[var]["filters"] = [Delta(ds[var].dtype)] - - return default_encoding - - -def merge_encodings( - default_encoding: Dict[str, Dict[str, Any]], overrides: Dict[str, Dict[str, Any]] -) -> Dict[str, Dict[str, Any]]: - """Merge a dictionary of dictionaries specifying encodings with another dictionary of dictionaries of overriding encodings. - - Parameters - ---------- - default_encoding : Dict[str, Dict[str, Any]] - The default encoding dictionary. - overrides : Dict[str, Dict[str, Any]] - A dictionary containing selective overrides. - - Returns - ------- - Dict[str, Dict[str, Any]] - The merged encoding dictionary - """ - merged = {} - for var, d in default_encoding.items(): - if var in overrides: - merged[var] = {**d, **overrides[var]} - else: - merged[var] = d - for var, d in overrides.items(): - if var not in merged: - merged[var] = d - return merged diff --git a/sgkit/io/vcf/vcf_partition.py b/sgkit/io/vcf/vcf_partition.py deleted file mode 100644 index c64e5c088..000000000 --- a/sgkit/io/vcf/vcf_partition.py +++ /dev/null @@ -1,204 +0,0 @@ -import warnings -from typing import Any, Dict, Optional, Sequence, Union - -import dask -import fsspec -import numpy as np -from cyvcf2 import VCF - -from sgkit.io.vcf.csi import CSI_EXTENSION, read_csi -from sgkit.io.vcf.tbi import TABIX_EXTENSION, read_tabix -from sgkit.io.vcf.utils import ceildiv, get_file_length -from sgkit.typing import PathType - - -def region_string(contig: str, start: int, end: Optional[int] = None) -> str: - if end is not None: - return f"{contig}:{start}-{end}" - else: - return f"{contig}:{start}-" - - -def get_tabix_path( - vcf_path: PathType, storage_options: Optional[Dict[str, str]] = None -) -> Optional[str]: - url = str(vcf_path) - storage_options = storage_options or {} - tbi_path = url + TABIX_EXTENSION - with fsspec.open(url, **storage_options) as openfile: - fs = openfile.fs - if fs.exists(tbi_path): - return tbi_path - else: - return None - - -def get_csi_path( - vcf_path: PathType, storage_options: Optional[Dict[str, str]] = None -) -> Optional[str]: - url = str(vcf_path) - storage_options = storage_options or {} - csi_path = url + CSI_EXTENSION - with fsspec.open(url, **storage_options) as openfile: - fs = openfile.fs - if fs.exists(csi_path): - return csi_path - else: - return None - - -def read_index( - index_path: PathType, storage_options: Optional[Dict[str, str]] = None -) -> Any: - url = str(index_path) - if url.endswith(TABIX_EXTENSION): - return read_tabix(url, storage_options=storage_options) - elif url.endswith(CSI_EXTENSION): - return read_csi(url, storage_options=storage_options) - else: - raise ValueError("Only .tbi or .csi indexes are supported.") - - -def get_sequence_names(vcf_path: PathType, index: Any) -> Any: - try: - # tbi stores sequence names - return index.sequence_names - except AttributeError: - # ... but csi doesn't, so fall back to the VCF header - return VCF(vcf_path).seqnames - - -def partition_into_regions( - vcf_path: PathType, - *, - index_path: Optional[PathType] = None, - num_parts: Optional[int] = None, - target_part_size: Union[None, int, str] = None, - storage_options: Optional[Dict[str, str]] = None, -) -> Optional[Sequence[str]]: - """ - Calculate genomic region strings to partition a compressed VCF or BCF file into roughly equal parts. - - .. deprecated:: 0.9.0 - Functions for reading VCF are deprecated, please use the `bio2zarr `_ package. - - A ``.tbi`` or ``.csi`` file is used to find BGZF boundaries in the compressed VCF file, which are then - used to divide the file into parts. - - The number of parts can specified directly by providing ``num_parts``, or by specifying the - desired size (in bytes) of each (compressed) part by providing ``target_part_size``. - Exactly one of ``num_parts`` or ``target_part_size`` must be provided. - - Both ``num_parts`` and ``target_part_size`` serve as hints: the number of parts and their sizes - may be more or less than these parameters. - - Parameters - ---------- - vcf_path - The path to the VCF file. - index_path - The path to the VCF index (``.tbi`` or ``.csi``), by default None. If not specified, the - index path is constructed by appending the index suffix (``.tbi`` or ``.csi``) to the VCF path. - num_parts - The desired number of parts to partition the VCF file into, by default None - target_part_size - The desired size, in bytes, of each (compressed) part of the partitioned VCF, by default None. - If the value is a string, it may be specified using standard abbreviations, e.g. ``100MB`` is - equivalent to ``100_000_000``. - storage_options: - Any additional parameters for the storage backend (see ``fsspec.open``). - - Returns - ------- - The region strings that partition the VCF file, or None if the VCF file should not be partitioned - (so there is only a single partition). - - Raises - ------ - ValueError - If neither of ``num_parts`` or ``target_part_size`` has been specified. - ValueError - If both of ``num_parts`` and ``target_part_size`` have been specified. - ValueError - If either of ``num_parts`` or ``target_part_size`` is not a positive integer. - """ - - warnings.warn( - "Functions for reading VCF are deprecated, please use the bio2zarr package.", - DeprecationWarning, - stacklevel=2, - ) - - if num_parts is None and target_part_size is None: - raise ValueError("One of num_parts or target_part_size must be specified") - - if num_parts is not None and target_part_size is not None: - raise ValueError("Only one of num_parts or target_part_size may be specified") - - if num_parts is not None and num_parts < 1: - raise ValueError("num_parts must be positive") - - if target_part_size is not None: - target_part_size_bytes: int = dask.utils.parse_bytes(target_part_size) - if target_part_size_bytes < 1: - raise ValueError("target_part_size must be positive") - - # Calculate the desired part file boundaries - file_length = get_file_length(vcf_path, storage_options=storage_options) - if num_parts is not None: - target_part_size_bytes = file_length // num_parts - elif target_part_size_bytes is not None: - num_parts = ceildiv(file_length, target_part_size_bytes) - if num_parts == 1: - return None - part_lengths = np.array([i * target_part_size_bytes for i in range(num_parts)]) - - if index_path is None: - index_path = get_tabix_path(vcf_path, storage_options=storage_options) - if index_path is None: - index_path = get_csi_path(vcf_path, storage_options=storage_options) - if index_path is None: - raise ValueError("Cannot find .tbi or .csi file.") - - # Get the file offsets from .tbi/.csi - index = read_index(index_path, storage_options=storage_options) - sequence_names = get_sequence_names(vcf_path, index) - file_offsets, region_contig_indexes, region_positions = index.offsets() - - # Search the file offsets to find which indexes the part lengths fall at - ind = np.searchsorted(file_offsets, part_lengths) - - # Drop any parts that are greater than the file offsets (these will be covered by a region with no end) - ind = np.delete(ind, ind >= len(file_offsets)) # type: ignore[no-untyped-call] - - # Drop any duplicates - ind = np.unique(ind) # type: ignore[no-untyped-call] - - # Calculate region contig and start for each index - region_contigs = region_contig_indexes[ind] - region_starts = region_positions[ind] - - # Build region query strings - regions = [] - for i in range(len(region_starts)): - contig = sequence_names[region_contigs[i]] - start = region_starts[i] - - if i == len(region_starts) - 1: # final region - regions.append(region_string(contig, start)) - else: - next_contig = sequence_names[region_contigs[i + 1]] - next_start = region_starts[i + 1] - end = next_start - 1 # subtract one since positions are inclusive - if next_contig == contig: # contig doesn't change - regions.append(region_string(contig, start, end)) - else: # contig changes, so need two regions (or possibly more if any sequences were skipped) - regions.append(region_string(contig, start)) - for ri in range(region_contigs[i] + 1, region_contigs[i + 1]): - regions.append(sequence_names[ri]) # pragma: no cover - regions.append(region_string(next_contig, 1, end)) - # Add any sequences at the end that were not skipped - for ri in range(region_contigs[-1] + 1, len(sequence_names)): - regions.append(sequence_names[ri]) # pragma: no cover - - return regions diff --git a/sgkit/io/vcf/vcf_reader.py b/sgkit/io/vcf/vcf_reader.py deleted file mode 100644 index 5de4f44f2..000000000 --- a/sgkit/io/vcf/vcf_reader.py +++ /dev/null @@ -1,1401 +0,0 @@ -import functools -import itertools -import re -import warnings -from contextlib import contextmanager -from dataclasses import dataclass -from pathlib import Path -from typing import ( - Any, - Callable, - Dict, - Iterator, - MutableMapping, - Optional, - Sequence, - Tuple, - Union, -) - -import dask -import fsspec -import numpy as np -import xarray as xr -import zarr -from cyvcf2 import VCF, Variant - -from sgkit import variables -from sgkit.io.dataset import load_dataset -from sgkit.io.utils import ( - CHAR_FILL, - CHAR_MISSING, - FLOAT32_FILL, - FLOAT32_MISSING, - INT_FILL, - INT_MISSING, - STR_FILL, - STR_MISSING, -) -from sgkit.io.vcf import partition_into_regions -from sgkit.io.vcf.utils import ( - build_url, - chunks, - get_default_vcf_encoding, - merge_encodings, - temporary_directory, - url_filename, -) -from sgkit.io.vcfzarr_reader import ( - concat_zarrs_optimized, - vcf_number_to_dimension_and_size, -) -from sgkit.model import ( - DIM_CONTIG, - DIM_FILTER, - DIM_PLOIDY, - DIM_SAMPLE, - DIM_VARIANT, - create_genotype_call_dataset, -) -from sgkit.typing import ArrayLike, DType, PathType -from sgkit.utils import smallest_numpy_int_dtype - -DEFAULT_MAX_ALT_ALLELES = ( - 3 # equivalent to DEFAULT_ALT_NUMBER in vcf_read.py in scikit_allel -) - -try: - from numcodecs import Blosc - - DEFAULT_COMPRESSOR = Blosc(cname="zstd", clevel=7, shuffle=Blosc.AUTOSHUFFLE) -except ImportError: # pragma: no cover - warnings.warn("Cannot import Blosc, falling back to no compression", RuntimeWarning) - DEFAULT_COMPRESSOR = None - -# From VCF fixed fields -RESERVED_VARIABLE_NAMES = [ - "variant_contig", - "variant_position", - "variant_id", - "variant_id_mask", - "variant_allele", - "variant_quality", - "variant_filter", -] - - -class FloatFormatFieldWarning(UserWarning): - """Warning for VCF FORMAT float fields, which can use a lot of storage.""" - - pass - - -class MaxAltAllelesExceededWarning(UserWarning): - """Warning when the number of alt alleles exceeds the maximum specified.""" - - pass - - -@contextmanager -def open_vcf(path: PathType) -> Iterator[VCF]: - """A context manager for opening a VCF file.""" - vcf = VCF(path) - try: - yield vcf - finally: - vcf.close() - - -def region_filter( - variants: Iterator[Variant], region: Optional[str] = None -) -> Iterator[Variant]: - """Filter out variants that don't start in the given region.""" - if region is None: - return variants - else: - start = get_region_start(region) - return itertools.filterfalse(lambda v: v.POS < start, variants) - - -def get_region_start(region: str) -> int: - """Return the start position of the region string.""" - if re.search(r":\d+-\d*$", region): - contig, start_end = region.rsplit(":", 1) - start, end = start_end.split("-") - else: - return 1 - return int(start) - - -def _get_vcf_field_defs(vcf: VCF, category: str) -> Dict[str, Any]: - """Get a dictionary of field definitions for a category (e.g. INFO or FORMAT) - from the VCF header.""" - return { - h["ID"]: h.info(extra=True) - for h in vcf.header_iter() - if h["HeaderType"] == category - } - - -def _normalize_fields(vcf: VCF, fields: Sequence[str]) -> Sequence[str]: - """Expand 'INFO/*' and 'FORMAT/*' to the full list of fields from the VCF header.""" - info_fields = [f"INFO/{key}" for key in _get_vcf_field_defs(vcf, "INFO").keys()] - format_fields = set( - [f"FORMAT/{key}" for key in _get_vcf_field_defs(vcf, "FORMAT").keys()] - ) - - new_fields = [] - for field in fields: - # genotype is handled specially - if field == "FORMAT/GT" and field not in format_fields: - continue - if not any(field.startswith(prefix) for prefix in ["INFO/", "FORMAT/"]): - raise ValueError("VCF field must be prefixed with 'INFO/' or 'FORMAT/'") - category = field.split("/")[0] - key = field[len(f"{category}/") :] - if field == "INFO/*": - new_fields.extend(info_fields) - elif field == "FORMAT/*": - new_fields.extend(format_fields) - else: - if field not in info_fields and field not in format_fields: - raise ValueError( - f"{category} field '{key}' is not defined in the header." - ) - new_fields.append(field) - return new_fields - - -def _vcf_type_to_numpy( - vcf_type: str, category: str, key: str -) -> Tuple[DType, Any, Any]: - """Convert the VCF Type to a NumPy dtype, missing value, and fill value.""" - if vcf_type == "Flag": - return "bool", False, False - elif vcf_type == "Integer": - return "i4", INT_MISSING, INT_FILL - # the VCF spec defines Float as 32 bit, and in BCF is stored as 32 bit - elif vcf_type == "Float": - return "f4", FLOAT32_MISSING, FLOAT32_FILL - elif vcf_type == "Character": - return "S1", CHAR_MISSING, CHAR_FILL - elif vcf_type == "String": - return "O", STR_MISSING, STR_FILL - raise ValueError( - f"{category} field '{key}' is defined as Type '{vcf_type}', which is not supported." - ) - - -def _is_str_or_char(array: ArrayLike) -> bool: - """Return True if the array is of string or character type""" - return array.dtype.kind in ("O", "S", "U") - - -class VcfFieldHandler: - """Converts a VCF field to a dataset variable.""" - - @classmethod - def for_field( - cls, - vcf: VCF, - field: str, - chunk_length: int, - ploidy: int, - mixed_ploidy: bool, - truncate_calls: bool, - max_alt_alleles: int, - field_def: Dict[str, Any], - ) -> "VcfFieldHandler": - if field == "FORMAT/GT": - return GenotypeFieldHandler( - vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls, max_alt_alleles - ) - category = field.split("/")[0] - vcf_field_defs = _get_vcf_field_defs(vcf, category) - key = field[len(f"{category}/") :] - vcf_number = field_def.get("Number", vcf_field_defs[key]["Number"]) - dimension, size = vcf_number_to_dimension_and_size( - vcf_number, category, key, field_def, ploidy, max_alt_alleles - ) - vcf_type = field_def.get("Type", vcf_field_defs[key]["Type"]) - description = field_def.get( - "Description", vcf_field_defs[key]["Description"].strip('"') - ) - dtype, missing_value, fill_value = _vcf_type_to_numpy(vcf_type, category, key) - chunksize: Tuple[int, ...] - if category == "INFO": - variable_name = f"variant_{key}" - dims = [DIM_VARIANT] - chunksize = (chunk_length,) - elif category == "FORMAT": - variable_name = f"call_{key}" - dims = [DIM_VARIANT, DIM_SAMPLE] - n_sample = len(vcf.samples) - chunksize = (chunk_length, n_sample) - if variable_name in RESERVED_VARIABLE_NAMES: - raise ValueError( - f"Generated name for INFO field '{key}' clashes with '{variable_name}' from fixed VCF fields." - ) - if dimension is not None: - dims.append(dimension) - chunksize += (size,) - - array = np.full(chunksize, fill_value, dtype=dtype) - - return InfoAndFormatFieldHandler( - category, - key, - variable_name, - description, - dims, - missing_value, - fill_value, - array, - ) - - def add_variant(self, i: int, variant: Any) -> None: - pass # pragma: no cover - - def truncate_array(self, length: int) -> None: - pass # pragma: no cover - - def update_dataset(self, ds: xr.Dataset) -> None: - pass # pragma: no cover - - -@dataclass -class InfoAndFormatFieldHandler(VcfFieldHandler): - """Converts a VCF INFO or FORMAT field to a dataset variable.""" - - category: str - key: str - variable_name: str - description: str - dims: Sequence[str] - missing_value: Any - fill_value: Any - array: ArrayLike - - def add_variant(self, i: int, variant: Any) -> None: - if self.category == "INFO": - val = variant.INFO.get(self.key, None) - self.array[i] = self.missing_value - if val is not None: - assert self.array.ndim in (1, 2) - if self.array.ndim == 1: - self.array[i] = val - elif self.array.ndim == 2: - self.array[i] = self.fill_value - if _is_str_or_char(self.array): # need to split strings - val = np.array(val.split(","), dtype=self.array.dtype) - try: - for j, v in enumerate(val): - self.array[i, j] = ( - v if v is not None else self.missing_value - ) - except TypeError: # val is a scalar - self.array[i, 0] = val - - elif self.category == "FORMAT": - val = variant.format(self.key) - if val is not None: - assert self.array.ndim in (2, 3) - if self.array.ndim == 2: - if _is_str_or_char(self.array): - self.array[i] = val - else: - self.array[i] = val[..., 0] - elif self.array.ndim == 3: - self.array[i] = self.fill_value - if _is_str_or_char(self.array): # need to split strings - for j, v in enumerate(val): - v = v.split(",") - if len(v) > self.array.shape[-1]: # pragma: no cover - v = v[: self.array.shape[-1]] - self.array[i, j, : len(v)] = v - else: - a = val - a = a[..., : self.array.shape[-1]] # trim to fit - self.array[i, ..., : a.shape[-1]] = a - else: - self.array[i] = self.missing_value - - def truncate_array(self, length: int) -> None: - self.array = self.array[:length] - - def update_dataset(self, ds: xr.Dataset) -> None: - # cyvcf2 represents missing Integer values as the minimum int32 value - # and fill as minimum int32 value + 1, so change these to our missing and fill values - if self.array.dtype == np.int32: - self.array[self.array == np.iinfo(np.int32).min] = INT_MISSING - self.array[self.array == np.iinfo(np.int32).min + 1] = INT_FILL - - ds[self.variable_name] = (self.dims, self.array) - if len(self.description) > 0: - ds[self.variable_name].attrs["comment"] = self.description - - -class GenotypeFieldHandler(VcfFieldHandler): - """Converts a FORMAT/GT field to a dataset variable.""" - - def __init__( - self, - vcf: VCF, - chunk_length: int, - ploidy: int, - mixed_ploidy: bool, - truncate_calls: bool, - max_alt_alleles: int, - ) -> None: - n_sample = len(vcf.samples) - self.ploidy = ploidy - self.mixed_ploidy = mixed_ploidy - self.truncate_calls = truncate_calls - self.max_alt_alleles = max_alt_alleles - self.fill = -2 if self.mixed_ploidy else -1 - self.call_genotype = np.full( - (chunk_length, n_sample, ploidy), - self.fill, - dtype=smallest_numpy_int_dtype(max_alt_alleles), - ) - self.call_genotype_phased = np.full((chunk_length, n_sample), 0, dtype=bool) - - def add_variant(self, i: int, variant: Any) -> None: - if variant.genotype is not None: - gt = variant.genotype.array(fill=self.fill) - gt_length = gt.shape[-1] - 1 # final element indicates phasing - if (gt_length > self.ploidy) and not self.truncate_calls: - raise ValueError("Genotype call longer than ploidy.") - n = min(self.call_genotype.shape[-1], gt_length) - self.call_genotype[i, ..., 0:n] = gt[..., 0:n] - self.call_genotype_phased[i] = gt[..., -1] - - def truncate_array(self, length: int) -> None: - self.call_genotype = self.call_genotype[:length] - self.call_genotype_phased = self.call_genotype_phased[:length] - - def update_dataset(self, ds: xr.Dataset) -> None: - # set any calls that exceed maximum number of alt alleles as missing - self.call_genotype[self.call_genotype > self.max_alt_alleles] = -1 - - ds["call_genotype"] = ( - [DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], - self.call_genotype, - { - "comment": variables.call_genotype_spec.__doc__.strip(), - "mixed_ploidy": self.mixed_ploidy, - }, - ) - ds["call_genotype_mask"] = ( - [DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], - self.call_genotype < 0, - {"comment": variables.call_genotype_mask_spec.__doc__.strip()}, - ) - if self.mixed_ploidy is True: - ds["call_genotype_fill"] = ( - [DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], - self.call_genotype < -1, - {"comment": variables.call_genotype_fill_spec.__doc__.strip()}, - ) - ds["call_genotype_phased"] = ( - [DIM_VARIANT, DIM_SAMPLE], - self.call_genotype_phased, - {"comment": variables.call_genotype_phased_spec.__doc__.strip()}, - ) - - -def vcf_to_zarr_sequential( - input: PathType, - output: Union[PathType, MutableMapping[str, bytes]], - region: Optional[str] = None, - chunk_length: int = 10_000, - chunk_width: int = 1_000, - compressor: Optional[Any] = DEFAULT_COMPRESSOR, - encoding: Optional[Any] = None, - ploidy: int = 2, - mixed_ploidy: bool = False, - truncate_calls: bool = False, - max_alt_alleles: int = DEFAULT_MAX_ALT_ALLELES, - fields: Optional[Sequence[str]] = None, - exclude_fields: Optional[Sequence[str]] = None, - field_defs: Optional[Dict[str, Dict[str, Any]]] = None, - read_chunk_length: Optional[int] = None, -) -> None: - if read_chunk_length is None: - read_chunk_length = chunk_length - with open_vcf(input) as vcf: - sample_id = np.array(vcf.samples, dtype="O") - n_allele = max_alt_alleles + 1 - - variant_contig_names = vcf.seqnames - - filters = [ - h["ID"] - for h in vcf.header_iter() - if h["HeaderType"] == "FILTER" and isinstance(h["ID"], str) - ] - # Ensure PASS is the first filter if present - if "PASS" in filters: - filters.remove("PASS") - filters.insert(0, "PASS") - - # Remember max lengths of variable-length strings - max_alt_alleles_seen = 0 - - # Iterate through variants in batches of read_chunk_length - - if region is None: - variants = vcf - else: - variants = vcf(region) - - variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names)) - variant_contig = np.empty(read_chunk_length, dtype=variant_contig_dtype) - variant_position = np.empty(read_chunk_length, dtype="i4") - - fields = fields or ["FORMAT/GT"] # default to GT as the only extra field - fields = _normalize_fields(vcf, fields) - exclude_fields = exclude_fields or [] - exclude_fields = _normalize_fields(vcf, exclude_fields) - fields = [f for f in fields if f not in exclude_fields] - field_defs = field_defs or {} - field_handlers = [ - VcfFieldHandler.for_field( - vcf, - field, - read_chunk_length, - ploidy, - mixed_ploidy, - truncate_calls, - max_alt_alleles, - field_defs.get(field, {}), - ) - for field in fields - ] - - first_variants_chunk = True - for variants_chunk in chunks( - region_filter(variants, region), read_chunk_length - ): - variant_ids = [] - variant_alleles = [] - variant_quality = np.empty(read_chunk_length, dtype="f4") - variant_filter = np.full( - (read_chunk_length, len(filters)), False, dtype="bool" - ) - - i = -1 # initialize in case of empty variants_chunk - for i, variant in enumerate(variants_chunk): - variant_id = variant.ID if variant.ID is not None else "." - variant_ids.append(variant_id) - try: - variant_contig[i] = variant_contig_names.index(variant.CHROM) - except ValueError: - raise ValueError( - f"Contig '{variant.CHROM}' is not defined in the header." - ) - variant_position[i] = variant.POS - - alleles = [variant.REF] + variant.ALT - max_alt_alleles_seen = max(max_alt_alleles_seen, len(variant.ALT)) - if len(alleles) > n_allele: - alleles = alleles[:n_allele] - elif len(alleles) < n_allele: - alleles = alleles + ([STR_FILL] * (n_allele - len(alleles))) - variant_alleles.append(alleles) - - variant_quality[i] = ( - variant.QUAL if variant.QUAL is not None else FLOAT32_MISSING - ) - try: - for f in variant.FILTERS: - variant_filter[i][filters.index(f)] = True - except ValueError: - raise ValueError(f"Filter '{f}' is not defined in the header.") - for field_handler in field_handlers: - field_handler.add_variant(i, variant) - - # Truncate np arrays (if last chunk is smaller than read_chunk_length) - if i + 1 < read_chunk_length: - variant_contig = variant_contig[: i + 1] - variant_position = variant_position[: i + 1] - variant_quality = variant_quality[: i + 1] - variant_filter = variant_filter[: i + 1] - - for field_handler in field_handlers: - field_handler.truncate_array(i + 1) - - variant_id = np.array(variant_ids, dtype="O") - variant_id_mask = variant_id == "." - if len(variant_alleles) == 0: - variant_allele = np.empty((0, n_allele), dtype="O") - else: - variant_allele = np.array(variant_alleles, dtype="O") - - ds: xr.Dataset = create_genotype_call_dataset( - variant_contig_names=variant_contig_names, - variant_contig=variant_contig, - variant_position=variant_position, - variant_allele=variant_allele, - sample_id=sample_id, - variant_id=variant_id, - ) - ds["variant_id_mask"] = ( - [DIM_VARIANT], - variant_id_mask, - ) - ds["variant_quality"] = ([DIM_VARIANT], variant_quality) - ds["variant_filter"] = ([DIM_VARIANT, DIM_FILTER], variant_filter) - ds.attrs["filters"] = filters - ds["filter_id"] = ([DIM_FILTER], np.array(filters, dtype="O")) - ds.attrs["vcf_zarr_version"] = "0.2" - ds.attrs["vcf_header"] = vcf.raw_header - try: - ds.attrs["contig_lengths"] = vcf.seqlens - ds["contig_length"] = ([DIM_CONTIG], np.array(vcf.seqlens)) - except AttributeError: - pass - - for field_handler in field_handlers: - field_handler.update_dataset(ds) - ds.attrs["max_alt_alleles_seen"] = max_alt_alleles_seen - - if first_variants_chunk: - # limit chunk width to actual number of samples seen in first chunk - if ds.sizes["samples"] > 0: - chunk_width = min(chunk_width, ds.sizes["samples"]) - - # ensure that booleans are not stored as int8 by xarray https://github.com/pydata/xarray/issues/4386 - for var in ds.data_vars: - if ds[var].dtype.kind == "b": - ds[var].attrs["dtype"] = "bool" - - # values from function args (encoding) take precedence over default_encoding - default_encoding = get_default_vcf_encoding( - ds, chunk_length, chunk_width, compressor - ) - encoding = encoding or {} - merged_encoding = merge_encodings(default_encoding, encoding) - - for var in ds.data_vars: - # Issue warning for VCF FORMAT float fields with no filter - if ( - var.startswith("call_") - and ds[var].dtype == np.float32 - and ( - var not in merged_encoding - or "filters" not in merged_encoding[var] - ) - ): - warnings.warn( - f"Storing call variable {var} (FORMAT field) as a float can result in large file sizes. " - f"Consider setting the encoding filters for this variable to FixedScaleOffset or similar.", - FloatFormatFieldWarning, - ) - - ds.to_zarr(output, mode="w", encoding=merged_encoding) - first_variants_chunk = False - else: - # Append along the variants dimension - ds.to_zarr(output, append_dim=DIM_VARIANT) - - -def vcf_to_zarr_parallel( - input: Union[PathType, Sequence[PathType]], - output: Union[PathType, MutableMapping[str, bytes]], - regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]], - chunk_length: int = 10_000, - chunk_width: int = 1_000, - compressor: Optional[Any] = DEFAULT_COMPRESSOR, - encoding: Optional[Any] = None, - temp_chunk_length: Optional[int] = None, - tempdir: Optional[PathType] = None, - tempdir_storage_options: Optional[Dict[str, str]] = None, - ploidy: int = 2, - mixed_ploidy: bool = False, - truncate_calls: bool = False, - max_alt_alleles: int = DEFAULT_MAX_ALT_ALLELES, - fields: Optional[Sequence[str]] = None, - exclude_fields: Optional[Sequence[str]] = None, - field_defs: Optional[Dict[str, Dict[str, Any]]] = None, - read_chunk_length: Optional[int] = None, - retain_temp_files: Optional[bool] = None, -) -> None: - """Convert specified regions of one or more VCF files to zarr files, then concat, rechunk, write to zarr""" - - if temp_chunk_length is None: - temp_chunk_length = chunk_length - - with temporary_directory( - prefix="vcf_to_zarr_", - dir=tempdir, - storage_options=tempdir_storage_options, - retain_temp_files=retain_temp_files, - ) as tmpdir: - paths = vcf_to_zarrs( - input, - tmpdir, - regions, - temp_chunk_length, - chunk_width, - compressor, - encoding, - tempdir_storage_options, - ploidy=ploidy, - mixed_ploidy=mixed_ploidy, - truncate_calls=truncate_calls, - max_alt_alleles=max_alt_alleles, - fields=fields, - exclude_fields=exclude_fields, - field_defs=field_defs, - read_chunk_length=read_chunk_length, - ) - - concat_zarrs( - paths, - output, - storage_options=tempdir_storage_options, - chunk_length=chunk_length, - ) - - -def vcf_to_zarrs( - input: Union[PathType, Sequence[PathType]], - output: PathType, - regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]], - chunk_length: int = 10_000, - chunk_width: int = 1_000, - compressor: Optional[Any] = DEFAULT_COMPRESSOR, - encoding: Optional[Any] = None, - output_storage_options: Optional[Dict[str, str]] = None, - ploidy: int = 2, - mixed_ploidy: bool = False, - truncate_calls: bool = False, - max_alt_alleles: int = DEFAULT_MAX_ALT_ALLELES, - fields: Optional[Sequence[str]] = None, - exclude_fields: Optional[Sequence[str]] = None, - field_defs: Optional[Dict[str, Dict[str, Any]]] = None, - read_chunk_length: Optional[int] = None, -) -> Sequence[str]: - """Convert VCF files to multiple Zarr on-disk stores, one per region. - - .. deprecated:: 0.9.0 - Functions for reading VCF are deprecated, please use the `bio2zarr `_ package. - - Parameters - ---------- - input - A path (or paths) to the input BCF or VCF file (or files). VCF files should - be compressed and have a ``.tbi`` or ``.csi`` index file. BCF files should - have a ``.csi`` index file. - output - Path to directory containing the multiple Zarr output stores. - regions - Genomic region or regions to extract variants for. For multiple inputs, multiple - input regions are specified as a sequence of values which may be None, or a - sequence of region strings. - chunk_length - Length (number of variants) of chunks in which data are stored, by default 10,000. - chunk_width - Width (number of samples) to use when storing chunks in output, by default 1,000. - compressor - Zarr compressor, by default Blosc + zstd with compression level 7 and auto-shuffle. - No compression is used when set as None. - encoding - Variable-specific encodings for xarray, specified as a nested dictionary with - variable names as keys and dictionaries of variable specific encodings as values. - Can be used to override Zarr compressor and filters on a per-variable basis, - e.g., ``{"call_genotype": {"compressor": Blosc("zstd", 9)}}``. - output_storage_options - Any additional parameters for the storage backend, for the output (see ``fsspec.open``). - ploidy - The (maximum) ploidy of genotypes in the VCF file. - mixed_ploidy - If True, genotype calls with fewer alleles than the specified ploidy will be padded - with the fill (non-allele) sentinel value of -2. If false, calls with fewer alleles than - the specified ploidy will be treated as incomplete and will be padded with the - missing-allele sentinel value of -1. - truncate_calls - If True, genotype calls with more alleles than the specified (maximum) ploidy value - will be truncated to size ploidy. If false, calls with more alleles than the - specified ploidy will raise an exception. - max_alt_alleles - The (maximum) number of alternate alleles in the VCF file. Any records with more than - this number of alternate alleles will have the extra alleles dropped (the `variant_allele` - variable will be truncated). Any call genotype fields with the extra alleles will - be changed to the missing-allele sentinel value of -1. - fields - Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes. - Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``. - field_defs - Per-field information that overrides the field definitions in the VCF header, or - provides extra information needed in the dataset representation. Definitions - are a represented as a dictionary whose keys are the field names, and values are - dictionaries with any of the following keys: ``Number``, ``Type``, ``Description``, - ``dimension``. The first three correspond to VCF header values, and ``dimension`` is - the name of the final dimension in the array for the case where ``Number`` is a fixed - integer larger than 1. For example, - ``{"INFO/AC": {"Number": "A"}, "FORMAT/HQ": {"dimension": "haplotypes"}}`` - overrides the ``INFO/AC`` field to be Number ``A`` (useful if the VCF defines it as - having variable length with ``.``), and names the final dimension of the ``HQ`` array - (which is defined as Number 2 in the VCF header) as ``haplotypes``. - (Note that Number ``A`` is the number of alternate alleles, see section 1.4.2 of the - VCF spec https://samtools.github.io/hts-specs/VCFv4.3.pdf.) - read_chunk_length - Length (number of variants) of chunks to read from the VCF file at a time. Use this - option to reduce memory usage by using a value lower than ``chunk_length`` with a small - cost in extra run time. The increase in runtime becomes higher as the ratio of - ``read_chunk_length`` to ``chunk_length`` decreases. Defaults to ``None``, which - means that a value equal to ``chunk_length`` is used. The memory usage of the - conversion process is proportional to ``read_chunk_length*n_samples*(1+n_ploidy)`` - so this option is mainly useful for very large numbers of samples and/or where a - large ``chunk_size`` is desirable to reduce the number of dask tasks needed in - downstream analysis. - - Returns - ------- - A list of URLs to the Zarr outputs. - """ - - warnings.warn( - "Functions for reading VCF are deprecated, please use the bio2zarr package.", - DeprecationWarning, - stacklevel=2, - ) - - output_storage_options = output_storage_options or {} - - tasks = [] - parts = [] - for input, input_region_list in zip_input_and_regions(input, regions): - filename = url_filename(str(input)) - if input_region_list is None: - # single partition case: make a list so the loop below works - input_region_list = [None] # type: ignore - for r, region in enumerate(input_region_list): - part_url = build_url(str(output), f"{filename}/part-{r}.zarr") - output_part = fsspec.get_mapper(part_url, **output_storage_options) - parts.append(part_url) - task = dask.delayed(vcf_to_zarr_sequential)( - input, - output=output_part, - region=region, - chunk_length=chunk_length, - chunk_width=chunk_width, - read_chunk_length=read_chunk_length, - compressor=compressor, - encoding=encoding, - ploidy=ploidy, - mixed_ploidy=mixed_ploidy, - truncate_calls=truncate_calls, - max_alt_alleles=max_alt_alleles, - fields=fields, - exclude_fields=exclude_fields, - field_defs=field_defs, - ) - tasks.append(task) - dask.compute(*tasks) - return parts - - -def concat_zarrs( - urls: Sequence[str], - output: Union[PathType, MutableMapping[str, bytes]], - *, - storage_options: Optional[Dict[str, str]] = None, - chunk_length: Optional[int] = None, -) -> None: - """Concatenate multiple Zarr stores into a single Zarr store. - - .. deprecated:: 0.9.0 - Functions for reading VCF are deprecated, please use the `bio2zarr `_ package. - - The Zarr stores are concatenated and rechunked to produce a single combined store. - - Parameters - ---------- - urls - A list of URLs to the Zarr stores to combine, typically the return value of - :func:`vcf_to_zarrs`. - output - Zarr store or path to directory in file system. - storage_options - Any additional parameters for the storage backend (see ``fsspec.open``). - chunk_length - The length of the variant dimension chunks in the output Zarr store. If not specified, - the chunk length of the first input Zarr store is used. - """ - - warnings.warn( - "Functions for reading VCF are deprecated, please use the bio2zarr package.", - DeprecationWarning, - stacklevel=2, - ) - - vars_to_rechunk = [] - vars_to_copy = [] - storage_options = storage_options or {} - ds = xr.open_zarr( # type: ignore[no-untyped-call] - fsspec.get_mapper(urls[0], **storage_options), concat_characters=False - ) - for var, arr in ds.data_vars.items(): - if arr.dims[0] == "variants": - vars_to_rechunk.append(var) - else: - vars_to_copy.append(var) - - concat_zarrs_optimized( - urls, output, vars_to_rechunk, vars_to_copy, chunk_length=chunk_length - ) - - -def vcf_to_zarr( - input: Union[PathType, Sequence[PathType]], - output: Union[PathType, MutableMapping[str, bytes]], - *, - target_part_size: Union[None, int, str] = "auto", - regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]] = None, - chunk_length: int = 10_000, - chunk_width: int = 1_000, - compressor: Optional[Any] = DEFAULT_COMPRESSOR, - encoding: Optional[Any] = None, - temp_chunk_length: Optional[int] = None, - tempdir: Optional[PathType] = None, - tempdir_storage_options: Optional[Dict[str, str]] = None, - ploidy: int = 2, - mixed_ploidy: bool = False, - truncate_calls: bool = False, - max_alt_alleles: int = DEFAULT_MAX_ALT_ALLELES, - fields: Optional[Sequence[str]] = None, - exclude_fields: Optional[Sequence[str]] = None, - field_defs: Optional[Dict[str, Dict[str, Any]]] = None, - read_chunk_length: Optional[int] = None, - retain_temp_files: Optional[bool] = None, -) -> None: - """Convert VCF files to a single Zarr on-disk store. - - .. deprecated:: 0.9.0 - Functions for reading VCF are deprecated, please use the `bio2zarr `_ package. - - By default, the conversion is carried out in parallel, by writing the output for each - part to a separate, intermediate Zarr store in ``tempdir``. Then, in a second step - the intermediate outputs are concatenated and rechunked into the final output Zarr - store in ``output``. - - Conversion is carried out sequentially if ``target_part_size`` is None, and ``regions`` - is None. - - For more control over these two steps, consider using :func:`vcf_to_zarrs` followed by - :func:`concat_zarrs`. - - Parameters - ---------- - input - A path (or paths) to the input BCF or VCF file (or files). VCF files should - be compressed and have a ``.tbi`` or ``.csi`` index file. BCF files should - have a ``.csi`` index file. - output - Zarr store or path to directory in file system. - target_part_size - The desired size, in bytes, of each (compressed) part of the input to be - processed in parallel. Defaults to ``"auto"``, which will pick a good size - (currently 20MB). A value of None means that the input will be processed - sequentially. The setting will be ignored if ``regions`` is also specified. - regions - Genomic region or regions to extract variants for. For multiple inputs, multiple - input regions are specified as a sequence of values which may be None, or a - sequence of region strings. Takes priority over ``target_part_size`` if both - are not None. - chunk_length - Length (number of variants) of chunks in which data are stored, by default 10,000. - chunk_width - Width (number of samples) to use when storing chunks in output, by default 1,000. - compressor - Zarr compressor, by default Blosc + zstd with compression level 7 and auto-shuffle. - No compression is used when set as None. - encoding - Variable-specific encodings for xarray, specified as a nested dictionary with - variable names as keys and dictionaries of variable specific encodings as values. - Can be used to override Zarr compressor and filters on a per-variable basis, - e.g., ``{"call_genotype": {"compressor": Blosc("zstd", 9)}}``. - temp_chunk_length - Length (number of variants) of chunks for temporary intermediate files. Set this - to be smaller than ``chunk_length`` to avoid memory errors when loading files with - very large numbers of samples. Must be evenly divisible into ``chunk_length``. - Defaults to ``chunk_length`` if not set. - tempdir - Temporary directory where intermediate files are stored. The default None means - use the system default temporary directory. - tempdir_storage_options: - Any additional parameters for the storage backend for tempdir (see ``fsspec.open``). - ploidy - The (maximum) ploidy of genotypes in the VCF file. - mixed_ploidy - If True, genotype calls with fewer alleles than the specified ploidy will be padded - with the fill (non-allele) sentinel value of -2. If false, calls with fewer alleles than - the specified ploidy will be treated as incomplete and will be padded with the - missing-allele sentinel value of -1. - truncate_calls - If True, genotype calls with more alleles than the specified (maximum) ploidy value - will be truncated to size ploidy. If false, calls with more alleles than the - specified ploidy will raise an exception. - max_alt_alleles - The (maximum) number of alternate alleles in the VCF file. Any records with more than - this number of alternate alleles will have the extra alleles dropped (the `variant_allele` - variable will be truncated). Any call genotype fields with the extra alleles will - be changed to the missing-allele sentinel value of -1. - fields - Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes. - Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``. - field_defs - Per-field information that overrides the field definitions in the VCF header, or - provides extra information needed in the dataset representation. Definitions - are a represented as a dictionary whose keys are the field names, and values are - dictionaries with any of the following keys: ``Number``, ``Type``, ``Description``, - ``dimension``. The first three correspond to VCF header values, and ``dimension`` is - the name of the final dimension in the array for the case where ``Number`` is a fixed - integer larger than 1. For example, - ``{"INFO/AC": {"Number": "A"}, "FORMAT/HQ": {"dimension": "haplotypes"}}`` - overrides the ``INFO/AC`` field to be Number ``A`` (useful if the VCF defines it as - having variable length with ``.``), and names the final dimension of the ``HQ`` array - (which is defined as Number 2 in the VCF header) as ``haplotypes``. - (Note that Number ``A`` is the number of alternate alleles, see section 1.4.2 of the - VCF spec https://samtools.github.io/hts-specs/VCFv4.3.pdf.) - read_chunk_length - Length (number of variants) of chunks to read from the VCF file at a time. Use this - option to reduce memory usage by using a value lower than ``chunk_length`` with a small - cost in extra run time. The increase in runtime becomes higher as the ratio of - ``read_chunk_length`` to Defaults to ``None``, which means that a value equal - to ``chunk_length`` is used. The memory usage of the conversion process is - proportional to ``read_chunk_length*n_samples*(1+n_ploidy)`` so this option is - mainly useful for very large numbers of samples and/or where a large ``chunk_size`` - is desirable to reduce the number of dask tasks needed in downstream analysis. - retain_temp_files - If True, intermediate files are retained after the final output is written. Defaults - to deleting intermediate files. Intermediate files are deleted in a single process, - so for large VCF files this can be slow. - """ - - warnings.warn( - "Functions for reading VCF are deprecated, please use the bio2zarr package.", - DeprecationWarning, - stacklevel=2, - ) - - if temp_chunk_length is not None: - if chunk_length % temp_chunk_length != 0: - raise ValueError( - f"Temporary chunk length in variant dimension ({temp_chunk_length}) " - f"must evenly divide target chunk length {chunk_length}" - ) - - # all arguments except input and region/regions - sequential_function = functools.partial( - vcf_to_zarr_sequential, - output=output, - chunk_length=chunk_length, - chunk_width=chunk_width, - read_chunk_length=read_chunk_length, - compressor=compressor, - encoding=encoding, - ploidy=ploidy, - mixed_ploidy=mixed_ploidy, - truncate_calls=truncate_calls, - max_alt_alleles=max_alt_alleles, - fields=fields, - exclude_fields=exclude_fields, - field_defs=field_defs, - ) - parallel_function = functools.partial( - vcf_to_zarr_parallel, - output=output, - chunk_length=chunk_length, - chunk_width=chunk_width, - read_chunk_length=read_chunk_length, - compressor=compressor, - encoding=encoding, - ploidy=ploidy, - mixed_ploidy=mixed_ploidy, - truncate_calls=truncate_calls, - max_alt_alleles=max_alt_alleles, - fields=fields, - exclude_fields=exclude_fields, - field_defs=field_defs, - temp_chunk_length=temp_chunk_length, - tempdir=tempdir, - tempdir_storage_options=tempdir_storage_options, - retain_temp_files=retain_temp_files, - ) - process_vcfs( - input, - sequential_function, - parallel_function, - regions=regions, - target_part_size=target_part_size, - ) - - # Issue a warning if max_alt_alleles caused data to be dropped - ds = zarr.open(output) - max_alt_alleles_seen = ds.attrs["max_alt_alleles_seen"] - if max_alt_alleles_seen > max_alt_alleles: - warnings.warn( - f"Some alternate alleles were dropped, since actual max value {max_alt_alleles_seen} exceeded max_alt_alleles setting of {max_alt_alleles}.", - MaxAltAllelesExceededWarning, - ) - - -def read_vcf( - input: Union[PathType, Sequence[PathType]], - *, - target_part_size: Union[None, int, str] = "auto", - regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]] = None, - chunk_length: int = 10_000, - chunk_width: int = 1_000, - compressor: Optional[Any] = DEFAULT_COMPRESSOR, - encoding: Optional[Any] = None, - temp_chunk_length: Optional[int] = None, - tempdir: Optional[PathType] = None, - tempdir_storage_options: Optional[Dict[str, str]] = None, - ploidy: int = 2, - mixed_ploidy: bool = False, - truncate_calls: bool = False, - max_alt_alleles: int = DEFAULT_MAX_ALT_ALLELES, - fields: Optional[Sequence[str]] = None, - exclude_fields: Optional[Sequence[str]] = None, - field_defs: Optional[Dict[str, Dict[str, Any]]] = None, -) -> xr.Dataset: - """Read VCF dataset. - - .. deprecated:: 0.9.0 - Functions for reading VCF are deprecated, please use the `bio2zarr `_ package. - - A convenience for :func:`vcf_to_zarr` followed by :func:`sgkit.load_dataset`. - Note that the output Zarr store in ``tempdir`` is not deleted after this function - returns, so must be deleted manually by the user. - - Refer to :func:`vcf_to_zarr` for details and limitations. - - Parameters - ---------- - input - A path (or paths) to the input BCF or VCF file (or files). VCF files should - be compressed and have a ``.tbi`` or ``.csi`` index file. BCF files should - have a ``.csi`` index file. - target_part_size - The desired size, in bytes, of each (compressed) part of the input to be - processed in parallel. Defaults to ``"auto"``, which will pick a good size - (currently 20MB). A value of None means that the input will be processed - sequentially. The setting will be ignored if ``regions`` is also specified. - regions - Genomic region or regions to extract variants for. For multiple inputs, multiple - input regions are specified as a sequence of values which may be None, or a - sequence of region strings. Takes priority over ``target_part_size`` if both - are not None. - chunk_length - Length (number of variants) of chunks in which data are stored, by default 10,000. - chunk_width - Width (number of samples) to use when storing chunks in output, by default 1,000. - compressor - Zarr compressor, by default Blosc + zstd with compression level 7 and auto-shuffle. - No compression is used when set as None. - encoding - Variable-specific encodings for xarray, specified as a nested dictionary with - variable names as keys and dictionaries of variable specific encodings as values. - Can be used to override Zarr compressor and filters on a per-variable basis, - e.g., ``{"call_genotype": {"compressor": Blosc("zstd", 9)}}``. - temp_chunk_length - Length (number of variants) of chunks for temporary intermediate files. Set this - to be smaller than ``chunk_length`` to avoid memory errors when loading files with - very large numbers of samples. Must be evenly divisible into ``chunk_length``. - Defaults to ``chunk_length`` if not set. - tempdir - Temporary directory where intermediate files are stored. The default None means - use the system default temporary directory. - tempdir_storage_options: - Any additional parameters for the storage backend for tempdir (see ``fsspec.open``). - ploidy - The (maximum) ploidy of genotypes in the VCF file. - mixed_ploidy - If True, genotype calls with fewer alleles than the specified ploidy will be padded - with the fill (non-allele) sentinel value of -2. If false, calls with fewer alleles than - the specified ploidy will be treated as incomplete and will be padded with the - missing-allele sentinel value of -1. - truncate_calls - If True, genotype calls with more alleles than the specified (maximum) ploidy value - will be truncated to size ploidy. If false, calls with more alleles than the - specified ploidy will raise an exception. - max_alt_alleles - The (maximum) number of alternate alleles in the VCF file. Any records with more than - this number of alternate alleles will have the extra alleles dropped (the `variant_allele` - variable will be truncated). Any call genotype fields with the extra alleles will - be changed to the missing-allele sentinel value of -1. - fields - Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes. - Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``. - field_defs - Per-field information that overrides the field definitions in the VCF header, or - provides extra information needed in the dataset representation. Definitions - are a represented as a dictionary whose keys are the field names, and values are - dictionaries with any of the following keys: ``Number``, ``Type``, ``Description``, - ``dimension``. The first three correspond to VCF header values, and ``dimension`` is - the name of the final dimension in the array for the case where ``Number`` is a fixed - integer larger than 1. For example, - ``{"INFO/AC": {"Number": "A"}, "FORMAT/HQ": {"dimension": "haplotypes"}}`` - overrides the ``INFO/AC`` field to be Number ``A`` (useful if the VCF defines it as - having variable length with ``.``), and names the final dimension of the ``HQ`` array - (which is defined as Number 2 in the VCF header) as ``haplotypes``. - (Note that Number ``A`` is the number of alternate alleles, see section 1.4.2 of the - VCF spec https://samtools.github.io/hts-specs/VCFv4.3.pdf.) - - """ - - warnings.warn( - "Functions for reading VCF are deprecated, please use the bio2zarr package.", - DeprecationWarning, - stacklevel=2, - ) - - # Need to retain zarr file backing the returned dataset - with temporary_directory( - prefix="read_vcf_", - suffix=".zarr", - dir=tempdir, - storage_options=tempdir_storage_options, - retain_temp_files=True, - ) as output: - vcf_to_zarr( - input, - output, - target_part_size=target_part_size, - regions=regions, - chunk_length=chunk_length, - chunk_width=chunk_width, - compressor=compressor, - encoding=encoding, - temp_chunk_length=temp_chunk_length, - tempdir=tempdir, - tempdir_storage_options=tempdir_storage_options, - ploidy=ploidy, - mixed_ploidy=mixed_ploidy, - truncate_calls=truncate_calls, - max_alt_alleles=max_alt_alleles, - fields=fields, - exclude_fields=exclude_fields, - field_defs=field_defs, - ) - return load_dataset(output) - - -def count_variants(path: PathType, region: Optional[str] = None) -> int: - """Count the number of variants in a VCF file.""" - with open_vcf(path) as vcf: - if region is not None: - vcf = vcf(region) - return sum(1 for _ in region_filter(vcf, region)) - - -def zarr_array_sizes( - input: Union[PathType, Sequence[PathType]], - *, - regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]] = None, - target_part_size: Union[None, int, str] = "auto", -) -> Dict[str, Any]: - """Make a pass through a VCF/BCF file to determine sizes for storage in Zarr. - - .. deprecated:: 0.9.0 - Functions for reading VCF are deprecated, please use the `bio2zarr `_ package. - - By default, the input is processed in parts in parallel. However, if the input - is a single file, ``target_part_size`` is None, and ``regions`` is None, - then the operation will be carried out sequentially. - - Parameters - ---------- - input - A path (or paths) to the input BCF or VCF file (or files). VCF files should - be compressed and have a ``.tbi`` or ``.csi`` index file. BCF files should - have a ``.csi`` index file. - target_part_size - The desired size, in bytes, of each (compressed) part of the input to be - processed in parallel. Defaults to ``"auto"``, which will pick a good size - (currently 20MB). A value of None means that the input will be processed - sequentially. The setting will be ignored if ``regions`` is also specified. - regions - Genomic region or regions to extract variants for. For multiple inputs, multiple - input regions are specified as a sequence of values which may be None, or a - sequence of region strings. Takes priority over ``target_part_size`` if both - are not None. - """ - - warnings.warn( - "Functions for reading VCF are deprecated, please use the bio2zarr package.", - DeprecationWarning, - stacklevel=2, - ) - - return process_vcfs( - input, - zarr_array_sizes_sequential, - zarr_array_sizes_parallel, - regions=regions, - target_part_size=target_part_size, - ) - - -def zarr_array_sizes_sequential( - input: PathType, region: Optional[str] = None -) -> Dict[str, Any]: - with open_vcf(input) as vcf: - ploidy = -1 - alt_alleles = 0 - - info = _get_vcf_field_defs(vcf, "INFO") - info_field_defs = { - key: {"Number": 1} for key in info.keys() if info[key]["Number"] == "." - } - - format = _get_vcf_field_defs(vcf, "FORMAT") - format_field_defs = { - key: {"Number": 1} for key in format.keys() if format[key]["Number"] == "." - } - - if region is None: - variants = vcf - else: - variants = vcf(region) - - for variant in region_filter(variants, region): - for key, val in info_field_defs.items(): - field_val = variant.INFO.get(key) - if field_val is not None: - try: - val["Number"] = max(val["Number"], len(field_val)) - except TypeError: - pass # single value - - for key, val in format_field_defs.items(): - field_val = variant.format(key) - if field_val is not None: - if _is_str_or_char(field_val): # need to split strings - m = max([len(v.split(",")) for v in field_val]) - val["Number"] = max(val["Number"], m) - else: - val["Number"] = max(val["Number"], field_val.shape[-1]) - - try: - if variant.genotype is not None: - ploidy = max(ploidy, variant.genotype.ploidy) - except ( - Exception - ): # cyvcf2 raises an Exception "couldn't get genotypes for variant" - pass # no genotype information - alt_alleles = max(alt_alleles, len(variant.ALT)) - - field_defs = {} - for key, val in info_field_defs.items(): - field_defs[f"INFO/{key}"] = val - for key, val in format_field_defs.items(): - field_defs[f"FORMAT/{key}"] = val - - kwargs: Dict[str, Any] = {"max_alt_alleles": alt_alleles} - if len(field_defs) > 0: - kwargs["field_defs"] = field_defs - if ploidy > -1: - kwargs["ploidy"] = ploidy - return kwargs - - -def zarr_array_sizes_parallel( - input: Union[PathType, Sequence[PathType]], - regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]], -) -> Dict[str, Any]: - tasks = [] - for input, input_region_list in zip_input_and_regions(input, regions): - if input_region_list is None: - # single partition case: make a list so the loop below works - input_region_list = [None] # type: ignore - for region in input_region_list: - task = dask.delayed(zarr_array_sizes_sequential)( - input, - region=region, - ) - tasks.append(task) - all_kwargs = dask.compute(*tasks) - return merge_zarr_array_sizes(all_kwargs) - - -def merge_zarr_array_sizes(all_kwargs: Sequence[Dict[str, Any]]): - """Merge a sequence of size kwargs using the largest size found in any of them.""" - - max_alt_alleles = max([kwargs["max_alt_alleles"] for kwargs in all_kwargs]) - ploidy = max([kwargs.get("ploidy", -1) for kwargs in all_kwargs]) - - field_defs = {} - if len(all_kwargs) > 0 and "field_defs" in all_kwargs[0]: - for key in all_kwargs[0]["field_defs"].keys(): - number = max([kwargs["field_defs"][key]["Number"] for kwargs in all_kwargs]) - field_defs[key] = {"Number": number} - - kwargs: Dict[str, Any] = {"max_alt_alleles": max_alt_alleles} - if len(field_defs) > 0: - kwargs["field_defs"] = field_defs - if ploidy > -1: - kwargs["ploidy"] = ploidy - return kwargs - - -def process_vcfs( - input: Union[PathType, Sequence[PathType]], - sequential_function: Callable, - parallel_function: Callable, - *, - regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]] = None, - target_part_size: Union[None, int, str] = "auto", -) -> Any: - """A helper function to process VCFs in region chunks, using a sequential function - for single file and single region input, and a parallel function otherwise.""" - if regions is None and target_part_size is not None: - if target_part_size == "auto": - target_part_size = "20MB" - if isinstance(input, str) or isinstance(input, Path): - regions = partition_into_regions(input, target_part_size=target_part_size) - else: - # Multiple inputs - inputs = input - regions = [ - partition_into_regions(input, target_part_size=target_part_size) - for input in inputs - ] - - if (isinstance(input, str) or isinstance(input, Path)) and ( - regions is None or isinstance(regions, str) - ): - return sequential_function(input=input, region=regions) - else: - return parallel_function(input=input, regions=regions) - - -def zip_input_and_regions( - input: Union[PathType, Sequence[PathType]], - regions: Union[None, Sequence[str], Sequence[Optional[Sequence[str]]]], -) -> Any: - if isinstance(input, str) or isinstance(input, Path): - # Single input - inputs: Sequence[PathType] = [input] - assert regions is not None # this would just be sequential case - input_regions: Sequence[Optional[Sequence[str]]] = [regions] # type: ignore - else: - # Multiple inputs - inputs = input - if regions is None: - input_regions = [None] * len(inputs) - else: - if len(regions) == 0 or isinstance(regions[0], str): - raise ValueError( - f"For multiple inputs, multiple input regions must be a sequence of sequence of strings: {regions}" - ) - input_regions = regions - - assert len(inputs) == len(input_regions) - - return zip(inputs, input_regions) diff --git a/sgkit/io/vcf/vcf_writer.py b/sgkit/io/vcf/vcf_writer.py deleted file mode 100644 index 0cc512fe1..000000000 --- a/sgkit/io/vcf/vcf_writer.py +++ /dev/null @@ -1,635 +0,0 @@ -import io -import re -from contextlib import ExitStack -from pathlib import Path -from typing import MutableMapping, Optional, TextIO, Union - -import numpy as np -from xarray import Dataset - -from sgkit import load_dataset -from sgkit.io.utils import FLOAT32_MISSING -from sgkit.io.vcf.vcf_reader import RESERVED_VARIABLE_NAMES -from sgkit.io.vcf.vcf_writer_utils import ( - byte_buf_to_str, - create_mask, - interleave, - vcf_fixed_to_byte_buf, - vcf_fixed_to_byte_buf_size, - vcf_format_missing_to_byte_buf, - vcf_format_names_to_byte_buf, - vcf_format_names_to_byte_buf_size, - vcf_genotypes_to_byte_buf, - vcf_genotypes_to_byte_buf_size, - vcf_info_to_byte_buf, - vcf_info_to_byte_buf_size, - vcf_values_to_byte_buf, - vcf_values_to_byte_buf_size, -) -from sgkit.model import get_contigs, get_filters -from sgkit.typing import PathType - -# references to the VCF spec are for https://samtools.github.io/hts-specs/VCFv4.3.pdf - -# [Table 1: Reserved INFO keys] -RESERVED_INFO_KEY_DESCRIPTIONS = { - "AA": "Ancestral allele", - "AC": "Allele count in genotypes, for each ALT allele, in the same order as listed", - "AD": "Total read depth for each allele", - "ADF": "Read depth for each allele on the forward strand", - "ADR": "Read depth for each allele on the reverse strand", - "AF": "Allele frequency for each ALT allele in the same order as listed", - "AN": "Total number of alleles in called genotypes", - "BQ": "RMS base quality", - "CIGAR": "Cigar string describing how to align an alternate allele to the reference allele", - "DB": "dbSNP membership", - "DP": "Combined depth across samples", - "END": "End position on CHROM", - "H2": "HapMap2 membership", - "H3": "HapMap3 membership", - "MQ": "RMS mapping quality", - "MQ0": "Number of MAPQ == 0 reads", - "NS": "Number of samples with data", - "SB": "Strand bias", - "SOMATIC": "Somatic mutation", - "VALIDATED": "Validated by follow-up experiment", - "1000G": "1000 Genomes membership", -} - -# [Table 2: Reserved genotype keys] -RESERVED_FORMAT_KEY_DESCRIPTIONS = { - "AD": "Read depth for each allele", - "ADF": "Read depth for each allele on the forward strand", - "ADR": "Read depth for each allele on the reverse strand", - "DP": "Read depth", - "EC": "Expected alternate allele counts", - "FT": 'Filter indicating if this genotype was "called"', - "GL": "Genotype likelihoods", - "GP": "Genotype posterior probabilities", - "GQ": "Conditional genotype quality", - "GT": "Genotype", - "HQ": "Haplotype quality", - "MQ": "RMS mapping quality", - "PL": "Phred-scaled genotype likelihoods rounded to the closest integer", - "PP": "Phred-scaled genotype posterior probabilities rounded to the closest integer", - "PQ": "Phasing quality", - "PS": "Phase set", -} - - -def write_vcf( - input: Dataset, output: Union[PathType, TextIO], *, vcf_header: Optional[str] = None -) -> None: - """Convert a dataset to a VCF file. - - The VCF header to use is dictated by either the ``vcf_header`` parameter or the - ``vcf_header`` attribute on the input dataset. - - If specified, the ``vcf_header`` parameter will be used, and any variables in the dataset - that are not in this header will not be included in the output. - - If the ``vcf_header`` parameter is left as the default (`None`) and a ``vcf_header`` - attribute is present in the dataset (such as one created by :func:`vcf_to_zarr`), - it will be used to generate the new VCF header. In this case, any variables in the - dataset that are not specified in this header will have corresponding header lines - added, and any lines in the header without a corresponding variable in the dataset - will be omitted. - - In the case of no ``vcf_header`` parameter or attribute, a VCF header will - be generated, and will include all variables in the dataset. - - Float fields are written with up to 3 decimal places of precision. - Exponent/scientific notation is *not* supported, so values less than - ``5e-4`` will be rounded to zero. - - Data is written sequentially to VCF, using Numba to optimize the write - throughput speed. Speeds in the region of 100 MB/s have been observed on - an Apple M1 machine from 2020. - - Data is loaded into memory in chunks sized according to the chunking along - the variants dimension. Chunking in other dimensions (such as samples) is - ignored for the purposes of writing VCF. If the dataset is not chunked - (because it does not originate from Zarr or Dask, for example), then it - will all be loaded into memory at once. - - The output is *not* compressed or indexed. It is therefore recommended to - post-process the output using external tools such as ``bgzip(1)``, - ``bcftools(1)``, or ``tabix(1)``. - - This example shows how to convert a Zarr dataset to bgzip-compressed VCF by - writing it to standard output then applying an external compressor:: - - python -c 'import sys; from sgkit.io.vcf import zarr_to_vcf; zarr_to_vcf("in.zarr", sys.stdout)' - | bgzip > out.vcf.gz - - Parameters - ---------- - input - Dataset to convert to VCF. - output - A path or text file object that the output VCF should be written to. - vcf_header - The VCF header to use (including the line starting with ``#CHROM``). If None, then - a header will be generated from the dataset ``vcf_header`` attribute (if present), - or from scratch otherwise. - """ - - with ExitStack() as stack: - if isinstance(output, str) or isinstance(output, Path): - output = stack.enter_context(open(output, mode="w")) - - if vcf_header is None: - if "vcf_header" in input.attrs: - original_header = input.attrs["vcf_header"] - else: - original_header = None - vcf_header = _generate_header(input, original_header) - - print(vcf_header, end="", file=output) - - if input.sizes["variants"] == 0: - return - - header_info_fields = _info_fields(vcf_header) - header_format_fields = _format_fields(vcf_header) - - contigs = get_contigs(input).astype("S") - filters = get_filters(input) - - if filters is None: - filters = np.array(["PASS"], dtype="S") - else: - filters = filters.astype("S") - - for ds in _variant_chunks(input): - dataset_chunk_to_vcf( - ds, header_info_fields, header_format_fields, contigs, filters, output - ) - - -def dataset_chunk_to_vcf( - ds, header_info_fields, header_format_fields, contigs, filters, output -): - # write a dataset chunk as VCF, with no header - - ds = ds.load() # load dataset chunk into memory - - n_variants = ds.sizes["variants"] # number of variants in this chunk - n_samples = ds.sizes["samples"] # number of samples in whole dataset - - # fixed fields - - chrom = ds.variant_contig.values - pos = ds.variant_position.values - id = ( - ds.variant_id.values.astype("S") - if "variant_id" in ds - else np.full((n_variants), ".", dtype="S") - ) - alleles = ds.variant_allele.values.astype("S") - qual = ( - ds.variant_quality.values - if "variant_quality" in ds - else np.full((n_variants), FLOAT32_MISSING, dtype=np.float32) - ) - filter_ = ( - ds.variant_filter.values - if "variant_filter" in ds - else np.full((n_variants, len(filters)), False, dtype=bool) - ) - - # info fields - - # preconvert all info fields to byte representations - info_bufs = [] - info_mask = np.full((len(header_info_fields), n_variants), False, dtype=bool) - info_indexes = np.zeros((len(header_info_fields), n_variants + 1), dtype=np.int32) - - k = 0 - info_prefixes = [] # field names followed by '=' (except for flag/bool types) - for key in header_info_fields: - var = f"variant_{key}" - if var not in ds: - continue - if ds[var].dtype == bool: - values = ds[var].values - info_mask[k] = create_mask(values) - info_bufs.append(np.zeros(0, dtype=np.uint8)) - # info_indexes contains zeros so nothing is written for flag/bool - info_prefixes.append(key) - k += 1 - else: - values = ds[var].values - if values.dtype.kind == "O": - values = values.astype("S") # convert to fixed-length strings - info_mask[k] = create_mask(values) - info_bufs.append( - np.empty(vcf_values_to_byte_buf_size(values), dtype=np.uint8) - ) - vcf_values_to_byte_buf(info_bufs[k], 0, values, info_indexes[k]) - info_prefixes.append(key + "=") - k += 1 - - info_mask = info_mask[:k] - info_indexes = info_indexes[:k] - - info_prefixes = np.array(info_prefixes, dtype="S") - - # format fields - - # these can have different sizes for different fields, so store in sequences - format_values = [] - format_bufs = [] - - format_mask = np.full((len(header_format_fields), n_variants), False, dtype=bool) - - k = 0 - format_fields = [] - has_gt = False - for key in header_format_fields: - var = "call_genotype" if key == "GT" else f"call_{key}" - if var not in ds: - continue - if key == "GT": - values = ds[var].values - format_mask[k] = create_mask(values) - format_values.append(values) - format_bufs.append( - np.empty(vcf_genotypes_to_byte_buf_size(values[0]), dtype=np.uint8) - ) - format_fields.append(key) - has_gt = True - k += 1 - else: - values = ds[var].values - if values.dtype.kind == "O": - values = values.astype("S") # convert to fixed-length strings - format_mask[k] = create_mask(values) - format_values.append(values) - format_bufs.append( - np.empty(vcf_values_to_byte_buf_size(values[0]), dtype=np.uint8) - ) - format_fields.append(key) - k += 1 - - format_mask = format_mask[:k] - - # indexes are all the same size (number of samples) so store in a single array - format_indexes = np.empty((len(format_values), n_samples + 1), dtype=np.int32) - - if "call_genotype_phased" in ds: - call_genotype_phased = ds["call_genotype_phased"].values - else: - call_genotype_phased = np.full((n_variants, n_samples), False, dtype=bool) - - format_names = np.array(format_fields, dtype="S") - - n_header_format_fields = len(header_format_fields) - - buf_size = ( - vcf_fixed_to_byte_buf_size(contigs, id, alleles, filters) - + vcf_info_to_byte_buf_size(info_prefixes, *info_bufs) - + vcf_format_names_to_byte_buf_size(format_names) - + sum(len(format_buf) for format_buf in format_bufs) - ) - - buf = np.empty(buf_size, dtype=np.uint8) - - for i in range(n_variants): - # fixed fields - p = vcf_fixed_to_byte_buf( - buf, 0, i, contigs, chrom, pos, id, alleles, qual, filters, filter_ - ) - - # info fields - p = vcf_info_to_byte_buf( - buf, - p, - i, - info_indexes, - info_mask, - info_prefixes, - *info_bufs, - ) - - # format fields - # convert each format field to bytes separately (for a variant), then interleave - # note that we can't numba jit this logic since format_values has different types, and - # we can't pass non-homogeneous tuples of format_values to numba - if n_header_format_fields > 0: - p = vcf_format_names_to_byte_buf(buf, p, i, format_mask, format_names) - - n_format_fields = np.sum(~format_mask[:, i]) - - if n_format_fields == 0: # all samples are missing - p = vcf_format_missing_to_byte_buf(buf, p, n_samples) - elif n_format_fields == 1: # fast path if only one format field - for k in range(len(format_values)): - # if format k is not present for variant i, then skip it - if format_mask[k, i]: - continue - if k == 0 and has_gt: - p = vcf_genotypes_to_byte_buf( - buf, - p, - format_values[0][i], - call_genotype_phased[i], - format_indexes[0], - ord("\t"), - ) - else: - p = vcf_values_to_byte_buf( - buf, - p, - format_values[k][i], - format_indexes[k], - ord("\t"), - ) - break - else: - for k in range(len(format_values)): - # if format k is not present for variant i, then skip it - if format_mask[k, i]: - continue - if k == 0 and has_gt: - vcf_genotypes_to_byte_buf( - format_bufs[0], - 0, - format_values[0][i], - call_genotype_phased[i], - format_indexes[0], - ) - else: - vcf_values_to_byte_buf( - format_bufs[k], - 0, - format_values[k][i], - format_indexes[k], - ) - - p = interleave( - buf, - p, - format_indexes, - format_mask[:, i], - ord(":"), - ord("\t"), - *format_bufs, - ) - - s = byte_buf_to_str(buf[:p]) - print(s, file=output) - - -def zarr_to_vcf( - input: Union[PathType, MutableMapping[str, bytes]], - output: Union[PathType, TextIO], - *, - vcf_header: Optional[str] = None, -) -> None: - """Convert a Zarr file to a VCF file. - - A convenience for :func:`sgkit.load_dataset` followed by :func:`write_vcf`. - - Refer to :func:`write_vcf` for details and limitations. - - Parameters - ---------- - input - Zarr store or path to directory in file system. - output - A path or text file object that the output VCF should be written to. - vcf_header - The VCF header to use (including the line starting with ``#CHROM``). If None, then - a header will be generated from the dataset ``vcf_header`` attribute (if present), - or from scratch otherwise. - """ - - ds = load_dataset(input) - write_vcf(ds, output, vcf_header=vcf_header) - - -def _generate_header(ds, original_header): - output = io.StringIO() - - contigs = ds.attrs["contigs"].copy() - filters = ds.attrs["filters"].copy() if "filters" in ds.attrs else [] - info_fields = [] - format_fields = [] - - if "call_genotype" in ds: - # GT must be the first field if present, per the spec (section 1.6.2) - format_fields.append("GT") - - for var, arr in ds.data_vars.items(): - if ( - var.startswith("variant_") - and not var.endswith("_fill") - and not var.endswith("_mask") - and var not in RESERVED_VARIABLE_NAMES - and arr.dims[0] == "variants" - ): - key = var[len("variant_") :] - info_fields.append(key) - elif ( - var.startswith("call_") - and not var.endswith("_fill") - and not var.endswith("_mask") - and arr.dims[0] == "variants" - and arr.dims[1] == "samples" - ): - key = var[len("call_") :] - if key in ("genotype", "genotype_phased"): - continue - format_fields.append(key) - - if original_header is None: # generate entire header - # [1.4.1 File format] - print("##fileformat=VCFv4.3", file=output) - - print('##FILTER=', file=output) - - if "source" in ds.attrs: - print(f'##source={ds.attrs["source"]}', file=output) - - else: # use original header fields where appropriate - unstructured_pattern = re.compile("##([^=]+)=([^<].*)") - structured_pattern = re.compile("##([^=]+)=(<.*)") - - for line in original_header.split("\n"): - if re.fullmatch(unstructured_pattern, line): - print(line, file=output) - else: - match = re.fullmatch(structured_pattern, line) - if match: - category = match.group(1) - id_pattern = re.compile("ID=([^,>]+)") - key = id_pattern.findall(line)[0] - if category not in ("contig", "FILTER", "INFO", "FORMAT"): - # output other structured fields - print(line, file=output) - # only output certain categories if in dataset - elif category == "contig" and key in contigs: - contigs.remove(key) - print(line, file=output) - elif category == "FILTER" and key in filters: - filters.remove(key) - print(line, file=output) - elif category == "INFO" and key in info_fields: - info_fields.remove(key) - print(line, file=output) - elif category == "FORMAT" and key in format_fields: - format_fields.remove(key) - print(line, file=output) - - # add all fields that are not in the original header - # or all fields if there was no original header - - # [1.4.2 Information field format] - for key in info_fields: - arr = ds[f"variant_{key}"] - category = "INFO" - vcf_number = _array_to_vcf_number(category, key, arr) - vcf_type = _array_to_vcf_type(arr) - if "comment" in arr.attrs: - vcf_description = arr.attrs["comment"] - else: - vcf_description = RESERVED_INFO_KEY_DESCRIPTIONS.get(key, "") - print( - f'##INFO=', - file=output, - ) - - # [1.4.3 Filter field format] - for filter in filters: - print(f'##FILTER=', file=output) - - # [1.4.4 Individual format field format] - for key in format_fields: - if key == "GT": - print( - '##FORMAT=', - file=output, - ) - else: - arr = ds[f"call_{key}"] - category = "FORMAT" - vcf_number = _array_to_vcf_number(category, key, arr) - vcf_type = _array_to_vcf_type(arr) - if "comment" in arr.attrs: - vcf_description = arr.attrs["comment"] - else: - vcf_description = RESERVED_FORMAT_KEY_DESCRIPTIONS.get(key, "") - print( - f'##FORMAT=', - file=output, - ) - - # [1.4.7 Contig field format] - contig_lengths = ( - ds.attrs["contig_lengths"] if "contig_lengths" in ds.attrs else None - ) - for i, contig in enumerate(contigs): - if contig_lengths is None: - print(f"##contig=", file=output) - else: - print(f"##contig=", file=output) - - # [1.5 Header line syntax] - print( - "#CHROM", - "POS", - "ID", - "REF", - "ALT", - "QUAL", - "FILTER", - "INFO", - sep="\t", - end="", - file=output, - ) - - if len(ds.sample_id) > 0: - print(end="\t", file=output) - print("FORMAT", *ds.sample_id.values, sep="\t", file=output) - else: - print(file=output) - - return output.getvalue() - - -def _array_to_vcf_number(category, key, a): - # reverse of vcf_number_to_dimension_and_size - if a.dtype == bool: - return 0 - elif category == "INFO" and len(a.dims) == 1: - return 1 - elif category == "FORMAT" and len(a.dims) == 2: - return 1 - - last_dim = a.dims[-1] - if last_dim == "alt_alleles": - return "A" - elif last_dim == "alleles": - return "R" - elif last_dim == "genotypes": - return "G" - elif last_dim == f"{category}_{key}_dim": - return a.shape[-1] - else: - raise ValueError( - f"Cannot determine VCF Number for dimension name '{last_dim}' in {a}" - ) - - -def _array_to_vcf_type(a): - # reverse of _vcf_type_to_numpy - if a.dtype == bool: - return "Flag" - elif np.issubdtype(a.dtype, np.integer): - return "Integer" - elif np.issubdtype(a.dtype, np.float32): - return "Float" - elif a.dtype.str == "|S1": - return "Character" - elif a.dtype.kind in ("O", "S", "U"): - return "String" - else: - raise ValueError(f"Unsupported dtype: {a.dtype}") - - -def _info_fields(header_str): - p = re.compile("ID=([^,>]+)") - return [ - p.findall(line)[0] - for line in header_str.split("\n") - if line.startswith("##INFO=") - ] - - -def _format_fields(header_str): - p = re.compile("ID=([^,>]+)") - fields = [ - p.findall(line)[0] - for line in header_str.split("\n") - if line.startswith("##FORMAT=") - ] - # GT must be the first field if present, per the spec (section 1.6.2) - if "GT" in fields: - fields.remove("GT") - fields.insert(0, "GT") - return fields - - -def _variant_chunks(ds): - # generator for chunks of ds in the variants dimension - chunks = ds.variant_contig.chunksizes - if "variants" not in chunks: - yield ds - else: - offset = 0 - for chunk in chunks["variants"]: - ds_chunk = ds.isel(variants=slice(offset, offset + chunk)) - yield ds_chunk - offset += chunk diff --git a/sgkit/io/vcf/vcf_writer_utils.py b/sgkit/io/vcf/vcf_writer_utils.py deleted file mode 100644 index 5fe023e30..000000000 --- a/sgkit/io/vcf/vcf_writer_utils.py +++ /dev/null @@ -1,649 +0,0 @@ -"""Utility numba-jitted functions for converting array values to their VCF representations. - -Many functions in this module take a bytes buffer argument, ``buf``, which should be a NumPy array of type ``uint8``, -and an integer index into the buffer, ``p``. -""" -import numpy as np - -from sgkit.accelerate import numba_jit -from sgkit.io.utils import ( - FLOAT32_FILL_AS_INT32, - FLOAT32_MISSING_AS_INT32, - INT_FILL, - INT_MISSING, -) - -COLON = ord(":") -COMMA = ord(",") -DOT = ord(".") -EQUALS = ord("=") -MINUS = ord("-") -SEMICOLON = ord(";") -TAB = ord("\t") -ZERO = ord("0") - -PHASED = ord("|") -UNPHASED = ord("/") - -INF = np.array(["inf"], dtype="S") -NAN = np.array(["nan"], dtype="S") - -INT32_BUF_SIZE = len(str(np.iinfo(np.int32).min)) -FLOAT32_BUF_SIZE = INT32_BUF_SIZE + 4 # integer followed by '.' and 3 decimal places - -STR_MISSING_BYTE = b"." -STR_FILL_BYTE = b"" - - -@numba_jit(boundscheck=True) -def itoa(buf, p, value): # pragma: no cover - """Convert an int32 value to its decimal representation. - - Parameters - ---------- - buf - A 1D NumPy array to write to. - p - The index in the array to start writing at. - value - The integer value to convert. - - Returns - ------- - The position in the buffer after the last byte written. - """ - if value < 0: - buf[p] = MINUS - p += 1 - value = -value - # special case small values - if value < 10: - buf[p] = value + ZERO - p += 1 - else: - # this is significantly faster than `k = math.floor(math.log10(value))` - if value < 100: - k = 1 - elif value < 1000: - k = 2 - elif value < 10000: - k = 3 - elif value < 100000: - k = 4 - elif value < 1000000: - k = 5 - elif value < 10000000: - k = 6 - elif value < 100000000: - k = 7 - elif value < 1000000000: - k = 8 - elif value < 10000000000: - k = 9 - else: - # exceeds int32 - raise ValueError("itoa only supports 32-bit integers") - - # iterate backwards in buf - p += k - buf[p] = (value % 10) + ZERO - for _ in range(k): - p -= 1 - value = value // 10 - buf[p] = (value % 10) + ZERO - p += k + 1 - - return p - - -@numba_jit(boundscheck=True) -def ftoa(buf, p, value): # pragma: no cover - """Convert a float32 value to its decimal representation, with up to 3 decimal places. - - Parameters - ---------- - buf - A 1D NumPy array to write to. - p - The index in the array to start writing at. - value - The integer value to convert. - - Returns - ------- - The position in the buffer after the last byte written. - """ - if np.isnan(value): - return copy(buf, p, NAN[0]) - if value < 0: - buf[p] = MINUS - p += 1 - value = -value - if np.isinf(value): - return copy(buf, p, INF[0]) - - # integer part - p = itoa(buf, p, int(np.around(value, 3))) - - # fractional part - i = int(np.around(value * 1000)) - d3 = i % 10 - d2 = (i / 10) % 10 - d1 = (i / 100) % 10 - if d1 + d2 + d3 > 0: - buf[p] = DOT - p += 1 - buf[p] = d1 + ZERO - p += 1 - if d2 + d3 > 0: - buf[p] = d2 + ZERO - p += 1 - if d3 > 0: - buf[p] = d3 + ZERO - p += 1 - - return p - - -@numba_jit(boundscheck=True) -def copy(buf, p, value): # pragma: no cover - """Copy the values from one array to another. - - Parameters - ---------- - buf - A 1D NumPy array to write to. - p - The index in the array to start writing at. - value - The byte values to copy. - - Returns - ------- - The position in the buffer after the last byte written. - """ - for i in range(len(value)): - buf[p] = value[i] - p += 1 - return p - - -def byte_buf_to_str(a): - """Convert a NumPy array of bytes to a Python string""" - return memoryview(a).tobytes().decode() - - -@numba_jit(boundscheck=True) -def vcf_fixed_to_byte_buf( - buf, p, i, contigs, chrom, pos, id, alleles, qual, filters, filter_ -): # pragma: no cover - # CHROM - contig = contigs[chrom[i]] - p = copy(buf, p, contig) - buf[p] = TAB - p += 1 - - # POS - p = itoa(buf, p, pos[i]) - buf[p] = TAB - p += 1 - - # ID - p = copy(buf, p, id[i]) - buf[p] = TAB - p += 1 - - # REF - ref = alleles[i][0] - p = copy(buf, p, ref) - buf[p] = TAB - p += 1 - - # ALT - n_alt = 0 - for k, alt in enumerate(alleles[i][1:]): - if len(alt) > 0: - p = copy(buf, p, alt) - buf[p] = COMMA - p += 1 - n_alt += 1 - if n_alt > 0: - p -= 1 # remove last alt separator - else: - buf[p] = DOT - p += 1 - buf[p] = TAB - p += 1 - - # QUAL - if np.array(qual[i], dtype=np.float32).view(np.int32) == FLOAT32_MISSING_AS_INT32: - buf[p] = DOT - p += 1 - else: - p = ftoa(buf, p, qual[i]) - buf[p] = TAB - p += 1 - - # FILTER - if np.all(~filter_[i]): - buf[p] = DOT - p += 1 - else: - n_filter = 0 - for k, present in enumerate(filter_[i]): - if present: - p = copy(buf, p, filters[k]) - buf[p] = SEMICOLON - p += 1 - n_filter += 1 - if n_filter > 0: - p -= 1 # remove last filter separator - buf[p] = TAB - p += 1 - - return p - - -def vcf_fixed_to_byte_buf_size(contigs, id, alleles, filters): - buf_size = 0 - - # CHROM - buf_size += contigs.dtype.itemsize - buf_size += 1 # TAB - - # POS - buf_size += INT32_BUF_SIZE - buf_size += 1 # TAB - - # ID - buf_size += id.dtype.itemsize - buf_size += 1 # TAB - - # REF ALT - buf_size += alleles.shape[1] * (alleles.dtype.itemsize + 1) - buf_size += 1 # TAB - - # QUAL - buf_size += FLOAT32_BUF_SIZE - buf_size += 1 # TAB - - # FILTER - buf_size += len(filters) * (filters.dtype.itemsize + 1) - buf_size += 1 # TAB - - return buf_size - - -def vcf_values_to_byte_buf(buf, p, a, indexes, separator=-1): - """Convert an array of VCF values to their string representations. - - Parameters - ---------- - buf - A 1D NumPy array to write to. - p - The index in the array to start writing at. - a - The 1D or 2D array of values, which must have an integer, float or string dtype. - Missing and fill values are converted appropriately. - indexes - An integer array that is updated to contain the start positions of each value - written to the buffer, plus the end position after the last character written. - This is used in the ``interleave`` function. It must have size ``a.size + 1``. - separator - For a 1D array, values are separated by the optional ``separator`` (default empty). - For a 2D array, values in each row are separated by commas, and rows are separated - by the optional ``separator`` (default empty). - - Returns - ------- - The position in the buffer after the last byte written. - """ - if a.dtype in (np.int8, np.int16, np.int32): - return vcf_ints_to_byte_buf(buf, p, a, indexes, separator=separator) - elif a.dtype == np.float32: - return vcf_floats_to_byte_buf(buf, p, a, indexes, separator=separator) - elif a.dtype.kind == "S": - return vcf_strings_to_byte_buf(buf, p, a, indexes, separator=separator) - else: - raise ValueError(f"Unsupported dtype: {a.dtype}") - - -def vcf_values_to_byte_buf_size(a): - if a.dtype in (np.int8, np.int16, np.int32): - # values + separators - return a.size * INT32_BUF_SIZE + a.size - elif a.dtype == np.float32: - # values + separators - return a.size * FLOAT32_BUF_SIZE + a.size - elif a.dtype.kind == "S": - # values + separators - return a.size * a.dtype.itemsize + a.size - else: - raise ValueError(f"Unsupported dtype: {a.dtype}") - - -@numba_jit(boundscheck=True) -def vcf_ints_to_byte_buf(buf, p, a, indexes, separator=-1): # pragma: no cover - n = 0 # total number of strings - if a.ndim == 1: - for i in range(a.shape[0]): - indexes[n] = p - if a[i] == INT_MISSING: - buf[p] = DOT - p += 1 - else: - p = itoa(buf, p, a[i]) - if separator != -1: - buf[p] = separator - p += 1 - n += 1 - elif a.ndim == 2: - for i in range(a.shape[0]): - indexes[n] = p - for j in range(a.shape[1]): - if a[i, j] == INT_MISSING: - buf[p] = DOT - p += 1 - elif a[i, j] == INT_FILL: - if j == 0: # virtual comma that will be erased - p += 1 - break - else: - p = itoa(buf, p, a[i, j]) - buf[p] = COMMA - p += 1 - p -= 1 - n += 1 - if separator != -1: - buf[p] = separator - p += 1 - else: - raise ValueError("Array must have dimension 1 or 2") - if separator != -1: # remove last separator - p -= 1 - indexes[n] = p # add index for end - return p - - -@numba_jit(boundscheck=True) -def vcf_floats_to_byte_buf(buf, p, a, indexes, separator=-1): # pragma: no cover - n = 0 # total number of strings - ai = a.view(np.int32) - if a.ndim == 1: - for i in range(a.shape[0]): - indexes[n] = p - if ai[i] == FLOAT32_MISSING_AS_INT32: - buf[p] = DOT - p += 1 - else: - p = ftoa(buf, p, a[i]) - if separator != -1: - buf[p] = separator - p += 1 - n += 1 - elif a.ndim == 2: - for i in range(a.shape[0]): - indexes[n] = p - for j in range(a.shape[1]): - if ai[i, j] == FLOAT32_MISSING_AS_INT32: - buf[p] = DOT - p += 1 - elif ai[i, j] == FLOAT32_FILL_AS_INT32: - if j == 0: # virtual comma that will be erased - p += 1 - break - else: - p = ftoa(buf, p, a[i, j]) - buf[p] = COMMA - p += 1 - p -= 1 - n += 1 - if separator != -1: - buf[p] = separator - p += 1 - else: - raise ValueError("Array must have dimension 1 or 2") - if separator != -1: # remove last separator - p -= 1 - indexes[n] = p # add index for end - return p - - -@numba_jit(boundscheck=True) -def vcf_strings_to_byte_buf(buf, p, a, indexes, separator=-1): # pragma: no cover - n = 0 # total number of strings - if a.ndim == 1: - for i in range(a.shape[0]): - indexes[n] = p - if a[i] == STR_MISSING_BYTE: - buf[p] = DOT - p += 1 - else: - p = copy(buf, p, a[i]) - if separator != -1: - buf[p] = separator - p += 1 - n += 1 - elif a.ndim == 2: - for i in range(a.shape[0]): - indexes[n] = p - for j in range(a.shape[1]): - if a[i, j] == STR_MISSING_BYTE: - buf[p] = DOT - p += 1 - elif a[i, j] == STR_FILL_BYTE: - if j == 0: # virtual comma that will be erased - p += 1 - break - else: - p = copy(buf, p, a[i, j]) - buf[p] = COMMA - p += 1 - p -= 1 - n += 1 - if separator != -1: - buf[p] = separator - p += 1 - else: - raise ValueError("Array must have dimension 1 or 2") - if separator != -1: # remove last separator - p -= 1 - indexes[n] = p # add index for end - return p - - -@numba_jit(boundscheck=True) -def vcf_genotypes_to_byte_buf( - buf, p, call_genotype, call_genotype_phased, indexes, separator=-1 -): # pragma: no cover - n = 0 - for i in range(call_genotype.shape[0]): - indexes[n] = p - phased = call_genotype_phased[i] - for j in range(call_genotype.shape[1]): - gt = call_genotype[i, j] - if gt == INT_MISSING: - buf[p] = DOT - p += 1 - elif gt == INT_FILL: - break - else: - buf[p] = gt + ZERO - p += 1 - if phased: - buf[p] = PHASED - p += 1 - else: - buf[p] = UNPHASED - p += 1 - p -= 1 - n += 1 - if separator != -1: - buf[p] = separator - p += 1 - if separator != -1: # remove last separator - p -= 1 - indexes[n] = p # add index for end - return p - - -def vcf_genotypes_to_byte_buf_size(call_genotype): - # allele values (0, 1, etc) + separators - return call_genotype.size + call_genotype.size - - -def create_mask(arr): - """Return a mask array of shape ``arr.shape[0]` for masking out missing values.""" - axis = tuple(range(1, len(arr.shape))) - if arr.dtype == bool: - return ~arr - elif arr.dtype in (np.int8, np.int16, np.int32): - return np.all(arr == INT_MISSING, axis=axis) - elif arr.dtype == np.float32: - return np.all(arr.view("i4") == FLOAT32_MISSING_AS_INT32, axis=axis) - elif arr.dtype.kind == "S": - return np.all(arr == STR_MISSING_BYTE, axis=axis) - else: - raise ValueError(f"Unsupported dtype: {arr.dtype}") - - -@numba_jit(boundscheck=True) -def vcf_info_to_byte_buf( - buf, p, j, indexes, mask, info_prefixes, *arrays -): # pragma: no cover - if len(arrays) == 0 or np.all(mask[:, j]): - buf[p] = DOT - p += 1 - return p - n = indexes.shape[0] - assert n == len(arrays) - assert n == len(mask) - assert n == len(info_prefixes) - for i in range(n): - if mask[i, j]: - continue - p = copy(buf, p, info_prefixes[i]) - arr = arrays[i] - sub = arr[indexes[i, j] : indexes[i, j + 1]] - len_sub = sub.shape[0] - buf[p : p + len_sub] = sub - p = p + len_sub - buf[p] = SEMICOLON - p += 1 - p -= 1 # remove last separator - return p - - -def vcf_info_to_byte_buf_size(info_prefixes, *arrays): - if len(info_prefixes) == 0: - # DOT + TAB - return 2 - - buf_size = 0 - - buf_size += len(info_prefixes) * info_prefixes.dtype.itemsize # prefixes - buf_size += len(info_prefixes) # separators (SEMICOLON and final TAB) - buf_size += sum(len(a) for a in arrays) # values - - return buf_size - - -@numba_jit(boundscheck=True) -def vcf_format_names_to_byte_buf( - buf, p, i, format_mask, format_names -): # pragma: no cover - buf[p] = TAB - p += 1 - if len(format_names) == 0 or np.all(format_mask[:, i]): - buf[p] = DOT - p += 1 - buf[p] = TAB - p += 1 - return p - for k in range(len(format_names)): - if format_mask[k, i]: - continue - p = copy(buf, p, format_names[k]) - buf[p] = COLON - p += 1 - p -= 1 # remove last separator - buf[p] = TAB - p += 1 - return p - - -def vcf_format_names_to_byte_buf_size(format_names): - if len(format_names) == 0: - # TAB + DOT + TAB - return 3 - # TAB + names + separators - return 1 + len(format_names) * format_names.dtype.itemsize + len(format_names) - - -@numba_jit(boundscheck=True) -def vcf_format_missing_to_byte_buf(buf, p, n_samples): # pragma: no cover - for _ in range(n_samples): - buf[p] = DOT - p += 1 - buf[p] = TAB - p += 1 - p -= 1 # remove last tab - return p - - -@numba_jit(boundscheck=True) -def interleave( - buf, p, indexes, mask, separator, group_separator, *arrays -): # pragma: no cover - """Interleave byte buffers into groups. - - Each array must contain the same number of entries - this is the number of groups - formed. Each group will contain ``len(arrays)`` entries. - - Parameters - ---------- - buf - A 1D NumPy array to write to. - p - The index in the array to start writing at. - indexes - An array that has one row for each array, containing the start index for each - separate string value in the array. - mask - A boolean array with one entry for each array, indicating if the array should - be masked out. - separator - The separator to use between values within a group. - group_separator - The separator to use between each group. - arrays - The byte buffer arrays to interleave. - - Returns - ------- - The position in the buffer after the last byte written. - """ - n = indexes.shape[0] - assert n == len(arrays) - assert n == len(mask) - for j in range(indexes.shape[1] - 1): - for i in range(n): - if mask[i]: - continue - arr = arrays[i] - sub = arr[indexes[i, j] : indexes[i, j + 1]] - len_sub = sub.shape[0] - buf[p : p + len_sub] = sub - p = p + len_sub - buf[p] = separator - p += 1 - buf[p - 1] = group_separator - p -= 1 # remove last separator - return p - - -def interleave_buf_size(indexes, *arrays): - """Return the buffer size needed by ``interleave``.""" - # array buffers + separators - return sum(len(a) for a in arrays) + indexes.size diff --git a/sgkit/io/vcfzarr_reader.py b/sgkit/io/vcfzarr_reader.py deleted file mode 100644 index 66426ceee..000000000 --- a/sgkit/io/vcfzarr_reader.py +++ /dev/null @@ -1,540 +0,0 @@ -import tempfile -import warnings -from math import comb -from pathlib import Path -from typing import ( - Any, - Dict, - Hashable, - List, - MutableMapping, - Optional, - Sequence, - Tuple, - Union, -) - -import dask -import dask.array as da -import numcodecs -import numpy as np -import xarray as xr -import zarr -from dask.delayed import Delayed -from dask.optimization import fuse -from fsspec import get_mapper - -from sgkit.io.utils import INT_FILL, concatenate_and_rechunk, str_is_int - -from ..model import DIM_SAMPLE, DIM_VARIANT, create_genotype_call_dataset -from ..typing import ArrayLike, PathType -from ..utils import encode_array, max_str_len, smallest_numpy_int_dtype - - -class DimensionNameForFixedFormatFieldWarning(UserWarning): - """Warning when a dimension name for a FORMAT field with Number > 1 was created automatically.""" - - pass - - -def _ensure_2d(arr: ArrayLike) -> ArrayLike: - if arr.ndim == 1: - arr = arr.reshape(-1, 1) - return arr - - -def read_scikit_allel_vcfzarr( - path: PathType, - field_defs: Optional[Dict[str, Dict[str, Any]]] = None, -) -> xr.Dataset: - """Read a VCF Zarr file created using scikit-allel. - - .. deprecated:: 0.9.0 - Functions for reading VCF are deprecated, please use the `bio2zarr `_ package. - - Loads VCF variant, sample, and genotype data as Dask arrays within a Dataset - from a Zarr file created using scikit-allel's ``vcf_to_zarr`` function. - - This allows conversion from scikit-allel's Zarr format to sgkit's - `VCF Zarr `_ format. - - Since ``vcf_to_zarr`` does not preserve phasing information, there is no - :data:`sgkit.variables.call_genotype_phased_spec` variable in the resulting dataset. - - Parameters - ---------- - path - Path to the Zarr file. - field_defs - Per-field information that overrides the field definitions in the VCF header, or - provides extra information needed in the dataset representation. Definitions - are a represented as a dictionary whose keys are the field names, and values are - dictionaries with any of the following keys: ``Number``, ``Type``, ``Description``, - ``dimension``. The first three correspond to VCF header values, and ``dimension`` is - the name of the final dimension in the array for the case where ``Number`` is a fixed - integer larger than 1. For example, - ``{"INFO/AC": {"Number": "A"}, "FORMAT/HQ": {"dimension": "haplotypes"}}`` - overrides the ``INFO/AC`` field to be Number ``A`` (useful if the VCF defines it as - having variable length with ``.``), and names the final dimension of the ``HQ`` array - (which is defined as Number 2 in the VCF header) as ``haplotypes``. - (Note that Number ``A`` is the number of alternate alleles, see section 1.4.2 of the - VCF spec https://samtools.github.io/hts-specs/VCFv4.3.pdf.) - - Returns - ------- - A dataset containing the following variables: - - - :data:`sgkit.variables.variant_id_spec` (variants) - - :data:`sgkit.variables.variant_contig_spec` (variants) - - :data:`sgkit.variables.variant_position_spec` (variants) - - :data:`sgkit.variables.variant_allele_spec` (variants) - - :data:`sgkit.variables.sample_id_spec` (samples) - - :data:`sgkit.variables.call_genotype_spec` (variants, samples, ploidy) - - :data:`sgkit.variables.call_genotype_mask_spec` (variants, samples, ploidy) - """ - - warnings.warn( - "Functions for reading VCF are deprecated, please use the bio2zarr package.", - DeprecationWarning, - stacklevel=2, - ) - - vcfzarr = zarr.open_group(str(path), mode="r") - - # don't fix strings since it requires a pass over the whole dataset - return _vcfzarr_to_dataset(vcfzarr, fix_strings=False, field_defs=field_defs) - - -def vcfzarr_to_zarr( - input: PathType, - output: PathType, - *, - contigs: Optional[List[str]] = None, - grouped_by_contig: bool = False, - consolidated: bool = False, - tempdir: Optional[PathType] = None, -) -> None: - """Convert VCF Zarr files created using scikit-allel to a single Zarr on-disk store in sgkit Xarray format. - - Parameters - ---------- - input - Path to the input Zarr file. - output - Path to the ouput Zarr file. - contigs - The contigs to convert. By default all contigs are converted. - grouped_by_contig - Whether there is one group for each contig in the Zarr file, by default False. - consolidated - Whether the Zarr file has consolidated metadata, by default False. - tempdir - Temporary directory where intermediate files are stored. The default None means - use the system default temporary directory. - """ - - if consolidated: - vcfzarr = zarr.open_consolidated(str(input), mode="r") - else: - vcfzarr = zarr.open_group(str(input), mode="r") - - if not grouped_by_contig: - ds = _vcfzarr_to_dataset(vcfzarr) - ds.to_zarr(str(output)) - - else: - # read each contig separately, concatenate, rechunk, then save to zarr - - contigs = contigs or list(vcfzarr.group_keys()) - - # Index the contig names - _, variant_contig_names = encode_array(contigs) - variant_contig_names = list(variant_contig_names) - - vars_to_rechunk = [] - vars_to_copy = [] - - with tempfile.TemporaryDirectory( - prefix="vcfzarr_to_zarr_", suffix=".zarr", dir=tempdir - ) as tmpdir: - zarr_files = [] - for i, contig in enumerate(contigs): - # convert contig group to zarr and save in tmpdir - ds = _vcfzarr_to_dataset(vcfzarr[contig], contig, variant_contig_names) - if i == 0: - for var, arr in ds.data_vars.items(): - if arr.dims[0] == "variants": - vars_to_rechunk.append(var) - else: - vars_to_copy.append(var) - - contig_zarr_file = Path(tmpdir) / contig - ds.to_zarr(contig_zarr_file) - - zarr_files.append(str(contig_zarr_file)) - - concat_zarrs_optimized( - zarr_files, output, vars_to_rechunk, vars_to_copy, fix_strings=True - ) - - -def _vcfzarr_to_dataset( - vcfzarr: zarr.Array, - contig: Optional[str] = None, - variant_contig_names: Optional[List[str]] = None, - fix_strings: bool = True, - field_defs: Optional[Dict[str, Dict[str, Any]]] = None, -) -> xr.Dataset: - variant_position = da.from_zarr(vcfzarr["variants/POS"]) - - if contig is None: - # Get the contigs from variants/CHROM - variants_chrom = da.from_zarr(vcfzarr["variants/CHROM"]).astype(str) - variant_contig, variant_contig_names = encode_array(variants_chrom.compute()) - variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names)) - variant_contig = variant_contig.astype(variant_contig_dtype) - variant_contig_names = list(variant_contig_names) - else: - # Single contig: contig names were passed in - assert variant_contig_names is not None - contig_index = variant_contig_names.index(contig) - variant_contig = da.full_like(variant_position, contig_index) - - # For variant alleles, combine REF and ALT into a single array - variants_ref = da.from_zarr(vcfzarr["variants/REF"]) - variants_alt = da.from_zarr(vcfzarr["variants/ALT"]) - variant_allele = da.concatenate( - [_ensure_2d(variants_ref), _ensure_2d(variants_alt)], axis=1 - ) - # rechunk so there's a single chunk in alleles axis - variant_allele = variant_allele.rechunk((None, variant_allele.shape[1])) - - if "variants/ID" in vcfzarr: - variants_id = da.from_zarr(vcfzarr["variants/ID"]).astype(str) - else: - variants_id = None - - if "variants/QUAL" in vcfzarr: - variant_quality = da.from_zarr(vcfzarr["variants/QUAL"]) - else: - variant_quality = None - - ds = create_genotype_call_dataset( - variant_contig_names=variant_contig_names, - variant_contig=variant_contig, - variant_position=variant_position, - variant_allele=variant_allele, - sample_id=da.from_zarr(vcfzarr["samples"]).astype(str), - call_genotype=da.from_zarr(vcfzarr["calldata/GT"]), - variant_id=variants_id, - ) - - # Add a mask for variant ID - if variants_id is not None: - ds["variant_id_mask"] = ( - [DIM_VARIANT], - variants_id == ".", - ) - - if variant_quality is not None: - ds["variant_quality"] = ([DIM_VARIANT], variant_quality) - - # Add any other fields - field_defs = field_defs or {} - default_info_fields = ["ALT", "CHROM", "ID", "POS", "REF", "QUAL", "FILTER_PASS"] - default_format_fields = ["GT"] - for key in set(vcfzarr["variants"].array_keys()) - set(default_info_fields): - category = "INFO" - vcfzarr_key = f"variants/{key}" - variable_name = f"variant_{key}" - dims = [DIM_VARIANT] - field = f"{category}/{key}" - field_def = field_defs.get(field, {}) - _add_field_to_dataset( - category, key, vcfzarr_key, variable_name, dims, field_def, vcfzarr, ds - ) - for key in set(vcfzarr["calldata"].array_keys()) - set(default_format_fields): - category = "FORMAT" - vcfzarr_key = f"calldata/{key}" - variable_name = f"call_{key}" - dims = [DIM_VARIANT, DIM_SAMPLE] - field = f"{category}/{key}" - field_def = field_defs.get(field, {}) - _add_field_to_dataset( - category, key, vcfzarr_key, variable_name, dims, field_def, vcfzarr, ds - ) - - # Fix string types to include length - if fix_strings: - for var, arr in ds.data_vars.items(): - kind = arr.dtype.kind - if kind in ["O", "U", "S"]: - # Compute fixed-length string dtype for array - if kind == "O" or var in ("variant_id", "variant_allele"): - kind = "S" - max_len = max_str_len(arr) - dt = f"{kind}{max_len}" - ds[var] = arr.astype(dt) - - if var in {"variant_id", "variant_allele"}: - ds.attrs[f"max_length_{var}"] = max_len - - return ds - - -def _add_field_to_dataset( - category: str, - key: str, - vcfzarr_key: str, - variable_name: str, - dims: List[str], - field_def: Dict[str, Any], - vcfzarr: zarr.Array, - ds: xr.Dataset, -) -> None: - if "ID" not in vcfzarr[vcfzarr_key].attrs: - # only convert fields that were defined in the original VCF - return - vcf_number = field_def.get("Number", vcfzarr[vcfzarr_key].attrs["Number"]) - dimension, _ = vcf_number_to_dimension_and_size( - # ploidy and max_alt_alleles are not relevant since size is not used here - vcf_number, - category, - key, - field_def, - ploidy=2, - max_alt_alleles=0, - ) - if dimension is not None: - dims.append(dimension) - array = da.from_zarr(vcfzarr[vcfzarr_key]) - array = _replace_fill_values(array) - ds[variable_name] = (dims, array) - if "Description" in vcfzarr[vcfzarr_key].attrs: - description = vcfzarr[vcfzarr_key].attrs["Description"] - if len(description) > 0: - ds[variable_name].attrs["comment"] = description - - -def _replace_fill_values(arr: ArrayLike) -> ArrayLike: - if arr.dtype == np.int32: - arr[arr == -1] = INT_FILL - return arr - - -def _get_max_len(zarr_groups: List[zarr.Group], attr_name: str) -> int: - max_len: int = max([group.attrs[attr_name] for group in zarr_groups]) - return max_len - - -def concat_zarrs_optimized( - zarr_files: Sequence[str], - output: Union[PathType, MutableMapping[str, bytes]], - vars_to_rechunk: List[Hashable], - vars_to_copy: List[Hashable], - fix_strings: bool = False, - chunk_length: Optional[int] = None, -) -> None: - if isinstance(output, Path): - output = str(output) - - zarr_groups = [zarr.open_group(f) for f in zarr_files] - - first_zarr_group = zarr_groups[0] - - # create the top-level group - zarr.open_group(output, mode="w") - - # copy variables that are to be rechunked - # NOTE: that this uses _to_zarr function defined here that is needed to avoid - # race conditions between writing the array contents and its metadata - # see https://github.com/sgkit-dev/sgkit/pull/486 - for var in vars_to_rechunk: - dtype = None - if fix_strings and var in {"variant_id", "variant_allele"}: - max_len = _get_max_len(zarr_groups, f"max_length_{var}") - dtype = f"S{max_len}" - assert first_zarr_group[var].attrs["_ARRAY_DIMENSIONS"][0] == "variants" - target_chunks = None - if chunk_length is not None: - target_chunks = list(first_zarr_group[var].chunks) - target_chunks[0] = chunk_length - arr = concatenate_and_rechunk( - [group[var] for group in zarr_groups], dtype=dtype, chunks=target_chunks - ) - - _to_zarr_kwargs = dict( - compressor=first_zarr_group[var].compressor, - filters=first_zarr_group[var].filters, - fill_value=None, - ) - if not fix_strings and arr.dtype == "O": - # We assume that all object dtypes are variable length strings - var_len_str_codec = numcodecs.VLenUTF8() - _to_zarr_kwargs["object_codec"] = var_len_str_codec - # Remove from filters to avoid double encoding error - if var_len_str_codec in first_zarr_group[var].filters: - filters = list(first_zarr_group[var].filters) - filters.remove(var_len_str_codec) - _to_zarr_kwargs["filters"] = filters - - d = _to_zarr( # type: ignore[no-untyped-call] - arr, - output, - component=var, - overwrite=True, - compute=False, - attrs=first_zarr_group[var].attrs.asdict(), - **_to_zarr_kwargs, - ) - da.compute(_fuse_delayed(d)) # type: ignore[no-untyped-call] - - # copy variables that are not rechunked (e.g. sample_id) - for var in vars_to_copy: - arr = da.from_zarr(zarr_files[0], component=var) - _to_zarr_kwargs = dict( - compressor=first_zarr_group[var].compressor, - filters=first_zarr_group[var].filters, - fill_value=None, - ) - if not fix_strings and arr.dtype == "O": - # We assume that all object dtypes are variable length strings - var_len_str_codec = numcodecs.VLenUTF8() - _to_zarr_kwargs["object_codec"] = var_len_str_codec - # Remove from filters to avoid double encoding error - if var_len_str_codec in first_zarr_group[var].filters: - filters = list(first_zarr_group[var].filters) - filters.remove(var_len_str_codec) - _to_zarr_kwargs["filters"] = filters - - d = _to_zarr( # type: ignore[no-untyped-call] - arr, - output, - component=var, - overwrite=True, - compute=False, - attrs=first_zarr_group[var].attrs.asdict(), - **_to_zarr_kwargs, - ) - da.compute(_fuse_delayed(d)) # type: ignore[no-untyped-call] - - # copy unchanged variables and top-level metadata - with zarr.open_group(output) as output_zarr: - # copy top-level attributes - group_attrs = dict(first_zarr_group.attrs) - if "max_alt_alleles_seen" in group_attrs: - max_alt_alleles_seen = _get_max_len(zarr_groups, "max_alt_alleles_seen") - group_attrs["max_alt_alleles_seen"] = max_alt_alleles_seen - output_zarr.attrs.update(group_attrs) - - # consolidate metadata - zarr.consolidate_metadata(output) - - -def _fuse_delayed(d): # type: ignore[no-untyped-def] - """Perform task fusion within a Delayed object""" - # from https://github.com/dask/dask/issues/6219 - dsk_fused, _ = fuse(dask.utils.ensure_dict(d.dask)) - return Delayed(d._key, dsk_fused) - - -def _to_zarr( # type: ignore[no-untyped-def] - arr, - url, - component=None, - storage_options=None, - overwrite=False, - compute=True, - return_stored=False, - attrs=None, - **kwargs, -): - """Extension of dask.array.core.to_zarr that can set attributes on the resulting Zarr array, - in the same Dask operation. - """ - - # call Dask version with compute=False just to check preconditions - da.to_zarr( - arr, - url, - component=component, - storage_options=storage_options, - overwrite=overwrite, - compute=False, - return_stored=return_stored, - **kwargs, - ) - - storage_options = storage_options or {} - if isinstance(url, str): - mapper = get_mapper(url, **storage_options) - else: - # assume the object passed is already a mapper - mapper = url # pragma: no cover - chunks = [c[0] for c in arr.chunks] - # Zarr errors if we specify chunks of length 0 (#1068) - if sum(chunks) == 0: - chunks = None - z = _zarr_create_with_attrs( # type: ignore[no-untyped-call] - shape=arr.shape, - chunks=chunks, - dtype=arr.dtype, - store=mapper, - path=component, - overwrite=overwrite, - attrs=attrs, - **kwargs, - ) - return arr.store(z, lock=False, compute=compute, return_stored=return_stored) - - -def _zarr_create_with_attrs( # type: ignore[no-untyped-def] - shape, chunks, dtype, store, path, overwrite, attrs, **kwargs -): - # Create the zarr group and update its attributes within the same task (thread) - arr = zarr.create( - shape=shape, - chunks=chunks, - dtype=dtype, - store=store, - path=path, - overwrite=overwrite, - **kwargs, - ) - if attrs is not None: - arr.attrs.update(attrs) - return arr - - -def vcf_number_to_dimension_and_size( - vcf_number: str, - category: str, - key: str, - field_def: Any, - ploidy: int, - max_alt_alleles: int, -) -> Tuple[Optional[str], int]: - if vcf_number in ("0", "1"): - return (None, 1) - elif vcf_number == "A": - return ("alt_alleles", max_alt_alleles) - elif vcf_number == "R": - return ("alleles", max_alt_alleles + 1) - elif vcf_number == "G": - n_alleles = max_alt_alleles + 1 - n_genotypes = comb(n_alleles + ploidy - 1, ploidy) - return ("genotypes", n_genotypes) - elif str_is_int(vcf_number): - if "dimension" in field_def: - dimension = field_def["dimension"] - return (dimension, int(vcf_number)) - else: - dim_name = f"{category}_{key}_dim" - warnings.warn( - f"A new dimension named {dim_name} was created. To change this name re-run specifying `field_defs`.", - DimensionNameForFixedFormatFieldWarning, - ) - return (dim_name, int(vcf_number)) - raise ValueError( - f"{category} field '{key}' is defined as Number '{vcf_number}', which is not supported. Consider specifying `field_defs` to provide a concrete size for this field." - ) diff --git a/sgkit/model.py b/sgkit/model.py index fcacf6148..e151d218c 100644 --- a/sgkit/model.py +++ b/sgkit/model.py @@ -171,7 +171,7 @@ def num_contigs(ds: xr.Dataset) -> ArrayLike: """Return the number of contigs in a dataset.""" if DIM_CONTIG in ds.sizes: return ds.sizes[DIM_CONTIG] - else: + else: # pragma: no cover return len(ds.attrs["contigs"]) @@ -179,7 +179,7 @@ def get_contigs(ds: xr.Dataset) -> ArrayLike: """Return the contigs in a dataset.""" if "contig_id" in ds: return ds["contig_id"].values - else: + else: # pragma: no cover warnings.warn( "The 'contigs' VCF Zarr group attribute is deprecated and should be converted to a 'contig_id' array.", DeprecationWarning, @@ -191,11 +191,11 @@ def get_filters(ds: xr.Dataset) -> Optional[ArrayLike]: """Return the filters in a dataset.""" if "filter_id" in ds: return ds["filter_id"].values - elif "filters" in ds.attrs: + elif "filters" in ds.attrs: # pragma: no cover warnings.warn( "The 'filters' VCF Zarr group attribute is deprecated and should be converted to a 'filter_id' array.", DeprecationWarning, ) return np.array(ds.attrs["filters"], dtype="S") - else: + else: # pragma: no cover return None diff --git a/sgkit/tests/data/sample.vcf b/sgkit/tests/data/sample.vcf deleted file mode 100644 index 51eeedcca..000000000 --- a/sgkit/tests/data/sample.vcf +++ /dev/null @@ -1,31 +0,0 @@ -##fileformat=VCFv4.0 -##fileDate=20090805 -##source=myImputationProgramV3.1 -##reference=1000GenomesPilot-NCBI36 -##phasing=partial -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FILTER= -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##ALT= -##ALT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -19 111 . A C 9.6 . . GT:HQ 0|0:10,15 0|0:10,10 0/1:3,3 -19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 -20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. -20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. -20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. -20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. -20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 ./.:40:3 -20 1235237 . T . . . . GT 0/0 0|0 ./. -X 10 rsTest AC A,ATG,C 10 PASS . GT 0 0/1 0|2 diff --git a/sgkit/tests/io/vcf/data/sample.vcf.gz b/sgkit/tests/io/data/sample.vcf.gz similarity index 100% rename from sgkit/tests/io/vcf/data/sample.vcf.gz rename to sgkit/tests/io/data/sample.vcf.gz diff --git a/sgkit/tests/io/vcf/data/sample.vcf.gz.tbi b/sgkit/tests/io/data/sample.vcf.gz.tbi similarity index 100% rename from sgkit/tests/io/vcf/data/sample.vcf.gz.tbi rename to sgkit/tests/io/data/sample.vcf.gz.tbi diff --git a/sgkit/tests/io/test_utils.py b/sgkit/tests/io/test_utils.py deleted file mode 100644 index bca215a6d..000000000 --- a/sgkit/tests/io/test_utils.py +++ /dev/null @@ -1,63 +0,0 @@ -import numpy as np -import pytest -import zarr - -from sgkit.io.utils import concatenate_and_rechunk - - -def test_concatenate_and_rechunk__1d(): - z1 = zarr.zeros(5, chunks=2, dtype="i4") - z1[:] = np.arange(5) - - z2 = zarr.zeros(5, chunks=2, dtype="i4") - z2[:] = np.arange(5, 10) - - zarrs = [z1, z2] - - out = concatenate_and_rechunk(zarrs) - - assert out.chunks == ((2, 2, 2, 2, 2),) - np.testing.assert_array_equal(out.compute(), np.arange(10)) - - -def test_concatenate_and_rechunk__2d(): - z1 = zarr.zeros((5, 3), chunks=(2, 3), dtype="i4") - z1[:] = np.arange(15).reshape(5, 3) - - z2 = zarr.zeros((5, 3), chunks=(2, 3), dtype="i4") - z2[:] = np.arange(15, 30).reshape(5, 3) - - zarrs = [z1, z2] - - out = concatenate_and_rechunk(zarrs) - - assert out.chunks == ((2, 2, 2, 2, 2), (3,)) - np.testing.assert_array_equal(out.compute(), np.arange(30).reshape(10, 3)) - - -def test_concatenate_and_rechunk__tiny_file(): - z1 = zarr.zeros(4, chunks=3, dtype="i4") - z1[:] = np.arange(4) - - # this zarr array lies entirely within the second chunk - z2 = zarr.zeros(1, chunks=3, dtype="i4") - z2[:] = np.arange(4, 5) - - z3 = zarr.zeros(5, chunks=3, dtype="i4") - z3[:] = np.arange(5, 10) - - zarrs = [z1, z2, z3] - - out = concatenate_and_rechunk(zarrs) - - assert out.chunks == ((3, 3, 3, 1),) - np.testing.assert_array_equal(out.compute(), np.arange(10)) - - -def test_concatenate_and_rechunk__shape_mismatch(): - z1 = zarr.zeros((5, 3), chunks=(2, 3), dtype="i4") - z2 = zarr.zeros((5, 4), chunks=(2, 4), dtype="i4") - zarrs = [z1, z2] - - with pytest.raises(ValueError, match="Zarr arrays must have matching shapes"): - concatenate_and_rechunk(zarrs) diff --git a/sgkit/tests/io/test_vcf2zarr_compat.py b/sgkit/tests/io/test_vcf2zarr_compat.py new file mode 100644 index 000000000..696180fed --- /dev/null +++ b/sgkit/tests/io/test_vcf2zarr_compat.py @@ -0,0 +1,249 @@ +import numpy as np +import pytest + +pytest.importorskip("bio2zarr") +from bio2zarr import vcf2zarr +from bio2zarr.constants import ( + FLOAT32_FILL, + FLOAT32_MISSING, + INT_FILL, + INT_MISSING, + STR_FILL, + STR_MISSING, +) +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from sgkit import load_dataset, save_dataset +from sgkit.model import get_contigs, get_filters, num_contigs +from sgkit.tests.io.test_dataset import assert_identical + + +@pytest.mark.filterwarnings("ignore::xarray.coding.variables.SerializationWarning") +def test_vcf2zarr_compat(shared_datadir, tmp_path): + vcf_path = shared_datadir / "sample.vcf.gz" + vcz_path = tmp_path.joinpath("sample.vcz").as_posix() + + vcf2zarr.convert( + [vcf_path], + vcz_path, + variants_chunk_size=5, + samples_chunk_size=2, + worker_processes=0, + ) + + ds = load_dataset(vcz_path) + + assert_array_equal(ds["filter_id"], ["PASS", "s50", "q10"]) + assert_array_equal(get_filters(ds), ["PASS", "s50", "q10"]) # utility function + assert_array_equal( + ds["variant_filter"], + [ + [False, False, False], + [False, False, False], + [True, False, False], + [False, False, True], + [True, False, False], + [True, False, False], + [True, False, False], + [False, False, False], + [True, False, False], + ], + ) + assert num_contigs(ds) == 3 + assert_array_equal(ds["contig_id"], ["19", "20", "X"]) + assert_array_equal(get_contigs(ds), ["19", "20", "X"]) # utility function + assert "contig_length" not in ds + assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2]) + assert ds["variant_contig"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_position"], + [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10], + ) + assert ds["variant_position"].chunks[0][0] == 5 + + im = INT_MISSING + if_ = INT_FILL + fm = FLOAT32_MISSING + ff = FLOAT32_FILL + sm = STR_MISSING + sf = STR_FILL + + assert_array_equal( + ds["variant_NS"], + [im, im, 3, 3, 2, 3, 3, im, im], + ) + assert ds["variant_NS"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_AN"], + [im, im, im, im, im, im, 6, im, im], + ) + assert ds["variant_AN"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_AA"], + [ + sm, + sm, + sm, + sm, + "T", + "T", + "G", + sm, + sm, + ], + ) + assert ds["variant_AN"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_DB"], + [ + False, + False, + True, + False, + True, + False, + False, + False, + False, + ], + ) + assert ds["variant_AN"].chunks[0][0] == 5 + + variant_AF = np.array( + [ + [fm, fm], + [fm, fm], + [0.5, ff], + [0.017, ff], + [0.333, 0.667], + [fm, fm], + [fm, fm], + [fm, fm], + [fm, fm], + ], + dtype=np.float32, + ) + values = ds["variant_AF"].values + assert_array_almost_equal(values, variant_AF, 3) + nans = np.isnan(variant_AF) + assert_array_equal(variant_AF.view(np.int32)[nans], values.view(np.int32)[nans]) + assert ds["variant_AF"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_AC"], + [ + [im, im], + [im, im], + [im, im], + [im, im], + [im, im], + [im, im], + [3, 1], + [im, im], + [im, im], + ], + ) + assert ds["variant_AC"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_allele"].values.tolist(), + [ + ["A", "C", sf, sf], + ["A", "G", sf, sf], + ["G", "A", sf, sf], + ["T", "A", sf, sf], + ["A", "G", "T", sf], + ["T", sf, sf, sf], + ["G", "GA", "GAC", sf], + ["T", sf, sf, sf], + ["AC", "A", "ATG", "C"], + ], + ) + assert ds["variant_allele"].chunks[0][0] == 5 + assert ds["variant_allele"].dtype == "O" + assert_array_equal( + ds["variant_id"].values.tolist(), + [sm, sm, "rs6054257", sm, "rs6040355", sm, "microsat1", sm, "rsTest"], + ) + assert ds["variant_id"].chunks[0][0] == 5 + assert ds["variant_id"].dtype == "O" + assert_array_equal( + ds["variant_id_mask"], + [True, True, False, True, False, True, False, True, False], + ) + assert ds["variant_id_mask"].chunks[0][0] == 5 + + assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"]) + assert ds["sample_id"].chunks[0][0] == 2 + + call_genotype = np.array( + [ + [[0, 0], [0, 0], [0, 1]], + [[0, 0], [0, 0], [0, 1]], + [[0, 0], [1, 0], [1, 1]], + [[0, 0], [0, 1], [0, 0]], + [[1, 2], [2, 1], [2, 2]], + [[0, 0], [0, 0], [0, 0]], + [[0, 1], [0, 2], [im, im]], + [[0, 0], [0, 0], [im, im]], + [[0, if_], [0, 1], [0, 2]], + ], + dtype="i1", + ) + call_genotype_phased = np.array( + [ + [True, True, False], + [True, True, False], + [True, True, False], + [True, True, False], + [True, True, False], + [True, True, False], + [False, False, False], + [False, True, False], + [True, False, True], + ], + dtype=bool, + ) + call_DP = [ + [im, im, im], + [im, im, im], + [1, 8, 5], + [3, 5, 3], + [6, 0, 4], + [im, 4, 2], + [4, 2, 3], + [im, im, im], + [im, im, im], + ] + call_HQ = [ + [[10, 15], [10, 10], [3, 3]], + [[10, 10], [10, 10], [3, 3]], + [[51, 51], [51, 51], [im, im]], + [[58, 50], [65, 3], [im, im]], + [[23, 27], [18, 2], [im, im]], + [[56, 60], [51, 51], [im, im]], + [[im, im], [im, im], [im, im]], + [[im, im], [im, im], [im, im]], + [[im, im], [im, im], [im, im]], + ] + + assert_array_equal(ds["call_genotype"], call_genotype) + assert_array_equal(ds["call_genotype_mask"], call_genotype < 0) + assert_array_equal(ds["call_genotype_phased"], call_genotype_phased) + assert_array_equal(ds["call_DP"], call_DP) + assert_array_equal(ds["call_HQ"], call_HQ) + + for name in ["call_genotype", "call_genotype_mask", "call_HQ"]: + assert ds[name].chunks == ((5, 4), (2, 1), (2,)) + + for name in ["call_genotype_phased", "call_DP"]: + assert ds[name].chunks == ((5, 4), (2, 1)) + + # save and load again to test https://github.com/pydata/xarray/issues/3476 + path2 = tmp_path / "ds2.zarr" + save_dataset(ds, path2) + assert_identical(ds, load_dataset(path2)) diff --git a/sgkit/tests/io/vcf/__init__.py b/sgkit/tests/io/vcf/__init__.py deleted file mode 100644 index b29be8992..000000000 --- a/sgkit/tests/io/vcf/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -import pytest - -cyvcf2 = pytest.importorskip("cyvcf2") # noqa: F401 - -# rewrite asserts in assert_vcfs_close to give better failure messages -pytest.register_assert_rewrite("sgkit.tests.io.vcf.utils") diff --git a/sgkit/tests/io/vcf/data/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz b/sgkit/tests/io/vcf/data/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz deleted file mode 100644 index 1e4bc3069..000000000 Binary files a/sgkit/tests/io/vcf/data/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz.tbi b/sgkit/tests/io/vcf/data/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz.tbi deleted file mode 100644 index 002649004..000000000 Binary files a/sgkit/tests/io/vcf/data/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz.tbi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/1kg_target_chr20_38_imputed_chr20_1000.vcf b/sgkit/tests/io/vcf/data/1kg_target_chr20_38_imputed_chr20_1000.vcf deleted file mode 100644 index 44b77378b..000000000 --- a/sgkit/tests/io/vcf/data/1kg_target_chr20_38_imputed_chr20_1000.vcf +++ /dev/null @@ -1,1000 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##filedate=20220315 -##source="beagle.28Jun21.220.jar" -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##contig= -##bcftools_viewVersion=1.16+htslib-1.16 -##bcftools_viewCommand=view 1kg_target_chr20_38_imputed_chr20.vcf.bgz; Date=Mon Oct 10 15:21:42 2022 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG01258 HG00408 NA12767 HG01135 NA20128 HG02011 NA07435 NA20279 HG02148 HG00532 HG02091 HG02106 NA10865 NA19199 -chr20 60137 . T C . PASS DR2=0.01;AF=0.001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 60149 . C T . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60181 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60254 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60280 . TTTCCA T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60286 . T G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60286 . TTCCAG T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60291 . G GTCCAT . PASS DR2=0.06;AF=0.0066;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.05 0|0:0.04 0|0:0.09 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60291 . G GTCCATTCCAT . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60291 . G T . PASS DR2=0.06;AF=0.0094;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0.02 0|0:0.01 0|0:0.07 0|0:0 0|0:0 0|0:0 0|0:0.11 0|0:0 0|0:0 0|0:0.03 0|0:0 -chr20 60291 . GTCCAT G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60291 . GTCCATTCCAT G . PASS DR2=0;AF=0.0006;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60329 . C G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60331 . T C . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60332 . T G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60335 . A T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60340 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60343 . G A . PASS DR2=0.01;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 60358 . T A . PASS DR2=0;AF=0.0009;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 60358 . T C . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60358 . TCACTC T . PASS DR2=0.08;AF=0.0108;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.11 0|0:0.08 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60361 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60368 . CCACTCCACTCTACTGCATAG C . PASS DR2=0.03;AF=0.0011;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60440 . A T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60516 . TTCCAC T . PASS DR2=0;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60516 . TTCCACTCCAC T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60531 . CTCCACTCCAT C . PASS DR2=0.01;AF=0.0009;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 -chr20 60536 . CTCCAT C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60580 . G A . PASS DR2=0.01;AF=0.0011;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 -chr20 60581 . TTCCAC T . PASS DR2=0.09;AF=0.0093;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.11 0|0:0.04 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60591 . CTCCACTCCAT C . PASS DR2=0.1;AF=0.0158;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0.14 0|0:0 0|0:0.16 0|0:0.03 0|0:0 0|0:0.01 0|0:0.06 0|0:0 0|0:0.03 0|0:0 0|0:0 -chr20 60596 . C T . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 60680 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60705 . A C . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 60823 . T C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60831 . C T . PASS DR2=0.31;AF=0.0126;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.35 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60843 . A C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60926 . TTCCATTCCATTCCAC T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60935 . A G . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60954 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60960 . C A . PASS DR2=0.01;AF=0.0019;IMP GT:DS 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60963 . G C . PASS DR2=0.02;AF=0.0024;IMP GT:DS 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 60978 . CCACTCCATTA C . PASS DR2=0.03;AF=0.0016;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61066 . T C . PASS DR2=0.66;AF=0.0572;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 1|0:0.62 0|1:0.85 0|0:0.12 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61066 . TG T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61083 . C T . PASS DR2=0.86;AF=0.1188;IMP GT:DS 0|1:1 0|0:0 0|1:0.96 0|0:0 0|1:1.07 0|0:0.09 0|0:0.19 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61098 . C T . PASS DR2=0.05;AF=0.0047;IMP GT:DS 0|0:0 0|0:0 0|0:0.08 0|0:0 0|0:0 0|0:0.03 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61114 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61115 . G A . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61131 . A T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61138 . A C . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61142 . G T . PASS DR2=0.01;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61173 . ACACTC A . PASS DR2=0.1;AF=0.008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.01 0|0:0.08 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61188 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61276 . CTCCATTCCACTTCAT C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 61655 . A T . PASS DR2=0.03;AF=0.0043;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0.02 0|0:0.04 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 62150 . G A . PASS DR2=0.73;AF=0.0379;IMP GT:DS 0|0:0 0|0:0.04 0|0:0 0|0:0.01 0|0:0.01 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0.02 0|0:0.04 1|0:0.88 0|0:0 -chr20 62157 . G A . PASS DR2=0.01;AF=0.0011;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 62157 . G C . PASS DR2=0.74;AF=0.0374;IMP GT:DS 0|0:0 0|0:0.03 0|0:0 0|0:0.02 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0.01 0|0:0.04 1|0:0.88 0|0:0 -chr20 62162 . G C . PASS DR2=0.75;AF=0.0368;IMP GT:DS 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0.01 0|0:0 0|0:0 0|0:0.02 0|0:0.02 0|0:0.04 1|0:0.88 0|0:0 -chr20 62168 . C T . PASS DR2=0.8;AF=0.0344;IMP GT:DS 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 1|0:0.88 0|0:0 -chr20 62520 . C T . PASS DR2=0.08;AF=0.0065;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.11 0|0:0.01 0|0:0.07 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 62542 . C G . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 62566 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 62793 . G A . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 62908 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 62977 . C G . PASS DR2=0.1;AF=0.0078;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.01 0|0:0.08 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 63012 . C T . PASS DR2=0.02;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 63117 . T C . PASS DR2=0.78;AF=0.2437;IMP GT:DS 0|0:0.03 0|0:0 1|0:1.01 0|0:0.48 0|0:0 0|0:0.09 0|0:0.08 0|0:0 1|1:1.9 0|1:0.66 0|1:0.6 1|0:0.96 0|1:1.03 0|0:0 -chr20 63132 . TTCCATTCCAC T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 63147 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 63852 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 63863 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 63928 . GCACTCCATTC G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 63939 . C T . PASS DR2=0;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 63948 . C T . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 63969 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 63971 . TTCCATTCCAC T . PASS DR2=0.92;AF=0.1606;IMP GT:DS 0|0:0 1|1:1.99 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.49 0|0:0 0|0:0.01 1|0:1 1|0:0.99 0|0:0.01 0|0:0 0|0:0 -chr20 64176 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64217 . G A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64230 . A C . PASS DR2=0.01;AF=0.0009;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64235 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64238 . C A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64239 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64258 . G A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64266 . CTCCACTCCATTCCAT C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64271 . CTCCAT C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64296 . T G . PASS DR2=0.09;AF=0.0095;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0.04 0|0:0.1 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64301 . CTCCAA C . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64310 . A C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64336 . T A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64337 . A T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64422 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64505 . A ATT . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64506 . C CCACCCAATTCCATTCCACTCCACTCCACTCCAT . PASS DR2=0.02;AF=0.0012;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64506 . C CTCCAT . PASS DR2=0.93;AF=0.3433;IMP GT:DS 0|1:1 1|1:1.76 0|1:0.96 0|0:0 1|1:1.94 0|1:0.99 0|1:0.98 0|0:0 0|0:0.02 1|0:0.95 1|0:0.99 0|0:0.01 0|0:0 0|0:0 -chr20 64506 . C CTCCATTCCAT . PASS DR2=0.02;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64506 . C CTCCATTCCATTCCACTCCACTCCACTCCAT . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64506 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64516 . C T . PASS DR2=0;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 64516 . CTCCACTCCACTCCTCTCCAT C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64521 . C T . PASS DR2=0.01;AF=0.002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0.02 0|0:0 0|0:0 -chr20 64525 . A G . PASS DR2=0.01;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64556 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64566 . G C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64580 . A T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64610 . C T . PASS DR2=0.01;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64611 . G A . PASS DR2=0.47;AF=0.099;IMP GT:DS 0|0:0.1 0|0:0 0|0:0.06 0|0:0.17 0|0:0 0|0:0 0|0:0.05 0|0:0 0|1:0.85 0|0:0.08 0|1:0.6 1|0:0.73 0|0:0.13 0|0:0 -chr20 64611 . G T . PASS DR2=0.66;AF=0.0574;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 1|0:0.63 0|1:0.85 0|0:0.12 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64628 . C G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64720 . A C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64795 . TCTCCATTCCACTCAATTCCATTAAA T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 64953 . C G . PASS DR2=0.02;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65268 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65303 . TTCCAC T . PASS DR2=0;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65325 . C T . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65355 . C T . PASS DR2=0.06;AF=0.0045;IMP GT:DS 0|0:0 0|0:0 0|0:0.08 0|0:0 0|0:0.01 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65366 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65472 . A C . PASS DR2=0.01;AF=0.0027;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65476 . C A . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65500 . C G . PASS DR2=0.01;AF=0.0022;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65501 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65513 . C T . PASS DR2=0.01;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 65523 . T C . PASS DR2=0;AF=0.0017;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65526 . C A . PASS DR2=0.03;AF=0.0019;IMP GT:DS 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65553 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65568 . TTCCACTGCAC T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65723 . C CTCCAT . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65723 . CTCCAT C . PASS DR2=0.03;AF=0.0012;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65738 . TTCCAC T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65763 . CTCCAT C . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65817 . A ACACCACTCCTCTCCATTCCATTCCATTCCATT . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65818 . CT C . PASS DR2=0;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65820 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65820 . ATATT A . PASS DR2=0;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65823 . TTCCAC T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65831 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65832 . A T . PASS DR2=0;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65838 . C T . PASS DR2=0;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65841 . G C . PASS DR2=0;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65847 . C A . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65853 . C T . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65865 . A C . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65865 . ACACTCCACTT A . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65927 . A C . PASS DR2=0.1;AF=0.0078;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.01 0|0:0.08 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 65955 . C A . PASS DR2=0.78;AF=0.0529;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.48 0|0:0 0|0:0 0|0:0.02 1|0:0.97 0|0:0 0|0:0 0|0:0 -chr20 65971 . G A . PASS DR2=0.09;AF=0.0095;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0.04 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66018 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66030 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66039 . T C . PASS DR2=0.02;AF=0.0013;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66065 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66135 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66140 . A C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66141 . AATTCC A . PASS DR2=0.76;AF=0.2366;IMP GT:DS 1|0:0.87 0|0:0 0|0:0.02 0|1:1.13 0|0:0.01 0|0:0.29 1|0:0.82 0|0:0 0|0:0.02 0|0:0.2 0|0:0.4 0|1:0.95 1|0:0.92 1|0:1 -chr20 66407 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66475 . TCACTCCATTCCACTC T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66478 . C CTCCATTCCACTCCACTCCAATCCAT . PASS DR2=0.01;AF=0.0022;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0.01 0|0:0 0|0:0.02 0|0:0 0|0:0 -chr20 66523 . TTCCATTCCAC T . PASS DR2=0.09;AF=0.0092;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0.04 0|0:0.1 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66526 . C T . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66538 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66578 . T C . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66603 . C T . PASS DR2=0.04;AF=0.0016;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 -chr20 66620 . G GCACTCCATTCCATGAAATTCCATTC . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66671 . CATTCCATTCCACTCA C . PASS DR2=0.2;AF=0.0182;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.2 0|0:0.01 0|0:0 0|0:0.01 0|0:0.27 0|0:0 -chr20 66763 . ATCCAC A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66845 . C G . PASS DR2=0.03;AF=0.001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66883 . CTCCAT C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66953 . T TTTCAC . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 66955 . C T . PASS DR2=0.02;AF=0.0118;IMP GT:DS 0|0:0 0|0:0.01 0|0:0.05 0|0:0.04 0|0:0.04 0|0:0.03 0|0:0.01 0|0:0 0|0:0.05 0|0:0.05 0|0:0.01 0|0:0.03 0|0:0 0|0:0 -chr20 66960 . A C . PASS DR2=0.02;AF=0.0112;IMP GT:DS 0|0:0 0|0:0.01 0|0:0.05 0|0:0.04 0|0:0.04 0|0:0.03 0|0:0.01 0|0:0 0|0:0.04 0|0:0.04 0|0:0.01 0|0:0.03 0|0:0 0|0:0 -chr20 66961 . A C . PASS DR2=0.02;AF=0.0112;IMP GT:DS 0|0:0 0|0:0.01 0|0:0.05 0|0:0.04 0|0:0.04 0|0:0.03 0|0:0.01 0|0:0 0|0:0.04 0|0:0.04 0|0:0.01 0|0:0.03 0|0:0 0|0:0 -chr20 66965 . C A . PASS DR2=0.02;AF=0.0097;IMP GT:DS 0|0:0 0|0:0.01 0|0:0.01 0|0:0.04 0|0:0.04 0|0:0.03 0|0:0.02 0|0:0 0|0:0.04 0|0:0.04 0|0:0.01 0|0:0.03 0|0:0 0|0:0 -chr20 66970 . C G . PASS DR2=0.02;AF=0.0097;IMP GT:DS 0|0:0 0|0:0.01 0|0:0.01 0|0:0.04 0|0:0.04 0|0:0.03 0|0:0.02 0|0:0 0|0:0.04 0|0:0.04 0|0:0.01 0|0:0.03 0|0:0 0|0:0 -chr20 66973 . T G . PASS DR2=0.01;AF=0.0091;IMP GT:DS 0|0:0 0|0:0.01 0|0:0.01 0|0:0.03 0|0:0.03 0|0:0.03 0|0:0.01 0|0:0 0|0:0.04 0|0:0.04 0|0:0.01 0|0:0.03 0|0:0 0|0:0 -chr20 66978 . CTCCACTCCATTCCAT C . PASS DR2=0.11;AF=0.0153;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.15 0|0:0 0|0:0.16 0|0:0.03 0|0:0 0|0:0.01 0|0:0.05 0|0:0 0|0:0.03 0|0:0 0|0:0 -chr20 66987 . A G . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67006 . T C . PASS DR2=0.01;AF=0.0031;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0.01 0|0:0.01 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 67009 . T A . PASS DR2=0.01;AF=0.0024;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0.01 0|0:0.01 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 67015 . C G . PASS DR2=0.01;AF=0.0024;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0.01 0|0:0.01 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 67015 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67021 . C CATTAA . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67023 . C T . PASS DR2=0.01;AF=0.0018;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67025 . C A . PASS DR2=0.01;AF=0.0017;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67027 . T A . PASS DR2=0.01;AF=0.0015;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67041 . T C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67058 . TTCCAC T . PASS DR2=0.01;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 -chr20 67120 . GGACTCCACTCCATTC G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67345 . T G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67372 . T G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67373 . G GAAATGTGAT . PASS DR2=0.76;AF=0.0391;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0.05 0|0:0 0|0:0 0|0:0 0|0:0.11 0|0:0 0|0:0 1|0:0.91 0|0:0 -chr20 67384 . C T . PASS DR2=0.01;AF=0.0007;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67400 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67744 . T C . PASS DR2=0.3;AF=0.0333;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.15 0|0:0 0|0:0.16 0|0:0.03 0|0:0 0|0:0.01 0|0:0.05 0|0:0 0|0:0.03 0|0:0 0|0:0.5 -chr20 67779 . C G . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67829 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67860 . G A . PASS DR2=0.02;AF=0.0012;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67920 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67955 . T C . PASS DR2=0.02;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 67970 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68026 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68044 . G A . PASS DR2=0.02;AF=0.0013;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68049 . C A . PASS DR2=0.82;AF=0.0461;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.27 0|0:0 1|0:1 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68275 . T A . PASS DR2=0.02;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 -chr20 68303 . T C . PASS DR2=0.93;AF=0.6278;IMP GT:DS 1|0:1 0|0:0 1|0:1.04 1|1:1.85 0|0:0.01 1|0:0.81 1|0:0.97 1|1:2 1|1:1.99 0|1:0.95 0|1:1 1|1:1.97 1|1:2 1|1:2 -chr20 68411 . C G . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68435 . G C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68443 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68511 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68528 . A G . PASS DR2=0.05;AF=0.0022;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.06 0|0:0 0|0:0 -chr20 68557 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68575 . G A . PASS DR2=0.09;AF=0.0098;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0.04 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68649 . G A . PASS DR2=0.01;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 68719 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 68750 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69080 . AC A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69086 . A G . PASS DR2=0.01;AF=0.0009;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69115 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69207 . A G . PASS DR2=0.1;AF=0.0104;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.04 0|0:0.12 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69218 . GAC G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69241 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69285 . G A . PASS DR2=0.16;AF=0.0061;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.17 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69306 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69646 . T C . PASS DR2=0.01;AF=0.0009;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 69841 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69877 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 69947 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70033 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70108 . A G . PASS DR2=0.01;AF=0.0015;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.02 0|0:0 -chr20 70238 . A T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70256 . C A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70510 . T G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70638 . T C . PASS DR2=0.04;AF=0.0025;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.05 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70665 . C A . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70672 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70744 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70745 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70851 . T C . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70867 . C T . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70925 . G A . PASS DR2=0.01;AF=0.0011;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 70938 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 71012 . T C . PASS DR2=0.28;AF=0.0202;IMP GT:DS 0|0:0 0|0:0.22 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.34 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 71110 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 71254 . T G . PASS DR2=0.1;AF=0.0102;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.04 0|0:0.12 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 71305 . C G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 71437 . T C . PASS DR2=0.1;AF=0.0102;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.04 0|0:0.12 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 71513 . G T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 71841 . G C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 71865 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 71974 . G GA . PASS DR2=0.79;AF=0.046;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0.04 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|1:1 -chr20 71974 . GA G . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 71983 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72018 . CTT C . PASS DR2=0.08;AF=0.0059;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.1 0|0:0.01 0|0:0.06 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72040 . G A . PASS DR2=0.1;AF=0.0102;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.04 0|0:0.12 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72087 . G A . PASS DR2=0.02;AF=0.0015;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72195 . T C . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 72211 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72252 . G C . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72272 . T G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72287 . T C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72295 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72296 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72446 . C A . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72450 . G A . PASS DR2=0.88;AF=0.1587;IMP GT:DS 0|0:0 1|1:2 0|0:0 0|0:0 0|0:0.25 0|0:0.05 0|0:0.2 0|0:0 0|0:0 1|0:1 1|0:0.94 0|0:0 0|0:0 0|0:0 -chr20 72574 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72602 . G A . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72655 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72734 . A G . PASS DR2=0.04;AF=0.0027;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.05 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 72765 . T TA . PASS DR2=0.93;AF=0.6548;IMP GT:DS 1|0:1 0|0:0 1|0:1.04 1|1:2 0|0:0.13 1|0:1.02 1|0:1.12 1|1:2 1|1:1.99 0|1:0.99 0|1:1.06 1|1:2 1|1:2 1|1:2 -chr20 72765 . T TAA . PASS DR2=0.02;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72765 . TA T . PASS DR2=0.98;AF=0.1406;IMP GT:DS 0|0:0 1|1:2 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 1|0:1 1|0:0.94 0|0:0 0|0:0 0|0:0 -chr20 72844 . A G . PASS DR2=0.66;AF=0.0266;IMP GT:DS 0|0:0 0|0:0 0|1:0.71 0|0:0 0|0:0.01 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72956 . C A . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 72982 . A C . PASS DR2=1;AF=0.1429;IMP GT:DS 0|0:0 1|1:2 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 1|0:1 1|0:1 0|0:0 0|0:0 0|0:0 -chr20 73112 . G A . PASS DR2=0.1;AF=0.0102;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.04 0|0:0.12 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73204 . T C . PASS DR2=0.1;AF=0.0076;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.01 0|0:0.08 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73315 . G C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73474 . TA T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73517 . A G . PASS DR2=0.05;AF=0.002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.06 0|0:0 0|0:0 0|0:0 -chr20 73569 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73618 . G A . PASS DR2=0.02;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73678 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73751 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73757 . G A . PASS DR2=0.09;AF=0.01;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0.04 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73765 . C A . PASS DR2=0.94;AF=0.1514;IMP GT:DS 0|0:0 1|1:2 0|0:0 0|0:0 0|0:0.14 0|0:0.01 0|0:0.08 0|0:0 0|0:0.01 1|0:1 1|0:1 0|0:0.01 0|0:0 0|0:0 -chr20 73766 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73800 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 73853 . G A . PASS DR2=0.03;AF=0.0026;IMP GT:DS 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 73866 . C T . PASS DR2=0.76;AF=0.0335;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|1:0.86 0|0:0 0|0:0 -chr20 73878 . G A . PASS DR2=0.03;AF=0.0011;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 74102 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 74256 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 74331 . T G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 74349 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 74376 . GA G . PASS DR2=0.01;AF=0.0026;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.02 0|0:0 0|0:0 -chr20 74400 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 74406 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 74676 . C A . PASS DR2=0.02;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 74775 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 74857 . G C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75107 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75118 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75199 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75250 . C T . PASS DR2=0.77;AF=0.039;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.05 0|0:0 0|0:0 0|0:0 0|0:0.11 0|0:0 0|0:0 1|0:0.91 0|0:0 -chr20 75251 . G A . PASS DR2=0.01;AF=0.0017;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75314 . T C . PASS DR2=0.09;AF=0.0102;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0.04 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75325 . CAG C . PASS DR2=0.02;AF=0.0012;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75332 . A G . PASS DR2=0.02;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 -chr20 75380 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75398 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75411 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75529 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75541 . C A . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75555 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75577 . G A . PASS DR2=0.05;AF=0.0029;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.06 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75692 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75743 . C T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75851 . A G . PASS DR2=0.03;AF=0.002;IMP GT:DS 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75864 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75888 . G GA . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75888 . GA G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 75969 . G GA . PASS DR2=0.83;AF=0.0413;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|1:0.98 0|0:0.01 0|0:0 0|0:0.17 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76083 . T C . PASS DR2=0.82;AF=0.0417;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|1:0.98 0|0:0.01 0|0:0 0|0:0.17 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76090 . G A . PASS DR2=0.01;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 76101 . T C . PASS DR2=0.04;AF=0.0028;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.06 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76233 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76347 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76410 . T A . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76459 . G C . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76536 . C G . PASS DR2=0.01;AF=0.0007;IMP GT:DS 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76583 . C G . PASS DR2=0.02;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 -chr20 76624 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76624 . G T . PASS DR2=0;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76665 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76688 . G A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76709 . C A . PASS DR2=0.16;AF=0.0087;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.2 0|0:0 0|0:0 0|0:0 -chr20 76709 . C CA . PASS DR2=0.1;AF=0.011;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.04 0|0:0.12 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76709 . C CAA . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76709 . CA C . PASS DR2=0.02;AF=0.0045;IMP GT:DS 0|0:0 0|0:0.05 0|0:0 0|0:0 0|0:0.01 0|0:0.02 0|0:0.01 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76861 . C T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76908 . G C . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 76944 . C G . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77005 . A G . PASS DR2=0.99;AF=0.1435;IMP GT:DS 0|0:0 1|1:2 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 1|0:1 1|0:1 0|0:0.01 0|0:0 0|0:0 -chr20 77039 . C T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77112 . G A . PASS DR2=0.02;AF=0.0009;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77141 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77228 . A G . PASS DR2=0.02;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77238 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77260 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77272 . T C . PASS DR2=0.28;AF=0.0203;IMP GT:DS 0|0:0 0|0:0.22 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.34 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77292 . G A . PASS DR2=0.03;AF=0.0015;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 -chr20 77294 . C A . PASS DR2=0.02;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77391 . CAT C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77420 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77679 . T C . PASS DR2=0.1;AF=0.0101;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0.04 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77726 . T C . PASS DR2=0.1;AF=0.0101;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0.04 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77736 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77736 . C CA . PASS DR2=0.18;AF=0.0139;IMP GT:DS 0|0:0.01 0|0:0 0|0:0.26 0|0:0.01 0|0:0.02 0|0:0.04 0|0:0.04 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77736 . C CAA . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77736 . CA C . PASS DR2=0.89;AF=0.2703;IMP GT:DS 0|0:0 1|1:2 0|0:0 0|0:0.01 1|1:1.54 0|1:0.86 0|0:0.13 0|1:0.99 0|0:0.01 1|0:1.01 1|0:1 0|0:0.02 0|0:0 0|0:0 -chr20 77736 . CAA C . PASS DR2=0.09;AF=0.013;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.21 0|0:0.04 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77736 . CAAA C . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77760 . A T . PASS DR2=0.03;AF=0.001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 77922 . T A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78012 . T C . PASS DR2=0.32;AF=0.0148;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.39 0|0:0.01 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78093 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78212 . C T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78346 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78378 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78443 . G A . PASS DR2=0.02;AF=0.0015;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78498 . T C . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78506 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78511 . G A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78519 . G C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78664 . T C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78705 . A T . PASS DR2=0.87;AF=0.2003;IMP GT:DS 0|0:0 1|1:2 0|0:0 0|0:0 0|0:0.44 0|1:0.9 0|0:0.27 0|0:0 0|0:0.01 1|0:1 1|0:0.98 0|0:0.01 0|0:0 0|0:0 -chr20 78734 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78778 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78833 . CAAAA C . PASS DR2=0.02;AF=0.0013;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78850 . G C . PASS DR2=0.02;AF=0.0015;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78861 . T A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78869 . G A . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 78949 . A G . PASS DR2=0.07;AF=0.0052;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.08 0|0:0 0|0:0.06 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 79071 . C T . PASS DR2=0.01;AF=0.0008;IMP GT:DS 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 79212 . T C . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 79256 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 79621 . G C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 79838 . C T . PASS DR2=0.02;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 79881 . T TC . PASS DR2=0.31;AF=0.0133;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.36 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 79927 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 79930 . C A . PASS DR2=0.01;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80169 . G GA . PASS DR2=0.01;AF=0.0007;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80187 . T G . PASS DR2=0.48;AF=0.0181;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 1|0:0.5 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80254 . A G . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80275 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80403 . C A . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80429 . C T . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 80457 . C T . PASS DR2=0.89;AF=0.1609;IMP GT:DS 0|0:0 1|1:2 0|0:0 0|0:0 0|0:0.25 0|0:0.05 0|0:0.2 0|0:0 0|0:0 1|0:1 1|0:1 0|0:0 0|0:0 0|0:0 -chr20 80497 . C CT . PASS DR2=0.96;AF=0.107;IMP GT:DS 0|0:0 1|1:1.97 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 1|0:0.97 0|0:0.03 0|0:0 0|0:0 0|0:0 -chr20 80497 . CT C . PASS DR2=0.02;AF=0.0013;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80597 . G A . PASS DR2=0.01;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 80688 . C T . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80747 . T C . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80809 . T C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80876 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80897 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 80997 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81010 . C A . PASS DR2=0.78;AF=0.028;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|1:0.78 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81083 . A C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81084 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81154 . T G . PASS DR2=0.99;AF=0.6444;IMP GT:DS 1|0:1 0|0:0 1|0:1 1|1:2 0|0:0.02 1|0:1.02 1|0:1.01 1|1:2 1|1:2 0|1:1 0|1:1 1|1:2 1|1:2 1|1:2 -chr20 81314 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81459 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81467 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81614 . T C . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81707 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81746 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81769 . T A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81837 . A G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81904 . C G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 81912 . T C . PASS DR2=0.03;AF=0.001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82090 . C A . PASS DR2=0.86;AF=0.0999;IMP GT:DS 0|1:1 0|0:0 0|1:1 0|0:0 0|0:0.09 0|0:0.05 0|1:0.66 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82098 . T C . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82142 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82147 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82155 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82187 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82198 . A T . PASS DR2=0.03;AF=0.0016;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82239 . TG T . PASS DR2=0.03;AF=0.0028;IMP GT:DS 0|0:0 0|0:0.05 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82294 . A G . PASS DR2=0.02;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 -chr20 82305 . T A . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82314 . C T . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82367 . C A . PASS DR2=0.02;AF=0.0015;IMP GT:DS 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82413 . A G . PASS DR2=0.03;AF=0.001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82456 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82514 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82590 . T G . PASS DR2=0.11;AF=0.0158;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.15 0|0:0 0|0:0.17 0|0:0.03 0|0:0 0|0:0.01 0|0:0.04 0|0:0.01 0|0:0.03 0|0:0 0|0:0 -chr20 82592 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82603 . A C . PASS DR2=1;AF=0.1429 GT:DS 0|0:0 1|1:2 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 1|0:1 1|0:1 0|0:0 0|0:0 0|0:0 -chr20 82616 . A AT . PASS DR2=0.48;AF=0.0181;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 1|0:0.5 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82710 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82719 . C T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82785 . G T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82811 . C G . PASS DR2=0.1;AF=0.0101;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0.04 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82880 . G A . PASS DR2=0.06;AF=0.0025;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.07 0|0:0 0|0:0 -chr20 82900 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82907 . G GT . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82914 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82918 . C A . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82945 . T G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 82995 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83055 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83092 . C T . PASS DR2=0.67;AF=0.0386;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.18 0|1:0.84 0|0:0.06 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83158 . C T . PASS DR2=1;AF=0.3571 GT:DS 0|1:1 1|1:2 0|1:1 0|0:0 1|1:2 0|1:1 0|1:1 0|0:0 0|0:0 1|0:1 1|0:1 0|0:0 0|0:0 0|0:0 -chr20 83167 . G C . PASS DR2=0.67;AF=0.0386;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.18 0|1:0.84 0|0:0.06 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83172 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83224 . G A . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83326 . A G . PASS DR2=0.1;AF=0.0102;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0.04 0|0:0.12 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83388 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83421 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83509 . C A . PASS DR2=0.01;AF=0.001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83510 . C T . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83534 . A G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83545 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83582 . A AT . PASS DR2=0.88;AF=0.0408;IMP GT:DS 0|1:1 0|0:0 0|0:0.08 0|0:0.01 0|0:0 0|0:0.03 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83582 . AT A . PASS DR2=0.03;AF=0.0045;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.06 0|0:0.02 0|0:0.03 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83633 . T G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83667 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83682 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83728 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83791 . C T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83801 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83812 . C G . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83868 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83898 . A T . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83902 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 83962 . A C . PASS DR2=1;AF=0.0357;IMP GT:DS 0|1:1 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84219 . G A . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 84257 . G A . PASS DR2=0;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84489 . G GA . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84532 . CT C . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 84599 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84610 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84626 . CA C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84647 . G T . PASS DR2=0.86;AF=0.0999;IMP GT:DS 0|1:1 0|0:0 0|1:1 0|0:0 0|0:0.09 0|0:0.05 0|1:0.66 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84694 . T G . PASS DR2=0.02;AF=0.0009;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84696 . A T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84796 . A G . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84804 . T A . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84856 . T G . PASS DR2=0.22;AF=0.0168;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.19 0|0:0.01 0|0:0 0|0:0 0|0:0.27 0|0:0 -chr20 84891 . G T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 84987 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85057 . G T . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85223 . T C . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85259 . G A . PASS DR2=0.89;AF=0.8383;IMP GT:DS 1|1:2 0|0:0 1|1:2 1|1:2 1|1:1.74 1|1:1.95 1|1:1.81 1|1:2 1|1:1.99 0|1:1 0|1:1 1|1:1.99 1|1:1.99 1|1:2 -chr20 85297 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85310 . T C . PASS DR2=0.06;AF=0.0047;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0.01 0|0:0.08 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85345 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85412 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85452 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85453 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85607 . C T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85699 . T A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 85729 . G A . PASS DR2=0.94;AF=0.8485;IMP GT:DS 1|1:2 0|0:0 1|1:2 1|1:2 1|1:1.87 1|1:1.99 1|1:1.92 1|1:2 1|1:1.99 0|1:1 0|1:1 1|1:1.99 1|1:1.99 1|1:2 -chr20 86064 . C T . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86079 . C A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86097 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86132 . A G . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86405 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86415 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86441 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86543 . C T . PASS DR2=0.01;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86635 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86757 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86785 . G C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86820 . A G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86841 . A T . PASS DR2=0.02;AF=0.0013;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86845 . T C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86859 . T TTGGTATCTAG . PASS DR2=0.86;AF=0.8384;IMP GT:DS 1|1:2 0|0:0.02 1|1:2 1|1:2 1|1:1.87 1|1:1.99 1|1:1.92 1|1:2 1|1:1.83 0|1:1.01 0|1:1.01 1|1:1.99 1|1:1.84 1|1:2 -chr20 86865 . T TC . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86866 . T TAGTGGTATC . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86913 . A T . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86933 . T C . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 86983 . T TG . PASS DR2=0.77;AF=0.0335;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|1:0.86 0|0:0.07 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87000 . C T . PASS DR2=0.32;AF=0.0129;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.36 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87003 . T C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87119 . C T . PASS DR2=0.02;AF=0.003;IMP GT:DS 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.03 0|0:0 0|0:0.01 0|0:0.02 0|0:0 -chr20 87124 . C T . PASS DR2=0;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87132 . A C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87264 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87290 . GAA G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87309 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87541 . A T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87551 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87564 . A G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87623 . T C . PASS DR2=0.65;AF=0.0568;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 1|0:0.6 0|1:0.85 0|0:0.12 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87755 . C T . PASS DR2=0.51;AF=0.0378;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 1|0:0.52 0|0:0 0|0:0 0|0:0 0|1:0.54 0|0:0 -chr20 87862 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87894 . G A . PASS DR2=0.04;AF=0.0026;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.05 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 87977 . G A . PASS DR2=0.51;AF=0.0184;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 1|0:0.52 0|0:0 0|0:0 0|0:0 -chr20 88019 . C A . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88026 . T G . PASS DR2=0.03;AF=0.0019;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88079 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88108 . T C . PASS DR2=1;AF=0.6071 GT:DS 1|1:2 0|0:0 1|1:2 0|1:1 1|0:1 0|1:1 1|1:2 0|0:0 1|1:2 0|1:1 1|1:2 1|0:1 1|1:2 0|0:0 -chr20 88152 . G A . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88155 . G C . PASS DR2=0.01;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88158 . T C . PASS DR2=0.01;AF=0.0012;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 88169 . G A . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88425 . C G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88453 . G A . PASS DR2=1;AF=0.3929 GT:DS 1|1:2 1|1:2 0|1:1 0|1:1 1|0:1 0|0:0 1|1:2 0|0:0 0|0:0 1|0:1 0|1:1 0|0:0 0|0:0 0|0:0 -chr20 88463 . G T . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 88535 . A G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88615 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88666 . T C . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88715 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88767 . C T . PASS DR2=0.84;AF=0.1026;IMP GT:DS 0|1:1 0|0:0.08 0|1:1 0|0:0 0|0:0.08 0|0:0.02 0|1:0.67 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88804 . C A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88840 . CT C . PASS DR2=0.81;AF=0.16;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.16 0|1:1.46 1|0:0.74 0|0:0.01 1|0:1 0|0:0.01 0|0:0.09 0|0:0 0|1:1 0|0:0 0|0:0 -chr20 88865 . G GAC . PASS DR2=0.77;AF=0.046;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0.06 0|0:0.08 0|0:0.04 0|0:0 0|0:0 0|0:0.06 0|0:0.03 0|0:0.01 0|0:0 1|0:1 -chr20 88865 . G GACAC . PASS DR2=0.76;AF=0.1895;IMP GT:DS 0|1:1 0|0:0.01 0|0:0 0|0:0.1 1|1:1.61 0|1:1.29 0|0:0.14 1|0:1 0|0:0 0|0:0.1 0|0:0 0|0:0.06 0|0:0 0|0:0 -chr20 88865 . G GACACAC . PASS DR2=0.77;AF=0.3793;IMP GT:DS 1|0:0.99 1|1:1.67 0|1:1 0|1:1.08 0|0:0.14 0|0:0.31 1|1:1.64 0|0:0 0|0:0 0|0:0.89 0|1:1 0|1:0.94 1|0:0.97 0|0:0 -chr20 88865 . G GACACACAC . PASS DR2=0.02;AF=0.008;IMP GT:DS 0|0:0.01 0|0:0.03 0|0:0 0|0:0.01 0|0:0.03 0|0:0.01 0|0:0.06 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0.02 0|0:0 -chr20 88865 . G GACACACACAC . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 88985 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89027 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89063 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89082 . TATAA T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89088 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89106 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89390 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89451 . T C . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 89649 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89740 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89843 . CTCTT C . PASS DR2=0.1;AF=0.0096;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.1 0|0:0.03 0|0:0.14 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89849 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89851 . CTT C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 89864 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90144 . A G . PASS DR2=0.02;AF=0.002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0.01 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90191 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90197 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90252 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90279 . A G . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90339 . G A . PASS DR2=0.09;AF=0.0087;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.08 0|0:0.03 0|0:0.13 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90365 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90374 . T C . PASS DR2=0.06;AF=0.0068;IMP GT:DS 0|0:0.1 0|0:0.02 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.05 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90436 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90437 . G A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90438 . C T . PASS DR2=0.05;AF=0.0085;IMP GT:DS 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.07 0|0:0.01 0|0:0.06 0|0:0 0|0:0 0|0:0.07 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90452 . G A . PASS DR2=0.08;AF=0.0116;IMP GT:DS 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.11 0|0:0.01 0|0:0.11 0|0:0 0|0:0 0|0:0.07 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90745 . A G . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90748 . C A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90803 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90803 . CTTTG C . PASS DR2=0.15;AF=0.0086;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.03 0|0:0.2 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90868 . T G . PASS DR2=0.05;AF=0.0031;IMP GT:DS 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.07 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90950 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90969 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 90992 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91095 . G C . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91168 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91218 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91236 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91303 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91309 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91329 . G A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91351 . C T . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 91395 . G A . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91463 . TA T . PASS DR2=0.85;AF=0.5862;IMP GT:DS 0|1:1 0|0:0.5 1|1:1.99 1|0:0.95 1|1:1.91 1|1:1.85 0|1:0.83 1|0:1 1|1:2 0|0:0.39 0|0:0.01 1|1:2 0|1:1 0|1:1 -chr20 91546 . T G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91549 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91565 . T G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91617 . T G . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91676 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91705 . A C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91823 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91832 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91843 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 91991 . G T . PASS DR2=0.02;AF=0.002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92024 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92050 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92059 . T TATAC . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92059 . TATAC T . PASS DR2=0.02;AF=0.001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92078 . C T . PASS DR2=0.01;AF=0.0011;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92131 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92222 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92251 . G A . PASS DR2=0.08;AF=0.0059;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0.03 0|0:0.11 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92253 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92263 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92276 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92373 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92390 . TC T . PASS DR2=0.02;AF=0.0012;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92393 . TTC T . PASS DR2=0.02;AF=0.0012;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92395 . C CT . PASS DR2=0.12;AF=0.0212;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0.02 0|0:0.07 0|0:0.2 0|0:0.18 0|0:0 0|0:0.02 0|0:0.08 0|0:0 0|0:0.02 0|0:0 0|0:0 -chr20 92395 . C CTT . PASS DR2=0.01;AF=0.0011;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92395 . C CTTT . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92395 . CT C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92408 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92437 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92461 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92495 . T A . PASS DR2=0;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92613 . G C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92728 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92903 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92969 . C CT . PASS DR2=0.03;AF=0.0036;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0.06 0|0:0 0|0:0 0|0:0 -chr20 92969 . CT C . PASS DR2=0.88;AF=0.225;IMP GT:DS 1|0:0.98 0|0:0.12 0|0:0.01 0|1:0.99 0|0:0.03 0|0:0.04 1|0:1.07 0|0:0 0|0:0.01 0|1:1.03 0|1:1.02 0|0:0 1|0:1.01 0|0:0 -chr20 92969 . CTT C . PASS DR2=0.83;AF=0.5916;IMP GT:DS 0|1:1 1|1:1.63 1|1:1.99 1|0:0.95 1|1:1.95 1|0:1.08 0|1:0.72 1|0:1 1|1:1.9 0|0:0.39 0|0:0.01 1|1:1.97 0|1:0.98 0|1:1 -chr20 92969 . CTTT C . PASS DR2=0;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 92969 . CTTTT C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93078 . C A . PASS DR2=0.84;AF=0.0366;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 1|0:0.92 0|0:0.1 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93117 . C G . PASS DR2=0.01;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 -chr20 93119 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93120 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93160 . C T . PASS DR2=0.15;AF=0.018;IMP GT:DS 0|0:0.14 0|0:0 0|0:0 0|0:0.17 0|0:0 0|0:0 0|0:0.19 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93211 . T G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93215 . C T . PASS DR2=0.01;AF=0.0015;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 -chr20 93216 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93217 . C T . PASS DR2=0.01;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93221 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93316 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93492 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93568 . A AT . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93568 . AT A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93606 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93640 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93706 . G A . PASS DR2=1;AF=0.25 GT:DS 1|0:1 0|1:1 0|0:0 0|1:1 0|0:0 0|0:0 1|0:1 0|0:0 0|0:0 0|1:1 0|1:1 0|0:0 1|0:1 0|0:0 -chr20 93862 . C T . PASS DR2=0.03;AF=0.0014;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 -chr20 93902 . A AT . PASS DR2=0.84;AF=0.2401;IMP GT:DS 0|1:1 1|0:0.74 0|1:1 0|0:0 1|0:1 0|1:0.99 0|1:0.69 0|0:0 0|0:0 0|0:0.3 0|0:0 0|0:0 0|0:0 0|1:1 -chr20 93902 . AT A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 93904 . T A . PASS DR2=0.01;AF=0.001;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 94010 . T G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94036 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94088 . G GT . PASS DR2=0.91;AF=0.6036;IMP GT:DS 0|1:1 1|0:0.75 1|1:2 1|0:1 1|1:2 1|1:1.91 0|1:0.86 1|0:1 1|1:2 0|0:0.38 0|0:0 1|1:2 0|1:1 0|1:1 -chr20 94088 . G GTT . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94237 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94264 . TG T . PASS DR2=0.06;AF=0.0029;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.07 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94328 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94329 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94361 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94425 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94589 . T C . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94598 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94613 . C A . PASS DR2=1;AF=0.3929 GT:DS 0|0:0 0|0:0 1|0:1 1|0:1 1|1:2 1|0:1 0|0:0 1|0:1 1|1:2 0|0:0 0|0:0 1|1:2 0|1:1 0|0:0 -chr20 94781 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94843 . T A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94875 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94881 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94951 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 94969 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95088 . C G . PASS DR2=0.92;AF=0.067;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|1:0.86 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|1:1 -chr20 95172 . G T . PASS DR2=0.01;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 95187 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95239 . G A . PASS DR2=0.03;AF=0.0011;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95316 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95358 . T C . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95440 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95441 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95472 . G T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95538 . AT A . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 -chr20 95641 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95655 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95660 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95747 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95830 . C T . PASS DR2=0.02;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 -chr20 95845 . T A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95878 . T C . PASS DR2=1;AF=0.0357;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|1:1 -chr20 95892 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 95994 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96048 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96059 . G A . PASS DR2=0.02;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96103 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96130 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96257 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96261 . T G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96274 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96279 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96293 . A G . PASS DR2=0.04;AF=0.0015;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96321 . T C . PASS DR2=0.85;AF=0.8669;IMP GT:DS 1|1:2 1|1:1.92 1|1:2 1|1:2 1|1:2 1|1:1.95 1|1:1.88 1|0:1 1|1:2 1|1:1.53 0|1:1 1|1:2 1|1:2 0|1:1 -chr20 96372 . C T . PASS DR2=0.01;AF=0.0013;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 96384 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96653 . TTATC T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96669 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96686 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96686 . GA G . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96752 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96829 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96968 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 96968 . GTAAC G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97009 . A C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97033 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97066 . T C . PASS DR2=0.04;AF=0.0019;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.05 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97078 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97079 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97131 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97175 . G A . PASS DR2=0.11;AF=0.0053;IMP GT:DS 0|0:0 0|0:0.13 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97230 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97284 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97289 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97299 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97324 . G GT . PASS DR2=0.95;AF=0.3889;IMP GT:DS 0|0:0 0|0:0 1|0:0.99 1|0:0.99 1|1:2 1|0:0.89 0|0:0.01 1|0:1 1|1:1.99 0|0:0.01 0|0:0.02 1|1:1.99 0|1:1 0|0:0 -chr20 97324 . G GTT . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97324 . GT G . PASS DR2=0.01;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97360 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97361 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97364 . C G . PASS DR2=0.02;AF=0.0009;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97422 . C T . PASS DR2=0.04;AF=0.0029;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.05 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97513 . G A . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97571 . T C . PASS DR2=0.02;AF=0.0008;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 -chr20 97613 . C T . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97615 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97616 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97620 . G GATT . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97620 . GATT G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97688 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97717 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97800 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97814 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97834 . G C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97873 . C T . PASS DR2=0.01;AF=0.0011;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 97924 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 97925 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98004 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98176 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98200 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98207 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98244 . C A . PASS DR2=0.05;AF=0.0036;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.07 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98297 . T C . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98352 . GTT G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98404 . G A . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98471 . G GT . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98471 . GT G . PASS DR2=0.8;AF=0.1307;IMP GT:DS 0|1:1 1|0:0.75 0|1:1 0|0:0 0|0:0 0|0:0.01 0|1:0.63 0|0:0 0|0:0 0|0:0.27 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98593 . T C . PASS DR2=0.91;AF=0.6018;IMP GT:DS 0|1:1 1|0:0.76 1|1:2 1|0:1 1|1:2 1|1:1.96 0|1:0.82 1|0:1 1|1:2 0|0:0.32 0|0:0 1|1:2 0|1:1 0|1:1 -chr20 98658 . A G . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98711 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98722 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98778 . CTTGAG C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98792 . T C . PASS DR2=0.04;AF=0.0016;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98818 . G A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98925 . C T . PASS DR2=0.01;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 -chr20 98973 . T C . PASS DR2=0.01;AF=0.0015;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 -chr20 98976 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 98991 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 99092 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 99154 . A G . PASS DR2=0.02;AF=0.0013;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 99223 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 99308 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 99348 . G A . PASS DR2=0.01;AF=0.001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 99430 . G A . PASS DR2=0.87;AF=0.5838;IMP GT:DS 0|1:1 1|0:0.75 1|1:2 1|0:1 1|1:2 1|1:1.65 0|1:0.68 1|0:1 1|1:2 0|0:0.27 0|0:0 1|1:2 0|1:1 0|1:1 -chr20 99657 . T C . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 99699 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 99840 . G C . PASS DR2=0.15;AF=0.0103;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.21 0|0:0.07 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 99872 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 99952 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100014 . A G . PASS DR2=1;AF=0.8929 GT:DS 1|1:2 1|1:2 1|1:2 1|1:2 1|1:2 1|1:2 1|1:2 1|0:1 1|1:2 1|1:2 0|1:1 1|1:2 1|1:2 0|1:1 -chr20 100082 . ACT A . PASS DR2=0;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100087 . C G . PASS DR2=0.11;AF=0.006;IMP GT:DS 0|0:0 0|0:0.14 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100198 . G A . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100220 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100248 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100357 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100360 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100365 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100382 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100402 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100430 . A G . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100657 . G A . PASS DR2=0.04;AF=0.0013;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100704 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 100970 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101025 . C T . PASS DR2=0.02;AF=0.0028;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0.01 0|0:0.02 0|0:0 -chr20 101084 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101205 . CTTCT C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101295 . G GT . PASS DR2=0.01;AF=0.0019;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 101308 . T C . PASS DR2=0.03;AF=0.0012;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101317 . A G . PASS DR2=0.04;AF=0.0016;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101338 . T C . PASS DR2=0.91;AF=0.219;IMP GT:DS 0|0:0.02 0|0:0.05 1|0:1.06 1|0:0.89 0|0:0.01 0|0:0.04 0|0:0.07 0|0:0 1|1:1.99 0|0:0.02 0|0:0 1|0:0.99 0|1:1 0|0:0 -chr20 101339 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101341 . A G . PASS DR2=0.08;AF=0.0162;IMP GT:DS 0|0:0.02 0|0:0.04 0|0:0 0|0:0.02 0|0:0 0|0:0.01 0|0:0.16 0|0:0 0|0:0 0|0:0.21 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101342 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101343 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101346 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101349 . C G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101365 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101367 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101371 . T A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101372 . G A . PASS DR2=0.51;AF=0.044;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 1|0:0.62 0|0:0 0|0:0 0|0:0.03 0|1:0.55 0|0:0 -chr20 101379 . G A . PASS DR2=0.04;AF=0.0015;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101386 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101397 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101410 . CTA C . PASS DR2=0.12;AF=0.005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101411 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101411 . T G . PASS DR2=0.04;AF=0.0017;IMP GT:DS 0|0:0 0|0:0 0|0:0.05 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101423 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101428 . GGA G . PASS DR2=0.12;AF=0.005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0.13 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101429 . GAGCA G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101430 . A C . PASS DR2=0.09;AF=0.0045;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.11 0|0:0 0|0:0 0|0:0.02 0|0:0 -chr20 101430 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101432 . CA C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101433 . A C . PASS DR2=0.78;AF=0.2581;IMP GT:DS 1|0:0.96 1|1:1.67 0|1:0.79 0|1:1.17 0|0:0 0|0:0.02 1|0:0.94 0|0:0 0|0:0 1|0:0.67 0|0:0 0|0:0.01 1|0:0.98 0|0:0 -chr20 101435 . G A . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101437 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101438 . G A . PASS DR2=0.78;AF=0.2636;IMP GT:DS 1|0:0.96 1|1:1.67 0|1:0.79 0|1:1.31 0|0:0 0|0:0.02 1|0:0.94 0|0:0 0|0:0 1|0:0.68 0|0:0 0|0:0.01 1|0:0.99 0|0:0 -chr20 101438 . G GCCA . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101441 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101442 . T C . PASS DR2=0.76;AF=0.0412;IMP GT:DS 0|0:0.02 0|1:0.94 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.11 0|0:0 0|0:0 0|0:0.07 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101447 . G A . PASS DR2=0.02;AF=0.0011;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101448 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101474 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101478 . G A . PASS DR2=0.02;AF=0.0012;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101485 . CCT C . PASS DR2=0.02;AF=0.0012;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101486 . C A . PASS DR2=0.8;AF=0.0455;IMP GT:DS 0|1:1 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0.1 0|0:0 0|0:0 0|0:0 0|0:0.12 0|0:0 -chr20 101486 . C G . PASS DR2=0.9;AF=0.0348;IMP GT:DS 0|0:0 0|1:0.94 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 -chr20 101488 . G GCA . PASS DR2=0.02;AF=0.0012;IMP GT:DS 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101488 . G GTT . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 101491 . A G . PASS DR2=1;AF=0.0358;IMP GT:DS 0|1:1 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101495 . A G . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 101496 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101498 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101498 . A G . PASS DR2=0.78;AF=0.232;IMP GT:DS 0|0:0.02 0|0:0.08 1|0:1.1 1|0:0.89 0|0:0.01 0|0:0.05 0|1:0.64 0|0:0 1|1:1.77 0|0:0.06 0|0:0.03 1|0:1 0|1:0.86 0|0:0 -chr20 101505 . A C . PASS DR2=0.69;AF=0.2184;IMP GT:DS 0|0:0.02 0|0:0.12 1|0:1.19 1|0:0.92 0|0:0.01 0|0:0.06 0|1:0.66 0|0:0 0|1:1.39 0|0:0.08 0|0:0.22 1|0:1 0|0:0.45 0|0:0 -chr20 101506 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101511 . C T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101515 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101520 . G C . PASS DR2=0.01;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101527 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101530 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101531 . A G . PASS DR2=0.14;AF=0.0149;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0.21 0|0:0.01 0|0:0.02 0|0:0 0|0:0.14 0|0:0 -chr20 101531 . A T . PASS DR2=0.15;AF=0.0055;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.15 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101535 . T C . PASS DR2=0.14;AF=0.0207;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.16 0|0:0.03 0|0:0 0|0:0.21 0|0:0.02 0|0:0.02 0|0:0 0|0:0.14 0|0:0 -chr20 101536 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101545 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101555 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101563 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101563 . T C . PASS DR2=0.16;AF=0.0198;IMP GT:DS 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0.03 0|0:0 0|0:0.21 0|0:0.02 0|0:0 0|0:0.01 0|0:0.24 0|0:0 -chr20 101566 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101568 . G A . PASS DR2=0.88;AF=0.0334;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 1|0:0.91 0|0:0 -chr20 101571 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101573 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101574 . G A . PASS DR2=0.9;AF=0.1579;IMP GT:DS 0|1:1 0|0:0 0|0:0 0|0:0 0|1:0.96 0|0:0.43 0|0:0 1|0:1 0|0:0 0|0:0.02 0|0:0.02 0|1:1 0|0:0 0|0:0 -chr20 101575 . C T . PASS DR2=0.88;AF=0.0334;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 1|0:0.91 0|0:0 -chr20 101576 . G A . PASS DR2=0.78;AF=0.7497;IMP GT:DS 1|1:1.9 1|1:1.92 1|1:1.95 1|1:1.97 1|1:1.96 1|1:1.63 1|1:1.7 1|0:1 1|1:2 1|0:0.85 0|0:0.02 1|1:1.99 0|1:1.09 0|1:1 -chr20 101576 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101579 . C T . PASS DR2=0.88;AF=0.0334;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 1|0:0.91 0|0:0 -chr20 101580 . G A . PASS DR2=0.03;AF=0.0024;IMP GT:DS 0|0:0 0|0:0.02 0|0:0.04 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101582 . C T . PASS DR2=0.46;AF=0.018;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0.49 0|0:0 0|0:0 0|0:0 -chr20 101590 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101617 . A G . PASS DR2=0.01;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101619 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101718 . T C . PASS DR2=0.01;AF=0.0007;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 -chr20 101774 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101775 . T C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101819 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101929 . T TTA . PASS DR2=0.22;AF=0.0145;IMP GT:DS 0|0:0 0|0:0 0|0:0.3 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.04 0|0:0 0|0:0 0|0:0 0|0:0.05 0|0:0 -chr20 101930 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 101944 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102060 . T C . PASS DR2=0.94;AF=0.1499;IMP GT:DS 0|1:1 0|0:0 0|0:0 0|0:0 0|1:0.99 0|0:0.19 0|0:0 1|0:1 0|0:0 0|0:0.02 0|0:0 0|1:1 0|0:0 0|0:0 -chr20 102113 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102129 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102134 . C G . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102181 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102257 . C T . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102326 . C T . PASS DR2=0.01;AF=0.001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 102420 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102466 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102555 . A T . PASS DR2=0.03;AF=0.0012;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102591 . T TTC . PASS DR2=0.7;AF=0.0581;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.07 0|0:0 0|0:0 0|0:0.47 0|0:0.1 0|0:0 0|0:0 1|0:0.97 -chr20 102591 . T TTCTC . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102591 . T TTCTCTCTC . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102591 . TTC T . PASS DR2=0.83;AF=0.0431;IMP GT:DS 0|1:1 0|0:0 0|0:0 0|0:0 0|0:0.03 0|0:0.06 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.1 0|0:0 0|0:0 -chr20 102591 . TTCTC T . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102619 . G C . PASS DR2=0.61;AF=0.2109;IMP GT:DS 0|0:0.14 1|0:0.95 0|1:1.3 0|0:0.31 0|0:0 0|0:0.02 0|1:0.75 0|0:0.17 0|0:0.2 1|0:0.87 0|0:0.02 0|0:0.15 1|0:1.01 0|0:0 -chr20 102621 . G C . PASS DR2=0.58;AF=0.2275;IMP GT:DS 1|0:0.85 0|1:1.02 1|0:0.65 0|1:1.3 0|0:0 0|0:0.06 1|1:1.42 0|0:0 0|0:0.52 0|0:0.02 0|0:0 0|0:0.22 0|0:0.31 0|0:0 -chr20 102623 . G C . PASS DR2=0.74;AF=0.1323;IMP GT:DS 0|0:0.01 0|1:0.94 0|0:0.02 0|0:0.06 0|0:0 0|1:0.82 0|1:0.62 0|0:0 0|0:0.11 0|0:0.02 0|0:0 0|0:0.09 0|0:0.01 0|1:1 -chr20 102625 . G C . PASS DR2=0.56;AF=0.0574;IMP GT:DS 0|0:0 0|0:0 0|0:0.02 0|0:0.04 0|0:0 0|1:0.79 0|1:0.56 0|0:0 0|0:0.11 0|0:0 0|0:0 0|0:0.09 0|0:0 0|0:0 -chr20 102627 . G C . PASS DR2=0.38;AF=0.0291;IMP GT:DS 0|0:0 0|0:0 0|0:0.02 0|0:0.04 0|0:0 0|0:0.01 0|1:0.55 0|0:0 0|0:0.11 0|0:0 0|0:0 0|0:0.08 0|0:0 0|0:0 -chr20 102629 . G C . PASS DR2=0.38;AF=0.0294;IMP GT:DS 0|0:0 0|0:0 0|0:0.02 0|0:0.05 0|0:0 0|0:0 0|1:0.55 0|0:0 0|0:0.11 0|0:0 0|0:0 0|0:0.09 0|0:0 0|0:0 -chr20 102631 . G C . PASS DR2=0.38;AF=0.0294;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0.05 0|0:0 0|0:0 0|1:0.55 0|0:0 0|0:0.11 0|0:0 0|0:0 0|0:0.1 0|0:0 0|0:0 -chr20 102632 . T C . PASS DR2=0.48;AF=0.0211;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|1:0.54 0|0:0 0|0:0 0|0:0.05 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102633 . G C . PASS DR2=0.58;AF=0.144;IMP GT:DS 0|0:0.02 0|0:0 0|0:0.1 1|0:0.52 0|0:0 0|0:0 0|1:0.6 0|0:0 1|1:1.41 0|0:0 0|0:0 1|0:0.82 0|1:0.55 0|0:0 -chr20 102634 . T C . PASS DR2=0.86;AF=0.1562;IMP GT:DS 0|0:0 1|0:0.97 0|1:0.99 0|0:0 0|0:0 0|0:0 0|1:0.55 0|0:0 0|0:0 1|0:0.87 0|0:0 0|0:0 1|0:0.98 0|0:0 -chr20 102635 . G C . PASS DR2=0.51;AF=0.0195;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|1:0.54 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 -chr20 102638 . T C . PASS DR2=0.51;AF=0.0198;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|1:0.54 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.01 0|0:0 -chr20 102640 . T C . PASS DR2=0.52;AF=0.0193;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|1:0.54 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102647 . G C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102817 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102929 . T G . PASS DR2=0.23;AF=0.0095;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.25 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102970 . C A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102988 . C T . PASS DR2=0.01;AF=0.0005;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 102989 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103021 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103151 . A G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103160 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103288 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103296 . G A . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103330 . C T . PASS DR2=0.01;AF=0.0006;IMP GT:DS 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103355 . C G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103461 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103560 . G A . PASS DR2=0.99;AF=0.036;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|1:1 -chr20 103581 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103784 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103833 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103844 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103847 . A G . PASS DR2=0.01;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 103858 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104025 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104075 . G A . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104086 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104106 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104197 . G A . PASS DR2=0.02;AF=0.0006;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.02 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104251 . C T . PASS DR2=0.05;AF=0.002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.06 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104265 . T C . PASS DR2=0;AF=0.0003;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104475 . C T . PASS DR2=0.92;AF=0.0341;IMP GT:DS 0|0:0 0|1:0.94 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0.01 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104532 . G T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104562 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104813 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 104860 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105010 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105234 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105260 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105296 . G C . PASS DR2=0.07;AF=0.0076;IMP GT:DS 0|0:0 0|0:0 0|0:0.01 0|0:0.03 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.09 0|0:0 0|0:0 0|0:0.08 0|0:0 0|0:0 -chr20 105364 . C A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105364 . C CA . PASS DR2=0.6;AF=0.0601;IMP GT:DS 0|0:0 1|0:0.61 0|1:0.79 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0.27 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 105364 . CA C . PASS DR2=0.86;AF=0.3771;IMP GT:DS 1|0:0.98 0|0:0.29 1|0:0.99 1|1:1.99 0|0:0.02 0|0:0.2 1|0:0.97 0|0:0 1|1:1.98 0|0:0.15 0|0:0.01 1|0:0.99 0|1:1 0|1:1 -chr20 105364 . CAA C . PASS DR2=0.07;AF=0.0114;IMP GT:DS 0|0:0.02 0|0:0 0|0:0.1 0|0:0.01 0|0:0 0|0:0.11 0|0:0.06 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 105376 . A C . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105397 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105419 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105462 . G T . PASS DR2=0;AF=0.0002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105474 . G A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105500 . A G . PASS DR2=0.05;AF=0.002;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.06 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105523 . AG A . PASS DR2=0.99;AF=0.1794;IMP GT:DS 0|1:1 0|0:0 0|0:0 0|0:0 0|1:1 0|1:1.02 0|0:0 1|0:1 0|0:0 0|0:0 0|0:0 0|1:1 0|0:0 0|0:0 -chr20 105675 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105681 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105703 . C G . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 105812 . T A . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106000 . A T . PASS DR2=0;AF=0.0001;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106048 . T C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106171 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106287 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106335 . G C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106362 . A C . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106426 . A T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106446 . TA T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106459 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106470 . C T . PASS DR2=0.01;AF=0.0004;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.01 0|0:0 0|0:0 0|0:0 -chr20 106471 . G A . PASS DR2=1;AF=0.3929 GT:DS 1|0:1 0|1:1 1|0:1 1|1:2 0|0:0 0|0:0 1|0:1 0|0:0 1|1:2 0|0:0 0|0:0 1|0:1 0|1:1 0|1:1 -chr20 106613 . C T . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106622 . GA G . PASS DR2=1;AF=0.1786;IMP GT:DS 0|1:1 0|0:0 0|0:0 0|0:0 0|1:1 0|1:1 0|0:0 1|0:1 0|0:0 0|0:0 0|0:0 0|1:1 0|0:0 0|0:0 -chr20 106679 . A G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106690 . C G . PASS DR2=0;AF=0;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 -chr20 106721 . A G . PASS DR2=0.43;AF=0.0191;IMP GT:DS 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0 0|0:0.05 0|0:0 0|0:0 0|0:0 0|0:0.48 0|0:0 0|0:0 0|0:0 diff --git a/sgkit/tests/io/vcf/data/1kg_target_chr20_38_imputed_chr20_500000.vcf.bgz b/sgkit/tests/io/vcf/data/1kg_target_chr20_38_imputed_chr20_500000.vcf.bgz deleted file mode 100644 index 5c2986878..000000000 Binary files a/sgkit/tests/io/vcf/data/1kg_target_chr20_38_imputed_chr20_500000.vcf.bgz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.csi.g.vcf.bgz b/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.csi.g.vcf.bgz deleted file mode 100644 index 19c47011c..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.csi.g.vcf.bgz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.csi.g.vcf.bgz.csi b/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.csi.g.vcf.bgz.csi deleted file mode 100644 index 72529fdd7..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.csi.g.vcf.bgz.csi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.bcf b/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.bcf deleted file mode 100644 index c45cd1fba..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.bcf and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.bcf.csi b/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.bcf.csi deleted file mode 100644 index 33e9e6351..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.bcf.csi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.vcf.bgz b/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.vcf.bgz deleted file mode 100644 index 19c47011c..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.vcf.bgz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi b/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi deleted file mode 100644 index b2f2a90e0..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz b/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz deleted file mode 100644 index 19c47011c..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.20.gatk3.4.g.vcf.bgz b/sgkit/tests/io/vcf/data/CEUTrio.20.gatk3.4.g.vcf.bgz deleted file mode 100644 index 307725c06..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.20.gatk3.4.g.vcf.bgz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.20.gatk3.4.g.vcf.bgz.tbi b/sgkit/tests/io/vcf/data/CEUTrio.20.gatk3.4.g.vcf.bgz.tbi deleted file mode 100644 index 5ebf0a4ee..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.20.gatk3.4.g.vcf.bgz.tbi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.21.gatk3.4.g.vcf.bgz b/sgkit/tests/io/vcf/data/CEUTrio.21.gatk3.4.g.vcf.bgz deleted file mode 100644 index a5e2ffdaa..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.21.gatk3.4.g.vcf.bgz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/CEUTrio.21.gatk3.4.g.vcf.bgz.tbi b/sgkit/tests/io/vcf/data/CEUTrio.21.gatk3.4.g.vcf.bgz.tbi deleted file mode 100644 index d5d7f8b32..000000000 Binary files a/sgkit/tests/io/vcf/data/CEUTrio.21.gatk3.4.g.vcf.bgz.tbi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/Homo_sapiens_assembly38.headerOnly.vcf.gz b/sgkit/tests/io/vcf/data/Homo_sapiens_assembly38.headerOnly.vcf.gz deleted file mode 100644 index 8c3c28dae..000000000 Binary files a/sgkit/tests/io/vcf/data/Homo_sapiens_assembly38.headerOnly.vcf.gz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/NA12878.prod.chr20snippet.g.vcf.gz b/sgkit/tests/io/vcf/data/NA12878.prod.chr20snippet.g.vcf.gz deleted file mode 100644 index a4fe1d4b1..000000000 Binary files a/sgkit/tests/io/vcf/data/NA12878.prod.chr20snippet.g.vcf.gz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/NA12878.prod.chr20snippet.g.vcf.gz.tbi b/sgkit/tests/io/vcf/data/NA12878.prod.chr20snippet.g.vcf.gz.tbi deleted file mode 100644 index cb96695e3..000000000 Binary files a/sgkit/tests/io/vcf/data/NA12878.prod.chr20snippet.g.vcf.gz.tbi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/all_fields.vcf b/sgkit/tests/io/vcf/data/all_fields.vcf deleted file mode 100644 index 8c6f351ba..000000000 --- a/sgkit/tests/io/vcf/data/all_fields.vcf +++ /dev/null @@ -1,257 +0,0 @@ -##fileformat=VCFv4.3 -##contig= -##contig= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 -1 1 . G A,C . PASS IB0 . . . -1 2 . A G,G . PASS II1=126 . . . -1 3 . A G,G . PASS . . . . -1 4 . T A,C . PASS II2=459,-140 . . . -1 5 . T A,C . PASS II2=.,-140 . . . -1 6 . T A,C . PASS II2=459,. . . . -1 7 . T A,C . PASS . . . . -1 8 . A A,G . PASS IIA=294,130 . . . -1 9 . A A,G . PASS IIA=.,130 . . . -1 10 . A A,G . PASS IIA=294,. . . . -1 11 . A A,G . PASS . . . . -1 12 . A A,G . PASS IIR=95,724,44 . . . -1 13 . A A,G . PASS IIR=.,724,44 . . . -1 14 . A A,G . PASS IIR=95,.,44 . . . -1 15 . A A,G . PASS IIR=95,724,. . . . -1 16 . A A,G . PASS . . . . -1 17 . G A,G . PASS IID=-879,-534,238,-670,482,-913,396 . . . -1 18 . G A,G . PASS IID=.,-534,238,-670,482,-913,396 . . . -1 19 . G A,G . PASS IID=-879,.,238,-670,482,-913,396 . . . -1 20 . G A,G . PASS IID=-879,-534,.,-670,482,-913,396 . . . -1 21 . G A,G . PASS IID=-879,-534,238,.,482,-913,396 . . . -1 22 . G A,G . PASS IID=-879,-534,238,-670,.,-913,396 . . . -1 23 . G A,G . PASS IID=-879,-534,238,-670,482,.,396 . . . -1 24 . G A,G . PASS IID=-879,-534,238,-670,482,-913,. . . . -1 25 . G A,G . PASS . . . . -1 26 . G A,G . PASS IID=-129,687,-870,685 . . . -1 27 . G A,G . PASS IID=.,687,-870,685 . . . -1 28 . G A,G . PASS IID=-129,.,-870,685 . . . -1 29 . G A,G . PASS IID=-129,687,.,685 . . . -1 30 . G A,G . PASS IID=-129,687,-870,. . . . -1 31 . G A,G . PASS . . . . -1 32 . T A,T . PASS IF1=-887.177 . . . -1 33 . T A,T . PASS . . . . -1 34 . G C,A . PASS IF2=443.998,877.105 . . . -1 35 . G C,A . PASS IF2=.,877.105 . . . -1 36 . G C,A . PASS IF2=443.998,. . . . -1 37 . G C,A . PASS . . . . -1 38 . T C,A . PASS IFA=-998.442,984.423 . . . -1 39 . T C,A . PASS IFA=.,984.423 . . . -1 40 . T C,A . PASS IFA=-998.442,. . . . -1 41 . T C,A . PASS . . . . -1 42 . A T,G . PASS IFR=234.963,223.306,-985.867 . . . -1 43 . A T,G . PASS IFR=.,223.306,-985.867 . . . -1 44 . A T,G . PASS IFR=234.963,.,-985.867 . . . -1 45 . A T,G . PASS IFR=234.963,223.306,. . . . -1 46 . A T,G . PASS . . . . -1 47 . T G,G . PASS IFD=-417.542,223.706,-721.012,-415.711,-267.276,-87.86,570.352,-600.652,28.4689 . . . -1 48 . T G,G . PASS IFD=.,223.706,-721.012,-415.711,-267.276,-87.86,570.352,-600.652,28.4689 . . . -1 49 . T G,G . PASS IFD=-417.542,.,-721.012,-415.711,-267.276,-87.86,570.352,-600.652,28.4689 . . . -1 50 . T G,G . PASS IFD=-417.542,223.706,.,-415.711,-267.276,-87.86,570.352,-600.652,28.4689 . . . -1 51 . T G,G . PASS IFD=-417.542,223.706,-721.012,.,-267.276,-87.86,570.352,-600.652,28.4689 . . . -1 52 . T G,G . PASS IFD=-417.542,223.706,-721.012,-415.711,.,-87.86,570.352,-600.652,28.4689 . . . -1 53 . T G,G . PASS IFD=-417.542,223.706,-721.012,-415.711,-267.276,.,570.352,-600.652,28.4689 . . . -1 54 . T G,G . PASS IFD=-417.542,223.706,-721.012,-415.711,-267.276,-87.86,.,-600.652,28.4689 . . . -1 55 . T G,G . PASS IFD=-417.542,223.706,-721.012,-415.711,-267.276,-87.86,570.352,.,28.4689 . . . -1 56 . T G,G . PASS IFD=-417.542,223.706,-721.012,-415.711,-267.276,-87.86,570.352,-600.652,. . . . -1 57 . T G,G . PASS . . . . -1 58 . T G,G . PASS IFD=-907.099,215.09,-658.952,-869.897,897.771,931.264,616.795,-390.772,-804.656 . . . -1 59 . T G,G . PASS IFD=.,215.09,-658.952,-869.897,897.771,931.264,616.795,-390.772,-804.656 . . . -1 60 . T G,G . PASS IFD=-907.099,.,-658.952,-869.897,897.771,931.264,616.795,-390.772,-804.656 . . . -1 61 . T G,G . PASS IFD=-907.099,215.09,.,-869.897,897.771,931.264,616.795,-390.772,-804.656 . . . -1 62 . T G,G . PASS IFD=-907.099,215.09,-658.952,.,897.771,931.264,616.795,-390.772,-804.656 . . . -1 63 . T G,G . PASS IFD=-907.099,215.09,-658.952,-869.897,.,931.264,616.795,-390.772,-804.656 . . . -1 64 . T G,G . PASS IFD=-907.099,215.09,-658.952,-869.897,897.771,.,616.795,-390.772,-804.656 . . . -1 65 . T G,G . PASS IFD=-907.099,215.09,-658.952,-869.897,897.771,931.264,.,-390.772,-804.656 . . . -1 66 . T G,G . PASS IFD=-907.099,215.09,-658.952,-869.897,897.771,931.264,616.795,.,-804.656 . . . -1 67 . T G,G . PASS IFD=-907.099,215.09,-658.952,-869.897,897.771,931.264,616.795,-390.772,. . . . -1 68 . T G,G . PASS . . . . -1 69 . T C,G . PASS IC1=f . . . -1 70 . T C,G . PASS . . . . -1 71 . G T,G . PASS IC2=e,a . . . -1 72 . G T,G . PASS IC2=.,a . . . -1 73 . G T,G . PASS IC2=e,. . . . -1 74 . G T,G . PASS . . . . -1 75 . A C,A . PASS ICA=b,a . . . -1 76 . A C,A . PASS ICA=.,a . . . -1 77 . A C,A . PASS ICA=b,. . . . -1 78 . A C,A . PASS . . . . -1 79 . C G,C . PASS ICR=c,b,b . . . -1 80 . C G,C . PASS ICR=.,b,b . . . -1 81 . C G,C . PASS ICR=c,.,b . . . -1 82 . C G,C . PASS ICR=c,b,. . . . -1 83 . C G,C . PASS . . . . -1 84 . T G,G . PASS ICD=b,f,b,c . . . -1 85 . T G,G . PASS ICD=.,f,b,c . . . -1 86 . T G,G . PASS ICD=b,.,b,c . . . -1 87 . T G,G . PASS ICD=b,f,.,c . . . -1 88 . T G,G . PASS ICD=b,f,b,. . . . -1 89 . T G,G . PASS . . . . -1 90 . T G,G . PASS ICD=g,e,d,e,f,f,b . . . -1 91 . T G,G . PASS ICD=.,e,d,e,f,f,b . . . -1 92 . T G,G . PASS ICD=g,.,d,e,f,f,b . . . -1 93 . T G,G . PASS ICD=g,e,.,e,f,f,b . . . -1 94 . T G,G . PASS ICD=g,e,d,.,f,f,b . . . -1 95 . T G,G . PASS ICD=g,e,d,e,.,f,b . . . -1 96 . T G,G . PASS ICD=g,e,d,e,f,.,b . . . -1 97 . T G,G . PASS ICD=g,e,d,e,f,f,. . . . -1 98 . T G,G . PASS . . . . -1 99 . A C,C . PASS IS1=bc . . . -1 100 . A C,C . PASS . . . . -1 101 . T T,C . PASS IS2=hij,d . . . -1 102 . T T,C . PASS IS2=.,d . . . -1 103 . T T,C . PASS IS2=hij,. . . . -1 104 . T T,C . PASS . . . . -1 105 . T C,C . PASS ISA=bc,efg . . . -1 106 . T C,C . PASS ISA=.,efg . . . -1 107 . T C,C . PASS ISA=bc,. . . . -1 108 . T C,C . PASS . . . . -1 109 . C G,T . PASS ISR=d,bc,op . . . -1 110 . C G,T . PASS ISR=.,bc,op . . . -1 111 . C G,T . PASS ISR=d,.,op . . . -1 112 . C G,T . PASS ISR=d,bc,. . . . -1 113 . C G,T . PASS . . . . -1 114 . G A,A . PASS ISD=ab,hij,klmn,d,ab,d,op,efg . . . -1 115 . G A,A . PASS ISD=.,hij,klmn,d,ab,d,op,efg . . . -1 116 . G A,A . PASS ISD=ab,.,klmn,d,ab,d,op,efg . . . -1 117 . G A,A . PASS ISD=ab,hij,.,d,ab,d,op,efg . . . -1 118 . G A,A . PASS ISD=ab,hij,klmn,.,ab,d,op,efg . . . -1 119 . G A,A . PASS ISD=ab,hij,klmn,d,.,d,op,efg . . . -1 120 . G A,A . PASS ISD=ab,hij,klmn,d,ab,.,op,efg . . . -1 121 . G A,A . PASS ISD=ab,hij,klmn,d,ab,d,.,efg . . . -1 122 . G A,A . PASS ISD=ab,hij,klmn,d,ab,d,op,. . . . -1 123 . G A,A . PASS . . . . -1 124 . G A,A . PASS ISD=op,op,ab . . . -1 125 . G A,A . PASS ISD=.,op,ab . . . -1 126 . G A,A . PASS ISD=op,.,ab . . . -1 127 . G A,A . PASS ISD=op,op,. . . . -1 128 . G A,A . PASS . . . . -2 129 . G G,G . PASS . FI1 -795 . -2 130 . C G,A . PASS . FI2 104,955 .,955 -2 131 . C G,A . PASS . FI2 104,. . -2 132 . C C,T . PASS . FIA 585,895 .,895 -2 133 . C C,T . PASS . FIA 585,. . -2 134 . T C,G . PASS . FIR 411,25,21 .,25,21 -2 135 . T C,G . PASS . FIR 411,.,21 411,25,. -2 136 . T C,G . PASS . FIR . . -2 137 . A T,T . PASS . FIG 413,-435,129,795,845,500 .,-435,129,795,845,500 -2 138 . A T,T . PASS . FIG 413,.,129,795,845,500 413,-435,.,795,845,500 -2 139 . A T,T . PASS . FIG 413,-435,129,.,845,500 413,-435,129,795,.,500 -2 140 . A T,T . PASS . FIG 413,-435,129,795,845,. . -2 141 . C G,G . PASS . FID -271,579 .,579 -2 142 . C G,G . PASS . FID -271,. . -2 143 . C G,G . PASS . FID -799,981 .,981 -2 144 . C G,G . PASS . FID -799,. . -2 145 . A T,G . PASS . FF1 853.318 . -2 146 . T G,A . PASS . FF2 454.544,-346.918 .,-346.918 -2 147 . T G,A . PASS . FF2 454.544,. . -2 148 . C A,T . PASS . FFA 140.888,41.6685 .,41.6685 -2 149 . C A,T . PASS . FFA 140.888,. . -2 150 . T T,C . PASS . FFR 922.344,689.068,494.64 .,689.068,494.64 -2 151 . T T,C . PASS . FFR 922.344,.,494.64 922.344,689.068,. -2 152 . T T,C . PASS . FFR . . -2 153 . A T,T . PASS . FFG 79.3843,173.502,930.511,214.068,-448.002,-407.453 .,173.502,930.511,214.068,-448.002,-407.453 -2 154 . A T,T . PASS . FFG 79.3843,.,930.511,214.068,-448.002,-407.453 79.3843,173.502,.,214.068,-448.002,-407.453 -2 155 . A T,T . PASS . FFG 79.3843,173.502,930.511,.,-448.002,-407.453 79.3843,173.502,930.511,214.068,.,-407.453 -2 156 . A T,T . PASS . FFG 79.3843,173.502,930.511,214.068,-448.002,. . -2 157 . A C,A . PASS . FFD -968.727 . -2 158 . A C,A . PASS . FFD 544.49,-602.569,-988.956,630.923,413.715,458.014,542.541,-851.911,-283.069 .,-602.569,-988.956,630.923,413.715,458.014,542.541,-851.911,-283.069 -2 159 . A C,A . PASS . FFD 544.49,.,-988.956,630.923,413.715,458.014,542.541,-851.911,-283.069 544.49,-602.569,.,630.923,413.715,458.014,542.541,-851.911,-283.069 -2 160 . A C,A . PASS . FFD 544.49,-602.569,-988.956,.,413.715,458.014,542.541,-851.911,-283.069 544.49,-602.569,-988.956,630.923,.,458.014,542.541,-851.911,-283.069 -2 161 . A C,A . PASS . FFD 544.49,-602.569,-988.956,630.923,413.715,.,542.541,-851.911,-283.069 544.49,-602.569,-988.956,630.923,413.715,458.014,.,-851.911,-283.069 -2 162 . A C,A . PASS . FFD 544.49,-602.569,-988.956,630.923,413.715,458.014,542.541,.,-283.069 544.49,-602.569,-988.956,630.923,413.715,458.014,542.541,-851.911,. -2 163 . A C,A . PASS . FFD . . -2 164 . T T,A . PASS . FC1 d . -2 165 . G C,T . PASS . FC2 c,b .,b -2 166 . G C,T . PASS . FC2 c,. . -2 167 . G G,A . PASS . FCA c,g .,g -2 168 . G G,A . PASS . FCA c,. . -2 169 . G C,G . PASS . FCR a,b,c .,b,c -2 170 . G C,G . PASS . FCR a,.,c a,b,. -2 171 . G C,G . PASS . FCR . . -2 172 . G A,A . PASS . FCG a,e,b,g,g,a .,e,b,g,g,a -2 173 . G A,A . PASS . FCG a,.,b,g,g,a a,e,.,g,g,a -2 174 . G A,A . PASS . FCG a,e,b,.,g,a a,e,b,g,.,a -2 175 . G A,A . PASS . FCG a,e,b,g,g,. . -2 176 . A G,A . PASS . FCD a,g,d,d,f,f,b,a,d .,g,d,d,f,f,b,a,d -2 177 . A G,A . PASS . FCD a,.,d,d,f,f,b,a,d a,g,.,d,f,f,b,a,d -2 178 . A G,A . PASS . FCD a,g,d,.,f,f,b,a,d a,g,d,d,.,f,b,a,d -2 179 . A G,A . PASS . FCD a,g,d,d,f,.,b,a,d a,g,d,d,f,f,.,a,d -2 180 . A G,A . PASS . FCD a,g,d,d,f,f,b,.,d a,g,d,d,f,f,b,a,. -2 181 . A G,A . PASS . FCD . c,d,f,e,g,a,c -2 182 . A G,A . PASS . FCD .,d,f,e,g,a,c c,.,f,e,g,a,c -2 183 . A G,A . PASS . FCD c,d,.,e,g,a,c c,d,f,.,g,a,c -2 184 . A G,A . PASS . FCD c,d,f,e,.,a,c c,d,f,e,g,.,c -2 185 . A G,A . PASS . FCD c,d,f,e,g,a,. . -2 186 . C T,A . PASS . FS1 bc . -2 187 . C C,C . PASS . FS2 bc,op .,op -2 188 . C C,C . PASS . FS2 bc,. . -2 189 . C T,G . PASS . FSA ab,op .,op -2 190 . C T,G . PASS . FSA ab,. . -2 191 . T T,T . PASS . FSR klmn,bc,efg .,bc,efg -2 192 . T T,T . PASS . FSR klmn,.,efg klmn,bc,. -2 193 . T T,T . PASS . FSR . . -2 194 . A C,A . PASS . FSG d,op,bc,klmn,efg,d .,op,bc,klmn,efg,d -2 195 . A C,A . PASS . FSG d,.,bc,klmn,efg,d d,op,.,klmn,efg,d -2 196 . A C,A . PASS . FSG d,op,bc,.,efg,d d,op,bc,klmn,.,d -2 197 . A C,A . PASS . FSG d,op,bc,klmn,efg,. . -2 198 . T T,G . PASS . FSD klmn,bc,d,op,hij,efg,klmn,ab,hij .,bc,d,op,hij,efg,klmn,ab,hij -2 199 . T T,G . PASS . FSD klmn,.,d,op,hij,efg,klmn,ab,hij klmn,bc,.,op,hij,efg,klmn,ab,hij -2 200 . T T,G . PASS . FSD klmn,bc,d,.,hij,efg,klmn,ab,hij klmn,bc,d,op,.,efg,klmn,ab,hij -2 201 . T T,G . PASS . FSD klmn,bc,d,op,hij,.,klmn,ab,hij klmn,bc,d,op,hij,efg,.,ab,hij -2 202 . T T,G . PASS . FSD klmn,bc,d,op,hij,efg,klmn,.,hij klmn,bc,d,op,hij,efg,klmn,ab,. -2 203 . T T,G . PASS . FSD . efg,klmn,bc,op,ab,bc,hij,hij -2 204 . T T,G . PASS . FSD .,klmn,bc,op,ab,bc,hij,hij efg,.,bc,op,ab,bc,hij,hij -2 205 . T T,G . PASS . FSD efg,klmn,.,op,ab,bc,hij,hij efg,klmn,bc,.,ab,bc,hij,hij -2 206 . T T,G . PASS . FSD efg,klmn,bc,op,.,bc,hij,hij efg,klmn,bc,op,ab,.,hij,hij -2 207 . T T,G . PASS . FSD efg,klmn,bc,op,ab,bc,.,hij efg,klmn,bc,op,ab,bc,hij,. -2 208 . T T,G . PASS . FSD . . diff --git a/sgkit/tests/io/vcf/data/allele_overflow.vcf b/sgkit/tests/io/vcf/data/allele_overflow.vcf deleted file mode 100644 index a73771b7e..000000000 --- a/sgkit/tests/io/vcf/data/allele_overflow.vcf +++ /dev/null @@ -1,13 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##fileDate=20201009 -##source=. -##reference=./simple.fasta -##contig= -##contig= -##contig= -##INFO= -##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2 SAMPLE3 -CHR1 10 . AAAAAAAA AAAAAAAC,AAAAAAAG,AAAAAAAT,AAAAAACC,AAAAAACG,AAAAAACT,AAAAAAGG,AAAAAAGT,AAAAAATT,AAAAACCC,AAAAACCG,AAAAACCT,AAAAACGG,AAAAACGT,AAAAACTT,AAAAAGGG,AAAAAGGT,AAAAAGTT,AAAAATTT,AAAACCCC,AAAACCCG,AAAACCCT,AAAACCGG,AAAACCGT,AAAACCTT,AAAACGGG,AAAACGGT,AAAACGTT,AAAACTTT,AAAAGGGG,AAAAGGGT,AAAAGGTT,AAAAGTTT,AAAATTTT,AAACCCCC,AAACCCCG,AAACCCCT,AAACCCGG,AAACCCGT,AAACCCTT,AAACCGGG,AAACCGGT,AAACCGTT,AAACCTTT,AAACGGGG,AAACGGGT,AAACGGTT,AAACGTTT,AAACTTTT,AAAGGGGG,AAAGGGGT,AAAGGGTT,AAAGGTTT,AAAGTTTT,AAATTTTT,AACCCCCC,AACCCCCG,AACCCCCT,AACCCCGG,AACCCCGT,AACCCCTT,AACCCGGG,AACCCGGT,AACCCGTT,AACCCTTT,AACCGGGG,AACCGGGT,AACCGGTT,AACCGTTT,AACCTTTT,AACGGGGG,AACGGGGT,AACGGGTT,AACGGTTT,AACGTTTT,AACTTTTT,AAGGGGGG,AAGGGGGT,AAGGGGTT,AAGGGTTT,AAGGTTTT,AAGTTTTT,AATTTTTT,ACCCCCCC,ACCCCCCG,ACCCCCCT,ACCCCCGG,ACCCCCGT,ACCCCCTT,ACCCCGGG,ACCCCGGT,ACCCCGTT,ACCCCTTT,ACCCGGGG,ACCCGGGT,ACCCGGTT,ACCCGTTT,ACCCTTTT,ACCGGGGG,ACCGGGGT,ACCGGGTT,ACCGGTTT,ACCGTTTT,ACCTTTTT,ACGGGGGG,ACGGGGGT,ACGGGGTT,ACGGGTTT,ACGGTTTT,ACGTTTTT,ACTTTTTT,AGGGGGGG,AGGGGGGT,AGGGGGTT,AGGGGTTT,AGGGTTTT,AGGTTTTT,AGTTTTTT,ATTTTTTT,CCCCCCCC,CCCCCCCG,CCCCCCCT,CCCCCCGG,CCCCCCGT,CCCCCCTT,CCCCCGGG,CCCCCGGT,CCCCCGTT,CCCCCTTT,CCCCGGGG,CCCCGGGT,CCCCGGTT,CCCCGTTT,CCCCTTTT,CCCGGGGG,CCCGGGGT,CCCGGGTT,CCCGGTTT,CCCGTTTT,CCCTTTTT,CCGGGGGG,CCGGGGGT,CCGGGGTT,CCGGGTTT,CCGGTTTT,CCGTTTTT,CCTTTTTT,CGGGGGGG,CGGGGGGT,CGGGGGTT,CGGGGTTT,CGGGTTTT,CGGTTTTT,CGTTTTTT,CTTTTTTT,GGGGGGGG,GGGGGGGT,GGGGGGTT,GGGGGTTT,GGGGTTTT,GGGTTTTT,GGTTTTTT,GTTTTTTT,TTTTTTTT 60 PASS NS=3 GT 0/57 100/127 100/145 -CHR2 2 . A T 60 PASS NS=3 GT 0/0 0/0 0/1 \ No newline at end of file diff --git a/sgkit/tests/io/vcf/data/allele_overflow.vcf.gz b/sgkit/tests/io/vcf/data/allele_overflow.vcf.gz deleted file mode 100644 index 7bbaad659..000000000 Binary files a/sgkit/tests/io/vcf/data/allele_overflow.vcf.gz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/allele_overflow.vcf.gz.tbi b/sgkit/tests/io/vcf/data/allele_overflow.vcf.gz.tbi deleted file mode 100644 index 0b3fc7502..000000000 Binary files a/sgkit/tests/io/vcf/data/allele_overflow.vcf.gz.tbi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/info_name_clash.vcf b/sgkit/tests/io/vcf/data/info_name_clash.vcf deleted file mode 100755 index fe2e3d678..000000000 --- a/sgkit/tests/io/vcf/data/info_name_clash.vcf +++ /dev/null @@ -1,15 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##fileDate=20201009 -##source=. -##reference=./simple.fasta -##contig= -##contig= -##contig= -##INFO= -##INFO= -##INFO= -##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2 SAMPLE3 -CHR1 2 . A T 60 PASS NS=3;AC=3 GT 0/0 0/0 0/0 -CHR1 7 . A C 60 PASS NS=3;AC=4 GT 0/0 0/1 0/1 diff --git a/sgkit/tests/io/vcf/data/mixed.vcf b/sgkit/tests/io/vcf/data/mixed.vcf deleted file mode 100755 index 91647abad..000000000 --- a/sgkit/tests/io/vcf/data/mixed.vcf +++ /dev/null @@ -1,14 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##fileDate=20201009 -##source=. -##reference=./simple.fasta -##contig= -##contig= -##contig= -##INFO= -##INFO= -##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2 SAMPLE3 -CHR1 2 . A T 60 PASS NS=3;AC=3 GT 0/0/1/1 0/0 0/0/0/1 -CHR1 7 . A C 60 PASS NS=3;AC=4 GT 0/0/1/1 0/1 0/1/./. diff --git a/sgkit/tests/io/vcf/data/mixed.vcf.gz b/sgkit/tests/io/vcf/data/mixed.vcf.gz deleted file mode 100644 index e853b7107..000000000 Binary files a/sgkit/tests/io/vcf/data/mixed.vcf.gz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/mixed.vcf.gz.tbi b/sgkit/tests/io/vcf/data/mixed.vcf.gz.tbi deleted file mode 100644 index ff5ea0f22..000000000 Binary files a/sgkit/tests/io/vcf/data/mixed.vcf.gz.tbi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/no_filter_defined.vcf b/sgkit/tests/io/vcf/data/no_filter_defined.vcf deleted file mode 100644 index 7e0332b92..000000000 --- a/sgkit/tests/io/vcf/data/no_filter_defined.vcf +++ /dev/null @@ -1,14 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##fileDate=20201009 -##source=. -##reference=./simple.fasta -##contig= -##contig= -##contig= -##INFO= -##INFO= -##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2 SAMPLE3 -CHR1 2 . A T 60 PASS NS=3;AC=3 GT 0/0 0/0 0/0 -CHR1 7 . A C 60 FAIL NS=3;AC=4 GT 0/0 0/1 0/1 diff --git a/sgkit/tests/io/vcf/data/no_genotypes.vcf b/sgkit/tests/io/vcf/data/no_genotypes.vcf deleted file mode 100644 index 6204d495d..000000000 --- a/sgkit/tests/io/vcf/data/no_genotypes.vcf +++ /dev/null @@ -1,112 +0,0 @@ -##fileformat=VCFv4.1 -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -#CHROM POS ID REF ALT QUAL FILTER INFO -1 100 a G A 232.46 PASS . -1 199 b GG G 232.46 PASS . -1 200 c G A 232.46 PASS . -1 203 d GGGG G 232.46 PASS . -1 280 e G A 232.46 PASS . -1 284 f GGG G 232.46 PASS . -1 285 g G A 232.46 PASS . -1 286 h G A 232.46 PASS . -1 999 i G A 232.46 PASS . -1 1000 j G A 232.46 PASS . -1 1000 k GGGG G 232.46 PASS . -1 1076 l G A 232.46 PASS . -1 1150 m G A 232.46 PASS . -1 1176 n G A 232.46 PASS . -2 200 o G A 232.46 PASS . -2 525 p G A 232.46 PASS . -2 548 q GGG G 232.46 PASS . -2 640 r G A 232.46 PASS . -2 700 s G A 232.46 PASS . -3 1 t G A 232.46 PASS . -3 300 u G A 232.46 PASS . -3 300 v GGGG G 232.46 PASS . -3 400 w G A 232.46 PASS . -4 600 x G A 232.46 PASS . -4 775 y G A 232.46 PASS . -4 776 z GGGG G 232.46 PASS . diff --git a/sgkit/tests/io/vcf/data/no_genotypes_with_gt_header.vcf b/sgkit/tests/io/vcf/data/no_genotypes_with_gt_header.vcf deleted file mode 100644 index 0cb1b7027..000000000 --- a/sgkit/tests/io/vcf/data/no_genotypes_with_gt_header.vcf +++ /dev/null @@ -1,113 +0,0 @@ -##fileformat=VCFv4.1 -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO -1 100 a G A 232.46 PASS . -1 199 b GG G 232.46 PASS . -1 200 c G A 232.46 PASS . -1 203 d GGGG G 232.46 PASS . -1 280 e G A 232.46 PASS . -1 284 f GGG G 232.46 PASS . -1 285 g G A 232.46 PASS . -1 286 h G A 232.46 PASS . -1 999 i G A 232.46 PASS . -1 1000 j G A 232.46 PASS . -1 1000 k GGGG G 232.46 PASS . -1 1076 l G A 232.46 PASS . -1 1150 m G A 232.46 PASS . -1 1176 n G A 232.46 PASS . -2 200 o G A 232.46 PASS . -2 525 p G A 232.46 PASS . -2 548 q GGG G 232.46 PASS . -2 640 r G A 232.46 PASS . -2 700 s G A 232.46 PASS . -3 1 t G A 232.46 PASS . -3 300 u G A 232.46 PASS . -3 300 v GGGG G 232.46 PASS . -3 400 w G A 232.46 PASS . -4 600 x G A 232.46 PASS . -4 775 y G A 232.46 PASS . -4 776 z GGGG G 232.46 PASS . diff --git a/sgkit/tests/io/vcf/data/no_samples.vcf.gz b/sgkit/tests/io/vcf/data/no_samples.vcf.gz deleted file mode 100644 index 03fef8234..000000000 Binary files a/sgkit/tests/io/vcf/data/no_samples.vcf.gz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/no_samples.vcf.gz.tbi b/sgkit/tests/io/vcf/data/no_samples.vcf.gz.tbi deleted file mode 100644 index dc0a19c3d..000000000 Binary files a/sgkit/tests/io/vcf/data/no_samples.vcf.gz.tbi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/sample.vcf b/sgkit/tests/io/vcf/data/sample.vcf deleted file mode 100644 index 51eeedcca..000000000 --- a/sgkit/tests/io/vcf/data/sample.vcf +++ /dev/null @@ -1,31 +0,0 @@ -##fileformat=VCFv4.0 -##fileDate=20090805 -##source=myImputationProgramV3.1 -##reference=1000GenomesPilot-NCBI36 -##phasing=partial -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FILTER= -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##ALT= -##ALT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -19 111 . A C 9.6 . . GT:HQ 0|0:10,15 0|0:10,10 0/1:3,3 -19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 -20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. -20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. -20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. -20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. -20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 ./.:40:3 -20 1235237 . T . . . . GT 0/0 0|0 ./. -X 10 rsTest AC A,ATG,C 10 PASS . GT 0 0/1 0|2 diff --git a/sgkit/tests/io/vcf/data/sample_multiple_filters.vcf b/sgkit/tests/io/vcf/data/sample_multiple_filters.vcf deleted file mode 100644 index 52fc2a76e..000000000 --- a/sgkit/tests/io/vcf/data/sample_multiple_filters.vcf +++ /dev/null @@ -1,31 +0,0 @@ -##fileformat=VCFv4.0 -##fileDate=20090805 -##source=myImputationProgramV3.1 -##reference=1000GenomesPilot-NCBI36 -##phasing=partial -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FILTER= -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##ALT= -##ALT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -19 111 . A C 9.6 . . GT:HQ 0|0:10,15 0|0:10,10 0/1:3,3 -19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 -20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. -20 17330 . T A 3 q10;s50 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. -20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. -20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. -20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 ./.:40:3 -20 1235237 . T . . . . GT 0/0 0|0 ./. -X 10 rsTest AC A,ATG,C 10 PASS . GT 0 0/1 0|2 diff --git a/sgkit/tests/io/vcf/data/sample_multiple_filters.vcf.gz b/sgkit/tests/io/vcf/data/sample_multiple_filters.vcf.gz deleted file mode 100644 index bd5529b61..000000000 Binary files a/sgkit/tests/io/vcf/data/sample_multiple_filters.vcf.gz and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/sample_multiple_filters.vcf.gz.tbi b/sgkit/tests/io/vcf/data/sample_multiple_filters.vcf.gz.tbi deleted file mode 100644 index 020130d5f..000000000 Binary files a/sgkit/tests/io/vcf/data/sample_multiple_filters.vcf.gz.tbi and /dev/null differ diff --git a/sgkit/tests/io/vcf/data/simple.output.mixed_depth.likelihoods.vcf b/sgkit/tests/io/vcf/data/simple.output.mixed_depth.likelihoods.vcf deleted file mode 100644 index 116580314..000000000 --- a/sgkit/tests/io/vcf/data/simple.output.mixed_depth.likelihoods.vcf +++ /dev/null @@ -1,35 +0,0 @@ -##fileformat=VCFv4.3 -##fileDate=20210420 -##source=mchap v0.4.2 -##phasing=None -##commandline="mchap assemble --bam simple.sample1.bam simple.sample2.deep.bam simple.sample3.bam --ploidy 4 --targets simple.bed.gz --variants simple.vcf.gz --reference simple.fasta --mcmc-steps 500 --mcmc-burn 100 --mcmc-seed 11 --genotype-likelihoods" -##randomseed=11 -##contig= -##contig= -##contig= -##FILTER= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2 SAMPLE3 -CHR1 6 CHR1_05_25 AAAAAAAAAAAAAAAAAAAA AAAAAAAAAAGAAAAAATAA,ACAAAAAAAAGAAAAAACAA . . AN=3;AC=3,2;NS=3;DP=159;RCOUNT=240;END=25;NVAR=3;SNVPOS=2,11,18 GT:GQ:PHQ:DP:RCOUNT:RCALLS:MEC:KMERCOV:GPM:PHPM:MCI:GL 0/0/1/2:12:60:13:20:40:0:1,1,1:0.941:1:0:-98.588,-39.531,-38.582,-40.008,-120.497,-38.327,-9.031,-10.536,-91.769,-37.98,-10.536,-91.848,-39.758,-93.677,-142.405 0/0/1/1:60:60:133:200:400:0:1,1,1:1:1:0:-876.34,-72.701,-60.208,-72.701,-876.34,-693.832,-90.31,-90.311,-893.832,-699.4,-120.413,-918.484,-722.459,-960.628,-1752.679 0/0/0/2:10:22:13:20:40:0:1,1,1:0.896:0.994:0:-54.771,-37.145,-38.582,-42.394,-164.314,-4.885,-7.526,-12.041,-135.211,-6.021,-10.536,-134.762,-9.656,-135.688,-164.314 -CHR1 31 CHR1_30_50 AAAAAAAAAAAAAAAAAAAA . . . AN=1;AC=.;NS=3;DP=.;RCOUNT=288;END=50;NVAR=0;SNVPOS=. GT:GQ:PHQ:DP:RCOUNT:RCALLS:MEC:KMERCOV:GPM:PHPM:MCI:GL 0/0/0/0:60:60:.:24:0:0:.,.,.:1:1:0:0 0/0/0/0:60:60:.:240:0:0:.,.,.:1:1:0:0 0/0/0/0:60:60:.:24:0:0:.,.,.:1:1:0:0 -CHR2 11 CHR2_10_30 AAAAAAAAAAAAAAAAAAAA AAAAAAAAAGAAAAAAAAAA,AAAAAAAAATAAAAAAAAAA,AAAATAAAAGAAAAAAAAAA . . AN=4;AC=3,2,1;NS=3;DP=168;RCOUNT=288;END=30;NVAR=2;SNVPOS=5,10 GT:GQ:PHQ:DP:RCOUNT:RCALLS:MEC:KMERCOV:GPM:PHPM:MCI:GL 0/0/0/2:7:14:14:24:28:0:1,1,.:0.812:0.962:0:-21.909,-23.408,-25.521,-29.133,-87.634,-3.908,-6.021,-9.633,-68.134,-4.817,-8.429,-66.93,-7.725,-66.225,-65.726,-23.783,-25.896,-29.508,-89.133,-6.021,-9.633,-69.258,-8.429,-68.054,-67.35,-26.424,-30.036,-91.246,-9.633,-70.843,-69.639,-30.939,-94.859,-73.552,-153.359 0/0/1/2:60:60:140:240:280:0:1,1,.:1:1:0:-438.171,-253.163,-255.209,-272.248,-657.255,-253.163,-72.248,-84.289,-467.25,-255.209,-84.289,-462.253,-272.248,-467.25,-657.255,-403.163,-267.988,-281.279,-672.248,-218.5,-93.32,-480.029,-230.541,-478.78,-620.998,-419.523,-295.592,-693.379,-239.572,-499.626,-635.597,-450.875,-729.502,-663.939,-1314.509 0/1/1/3:8:21:14:24:28:0:1,1,.:0.828:0.992:0:-82.157,-24.156,-21.248,-20.339,-38.34,-82.657,-24.86,-22.452,-39.839,-83.361,-26.064,-41.952,-84.565,-45.565,-104.065,-36.975,-7.526,-6.396,-24.839,-37.679,-8.73,-26.271,-38.883,-28.605,-58.758,-35.873,-7.299,-25.521,-37.077,-27.702,-57.481,-36.021,-27.702,-57.327,-71.203 -CHR3 21 CHR3_20_40 AAAAAAAAAAAAAAAAAAAA . . . AN=1;AC=.;NS=3;DP=.;RCOUNT=0;END=40;NVAR=0;SNVPOS=. GT:GQ:PHQ:DP:RCOUNT:RCALLS:MEC:KMERCOV:GPM:PHPM:MCI:GL 0/0/0/0:60:60:.:0:0:0:.,.,.:1:1:0:0 0/0/0/0:60:60:.:0:0:0:.,.,.:1:1:0:0 0/0/0/0:60:60:.:0:0:0:.,.,.:1:1:0:0 diff --git a/sgkit/tests/io/vcf/hypothesis_vcf.py b/sgkit/tests/io/vcf/hypothesis_vcf.py deleted file mode 100644 index 733c1fbce..000000000 --- a/sgkit/tests/io/vcf/hypothesis_vcf.py +++ /dev/null @@ -1,440 +0,0 @@ -import io -import string -from dataclasses import dataclass -from math import comb -from typing import Any, List, Optional, Union - -from hypothesis.strategies import ( - booleans, - builds, - composite, - floats, - from_regex, - integers, - just, - lists, - none, - one_of, - sampled_from, - text, -) - -import sgkit as sg -from sgkit.io.utils import str_is_int - -ALPHANUMERIC = string.ascii_lowercase + string.ascii_uppercase + string.digits - - -@dataclass(frozen=True) -class Field: - category: str - vcf_key: str - vcf_type: str - vcf_number: str - - def get_header(self): - return ( - f"##{self.category}=<" - f"ID={self.vcf_key}," - f"Type={self.vcf_type}," - f"Number={self.vcf_number}," - f'Description="{self.category},Type={self.vcf_type},Number={self.vcf_number}">' - ) - - -# references to the VCF spec are for https://samtools.github.io/hts-specs/VCFv4.3.pdf - -# [Table 1: Reserved INFO keys] -RESERVED_INFO_KEYS = [ - "AA", - "AC", - "AD", - "ADF", - "ADR", - "AF", - "AN", - "BQ", - "CIGAR", - "DB", - "DP", - "END", - "H2", - "H3", - "MQ", - "MQ0", - "NS", - "SB", - "SOMATIC", - "VALIDATED", - "1000G", - "id", # conflicts with 'variant_id' variable; see RESERVED_VARIABLE_NAMES in vcf_reader.py -] - -# [Table 2: Reserved genotype keys] -RESERVED_FORMAT_KEYS = [ - "AD", - "ADF", - "ADR", - "DP", - "EC", - "FT", - "GL", - "GP", - "GQ", - "GT", - "HQ", - "MQ", - "PL", - "PP", - "PQ", - "PS", -] - -# [1.4.2 Information field format] -# [1.4.4 Individual format field format] - - -def vcf_field_keys(category): - # exclude reserved keys because generated type and number may not match spec - # [1.6.1 Fixed fields] - field_key_regex = r"[A-Za-z_][0-9A-Za-z_.]" - - def is_reserved_key(key): - return (category == "INFO" and key in RESERVED_INFO_KEYS) or ( - category == "FORMAT" and key in RESERVED_FORMAT_KEYS - ) - - return from_regex(field_key_regex, fullmatch=True).filter( - lambda key: not is_reserved_key(key) - ) - - -def vcf_types(category): - if category == "INFO": - return sampled_from(["Integer", "Float", "Flag", "Character", "String"]) - elif category == "FORMAT": - # format fields can't have flag type - return sampled_from(["Integer", "Float", "Character", "String"]) - raise ValueError(f"Category '{category}' is not supported.") - - -def vcf_numbers(category, max_number): - if category == "INFO": - # info fields can't have number G - return one_of(integers(0, max_number).map(str), sampled_from(["A", "R", "."])) - elif category == "FORMAT": - # format fields can't have number 0 (flag type) - return one_of( - integers(1, max_number).map(str), sampled_from(["A", "R", "G", "."]) - ) - raise ValueError(f"Category '{category}' is not supported.") - - -def vcf_fields(category, max_number): - # info flag fields must have number 0 - # non-flag fields can't have number 0 - return builds( - Field, - category=just(category), - vcf_key=vcf_field_keys(category), - vcf_type=vcf_types(category), - vcf_number=vcf_numbers(category, max_number), - ).filter( - lambda field: (field.vcf_type == "Flag" and field.vcf_number == "0") - or (field.vcf_type != "Flag" and field.vcf_number != "0") - ) - - -# [1.6.1 Fixed fields] - - -def contigs(): - # [1.4.7 Contig field format] - contig_regex = r"[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*" - return from_regex(contig_regex, fullmatch=True) - - -def positions(): - # note that 0 is a valid POS value - return integers(0, 2**31 - 1) - - -def ids(): - # currently restricted to alphanumeric, although the spec doesn't have that limitation - return one_of(none(), text(alphabet=ALPHANUMERIC, min_size=1)) - - -def bases(): - return text("ACGTN", min_size=1) - - -def qualities(): - return one_of( - none(), - floats( - min_value=0.0, - exclude_min=True, - allow_nan=False, - allow_infinity=False, - width=32, - ), - ) - - -@composite -def vcf_values(draw, field, *, max_number, alt_alleles, ploidy): - # [1.3 Data types] - if field.vcf_type == "Integer": - # some integer values at lower end of range are not allowed - values = integers(-(2**31) + 8, 2**31 - 1) - elif field.vcf_type == "Float": - # in general inf and nan are allowed - values = floats(width=32) - elif field.vcf_type == "Flag": - # note this returns a bool not a list - return draw(booleans()) - elif field.vcf_type == "Character": - # currently restricted to alphanumeric - values = text(alphabet=ALPHANUMERIC, min_size=1, max_size=1) - elif field.vcf_type == "String": - # currently restricted to alphanumeric - values = text(alphabet=ALPHANUMERIC, min_size=1) - else: - raise ValueError(f"Type '{field.vcf_type}' is not supported.") - - number = draw( - vcf_number_to_ints( - field.vcf_number, - max_number=max_number, - alt_alleles=alt_alleles, - ploidy=ploidy, - ) - ) - return draw(lists(one_of(values, none()), min_size=number, max_size=number)) - - -def vcf_number_to_ints(vcf_number, *, max_number, alt_alleles, ploidy): - # [1.4.2 Information field format] - if vcf_number == ".": - return integers(1, max_number) - elif str_is_int(vcf_number): - return just(int(vcf_number)) - elif vcf_number == "A": - return just(alt_alleles) - elif vcf_number == "R": - return just(alt_alleles + 1) - elif vcf_number == "G": - n_alleles = alt_alleles + 1 - return just(comb(n_alleles + ploidy - 1, ploidy)) - raise ValueError(f"Number '{vcf_number}' is not supported.") - - -@composite -def vcf( - draw, - *, - max_alt_alleles=3, - max_info_fields=2, - max_format_fields=2, - max_number=3, - max_samples=2, - max_variants=2, -): - """A Hypothesis strategy to generate a VCF file as a string. - - Parameters - ---------- - max_alt_alleles - The maximum number of alternate alleles for any variant in the generated VCF. - max_info_fields - The maximum number of INFO fields in the generated VCF. - max_format_fields - The maximum number of FORMAT fields in the generated VCF. - max_number - The maximum value of an integral Number entry in an INFO or FORMAT field in the generated VCF. - This is also the maximum number of values generated for a field with Number='.'. - max_samples - The maximum number of samples in the generated VCF. - max_variants - The maximum number of variants in the generated VCF. - - Returns - ------- - A Hypothesis strategy to generate a VCF file, including header, as a string. - """ - info_fields = draw( - lists( - vcf_fields("INFO", max_number=max_number), - max_size=max_info_fields, - unique_by=lambda f: f.vcf_key, - ) - ) - format_fields = draw( - lists( - vcf_fields("FORMAT", max_number=max_number), - max_size=max_format_fields, - unique_by=lambda f: f.vcf_key, - ) - ) - sample_ids = draw( - lists( - text(alphabet=ALPHANUMERIC, min_size=1), max_size=max_samples, unique=True - ) - ) - variant_ids = draw(lists(ids(), min_size=1, max_size=max_variants, unique=True)) - - contig = draw(contigs()) # currently just a single contig - variant_contigs = [contig] * len(variant_ids) - variant_positions = draw( - lists( - positions(), - min_size=len(variant_ids), - max_size=len(variant_ids), - unique=True, - ) - ) - variant_positions.sort() - - output = io.StringIO() - print( - vcf_header_string([contig], info_fields, format_fields, sample_ids), - end="", - file=output, - ) - - for contig, pos, id in zip(variant_contigs, variant_positions, variant_ids): - ref = draw(bases()) - alt = draw(lists(bases(), max_size=max_alt_alleles)) - qual = draw(qualities()) - filter = None - info = [] - for field in info_fields: - info_values = draw( - vcf_values(field, max_number=max_number, alt_alleles=len(alt), ploidy=2) - ) - if not is_missing(info_values): - if info_values is True: - info.append(field.vcf_key) - else: - text_values = ["." if v is None else str(v) for v in info_values] - info.append(f'{field.vcf_key}={join(",", text_values)}') - format_ = [] - sample_values = [[] for _ in range(len(sample_ids))] - for field in format_fields: - sample_values_for_field = [ - draw( - vcf_values( - field, max_number=max_number, alt_alleles=len(alt), ploidy=2 - ) - ) - for _ in range(len(sample_ids)) - ] - if all(is_missing(v) for v in sample_values_for_field): - continue - format_.append(field.vcf_key) - for sv, sv2 in zip(sample_values_for_field, sample_values): - text_values = ["." if v is None else str(v) for v in sv] - sv2.append(join(",", text_values)) - - variant = vcf_variant_string( - contig, pos, id, ref, alt, qual, filter, info, format_, sample_values - ) - print(str(variant), end="", file=output) - - return output.getvalue() - - -# Formatting - - -def is_missing(val: Union[bool, List[Any]]) -> bool: - if isinstance(val, bool): - return val is False - if len(val) == 0: - return True - return all(v is None for v in val) - - -def join(separator: str, vals: Optional[List[str]]) -> str: - if vals is None or len(vals) == 0: - return "." - res = separator.join(vals) - if len(res) == 0: - return "." - return res - - -def vcf_header_string(contigs, info_fields, format_fields, sample_ids): - output = io.StringIO() - - # [1.4.1 File format] - print("##fileformat=VCFv4.3", file=output) - - # [1.4.3 Filter field format] - print('##FILTER=', file=output) - - print(f"##source=sgkit-vcf-hypothesis-{sg.__version__}", file=output) - - # [1.4.7 Contig field format] - for contig in contigs: - print(f"##contig=", file=output) - - # [1.4.2 Information field format] - for field in info_fields: - print(field.get_header(), file=output) - - # [1.4.4 Individual format field format] - for field in format_fields: - print(field.get_header(), file=output) - - # [1.5 Header line syntax] - print( - "#CHROM", - "POS", - "ID", - "REF", - "ALT", - "QUAL", - "FILTER", - "INFO", - sep="\t", - end="", - file=output, - ) - - if len(sample_ids) > 0: - print(end="\t", file=output) - print("FORMAT", *sample_ids, sep="\t", file=output) - else: - print(file=output) - - return output.getvalue() - - -def vcf_variant_string( - contig, pos, id, ref, alt, qual, filter, info, format_, sample_values -): - output = io.StringIO() - - print( - contig, - pos, - "." if id is None else id, - ref, - join(",", alt), - "." if qual is None else str(qual), - join(";", filter), - join(";", info), - sep="\t", - end="", - file=output, - ) - if len(sample_values) > 0: - print(end="\t", file=output) - format_str = join(":", format_) - sample_strs = [join(":", sv) for sv in sample_values] - print(format_str, *sample_strs, sep="\t", end="\n", file=output) - else: - print(file=output) - - return output.getvalue() diff --git a/sgkit/tests/io/vcf/test_csi.py b/sgkit/tests/io/vcf/test_csi.py deleted file mode 100644 index f4a6a7fdc..000000000 --- a/sgkit/tests/io/vcf/test_csi.py +++ /dev/null @@ -1,42 +0,0 @@ -import pytest -from cyvcf2 import VCF - -from sgkit.io.vcf.csi import read_csi -from sgkit.io.vcf.vcf_partition import get_csi_path -from sgkit.io.vcf.vcf_reader import count_variants - -from .utils import path_for_test - - -@pytest.mark.parametrize( - "vcf_file", - [ - "CEUTrio.20.21.gatk3.4.csi.g.vcf.bgz", - ], -) -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_record_counts_csi(shared_datadir, vcf_file, is_path): - # Check record counts in csi with actual count of VCF - vcf_path = path_for_test(shared_datadir, vcf_file, is_path) - csi_path = get_csi_path(vcf_path) - assert csi_path is not None - csi = read_csi(csi_path) - - for i, contig in enumerate(VCF(vcf_path).seqnames): - assert csi.record_counts[i] == count_variants(vcf_path, contig) - - -@pytest.mark.parametrize( - "file", - ["CEUTrio.20.21.gatk3.4.g.vcf.bgz", "CEUTrio.20.21.gatk3.4.g.vcf.bgz.tbi"], -) -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_read_csi__invalid_csi(shared_datadir, file, is_path): - with pytest.raises(ValueError, match=r"File not in CSI format."): - read_csi(path_for_test(shared_datadir, file, is_path)) diff --git a/sgkit/tests/io/vcf/test_hypothesis_vcf.py b/sgkit/tests/io/vcf/test_hypothesis_vcf.py deleted file mode 100644 index 8aea2f7b3..000000000 --- a/sgkit/tests/io/vcf/test_hypothesis_vcf.py +++ /dev/null @@ -1,82 +0,0 @@ -import pytest -from hypothesis import HealthCheck, given, note, settings -from hypothesis.strategies import data - -from sgkit.io.vcf.vcf_reader import vcf_to_zarr, zarr_array_sizes - -from .hypothesis_vcf import ( - RESERVED_FORMAT_KEYS, - RESERVED_INFO_KEYS, - Field, - vcf, - vcf_field_keys, - vcf_fields, - vcf_values, -) - - -@given(data=data()) -@settings(deadline=None) # avoid problem with numba jit compilation -def test_vcf_field_keys(data): - info_field_key = data.draw(vcf_field_keys("INFO")) - assert info_field_key not in RESERVED_INFO_KEYS - format_field_key = data.draw(vcf_field_keys("FORMAT")) - assert format_field_key not in RESERVED_FORMAT_KEYS - - -@given(data=data()) -@settings(deadline=None) # avoid problem with numba jit compilation -def test_info_fields(data): - field = data.draw(vcf_fields("INFO", max_number=3)) - assert field.category == "INFO" - assert field.vcf_number != "G" - if field.vcf_type == "Flag": - assert field.vcf_number == "0" - else: - assert field.vcf_number != "0" - - -@given(data=data()) -@settings(deadline=None) # avoid problem with numba jit compilation -def test_format_fields(data): - field = data.draw(vcf_fields("FORMAT", max_number=3)) - assert field.category == "FORMAT" - assert field.vcf_type != "Flag" - assert field.vcf_number != "0" - - -@given(data=data()) -@settings(deadline=None) # avoid problem with numba jit compilation -def test_vcf_values(data): - field = Field("INFO", "I1", "Integer", "1") - values = data.draw(vcf_values(field, max_number=3, alt_alleles=1, ploidy=2)) - assert values is not None - assert len(values) == 1 - assert values[0] is None or isinstance(values[0], int) - - -@given(vcf_string=vcf()) -@settings(suppress_health_check=[HealthCheck.function_scoped_fixture], deadline=None) -@pytest.mark.filterwarnings( - "ignore::sgkit.io.vcf.FloatFormatFieldWarning", - "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning", -) -def test_vcf_to_zarr(tmp_path, vcf_string): - # test that we can convert VCFs to Zarr without error - - note(f"vcf:\n{vcf_string}") - - input = tmp_path.joinpath("input.vcf") - output = dict() # in-memory Zarr is guaranteed to be case-sensitive - - with open(input, "w") as f: - f.write(vcf_string) - - kwargs = zarr_array_sizes(input) - vcf_to_zarr( - input, - output, - fields=["INFO/*", "FORMAT/*"], - mixed_ploidy=True, - **kwargs, - ) diff --git a/sgkit/tests/io/vcf/test_tbi.py b/sgkit/tests/io/vcf/test_tbi.py deleted file mode 100644 index 7f56e0e97..000000000 --- a/sgkit/tests/io/vcf/test_tbi.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest - -from sgkit.io.vcf.tbi import read_tabix -from sgkit.io.vcf.vcf_partition import get_tabix_path -from sgkit.io.vcf.vcf_reader import count_variants - -from .utils import path_for_test - - -@pytest.mark.parametrize( - "vcf_file", - [ - "CEUTrio.20.21.gatk3.4.g.vcf.bgz", - ], -) -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_record_counts_tbi(shared_datadir, vcf_file, is_path): - # Check record counts in tabix with actual count of VCF - vcf_path = path_for_test(shared_datadir, vcf_file, is_path) - tabix_path = get_tabix_path(vcf_path) - assert tabix_path is not None - tabix = read_tabix(tabix_path) - - for i, contig in enumerate(tabix.sequence_names): - assert tabix.record_counts[i] == count_variants(vcf_path, contig) - - -@pytest.mark.parametrize( - "file", - ["CEUTrio.20.21.gatk3.4.g.vcf.bgz", "CEUTrio.20.21.gatk3.4.csi.g.vcf.bgz.csi"], -) -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_read_tabix__invalid_tbi(shared_datadir, file, is_path): - with pytest.raises(ValueError, match=r"File not in Tabix format."): - read_tabix(path_for_test(shared_datadir, file, is_path)) diff --git a/sgkit/tests/io/vcf/test_utils.py b/sgkit/tests/io/vcf/test_utils.py deleted file mode 100644 index deb07a1b1..000000000 --- a/sgkit/tests/io/vcf/test_utils.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -import tempfile -from pathlib import Path - -import fsspec -import pytest -from callee.strings import StartsWith - -from sgkit.io.vcf.utils import build_url, chunks, merge_encodings, temporary_directory -from sgkit.io.vcf.vcf_reader import get_region_start - - -def directory_with_file_scheme() -> str: - return f"file://{tempfile.gettempdir()}" - - -def directory_with_missing_parent() -> str: - # create a local temporary directory using Python tempfile - with tempfile.TemporaryDirectory() as dir: - pass - # we know it doesn't exist - assert not Path(dir).exists() - return dir - - -@pytest.mark.parametrize( - "dir", - [None, directory_with_file_scheme(), directory_with_missing_parent()], -) -def test_temporary_directory(dir): - prefix = "prefix-" - suffix = "-suffix" - with temporary_directory(suffix=suffix, prefix=prefix, dir=dir) as tmpdir: - if tmpdir.startswith("file:///"): - tmpdir = tmpdir[7:] - dir = Path(tmpdir) - assert dir.exists() - assert dir.name.startswith(prefix) - assert dir.name.endswith(suffix) - - with open(dir / "file.txt", "w") as file: - file.write("Hello") - - assert not dir.exists() - - -def test_temporary_directory__no_permission(): - # create a local temporary directory using Python tempfile - with tempfile.TemporaryDirectory() as dir: - os.chmod(dir, 0o444) # make it read-only - with pytest.raises(PermissionError): - with temporary_directory(dir=dir): - pass # pragma: no cover - - -def test_non_local_filesystem(mocker): - # mock out fsspec calls - mock = mocker.patch("fsspec.filesystem") - myfs = mocker.MagicMock() - mock.return_value = myfs - - # call function - with temporary_directory( - prefix="mytmp", dir="myfs://path/file", storage_options=dict(a="b") - ): - pass - - # check expected called were made - fsspec.filesystem.assert_called_once_with("myfs", a="b") - myfs.mkdir.assert_called_once_with(StartsWith("myfs://path/file/mytmp")) - myfs.rm.assert_called_once_with( - StartsWith("myfs://path/file/mytmp"), recursive=True - ) - - -def test_build_url(): - assert build_url("http://host/path", "subpath") == "http://host/path/subpath" - assert build_url("http://host/path/", "subpath") == "http://host/path/subpath" - assert ( - build_url("http://host/path?a=b", "subpath") == "http://host/path/subpath?a=b" - ) - assert ( - build_url("http://host/path/?a=b", "subpath") == "http://host/path/subpath?a=b" - ) - assert build_url("http://host/path#a", "subpath") == "http://host/path/subpath#a" - assert build_url("s3://host/path", "subpath") == "s3://host/path/subpath" - assert build_url("relative_path/path", "subpath") == "relative_path/path/subpath" - assert build_url("/absolute_path/path", "subpath") == "/absolute_path/path/subpath" - assert ( - build_url("http://host/a%20path", "subpath") == "http://host/a%20path/subpath" - ) - assert build_url("http://host/a path", "subpath") == "http://host/a%20path/subpath" - - -@pytest.mark.parametrize( - "x,n,expected_values", - [ - (0, 1, [[]]), - (1, 1, [[0]]), - (4, 1, [[0], [1], [2], [3]]), - (4, 2, [[0, 1], [2, 3]]), - (5, 2, [[0, 1], [2, 3], [4]]), - (5, 5, [[0, 1, 2, 3, 4]]), - (5, 6, [[0, 1, 2, 3, 4]]), - ], -) -def test_chunks(x, n, expected_values): - assert [list(i) for i in chunks(iter(range(x)), n)] == expected_values - - -@pytest.mark.parametrize( - "region,expected", - [ - ("region-with`~!@#$%^&*()-_=+various:symbols", 1), - ("region-with`~!@#$%^&*()-_=+various:symbols-and:partial_coordinates:5-", 5), - ("region-with`~!@#$%^&*()-_=+various:symbols-and:coordinates:6-11", 6), - ], -) -def test_get_region_start(region: str, expected: int): - assert get_region_start(region) == expected - - -def test_merge_encodings(): - default_encoding = dict(a=dict(a1=1, a2=2), b=dict(b1=5)) - overrides = dict(a=dict(a1=0, a3=3), c=dict(c1=7)) - assert merge_encodings(default_encoding, overrides) == dict( - a=dict(a1=0, a2=2, a3=3), b=dict(b1=5), c=dict(c1=7) - ) - - assert merge_encodings(default_encoding, {}) == default_encoding - assert merge_encodings({}, overrides) == overrides diff --git a/sgkit/tests/io/vcf/test_vcf_generator.py b/sgkit/tests/io/vcf/test_vcf_generator.py deleted file mode 100644 index 9819485bd..000000000 --- a/sgkit/tests/io/vcf/test_vcf_generator.py +++ /dev/null @@ -1,10 +0,0 @@ -from .vcf_generator import generate_vcf - - -def test_generate_vcf(tmp_path): - out = tmp_path / "all_fields.vcf" - - # uncomment the following to regenerate test file used in other tests - # out = "sgkit/tests/io/vcf/data/all_fields.vcf" - - generate_vcf(out) diff --git a/sgkit/tests/io/vcf/test_vcf_partition.py b/sgkit/tests/io/vcf/test_vcf_partition.py deleted file mode 100644 index 0f4017e4d..000000000 --- a/sgkit/tests/io/vcf/test_vcf_partition.py +++ /dev/null @@ -1,125 +0,0 @@ -import pytest - -from sgkit.io.vcf import partition_into_regions -from sgkit.io.vcf.vcf_reader import count_variants - -from .utils import path_for_test - - -@pytest.mark.parametrize( - "vcf_file", - [ - "CEUTrio.20.21.gatk3.4.g.bcf", - "CEUTrio.20.21.gatk3.4.g.vcf.bgz", - "NA12878.prod.chr20snippet.g.vcf.gz", - ], -) -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_partition_into_regions__num_parts(shared_datadir, vcf_file, is_path): - vcf_path = path_for_test(shared_datadir, vcf_file, is_path) - - regions = partition_into_regions(vcf_path, num_parts=4) - - assert regions is not None - part_variant_counts = [count_variants(vcf_path, region) for region in regions] - total_variants = count_variants(vcf_path) - - assert sum(part_variant_counts) == total_variants - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_partition_into_regions__num_parts_large(shared_datadir, is_path): - vcf_path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - - regions = partition_into_regions(vcf_path, num_parts=100) - assert regions is not None - assert len(regions) == 18 - - part_variant_counts = [count_variants(vcf_path, region) for region in regions] - total_variants = count_variants(vcf_path) - - assert sum(part_variant_counts) == total_variants - - -@pytest.mark.parametrize( - "target_part_size", - [ - 100_000, - "100KB", - "100 kB", - ], -) -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_partition_into_regions__target_part_size( - shared_datadir, is_path, target_part_size -): - vcf_path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - - regions = partition_into_regions(vcf_path, target_part_size=target_part_size) - assert regions is not None - assert len(regions) == 5 - - part_variant_counts = [count_variants(vcf_path, region) for region in regions] - total_variants = count_variants(vcf_path) - - assert sum(part_variant_counts) == total_variants - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_partition_into_regions__invalid_arguments(shared_datadir, is_path): - vcf_path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - - with pytest.raises( - ValueError, match=r"One of num_parts or target_part_size must be specified" - ): - partition_into_regions(vcf_path) - - with pytest.raises( - ValueError, match=r"Only one of num_parts or target_part_size may be specified" - ): - partition_into_regions(vcf_path, num_parts=4, target_part_size=100_000) - - with pytest.raises(ValueError, match=r"num_parts must be positive"): - partition_into_regions(vcf_path, num_parts=0) - - with pytest.raises(ValueError, match=r"target_part_size must be positive"): - partition_into_regions(vcf_path, target_part_size=0) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_partition_into_regions__one_part(shared_datadir, is_path): - vcf_path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - assert partition_into_regions(vcf_path, num_parts=1) is None - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_partition_into_regions__missing_index(shared_datadir, is_path): - vcf_path = path_for_test( - shared_datadir, "CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz", is_path - ) - with pytest.raises(ValueError, match=r"Cannot find .tbi or .csi file."): - partition_into_regions(vcf_path, num_parts=2) - - bogus_index_path = path_for_test( - shared_datadir, "CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz.index", is_path - ) - with pytest.raises(ValueError, match=r"Only .tbi or .csi indexes are supported."): - partition_into_regions(vcf_path, index_path=bogus_index_path, num_parts=2) diff --git a/sgkit/tests/io/vcf/test_vcf_reader.py b/sgkit/tests/io/vcf/test_vcf_reader.py deleted file mode 100644 index 8e342ebc4..000000000 --- a/sgkit/tests/io/vcf/test_vcf_reader.py +++ /dev/null @@ -1,1849 +0,0 @@ -import os -import tempfile -from os import listdir -from os.path import join -from typing import MutableMapping - -import numpy as np -import pytest -import xarray as xr -import zarr -from numcodecs import Blosc, Delta, FixedScaleOffset, PackBits, VLenUTF8 -from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal - -from sgkit import load_dataset, save_dataset -from sgkit.io.utils import ( - FLOAT32_FILL, - FLOAT32_MISSING, - FLOAT32_MISSING_AS_INT32, - INT_FILL, - INT_MISSING, - STR_FILL, - STR_MISSING, -) -from sgkit.io.vcf import ( - MaxAltAllelesExceededWarning, - partition_into_regions, - read_vcf, - vcf_to_zarr, -) -from sgkit.io.vcf.vcf_reader import ( - FloatFormatFieldWarning, - merge_zarr_array_sizes, - zarr_array_sizes, -) -from sgkit.model import get_contigs, get_filters, num_contigs -from sgkit.tests.io.test_dataset import assert_identical - -from .utils import path_for_test - - -@pytest.mark.parametrize( - "read_chunk_length", - [None, 1], -) -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.parametrize("to_zarr", [True, False]) -@pytest.mark.filterwarnings("ignore::xarray.coding.variables.SerializationWarning") -def test_vcf_to_zarr__small_vcf( - shared_datadir, is_path, read_chunk_length, tmp_path, to_zarr -): - path = path_for_test(shared_datadir, "sample.vcf.gz", is_path) - output = tmp_path.joinpath("vcf.zarr").as_posix() - - fields = [ - "INFO/NS", - "INFO/AN", - "INFO/AA", - "INFO/DB", - "INFO/AC", - "INFO/AF", - "FORMAT/GT", - "FORMAT/DP", - "FORMAT/HQ", - ] - field_defs = { - "FORMAT/HQ": {"dimension": "ploidy"}, - "INFO/AF": {"Number": "2", "dimension": "AF"}, - "INFO/AC": {"Number": "2", "dimension": "AC"}, - } - if to_zarr: - vcf_to_zarr( - path, - output, - max_alt_alleles=3, - chunk_length=5, - chunk_width=2, - read_chunk_length=read_chunk_length, - fields=fields, - field_defs=field_defs, - ) - ds = xr.open_zarr(output) - else: - ds = read_vcf( - path, chunk_length=5, chunk_width=2, fields=fields, field_defs=field_defs - ) - - assert_array_equal(ds["filter_id"], ["PASS", "s50", "q10"]) - assert_array_equal( - ds["variant_filter"], - [ - [False, False, False], - [False, False, False], - [True, False, False], - [False, False, True], - [True, False, False], - [True, False, False], - [True, False, False], - [False, False, False], - [True, False, False], - ], - ) - assert_array_equal(ds["contig_id"], ["19", "20", "X"]) - assert "contig_length" not in ds - assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2]) - assert ds["variant_contig"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_position"], - [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10], - ) - assert ds["variant_position"].chunks[0][0] == 5 - - im = INT_MISSING - fm = FLOAT32_MISSING - ff = FLOAT32_FILL - sm = STR_MISSING - sf = STR_FILL - - assert_array_equal( - ds["variant_NS"], - [im, im, 3, 3, 2, 3, 3, im, im], - ) - assert ds["variant_NS"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_AN"], - [im, im, im, im, im, im, 6, im, im], - ) - assert ds["variant_AN"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_AA"], - [ - sm, - sm, - sm, - sm, - "T", - "T", - "G", - sm, - sm, - ], - ) - assert ds["variant_AN"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_DB"], - [ - False, - False, - True, - False, - True, - False, - False, - False, - False, - ], - ) - assert ds["variant_AN"].chunks[0][0] == 5 - - variant_AF = np.array( - [ - [fm, fm], - [fm, fm], - [0.5, ff], - [0.017, ff], - [0.333, 0.667], - [fm, fm], - [fm, fm], - [fm, fm], - [fm, fm], - ], - dtype=np.float32, - ) - values = ds["variant_AF"].values - assert_array_almost_equal(values, variant_AF, 3) - nans = np.isnan(variant_AF) - assert_array_equal(variant_AF.view(np.int32)[nans], values.view(np.int32)[nans]) - assert ds["variant_AF"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_AC"], - [ - [im, im], - [im, im], - [im, im], - [im, im], - [im, im], - [im, im], - [3, 1], - [im, im], - [im, im], - ], - ) - assert ds["variant_AC"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_allele"].values.tolist(), - [ - ["A", "C", sf, sf], - ["A", "G", sf, sf], - ["G", "A", sf, sf], - ["T", "A", sf, sf], - ["A", "G", "T", sf], - ["T", sf, sf, sf], - ["G", "GA", "GAC", sf], - ["T", sf, sf, sf], - ["AC", "A", "ATG", "C"], - ], - ) - assert ds["variant_allele"].chunks[0][0] == 5 - assert ds["variant_allele"].dtype == "O" - assert_array_equal( - ds["variant_id"].values.tolist(), - [sm, sm, "rs6054257", sm, "rs6040355", sm, "microsat1", sm, "rsTest"], - ) - assert ds["variant_id"].chunks[0][0] == 5 - assert ds["variant_id"].dtype == "O" - assert_array_equal( - ds["variant_id_mask"], - [True, True, False, True, False, True, False, True, False], - ) - assert ds["variant_id_mask"].chunks[0][0] == 5 - - assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"]) - assert ds["sample_id"].chunks[0][0] == 2 - - call_genotype = np.array( - [ - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [1, 0], [1, 1]], - [[0, 0], [0, 1], [0, 0]], - [[1, 2], [2, 1], [2, 2]], - [[0, 0], [0, 0], [0, 0]], - [[0, 1], [0, 2], [im, im]], - [[0, 0], [0, 0], [im, im]], - [[0, im], [0, 1], [0, 2]], - ], - dtype="i1", - ) - call_genotype_phased = np.array( - [ - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [False, False, False], - [False, True, False], - [True, False, True], - ], - dtype=bool, - ) - call_DP = [ - [im, im, im], - [im, im, im], - [1, 8, 5], - [3, 5, 3], - [6, 0, 4], - [im, 4, 2], - [4, 2, 3], - [im, im, im], - [im, im, im], - ] - call_HQ = [ - [[10, 15], [10, 10], [3, 3]], - [[10, 10], [10, 10], [3, 3]], - [[51, 51], [51, 51], [im, im]], - [[58, 50], [65, 3], [im, im]], - [[23, 27], [18, 2], [im, im]], - [[56, 60], [51, 51], [im, im]], - [[im, im], [im, im], [im, im]], - [[im, im], [im, im], [im, im]], - [[im, im], [im, im], [im, im]], - ] - - assert_array_equal(ds["call_genotype"], call_genotype) - assert_array_equal(ds["call_genotype_mask"], call_genotype < 0) - assert_array_equal(ds["call_genotype_phased"], call_genotype_phased) - assert_array_equal(ds["call_DP"], call_DP) - assert_array_equal(ds["call_HQ"], call_HQ) - - for name in ["call_genotype", "call_genotype_mask", "call_HQ"]: - assert ds[name].chunks == ((5, 4), (2, 1), (2,)) - - for name in ["call_genotype_phased", "call_DP"]: - assert ds[name].chunks == ((5, 4), (2, 1)) - - # save and load again to test https://github.com/pydata/xarray/issues/3476 - path2 = tmp_path / "ds2.zarr" - if not is_path: - path2 = str(path2) - save_dataset(ds, path2) - assert_identical(ds, load_dataset(path2)) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz", is_path) - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.warns(MaxAltAllelesExceededWarning): - max_alt_alleles = 1 - vcf_to_zarr( - path, output, chunk_length=5, chunk_width=2, max_alt_alleles=max_alt_alleles - ) - ds = xr.open_zarr(output) - - # extra alt alleles are dropped - assert_array_equal( - ds["variant_allele"].values.tolist(), - [ - ["A", "C"], - ["A", "G"], - ["G", "A"], - ["T", "A"], - ["A", "G"], - ["T", ""], - ["G", "GA"], - ["T", ""], - ["AC", "A"], - ], - ) - - # genotype calls are truncated - assert np.all(ds["call_genotype"].values <= max_alt_alleles) - - # the maximum number of alt alleles actually seen is stored as an attribute - assert ds.attrs["max_alt_alleles_seen"] == 3 - - -@pytest.mark.parametrize( - "read_chunk_length", - [None, 1_000], -) -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__large_vcf(shared_datadir, is_path, read_chunk_length, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output, chunk_length=5_000, read_chunk_length=read_chunk_length) - ds = xr.open_zarr(output) - - assert_array_equal(ds["contig_id"], ["20", "21"]) - assert_array_equal(ds["contig_length"], [63025520, 48129895]) - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds["variant_allele"].dtype == "O" - assert ds["variant_id"].dtype == "O" - - # check underlying zarr chunk size is 1 in samples dim - za = zarr.open(output) - assert za["sample_id"].chunks == (1,) - assert za["call_genotype"].chunks == (5000, 1, 2) - - -def test_vcf_to_zarr__plain_vcf_with_no_index(shared_datadir, tmp_path): - path = path_for_test( - shared_datadir, - "mixed.vcf", - ) - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output, truncate_calls=True) - ds = xr.open_zarr(output) - assert ds["sample_id"].shape == (3,) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__mutable_mapping(shared_datadir, is_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output: MutableMapping[str, bytes] = {} - - vcf_to_zarr(path, output, chunk_length=5_000) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds["variant_allele"].dtype == "O" - assert ds["variant_id"].dtype == "O" - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__compressor_and_filters(shared_datadir, is_path, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz", is_path) - output = tmp_path.joinpath("vcf.zarr").as_posix() - - compressor = Blosc("zlib", 1, Blosc.NOSHUFFLE) - variant_id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE) - encoding = dict( - variant_id=dict(compressor=variant_id_compressor), - variant_id_mask=dict(filters=None), - ) - vcf_to_zarr( - path, - output, - chunk_length=5, - chunk_width=2, - compressor=compressor, - encoding=encoding, - ) - - # look at actual Zarr store to check compressor and filters - z = zarr.open(output) - assert z["call_genotype"].compressor == compressor - assert z["call_genotype"].filters is None # sgkit default - assert z["call_genotype"].chunks == (5, 2, 2) - assert z["call_genotype_mask"].compressor == compressor - assert z["call_genotype_mask"].filters == [PackBits()] # sgkit default - assert z["call_genotype_mask"].chunks == (5, 2, 2) - - assert z["variant_id"].compressor == variant_id_compressor - assert z["variant_id"].filters == [VLenUTF8()] # sgkit default - assert z["variant_id"].chunks == (5,) - assert z["variant_id_mask"].compressor == compressor - assert z["variant_id_mask"].filters is None - assert z["variant_id_mask"].chunks == (5,) - - assert z["variant_position"].filters == [ - Delta(dtype="i4", astype="i4") - ] # sgkit default - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__parallel_compressor_and_filters( - shared_datadir, is_path, tmp_path -): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - regions = ["20", "21"] - - compressor = Blosc("zlib", 1, Blosc.NOSHUFFLE) - variant_id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE) - encoding = dict( - variant_id=dict(compressor=variant_id_compressor), - variant_id_mask=dict(filters=None), - ) - vcf_to_zarr( - path, - output, - regions=regions, - chunk_length=5_000, - compressor=compressor, - encoding=encoding, - ) - - # look at actual Zarr store to check compressor and filters - z = zarr.open(output) - assert z["call_genotype"].compressor == compressor - assert z["call_genotype"].filters is None # sgkit default - assert z["call_genotype"].chunks == (5000, 1, 2) - assert z["call_genotype_mask"].compressor == compressor - assert z["call_genotype_mask"].filters == [PackBits()] # sgkit default - assert z["call_genotype_mask"].chunks == (5000, 1, 2) - - assert z["variant_id"].compressor == variant_id_compressor - assert z["variant_id"].filters == [VLenUTF8()] # sgkit default - assert z["variant_id"].chunks == (5000,) - assert z["variant_id_mask"].compressor == compressor - assert z["variant_id_mask"].filters is None - assert z["variant_id_mask"].chunks == (5000,) - - assert z["variant_position"].filters == [ - Delta(dtype="i4", astype="i4") - ] # sgkit default - - -def test_vcf_to_zarr__float_format_field_warning(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "simple.output.mixed_depth.likelihoods.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.warns(FloatFormatFieldWarning): - vcf_to_zarr( - path, - output, - ploidy=4, - max_alt_alleles=3, - fields=["FORMAT/GL"], - ) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.parametrize( - "output_is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__parallel(shared_datadir, is_path, output_is_path, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output = tmp_path.joinpath("vcf_concat.zarr") - if not output_is_path: - output = output.as_posix() - - regions = ["20", "21"] - - vcf_to_zarr( - path, - output, - regions=regions, - chunk_length=5_000, - ) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds["variant_allele"].dtype == "O" - assert ds["variant_id"].dtype == "O" - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_vcf_to_zarr__empty_region(shared_datadir, is_path, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - regions = "23" - - vcf_to_zarr(path, output, regions=regions) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (0, 1, 2) - assert ds["call_genotype_mask"].shape == (0, 1, 2) - assert ds["call_genotype_phased"].shape == (0, 1) - assert ds["variant_allele"].shape == (0, 4) - assert ds["variant_contig"].shape == (0,) - assert ds["variant_id"].shape == (0,) - assert ds["variant_id_mask"].shape == (0,) - assert ds["variant_position"].shape == (0,) - - -@pytest.mark.parametrize( - "is_path", - [False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__parallel_temp_chunk_length(shared_datadir, is_path, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - regions = ["20", "21"] - - # Use a temp_chunk_length that is smaller than chunk_length - # Open the temporary parts to check that they have the right temp chunk length - with tempfile.TemporaryDirectory() as tempdir: - vcf_to_zarr( - path, - output, - regions=regions, - chunk_length=5_000, - temp_chunk_length=2_500, - tempdir=tempdir, - retain_temp_files=True, - ) - inner_temp_dir = join(tempdir, listdir(tempdir)[0]) - parts_dir = join(inner_temp_dir, listdir(inner_temp_dir)[0]) - part = xr.open_zarr(join(parts_dir, "part-0.zarr")) - assert part["call_genotype"].chunks[0][0] == 2_500 - assert part["variant_position"].chunks[0][0] == 2_500 - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype"].chunks[0][0] == 5_000 - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - assert ds["variant_position"].chunks[0][0] == 5_000 - - assert ds["variant_allele"].dtype == "O" - assert ds["variant_id"].dtype == "O" - - -def test_vcf_to_zarr__parallel_temp_chunk_length_not_divisible( - shared_datadir, tmp_path -): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", False) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - regions = ["20", "21"] - - with pytest.raises( - ValueError, - match=r"Temporary chunk length in variant dimension \(4000\) must evenly divide target chunk length 5000", - ): - # Use a temp_chunk_length that does not divide into chunk_length - vcf_to_zarr( - path, output, regions=regions, chunk_length=5_000, temp_chunk_length=4_000 - ) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__parallel_partitioned(shared_datadir, is_path, tmp_path): - path = path_for_test( - shared_datadir, - "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz", - is_path, - ) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - regions = partition_into_regions(path, num_parts=4) - - vcf_to_zarr(path, output, regions=regions, chunk_length=1_000, chunk_width=1_000) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (2535,) - assert ds["variant_id"].shape == (1406,) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__parallel_partitioned_by_size(shared_datadir, is_path, tmp_path): - path = path_for_test( - shared_datadir, - "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz", - is_path, - ) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - vcf_to_zarr( - path, output, target_part_size="4MB", chunk_length=1_000, chunk_width=1_000 - ) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (2535,) - assert ds["variant_id"].shape == (1406,) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), - ] - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - vcf_to_zarr(paths, output, target_part_size=None, chunk_length=5_000) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds.chunks["variants"] == (5000, 5000, 5000, 4910) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__multiple_partitioned(shared_datadir, is_path, tmp_path): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), - ] - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - regions = [partition_into_regions(path, num_parts=2) for path in paths] - - vcf_to_zarr(paths, output, regions=regions, chunk_length=5_000) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds.chunks["variants"] == (5000, 5000, 5000, 4910) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__multiple_partitioned_by_size(shared_datadir, is_path, tmp_path): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), - ] - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - vcf_to_zarr(paths, output, target_part_size="40KB", chunk_length=5_000) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds.chunks["variants"] == (5000, 5000, 5000, 4910) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__mutiple_partitioned_invalid_regions( - shared_datadir, is_path, tmp_path -): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), - ] - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - # invalid regions, should be a sequence of sequences - regions = partition_into_regions(paths[0], num_parts=2) - - with pytest.raises( - ValueError, - match=r"multiple input regions must be a sequence of sequence of strings", - ): - vcf_to_zarr(paths, output, regions=regions, chunk_length=5_000) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__multiple_max_alt_alleles(shared_datadir, is_path, tmp_path): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), - ] - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - with pytest.warns(MaxAltAllelesExceededWarning): - vcf_to_zarr( - paths, - output, - target_part_size="40KB", - chunk_length=5_000, - max_alt_alleles=1, - ) - ds = xr.open_zarr(output) - - # the maximum number of alt alleles actually seen is stored as an attribute - assert ds.attrs["max_alt_alleles_seen"] == 7 - - -@pytest.mark.parametrize( - "max_alt_alleles,dtype,warning", - [ - (2, np.int8, True), - (127, np.int8, True), - (128, np.int16, True), - (145, np.int16, True), - (164, np.int16, False), - ], -) -def test_vcf_to_zarr__call_genotype_dtype( - shared_datadir, tmp_path, max_alt_alleles, dtype, warning -): - path = path_for_test(shared_datadir, "allele_overflow.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - if warning: - with pytest.warns(MaxAltAllelesExceededWarning): - vcf_to_zarr(path, output, max_alt_alleles=max_alt_alleles) - else: - vcf_to_zarr(path, output, max_alt_alleles=max_alt_alleles) - ds = load_dataset(output) - assert ds.call_genotype.dtype == dtype - assert ds.call_genotype.values.max() <= max_alt_alleles - - -@pytest.mark.parametrize( - "ploidy,mixed_ploidy,truncate_calls,regions", - [ - (2, False, True, None), - (4, False, False, None), - (4, False, False, ["CHR1:0-5", "CHR1:5-10"]), - (4, True, False, None), - (4, True, False, ["CHR1:0-5", "CHR1:5-10"]), - (5, True, False, None), - ], -) -def test_vcf_to_zarr__mixed_ploidy_vcf( - shared_datadir, tmp_path, ploidy, mixed_ploidy, truncate_calls, regions -): - path = path_for_test(shared_datadir, "mixed.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr( - path, - output, - regions=regions, - chunk_length=5, - chunk_width=2, - ploidy=ploidy, - mixed_ploidy=mixed_ploidy, - truncate_calls=truncate_calls, - ) - ds = load_dataset(output) - - variant_dtype = "O" - assert_array_equal(ds["contig_id"], ["CHR1", "CHR2", "CHR3"]) - assert_array_equal(ds["variant_contig"], [0, 0]) - assert_array_equal(ds["variant_position"], [2, 7]) - assert_array_equal( - ds["variant_allele"].values.tolist(), - np.array( - [ - ["A", "T", "", ""], - ["A", "C", "", ""], - ], - dtype=variant_dtype, - ), - ) - assert ds["variant_allele"].dtype == variant_dtype # type: ignore[comparison-overlap] - assert_array_equal( - ds["variant_id"], - np.array([".", "."], dtype=variant_dtype), - ) - assert ds["variant_id"].dtype == variant_dtype # type: ignore[comparison-overlap] - assert_array_equal( - ds["variant_id_mask"], - [True, True], - ) - assert_array_equal(ds["sample_id"], ["SAMPLE1", "SAMPLE2", "SAMPLE3"]) - - assert ds["call_genotype"].attrs["mixed_ploidy"] == mixed_ploidy - pad = -2 if mixed_ploidy else -1 # -2 indicates a fill (non-allele) value - call_genotype = np.array( - [ - [[0, 0, 1, 1, pad], [0, 0, pad, pad, pad], [0, 0, 0, 1, pad]], - [[0, 0, 1, 1, pad], [0, 1, pad, pad, pad], [0, 1, -1, -1, pad]], - ], - dtype="i1", - ) - # truncate row vectors if lower ploidy - call_genotype = call_genotype[:, :, 0:ploidy] - - assert_array_equal(ds["call_genotype"], call_genotype) - assert_array_equal(ds["call_genotype_mask"], call_genotype < 0) - if mixed_ploidy: - assert_array_equal(ds["call_genotype_fill"], call_genotype < -1) - - -@pytest.mark.parametrize( - "ploidy,mixed_ploidy,truncate_calls", - [ - (2, False, False), - (3, True, False), - ], -) -def test_vcf_to_zarr__mixed_ploidy_vcf_exception( - shared_datadir, tmp_path, ploidy, mixed_ploidy, truncate_calls -): - path = path_for_test(shared_datadir, "mixed.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.raises(ValueError) as excinfo: - vcf_to_zarr( - path, - output, - ploidy=ploidy, - mixed_ploidy=mixed_ploidy, - truncate_calls=truncate_calls, - ) - assert "Genotype call longer than ploidy." == str(excinfo.value) - - -def test_vcf_to_zarr__no_genotypes(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "no_genotypes.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output) - - ds = xr.open_zarr(output) - - assert "call_genotype" not in ds - assert "call_genotype_mask" not in ds - assert "call_genotype_phased" not in ds - - assert ds["sample_id"].shape == (0,) - assert ds["variant_allele"].shape == (26, 4) - assert ds["variant_contig"].shape == (26,) - assert ds["variant_id"].shape == (26,) - assert ds["variant_id_mask"].shape == (26,) - assert ds["variant_position"].shape == (26,) - - -def test_vcf_to_zarr__no_genotypes_with_gt_header(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "no_genotypes_with_gt_header.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output) - - ds = xr.open_zarr(output) - - assert_array_equal(ds["call_genotype"], -1) - assert_array_equal(ds["call_genotype_mask"], 1) - assert_array_equal(ds["call_genotype_phased"], 0) - - assert ds["sample_id"].shape == (0,) - assert ds["variant_allele"].shape == (26, 4) - assert ds["variant_contig"].shape == (26,) - assert ds["variant_id"].shape == (26,) - assert ds["variant_id_mask"].shape == (26,) - assert ds["variant_position"].shape == (26,) - - -def test_vcf_to_zarr__contig_not_defined_in_header(shared_datadir, tmp_path): - # sample.vcf does not define the contigs in the header, and isn't indexed - path = path_for_test(shared_datadir, "sample.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.raises( - ValueError, - match=r"Contig '19' is not defined in the header.", - ): - vcf_to_zarr(path, output) - - -def test_vcf_to_zarr__filter_not_defined_in_header(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "no_filter_defined.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.raises( - ValueError, - match=r"Filter 'FAIL' is not defined in the header.", - ): - vcf_to_zarr(path, output) - - -def test_vcf_to_zarr__info_name_clash(shared_datadir, tmp_path): - # info_name_clash.vcf has an info field called 'id' which would be mapped to - # 'variant_id', clashing with the fixed field of the same name - path = path_for_test(shared_datadir, "info_name_clash.vcf") - output = tmp_path.joinpath("info_name_clash.zarr").as_posix() - - vcf_to_zarr(path, output) # OK if problematic field is ignored - - with pytest.raises( - ValueError, - match=r"Generated name for INFO field 'id' clashes with 'variant_id' from fixed VCF fields.", - ): - vcf_to_zarr(path, output, fields=["INFO/id"]) - - -def test_vcf_to_zarr__large_number_of_contigs(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "Homo_sapiens_assembly38.headerOnly.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output) - - ds = xr.open_zarr(output) - - assert len(ds["contig_id"]) == 3366 - assert ds["variant_contig"].dtype == np.int16 # needs larger dtype than np.int8 - - -def test_vcf_to_zarr__fields(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr( - path, - output, - chunk_length=5, - chunk_width=2, - fields=["INFO/DP", "INFO/AA", "INFO/DB", "FORMAT/DP"], - ) - ds = xr.open_zarr(output) - - imissing = INT_MISSING - assert_array_equal( - ds["variant_DP"], [imissing, imissing, 14, 11, 10, 13, 9, imissing, imissing] - ) - assert ds["variant_DP"].attrs["comment"] == "Total Depth" - - smissing = STR_MISSING - assert_array_equal( - ds["variant_AA"], - np.array( - [smissing, smissing, smissing, smissing, "T", "T", "G", smissing, smissing], - dtype="O", - ), - ) - assert ds["variant_AA"].attrs["comment"] == "Ancestral Allele" - - assert_array_equal( - ds["variant_DB"], [False, False, True, False, True, False, False, False, False] - ) - assert ds["variant_DB"].attrs["comment"] == "dbSNP membership, build 129" - - dp = np.array( - [ - [imissing, imissing, imissing], - [imissing, imissing, imissing], - [1, 8, 5], - [3, 5, 3], - [6, 0, 4], - [imissing, 4, 2], - [4, 2, 3], - [imissing, imissing, imissing], - [imissing, imissing, imissing], - ], - dtype="i4", - ) - assert_array_equal(ds["call_DP"], dp) - assert ds["call_DP"].attrs["comment"] == "Read Depth" - - -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__parallel_with_fields(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - regions = ["20", "21"] - - vcf_to_zarr( - path, - output, - regions=regions, - chunk_length=5_000, - temp_chunk_length=2_500, - fields=["INFO/MQ", "FORMAT/PGT"], - ) - ds = xr.open_zarr(output) - - # select a small region to check - ds = ds.set_index(variants=("variant_contig", "variant_position")).sel( - variants=slice((0, 10001661), (0, 10001670)) - ) - sfill = STR_FILL - smissing = STR_MISSING - # check strings have not been truncated after concat_zarrs - assert_array_equal( - ds["variant_allele"], - np.array( - [ - ["T", "C", "", sfill], - ["T", "", sfill, sfill], - ["T", "G", "", sfill], - ], - dtype="O", - ), - ) - - # convert floats to ints to check nan type - fmissing = FLOAT32_MISSING - assert_allclose( - ds["variant_MQ"].values.view("i4"), - np.array([58.33, fmissing, 57.45], dtype="f4").view("i4"), - ) - assert ds["variant_MQ"].attrs["comment"] == "RMS Mapping Quality" - - assert_array_equal( - ds["call_PGT"], np.array([["0|1"], [smissing], ["0|1"]], dtype="O") - ) - assert ( - ds["call_PGT"].attrs["comment"] - == "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another" - ) - - -def test_vcf_to_zarr__field_defs(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr( - path, - output, - fields=["INFO/DP"], - field_defs={"INFO/DP": {"Description": "Combined depth across samples"}}, - ) - ds = xr.open_zarr(output) - - imissing = INT_MISSING - assert_array_equal( - ds["variant_DP"], [imissing, imissing, 14, 11, 10, 13, 9, imissing, imissing] - ) - assert ds["variant_DP"].attrs["comment"] == "Combined depth across samples" - - vcf_to_zarr( - path, - output, - fields=["INFO/DP"], - field_defs={"INFO/DP": {"Description": ""}}, # blank description - ) - ds = xr.open_zarr(output) - - assert_array_equal( - ds["variant_DP"], [imissing, imissing, 14, 11, 10, 13, 9, imissing, imissing] - ) - assert "comment" not in ds["variant_DP"].attrs - - -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__field_number_A(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr( - path, - output, - max_alt_alleles=2, - fields=["INFO/AC"], - field_defs={"INFO/AC": {"Number": "A"}}, - ) - ds = xr.open_zarr(output) - - imissing = INT_MISSING - assert_array_equal( - ds["variant_AC"], - [ - [imissing, imissing], - [imissing, imissing], - [imissing, imissing], - [imissing, imissing], - [imissing, imissing], - [imissing, imissing], - [3, 1], - [imissing, imissing], - [imissing, imissing], - ], - ) - assert ( - ds["variant_AC"].attrs["comment"] - == "Allele count in genotypes, for each ALT allele, in the same order as listed" - ) - - -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__field_number_R(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr( - path, - output, - fields=["FORMAT/AD"], - field_defs={"FORMAT/AD": {"Number": "R"}}, - ) - ds = xr.open_zarr(output) - - # select a small region to check - ds = ds.set_index(variants="variant_position").sel( - variants=slice(10002764, 10002793) - ) - - ifill = INT_FILL - imissing = INT_MISSING - ad = np.array( - [ - [[40, 14, 0, ifill]], - [[imissing, imissing, imissing, imissing]], - [[65, 8, 5, 0]], - [[imissing, imissing, imissing, imissing]], - ], - ) - assert_array_equal(ds["call_AD"], ad) - assert ( - ds["call_AD"].attrs["comment"] - == "Allelic depths for the ref and alt alleles in the order listed" - ) - - -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__field_number_G(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output, fields=["FORMAT/PL"]) - ds = xr.open_zarr(output) - - # select a small region to check - ds = ds.set_index(variants="variant_position").sel( - variants=slice(10002764, 10002793) - ) - - fill = INT_FILL - pl = np.array( - [ - [[319, 0, 1316, 440, 1358, 1798, fill, fill, fill, fill]], - [[0, 120, 1800, fill, fill, fill, fill, fill, fill, fill]], - [[8, 0, 1655, 103, 1743, 2955, 184, 1653, 1928, 1829]], - [[0, 0, 2225, fill, fill, fill, fill, fill, fill, fill]], - ], - ) - assert_array_equal(ds["call_PL"], pl) - assert ( - ds["call_PL"].attrs["comment"] - == "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification" - ) - - -def test_vcf_to_zarr__field_number_G_non_diploid(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "simple.output.mixed_depth.likelihoods.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - # store GL field as 2dp - encoding = { - "call_GL": { - "filters": [FixedScaleOffset(offset=0, scale=100, dtype="f4", astype="u1")] - } - } - vcf_to_zarr( - path, - output, - ploidy=4, - max_alt_alleles=3, - fields=["FORMAT/GL"], - encoding=encoding, - ) - ds = xr.open_zarr(output) - - # comb(n_alleles + ploidy - 1, ploidy) = comb(4 + 4 - 1, 4) = comb(7, 4) = 35 - assert_array_equal(ds["call_GL"].shape, (4, 3, 35)) - assert ds["call_GL"].attrs["comment"] == "Genotype likelihoods" - - -@pytest.mark.filterwarnings( - "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning" -) -def test_vcf_to_zarr__field_number_fixed(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - # HQ Number is 2, and a dimension is automatically assigned (FORMAT_HQ_dim) - vcf_to_zarr( - path, - output, - fields=["FORMAT/HQ"], - ) - ds = xr.open_zarr(output) - - missing = INT_MISSING - assert_array_equal( - ds["call_HQ"], - [ - [[10, 15], [10, 10], [3, 3]], - [[10, 10], [10, 10], [3, 3]], - [[51, 51], [51, 51], [missing, missing]], - [[58, 50], [65, 3], [missing, missing]], - [[23, 27], [18, 2], [missing, missing]], - [[56, 60], [51, 51], [missing, missing]], - [[missing, missing], [missing, missing], [missing, missing]], - [[missing, missing], [missing, missing], [missing, missing]], - [[missing, missing], [missing, missing], [missing, missing]], - ], - ) - assert ds["call_HQ"].dims == ("variants", "samples", "FORMAT_HQ_dim") - assert ds["call_HQ"].attrs["comment"] == "Haplotype Quality" - - -def test_vcf_to_zarr__fields_errors(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.raises( - ValueError, - match=r"VCF field must be prefixed with 'INFO/' or 'FORMAT/'", - ): - vcf_to_zarr(path, output, fields=["DP"]) - - with pytest.raises( - ValueError, - match=r"INFO field 'XX' is not defined in the header.", - ): - vcf_to_zarr(path, output, fields=["INFO/XX"]) - - with pytest.raises( - ValueError, - match=r"FORMAT field 'XX' is not defined in the header.", - ): - vcf_to_zarr(path, output, fields=["FORMAT/XX"]) - - with pytest.raises( - ValueError, - match=r"FORMAT field 'XX' is not defined in the header.", - ): - vcf_to_zarr(path, output, exclude_fields=["FORMAT/XX"]) - - with pytest.raises( - ValueError, - match=r"INFO field 'AC' is defined as Number '.', which is not supported. Consider specifying `field_defs` to provide a concrete size for this field.", - ): - vcf_to_zarr(path, output, fields=["INFO/AC"]) - - with pytest.raises( - ValueError, - match=r"INFO field 'AN' is defined as Type 'Blah', which is not supported.", - ): - vcf_to_zarr( - path, - output, - fields=["INFO/AN"], - field_defs={"INFO/AN": {"Type": "Blah"}}, - ) - - -@pytest.mark.parametrize( - "vcf_file, expected_sizes", - [ - ( - "sample.vcf.gz", - { - "max_alt_alleles": 3, - "field_defs": {"INFO/AC": {"Number": 2}, "INFO/AF": {"Number": 2}}, - "ploidy": 2, - }, - ), - ("mixed.vcf.gz", {"max_alt_alleles": 1, "ploidy": 4}), - ("no_genotypes.vcf", {"max_alt_alleles": 1}), - ( - "CEUTrio.20.21.gatk3.4.g.vcf.bgz", - { - "max_alt_alleles": 7, - "field_defs": {"FORMAT/AD": {"Number": 8}}, - "ploidy": 2, - }, - ), - ], -) -def test_zarr_array_sizes(shared_datadir, vcf_file, expected_sizes): - path = path_for_test(shared_datadir, vcf_file) - sizes = zarr_array_sizes(path) - assert sizes == expected_sizes - - -def test_zarr_array_sizes__parallel(shared_datadir): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz") - regions = ["20", "21"] - sizes = zarr_array_sizes(path, regions=regions) - assert sizes == { - "max_alt_alleles": 7, - "field_defs": {"FORMAT/AD": {"Number": 8}}, - "ploidy": 2, - } - - -def test_zarr_array_sizes__multiple(shared_datadir): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz"), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz"), - ] - sizes = zarr_array_sizes(paths, target_part_size=None) - assert sizes == { - "max_alt_alleles": 7, - "field_defs": {"FORMAT/AD": {"Number": 8}}, - "ploidy": 2, - } - - -def test_zarr_array_sizes__parallel_partitioned_by_size(shared_datadir): - path = path_for_test( - shared_datadir, - "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz", - ) - sizes = zarr_array_sizes(path, target_part_size="4MB") - assert sizes == { - "max_alt_alleles": 3, - "field_defs": {"FORMAT/AD": {"Number": 4}}, - "ploidy": 2, - } - - -@pytest.mark.parametrize( - "all_kwargs, expected_sizes", - [ - ([{"max_alt_alleles": 1}, {"max_alt_alleles": 2}], {"max_alt_alleles": 2}), - ( - [{"max_alt_alleles": 1, "ploidy": 3}, {"max_alt_alleles": 2}], - {"max_alt_alleles": 2, "ploidy": 3}, - ), - ( - [ - {"max_alt_alleles": 1, "field_defs": {"FORMAT/AD": {"Number": 8}}}, - {"max_alt_alleles": 2, "field_defs": {"FORMAT/AD": {"Number": 6}}}, - ], - {"max_alt_alleles": 2, "field_defs": {"FORMAT/AD": {"Number": 8}}}, - ), - ], -) -def test_merge_zarr_array_sizes(all_kwargs, expected_sizes): - assert merge_zarr_array_sizes(all_kwargs) == expected_sizes - - -def check_field(group, name, ndim, shape, dimension_names, dtype): - assert group[name].ndim == ndim - assert group[name].shape == shape - assert group[name].attrs["_ARRAY_DIMENSIONS"] == dimension_names - if dtype == str: - assert group[name].dtype == np.object_ - assert VLenUTF8() in group[name].filters - else: - assert group[name].dtype == dtype - - -@pytest.mark.filterwarnings( - "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning" -) -def test_spec(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample_multiple_filters.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - kwargs = zarr_array_sizes(path) - vcf_to_zarr( - path, - output, - chunk_length=5, - fields=["INFO/*", "FORMAT/*"], - mixed_ploidy=True, - **kwargs, - ) - - variants = 9 - alt_alleles = 3 - samples = 3 - ploidy = 2 - - group = zarr.open_group(output) - - # VCF Zarr group attributes - assert group.attrs["vcf_zarr_version"] == "0.2" - assert group.attrs["vcf_header"].startswith("##fileformat=VCFv4.0") - assert group.attrs["contigs"] == ["19", "20", "X"] - - # VCF Zarr arrays - assert set(list(group.array_keys())) == set( - [ - "variant_contig", - "variant_position", - "variant_id", - "variant_id_mask", - "variant_allele", - "variant_quality", - "variant_filter", - "variant_AA", - "variant_AC", - "variant_AF", - "variant_AN", - "variant_DB", - "variant_DP", - "variant_H2", - "variant_NS", - "call_DP", - "call_GQ", - "call_genotype", - "call_genotype_mask", - "call_genotype_fill", - "call_genotype_phased", - "call_HQ", - "contig_id", - "filter_id", - "sample_id", - ] - ) - - # Fixed fields - check_field( - group, - "variant_contig", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.int8, - ) - check_field( - group, - "variant_position", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.int32, - ) - check_field( - group, - "variant_id", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=str, - ) - check_field( - group, - "variant_allele", - ndim=2, - shape=(variants, alt_alleles + 1), - dimension_names=["variants", "alleles"], - dtype=str, - ) - check_field( - group, - "variant_quality", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.float32, - ) - check_field( - group, - "variant_filter", - ndim=2, - shape=(variants, 3), - dimension_names=["variants", "filters"], - dtype=bool, - ) - - # INFO fields - check_field( - group, - "variant_AA", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=str, - ) - check_field( - group, - "variant_AC", - ndim=2, - shape=(variants, 2), - dimension_names=["variants", "INFO_AC_dim"], - dtype=np.int32, - ) - check_field( - group, - "variant_AF", - ndim=2, - shape=(variants, 2), - dimension_names=["variants", "INFO_AF_dim"], - dtype=np.float32, - ) - check_field( - group, - "variant_AN", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.int32, - ) - check_field( - group, - "variant_DB", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=bool, - ) - check_field( - group, - "variant_DP", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.int32, - ) - check_field( - group, - "variant_H2", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=bool, - ) - check_field( - group, - "variant_NS", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.int32, - ) - - # FORMAT fields - check_field( - group, - "call_DP", - ndim=2, - shape=(variants, samples), - dimension_names=["variants", "samples"], - dtype=np.int32, - ) - check_field( - group, - "call_GQ", - ndim=2, - shape=(variants, samples), - dimension_names=["variants", "samples"], - dtype=np.int32, - ) - check_field( - group, - "call_HQ", - ndim=3, - shape=(variants, samples, 2), - dimension_names=["variants", "samples", "FORMAT_HQ_dim"], - dtype=np.int32, - ) - check_field( - group, - "call_genotype", - ndim=3, - shape=(variants, samples, ploidy), - dimension_names=["variants", "samples", "ploidy"], - dtype=np.int8, - ) - check_field( - group, - "call_genotype_phased", - ndim=2, - shape=(variants, samples), - dimension_names=["variants", "samples"], - dtype=bool, - ) - - # Sample information - check_field( - group, - "sample_id", - ndim=1, - shape=(samples,), - dimension_names=["samples"], - dtype=str, - ) - - # Array values - assert_array_equal(group["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2]) - assert_array_equal( - group["variant_position"], - [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10], - ) - assert_array_equal( - group["variant_id"], - [".", ".", "rs6054257", ".", "rs6040355", ".", "microsat1", ".", "rsTest"], - ) - assert_array_equal( - group["variant_allele"], - [ - ["A", "C", "", ""], - ["A", "G", "", ""], - ["G", "A", "", ""], - ["T", "A", "", ""], - ["A", "G", "T", ""], - ["T", "", "", ""], - ["G", "GA", "GAC", ""], - ["T", "", "", ""], - ["AC", "A", "ATG", "C"], - ], - ) - assert_allclose( - group["variant_quality"], [9.6, 10.0, 29.0, 3.0, 67.0, 47.0, 50.0, np.nan, 10.0] - ) - assert ( - group["variant_quality"][:].view(np.int32)[7] - == np.array([FLOAT32_MISSING_AS_INT32], dtype=np.int32).item() - ) # missing nan - assert_array_equal( - group["variant_filter"], - [ - [False, False, False], - [False, False, False], - [True, False, False], - [False, True, True], - [True, False, False], - [True, False, False], - [True, False, False], - [False, False, False], - [True, False, False], - ], - ) - - assert_array_equal( - group["variant_NS"], - [INT_MISSING, INT_MISSING, 3, 3, 2, 3, 3, INT_MISSING, INT_MISSING], - ) - - assert_array_equal( - group["call_DP"], - [ - [INT_MISSING, INT_MISSING, INT_MISSING], - [INT_MISSING, INT_MISSING, INT_MISSING], - [1, 8, 5], - [3, 5, 3], - [6, 0, 4], - [INT_MISSING, 4, 2], - [4, 2, 3], - [INT_MISSING, INT_MISSING, INT_MISSING], - [INT_MISSING, INT_MISSING, INT_MISSING], - ], - ) - assert_array_equal( - group["call_genotype"], - [ - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [1, 0], [1, 1]], - [[0, 0], [0, 1], [0, 0]], - [[1, 2], [2, 1], [2, 2]], - [[0, 0], [0, 0], [0, 0]], - [[0, 1], [0, 2], [-1, -1]], - [[0, 0], [0, 0], [-1, -1]], - [[0, -2], [0, 1], [0, 2]], - ], - ) - assert_array_equal( - group["call_genotype_phased"], - [ - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [False, False, False], - [False, True, False], - [True, False, True], - ], - ) - - assert_array_equal(group["sample_id"], ["NA00001", "NA00002", "NA00003"]) - - -@pytest.mark.parametrize( - "retain_temp_files", - [True, False], -) -def test_vcf_to_zarr__retain_files(shared_datadir, tmp_path, retain_temp_files): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - temp_path = tmp_path.joinpath("temp").as_posix() - - vcf_to_zarr( - path, - output, - chunk_length=5, - chunk_width=2, - tempdir=temp_path, - retain_temp_files=retain_temp_files, - target_part_size="500B", - ) - ds = xr.open_zarr(output) - assert_array_equal(ds["contig_id"], ["19", "20", "X"]) - assert (len(os.listdir(temp_path)) == 0) != retain_temp_files - - -def test_vcf_to_zarr__legacy_contig_and_filter_attrs(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output, chunk_length=5, chunk_width=2) - ds = xr.open_zarr(output) - - # drop new contig_id and filter_id variables - ds = ds.drop_vars(["contig_id", "filter_id"]) - - # check that contigs and filters can still be retrieved (with a warning) - assert num_contigs(ds) == 3 - with pytest.warns(DeprecationWarning): - assert_array_equal(get_contigs(ds), np.array(["19", "20", "X"], dtype="S")) - with pytest.warns(DeprecationWarning): - assert_array_equal(get_filters(ds), np.array(["PASS", "s50", "q10"], dtype="S")) - - -def test_vcf_to_zarr__no_samples(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "no_samples.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - vcf_to_zarr(path, output) - # Run with many parts to test concat_zarrs path also accepts no samples - vcf_to_zarr(path, output, target_part_size="1k") - ds = xr.open_zarr(output) - assert_array_equal(ds["sample_id"], []) - assert_array_equal(ds["contig_id"], ["1"]) - assert ds.sizes["variants"] == 973 diff --git a/sgkit/tests/io/vcf/test_vcf_roundtrip.py b/sgkit/tests/io/vcf/test_vcf_roundtrip.py deleted file mode 100644 index 7b677bbcc..000000000 --- a/sgkit/tests/io/vcf/test_vcf_roundtrip.py +++ /dev/null @@ -1,75 +0,0 @@ -import pytest -from numcodecs import FixedScaleOffset - -from sgkit import load_dataset -from sgkit.io.vcf.vcf_reader import vcf_to_zarr, zarr_array_sizes -from sgkit.io.vcf.vcf_writer import write_vcf, zarr_to_vcf -from sgkit.tests.io.vcf.utils import assert_vcfs_close, path_for_test - - -@pytest.mark.parametrize( - "vcf_file, encoding, generate_header", - [ - ( - "1kg_target_chr20_38_imputed_chr20_1000.vcf", - { - "variant_AF": { - "filters": [ - FixedScaleOffset(offset=0, scale=10000, dtype="f4", astype="u2") - ], - }, - "call_DS": { - "filters": [ - FixedScaleOffset(offset=0, scale=100, dtype="f4", astype="u1") - ], - }, - "variant_DR2": { - "filters": [ - FixedScaleOffset(offset=0, scale=100, dtype="f4", astype="u1") - ], - }, - }, - True, - ), - ("all_fields.vcf", None, True), - ("CEUTrio.20.21.gatk3.4.g.vcf.bgz", None, True), - # don't generate header to avoid https://github.com/pydata/xarray/issues/7328 - ("Homo_sapiens_assembly38.headerOnly.vcf.gz", None, False), - ("mixed.vcf.gz", None, True), - ("no_genotypes.vcf", None, True), - ("no_genotypes_with_gt_header.vcf", None, True), - ("sample_multiple_filters.vcf.gz", None, True), - ("sample.vcf.gz", None, True), - ], -) -@pytest.mark.filterwarnings( - "ignore::sgkit.io.vcf.FloatFormatFieldWarning", - "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning", -) -def test_vcf_to_zarr_to_vcf__real_files( - shared_datadir, tmp_path, vcf_file, encoding, generate_header -): - path = path_for_test(shared_datadir, vcf_file) - intermediate = tmp_path.joinpath("intermediate.vcf.zarr").as_posix() - output = tmp_path.joinpath("output.vcf").as_posix() - - kwargs = zarr_array_sizes(path) - vcf_to_zarr( - path, - intermediate, - fields=["INFO/*", "FORMAT/*"], - mixed_ploidy=True, - encoding=encoding, - **kwargs, - ) - - zarr_to_vcf(intermediate, output) - - assert_vcfs_close(path, output) - - if generate_header: - # delete VCF header attribute to check - # we can generate a VCF header without error - ds = load_dataset(intermediate) - del ds.attrs["vcf_header"] - write_vcf(ds, output) diff --git a/sgkit/tests/io/vcf/test_vcf_scikit_allel.py b/sgkit/tests/io/vcf/test_vcf_scikit_allel.py deleted file mode 100644 index fce7c9b8f..000000000 --- a/sgkit/tests/io/vcf/test_vcf_scikit_allel.py +++ /dev/null @@ -1,226 +0,0 @@ -# Test that converting a VCF file to a dataset using the two pathways -# shown below are equivalent. -# -# allel.vcf_to_zarr -# -# vcf +---------------------> zarr -# -# + + -# | | -# | | -# | | -# sg.vcf_to_zarr | | sg.read_scikit_allel_vcfzarr -# | | -# | | -# | | -# v v -# -# zarr +---------------------> ds -# -# sg.load_dataset - -from pathlib import Path -from typing import Any - -import allel -import numpy as np -import pytest -import xarray as xr -from xarray import Dataset - -import sgkit as sg -from sgkit.io.utils import INT_FILL, INT_MISSING -from sgkit.io.vcf import vcf_to_zarr - - -def assert_identical(ds1: Dataset, ds2: Dataset) -> None: - """Assert two Datasets are identical, including dtypes for all variables, except strings.""" - xr.testing.assert_identical(ds1, ds2) - # check all types except strings (since they may differ e.g. "O" vs "U") - assert all( - [ - ds1[v].dtype == ds2[v].dtype - for v in ds1.data_vars - if ds1[v].dtype.kind not in ("O", "S", "U") - ] - ) - - -def create_allel_vcfzarr( - shared_datadir: Path, - tmpdir: Path, - *, - vcf_file: str = "sample.vcf.gz", - **kwargs: Any, -) -> Path: - """Create a vcfzarr file using scikit-allel""" - vcf_path = shared_datadir / vcf_file - output_path = tmpdir / f"allel_{vcf_file}.zarr" - allel.vcf_to_zarr(str(vcf_path), str(output_path), **kwargs) - return output_path - - -def create_sg_vcfzarr( - shared_datadir: Path, - tmpdir: Path, - *, - vcf_file: str = "sample.vcf.gz", - **kwargs: Any, -) -> Path: - """Create a vcfzarr file using sgkit""" - vcf_path = shared_datadir / vcf_file - output_path = tmpdir / f"sg_{vcf_file}.zarr" - vcf_to_zarr(vcf_path, str(output_path), **kwargs) - return output_path - - -def fix_missing_fields(ds: Dataset) -> Dataset: - # drop variables and attributes that are not included in scikit-allel - ds = ds.drop_vars("call_genotype_phased") - ds = ds.drop_vars("variant_filter") - ds = ds.drop_vars("filter_id") - del ds.attrs["filters"] - del ds.attrs["max_alt_alleles_seen"] - del ds.attrs["vcf_zarr_version"] - del ds.attrs["vcf_header"] - - # scikit-allel doesn't distinguish between missing and fill fields, so set all to fill - for var in ds.data_vars: - if ds[var].dtype == np.int32: # type: ignore[comparison-overlap] - ds[var] = ds[var].where(ds[var] != INT_MISSING, INT_FILL) - return ds - - -def test_default_fields(shared_datadir, tmpdir): - allel_vcfzarr_path = create_allel_vcfzarr(shared_datadir, tmpdir) - allel_ds = sg.read_scikit_allel_vcfzarr(allel_vcfzarr_path) - - sg_vcfzarr_path = create_sg_vcfzarr(shared_datadir, tmpdir) - sg_ds = sg.load_dataset(str(sg_vcfzarr_path)) - sg_ds = fix_missing_fields(sg_ds) - - assert_identical(allel_ds, sg_ds) - - -def test_DP_field(shared_datadir, tmpdir): - fields = [ - "variants/CHROM", - "variants/POS", - "variants/ID", - "variants/REF", - "variants/ALT", - "variants/QUAL", - "calldata/GT", - "samples", - # extra - "calldata/DP", - "variants/DP", - ] - types = {"calldata/DP": "i4"} # override default of i2 - allel_vcfzarr_path = create_allel_vcfzarr( - shared_datadir, tmpdir, fields=fields, types=types - ) - allel_ds = sg.read_scikit_allel_vcfzarr(allel_vcfzarr_path) - - sg_vcfzarr_path = create_sg_vcfzarr( - shared_datadir, tmpdir, fields=["INFO/DP", "FORMAT/DP", "FORMAT/GT"] - ) - sg_ds = sg.load_dataset(str(sg_vcfzarr_path)) - sg_ds = fix_missing_fields(sg_ds) - - assert_identical(allel_ds, sg_ds) - - -@pytest.mark.parametrize( - "vcf_file,allel_exclude_fields,sgkit_exclude_fields,max_alt_alleles", - [ - # Excluding AA here because of pad-vs-missing data in sckit-allel strings - # https://github.com/sgkit-dev/sgkit/issues/1195 - ("sample.vcf.gz", ["AA"], ["INFO/AA"], 3), - ("mixed.vcf.gz", None, None, 3), - # exclude PL since it has Number=G, which is not yet supported - # Excluding PGT and PID here because of pad-vs-missing data in sckit-allel strings - # https://github.com/sgkit-dev/sgkit/issues/1195 - # increase max_alt_alleles since scikit-allel does not truncate genotype calls - ( - "CEUTrio.20.21.gatk3.4.g.vcf.bgz", - ["calldata/PL", "calldata/PGT", "calldata/PID"], - ["FORMAT/PL", "FORMAT/PGT", "FORMAT/PID"], - 7, - ), - ], -) -@pytest.mark.filterwarnings( - "ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning", - "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning", -) -def test_all_fields( - shared_datadir, - tmpdir, - vcf_file, - allel_exclude_fields, - sgkit_exclude_fields, - max_alt_alleles, -): - # change scikit-allel type defaults back to the VCF default - types = { - "calldata/DP": "i4", - "calldata/GQ": "i4", - "calldata/HQ": "i4", - "calldata/AD": "i4", - } - allel_vcfzarr_path = create_allel_vcfzarr( - shared_datadir, - tmpdir, - vcf_file=vcf_file, - fields=["*"], - exclude_fields=allel_exclude_fields, - types=types, - alt_number=max_alt_alleles, - ) - - field_defs = { - "INFO/AF": {"Number": "A"}, - "INFO/AC": {"Number": "A"}, - "FORMAT/AD": {"Number": "R"}, - # override automatically assigned dim name - "FORMAT/SB": {"dimension": "strand_biases"}, - } - allel_ds = sg.read_scikit_allel_vcfzarr(allel_vcfzarr_path, field_defs=field_defs) - - sg_vcfzarr_path = create_sg_vcfzarr( - shared_datadir, - tmpdir, - vcf_file=vcf_file, - fields=["INFO/*", "FORMAT/*"], - exclude_fields=sgkit_exclude_fields, - field_defs=field_defs, - truncate_calls=True, - max_alt_alleles=max_alt_alleles, - ) - sg_ds = sg.load_dataset(str(sg_vcfzarr_path)) - sg_ds = fix_missing_fields(sg_ds) - - # scikit-allel only records contigs for which there are actual variants, - # whereas sgkit records contigs from the header - allel_ds_contigs = set(allel_ds["contig_id"].values) - sg_ds_contigs = set(sg_ds["contig_id"].values) - assert allel_ds_contigs <= sg_ds_contigs - del allel_ds.attrs["contigs"] - del sg_ds.attrs["contigs"] - # scikit-allel doesn't store contig lengths - if "contig_lengths" in sg_ds.attrs: - del sg_ds.attrs["contig_lengths"] - if "contig_length" in sg_ds: - del sg_ds["contig_length"] - - if allel_ds_contigs < sg_ds_contigs: - # remove contig ids since they can differ (see comment above) - del allel_ds["contig_id"] - del sg_ds["contig_id"] - - # variant_contig variables are not comparable, so remove them before comparison - del allel_ds["variant_contig"] - del sg_ds["variant_contig"] - - assert_identical(allel_ds, sg_ds) diff --git a/sgkit/tests/io/vcf/test_vcf_writer.py b/sgkit/tests/io/vcf/test_vcf_writer.py deleted file mode 100644 index bdb456e04..000000000 --- a/sgkit/tests/io/vcf/test_vcf_writer.py +++ /dev/null @@ -1,294 +0,0 @@ -import gzip -from io import StringIO - -import numpy as np -import pytest -from cyvcf2 import VCF -from numpy.testing import assert_array_equal - -from sgkit.io.dataset import load_dataset -from sgkit.io.vcf.vcf_reader import vcf_to_zarr, zarr_array_sizes -from sgkit.io.vcf.vcf_writer import write_vcf, zarr_to_vcf -from sgkit.testing import simulate_genotype_call_dataset - -from .utils import assert_vcfs_close, path_for_test -from .vcf_writer import canonicalize_vcf - - -def test_canonicalize_vcf(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - canonicalize_vcf(path, output) - - # check INFO fields now are ordered correctly - with gzip.open(path, "rt") as f: - assert "NS=3;DP=9;AA=G;AN=6;AC=3,1" in f.read() - with open(output, "r") as f: - assert "NS=3;AN=6;AC=3,1;DP=9;AA=G" in f.read() - - -@pytest.mark.parametrize("output_is_path", [True, False]) -@pytest.mark.filterwarnings( - "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning", -) -def test_zarr_to_vcf(shared_datadir, tmp_path, output_is_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - intermediate = tmp_path.joinpath("intermediate.vcf.zarr").as_posix() - output = tmp_path.joinpath("output.vcf").as_posix() - - kwargs = zarr_array_sizes(path) - vcf_to_zarr( - path, intermediate, fields=["INFO/*", "FORMAT/*"], mixed_ploidy=True, **kwargs - ) - - if output_is_path: - output = tmp_path.joinpath("output.vcf").as_posix() - zarr_to_vcf(intermediate, output) - else: - output_str = StringIO() - zarr_to_vcf(intermediate, output_str) - with open(output, "w") as f: - f.write(output_str.getvalue()) - - v = VCF(output) - - assert v.samples == ["NA00001", "NA00002", "NA00003"] - - variant = next(v) - - assert variant.CHROM == "19" - assert variant.POS == 111 - assert variant.ID is None - assert variant.REF == "A" - assert variant.ALT == ["C"] - assert variant.QUAL == pytest.approx(9.6) - assert variant.FILTER is None - - assert variant.genotypes == [[0, 0, True], [0, 0, True], [0, 1, False]] - - assert_array_equal( - variant.format("HQ"), - [[10, 15], [10, 10], [3, 3]], - ) - - -@pytest.mark.parametrize("in_memory_ds", [True, False]) -@pytest.mark.filterwarnings( - "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning", -) -def test_write_vcf(shared_datadir, tmp_path, in_memory_ds): - path = path_for_test(shared_datadir, "sample.vcf.gz") - intermediate = tmp_path.joinpath("intermediate.vcf.zarr").as_posix() - output = tmp_path.joinpath("output.vcf").as_posix() - - kwargs = zarr_array_sizes(path) - vcf_to_zarr( - path, intermediate, fields=["INFO/*", "FORMAT/*"], mixed_ploidy=True, **kwargs - ) - - ds = load_dataset(intermediate) - - if in_memory_ds: - ds = ds.load() - - write_vcf(ds, output) - - v = VCF(output) - - assert v.samples == ["NA00001", "NA00002", "NA00003"] - - variant = next(v) - - assert variant.CHROM == "19" - assert variant.POS == 111 - assert variant.ID is None - assert variant.REF == "A" - assert variant.ALT == ["C"] - assert variant.QUAL == pytest.approx(9.6) - assert variant.FILTER is None - - assert variant.genotypes == [[0, 0, True], [0, 0, True], [0, 1, False]] - - assert_array_equal( - variant.format("HQ"), - [[10, 15], [10, 10], [3, 3]], - ) - - # check headers are the same - assert_vcfs_close(path, output) - - -@pytest.mark.filterwarnings( - "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning", -) -def test_write_vcf__set_header(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - intermediate = tmp_path.joinpath("intermediate.vcf.zarr").as_posix() - output = tmp_path.joinpath("output.vcf").as_posix() - - kwargs = zarr_array_sizes(path) - vcf_to_zarr( - path, intermediate, fields=["INFO/*", "FORMAT/*"], mixed_ploidy=True, **kwargs - ) - - ds = load_dataset(intermediate) - - # specified header drops NS and HQ fields, - # and adds H3 and GL fields (which are not in the data) - vcf_header = """##fileformat=VCFv4.3 -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FILTER= -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -""" - - write_vcf(ds, output, vcf_header=vcf_header) - - v = VCF(output) - # check dropped fields are not present in VCF header - assert "##INFO=', - '##INFO=', - '##FORMAT=', - "##contig=", - "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S0 S1 S2 S3 S4 S5 S6 S7 S8 S9", - ] - - -def test_write_vcf__generate_header_errors(tmp_path): - output = tmp_path.joinpath("output.vcf").as_posix() - - # simulate a dataset - ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10, missing_pct=0.3) - ds["variant_position"] = ds["variant_position"] + 1 # make 1-based for VCF - - # unsupported dtype - ds["variant_AB"] = (["variants"], np.zeros(10, dtype="complex")) - with pytest.raises(ValueError, match=r"Unsupported dtype: complex"): - write_vcf(ds, output) - - # VCF number cannot be determined from dimension name - ds["variant_AB"] = (["variants", "my_dim"], np.zeros((10, 7), dtype=np.int32)) - with pytest.raises( - ValueError, match=r"Cannot determine VCF Number for dimension name 'my_dim'" - ): - write_vcf(ds, output) diff --git a/sgkit/tests/io/vcf/test_vcf_writer_utils.py b/sgkit/tests/io/vcf/test_vcf_writer_utils.py deleted file mode 100644 index f9459ebe3..000000000 --- a/sgkit/tests/io/vcf/test_vcf_writer_utils.py +++ /dev/null @@ -1,494 +0,0 @@ -import numpy as np -import pytest - -from sgkit.io.utils import ( - FLOAT32_FILL, - FLOAT32_MISSING, - INT_FILL, - INT_MISSING, - STR_FILL, - STR_MISSING, -) -from sgkit.io.vcf.vcf_writer_utils import ( - FLOAT32_BUF_SIZE, - INT32_BUF_SIZE, - byte_buf_to_str, - create_mask, - ftoa, - interleave, - interleave_buf_size, - itoa, - vcf_fixed_to_byte_buf, - vcf_fixed_to_byte_buf_size, - vcf_format_names_to_byte_buf, - vcf_format_names_to_byte_buf_size, - vcf_genotypes_to_byte_buf, - vcf_genotypes_to_byte_buf_size, - vcf_info_to_byte_buf, - vcf_info_to_byte_buf_size, - vcf_ints_to_byte_buf, - vcf_values_to_byte_buf, - vcf_values_to_byte_buf_size, -) - - -@pytest.mark.parametrize( - "i", - [pow(10, i) - 1 for i in range(10)] - + [pow(10, i) for i in range(10)] - + [pow(10, i) + 1 for i in range(10)] - + [np.iinfo(np.int32).max, np.iinfo(np.int32).min], -) -def test_itoa(i): - buf = np.empty(INT32_BUF_SIZE, dtype=np.uint8) - - a = str(i) - p = itoa(buf, 0, i) - assert byte_buf_to_str(buf[:p]) == a - assert p == len(a) - - if i > 0: - i = -i - a = str(i) - p = itoa(buf, 0, i) - assert byte_buf_to_str(buf[:p]) == a - assert p == len(a) - - -def test_itoa_out_of_range(): - buf = np.empty(INT32_BUF_SIZE * 2, dtype=np.uint8) - with pytest.raises(ValueError, match=r"itoa only supports 32-bit integers"): - itoa(buf, 0, np.iinfo(np.int32).max * 10) - - -@pytest.mark.parametrize( - "f, a", - [ - (0.0, "0"), - (0.0001, "0"), - (0.3, "0.3"), - (0.32, "0.32"), - (0.329, "0.329"), - (0.3217, "0.322"), - (8.0, "8"), - (8.0001, "8"), - (8.3, "8.3"), - (8.32, "8.32"), - (8.329, "8.329"), - (8.3217, "8.322"), - (443.998, "443.998"), - (1028.0, "1028"), - (1028.0001, "1028"), - (1028.3, "1028.3"), - (1028.32, "1028.32"), - (1028.329, "1028.329"), - (1028.3217, "1028.322"), - (np.nan, "nan"), - (np.inf, "inf"), - ], -) -def test_ftoa(f, a): - f = np.array([f], dtype=np.float32)[0] - buf = np.empty(FLOAT32_BUF_SIZE, dtype=np.uint8) - - p = ftoa(buf, 0, f) - assert byte_buf_to_str(buf[:p]) == a - assert p == len(a) - - if f > 0: - f = -f - a = f"-{a}" - p = ftoa(buf, 0, f) - assert byte_buf_to_str(buf[:p]) == a - assert p == len(a) - - -def _check_indexes(buf, indexes, separator): - if separator == ord(" "): - s = byte_buf_to_str(buf) - words = [] - for i in range(len(indexes) - 1): - words.append(s[indexes[i] : indexes[i + 1]].strip()) - assert words == s.split(" ") - - -def test_vcf_fixed_to_byte_buf(): - contigs = np.array(["chr1", "chr2"], dtype="S") - chrom = np.array([0, 1], dtype="i4") - pos = np.array([110, 1430], dtype="i4") - id = np.array([".", "id1"], dtype="S") - alleles = np.array([["A", "AC", "T"], ["G", "", ""]], dtype="S") - qual = np.array([29, FLOAT32_MISSING], dtype="f4") - filters = np.array(["PASS", "q10", "s50"], dtype="S") - filter_ = np.array([[True, False, False], [False, True, True]], dtype="bool") - - buf_size = vcf_fixed_to_byte_buf_size(contigs, id, alleles, filters) - assert buf_size == 63 - - buf = np.empty(buf_size, dtype=np.uint8) - p = vcf_fixed_to_byte_buf( - buf, 0, 0, contigs, chrom, pos, id, alleles, qual, filters, filter_ - ) - buf = buf[:p] - assert byte_buf_to_str(buf) == "chr1\t110\t.\tA\tAC,T\t29\tPASS\t" - - buf = np.empty(buf_size, dtype=np.uint8) - p = vcf_fixed_to_byte_buf( - buf, 0, 1, contigs, chrom, pos, id, alleles, qual, filters, filter_ - ) - buf = buf[:p] - assert byte_buf_to_str(buf) == "chr2\t1430\tid1\tG\t.\t.\tq10;s50\t" - - -@pytest.mark.parametrize( - "a, separator, result", - [ - # int - (np.array([10, 8, INT_MISSING, 41, 5], dtype=np.int32), -1, "108.415"), - ( - np.array([10, 8, INT_MISSING, 41, 5], dtype=np.int32), - ord(" "), - "10 8 . 41 5", - ), - ( - np.array( - [ - [INT_FILL, INT_FILL, INT_FILL], - [0, 21, 43], - [INT_MISSING, 1, INT_FILL], - [1, INT_FILL, INT_FILL], - ], - dtype=np.int32, - ), - -1, - "0,21,43.,11", - ), - ( - np.array( - [ - [INT_FILL, INT_FILL, INT_FILL], - [0, 21, 43], - [INT_MISSING, 1, INT_FILL], - [1, INT_FILL, INT_FILL], - ], - dtype=np.int32, - ), - ord(" "), - " 0,21,43 .,1 1", - ), - # float - ( - np.array([5, 5.5, 6, FLOAT32_MISSING, 7, 7.5], dtype=np.float32), - -1, - "55.56.77.5", - ), - ( - np.array([5, 5.5, 6, FLOAT32_MISSING, 7, 7.5], dtype=np.float32), - ord(" "), - "5 5.5 6 . 7 7.5", - ), - ( - np.array( - [ - [FLOAT32_FILL, FLOAT32_FILL, FLOAT32_FILL], - [0, 1.5, 2], - [FLOAT32_MISSING, 1.5, FLOAT32_FILL], - [1.5, FLOAT32_FILL, FLOAT32_FILL], - ], - dtype=np.float32, - ), - -1, - "0,1.5,2.,1.51.5", - ), - ( - np.array( - [ - [FLOAT32_FILL, FLOAT32_FILL, FLOAT32_FILL], - [0, 1.5, 2], - [FLOAT32_MISSING, 1.5, FLOAT32_FILL], - [1.5, FLOAT32_FILL, FLOAT32_FILL], - ], - dtype=np.float32, - ), - ord(" "), - " 0,1.5,2 .,1.5 1.5", - ), - # string - (np.array(["ab", "cd", STR_MISSING, "ef", "ghi"], dtype="S"), -1, "abcd.efghi"), - ( - np.array(["ab", "cd", STR_MISSING, "ef", "ghi"], dtype="S"), - ord(" "), - "ab cd . ef ghi", - ), - ( - np.array( - [ - [STR_FILL, STR_FILL, STR_FILL], - ["ab", "cd", "ef"], - [STR_MISSING, "ghi", STR_FILL], - ["j", STR_FILL, STR_FILL], - ], - dtype="S", - ), - -1, - "ab,cd,ef.,ghij", - ), - ( - np.array( - [ - [STR_FILL, STR_FILL, STR_FILL], - ["ab", "cd", "ef"], - [STR_MISSING, "ghi", STR_FILL], - ["j", STR_FILL, STR_FILL], - ], - dtype="S", - ), - ord(" "), - " ab,cd,ef .,ghi j", - ), - ], -) -def test_vcf_values_to_byte_buf(a, separator, result): - buf = np.empty(vcf_values_to_byte_buf_size(a), dtype=np.uint8) - indexes = np.empty(a.shape[0] + 1, dtype=np.int32) - p = vcf_values_to_byte_buf(buf, 0, a, indexes, separator=separator) - buf = buf[:p] - - assert byte_buf_to_str(buf) == result - _check_indexes(buf, indexes, separator) - - -def test_vcf_values_to_byte_buf__dtype_errors(): - a = np.ones((2, 2), dtype=np.float64) - with pytest.raises(ValueError, match="Unsupported dtype: float64"): - vcf_values_to_byte_buf_size(a) - - buf = np.empty(1000, dtype=np.uint8) - indexes = np.empty(a.shape[0] + 1, dtype=np.int32) - with pytest.raises(ValueError, match="Unsupported dtype: float64"): - vcf_values_to_byte_buf(buf, 0, a, indexes) - - -@pytest.mark.parametrize("dtype", [np.int32, np.float32, "S"]) -def test_vcf_values_to_byte_buf__dimension_errors(dtype): - a = np.ones((2, 2, 2), dtype=dtype) - buf = np.empty(vcf_values_to_byte_buf_size(a), dtype=np.uint8) - indexes = np.empty(a.shape[0] + 1, dtype=np.int32) - with pytest.raises(ValueError, match="Array must have dimension 1 or 2"): - vcf_values_to_byte_buf(buf, 0, a, indexes) - - -@pytest.mark.parametrize( - "separator, result", - [ - (-1, "0/10|2."), - (ord(" "), "0/1 0|2 ."), - ], -) -def test_vcf_genotypes_to_byte_buf(separator, result): - call_genotype = np.array([[0, 1], [0, 2], [-1, -2]], dtype="i1") - call_genotype_phased = np.array([False, True, False], dtype=bool) - - buf_size = vcf_genotypes_to_byte_buf_size(call_genotype) - buf = np.empty(buf_size, dtype=np.uint8) - indexes = np.empty(call_genotype.shape[0] + 1, dtype=np.int32) - p = vcf_genotypes_to_byte_buf( - buf, 0, call_genotype, call_genotype_phased, indexes, separator=separator - ) - buf = buf[:p] - - assert byte_buf_to_str(buf) == result - _check_indexes(buf, indexes, separator) - - -def test_create_mask__dtype_errors(): - a = np.ones((2, 2), dtype=np.float64) - with pytest.raises(ValueError, match="Unsupported dtype: float64"): - create_mask(a) - - -def test_vcf_info_to_byte_buf(): - a = np.arange(6) - b = np.arange(6, 12) - c = np.arange(12, 18) - - assert a.shape[0] == b.shape[0] == c.shape[0] - - n = a.shape[0] - - a_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - b_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - c_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - - indexes = np.empty((3, n + 1), dtype=np.int32) - - a_p = vcf_ints_to_byte_buf(a_buf, 0, a, indexes[0]) - b_p = vcf_ints_to_byte_buf(b_buf, 0, b, indexes[1]) - c_p = vcf_ints_to_byte_buf(c_buf, 0, c, indexes[2]) - - a_ch = a_buf[:a_p] - b_ch = b_buf[:b_p] - c_ch = c_buf[:c_p] - - assert byte_buf_to_str(a_ch) == "012345" - assert byte_buf_to_str(b_ch) == "67891011" - assert byte_buf_to_str(c_ch) == "121314151617" - - mask = np.full((3, n), False, dtype=bool) - info_prefixes = np.array(["A=", "B=", "C="], dtype="S") - - buf_size = vcf_info_to_byte_buf_size(info_prefixes, a_buf, b_buf, c_buf) - assert buf_size == 207 - buf = np.empty(buf_size, dtype=np.uint8) - - p = 0 - for j in range(6): - p = vcf_info_to_byte_buf( - buf, p, j, indexes, mask, info_prefixes, a_buf, b_buf, c_buf - ) - buf = buf[:p] - - assert ( - byte_buf_to_str(buf) - == "A=0;B=6;C=12A=1;B=7;C=13A=2;B=8;C=14A=3;B=9;C=15A=4;B=10;C=16A=5;B=11;C=17" - ) - - -@pytest.mark.parametrize( - "format_names, result", - [ - ([], "\t.\t"), - (["AB"], "\tAB\t"), - (["AB", "CD", "EF"], "\tAB:CD:EF\t"), - ], -) -def test_vcf_format_names_to_byte_buf(format_names, result): - mask = np.full((len(format_names), 1), False, dtype=bool) - format_names = np.array(format_names, dtype="S") - buf_size = vcf_format_names_to_byte_buf_size(format_names) - assert buf_size == len(result) - buf = np.empty(buf_size, dtype=np.uint8) - - p = vcf_format_names_to_byte_buf(buf, 0, 0, mask, format_names) - assert byte_buf_to_str(buf[:p]) == result - - -def test_interleave(): - a = np.arange(6) - b = np.arange(6, 12) - c = np.arange(12, 18) - - assert a.shape[0] == b.shape[0] == c.shape[0] - - n = a.shape[0] - - a_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - b_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - c_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - - indexes = np.empty((3, n + 1), dtype=np.int32) - - a_p = vcf_ints_to_byte_buf(a_buf, 0, a, indexes[0]) - b_p = vcf_ints_to_byte_buf(b_buf, 0, b, indexes[1]) - c_p = vcf_ints_to_byte_buf(c_buf, 0, c, indexes[2]) - - a_ch = a_buf[:a_p] - b_ch = b_buf[:b_p] - c_ch = c_buf[:c_p] - - assert byte_buf_to_str(a_ch) == "012345" - assert byte_buf_to_str(b_ch) == "67891011" - assert byte_buf_to_str(c_ch) == "121314151617" - - buf_size = interleave_buf_size(indexes, a_buf, b_buf, c_buf) - buf = np.empty(buf_size, dtype=np.uint8) - - mask = np.array([False, False, False]) - - p = interleave(buf, 0, indexes, mask, ord(":"), ord(" "), a_buf, b_buf, c_buf) - buf = buf[:p] - - assert byte_buf_to_str(buf) == "0:6:12 1:7:13 2:8:14 3:9:15 4:10:16 5:11:17" - - -def test_interleave_with_mask(): - a = np.arange(6) - b = np.arange(6, 12) - c = np.arange(12, 18) - - assert a.shape[0] == b.shape[0] == c.shape[0] - - n = a.shape[0] - - a_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - b_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - c_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - - indexes = np.empty((3, n + 1), dtype=np.int32) - - a_p = vcf_ints_to_byte_buf(a_buf, 0, a, indexes[0]) - b_p = vcf_ints_to_byte_buf(b_buf, 0, b, indexes[1]) - c_p = vcf_ints_to_byte_buf(c_buf, 0, c, indexes[2]) - - a_ch = a_buf[:a_p] - b_ch = b_buf[:b_p] - c_ch = c_buf[:c_p] - - assert byte_buf_to_str(a_ch) == "012345" - assert byte_buf_to_str(b_ch) == "67891011" - assert byte_buf_to_str(c_ch) == "121314151617" - - buf_size = interleave_buf_size(indexes, a_buf, b_buf, c_buf) - buf = np.empty(buf_size, dtype=np.uint8) - - mask = np.array([False, True, False]) - - p = interleave(buf, 0, indexes, mask, ord(":"), ord(" "), a_buf, b_buf, c_buf) - buf = buf[:p] - - assert byte_buf_to_str(buf) == "0:12 1:13 2:14 3:15 4:16 5:17" - - -@pytest.mark.skip -def test_interleave_speed(): - n_samples = 100000 - a = np.arange(0, n_samples) - b = np.arange(1, n_samples + 1) - c = np.arange(2, n_samples + 2) - - assert a.shape[0] == b.shape[0] == c.shape[0] - - n = a.shape[0] - - a_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - b_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - c_buf = np.empty(n * INT32_BUF_SIZE, dtype=np.uint8) - - indexes = np.empty((3, n + 1), dtype=np.int32) - - buf_size = interleave_buf_size(indexes, a_buf, b_buf, c_buf) - buf = np.empty(buf_size, dtype=np.uint8) - - mask = np.array([False, False, False]) - - import time - - start = time.time() - - reps = 200 - bytes_written = 0 - for _ in range(reps): - print(".", end="") - - vcf_ints_to_byte_buf(a_buf, 0, a, indexes[0]) - vcf_ints_to_byte_buf(b_buf, 0, b, indexes[1]) - vcf_ints_to_byte_buf(c_buf, 0, c, indexes[2]) - - p = interleave(buf, 0, indexes, mask, ord(":"), ord(" "), a_buf, b_buf, c_buf) - - bytes_written += len(byte_buf_to_str(buf[:p])) - - end = time.time() - print(f"bytes written: {bytes_written}") - print(f"duration: {end-start}") - print(f"speed: {bytes_written/(1000000*(end-start))} MB/s") diff --git a/sgkit/tests/io/vcf/utils.py b/sgkit/tests/io/vcf/utils.py deleted file mode 100644 index ac9a5f113..000000000 --- a/sgkit/tests/io/vcf/utils.py +++ /dev/null @@ -1,105 +0,0 @@ -from itertools import zip_longest -from pathlib import Path - -import numpy as np - -from sgkit.io.vcf.vcf_reader import open_vcf -from sgkit.typing import PathType - - -def path_for_test(shared_datadir: Path, file: str, is_path: bool = True) -> PathType: - """Return a test data path whose type is determined by `is_path`. - - If `is_path` is True, return a `Path`, otherwise return a `str`. - """ - path: PathType = shared_datadir / file - return path if is_path else str(path) - - -def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03): - """Like :py:func:`numpy.testing.assert_allclose()`, but for VCF files. - - Raises an `AssertionError` if two VCF files are not equal to one another. - Float values in QUAL, INFO, or FORMAT fields are compared up to the - desired tolerance. All other values must match exactly. - - Parameters - ---------- - f1 - Path to first VCF to compare. - f2 - Path to second VCF to compare. - rtol - Relative tolerance. - atol - Absolute tolerance. - """ - with open_vcf(f1) as vcf1, open_vcf(f2) as vcf2: - assert vcf1.raw_header == vcf2.raw_header - assert vcf1.samples == vcf2.samples - - for v1, v2 in zip_longest(vcf1, vcf2): - if v1 is None and v2 is not None: - raise AssertionError(f"Right contains extra variant: {v2}") - if v1 is not None and v2 is None: - raise AssertionError(f"Left contains extra variant: {v1}") - - assert v1.CHROM == v2.CHROM, f"CHROM not equal for variants\n{v1}{v2}" - assert v1.POS == v2.POS, f"POS not equal for variants\n{v1}{v2}" - assert v1.ID == v2.ID, f"ID not equal for variants\n{v1}{v2}" - assert v1.REF == v2.REF, f"REF not equal for variants\n{v1}{v2}" - assert v1.ALT == v2.ALT, f"ALT not equal for variants\n{v1}{v2}" - np.testing.assert_allclose( - np.array(v1.QUAL, dtype=np.float32), - np.array(v2.QUAL, dtype=np.float32), - rtol=rtol, - atol=atol, - err_msg=f"QUAL not equal for variants\n{v1}{v2}", - ) - assert set(v1.FILTERS) == set( - v2.FILTERS - ), f"FILTER not equal for variants\n{v1}{v2}" - - assert ( - dict(v1.INFO).keys() == dict(v2.INFO).keys() - ), f"INFO keys not equal for variants\n{v1}{v2}" - for k in dict(v1.INFO).keys(): - # values are python objects (not np arrays) - val1 = v1.INFO[k] - val2 = v2.INFO[k] - if isinstance(val1, float) or ( - isinstance(val1, tuple) and any(isinstance(v, float) for v in val1) - ): - np.testing.assert_allclose( - np.array(val1, dtype=np.float32), - np.array(val2, dtype=np.float32), - rtol=rtol, - atol=atol, - err_msg=f"INFO {k} not equal for variants\n{v1}{v2}", - ) - else: - assert val1 == val2, f"INFO {k} not equal for variants\n{v1}{v2}" - - assert v1.FORMAT == v2.FORMAT, f"FORMAT not equal for variants\n{v1}{v2}" - for field in v1.FORMAT: - if field == "GT": - assert ( - v1.genotypes == v2.genotypes - ), f"GT not equal for variants\n{v1}{v2}" - else: - val1 = v1.format(field) - val2 = v2.format(field) - if val1.dtype.kind == "f": - np.testing.assert_allclose( - val1, - val2, - rtol=rtol, - atol=atol, - err_msg=f"FORMAT {field} not equal for variants\n{v1}{v2}", - ) - else: - np.testing.assert_array_equal( - val1, - val2, - err_msg=f"FORMAT {field} not equal for variants\n{v1}{v2}", - ) diff --git a/sgkit/tests/io/vcf/vcf_generator.py b/sgkit/tests/io/vcf/vcf_generator.py deleted file mode 100644 index 0df0447f6..000000000 --- a/sgkit/tests/io/vcf/vcf_generator.py +++ /dev/null @@ -1,192 +0,0 @@ -# Code to generate a test VCF file with all VCF Type/Number combinations - -import io -import random -from itertools import product -from typing import Any, Dict, List - -import numpy as np -from scipy.special import comb - -from sgkit.io.utils import str_is_int - -from .vcf_writer import VcfVariant, VcfWriter - -ploidy = 2 -max_alt_alleles = 2 - - -def generate_number(vcf_number, alt_alleles): - if vcf_number == ".": - return np.random.randint(1, 10) - elif str_is_int(vcf_number): - return int(vcf_number) - elif vcf_number == "A": - return alt_alleles - elif vcf_number == "R": - return alt_alleles + 1 - elif vcf_number == "G": - n_alleles = alt_alleles + 1 - return comb(n_alleles + ploidy - 1, ploidy, exact=True) - raise ValueError(f"Number '{vcf_number}' is not supported.") - - -def generate_data(vcf_type, n): - if vcf_type == "Integer": - return np.random.randint(-1000, 1000, n).tolist() - elif vcf_type == "Float": - return np.random.uniform(-1000.0, 1000.0, n).tolist() - elif vcf_type == "Character": - return random.choices("abcdefg", k=n) - elif vcf_type == "String": - return random.choices(["ab", "bc", "d", "efg", "hij", "klmn", "op"], k=n) - raise ValueError(f"Type '{vcf_type}' is not supported.") - - -def generate_alleles(alt_alleles): - return random.choices("ACGT", k=alt_alleles + 1) - - -class Field: - def __init__(self, category, vcf_type, vcf_number, name=None): - assert category in ("INFO", "FORMAT") - self.category = category - self.name = name or f"{category[0]}{vcf_type[0]}{vcf_number[0]}".replace( - ".", "D" - ) - self.vcf_type = vcf_type - self.vcf_number = vcf_number - - def get_header(self): - return f'##{self.category}=' - - def generate_values(self, alt_alleles): - if self.vcf_type == "Flag": - yield True - return - - repeat = 2 if self.vcf_number == "." else 1 # multiple lengths for Number=. - for _ in range(repeat): - n = generate_number(self.vcf_number, alt_alleles) - data = generate_data(self.vcf_type, n) - val = ",".join([str(x) for x in data]) - assert val != "." - yield f"{val}" - for i in range(n): - data_str = [str(x) for x in data] - data_str[i] = "." # missing - val = ",".join(data_str) - yield f"{val}" if val != "." else None - if n > 1: - yield None # all missing - - -def generate_header(info_fields, format_fields, vcf_samples): - output = io.StringIO() - - print("##fileformat=VCFv4.3", file=output) - print("##contig=", file=output) - print("##contig=", file=output) - - for info_field in info_fields: - print(info_field.get_header(), file=output) - - for format_field in format_fields: - print(format_field.get_header(), file=output) - - print( - "#CHROM", - "POS", - "ID", - "REF", - "ALT", - "QUAL", - "FILTER", - "INFO", - "FORMAT", - "\t".join(vcf_samples), - sep="\t", - file=output, - ) - - return output.getvalue() - - -def generate_vcf(output, seed=42): - random.seed(seed) - np.random.seed(seed) - - info_fields = [ - Field(*c) - for c in product( - ["INFO"], - ["Integer", "Float", "Character", "String"], - ["1", "2", "A", "R", "."], # Number=G not allowed for INFO fields - ) - ] - info_fields.insert(0, Field("INFO", "Flag", "0", "IB0")) - - format_fields = [ - Field(*c) - for c in product( - ["FORMAT"], - ["Integer", "Float", "Character", "String"], - ["1", "2", "A", "R", "G", "."], - ) - ] - - vcf_samples = ["s1", "s2"] - - header_str = generate_header(info_fields, format_fields, vcf_samples) - - with open(output, mode="w") as out: - vcf_writer = VcfWriter(out, header_str) - - # only have a single field per variant - - pos = 0 - - for info_field in info_fields: - alt_alleles = max_alt_alleles - alleles = generate_alleles(alt_alleles) - for val in info_field.generate_values(alt_alleles): - contig_id = "1" - pos = pos + 1 - ref = alleles[0] - alt = alleles[1:] - info = {} - if val is not None: - info = {info_field.name: val} - samples: List[Dict[str, Any]] = [{}] * len(vcf_samples) - - variant = VcfVariant( - contig_id, pos, ".", ref, alt, None, ["PASS"], info, samples - ) - vcf_writer.write(variant) - - for format_field in format_fields: - alt_alleles = max_alt_alleles - alleles = generate_alleles(alt_alleles) - formats = list(format_field.generate_values(alt_alleles)) - # group into samples - n_samples = len(vcf_samples) - formats_by_sample = [ - formats[i : i + n_samples] for i in range(0, len(formats), n_samples) - ] - - for sample_vals in formats_by_sample: - if len(sample_vals) < n_samples: - sample_vals = sample_vals + [sample_vals[0]] * ( - n_samples - len(sample_vals) - ) # pad with first val - - contig_id = "2" - pos = pos + 1 - ref = alleles[0] - alt = alleles[1:] - samples = [{format_field.name: val} for val in sample_vals] - - variant = VcfVariant( - contig_id, pos, ".", ref, alt, None, ["PASS"], None, samples - ) - vcf_writer.write(variant) diff --git a/sgkit/tests/io/vcf/vcf_writer.py b/sgkit/tests/io/vcf/vcf_writer.py deleted file mode 100644 index 72ade75f3..000000000 --- a/sgkit/tests/io/vcf/vcf_writer.py +++ /dev/null @@ -1,337 +0,0 @@ -import re -import tempfile -from dataclasses import dataclass -from io import StringIO -from typing import Any, Dict, List, MutableMapping, Optional, Union - -import numpy as np -from cyvcf2 import Writer - -from sgkit import load_dataset -from sgkit.io.utils import ( - CHAR_FILL, - CHAR_MISSING, - FLOAT32_FILL_AS_INT32, - FLOAT32_MISSING_AS_INT32, - INT_FILL, - INT_MISSING, - STR_FILL, - STR_MISSING, -) -from sgkit.io.vcf.vcf_reader import open_vcf -from sgkit.typing import PathType - - -@dataclass -class VcfVariant: - chrom: str - pos: int - id: str - ref: str - alt: List[str] - qual: Optional[float] - filter: List[str] - info: Optional[Dict[str, Any]] - samples: List[Dict[str, Any]] - - def __str__(self): - out = StringIO() - - if self.info is None: - info = "." - else: - info_fields = [f"{format_field(k, v)}" for k, v in self.info.items()] - info = ";".join([field for field in info_fields if field != ""]) - if len(info) == 0: - info = "." - if self.samples is None or len(self.samples) == 0: - format_ = "." - else: - format_ = format_empty_as_missing(":".join(self.samples[0].keys())) - print( - self.chrom, - self.pos, - "." if self.id is None else self.id, - self.ref, - "." if self.alt is None else ",".join(filter_none(self.alt)), - "." if self.qual is None else str(self.qual), - "." if self.filter is None else ";".join(self.filter), - info, - format_, - sep="\t", - end="\t", - file=out, - ) - - print( - "\t".join( - [ - format_empty_as_missing( - ":".join([f"{format_value(v)}" for v in sample.values()]) - ) - for sample in self.samples - ] - ), - file=out, - ) - - return out.getvalue().strip() - - -class VcfWriter: - def __init__(self, output, header_str): - self.output = output - self.header_str = header_str - - # create a cyvcf2 file for formatting, not for writing the file - tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".vcf") - self.vcf = Writer.from_string(tmp, self.header_str) - - # print the header - print(self.header_str, end="", file=self.output) - - def write(self, variant): - # use cyvcf2 to format the variant string - in particular floats - v = self.vcf.variant_from_string(str(variant)) - - # print the formatted variant string to the output - print(str(v), end="", file=self.output) - - -def format_field(key, val): - if isinstance(val, bool): - return key if val else "" - return f"{key}={format_value(val)}" - - -def format_value(val): - if val is None: - return "." - elif isinstance(val, str): - return val - elif isinstance(val, bytes): - return val.decode("utf-8") - try: - lst = [format_value(v) for v in val] - return ",".join(lst) - except TypeError: - return str(val) - - -def format_empty_as_missing(val): - if val == "": - return "." - return val - - -def filter_none(lst): - return [x for x in lst if x is not None] - - -def zarr_to_vcf( - input: Union[PathType, MutableMapping[str, bytes]], - output: PathType, -) -> None: - """Convert a Zarr file to VCF. For test purposes only.""" - ds = load_dataset(input) - ds = ds.load() - - header_str = ds.attrs["vcf_header"] - contigs = ds.attrs["contigs"] - filters = ds.attrs["filters"] - - n_samples = ds.sizes["samples"] - - with open(output, mode="w") as out: - vcf_writer = VcfWriter(out, header_str) - - info_fields = _info_fields(header_str) - format_fields = _format_fields(header_str) - - for i in range(ds.sizes["variants"]): - chrom = ds.variant_contig[i].values.item() - pos = ds.variant_position[i].values.item() - id = ds.variant_id[i].values.item() - _, ref_alt = array_to_values(ds.variant_allele[i].values) - ref = ref_alt[0] - alt = ref_alt[1:] - _, qual = array_to_values(ds.variant_quality[i].values) - _, filter_ = array_to_values(ds.variant_filter[i].values) - if isinstance(filter_, bool): - filter_ = np.array([filter_]) - if np.all(~filter_): - filter_ = None - else: - filter_ = [filters[i] for i, f in enumerate(filter_) if f] - - info = {} - samples = [{} for _ in range(n_samples)] # type: ignore - - for key in info_fields: - variable_name = f"variant_{key}" - if variable_name in ds: - arr = ds[variable_name][i].values - present, val = array_to_values(arr, variable_name) - if present: - info[key] = val - - for key in format_fields: - if key == "GT": - variable_name = "call_genotype" - else: - variable_name = f"call_{key}" - if variable_name in ds: - arr = ds[variable_name][i].values - assert len(arr) == n_samples - if key == "GT": - phased = ds["call_genotype_phased"][i].values - for j in range(len(arr)): - present, val = array_to_values(arr[j], variable_name) - if not present: - break # samples should all be present or none are - if key == "GT": - lst = [(str(v) if v is not None else ".") for v in val] - val = ("|" if phased[j] else "/").join(lst) - samples[j][key] = val - - variant = VcfVariant( - contigs[chrom], pos, id, ref, alt, qual, filter_, info, samples - ) - - vcf_writer.write(variant) - - -def array_to_values(arr, name="unknown"): - """Convert an array from cyvcf2 to a 'present' flag, and an array with fill removed.""" - if isinstance(arr, str): # this happens for the Type=String, Number=1 path - arr = np.array([arr], dtype="O") - if arr.dtype == bool: - if arr.size == 1: - return True, arr.item() - else: - return True, arr - elif arr.dtype in (np.int8, np.int16, np.int32): - if name == "call_genotype": - missing, fill = -1, -2 - else: - assert arr.dtype == np.int32 - missing, fill = INT_MISSING, INT_FILL - if arr.size == 1: - val = arr - if val == missing: - return True, None - elif val != fill: - return True, val.item() - else: - arr = arr[arr != fill] # remove fill padding - if arr.size > 0: - val = [x if x != missing else None for x in arr.tolist()] - return True, val - return False, None - elif arr.dtype == np.float32: - missing, fill = FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 - if arr.size == 1: - val = arr - if val.view("i4") == missing: - return True, None - elif val.view("i4") != fill: - return True, val.item() - else: - arr = arr[arr.view("i4") != fill] # remove fill padding - if arr.size > 0: - val = [x.item() if x.view("i4") != missing else None for x in arr] - return True, val - return False, None - elif arr.dtype == np.dtype("S1") or arr.dtype == np.dtype( - "S0" - ): # S0 is some cases (e.g. FC1) - missing, fill = np.array([CHAR_MISSING, CHAR_FILL], dtype="S1") - if arr.size == 1: - val = arr - if val == missing: - return True, None - elif val != fill: - return True, val.item() - else: - arr = arr[arr != fill] # remove fill padding - if arr.size > 0: - val = [x if x != missing else None for x in arr.tolist()] - return True, val - return False, None - elif arr.dtype == np.object_: - missing, fill = STR_MISSING, STR_FILL # type: ignore - lst = arr.tolist() # convert to list o/w comparisons don't work for np O type - if arr.size == 1: - val = lst[0] if isinstance(lst, list) else lst - if val == missing: - return True, None - elif val != fill: - return True, val - else: - arr = arr[arr != fill] # remove fill padding - lst = [x for x in lst if x != fill] - if len(lst) > 0: - val = [x if x != missing else None for x in lst] - return True, val - return False, None - else: - raise ValueError(f"Unsupported dtype: {arr.dtype} {name}") - - -def _info_fields(header_str): - p = re.compile("ID=([^,>]+)") - return [ - p.findall(line)[0] - for line in header_str.split("\n") - if line.startswith("##INFO=") - ] - - -def _format_fields(header_str): - p = re.compile("ID=([^,>]+)") - fields = [ - p.findall(line)[0] - for line in header_str.split("\n") - if line.startswith("##FORMAT=") - ] - # GT must be the first field if present, per the spec (section 1.6.2) - if "GT" in fields: - fields.remove("GT") - fields.insert(0, "GT") - return fields - - -def canonicalize_vcf(input: PathType, output: PathType) -> None: - """Canonicalize the fields in a VCF file by writing all INFO fields in the order that they appear in the header.""" - - with open_vcf(input) as vcf: - info_field_names = _info_fields(vcf.raw_header) - - w = Writer(str(output), vcf) - for v in vcf: - v = _reorder_info_fields(w, v, info_field_names) - w.write_record(v) - w.close() - - -def _reorder_info_fields(writer, variant, info_field_names): - # variant.INFO is readonly so we have to go via a string representation - - variant_str = str(variant)[:-1] # strip newline - fields = variant_str.split("\t") - info_field = fields[7] - if info_field == ".": - return variant - elif info_field != ".": - info_fields = {f.split("=")[0]: f for f in info_field.split(";")} - - # sort info_fields in order of info_field_names - index_map = {v: i for i, v in enumerate(info_field_names)} - info_fields_reordered = sorted( - info_fields.items(), key=lambda pair: index_map[pair[0]] - ) - - # update the info field - fields[7] = ";".join([t[1] for t in info_fields_reordered]) - new_variant_str = "\t".join(fields) - return writer.variant_from_string(new_variant_str) diff --git a/sgkit/tests/test_vcfzarr_reader.py b/sgkit/tests/test_vcfzarr_reader.py deleted file mode 100644 index a4c6c8c08..000000000 --- a/sgkit/tests/test_vcfzarr_reader.py +++ /dev/null @@ -1,203 +0,0 @@ -import allel -import numpy as np -import pytest -import xarray as xr -import zarr -from numpy.testing import assert_array_equal - -from sgkit import read_scikit_allel_vcfzarr -from sgkit.io.vcfzarr_reader import _ensure_2d, vcfzarr_to_zarr - - -def create_vcfzarr( - shared_datadir, tmpdir, *, fields=None, grouped_by_contig=False, consolidated=False -): - """Create a vcfzarr file using scikit-allel""" - vcf_path = shared_datadir / "sample.vcf" - output_path = tmpdir / "sample.vcf.zarr" - if grouped_by_contig: - for contig in ["19", "20", "X"]: - allel.vcf_to_zarr( - str(vcf_path), - str(output_path), - fields=fields, - group=contig, - region=contig, - ) - else: - allel.vcf_to_zarr(str(vcf_path), str(output_path), fields=fields) - if consolidated: - zarr.consolidate_metadata(str(output_path)) - return output_path - - -def test_ensure_2d(): - assert_array_equal(_ensure_2d(np.array([0, 2, 1])), np.array([[0], [2], [1]])) - assert_array_equal(_ensure_2d(np.array([[0], [2], [1]])), np.array([[0], [2], [1]])) - - -def test_read_scikit_allel_vcfzarr(shared_datadir, tmpdir): - vcfzarr_path = create_vcfzarr(shared_datadir, tmpdir) - ds = read_scikit_allel_vcfzarr(vcfzarr_path) - - assert_array_equal(ds["contig_id"], ["19", "20", "X"]) - assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2]) - assert_array_equal( - ds["variant_position"], - [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10], - ) - assert_array_equal( - ds["variant_allele"], - [ - ["A", "C", "", ""], - ["A", "G", "", ""], - ["G", "A", "", ""], - ["T", "A", "", ""], - ["A", "G", "T", ""], - ["T", "", "", ""], - ["G", "GA", "GAC", ""], - ["T", "", "", ""], - ["AC", "A", "ATG", "C"], - ], - ) - assert_array_equal( - ds["variant_id"], - [".", ".", "rs6054257", ".", "rs6040355", ".", "microsat1", ".", "rsTest"], - ) - assert_array_equal( - ds["variant_id_mask"], - [True, True, False, True, False, True, False, True, False], - ) - - assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"]) - - call_genotype = np.array( - [ - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [1, 0], [1, 1]], - [[0, 0], [0, 1], [0, 0]], - [[1, 2], [2, 1], [2, 2]], - [[0, 0], [0, 0], [0, 0]], - [[0, 1], [0, 2], [-1, -1]], - [[0, 0], [0, 0], [-1, -1]], - [[0, -1], [0, 1], [0, 2]], - ], - dtype="i1", - ) - assert_array_equal(ds["call_genotype"], call_genotype) - assert_array_equal(ds["call_genotype_mask"], call_genotype < 0) - assert "call_genotype_phased" not in ds - - -@pytest.mark.parametrize( - "grouped_by_contig, consolidated, has_variant_id", - [ - (False, False, False), - (False, False, True), - (True, False, True), - (True, True, False), - ], -) -def test_vcfzarr_to_zarr( - shared_datadir, - tmpdir, - grouped_by_contig, - consolidated, - has_variant_id, -): - if has_variant_id: - fields = None - else: - fields = [ - "variants/CHROM", - "variants/POS", - "variants/REF", - "variants/ALT", - "calldata/GT", - "samples", - ] - - vcfzarr_path = create_vcfzarr( - shared_datadir, - tmpdir, - fields=fields, - grouped_by_contig=grouped_by_contig, - consolidated=consolidated, - ) - - output = str(tmpdir / "vcf.zarr") - vcfzarr_to_zarr( - vcfzarr_path, - output, - grouped_by_contig=grouped_by_contig, - consolidated=consolidated, - ) - - ds = xr.open_zarr(output, concat_characters=False) - - # Note that variant_allele values are byte strings, not unicode strings (unlike for read_scikit_allel_vcfzarr) - # We should make the two consistent. - - assert_array_equal(ds["contig_id"], ["19", "20", "X"]) - assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2]) - assert_array_equal( - ds["variant_position"], - [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10], - ) - assert_array_equal( - ds["variant_allele"], - [ - [b"A", b"C", b"", b""], - [b"A", b"G", b"", b""], - [b"G", b"A", b"", b""], - [b"T", b"A", b"", b""], - [b"A", b"G", b"T", b""], - [b"T", b"", b"", b""], - [b"G", b"GA", b"GAC", b""], - [b"T", b"", b"", b""], - [b"AC", b"A", b"ATG", b"C"], - ], - ) - if has_variant_id: - assert_array_equal( - ds["variant_id"], - [ - b".", - b".", - b"rs6054257", - b".", - b"rs6040355", - b".", - b"microsat1", - b".", - b"rsTest", - ], - ) - assert_array_equal( - ds["variant_id_mask"], - [True, True, False, True, False, True, False, True, False], - ) - else: - assert "variant_id" not in ds - assert "variant_id_mask" not in ds - - assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"]) - - call_genotype = np.array( - [ - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [1, 0], [1, 1]], - [[0, 0], [0, 1], [0, 0]], - [[1, 2], [2, 1], [2, 2]], - [[0, 0], [0, 0], [0, 0]], - [[0, 1], [0, 2], [-1, -1]], - [[0, 0], [0, 0], [-1, -1]], - [[0, -1], [0, 1], [0, 2]], - ], - dtype="i1", - ) - assert_array_equal(ds["call_genotype"], call_genotype) - assert_array_equal(ds["call_genotype_mask"], call_genotype < 0) - assert "call_genotype_phased" not in ds