From 2d5aa1b37611c0251fc27cba26f150ae56de7b59 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 25 Nov 2024 17:09:17 +0000 Subject: [PATCH] Remove index creation as it is now done by bio2zarr --- tests/test_stats.py | 5 +++++ tests/utils.py | 3 --- vcztools/cli.py | 4 +--- vcztools/regions.py | 46 --------------------------------------------- vcztools/stats.py | 2 +- 5 files changed, 7 insertions(+), 53 deletions(-) diff --git a/tests/test_stats.py b/tests/test_stats.py index d171e82..7a28c3f 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -2,6 +2,7 @@ from io import StringIO import pytest +import zarr from bio2zarr import vcf2zarr from vcztools.stats import nrecords, stats @@ -40,5 +41,9 @@ def test_stats__no_index(tmp_path): vcz = tmp_path.joinpath("intermediate.vcz") vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False) + # delete the index created by vcf2zarr + root = zarr.open(vcz, mode="a") + del root["region_index"] + with pytest.raises(ValueError, match="Could not load 'region_index' variable."): stats(vcz, StringIO()) diff --git a/tests/utils.py b/tests/utils.py index f27d8ab..2129709 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -7,8 +7,6 @@ import numpy as np from bio2zarr import vcf2zarr -from vcztools.regions import create_index - @contextmanager def open_vcf(path) -> Iterator[cyvcf2.VCF]: @@ -152,5 +150,4 @@ def vcz_path_cache(vcf_path): vcf2zarr.convert( [vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False ) - create_index(cached_vcz_path) return cached_vcz_path diff --git a/vcztools/cli.py b/vcztools/cli.py index 693a6c4..3911102 100644 --- a/vcztools/cli.py +++ b/vcztools/cli.py @@ -3,8 +3,8 @@ import click from . import query as query_module -from . import regions, vcf_writer from . import stats as stats_module +from . import vcf_writer include = click.option( "-i", "--include", type=str, help="Filter expression to include variant sites." @@ -42,8 +42,6 @@ def index(path, nrecords, stats): stats_module.nrecords(path, sys.stdout) elif stats: stats_module.stats(path, sys.stdout) - else: - regions.create_index(path) @click.command diff --git a/vcztools/regions.py b/vcztools/regions.py index bc6056a..f0a10e3 100644 --- a/vcztools/regions.py +++ b/vcztools/regions.py @@ -1,57 +1,11 @@ import re from typing import Any -import numcodecs import numpy as np import pandas as pd -import zarr from pyranges import PyRanges -def create_index(vcz) -> None: - """Create an index to support efficient region queries.""" - - root = zarr.open(vcz, mode="r+") - - contig = root["variant_contig"] - pos = root["variant_position"] - length = root["variant_length"] - - assert contig.cdata_shape == pos.cdata_shape - - index = [] - - for v_chunk in range(pos.cdata_shape[0]): - c = contig.blocks[v_chunk] - p = pos.blocks[v_chunk] - e = p + length.blocks[v_chunk] - 1 - - # create a row for each contig in the chunk - d = np.diff(c, append=-1) - c_start_idx = 0 - for c_end_idx in np.nonzero(d)[0]: - assert c[c_start_idx] == c[c_end_idx] - index.append( - ( - v_chunk, # chunk index - c[c_start_idx], # contig ID - p[c_start_idx], # start - p[c_end_idx], # end - np.max(e[c_start_idx : c_end_idx + 1]), # max end - c_end_idx - c_start_idx + 1, # num records - ) - ) - c_start_idx = c_end_idx + 1 - - index = np.array(index, dtype=np.int32) - root.array( - "region_index", - data=index, - compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0), - overwrite=True, - ) - - def parse_region_string(region: str) -> tuple[str, int | None, int | None]: """Return the contig, start position and end position from a region string.""" if re.search(r":\d+-\d*$", region): diff --git a/vcztools/stats.py b/vcztools/stats.py index c613e4e..05b2ce7 100644 --- a/vcztools/stats.py +++ b/vcztools/stats.py @@ -18,7 +18,7 @@ def stats(vcz, output): if "region_index" not in root: raise ValueError( "Could not load 'region_index' variable. " - "Use 'vcztools index' to create an index." + "Use 'vcz2zarr' to create an index." ) with open_file_like(output) as output: