Skip to content

Commit

Permalink
Remove index creation as it is now done by bio2zarr
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite authored and jeromekelleher committed Nov 28, 2024
1 parent 598c441 commit 97578d1
Show file tree
Hide file tree
Showing 5 changed files with 7 additions and 53 deletions.
5 changes: 5 additions & 0 deletions tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from io import StringIO

import pytest
import zarr
from bio2zarr import vcf2zarr

from vcztools.stats import nrecords, stats
Expand Down Expand Up @@ -40,5 +41,9 @@ def test_stats__no_index(tmp_path):
vcz = tmp_path.joinpath("intermediate.vcz")
vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False)

# delete the index created by vcf2zarr
root = zarr.open(vcz, mode="a")
del root["region_index"]

with pytest.raises(ValueError, match="Could not load 'region_index' variable."):
stats(vcz, StringIO())
3 changes: 0 additions & 3 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import numpy as np
from bio2zarr import vcf2zarr

from vcztools.regions import create_index


@contextmanager
def open_vcf(path) -> Iterator[cyvcf2.VCF]:
Expand Down Expand Up @@ -152,5 +150,4 @@ def vcz_path_cache(vcf_path):
vcf2zarr.convert(
[vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False
)
create_index(cached_vcz_path)
return cached_vcz_path
4 changes: 1 addition & 3 deletions vcztools/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import click

from . import query as query_module
from . import regions, vcf_writer
from . import stats as stats_module
from . import vcf_writer

include = click.option(
"-i", "--include", type=str, help="Filter expression to include variant sites."
Expand Down Expand Up @@ -42,8 +42,6 @@ def index(path, nrecords, stats):
stats_module.nrecords(path, sys.stdout)
elif stats:
stats_module.stats(path, sys.stdout)
else:
regions.create_index(path)


@click.command
Expand Down
46 changes: 0 additions & 46 deletions vcztools/regions.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,11 @@
import re
from typing import Any

import numcodecs
import numpy as np
import pandas as pd
import zarr
from pyranges import PyRanges


def create_index(vcz) -> None:
"""Create an index to support efficient region queries."""

root = zarr.open(vcz, mode="r+")

contig = root["variant_contig"]
pos = root["variant_position"]
length = root["variant_length"]

assert contig.cdata_shape == pos.cdata_shape

index = []

for v_chunk in range(pos.cdata_shape[0]):
c = contig.blocks[v_chunk]
p = pos.blocks[v_chunk]
e = p + length.blocks[v_chunk] - 1

# create a row for each contig in the chunk
d = np.diff(c, append=-1)
c_start_idx = 0
for c_end_idx in np.nonzero(d)[0]:
assert c[c_start_idx] == c[c_end_idx]
index.append(
(
v_chunk, # chunk index
c[c_start_idx], # contig ID
p[c_start_idx], # start
p[c_end_idx], # end
np.max(e[c_start_idx : c_end_idx + 1]), # max end
c_end_idx - c_start_idx + 1, # num records
)
)
c_start_idx = c_end_idx + 1

index = np.array(index, dtype=np.int32)
root.array(
"region_index",
data=index,
compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
overwrite=True,
)


def parse_region_string(region: str) -> tuple[str, int | None, int | None]:
"""Return the contig, start position and end position from a region string."""
if re.search(r":\d+-\d*$", region):
Expand Down
2 changes: 1 addition & 1 deletion vcztools/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def stats(vcz, output):
if "region_index" not in root:
raise ValueError(
"Could not load 'region_index' variable. "
"Use 'vcztools index' to create an index."
"Use 'vcz2zarr' to create an index."
)

with open_file_like(output) as output:
Expand Down

0 comments on commit 97578d1

Please sign in to comment.