Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove index creation as it is now done by bio2zarr #109

Merged
merged 1 commit into from
Nov 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from io import StringIO

import pytest
import zarr
from bio2zarr import vcf2zarr

from vcztools.stats import nrecords, stats
Expand Down Expand Up @@ -40,5 +41,9 @@ def test_stats__no_index(tmp_path):
vcz = tmp_path.joinpath("intermediate.vcz")
vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False)

# delete the index created by vcf2zarr
root = zarr.open(vcz, mode="a")
del root["region_index"]

with pytest.raises(ValueError, match="Could not load 'region_index' variable."):
stats(vcz, StringIO())
3 changes: 0 additions & 3 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import numpy as np
from bio2zarr import vcf2zarr

from vcztools.regions import create_index


@contextmanager
def open_vcf(path) -> Iterator[cyvcf2.VCF]:
Expand Down Expand Up @@ -152,5 +150,4 @@ def vcz_path_cache(vcf_path):
vcf2zarr.convert(
[vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False
)
create_index(cached_vcz_path)
return cached_vcz_path
4 changes: 1 addition & 3 deletions vcztools/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import click

from . import query as query_module
from . import regions, vcf_writer
from . import stats as stats_module
from . import vcf_writer

include = click.option(
"-i", "--include", type=str, help="Filter expression to include variant sites."
Expand Down Expand Up @@ -42,8 +42,6 @@ def index(path, nrecords, stats):
stats_module.nrecords(path, sys.stdout)
elif stats:
stats_module.stats(path, sys.stdout)
else:
regions.create_index(path)


@click.command
Expand Down
46 changes: 0 additions & 46 deletions vcztools/regions.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,11 @@
import re
from typing import Any

import numcodecs
import numpy as np
import pandas as pd
import zarr
from pyranges import PyRanges


def create_index(vcz) -> None:
"""Create an index to support efficient region queries."""

root = zarr.open(vcz, mode="r+")

contig = root["variant_contig"]
pos = root["variant_position"]
length = root["variant_length"]

assert contig.cdata_shape == pos.cdata_shape

index = []

for v_chunk in range(pos.cdata_shape[0]):
c = contig.blocks[v_chunk]
p = pos.blocks[v_chunk]
e = p + length.blocks[v_chunk] - 1

# create a row for each contig in the chunk
d = np.diff(c, append=-1)
c_start_idx = 0
for c_end_idx in np.nonzero(d)[0]:
assert c[c_start_idx] == c[c_end_idx]
index.append(
(
v_chunk, # chunk index
c[c_start_idx], # contig ID
p[c_start_idx], # start
p[c_end_idx], # end
np.max(e[c_start_idx : c_end_idx + 1]), # max end
c_end_idx - c_start_idx + 1, # num records
)
)
c_start_idx = c_end_idx + 1

index = np.array(index, dtype=np.int32)
root.array(
"region_index",
data=index,
compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
overwrite=True,
)


def parse_region_string(region: str) -> tuple[str, int | None, int | None]:
"""Return the contig, start position and end position from a region string."""
if re.search(r":\d+-\d*$", region):
Expand Down
2 changes: 1 addition & 1 deletion vcztools/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def stats(vcz, output):
if "region_index" not in root:
raise ValueError(
"Could not load 'region_index' variable. "
"Use 'vcztools index' to create an index."
"Use 'vcz2zarr' to create an index."
)

with open_file_like(output) as output:
Expand Down
Loading