Skip to content

Commit

Permalink
Implement using pyranges
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Jul 12, 2024
1 parent 516e960 commit 2c6c696
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 46 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies = [
"numba",
"zarr>=2.17,<3",
"click",
"pyranges",
]
requires-python = ">=3.9"
dynamic = ["version"]
Expand Down
6 changes: 3 additions & 3 deletions tests/test_regions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Optional
import pytest
from vcztools.regions import parse_targets_string
from vcztools.regions import parse_region


@pytest.mark.parametrize(
Expand All @@ -12,7 +12,7 @@
("chr1:12-103", ("chr1", 12, 103)),
],
)
def test_parse_targets_string(
def test_parse_region(
targets: str, expected: tuple[str, Optional[int], Optional[int]]
):
assert parse_targets_string(targets) == expected
assert parse_region(targets) == expected
76 changes: 41 additions & 35 deletions vcztools/regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,53 +2,59 @@
from typing import Any, List, Optional

import numpy as np
import pandas as pd
import pyranges


def parse_targets_string(targets: str) -> tuple[str, Optional[int], Optional[int]]:
"""Return the contig, start position and end position from a targets string."""
if re.search(r":\d+-\d*$", targets):
contig, start_end = targets.rsplit(":", 1)
def parse_region(region: str) -> tuple[str, Optional[int], Optional[int]]:
"""Return the contig, start position and end position from a region string."""
if re.search(r":\d+-\d*$", region):
contig, start_end = region.rsplit(":", 1)
start, end = start_end.split("-")
return contig, int(start), int(end) if len(end) > 0 else None
elif re.search(r":\d+$", targets):
contig, start = targets.rsplit(":", 1)
elif re.search(r":\d+$", region):
contig, start = region.rsplit(":", 1)
return contig, int(start), int(start)
else:
contig = targets
contig = region
return contig, None, None


def region_to_slice(
def parse_targets(targets: str) -> list[tuple[str, Optional[int], Optional[int]]]:
return [parse_region(region) for region in targets.split(",")]


def regions_to_selection(
all_contigs: List[str],
variant_contig: Any,
variant_position: Any,
contig: str,
start: Optional[int] = None,
end: Optional[int] = None,
) -> slice:
"""Convert a VCF region to a Python slice."""

# subtract one from start since VCF is 1-based (fully closed),
# but Python slices are 0-based (half open)
start = None if start is None else start - 1

contig_index = all_contigs.index(contig)
contig_range = np.searchsorted(variant_contig, [contig_index, contig_index + 1])

if start is None and end is None:
start_index, end_index = contig_range
else:
contig_pos = variant_position[slice(contig_range[0], contig_range[1])]
regions: list[tuple[str, Optional[int], Optional[int]]],
):
# subtract 1 from start coordinate to convert intervals
# from VCF (1-based, fully-closed) to Python (0-based, half-open)

df = pd.DataFrame({"Chromosome": variant_contig, "Start": variant_position - 1, "End": variant_position})
# save original index as column so we can retrieve it after finding overlap
df["index"] = df.index
variants = pyranges.PyRanges(df)

chromosomes = []
starts = []
ends = []
for contig, start, end in regions:
if start is None:
assert end is not None # covered by case above
start_index = contig_range[0]
end_index = contig_range[0] + np.searchsorted(contig_pos, [end])[0]
elif end is None:
start_index = contig_range[0] + np.searchsorted(contig_pos, [start])[0]
end_index = contig_range[1]
start = 0
else:
start_index, end_index = contig_range[0] + np.searchsorted(
contig_pos, [start, end]
)
start -= 1

if end is None:
end = np.iinfo(np.int64).max

chromosomes.append(all_contigs.index(contig))
starts.append(start)
ends.append(end)

query = pyranges.PyRanges(chromosomes=chromosomes, starts=starts, ends=ends)

return slice(start_index, end_index)
overlap = variants.overlap(query)
return overlap.df["index"].to_numpy()
14 changes: 6 additions & 8 deletions vcztools/vcf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import MutableMapping, Optional, TextIO, Union

import numpy as np
from vcztools.regions import parse_targets_string, region_to_slice
from vcztools.regions import parse_targets, regions_to_selection
import zarr

from . import _vcztools
Expand Down Expand Up @@ -167,17 +167,15 @@ def write_vcf(
if variant_targets is None:
variant_mask = np.ones(pos.shape[0], dtype=bool)
else:
contig, start, end = parse_targets_string(variant_targets)
variant_slice = region_to_slice(
regions = parse_targets(variant_targets)
variant_selection = regions_to_selection(
root["contig_id"][:].astype("U").tolist(),
root["variant_contig"],
pos,
contig,
start,
end,
pos[:],
regions,
)
variant_mask = np.zeros(pos.shape[0], dtype=bool)
variant_mask[variant_slice] = 1
variant_mask[variant_selection] = 1
# Use zarr arrays to get mask chunks aligned with the main data
# for convenience.
z_variant_mask = zarr.array(variant_mask, chunks=pos.chunks[0])
Expand Down

0 comments on commit 2c6c696

Please sign in to comment.