From 2c6c6966c5772e1e050ff5dafdd7fe1fac972a76 Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 12 Jul 2024 15:30:16 +0100 Subject: [PATCH] Implement using pyranges --- pyproject.toml | 1 + tests/test_regions.py | 6 ++-- vcztools/regions.py | 76 +++++++++++++++++++++++------------------- vcztools/vcf_writer.py | 14 ++++---- 4 files changed, 51 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1b53341..a70cbfc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "numba", "zarr>=2.17,<3", "click", + "pyranges", ] requires-python = ">=3.9" dynamic = ["version"] diff --git a/tests/test_regions.py b/tests/test_regions.py index 36c966b..71f076d 100644 --- a/tests/test_regions.py +++ b/tests/test_regions.py @@ -1,6 +1,6 @@ from typing import Optional import pytest -from vcztools.regions import parse_targets_string +from vcztools.regions import parse_region @pytest.mark.parametrize( @@ -12,7 +12,7 @@ ("chr1:12-103", ("chr1", 12, 103)), ], ) -def test_parse_targets_string( +def test_parse_region( targets: str, expected: tuple[str, Optional[int], Optional[int]] ): - assert parse_targets_string(targets) == expected + assert parse_region(targets) == expected diff --git a/vcztools/regions.py b/vcztools/regions.py index f0d6394..2ba1d97 100644 --- a/vcztools/regions.py +++ b/vcztools/regions.py @@ -2,53 +2,59 @@ from typing import Any, List, Optional import numpy as np +import pandas as pd +import pyranges -def parse_targets_string(targets: str) -> tuple[str, Optional[int], Optional[int]]: - """Return the contig, start position and end position from a targets string.""" - if re.search(r":\d+-\d*$", targets): - contig, start_end = targets.rsplit(":", 1) +def parse_region(region: str) -> tuple[str, Optional[int], Optional[int]]: + """Return the contig, start position and end position from a region string.""" + if re.search(r":\d+-\d*$", region): + contig, start_end = region.rsplit(":", 1) start, end = start_end.split("-") return contig, int(start), int(end) if len(end) > 0 else None - elif re.search(r":\d+$", targets): - contig, start = targets.rsplit(":", 1) + elif re.search(r":\d+$", region): + contig, start = region.rsplit(":", 1) return contig, int(start), int(start) else: - contig = targets + contig = region return contig, None, None -def region_to_slice( +def parse_targets(targets: str) -> list[tuple[str, Optional[int], Optional[int]]]: + return [parse_region(region) for region in targets.split(",")] + + +def regions_to_selection( all_contigs: List[str], variant_contig: Any, variant_position: Any, - contig: str, - start: Optional[int] = None, - end: Optional[int] = None, -) -> slice: - """Convert a VCF region to a Python slice.""" - - # subtract one from start since VCF is 1-based (fully closed), - # but Python slices are 0-based (half open) - start = None if start is None else start - 1 - - contig_index = all_contigs.index(contig) - contig_range = np.searchsorted(variant_contig, [contig_index, contig_index + 1]) - - if start is None and end is None: - start_index, end_index = contig_range - else: - contig_pos = variant_position[slice(contig_range[0], contig_range[1])] + regions: list[tuple[str, Optional[int], Optional[int]]], +): + # subtract 1 from start coordinate to convert intervals + # from VCF (1-based, fully-closed) to Python (0-based, half-open) + + df = pd.DataFrame({"Chromosome": variant_contig, "Start": variant_position - 1, "End": variant_position}) + # save original index as column so we can retrieve it after finding overlap + df["index"] = df.index + variants = pyranges.PyRanges(df) + + chromosomes = [] + starts = [] + ends = [] + for contig, start, end in regions: if start is None: - assert end is not None # covered by case above - start_index = contig_range[0] - end_index = contig_range[0] + np.searchsorted(contig_pos, [end])[0] - elif end is None: - start_index = contig_range[0] + np.searchsorted(contig_pos, [start])[0] - end_index = contig_range[1] + start = 0 else: - start_index, end_index = contig_range[0] + np.searchsorted( - contig_pos, [start, end] - ) + start -= 1 + + if end is None: + end = np.iinfo(np.int64).max + + chromosomes.append(all_contigs.index(contig)) + starts.append(start) + ends.append(end) + + query = pyranges.PyRanges(chromosomes=chromosomes, starts=starts, ends=ends) - return slice(start_index, end_index) + overlap = variants.overlap(query) + return overlap.df["index"].to_numpy() diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index d8c4728..833bf3f 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -5,7 +5,7 @@ from typing import MutableMapping, Optional, TextIO, Union import numpy as np -from vcztools.regions import parse_targets_string, region_to_slice +from vcztools.regions import parse_targets, regions_to_selection import zarr from . import _vcztools @@ -167,17 +167,15 @@ def write_vcf( if variant_targets is None: variant_mask = np.ones(pos.shape[0], dtype=bool) else: - contig, start, end = parse_targets_string(variant_targets) - variant_slice = region_to_slice( + regions = parse_targets(variant_targets) + variant_selection = regions_to_selection( root["contig_id"][:].astype("U").tolist(), root["variant_contig"], - pos, - contig, - start, - end, + pos[:], + regions, ) variant_mask = np.zeros(pos.shape[0], dtype=bool) - variant_mask[variant_slice] = 1 + variant_mask[variant_selection] = 1 # Use zarr arrays to get mask chunks aligned with the main data # for convenience. z_variant_mask = zarr.array(variant_mask, chunks=pos.chunks[0])