Commit: indexing refactor
0x00b1 committed May 18, 2024
1 parent ec09e7a · commit 584466d
Showing 2 changed files with 44 additions and 46 deletions.
src/beignet/datasets/__uni_ref_dataset.py: 3 additions, 8 deletions
@@ -1,4 +1,3 @@
-import re
 from os import PathLike
 from pathlib import Path
 from typing import Callable
@@ -51,8 +50,6 @@ def __init__(
 
         name = self.__class__.__name__.replace("Dataset", "")
 
-        self._pattern = re.compile(r"^UniRef.+_([A-Z0-9]+)\s.+$")
-
         super().__init__(
             pooch.retrieve(
                 url,
@@ -71,14 +68,12 @@ def __init__(
         self.target_transform = target_transform
 
     def __getitem__(self, index: int) -> (str, str):
-        target, sequence = self.get(index)
-
-        (target,) = re.search(self._pattern, target).groups()
+        input, target = self.get(index)
 
         if self.transform:
-            sequence = self.transform(sequence)
+            input = self.transform(input)
 
         if self.target_transform:
             target = self.target_transform(target)
 
-        return sequence, target
+        return input, target
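
Note on the hunk above: `__getitem__` no longer parses the UniRef accession out of the FASTA description itself; `get` now returns the raw (input, target) pair, and any parsing is left to a user-supplied `target_transform`. A minimal sketch of how a caller could recover the old behavior, reusing the regex from the deleted code (the `UniRef50Dataset` constructor call is hypothetical and its real signature may differ):

import re

# Regex copied from the deleted code: captures the accession from a
# UniRef description line such as "UniRef50_A0A009 Some protein n=1 ...".
pattern = re.compile(r"^UniRef.+_([A-Z0-9]+)\s.+$")

def extract_accession(description: str) -> str:
    # Same extraction the dataset used to perform internally; raises
    # AttributeError if the description does not match, as before.
    (accession,) = re.search(pattern, description).groups()
    return accession

# Hypothetical usage:
# dataset = UniRef50Dataset(root, target_transform=extract_accession)
# sequence, accession = dataset[0]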
src/beignet/datasets/_fasta_dataset.py: 41 additions, 38 deletions
@@ -1,9 +1,9 @@
-import subprocess
 from os import PathLike
 from pathlib import Path
 from typing import Callable, Tuple, TypeVar
 
 import numpy
+import tqdm
 
 from beignet.io import ThreadSafeFile
 
@@ -37,10 +37,7 @@ def __init__(
         else:
             self.offsets, sizes = self._build_index()
 
-            numpy.save(
-                f"{offsets}",
-                numpy.stack([self.offsets, sizes]),
-            )
+            numpy.save(f"{offsets}", numpy.stack([self.offsets, sizes]))
 
         self.transform = transform
 
@@ -57,44 +54,50 @@ def __getitem__(self, index: int) -> Tuple[str, str]:
     def __len__(self) -> int:
         return self.offsets.size
 
-    def get(self, index: int) -> str:
+    def get(self, index: int) -> (str, str):
         self.data.seek(self.offsets[index])
 
         if index == len(self) - 1:
             data = self.data.read()
         else:
-            data = self.data.read(
-                self.offsets[index + 1] - self.offsets[index],
-            )
+            data = self.data.read(self.offsets[index + 1] - self.offsets[index])
 
         description, *sequence = data.split("\n")
 
-        return "".join(sequence)
-
-    def _build_index(self) -> tuple[numpy.ndarray, numpy.ndarray]:
-        # TODO: rewrite in Rust (using `libripgrep`) or similar to remove
-        #  dependency on `grep` and `awk`. — Allen (Tuesday, November 29, 2022)
-        return (
-            numpy.fromstring(
-                subprocess.check_output(
-                    f"cat {self.root} "
-                    f"| tqdm --bytes --total $(wc -c < {self.root})"
-                    "| grep --byte-offset '^>' -o | cut -d: -f1",
-                    shell=True,
-                ),
-                dtype=numpy.int64,
-                sep=" ",
-            ),
-            numpy.fromstring(
-                subprocess.check_output(
-                    f"cat {self.root} "
-                    f"| tqdm --bytes --total $(wc -c < {self.root})"
-                    '| awk \'/^>/ {print "";next;} { printf("%s",$0);}\' '
-                    "| tail -n+2 | awk "
-                    "'{print length($1)}'",
-                    shell=True,
-                ),
-                dtype=numpy.int64,
-                sep=" ",
-            ),
-        )
+        return "".join(sequence), description
+
+    def _build_index(self) -> (numpy.ndarray, numpy.ndarray):
+        with open(self.root, "r") as file:
+            content = file.read()
+
+        offsets, sizes = [], []
+
+        current_offset, current_size = 0, 0
+
+        parsing = False
+
+        for sequence in tqdm.tqdm(content.splitlines(keepends=True)):
+            characters = len(sequence)
+
+            if sequence.startswith(">"):
+                if parsing:
+                    sizes = [*sizes, current_size]
+
+                    current_size = 0
+
+                offsets = [*offsets, current_offset]
+
+                parsing = True
+            elif parsing:
+                current_size = current_size + len(sequence.rstrip("\n"))
+
+            current_offset = current_offset + characters
+
+        if parsing:
+            sizes = [*sizes, current_size]
+
+        offsets = numpy.array(offsets, dtype=numpy.int64)
+
+        sizes = numpy.array(sizes, dtype=numpy.int64)
+
+        return offsets, sizes
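
Note on the hunk above: the refactored `_build_index` replaces the shelled-out `grep`/`awk` pipeline with a single pure-Python pass. It walks the file line by line, records the byte offset of every `>` header into `offsets` and the accumulated sequence length (newlines excluded) into `sizes`, and `get` then serves random access by seeking to `offsets[index]` and reading up to the next record's offset (or to EOF). The stacked arrays are still cached with `numpy.save`, as in the earlier hunk. A self-contained sketch of the same scheme outside the class (helper names are illustrative, not part of beignet; it deliberately deviates from the committed code by streaming line by line instead of reading the whole file, and by opening in binary mode so the recorded offsets are true byte positions):

import numpy


def build_fasta_index(path):
    # One pass over the FASTA file: record the byte offset of each ">"
    # header and the length of each record's sequence, newlines excluded.
    offsets, sizes = [], []
    offset, size, parsing = 0, 0, False

    with open(path, "rb") as file:
        for line in file:
            if line.startswith(b">"):
                if parsing:
                    sizes.append(size)
                    size = 0
                offsets.append(offset)
                parsing = True
            elif parsing:
                size += len(line.rstrip(b"\n"))
            offset += len(line)

    if parsing:
        sizes.append(size)

    return (
        numpy.array(offsets, dtype=numpy.int64),
        numpy.array(sizes, dtype=numpy.int64),
    )


def read_record(path, offsets, index):
    # Seek to the record's first byte and read up to the next record
    # (or to EOF for the last one), then split off the description line.
    with open(path, "rb") as file:
        file.seek(int(offsets[index]))
        if index == len(offsets) - 1:
            data = file.read().decode()
        else:
            data = file.read(int(offsets[index + 1] - offsets[index])).decode()

    description, *sequence = data.split("\n")
    return "".join(sequence), description


# Hypothetical usage, mirroring the numpy.save caching step above:
# offsets, sizes = build_fasta_index("uniref50.fasta")
# numpy.save("uniref50.index.npy", numpy.stack([offsets, sizes]))
# sequence, description = read_record("uniref50.fasta", offsets, 0)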
