Skip to content

Commit

Permalink
feat: add Searcher
Browse files (browse the repository at this point in the history)
  • Loading branch information
WieeRd committed Feb 17, 2024
1 parent 1757b85 commit bdb7923
Showing 1 changed file with 51 additions and 44 deletions.
95 changes: 51 additions & 44 deletions ricecake/search.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,6 +2,7 @@

import re
from collections.abc import Iterator
from dataclasses import dataclass

from . import offset as o

Expand Down Expand Up @@ -41,55 +42,61 @@
# fmt: on


def search_pattern(
text: str,
/,
*,
choseong_search: bool = True,
jongseong_completion: bool = True,
incremental: bool = True,
) -> Iterator[str]:
"""Generates a regex pattern tailored for searching Hangul texts.
# DOC: did you know? writing human language is a lot harder than programming language
# TEST: speaking of docs, I haven't tested anything I coded so far.
# | I should add example sections with doctests at some point
# | when is that some point? who knows.
@dataclass(kw_only=True)
class Searcher:
"""Fuzzy & incremental search for Korean texts.
Examples: Coming soon:tm:
Args:
text: ...
choseong_search: ...
jongseong_completion: ...
incremental: ...
Attributes:
choseong_search: Match Jaum with all syllables using that Jaum as a Choseong.
jongseong_completion: Documentation is hard.
incremental: I'll come back later.
fuzzy: Hopefully.
"""
for c in re.escape(text):
code = ord(c)

# FEAT: LATER: composite jamo completion e.g. "우" -> "위", "일" -> "읽"
if jongseong_completion and o.is_syllable(c):
# checks if the syllable is missing a jongseong
# if so, yield a pattern that matches any jongseong
# e.g. "슈" -> "[슈-슣]" / "슉" -> "슉"
if (code - o.SYLLABLE_BASE) % o.JUNGSEONG_COEF == 0:
yield f"[{c}-{chr(code + o.JUNGSEONG_COEF - 1)}]"
choseong_search: bool
jongseong_completion: bool
incremental: bool
fuzzy: bool

def _search_pattern(self, text: str, /) -> Iterator[str]:
"""Generates regex patterns for each character in the text."""
for c in re.escape(text):
code = ord(c)

# FEAT: LATER: composite jamo completion e.g. "우" -> "위", "일" -> "읽"
if self.jongseong_completion and o.is_syllable(c):
# checks if the syllable is missing a jongseong
# if so, yield a pattern that matches any jongseong
# e.g. "슈" -> "[슈-슣]" / "슉" -> "슉"
if (code - o.SYLLABLE_BASE) % o.JUNGSEONG_COEF == 0:
yield f"[{c}-{chr(code + o.JUNGSEONG_COEF - 1)}]"

elif (
choseong_search
and o.MODERN_COMPAT_JAUM_BASE <= code <= o.MODERN_COMPAT_JAUM_END
):
# compat jamo cannot be 1:1 mapped to jamo or syllable using algorithm
# because jamo separates jongseong-only jaums while compat jamo does not
# instead, consult the lookup table and yield a pattern that matches
# choseong itself, or any syllable that starts with the choseong
offset = ord(c) - o.MODERN_COMPAT_JAUM_BASE
yield _COMPAT_JAMO_CHOSEONG_PATTERN[offset] or c
elif (
self.choseong_search
and o.MODERN_COMPAT_JAUM_BASE <= code <= o.MODERN_COMPAT_JAUM_END
):
# compat jamo cannot be 1:1 mapped to jamo or syllable using algorithm
# because jamo separates jongseong-only jaums while compat jamo does not
# instead, consult the lookup table and yield a pattern that matches
# choseong itself, or any syllable that starts with the choseong
offset = ord(c) - o.MODERN_COMPAT_JAUM_BASE
yield _COMPAT_JAMO_CHOSEONG_PATTERN[offset] or c

elif o.is_jamo(c):
# FEAT: preprocess text with `re.escape()` and `unicodedata.normalize("NFC", ...)`
# | should this be the caller's responsibility or this function's?
raise ValueError("Hangul Jamo and NFD-normalized string are not supported")
elif o.is_jamo(c):
# FEAT: preprocess text with `re.escape()` and `unicodedata.normalize("NFC", ...)`
# | should this be the caller's responsibility or this function's?
raise ValueError(
"Hangul Jamo and NFD-normalized string are not supported"
)

yield c
yield c

if not (incremental and text):
return
if not (self.incremental and text):
return

# FEAT: get the last character, and do either jongseong completion or choseong search
raise NotImplementedError
# FEAT: get the last character, and do either jongseong completion or choseong search
raise NotImplementedError

0 comments on commit bdb7923

Please sign in to comment.