Skip to content

Commit

Permalink
chore: note search pattern API design
Browse files Browse the repository at this point in the history
  • Loading branch information
WieeRd committed Mar 24, 2024
1 parent 5b18815 commit 6356f08
Showing 1 changed file with 75 additions and 74 deletions.
149 changes: 75 additions & 74 deletions ricecake/search.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,45 @@
"""Generates regex patterns tailored for searching Korean texts."""

import re
from collections.abc import Iterator
from dataclasses import dataclass

from . import offset as o

# fmt: off
_COMPAT_JAMO_CHOSEONG_PATTERN = [
"[ㄱ가-깋]", # "ㄱ"
"[ㄲ까-낗]", # "ㄲ"
None, # "ㄳ"
"[ㄴ나-닣]", # "ㄴ"
None, # "ㄵ"
None, # "ㄶ"
"[ㄷ다-딯]", # "ㄷ"
"[ㄸ따-띻]", # "ㄸ"
"[ㄹ라-맇]", # "ㄹ"
None, # "ㄺ"
None, # "ㄻ"
None, # "ㄼ"
None, # "ㄽ"
None, # "ㄾ"
None, # "ㄿ"
None, # "ㅀ"
"[ㅁ마-밓]", # "ㅁ"
"[ㅂ바-빟]", # "ㅂ"
"[ㅃ빠-삫]", # "ㅃ"
None, # "ㅄ"
"[ㅅ사-싷]", # "ㅅ"
"[ㅆ싸-앃]", # "ㅆ"
"[ㅇ아-잏]", # "ㅇ"
"[ㅈ자-짛]", # "ㅈ"
"[ㅉ짜-찧]", # "ㅉ"
"[ㅊ차-칳]", # "ㅊ"
"[ㅋ카-킿]", # "ㅋ"
"[ㅌ타-팋]", # "ㅌ"
"[ㅍ파-핗]", # "ㅍ"
"[ㅎ하-힣]", # "ㅎ"
from .compose import get_jongseong
from .offset import JONGSEONG_COUNT, compat_jaum_offset, is_compat_jaum, is_syllable

__all__ = ["Searcher"]


# Lookup table indexed by `compat_jaum_offset(c)`: for each modern compat
# jaum, a regex character class matching either the jaum itself or any
# syllable whose choseong corresponds to it (e.g. "ㄱ" -> "[ㄱ가-깋]").
# Entries are `None` for jaums that only ever occur as a jongseong
# (e.g. "ㄳ") and therefore never begin a syllable.
CHOSEONG_SEARCH_PATTERN = [
    "[ㄱ가-깋]",
    "[ㄲ까-낗]",
    None,  # ㄳ
    "[ㄴ나-닣]",
    None,  # ㄵ
    None,  # ㄶ
    "[ㄷ다-딯]",
    "[ㄸ따-띻]",
    "[ㄹ라-맇]",
    None,  # ㄺ
    None,  # ㄻ
    None,  # ㄼ
    None,  # ㄽ
    None,  # ㄾ
    None,  # ㄿ
    None,  # ㅀ
    "[ㅁ마-밓]",
    "[ㅂ바-빟]",
    "[ㅃ빠-삫]",
    None,  # ㅄ
    "[ㅅ사-싷]",
    "[ㅆ싸-앃]",
    "[ㅇ아-잏]",
    "[ㅈ자-짛]",
    "[ㅉ짜-찧]",
    "[ㅊ차-칳]",
    "[ㅋ카-킿]",
    "[ㅌ타-팋]",
    "[ㅍ파-핗]",
    "[ㅎ하-힣]",
]
# fmt: on


# DOC: did you know? writing human language is a lot harder than programming language
Expand All @@ -61,42 +61,43 @@ class Searcher:
# Widen a syllable that lacks a jongseong so it also matches its
# jongseong-completed forms (consulted in `_search_pattern`).
jongseong_completion: bool
# Apply relaxed matching to the final character of the query, treating
# it as possibly still being typed.
incremental: bool
# NOTE(review): not referenced in the code visible here — presumably
# enables fuzzy/lenient matching; confirm against the full file.
fuzzy: bool
# FEAT: LATER: sort-by, regex flags, filter, search/match/fullmatch

def _search_pattern(self, c: str, /) -> str:
    """Return a regex fragment matching *c* under the enabled options.

    Characters not widened by any option are returned unchanged.
    """
    # Choseong search: a lone compat jaum also matches any syllable that
    # starts with it. Jongseong-only jaums have no table entry (None)
    # and fall back to the literal character.
    if self.choseong_search and is_compat_jaum(c):
        widened = CHOSEONG_SEARCH_PATTERN[compat_jaum_offset(c)]
        return widened if widened is not None else c

    # Jongseong completion: a syllable with no jongseong matches each of
    # its jongseong-completed variants via a contiguous codepoint range.
    # NOTE(review): if JONGSEONG_COUNT includes the empty-jongseong slot
    # (i.e. equals 28), this range overshoots by one codepoint — the old
    # implementation used `JUNGSEONG_COEF - 1`; confirm against `offset`.
    if self.jongseong_completion and is_syllable(c) and get_jongseong(c) is None:
        upper = chr(ord(c) + JONGSEONG_COUNT)
        return f"[{c}-{upper}]"

    return c

@staticmethod
def _incremental_pattern(c: str, /) -> str:
    """Return a relaxed pattern for *c*, the final character of a query.

    The last character is treated as possibly still being typed, so it
    should match anything it could grow into. Work in progress: the
    syllable cases below raise ``NotImplementedError``.
    """
    # 1. Jaum
    # "ㄱ" -> "[ㄱ가-깋]"
    # A lone jaum may become the choseong of the syllable being typed;
    # jongseong-only jaums map to None in the table, so `or c` keeps
    # them as a literal match.
    if is_compat_jaum(c):
        return CHOSEONG_SEARCH_PATTERN[compat_jaum_offset(c)] or c

    # 2. Syllable
    # Anything that is neither a compat jaum nor a syllable passes
    # through unchanged.
    if not is_syllable(c):
        return c

    # 2.1. Has Jongseong
    jong = get_jongseong(c)
    if jong is not None:
        # 2.1.1. Single Jongseong
        # "일" -> "([일-잃]|이[ㄹ라-맇])"

        # 2.1.2. Composite Jongseong
        # "읽" -> "(읽|일[ㄱ가-깋])"
        raise NotImplementedError

    # 2.2. No Jongseong

    # 2.2.1. Composable Moum
    # "으" -> "[으-읳]"

def _search_pattern(self, text: str, /) -> Iterator[str]:
    """Generates regex patterns for each character in the text.

    Yields one regex fragment per character of *text*, widened according
    to the enabled options (`jongseong_completion`, `choseong_search`).

    Raises:
        ValueError: if *text* contains Hangul Jamo (NFD-normalized)
            characters, which this function does not support.
    """
    for c in re.escape(text):
        code = ord(c)

        # FEAT: LATER: composite jamo completion e.g. "우" -> "위", "일" -> "읽"
        if self.jongseong_completion and o.is_syllable(c):
            # checks if the syllable is missing a jongseong
            # if so, yield a pattern that matches any jongseong
            # e.g. "슈" -> "[슈-슣]" / "슉" -> "슉"
            # (jongseong index 0 means "no jongseong"; JUNGSEONG_COEF is
            # the number of jongseong slots per jungseong)
            if (code - o.SYLLABLE_BASE) % o.JUNGSEONG_COEF == 0:
                yield f"[{c}-{chr(code + o.JUNGSEONG_COEF - 1)}]"

        elif (
            self.choseong_search
            and o.MODERN_COMPAT_JAUM_BASE <= code <= o.MODERN_COMPAT_JAUM_END
        ):
            # compat jamo cannot be 1:1 mapped to jamo or syllable using algorithm
            # because jamo separates jongseong-only jaums while compat jamo does not
            # instead, consult the lookup table and yield a pattern that matches
            # choseong itself, or any syllable that starts with the choseong
            offset = ord(c) - o.MODERN_COMPAT_JAUM_BASE
            yield _COMPAT_JAMO_CHOSEONG_PATTERN[offset] or c

        elif o.is_jamo(c):
            # FEAT: preprocess text with `re.escape()` and `unicodedata.normalize("NFC", ...)`
            # | should this be the caller's responsibility or this function's?
            raise ValueError(
                "Hangul Jamo and NFD-normalized string are not supported"
            )

        # NOTE(review): as transcribed (source indentation was stripped),
        # this falls through after the widened branches above, emitting a
        # second fragment for the same character — likely a defect that
        # motivated this rewrite; confirm the original nesting.
        yield c

    if not (self.incremental and text):
        return

    # FEAT: get the last character, and do either jongseong completion or choseong search
# 2.2.2. Complete Moum
# "왜" -> "[왜-왷]"
raise NotImplementedError

0 comments on commit 6356f08

Please sign in to comment.