Skip to content

Commit

Permalink
chore: note search pattern API design
Browse files Browse the repository at this point in the history
  • Loading branch information
WieeRd committed Mar 24, 2024
1 parent 5b18815 commit 6356f08
Showing 1 changed file with 75 additions and 74 deletions.
149 changes: 75 additions & 74 deletions ricecake/search.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,45 @@
"""Generates regex patterns tailored for searching Korean texts."""

import re
from collections.abc import Iterator
from dataclasses import dataclass

from . import offset as o

# fmt: off
_COMPAT_JAMO_CHOSEONG_PATTERN = [
"[ㄱ가-깋]", # "ㄱ"
"[ㄲ까-낗]", # "ㄲ"
None, # "ㄳ"
"[ㄴ나-닣]", # "ㄴ"
None, # "ㄵ"
None, # "ㄶ"
"[ㄷ다-딯]", # "ㄷ"
"[ㄸ따-띻]", # "ㄸ"
"[ㄹ라-맇]", # "ㄹ"
None, # "ㄺ"
None, # "ㄻ"
None, # "ㄼ"
None, # "ㄽ"
None, # "ㄾ"
None, # "ㄿ"
None, # "ㅀ"
"[ㅁ마-밓]", # "ㅁ"
"[ㅂ바-빟]", # "ㅂ"
"[ㅃ빠-삫]", # "ㅃ"
None, # "ㅄ"
"[ㅅ사-싷]", # "ㅅ"
"[ㅆ싸-앃]", # "ㅆ"
"[ㅇ아-잏]", # "ㅇ"
"[ㅈ자-짛]", # "ㅈ"
"[ㅉ짜-찧]", # "ㅉ"
"[ㅊ차-칳]", # "ㅊ"
"[ㅋ카-킿]", # "ㅋ"
"[ㅌ타-팋]", # "ㅌ"
"[ㅍ파-핗]", # "ㅍ"
"[ㅎ하-힣]", # "ㅎ"
from .compose import get_jongseong
from .offset import JONGSEONG_COUNT, compat_jaum_offset, is_compat_jaum, is_syllable

__all__ = ["Searcher"]


# Lookup table indexed by `compat_jaum_offset(c)`: for each modern compat
# jaum, a regex character class matching either the jaum itself or any
# syllable whose choseong corresponds to it (e.g. "ㄱ" -> "[ㄱ가-깋]").
# Entries are `None` for jaums that only ever occur as a jongseong
# (e.g. "ㄳ") and therefore never begin a syllable.
CHOSEONG_SEARCH_PATTERN = [
    "[ㄱ가-깋]",
    "[ㄲ까-낗]",
    None,  # ㄳ
    "[ㄴ나-닣]",
    None,  # ㄵ
    None,  # ㄶ
    "[ㄷ다-딯]",
    "[ㄸ따-띻]",
    "[ㄹ라-맇]",
    None,  # ㄺ
    None,  # ㄻ
    None,  # ㄼ
    None,  # ㄽ
    None,  # ㄾ
    None,  # ㄿ
    None,  # ㅀ
    "[ㅁ마-밓]",
    "[ㅂ바-빟]",
    "[ㅃ빠-삫]",
    None,  # ㅄ
    "[ㅅ사-싷]",
    "[ㅆ싸-앃]",
    "[ㅇ아-잏]",
    "[ㅈ자-짛]",
    "[ㅉ짜-찧]",
    "[ㅊ차-칳]",
    "[ㅋ카-킿]",
    "[ㅌ타-팋]",
    "[ㅍ파-핗]",
    "[ㅎ하-힣]",
]
# fmt: on


# DOC: did you know? writing human language is a lot harder than programming language
Expand All @@ -61,42 +61,43 @@ class Searcher:
# Widen a syllable that lacks a jongseong so it also matches its
# jongseong-completed forms (consulted in `_search_pattern`).
jongseong_completion: bool
# Apply relaxed matching to the final character of the query, treating
# it as possibly still being typed.
incremental: bool
# NOTE(review): not referenced in the code visible here — presumably
# enables fuzzy/lenient matching; confirm against the full file.
fuzzy: bool
# FEAT: LATER: sort-by, regex flags, filter, search/match/fullmatch

def _search_pattern(self, c: str, /) -> str:
    """Return a regex fragment matching *c* under the enabled options.

    Characters not widened by any option are returned unchanged.
    """
    # Choseong search: a lone compat jaum also matches any syllable that
    # starts with it. Jongseong-only jaums have no table entry (None)
    # and fall back to the literal character.
    if self.choseong_search and is_compat_jaum(c):
        widened = CHOSEONG_SEARCH_PATTERN[compat_jaum_offset(c)]
        return widened if widened is not None else c

    # Jongseong completion: a syllable with no jongseong matches each of
    # its jongseong-completed variants via a contiguous codepoint range.
    # NOTE(review): if JONGSEONG_COUNT includes the empty-jongseong slot
    # (i.e. equals 28), this range overshoots by one codepoint — the old
    # implementation used `JUNGSEONG_COEF - 1`; confirm against `offset`.
    if self.jongseong_completion and is_syllable(c) and get_jongseong(c) is None:
        upper = chr(ord(c) + JONGSEONG_COUNT)
        return f"[{c}-{upper}]"

    return c

@staticmethod
def _incremental_pattern(c: str, /) -> str:
    """Return a relaxed pattern for *c*, the final character of a query.

    The last character is treated as possibly still being typed, so it
    should match anything it could grow into. Work in progress: the
    syllable cases below raise ``NotImplementedError``.
    """
    # 1. Jaum
    # "ㄱ" -> "[ㄱ가-깋]"
    # A lone jaum may become the choseong of the syllable being typed;
    # jongseong-only jaums map to None in the table, so `or c` keeps
    # them as a literal match.
    if is_compat_jaum(c):
        return CHOSEONG_SEARCH_PATTERN[compat_jaum_offset(c)] or c

    # 2. Syllable
    # Anything that is neither a compat jaum nor a syllable passes
    # through unchanged.
    if not is_syllable(c):
        return c

    # 2.1. Has Jongseong
    jong = get_jongseong(c)
    if jong is not None:
        # 2.1.1. Single Jongseong
        # "일" -> "([일-잃]|이[ㄹ라-맇])"

        # 2.1.2. Composite Jongseong
        # "읽" -> "(읽|일[ㄱ가-깋])"
        raise NotImplementedError

    # 2.2. No Jongseong

    # 2.2.1. Composable Moum
    # "으" -> "[으-읳]"

def _search_pattern(self, text: str, /) -> Iterator[str]:
    """Generates regex patterns for each character in the text.

    Yields one regex fragment per character of *text*, widened according
    to the enabled options (`jongseong_completion`, `choseong_search`).

    Raises:
        ValueError: if *text* contains Hangul Jamo (NFD-normalized)
            characters, which this function does not support.
    """
    for c in re.escape(text):
        code = ord(c)

        # FEAT: LATER: composite jamo completion e.g. "우" -> "위", "일" -> "읽"
        if self.jongseong_completion and o.is_syllable(c):
            # checks if the syllable is missing a jongseong
            # if so, yield a pattern that matches any jongseong
            # e.g. "슈" -> "[슈-슣]" / "슉" -> "슉"
            # (jongseong index 0 means "no jongseong"; JUNGSEONG_COEF is
            # the number of jongseong slots per jungseong)
            if (code - o.SYLLABLE_BASE) % o.JUNGSEONG_COEF == 0:
                yield f"[{c}-{chr(code + o.JUNGSEONG_COEF - 1)}]"

        elif (
            self.choseong_search
            and o.MODERN_COMPAT_JAUM_BASE <= code <= o.MODERN_COMPAT_JAUM_END
        ):
            # compat jamo cannot be 1:1 mapped to jamo or syllable using algorithm
            # because jamo separates jongseong-only jaums while compat jamo does not
            # instead, consult the lookup table and yield a pattern that matches
            # choseong itself, or any syllable that starts with the choseong
            offset = ord(c) - o.MODERN_COMPAT_JAUM_BASE
            yield _COMPAT_JAMO_CHOSEONG_PATTERN[offset] or c

        elif o.is_jamo(c):
            # FEAT: preprocess text with `re.escape()` and `unicodedata.normalize("NFC", ...)`
            # | should this be the caller's responsibility or this function's?
            raise ValueError(
                "Hangul Jamo and NFD-normalized string are not supported"
            )

        # NOTE(review): as transcribed (source indentation was stripped),
        # this falls through after the widened branches above, emitting a
        # second fragment for the same character — likely a defect that
        # motivated this rewrite; confirm the original nesting.
        yield c

    if not (self.incremental and text):
        return

    # FEAT: get the last character, and do either jongseong completion or choseong search
# 2.2.2. Complete Moum
# "왜" -> "[왜-왷]"
raise NotImplementedError

0 comments on commit 6356f08

Please sign in to comment.