diff --git a/ricecake/search.py b/ricecake/search.py
index ff19565..6c9edf8 100644
--- a/ricecake/search.py
+++ b/ricecake/search.py
@@ -1,45 +1,48 @@
 """Generates regex patterns tailored for searching Korean texts."""
 
-import re
-from collections.abc import Iterator
 from dataclasses import dataclass
 
-from . import offset as o
-
-# fmt: off
-_COMPAT_JAMO_CHOSEONG_PATTERN = [
-    "[ㄱ가-깋]",  # "ㄱ"
-    "[ㄲ까-낗]",  # "ㄲ"
-    None,  # "ㄳ"
-    "[ㄴ나-닣]",  # "ㄴ"
-    None,  # "ㄵ"
-    None,  # "ㄶ"
-    "[ㄷ다-딯]",  # "ㄷ"
-    "[ㄸ따-띻]",  # "ㄸ"
-    "[ㄹ라-맇]",  # "ㄹ"
-    None,  # "ㄺ"
-    None,  # "ㄻ"
-    None,  # "ㄼ"
-    None,  # "ㄽ"
-    None,  # "ㄾ"
-    None,  # "ㄿ"
-    None,  # "ㅀ"
-    "[ㅁ마-밓]",  # "ㅁ"
-    "[ㅂ바-빟]",  # "ㅂ"
-    "[ㅃ빠-삫]",  # "ㅃ"
-    None,  # "ㅄ"
-    "[ㅅ사-싷]",  # "ㅅ"
-    "[ㅆ싸-앃]",  # "ㅆ"
-    "[ㅇ아-잏]",  # "ㅇ"
-    "[ㅈ자-짛]",  # "ㅈ"
-    "[ㅉ짜-찧]",  # "ㅉ"
-    "[ㅊ차-칳]",  # "ㅊ"
-    "[ㅋ카-킿]",  # "ㅋ"
-    "[ㅌ타-팋]",  # "ㅌ"
-    "[ㅍ파-핗]",  # "ㅍ"
-    "[ㅎ하-힣]",  # "ㅎ"
+from .compose import get_jongseong
+from .offset import JONGSEONG_COUNT, compat_jaum_offset, is_compat_jaum, is_syllable
+
+__all__ = ["Searcher"]
+
+
+# Lookup table indexed by `compat_jaum_offset(c)`. Each pattern matches the
+# compatibility jaum itself or any syllable that starts with it; `None` marks
+# consonant clusters (e.g. ㄳ) that have no pattern, so callers fall back to `c`.
+CHOSEONG_SEARCH_PATTERN = [
+    "[ㄱ가-깋]",
+    "[ㄲ까-낗]",
+    None,  # ㄳ
+    "[ㄴ나-닣]",
+    None,  # ㄵ
+    None,  # ㄶ
+    "[ㄷ다-딯]",
+    "[ㄸ따-띻]",
+    "[ㄹ라-맇]",
+    None,  # ㄺ
+    None,  # ㄻ
+    None,  # ㄼ
+    None,  # ㄽ
+    None,  # ㄾ
+    None,  # ㄿ
+    None,  # ㅀ
+    "[ㅁ마-밓]",
+    "[ㅂ바-빟]",
+    "[ㅃ빠-삫]",
+    None,  # ㅄ
+    "[ㅅ사-싷]",
+    "[ㅆ싸-앃]",
+    "[ㅇ아-잏]",
+    "[ㅈ자-짛]",
+    "[ㅉ짜-찧]",
+    "[ㅊ차-칳]",
+    "[ㅋ카-킿]",
+    "[ㅌ타-팋]",
+    "[ㅍ파-핗]",
+    "[ㅎ하-힣]",
 ]
-# fmt: on
 
 # DOC: did you know? writing human language is a lot harder than programming language
 
@@ -61,42 +64,63 @@ class Searcher:
     jongseong_completion: bool
     incremental: bool
     fuzzy: bool
+    # FEAT: LATER: sort-by, regex flags, filter, search/match/fullmatch
+
+    def _search_pattern(self, c: str, /) -> str:
+        """Return the regex fragment that matches the single character ``c``.
+
+        With ``choseong_search``, a compatibility jaum maps to its entry in
+        ``CHOSEONG_SEARCH_PATTERN`` (or stays as-is when that entry is ``None``).
+        With ``jongseong_completion``, a syllable without a jongseong maps to a
+        character range starting at that syllable (intended: "슈" -> "[슈-슣]").
+        Any other character is returned unchanged.
+
+        NOTE(review): ``c`` is not regex-escaped here, whereas the previous
+        implementation iterated over ``re.escape(text)`` -- presumably the
+        caller escapes now; TODO confirm.
+        """
+        if self.choseong_search and is_compat_jaum(c):
+            return CHOSEONG_SEARCH_PATTERN[compat_jaum_offset(c)] or c
+
+        if self.jongseong_completion and is_syllable(c) and get_jongseong(c) is None:
+            # NOTE(review): assumes JONGSEONG_COUNT == 27 so that the range ends
+            # at the last jongseong of this syllable block -- TODO confirm.
+            return f"[{c}-{chr(ord(c) + JONGSEONG_COUNT)}]"
+
+        return c
+
+    @staticmethod
+    def _incremental_pattern(c: str, /) -> str:
+        """Return the regex fragment for ``c`` in incremental search.
+
+        Work in progress: compatibility jaums and non-syllable characters are
+        handled, every syllable still raises ``NotImplementedError``. The
+        numbered comments below sketch the intended patterns.
+        """
+        # 1. Jaum
+        # "ㄱ" -> "[ㄱ가-깋]"
+        if is_compat_jaum(c):
+            return CHOSEONG_SEARCH_PATTERN[compat_jaum_offset(c)] or c
+
+        # 2. Syllable
+        if not is_syllable(c):
+            return c
+
+        # 2.1. Has Jongseong
+        jong = get_jongseong(c)
+        if jong is not None:
+            # 2.1.1. Single Jongseong
+            # "일" -> "([일-잃]|이[ㄹ라-맇])"
+
+            # 2.1.2. Composite Jongseong
+            # "읽" -> "(읽|일[ㄱ가-깋])"
+            raise NotImplementedError
+
+        # 2.2. No Jongseong
+
+        # 2.2.1. Composable Moum
+        # "으" -> "[으-읳]"
 
-    def _search_pattern(self, text: str, /) -> Iterator[str]:
-        """Generates regex patterns for each character in the text."""
-        for c in re.escape(text):
-            code = ord(c)
-
-            # FEAT: LATER: composite jamo completion e.g. "우" -> "위", "일" -> "읽"
-            if self.jongseong_completion and o.is_syllable(c):
-                # checks if the syllable is missing a jongseong
-                # if so, yield a pattern that matches any jongseong
-                # e.g. "슈" -> "[슈-슣]" / "슉" -> "슉"
-                if (code - o.SYLLABLE_BASE) % o.JUNGSEONG_COEF == 0:
-                    yield f"[{c}-{chr(code + o.JUNGSEONG_COEF - 1)}]"
-
-            elif (
-                self.choseong_search
-                and o.MODERN_COMPAT_JAUM_BASE <= code <= o.MODERN_COMPAT_JAUM_END
-            ):
-                # compat jamo cannot be 1:1 mapped to jamo or syllable using algorithm
-                # because jamo separates jongseong-only jaums while compat jamo does not
-                # instead, consult the lookup table and yield a pattern that matches
-                # choseong itself, or any syllable that starts with the choseong
-                offset = ord(c) - o.MODERN_COMPAT_JAUM_BASE
-                yield _COMPAT_JAMO_CHOSEONG_PATTERN[offset] or c
-
-            elif o.is_jamo(c):
-                # FEAT: preprocess text with `re.escape()` and `unicodedata.normalize("NFC", ...)`
-                # | should this be the caller's responsibility or this function's?
-                raise ValueError(
-                    "Hangul Jamo and NFD-normalized string are not supported"
-                )
-
-            yield c
-
-        if not (self.incremental and text):
-            return
-
-        # FEAT: get the last character, and do either jongseong completion or choseong search
+        # 2.2.2. Complete Moum
+        # "왜" -> "[왜-왷]"
         raise NotImplementedError