diff --git a/simplemma/strategies/defaultrules/lv.py b/simplemma/strategies/defaultrules/lv.py index 883b679..883b652 100644 --- a/simplemma/strategies/defaultrules/lv.py +++ b/simplemma/strategies/defaultrules/lv.py @@ -39,7 +39,7 @@ def apply_lv(token: str) -> Optional[str]: "Apply pre-defined rules for Latvian." - if len(token) < 4: + if len(token) < 5: return None return apply_rules(token, DEFAULT_RULES) diff --git a/tests/test_dictionary_pickler.py b/tests/test_dictionary_pickler.py index 2fc806f..333484c 100644 --- a/tests/test_dictionary_pickler.py +++ b/tests/test_dictionary_pickler.py @@ -20,6 +20,7 @@ def test_logic() -> None: # log warning mydict = dictionary_pickler._read_dict(testfile, "zz", silent=False) assert len(mydict) == 3 + # different length mydict = dictionary_pickler._read_dict(testfile, "en", silent=True) assert len(mydict) == 5 diff --git a/training/dictionary_pickler.py b/training/dictionary_pickler.py index a435087..a1eed9f 100644 --- a/training/dictionary_pickler.py +++ b/training/dictionary_pickler.py @@ -19,7 +19,7 @@ LOGGER = logging.getLogger(__name__) -INPUT_PUNCT = re.compile(r"[,:*/\+_]|.+-$|.+-\t") +INPUT_PUNCT = re.compile(r"[,:*/\+_]|.+-$|.+-\t|^-.+") SAFE_LIMIT = { "cs", "da",