Skip to content

Commit

Permalink
better handling of Latvian
Browse files: browse the repository at this point in the history
  • Loading branch information
adbar committed Nov 5, 2024
1 parent 97c3285 commit fac72b5
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 5 deletions.
4 changes: 2 additions & 2 deletions simplemma/strategies/defaultrules/lv.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@
# fallback
re.compile(r"(?:as|ai|ā|ām|ās)$"): "a",
re.compile(r"(?:ei|es|ē|ēm|ēs)$"): "e",
re.compile(r"(?:is|im|ī|iem|īs)$"): "is",
# re.compile(r"(?:os|us)$"): "s",
# re.compile(r"(?:is|im|ī|iem|īs)$"): "is",
# re.compile(r"(?:ēto|ēts)$"): "ēt",
}


def apply_lv(token: str) -> Optional[str]:
"Apply pre-defined rules for Latvian."
if len(token) < 5 or token[0].isupper():
if len(token) < 4:
return None

return apply_rules(token, DEFAULT_RULES)
Binary file modified simplemma/strategies/dictionaries/data/lv.plzma
Binary file not shown.
2 changes: 1 addition & 1 deletion simplemma/strategies/greedy_dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .dictionaries.dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
from .lemmatization_strategy import LemmatizationStrategy

SHORTER_GREEDY = {"bg", "et", "fi"}
SHORTER_GREEDY = {"bg", "et", "fi", "lv"}


class GreedyDictionaryLookupStrategy(LemmatizationStrategy):
Expand Down
5 changes: 3 additions & 2 deletions training/dictionary_pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

LOGGER = logging.getLogger(__name__)

INPUT_PUNCT = re.compile(r"[,:*/\+_]|^-|-\t")
INPUT_PUNCT = re.compile(r"[,:*/\+_]|.+-$|.+-\t")
SAFE_LIMIT = {
"cs",
"da",
Expand All @@ -31,6 +31,7 @@
"ga",
"hu",
"it",
"lv",
"pl",
"pt",
"ru",
Expand Down Expand Up @@ -86,7 +87,7 @@ def _read_dict(
and columns[1] != columns[0]
):
rule = DEFAULT_RULES[langcode](columns[1])
if rule is not None and rule != columns[1]:
if rule and rule != columns[0]:
print(columns[1], columns[0], rule)
# process
if columns[1] in mydict and mydict[columns[1]] != columns[0]:
Expand Down

0 comments on commit fac72b5

Please sign in to comment.