Skip to content

Commit

Permalink
simplify is_known() function (#126)
Browse files Browse the repository at this point in the history
* feat: simplify is_known function as an alias of DictionaryLookupStrategy

* fix: use _legacy_dictionary_factory in is_known check

* fix: removed unused greedy option from is_known

* test: add multilingual test for is_known function
  • Loading branch information
juanjoDiaz authored May 17, 2024
1 parent 6049223 commit 39ff74d
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 45 deletions.
38 changes: 9 additions & 29 deletions simplemma/lemmatizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,30 +80,6 @@ def __init__(
self._fallback_lemmatization_strategy = fallback_lemmatization_strategy
self._cached_lemmatize = lru_cache(maxsize=cache_max_size)(self._lemmatize)

def is_known(
    self,
    token: str,
    lang: Union[str, Tuple[str, ...]],
) -> bool:
    """Tell whether a dictionary lookup finds the token in any language.

    Args:
        token: The token to look up.
        lang: A language code, or a tuple of language codes, to search.

    Returns:
        bool: True as soon as the lookup returns a lemma (a non-None
            value) for one of the languages, False if none of them do.

    Raises:
        Whatever `_control_input_type` or `validate_lang_input` raise
        when they reject the token or the language argument.
    """
    # Validate the inputs before touching any dictionary.
    _control_input_type(token)
    lang_codes = validate_lang_input(lang)

    # A fresh lookup strategy built with its default arguments is used here.
    lookup = DictionaryLookupStrategy()
    for code in lang_codes:
        if lookup.get_lemma(token, code) is not None:
            return True
    return False

def lemmatize(
self,
token: str,
Expand Down Expand Up @@ -179,9 +155,7 @@ def get_lemmas_in_text(
)


def is_known(
token: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
) -> bool:
def is_known(token: str, lang: Union[str, Tuple[str, ...]]) -> bool:
"""Check if a token is known in the specified language(s).
Args:
Expand All @@ -191,8 +165,14 @@ def is_known(
Returns:
bool: True if the token is known, False otherwise.
"""
lemmatizer = _legacy_lemmatizer if not greedy else _legacy_greedy_lemmatizer
return lemmatizer.is_known(token, lang)

_control_input_type(token)
lang = validate_lang_input(lang)

dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory)
return any(
dictionary_lookup.get_lemma(token, lang_code) is not None for lang_code in lang
)


def lemmatize(
Expand Down
21 changes: 5 additions & 16 deletions tests/test_lemmatizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,27 +452,16 @@ def test_subwords() -> None:


def test_is_known() -> None:
# logic
with pytest.raises(TypeError):
assert Lemmatizer().is_known(None, lang="en") is None # type: ignore[arg-type]
with pytest.raises(TypeError):
assert is_known(None, lang="en") is None # type: ignore[arg-type]
with pytest.raises(ValueError):
assert Lemmatizer().is_known("", lang="en") is None
with pytest.raises(ValueError):
assert is_known("", lang="en") is None

assert (
Lemmatizer().is_known("FanCY", lang="en")
== is_known("FanCY", lang="en")
== True
)
# known words
assert (
Lemmatizer().is_known("Fancy-String", lang="en")
== is_known("Fancy-String", lang="en")
== False
)
assert is_known("FanCY", lang="en") == True
assert is_known("Fancy-String", lang="en") == False

assert is_known("espejos", lang=("es", "de")) == True
assert is_known("espejos", lang=("de", "es")) == True


def test_get_lemmas_in_text() -> None:
Expand Down

0 comments on commit 39ff74d

Please sign in to comment.