Skip to content

Commit

Permalink
Test languages exemplars canonical duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
moyogo committed Nov 1, 2022
1 parent a46f0ee commit 036b804
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions tests/test_data_languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,26 @@
# Language data loaded once at module import; the parametrized tests below
# iterate over it for language codes and index it by code to fetch each entry.
LANGUAGES = LoadLanguages()


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
    "exemplar_name",
    ["base", "auxiliary", "marks", "numerals", "punctuation", "index"],
)
def test_languages_exemplars_canonical_duplicates(lang_code, exemplar_name):
    """Check exemplar entries for canonically-equivalent duplicates.

    The exemplar string named by ``exemplar_name`` is split on whitespace and
    each entry is grouped under its NFC-normalized form. The test passes only
    when every group holds exactly one spelling and that spelling is the NFC
    form itself, i.e. no two distinct entries normalize to the same string
    and no entry is stored in a non-NFC form.

    Byte-identical repeated entries collapse inside the per-form set and are
    therefore not reported by this test.
    """
    language = LANGUAGES[lang_code]
    entries = getattr(language.exemplar_chars, exemplar_name).split()

    spellings_by_nfc = defaultdict(set)
    for entry in entries:
        # Entries wrapped in braces are compared without the braces
        # (presumably multi-character sequences -- confirm against the data
        # format). Note that lstrip/rstrip remove *all* leading "{" and
        # trailing "}" characters, not just one pair.
        is_braced = entry[0] == "{" and entry[-1] == "}"
        text = entry.lstrip("{").rstrip("}") if is_braced else entry
        nfc_form = unicodedata.normalize("NFC", text)
        spellings_by_nfc[nfc_form].add(text)

    # Build the observed and the ideal (count, spellings) pairs side by side
    # so that a failure diff shows the offending group next to what it should
    # have been.
    observed = []
    wanted = []
    for nfc_form, spellings in spellings_by_nfc.items():
        observed.append((len(spellings), spellings))
        wanted.append((1, {nfc_form}))

    assert observed == wanted


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
"exemplar_name",
Expand Down

0 comments on commit 036b804

Please sign in to comment.