Skip to content

Commit

Permalink
Implement variant-aware fuzzy_find function
Browse files Browse the repository at this point in the history
See #7
  • Loading branch information
thatbudakguy committed Aug 2, 2022
1 parent b2f9419 commit 2a91b02
Show file tree
Hide file tree
Showing 2 changed files with 3,225 additions and 3 deletions.
14 changes: 11 additions & 3 deletions bin/lib/util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from pathlib import Path
import json

import pandas as pd
from cihai.core import Cihai
Expand All @@ -22,6 +23,7 @@
# Lookup data loaded once at import time. The paths are relative, so they
# resolve against the current working directory of the importing process.
KR_UNICODE = pd.read_csv(Path("data/kr-unicode.csv"))
MC_BAXTER = Reconstruction(pd.read_csv(Path("data/GDR-SBGY-FULL.csv")))
# Placeholder: this binds an exception *instance*; nothing is raised here.
OC_BAXTER = NotImplementedError("TODO")
# Translation table for str.translate(), built from the JSON object stored in
# data/variants.json (str.maketrans requires every key of that mapping to be
# a single character). The file is decoded explicitly as UTF-8 so loading
# does not depend on the platform's locale encoding.
VARIANT_TABLE = str.maketrans(
    json.loads(Path("data/variants.json").read_text(encoding="utf-8"))
)

# Blank spaCy pipeline for the language code "och"; a sentencizer is the only
# component added here.
NLP = spacy.blank("och")
NLP.add_pipe("sentencizer")
Expand Down Expand Up @@ -214,6 +216,12 @@ def _convert(annotation: re.Match) -> str:
return EMPTY_ANNO.sub(_convert, text)


def get_variant(char: str, rc: Reconstruction, c: Cihai) -> str:
    """Look up a variant of ``char`` that has reconstruction data.

    Not implemented yet: the function performs no lookup and every call
    returns ``None``, regardless of the ``str`` return annotation.
    """
    return None
def fuzzy_find(phrase: str, text: str) -> int:
    """Locate ``phrase`` within ``text`` after collapsing variant characters.

    Both strings are passed through ``VARIANT_TABLE`` before searching, so
    characters that the table maps to the same replacement compare equal.

    Returns the value of ``str.find`` on the normalized strings: the lowest
    index at which the normalized phrase occurs in the normalized text, or
    -1 when it does not occur. Because 0 is a valid match position, compare
    the result against -1 instead of testing it for truthiness.
    """
    # NOTE(review): the index refers to the *normalized* text; it lines up
    # with the original ``text`` only if the table maps each character to
    # exactly one character -- confirm against data/variants.json.
    # TODO: use levenshtein for distance matching
    needle = phrase.translate(VARIANT_TABLE)
    haystack = text.translate(VARIANT_TABLE)
    return haystack.find(needle)
Loading

0 comments on commit 2a91b02

Please sign in to comment.