diff --git a/VoPho/langtokenizers/manual_word_dict.json b/VoPho/langtokenizers/manual_word_dict.json new file mode 100644 index 0000000..a64f510 --- /dev/null +++ b/VoPho/langtokenizers/manual_word_dict.json @@ -0,0 +1,177 @@ +{ + "hello": "en", + "hi": "en", + "hey": "en", + "the": "en", + "and": "en", + "in": "en", + "that": "en", + "have": "en", + "it": "en", + "for": "en", + "not": "en", + "on": "en", + "with": "en", + "he": "en", + "as": "en", + "you": "en", + "do": "en", + "at": "en", + "this": "en", + "but": "en", + "his": "en", + "by": "en", + "from": "en", + "they": "en", + "we": "en", + "say": "en", + "her": "en", + "she": "en", + "or": "en", + "an": "en", + "will": "en", + "my": "en", + "one": "en", + "all": "en", + "would": "en", + "there": "en", + "their": "en", + "what": "en", + "up": "en", + "out": "en", + "if": "en", + "who": "en", + "get": "en", + "which": "en", + "go": "en", + "me": "en", + "when": "en", + "make": "en", + "can": "en", + "like": "en", + "time": "en", + "no": "en", + "just": "en", + "him": "en", + "know": "en", + "take": "en", + "people": "en", + "into": "en", + "year": "en", + "your": "en", + "good": "en", + "some": "en", + "could": "en", + "them": "en", + "see": "en", + "other": "en", + "than": "en", + "then": "en", + "now": "en", + "look": "en", + "only": "en", + "come": "en", + "its": "en", + "over": "en", + "also": "en", + "back": "en", + "after": "en", + "use": "en", + "two": "en", + "how": "en", + "our": "en", + "work": "en", + "first": "en", + "well": "en", + "way": "en", + "even": "en", + "new": "en", + "want": "en", + "because": "en", + "any": "en", + "these": "en", + "give": "en", + "day": "en", + "most": "en", + "suppose": "en", + "think": "en", + "place": "en", + "life": "en", + "where": "en", + "help": "en", + "little": "en", + "few": "en", + "long": "en", + "never": "en", + "always": "en", + "might": "en", + "around": "en", + "high": "en", + "old": "en", + "contemplate": "en", + "ephemeral": "en", + "juxtapose": "en", + "melancholy": "en", + "nuance": "en", + "paradox": "en", + "quintessential": "en", + "serendipity": "en", + "ubiquitous": "en", + "vicarious": "en", + "zealous": "en", + "ambiguous": "en", + "benevolent": "en", + "candid": "en", + "diligent": "en", + "empathy": "en", + "frustrate": "en", + "genuine": "en", + "haphazard": "en", + "impeccable": "en", + "jubilant": "en", + "kinetic": "en", + "luminous": "en", + "meticulous": "en", + "nostalgia": "en", + "perception": "en", + "resilience": "en", + "sophisticated": "en", + "tenacious": "en", + "vulnerable": "en", + "analyze": "en", + "articulate": "en", + "collaborate": "en", + "cultivate": "en", + "elucidate": "en", + "enhance": "en", + "evaluate": "en", + "facilitate": "en", + "implement": "en", + "innovate": "en", + "integrate": "en", + "perceive": "en", + "procrastinate": "en", + "reiterate": "en", + "synthesize": "en", + "transform": "en", + "validate": "en", + "apple": "en", + "banana": "en", + "grape": "en", + "orange": "en", + "peach": "en", + "pear": "en", + "kiwi": "en", + "mango": "en", + "pineapple": "en", + "strawberry": "en", + "blueberry": "en", + "watermelon": "en", + "pomegranate": "en", + "apricot": "en", + "blackberry": "en", + "raspberry": "en", + "curiosity": "en", + "fulfillment": "en", + "endures": "en" +} diff --git a/VoPho/langtokenizers/multicoded.py b/VoPho/langtokenizers/multicoded.py index 135838b..93cc224 100644 --- a/VoPho/langtokenizers/multicoded.py +++ b/VoPho/langtokenizers/multicoded.py @@ -279,7 +279,7 @@ def tokenize(self, text, group=True): # Main function if __name__ == "__main__": - input_text = "hello, how are you? the grandiosity of the matter is astonishing. 音素のテストを行うことは、発音の理解を深めるために重要です。" + input_text = "测试音素对于加深对发音的理解非常重要。 音素のテストを行うことは、発音の理解を深めるために重要です。" token = Tokenizer() processed_text = token.tokenize(input_text) print("Input text:")