Skip to content

Commit

Permalink
add manual_word_dict
Browse files — browse the repository at this point in the history
  • Loading branch information
korakoe committed Oct 9, 2024
1 parent d142c02 commit b6a3286
Show file tree
Hide file tree
Showing 2 changed files with 178 additions and 1 deletion.
177 changes: 177 additions & 0 deletions VoPho/langtokenizers/manual_word_dict.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
{
"hello": "en",
"hi": "en",
"hey": "en",
"the": "en",
"and": "en",
"in": "en",
"that": "en",
"have": "en",
"it": "en",
"for": "en",
"not": "en",
"on": "en",
"with": "en",
"he": "en",
"as": "en",
"you": "en",
"do": "en",
"at": "en",
"this": "en",
"but": "en",
"his": "en",
"by": "en",
"from": "en",
"they": "en",
"we": "en",
"say": "en",
"her": "en",
"she": "en",
"or": "en",
"an": "en",
"will": "en",
"my": "en",
"one": "en",
"all": "en",
"would": "en",
"there": "en",
"their": "en",
"what": "en",
"up": "en",
"out": "en",
"if": "en",
"who": "en",
"get": "en",
"which": "en",
"go": "en",
"me": "en",
"when": "en",
"make": "en",
"can": "en",
"like": "en",
"time": "en",
"no": "en",
"just": "en",
"him": "en",
"know": "en",
"take": "en",
"people": "en",
"into": "en",
"year": "en",
"your": "en",
"good": "en",
"some": "en",
"could": "en",
"them": "en",
"see": "en",
"other": "en",
"than": "en",
"then": "en",
"now": "en",
"look": "en",
"only": "en",
"come": "en",
"its": "en",
"over": "en",
"also": "en",
"back": "en",
"after": "en",
"use": "en",
"two": "en",
"how": "en",
"our": "en",
"work": "en",
"first": "en",
"well": "en",
"way": "en",
"even": "en",
"new": "en",
"want": "en",
"because": "en",
"any": "en",
"these": "en",
"give": "en",
"day": "en",
"most": "en",
"suppose": "en",
"think": "en",
"place": "en",
"life": "en",
"where": "en",
"help": "en",
"little": "en",
"few": "en",
"long": "en",
"never": "en",
"always": "en",
"might": "en",
"around": "en",
"high": "en",
"old": "en",
"contemplate": "en",
"ephemeral": "en",
"juxtapose": "en",
"melancholy": "en",
"nuance": "en",
"paradox": "en",
"quintessential": "en",
"serendipity": "en",
"ubiquitous": "en",
"vicarious": "en",
"zealous": "en",
"ambiguous": "en",
"benevolent": "en",
"candid": "en",
"diligent": "en",
"empathy": "en",
"frustrate": "en",
"genuine": "en",
"haphazard": "en",
"impeccable": "en",
"jubilant": "en",
"kinetic": "en",
"luminous": "en",
"meticulous": "en",
"nostalgia": "en",
"perception": "en",
"resilience": "en",
"sophisticated": "en",
"tenacious": "en",
"vulnerable": "en",
"analyze": "en",
"articulate": "en",
"collaborate": "en",
"cultivate": "en",
"elucidate": "en",
"enhance": "en",
"evaluate": "en",
"facilitate": "en",
"implement": "en",
"innovate": "en",
"integrate": "en",
"perceive": "en",
"procrastinate": "en",
"reiterate": "en",
"synthesize": "en",
"transform": "en",
"validate": "en",
"apple": "en",
"banana": "en",
"grape": "en",
"orange": "en",
"peach": "en",
"pear": "en",
"kiwi": "en",
"mango": "en",
"pineapple": "en",
"strawberry": "en",
"blueberry": "en",
"watermelon": "en",
"pomegranate": "en",
"apricot": "en",
"blackberry": "en",
"raspberry": "en",
"curiosity": "en",
"fulfillment": "en",
"endures": "en"
}
2 changes: 1 addition & 1 deletion VoPho/langtokenizers/multicoded.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def tokenize(self, text, group=True):

# Main function
if __name__ == "__main__":
input_text = "hello, how are you? the grandiosity of the matter is astonishing. 音素のテストを行うことは、発音の理解を深めるために重要です。"
input_text = "测试音素对于加深对发音的理解非常重要。 音素のテストを行うことは、発音の理解を深めるために重要です。"
token = Tokenizer()
processed_text = token.tokenize(input_text)
print("Input text:")
Expand Down

0 comments on commit b6a3286

Please sign in to comment.