diff --git a/VoPho/langtokenizers/multicoded.py b/VoPho/langtokenizers/multicoded.py index 93cc224..b00555c 100644 --- a/VoPho/langtokenizers/multicoded.py +++ b/VoPho/langtokenizers/multicoded.py @@ -112,7 +112,6 @@ def detect_language(self, text): langs = self.detector.detect_language_of(text) if langs is not None: langs = langs.iso_code_639_1.name.lower() - print(langs, text) return langs else: return '??' @@ -151,6 +150,7 @@ def detect_writing_system(self, text): return None def is_punctuation(self, char): + not_punctuation = ["'", '"', "(", ")", "{", "}", "[", "]", "&"] if len(char) > 1: # Valid punctuation characters, including space return all(self.is_punctuation(c) for c in char) # Check each character individually @@ -158,7 +158,8 @@ def is_punctuation(self, char): # Single character check (as per your original logic) return (not char.isalnum() # Is not alphanumeric and not char.isspace() # Is not whitespace - and not self.is_writing_system(char, self.detect_writing_system(char))) # Is not in a writing system self.detect_writing_system( + and not self.is_writing_system(char, self.detect_writing_system(char)) + and char not in not_punctuation) # Is not in a writing system self.detect_writing_system( def split_text_by_writing_system(self, text): @@ -279,7 +280,7 @@ def tokenize(self, text, group=True): # Main function if __name__ == "__main__": - input_text = "测试音素对于加深对发音的理解非常重要。 音素のテストを行うことは、発音の理解を深めるために重要です。" + input_text = "don't do that please" token = Tokenizer() processed_text = token.tokenize(input_text) print("Input text:") diff --git a/pyproject.toml b/pyproject.toml index 79196b0..844a225 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "VoPho" -version = "0.0.8" +version = "0.0.9" description = "An easy to use Multilingual phonemization meta-library" readme = "README.md" authors = [