Skip to content

Commit

Permalink
Merge pull request #11 from ShoukanLabs/dev
Browse files Browse the repository at this point in the history
resolve punctuation errors
  • Loading branch information
korakoe authored Oct 9, 2024
2 parents 0d12cfa + d9de86c commit 6a6e9d8
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 4 deletions.
7 changes: 4 additions & 3 deletions VoPho/langtokenizers/multicoded.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def detect_language(self, text):
langs = self.detector.detect_language_of(text)
if langs is not None:
langs = langs.iso_code_639_1.name.lower()
print(langs, text)
return langs
else:
return '??'
Expand Down Expand Up @@ -151,14 +150,16 @@ def detect_writing_system(self, text):
return None

def is_punctuation(self, char):
    """Return True when every character of ``char`` is punctuation.

    A character counts as punctuation only if all of the following hold:
    it is not alphanumeric, it is not whitespace, it is not one of the
    excluded quote/bracket/ampersand symbols, and
    ``self.is_writing_system`` does not accept it for the writing system
    reported by ``self.detect_writing_system``.

    Args:
        char: A single character or a longer string. Strings of two or
            more characters are checked one character at a time and must
            consist solely of punctuation.

    Returns:
        bool: True if ``char`` is made up only of punctuation.
    """
    # Quotes, brackets and "&" are deliberately NOT classed as punctuation,
    # so e.g. the apostrophe in "don't" is not reported as punctuation.
    # Built once per call instead of once per character.
    not_punctuation = frozenset(("'", '"', "(", ")", "{", "}", "[", "]", "&"))

    # Two or more characters: test each one. Anything shorter is tested as
    # a single unit, exactly as it was passed in.
    candidates = char if len(char) > 1 else (char,)

    for c in candidates:
        # Cheap string checks first; they need no writing-system lookup.
        if c.isalnum() or c.isspace() or c in not_punctuation:
            return False
        # Characters that belong to a detected writing system are text,
        # not punctuation.
        if self.is_writing_system(c, self.detect_writing_system(c)):
            return False
    return True


def split_text_by_writing_system(self, text):
Expand Down Expand Up @@ -279,7 +280,7 @@ def tokenize(self, text, group=True):

# Main function
if __name__ == "__main__":
input_text = "测试音素对于加深对发音的理解非常重要。 音素のテストを行うことは、発音の理解を深めるために重要です。"
input_text = "don't do that please"
token = Tokenizer()
processed_text = token.tokenize(input_text)
print("Input text:")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "VoPho"
version = "0.0.8"
version = "0.0.9"
description = "An easy to use Multilingual phonemization meta-library"
readme = "README.md"
authors = [
Expand Down

0 comments on commit 6a6e9d8

Please sign in to comment.