Skip to content

Commit

Permalink
Merge pull request #9 from ShoukanLabs/dev
Browse files Browse the repository at this point in the history
Bump version, more robust lang detection
  • Loading branch information
korakoe authored Oct 9, 2024
2 parents e75ad69 + d142c02 commit dda8dc9
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 15 deletions.
28 changes: 15 additions & 13 deletions VoPho/langtokenizers/multicoded.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
from lingua import LanguageDetectorBuilder
import random
from termcolor import colored

Expand Down Expand Up @@ -99,6 +100,7 @@ class Tokenizer:
def __init__(self):
self.min_confidence = 0.5
self.manual_word_dict = load_manual_word_dict()
self.detector = LanguageDetectorBuilder.from_all_languages().build()

def detect_language(self, text):
# Adjusted logic to improve language detection
Expand All @@ -107,28 +109,28 @@ def detect_language(self, text):
if manual_lang:
return manual_lang
try:
langs = detect_langs(text)
for lang in langs:
if lang.prob >= self.min_confidence:
return lang.lang
return '??'
langs = self.detector.detect_language_of(text)
if langs is not None:
langs = langs.iso_code_639_1.name.lower()
print(langs, text)
return langs
else:
return '??'
except LangDetectException:
return '??'

@staticmethod
def is_writing_system(char, system):
def is_writing_system(self, char, system):
    """Check whether *char* belongs entirely to the given writing system.

    Args:
        char: A single character, or a multi-character string whose
            characters are each checked individually (e.g. punctuation
            runs, including space).
        system: Key into ``WRITING_SYSTEMS_UNICODE_RANGES``
            (e.g. ``'ja'``, ``'ko'``, ``'zh'``).

    Returns:
        bool: True when every code point of *char* falls inside one of
        the Unicode ranges registered for *system*. An unknown *system*
        yields an empty range list, so the result is False.
    """
    if len(char) > 1:
        # Check each character individually.
        # BUG FIX: the recursive call previously omitted the `system`
        # argument, raising TypeError on any multi-character input.
        return all(self.is_writing_system(c, system) for c in char)
    code_point = ord(char)
    return any(
        start <= code_point <= end
        for start, end in WRITING_SYSTEMS_UNICODE_RANGES.get(system, [])
    )

@staticmethod
def detect_japanese_korean_chinese(text):
is_japanese = any(Tokenizer.is_writing_system(char, 'ja') for char in text)
is_korean = any(Tokenizer.is_writing_system(char, 'ko') for char in text)
is_chinese = any(Tokenizer.is_writing_system(char, 'zh') for char in text)
def detect_japanese_korean_chinese(self, text):
is_japanese = any(self.is_writing_system(char, 'ja') for char in text)
is_korean = any(self.is_writing_system(char, 'ko') for char in text)
is_chinese = any(self.is_writing_system(char, 'zh') for char in text)

if is_japanese:
return "ja"
Expand Down Expand Up @@ -277,7 +279,7 @@ def tokenize(self, text, group=True):

# Main function
if __name__ == "__main__":
input_text = "hello, 音素のテストを行うことは、発音の理解を深めるために重要です。"
input_text = "hello, how are you? the grandiosity of the matter is astonishing. 音素のテストを行うことは、発音の理解を深めるために重要です。"
token = Tokenizer()
processed_text = token.tokenize(input_text)
print("Input text:")
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "VoPho"
version = "0.0.7"
version = "0.0.8"
description = "An easy to use Multilingual phonemization meta-library"
readme = "README.md"
authors = [
Expand Down Expand Up @@ -32,7 +32,8 @@ dependencies = [
"pypinyin==0.52.0",
"pythainlp==5.0.4",
"torch==2.2.0",
"langdetect==1.0.9"
"langdetect==1.0.9",
"lingua-language-detector==2.0.2"
]

[project.urls]
Expand Down

0 comments on commit dda8dc9

Please sign in to comment.