Skip to content

Commit

Permalink
Merge pull request #9 from ShoukanLabs/dev
Browse files Browse the repository at this point in the history
Bump version, more robust lang detection
  • Loading branch information
korakoe authored Oct 9, 2024
2 parents e75ad69 + d142c02 commit dda8dc9
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 15 deletions.
28 changes: 15 additions & 13 deletions VoPho/langtokenizers/multicoded.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
from lingua import LanguageDetectorBuilder
import random
from termcolor import colored

Expand Down Expand Up @@ -99,6 +100,7 @@ class Tokenizer:
def __init__(self):
self.min_confidence = 0.5
self.manual_word_dict = load_manual_word_dict()
self.detector = LanguageDetectorBuilder.from_all_languages().build()

def detect_language(self, text):
# Adjusted logic to improve language detection
Expand All @@ -107,28 +109,28 @@ def detect_language(self, text):
if manual_lang:
return manual_lang
try:
langs = detect_langs(text)
for lang in langs:
if lang.prob >= self.min_confidence:
return lang.lang
return '??'
langs = self.detector.detect_language_of(text)
if langs is not None:
langs = langs.iso_code_639_1.name.lower()
print(langs, text)
return langs
else:
return '??'
except LangDetectException:
return '??'

@staticmethod
def is_writing_system(char, system):
def is_writing_system(self, char, system):
    """Check whether *char* belongs entirely to the given writing system.

    Args:
        char: A single character, or a multi-character string whose
            characters are each checked individually (e.g. punctuation
            runs, including space).
        system: Key into ``WRITING_SYSTEMS_UNICODE_RANGES``
            (e.g. ``'ja'``, ``'ko'``, ``'zh'``).

    Returns:
        bool: True when every code point of *char* falls inside one of
        the Unicode ranges registered for *system*. An unknown *system*
        yields an empty range list, so the result is False.
    """
    if len(char) > 1:
        # Check each character individually.
        # BUG FIX: the recursive call previously omitted the `system`
        # argument, raising TypeError on any multi-character input.
        return all(self.is_writing_system(c, system) for c in char)
    code_point = ord(char)
    return any(
        start <= code_point <= end
        for start, end in WRITING_SYSTEMS_UNICODE_RANGES.get(system, [])
    )

@staticmethod
def detect_japanese_korean_chinese(text):
is_japanese = any(Tokenizer.is_writing_system(char, 'ja') for char in text)
is_korean = any(Tokenizer.is_writing_system(char, 'ko') for char in text)
is_chinese = any(Tokenizer.is_writing_system(char, 'zh') for char in text)
def detect_japanese_korean_chinese(self, text):
is_japanese = any(self.is_writing_system(char, 'ja') for char in text)
is_korean = any(self.is_writing_system(char, 'ko') for char in text)
is_chinese = any(self.is_writing_system(char, 'zh') for char in text)

if is_japanese:
return "ja"
Expand Down Expand Up @@ -277,7 +279,7 @@ def tokenize(self, text, group=True):

# Main function
if __name__ == "__main__":
input_text = "hello, 音素のテストを行うことは、発音の理解を深めるために重要です。"
input_text = "hello, how are you? the grandiosity of the matter is astonishing. 音素のテストを行うことは、発音の理解を深めるために重要です。"
token = Tokenizer()
processed_text = token.tokenize(input_text)
print("Input text:")
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "VoPho"
version = "0.0.7"
version = "0.0.8"
description = "An easy to use Multilingual phonemization meta-library"
readme = "README.md"
authors = [
Expand Down Expand Up @@ -32,7 +32,8 @@ dependencies = [
"pypinyin==0.52.0",
"pythainlp==5.0.4",
"torch==2.2.0",
"langdetect==1.0.9"
"langdetect==1.0.9",
"lingua-language-detector==2.0.2"
]

[project.urls]
Expand Down

0 comments on commit dda8dc9

Please sign in to comment.