Skip to content

Commit

Permalink
Merge pull request #12 from ShoukanLabs/dev
Browse files Browse the repository at this point in the history
Add manual tagging for phonemes and languages
  • Loading branch information
korakoe authored Oct 9, 2024
2 parents 6a6e9d8 + f8b3db5 commit b3ce1c8
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 7 deletions.
11 changes: 7 additions & 4 deletions VoPho/engine.py
Original file line number Diff line number Diff line change
def phonemize_for_language(self, text, lang):
    """
    Phonemize a text segment for one detected language.

    :param text: The text segment to phonemize
    :param lang: The language ID for phonemization
    :return: Phonemized text, or original text wrapped in <??> tags if language is not supported
    """
    # Text manually tagged as <phoneme> is already phonemized input;
    # return it untouched instead of looking up a phonemizer.
    if lang == "phoneme":
        return text
    phonemizer = self.get_phonemizer(lang)
    if phonemizer:
        return phonemizer.phonemize(text)
    return f"<??>{text}</??>"  # Return original text if no phonemizer available

def phonemize(self, input_text, output_dict=False):
"""
Expand Down
19 changes: 18 additions & 1 deletion VoPho/langtokenizers/multicoded.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
'mr': 'red',
'he': 'white',
'th': 'blue',
'phoneme': 'blue',
'??': 'red'
}

Expand Down Expand Up @@ -269,12 +270,28 @@ def _group_segments(self, text):
return ''.join(grouped_segments).replace("<punctuation>", "").replace("</punctuation>", " ")

def tokenize(self, text, group=True):
    """
    Tag ``text`` with per-language markup.

    Segments already wrapped in tags (e.g. manual ``<phoneme>...</phoneme>``
    annotations) are passed through untouched; only untagged spans are run
    through the language tokenizer.

    :param text: Input text, possibly containing pre-tagged segments
    :param group: When True, merge adjacent segments via ``_group_segments``
    :return: Tagged text; warns if any segment could not be identified
    """
    # Split the input into already-tagged segments and untagged runs.
    # NOTE(review): the pattern does not verify that the open and close tag
    # names match (e.g. <a>...</b> would still count as one tagged segment).
    pattern = r'(<\w+>.*?</\w+>)|([^<]+)'  # tagged segment | untagged text
    segments = re.findall(pattern, text)

    processed_segments = []
    for tagged_segment, untagged_segment in segments:
        if tagged_segment:  # Already tagged: keep as-is.
            processed_segments.append(tagged_segment)
        else:  # Untagged: detect language and tag it as usual.
            processed_segments.append(self._tokenize(untagged_segment))

    result = ''.join(processed_segments)

    if group:
        result = self._group_segments(result)

    # <??> marks spans whose language/writing system could not be detected.
    if "<??>" in result:
        warnings.warn(
            "Your output contains tokenization errors. We were unable to detect a language or writing system, or there was an error in processing.")

    return result


Expand Down
2 changes: 1 addition & 1 deletion examples/phonemize_texts.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from VoPho.engine import Phonemizer
from time import time

# <phoneme> tags mark text that should bypass language detection and be
# treated as already-phonemized (manual tagging).
input_text = "<phoneme>I suppose i can</phoneme>, dont take my word for it though. 音素のテストを行うことは、発音の理解を深めるために重要です。"

engine = Phonemizer()
start = time()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "VoPho"
version = "0.0.10"
description = "An easy to use Multilingual phonemization meta-library"
readme = "README.md"
authors = [
Expand Down

0 comments on commit b3ce1c8

Please sign in to comment.