Skip to content

Commit

Permalink
Set default to composed unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
mmcauliffe committed Dec 2, 2024
1 parent 0fc9b47 commit 73c3e09
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 2 deletions.
2 changes: 1 addition & 1 deletion montreal_forced_aligner/dictionary/multispeaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]:
if getattr(self, "unicode_decomposition", False):
characters = unicodedata.normalize("NFKD", word)
else:
- characters = word
+ characters = unicodedata.normalize("NFKC", word)
graphemes.update(characters)
if pretrained:
difference = set(pron) - self.non_silence_phones - self.silence_phones
Expand Down
4 changes: 4 additions & 0 deletions montreal_forced_aligner/g2p/phonetisaurus_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1744,6 +1744,8 @@ def initialize_training(self) -> None:
for pronunciation, word in query:
if self.unicode_decomposition:
word = unicodedata.normalize("NFKD", word)
+ else:
+ word = unicodedata.normalize("NFKC", word)
self.g2p_training_graphemes.update(word)
self.g2p_training_phones.update(pronunciation.split())

Expand Down Expand Up @@ -1814,6 +1816,8 @@ def initialize_training(self) -> None:
for pronunciation, word in query:
if self.unicode_decomposition:
word = unicodedata.normalize("NFKD", word)
+ else:
+ word = unicodedata.normalize("NFKC", word)
grapheme_count += len(word)
self.g2p_training_graphemes.update(word)
self.g2p_num_training_pronunciations += 1
Expand Down
4 changes: 3 additions & 1 deletion montreal_forced_aligner/g2p/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def __init__(
validation_proportion: float = 0.1,
num_pronunciations: int = 0,
evaluation_mode: bool = False,
- unicode_decomposition: bool = True,
+ unicode_decomposition: bool = False,
**kwargs,
):
super().__init__(**kwargs)
Expand Down Expand Up @@ -782,6 +782,8 @@ def initialize_training(self) -> None:
continue
if self.unicode_decomposition:
word = unicodedata.normalize("NFKD", word)
+ else:
+ word = unicodedata.normalize("NFKC", word)
self.g2p_training_graphemes.update(word)
for p in pronunciations:
self.g2p_training_phones.update(p.split())
Expand Down

0 comments on commit 73c3e09

Please sign in to comment.