diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py index 0eac9db8..755ffe20 100644 --- a/montreal_forced_aligner/dictionary/multispeaker.py +++ b/montreal_forced_aligner/dictionary/multispeaker.py @@ -581,7 +581,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]: if getattr(self, "unicode_decomposition", False): characters = unicodedata.normalize("NFKD", word) else: - characters = word + characters = unicodedata.normalize("NFKC", word) graphemes.update(characters) if pretrained: difference = set(pron) - self.non_silence_phones - self.silence_phones diff --git a/montreal_forced_aligner/g2p/phonetisaurus_trainer.py b/montreal_forced_aligner/g2p/phonetisaurus_trainer.py index 08ea6639..45ab2371 100644 --- a/montreal_forced_aligner/g2p/phonetisaurus_trainer.py +++ b/montreal_forced_aligner/g2p/phonetisaurus_trainer.py @@ -1744,6 +1744,8 @@ def initialize_training(self) -> None: for pronunciation, word in query: if self.unicode_decomposition: word = unicodedata.normalize("NFKD", word) + else: + word = unicodedata.normalize("NFKC", word) self.g2p_training_graphemes.update(word) self.g2p_training_phones.update(pronunciation.split()) @@ -1814,6 +1816,8 @@ def initialize_training(self) -> None: for pronunciation, word in query: if self.unicode_decomposition: word = unicodedata.normalize("NFKD", word) + else: + word = unicodedata.normalize("NFKC", word) grapheme_count += len(word) self.g2p_training_graphemes.update(word) self.g2p_num_training_pronunciations += 1 diff --git a/montreal_forced_aligner/g2p/trainer.py b/montreal_forced_aligner/g2p/trainer.py index 413db3f0..52cc09a8 100644 --- a/montreal_forced_aligner/g2p/trainer.py +++ b/montreal_forced_aligner/g2p/trainer.py @@ -205,7 +205,7 @@ def __init__( validation_proportion: float = 0.1, num_pronunciations: int = 0, evaluation_mode: bool = False, - unicode_decomposition: bool = True, + unicode_decomposition: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -782,6 +782,8 @@ def initialize_training(self) -> None: continue if self.unicode_decomposition: word = unicodedata.normalize("NFKD", word) + else: + word = unicodedata.normalize("NFKC", word) self.g2p_training_graphemes.update(word) for p in pronunciations: self.g2p_training_phones.update(p.split())