fix: remove lower and nfc_normalize from default cleaners

fixes #321
EveryVoiceTTS · Jun 21, 2024 · b605a66 · b605a66
1 parent 919ab4a
commit b605a66
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 34 deletions.
diff --git a/everyvoice/config/text_config.py b/everyvoice/config/text_config.py
@@ -5,7 +5,7 @@
 from everyvoice.config.shared_types import ConfigModel
 from everyvoice.config.utils import PossiblySerializedCallable
 from everyvoice.text.utils import normalize_text_helper
-from everyvoice.utils import collapse_whitespace, lower, nfc_normalize
+from everyvoice.utils import collapse_whitespace
 
 
 class Punctuation(BaseModel):
@@ -82,9 +82,7 @@ class TextConfig(ConfigModel):
     symbols: Symbols = Field(default_factory=Symbols)
     to_replace: Dict[str, str] = {}  # Happens before cleaners
     cleaners: list[PossiblySerializedCallable] = [
-        lower,
         collapse_whitespace,
-        nfc_normalize,
     ]
 
     @model_validator(mode="after")

diff --git a/everyvoice/tests/test_text.py b/everyvoice/tests/test_text.py
@@ -15,7 +15,12 @@
 from everyvoice.text.lookups import build_lookup, lookuptables_from_data
 from everyvoice.text.phonemizer import AVAILABLE_G2P_ENGINES, get_g2p_engine
 from everyvoice.text.text_processor import TextProcessor
-from everyvoice.utils import generic_psv_filelist_reader
+from everyvoice.utils import (
+    collapse_whitespace,
+    generic_psv_filelist_reader,
+    lower,
+    nfc_normalize,
+)
 
 
 class TextTest(BasicTestCase):
@@ -38,7 +43,7 @@ def test_text_to_sequence(self):
         self.assertEqual(self.base_text_processor.decode_tokens(sequence, ""), text)
 
     def test_token_sequence_to_text(self):
-        sequence = [25, 22, 29, 29, 32, 1, 40, 32, 35, 29, 21]
+        sequence = [51, 48, 55, 55, 58, 1, 66, 58, 61, 55, 47]
         self.assertEqual(self.base_text_processor.encode_text("hello world"), sequence)
 
     def test_hardcoded_symbols(self):
@@ -48,19 +53,31 @@ def test_hardcoded_symbols(self):
             "pad should be Unicode PAD symbol and index 0, whitespace should be index 1",
         )
 
-    def test_cleaners(self):
+    def test_cleaners_with_upper(self):
         text = "hello world"
         text_upper = "HELLO WORLD"
-        sequence = self.base_text_processor.encode_text(text_upper)
-        self.assertEqual(self.base_text_processor.decode_tokens(sequence, ""), text)
+        upper_text_processor = TextProcessor(
+            TextConfig(
+                cleaners=[collapse_whitespace, lower],
+                symbols=Symbols(letters=list(string.ascii_letters)),
+            ),
+        )
+        sequence = upper_text_processor.encode_text(text_upper)
+        self.assertEqual(upper_text_processor.decode_tokens(sequence, ""), text)
 
     def test_punctuation(self):
         text = "hello! How are you? My name's: foo;."
-        tokens = self.base_text_processor.apply_tokenization(
-            self.base_text_processor.normalize_text(text)
+        upper_text_processor = TextProcessor(
+            TextConfig(
+                cleaners=[collapse_whitespace, lower],
+                symbols=Symbols(letters=list(string.ascii_letters)),
+            ),
+        )
+        tokens = upper_text_processor.apply_tokenization(
+            upper_text_processor.normalize_text(text)
         )
         self.assertEqual(
-            self.base_text_processor.apply_punctuation_rules(tokens),
+            upper_text_processor.apply_punctuation_rules(tokens),
             [
                 "h",
                 "e",
@@ -105,6 +122,7 @@ def test_phonological_features(self):
         moh_config = FeaturePredictionConfig(
             contact=self.contact,
             text=TextConfig(
+                cleaners=[collapse_whitespace, lower, nfc_normalize],
                 symbols=Symbols(
                     letters=[
                         "ʌ̃̀ː",
@@ -153,7 +171,7 @@ def test_phonological_features(self):
                         "j",
                         "ʔ",
                     ]
-                )
+                ),
             ),
         )
         moh_text_processor = TextProcessor(moh_config.text)
@@ -202,10 +220,11 @@ def test_dipgrahs(self):
         self.assertEqual(len(sequence), 1)
 
     def test_normalization(self):
-        # This test doesn't really test very much, but just here to highlight that base cleaning involves NFC
+        # This test doesn't really test very much; it is just here to highlight that base cleaning doesn't involve NFC
         accented_text_processor = TextProcessor(
             TextConfig(
-                symbols=Symbols(letters=list(string.ascii_letters), accented=["é"])
+                cleaners=[nfc_normalize],
+                symbols=Symbols(letters=list(string.ascii_letters), accented=["é"]),
             ),
         )
         text = "he\u0301llo world"
@@ -215,6 +234,9 @@ def test_normalization(self):
             accented_text_processor.decode_tokens(sequence, ""),
             normalize("NFC", text),
         )
+        self.assertNotEqual(
+            self.base_text_processor.apply_cleaners(text), normalize("NFC", text)
+        )
 
     def test_missing_symbol(self):
         text = "h3llo world"

diff --git a/everyvoice/tests/test_wizard.py b/everyvoice/tests/test_wizard.py
@@ -1220,6 +1220,7 @@ def test_absolute_wav_file_directory_and_local_experiment(self):
                 tmpdir = Path(tmpdir).absolute()
                 wavs_dir = tmpdir / "wavs/Common-Voice"
                 self.config.state["dataset_0"][SN.wavs_dir_step.value] = wavs_dir
+                self.config.state["dataset_0"][SN.text_processing_step] = (0,)
                 self.config.effect()
                 data_file = (
                     Path(self.config.state[SN.name_step.value])
@@ -1248,6 +1249,7 @@ def test_absolute_wav_file_directory_and_nested_experiment(self):
                 tmpdir = Path(tmpdir).absolute()
                 wavs_dir = tmpdir / "wavs/Common-Voice"
                 self.config.state["dataset_0"][SN.wavs_dir_step.value] = wavs_dir
+                self.config.state["dataset_0"][SN.text_processing_step] = tuple()
                 self.config.effect()
                 data_file = (
                     Path(self.config.state[SN.output_step.value])

diff --git a/everyvoice/wizard/basic.py b/everyvoice/wizard/basic.py
@@ -33,7 +33,7 @@
     Step,
     StepNames,
 )
-from everyvoice.wizard.dataset import get_dataset_steps
+from everyvoice.wizard.dataset import TextProcessingStep, get_dataset_steps
 from everyvoice.wizard.prompts import (
     CUSTOM_QUESTIONARY_STYLE,
     get_response_from_menu_prompt,
@@ -261,7 +261,14 @@ def effect(self):
                     permissions_obtained=True,  # If you get this far, you've answered the Dataset Permission Attestation step correctly
                 )
             )
+
         text_config = TextConfig(symbols=Symbols(**symbols))
+        # Add Cleaners
+        if dataset_state.get(StepNames.text_processing_step):
+            text_config.cleaners += [
+                TextProcessingStep().process_lookup[x]["fn"]
+                for x in dataset_state[StepNames.text_processing_step]
+            ]
         text_config_path = Path(f"{TEXT_CONFIG_FILENAME_PREFIX}.{self.response}")
         write_dict_to_config(
             json.loads(text_config.model_dump_json(exclude_none=False)),

diff --git a/everyvoice/wizard/dataset.py b/everyvoice/wizard/dataset.py
@@ -4,7 +4,6 @@
 import re
 from pathlib import Path
 from typing import Sequence
-from unicodedata import normalize
 
 import questionary
 import rich
@@ -14,7 +13,13 @@
 
 from everyvoice.config.type_definitions import DatasetTextRepresentation
 from everyvoice.text.utils import guess_graphemes_in_text, guess_ipa_phones_in_text
-from everyvoice.utils import generic_xsv_filelist_reader, read_festival, slugify
+from everyvoice.utils import (
+    generic_xsv_filelist_reader,
+    lower,
+    nfc_normalize,
+    read_festival,
+    slugify,
+)
 from everyvoice.wizard import TEXT_CONFIG_FILENAME_PREFIX, Step, StepNames, Tour
 from everyvoice.wizard.prompts import (
     CUSTOM_QUESTIONARY_STYLE,
@@ -424,11 +429,11 @@ def effect(self):
         # re-parse data:
         reload_filelist_data_as_dict(self.state)
         # apply automatic conversions
-        self.state["model_target_training_text_representation"] = (
-            apply_automatic_text_conversions(
-                self.state["filelist_data"],
-                self.state[StepNames.filelist_text_representation_step],
-            )
+        self.state[
+            "model_target_training_text_representation"
+        ] = apply_automatic_text_conversions(
+            self.state["filelist_data"],
+            self.state[StepNames.filelist_text_representation_step],
         )
 
 
@@ -571,12 +576,12 @@ def effect(self):
         # Apply the language code:
         isocode = get_iso_code(self.response)
         # Apply text conversions and get target training representation
-        self.state["model_target_training_text_representation"] = (
-            apply_automatic_text_conversions(
-                self.state["filelist_data"],
-                self.state[StepNames.filelist_text_representation_step],
-                global_isocode=isocode,
-            )
+        self.state[
+            "model_target_training_text_representation"
+        ] = apply_automatic_text_conversions(
+            self.state["filelist_data"],
+            self.state[StepNames.filelist_text_representation_step],
+            global_isocode=isocode,
         )
 
 
@@ -626,6 +631,10 @@ def get_iso_code(language):
 
 class TextProcessingStep(Step):
     DEFAULT_NAME = StepNames.text_processing_step
+    process_lookup = {
+        0: {"fn": lower, "desc": "lowercase"},
+        1: {"fn": nfc_normalize, "desc": "NFC Normalization"},
+    }
 
     def prompt(self):
         return get_response_from_menu_prompt(
@@ -644,21 +653,17 @@ def validate(self, response):
 
     def effect(self):
         # Apply the selected text processing processes
-        process_lookup = {
-            0: {"fn": lambda x: x.lower(), "desc": "lowercase"},
-            1: {"fn": lambda x: normalize("NFC", x), "desc": "NFC Normalization"},
-        }
         if "symbols" not in self.state:
             self.state["symbols"] = {}
         if self.response:
             text_index = self.state["filelist_headers"].index(
                 self.state[StepNames.filelist_text_representation_step]
             )
             for process in self.response:
-                process_fn = process_lookup[process]["fn"]
+                process_fn = self.process_lookup[process]["fn"]
                 for i in tqdm(
                     range(len(self.state["filelist_data_list"])),
-                    desc=f"Applying {process_lookup[process]['desc']} to data",
+                    desc=f"Applying {self.process_lookup[process]['desc']} to data",
                 ):
                     self.state["filelist_data_list"][i][text_index] = process_fn(
                         self.state["filelist_data_list"][i][text_index]