Commit 0db8178

fix: always validate permissions_obtained

also add a unittest

roedoejet committed Jun 21, 2024
1 parent 4555605 commit 0db8178

Showing 11 changed files with 91 additions and 36 deletions.
1 change: 1 addition & 0 deletions everyvoice/config/preprocessing_config.py
@@ -99,6 +99,7 @@ class Dataset(PartialLoadConfig):
permissions_obtained: bool = Field(
False,
description="An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. The speaker should be aware and consent to their data being used in this way.",
validate_default=True,
)
data_dir: PossiblyRelativePath = Field(
Path("/please/create/a/path/to/your/dataset/data"),
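Context, not part of the commit: in Pydantic v2, field validators are skipped for default values unless the field opts in with validate_default=True. A minimal, self-contained sketch of that mechanic, using a hypothetical validator in place of EveryVoice's own permissions check:

```python
# Sketch (hypothetical validator, not EveryVoice's actual code) of the Pydantic v2
# behaviour this change relies on: without validate_default=True, validators are
# skipped when the default value is used.
from pydantic import BaseModel, Field, field_validator


class Dataset(BaseModel):
    permissions_obtained: bool = Field(False, validate_default=True)

    @field_validator("permissions_obtained")
    @classmethod
    def check_permissions(cls, value: bool) -> bool:
        if not value:
            raise ValueError("You must attest that you have permission to use this data.")
        return value


# With validate_default=True, even Dataset() (i.e. the default False) raises a
# validation error; without it, only an explicit Dataset(permissions_obtained=False)
# would trigger the validator.
```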
4 changes: 1 addition & 3 deletions everyvoice/config/text_config.py
@@ -5,7 +5,7 @@
from everyvoice.config.shared_types import ConfigModel
from everyvoice.config.utils import PossiblySerializedCallable
from everyvoice.text.utils import normalize_text_helper
from everyvoice.utils import collapse_whitespace, lower, nfc_normalize
from everyvoice.utils import collapse_whitespace


class Punctuation(BaseModel):
@@ -82,9 +82,7 @@ class TextConfig(ConfigModel):
symbols: Symbols = Field(default_factory=Symbols)
to_replace: Dict[str, str] = {} # Happens before cleaners
cleaners: list[PossiblySerializedCallable] = [
lower,
collapse_whitespace,
nfc_normalize,
]

@model_validator(mode="after")
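Not part of the diff: with lower and nfc_normalize dropped from the default cleaners, code that still wants the old behaviour now passes them explicitly, which is the pattern the test changes below follow. A sketch of that opt-in (the import location of Symbols is assumed from the surrounding test code):

```python
# Sketch of the opt-in pattern used by the updated tests: the default cleaners now
# only collapse whitespace, so lowercasing and NFC normalization must be requested
# explicitly. Symbols' home module is assumed here.
import string

from everyvoice.config.text_config import Symbols, TextConfig
from everyvoice.utils import collapse_whitespace, lower, nfc_normalize

text_config = TextConfig(
    cleaners=[collapse_whitespace, lower, nfc_normalize],
    symbols=Symbols(letters=list(string.ascii_letters)),
)
```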
1 change: 1 addition & 0 deletions (file name not shown)
@@ -11,6 +11,7 @@ source_data:
filelist: ../r-filelist.psv
filelist_loader: everyvoice.utils.generic_psv_filelist_reader
label: dataset_0
permissions_obtained: true
sox_effects:
- [channel, '1']
train_split: 0.9
4 changes: 3 additions & 1 deletion everyvoice/tests/preprocessed_audio_fixture.py
@@ -7,6 +7,7 @@
from everyvoice.model.e2e.config import FeaturePredictionConfig
from everyvoice.preprocessor import Preprocessor
from everyvoice.tests.basic_test_case import BasicTestCase
from everyvoice.utils import collapse_whitespace, lower, nfc_normalize


class PreprocessedAudioFixture:
@@ -34,6 +35,7 @@ class PreprocessedAudioFixture:
],
),
text=TextConfig(
cleaners=[collapse_whitespace, lower, nfc_normalize],
symbols=Symbols(
ascii_symbols=list(ascii_lowercase),
ipa=[
@@ -51,7 +53,7 @@ class PreprocessedAudioFixture:
"ʊ",
"ʒ",
],
)
),
),
contact=BasicTestCase.contact,
)
7 changes: 6 additions & 1 deletion everyvoice/tests/test_configs.py
@@ -316,7 +316,12 @@ def test_shared_sox(self) -> None:
vocoder_config = VocoderConfig(
contact=self.contact,
preprocessing=PreprocessingConfig(
source_data=[Dataset(), Dataset(), Dataset(), Dataset()]
source_data=[
Dataset(permissions_obtained=True),
Dataset(permissions_obtained=True),
Dataset(permissions_obtained=True),
Dataset(permissions_obtained=True),
]
),
)
config: EveryVoiceConfig = EveryVoiceConfig(
24 changes: 18 additions & 6 deletions everyvoice/tests/test_preprocessing.py
@@ -59,6 +59,14 @@ def test_run_doctest(self):
def test_read_filelist(self):
self.assertEqual(self.filelist[0]["basename"], "LJ050-0269")

def test_no_permissions(self):
no_permissions_args = self.fp_config.model_dump()
no_permissions_args["preprocessing"]["source_data"][0][
"permissions_obtained"
] = False
with self.assertRaises(ValueError):
FeaturePredictionConfig(**no_permissions_args)

def test_process_audio_for_alignment(self):
config = AlignerConfig(contact=self.contact)
for entry in self.filelist[1:]:
@@ -392,9 +400,9 @@ def test_text_processing(self):
preprocessed_dir.mkdir(parents=True, exist_ok=True)
output_filelist = preprocessed_dir / "preprocessed_filelist.psv"
shutil.copyfile(filelist_test_info["path"], output_filelist)
fp_config.preprocessing.source_data[0].filelist = (
filelist_test_info["path"]
)
fp_config.preprocessing.source_data[
0
].filelist = filelist_test_info["path"]
fp_config.preprocessing.save_dir = preprocessed_dir
preprocessor = Preprocessor(fp_config)
with capture_stdout() as output, mute_logger(
@@ -489,9 +497,13 @@ def test_incremental_preprocess(self):
with tempfile.TemporaryDirectory(
prefix="test_incremental_preprocess", dir="."
) as tmpdir:
fp_config, lj_filelist, full_filelist, partial_filelist, to_process = (
self.get_simple_config(tmpdir)
)
(
fp_config,
lj_filelist,
full_filelist,
partial_filelist,
to_process,
) = self.get_simple_config(tmpdir)

fp_config.preprocessing.source_data[0].filelist = partial_filelist
with capture_stdout() as output, mute_logger("everyvoice.preprocessor"):
48 changes: 34 additions & 14 deletions everyvoice/tests/test_text.py
@@ -15,7 +15,12 @@
from everyvoice.text.lookups import build_lookup, lookuptables_from_data
from everyvoice.text.phonemizer import AVAILABLE_G2P_ENGINES, get_g2p_engine
from everyvoice.text.text_processor import TextProcessor
from everyvoice.utils import generic_psv_filelist_reader
from everyvoice.utils import (
collapse_whitespace,
generic_psv_filelist_reader,
lower,
nfc_normalize,
)


class TextTest(BasicTestCase):
@@ -38,7 +43,7 @@ def test_text_to_sequence(self):
self.assertEqual(self.base_text_processor.decode_tokens(sequence, ""), text)

def test_token_sequence_to_text(self):
sequence = [25, 22, 29, 29, 32, 1, 40, 32, 35, 29, 21]
sequence = [51, 48, 55, 55, 58, 1, 66, 58, 61, 55, 47]
self.assertEqual(self.base_text_processor.encode_text("hello world"), sequence)

def test_hardcoded_symbols(self):
@@ -48,19 +53,31 @@ def test_hardcoded_symbols(self):
"pad should be Unicode PAD symbol and index 0, whitespace should be index 1",
)

def test_cleaners(self):
def test_cleaners_with_upper(self):
text = "hello world"
text_upper = "HELLO WORLD"
sequence = self.base_text_processor.encode_text(text_upper)
self.assertEqual(self.base_text_processor.decode_tokens(sequence, ""), text)
upper_text_processor = TextProcessor(
TextConfig(
cleaners=[collapse_whitespace, lower],
symbols=Symbols(letters=list(string.ascii_letters)),
),
)
sequence = upper_text_processor.encode_text(text_upper)
self.assertEqual(upper_text_processor.decode_tokens(sequence, ""), text)

def test_punctuation(self):
text = "hello! How are you? My name's: foo;."
tokens = self.base_text_processor.apply_tokenization(
self.base_text_processor.normalize_text(text)
upper_text_processor = TextProcessor(
TextConfig(
cleaners=[collapse_whitespace, lower],
symbols=Symbols(letters=list(string.ascii_letters)),
),
)
tokens = upper_text_processor.apply_tokenization(
upper_text_processor.normalize_text(text)
)
self.assertEqual(
self.base_text_processor.apply_punctuation_rules(tokens),
upper_text_processor.apply_punctuation_rules(tokens),
[
"h",
"e",
@@ -105,6 +122,7 @@ def test_phonological_features(self):
moh_config = FeaturePredictionConfig(
contact=self.contact,
text=TextConfig(
cleaners=[collapse_whitespace, lower, nfc_normalize],
symbols=Symbols(
letters=[
"ʌ̃̀ː",
@@ -153,7 +171,7 @@ def test_phonological_features(self):
"j",
"ʔ",
]
)
),
),
)
moh_text_processor = TextProcessor(moh_config.text)
@@ -202,10 +220,11 @@ def test_dipgrahs(self):
self.assertEqual(len(sequence), 1)

def test_normalization(self):
# This test doesn't really test very much, but just here to highlight that base cleaning involves NFC
# This test doesn't really test very much, but just here to highlight that base cleaning doesn't involve NFC
accented_text_processor = TextProcessor(
TextConfig(
symbols=Symbols(letters=list(string.ascii_letters), accented=["é"])
cleaners=[nfc_normalize],
symbols=Symbols(letters=list(string.ascii_letters), accented=["é"]),
),
)
text = "he\u0301llo world"
@@ -215,6 +234,9 @@ def test_missing_symbol(self):
accented_text_processor.decode_tokens(sequence, ""),
normalize("NFC", text),
)
self.assertNotEqual(
self.base_text_processor.apply_cleaners(text), normalize("NFC", text)
)

def test_missing_symbol(self):
text = "h3llo world"
@@ -375,9 +397,7 @@ def test_basic_g2p(self):
)
# another language
str_g2p = get_g2p_engine("str")
self.assertEqual(
str_g2p("SENĆOŦEN"), ["s", "ʌ", "n", "t͡ʃ", "ɑ", "θ", "ʌ", "n"]
)
self.assertEqual(str_g2p("SENĆOŦEN"), ["s", "ʌ", "n", "t͡ʃ", "ɑ", "θ", "ʌ", "n"])
# test lang_id missing
with self.assertRaises(NotImplementedError):
get_g2p_engine("boop")
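An aside, not part of the commit: the updated test_normalization hinges on a standard-library Unicode detail, sketched here with stdlib only.

```python
# Standard-library illustration of the NFC detail the updated test relies on:
# "e" + COMBINING ACUTE ACCENT only becomes the precomposed "é" after NFC
# normalization, which the default cleaners no longer apply.
from unicodedata import normalize

decomposed = "he\u0301llo world"      # 'e' followed by U+0301
composed = normalize("NFC", decomposed)

assert composed == "héllo world"              # precomposed U+00E9
assert decomposed != composed                 # the raw string stays decomposed
assert len(decomposed) == len(composed) + 1   # one fewer code point after composition
```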
2 changes: 2 additions & 0 deletions everyvoice/tests/test_wizard.py
@@ -1220,6 +1220,7 @@ def test_absolute_wav_file_directory_and_local_experiment(self):
tmpdir = Path(tmpdir).absolute()
wavs_dir = tmpdir / "wavs/Common-Voice"
self.config.state["dataset_0"][SN.wavs_dir_step.value] = wavs_dir
self.config.state["dataset_0"][SN.text_processing_step] = (0,)
self.config.effect()
data_file = (
Path(self.config.state[SN.name_step.value])
@@ -1248,6 +1249,7 @@ def test_absolute_wav_file_directory_and_nested_experiment(self):
tmpdir = Path(tmpdir).absolute()
wavs_dir = tmpdir / "wavs/Common-Voice"
self.config.state["dataset_0"][SN.wavs_dir_step.value] = wavs_dir
self.config.state["dataset_0"][SN.text_processing_step] = tuple()
self.config.effect()
data_file = (
Path(self.config.state[SN.output_step.value])
6 changes: 4 additions & 2 deletions everyvoice/text/text_processor.py
@@ -171,7 +171,8 @@ def apply_cleaners(self, text: str) -> str:
Returns:
str: the replaced text
>>> tp = TextProcessor(TextConfig())
>>> from everyvoice.utils import collapse_whitespace, lower, nfc_normalize
>>> tp = TextProcessor(TextConfig(cleaners=[collapse_whitespace, lower, nfc_normalize]))
>>> tp.apply_cleaners('HELLO\u0301')
'helló'
"""
@@ -190,7 +191,8 @@ def normalize_text(
Returns:
str: normalized text ready to be tokenized
>>> tp = TextProcessor(TextConfig())
>>> from everyvoice.utils import collapse_whitespace, lower, nfc_normalize
>>> tp = TextProcessor(TextConfig(cleaners=[collapse_whitespace, lower, nfc_normalize]))
>>> tp.normalize_text('HELLO\u0301!')
'helló!'
"""
9 changes: 8 additions & 1 deletion everyvoice/wizard/basic.py
@@ -33,7 +33,7 @@
Step,
StepNames,
)
from everyvoice.wizard.dataset import get_dataset_steps
from everyvoice.wizard.dataset import TextProcessingStep, get_dataset_steps
from everyvoice.wizard.prompts import (
CUSTOM_QUESTIONARY_STYLE,
get_response_from_menu_prompt,
@@ -261,7 +261,14 @@ def effect(self):
permissions_obtained=True, # If you get this far, you've answered the Dataset Permission Attestation step correctly
)
)

text_config = TextConfig(symbols=Symbols(**symbols))
# Add Cleaners
if dataset_state.get(StepNames.text_processing_step):
text_config.cleaners += [
TextProcessingStep().process_lookup[x]["fn"]
for x in dataset_state[StepNames.text_processing_step]
]
text_config_path = Path(f"{TEXT_CONFIG_FILENAME_PREFIX}.{self.response}")
write_dict_to_config(
json.loads(text_config.model_dump_json(exclude_none=False)),
21 changes: 13 additions & 8 deletions everyvoice/wizard/dataset.py
@@ -4,7 +4,6 @@
import re
from pathlib import Path
from typing import Sequence
from unicodedata import normalize

import questionary
import rich
@@ -14,7 +13,13 @@

from everyvoice.config.type_definitions import DatasetTextRepresentation
from everyvoice.text.utils import guess_graphemes_in_text, guess_ipa_phones_in_text
from everyvoice.utils import generic_xsv_filelist_reader, read_festival, slugify
from everyvoice.utils import (
generic_xsv_filelist_reader,
lower,
nfc_normalize,
read_festival,
slugify,
)
from everyvoice.wizard import TEXT_CONFIG_FILENAME_PREFIX, Step, StepNames, Tour
from everyvoice.wizard.prompts import (
CUSTOM_QUESTIONARY_STYLE,
@@ -626,6 +631,10 @@ def get_iso_code(language):

class TextProcessingStep(Step):
DEFAULT_NAME = StepNames.text_processing_step
process_lookup = {
0: {"fn": lower, "desc": "lowercase"},
1: {"fn": nfc_normalize, "desc": "NFC Normalization"},
}

def prompt(self):
return get_response_from_menu_prompt(
@@ -644,21 +653,17 @@ def validate(self, response):

def effect(self):
# Apply the selected text processing processes
process_lookup = {
0: {"fn": lambda x: x.lower(), "desc": "lowercase"},
1: {"fn": lambda x: normalize("NFC", x), "desc": "NFC Normalization"},
}
if "symbols" not in self.state:
self.state["symbols"] = {}
if self.response:
text_index = self.state["filelist_headers"].index(
self.state[StepNames.filelist_text_representation_step]
)
for process in self.response:
process_fn = process_lookup[process]["fn"]
process_fn = self.process_lookup[process]["fn"]
for i in tqdm(
range(len(self.state["filelist_data_list"])),
desc=f"Applying {process_lookup[process]['desc']} to data",
desc=f"Applying {self.process_lookup[process]['desc']} to data",
):
self.state["filelist_data_list"][i][text_index] = process_fn(
self.state["filelist_data_list"][i][text_index]
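Not part of the commit: a small sketch of the flow this refactor enables, where the wizard's menu choices (stored as indices) resolve to the same cleaner functions in both files. The selection shown is hypothetical.

```python
# Sketch of the flow enabled by promoting process_lookup to a class attribute:
# the wizard records the user's text-processing choices as indices, and
# ConfigFormatStep later resolves them to the matching cleaner functions.
from everyvoice.utils import lower, nfc_normalize
from everyvoice.wizard.dataset import TextProcessingStep

selected = (0, 1)  # hypothetical: the user picked "lowercase" and "NFC Normalization"
cleaners = [TextProcessingStep().process_lookup[i]["fn"] for i in selected]
assert cleaners == [lower, nfc_normalize]
```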
