Merge remote-tracking branch 'origin/main'
Darwinkel committed Mar 26, 2024
2 parents 0f9a4a1 + 0986f4f commit 261ce1f
Showing 7 changed files with 97 additions and 49 deletions.
33 changes: 16 additions & 17 deletions poetry.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -36,7 +36,7 @@ deepspeed = "0.14.0"
[tool.poetry.group.dev.dependencies]
ruff = "0.3.3"
mypy = "1.9.0"
poetry-plugin-export = "1.7.0"
poetry-plugin-export = "1.7.1"
pytest = "8.1.1"
pytest-cov = "4.1.0"

@@ -48,7 +48,7 @@ soundfile = "0.12.1"
accelerate = "0.28.0"

[tool.poetry.group.deploy.dependencies]
gradio = "4.21.0"
gradio = "4.23.0"

# Code quality tooling configuration
[tool.ruff]
12 changes: 10 additions & 2 deletions shepardtts/app.py
@@ -17,14 +17,22 @@
QUARTER_SECOND_PAUSE = torch.tensor(np.zeros(24000 // 4), dtype=torch.float32)
MIN_PROMPT_LENGTH = 2
MAX_PROMPT_LENGTH = 2500
MIN_AMOUNT_OF_SAMPLES = 3


def get_available_speaker_embeddings() -> list[str]:
"""Return a list of all available pre-generated speaker embeddings."""
available_speaker_embeddings = []
for file in Path(settings.MEAN_CHARACTER_EMBEDDINGS_PATH).glob("*_speaker_embedding.pt"):
character = file.stem[:-18]
available_speaker_embeddings.append(character)
no_samples = int(character.split("_")[0])

# Only keep embeddings which have 3 or more samples
if no_samples >= MIN_AMOUNT_OF_SAMPLES:
available_speaker_embeddings.append(character)

# Sort descending by amount of samples it is based on
available_speaker_embeddings.sort(key=lambda x: int(x.split("_")[0]), reverse=True)

return available_speaker_embeddings

@@ -167,7 +175,7 @@ def main() -> None:
info="Select a reference voice for the synthesised speech.",
choices=get_available_speaker_embeddings(),
multiselect=False,
value="ME2_f-player_f-Shepard",
value="46_ME2-f_player_f_Shepard-twrhub_main_contact_d_dlg",
)

language_gr = gr.Dropdown(
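For reference, a minimal standalone sketch of the selection logic introduced above, written against the `{sample_count}_{game}-{character}-{conversation}_speaker_embedding.pt` naming used elsewhere in this commit; the function name and `directory` argument are illustrative, not taken from app.py.

```python
from pathlib import Path

MIN_AMOUNT_OF_SAMPLES = 3  # embeddings backed by fewer samples are hidden


def available_embeddings(directory: str) -> list[str]:
    """Keep only well-supported speaker embeddings and list the best-calibrated first."""
    names = []
    for file in Path(directory).glob("*_speaker_embedding.pt"):
        # e.g. "46_ME2-f_player_f_Shepard-twrhub_main_contact_d_dlg_speaker_embedding.pt"
        name = file.stem[: -len("_speaker_embedding")]  # same as file.stem[:-18] in the diff
        no_samples = int(name.split("_")[0])  # leading number = samples behind the embedding
        if no_samples >= MIN_AMOUNT_OF_SAMPLES:
            names.append(name)
    # Sort descending by the number of samples each embedding is based on
    names.sort(key=lambda n: int(n.split("_")[0]), reverse=True)
    return names
```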
54 changes: 40 additions & 14 deletions shepardtts/app_helpers.py
@@ -12,6 +12,7 @@
<a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model.
It has been fine-tuned on game data from Mass Effect 2 and Mass Effect 3.
Speaker embeddings are generated from available game samples.
The number in front of each speaker indicates the amount of audio samples it was calibrated on.
See the GitHub repository for more information.
<br/>
@@ -44,65 +45,90 @@
examples = [
[
"I'm Commander Shepard, and this is my favorite store on the Citadel.",
"ME2_f-player_f-Shepard",
"46_ME2-f_player_f_Shepard-twrhub_main_contact_d_dlg",
# 31_ME2-f_player_f_Shepard-twrasa_pinnacle_assassin_d_dlg
# 35_ME2-f_player_f_Shepard-omgpra_mordin_d_dlg
# 38_ME2-f_player_f_Shepard-norcv_starter_h_dlg
# 24_ME2-f_player_f_Shepard-norlm_relationship_02_dlg
],
[
"I'm Commander Shepard, and this is my favorite store on the Citadel.",
"ME2_m-player_m-Shepard",
"70_ME2-m_player_m_Shepard-nor_yeoman_d_dlg",
# 32_ME3-m_Shepard-citprs_cat3_ashjoins_m_dlg
# ME2-m_player_m_Shepard-quatll_admiraldove1_d_dlg
# 29_ME3-m_Shepard-kro002_salarianalt_bye_m_dlg
# 14_ME2-m_player_m_Shepard-norlm_relationship_02_dlg
],
[
"I don't know if the Reapers understand fear, but you killed one of them. They have to respect that.",
"ME2_m-global_illusive_man-nor_illusive_man",
"31_ME2-m_global_illusive_man-norcr1_debriefing_d_dlg",
],
[
"Worried about my qualifications? "
"I can crush a mech with my biotics or shoot its head off at a hundred yards. "
"Take your pick.",
"ME2_m-hench_vixen-procer_miranda",
"42_ME2-m_hench_vixen-norvx_relationship_03_h_dlg",
# 24_ME2-m_hench_vixen-procer_vixen_intro_d_dlg
# 40_ME2-m_hench_vixen-norvx_starter_h_dlg"
],
[
"Lots of ways to help people. Sometimes heal patients; sometimes execute dangerous people. Either way helps.",
"ME2_m-hench_professor",
"28_ME2-m_hench_professor-norpr_relationship_00_h_dlg",
# 28_ME2-m_hench_professor-norpr_relationship_00_h_dlg
# 43_ME2-m_hench_professor-norpr_relationship_03_h_dlg
# 41_ME2-m_hench_professor-norpr_loyalty_01_h_dlg
# 41_ME2-m_hench_professor-kroprl_protege_d_dlg
# 36_ME2-m_hench_professor-kroprl_deadkrogan_h_dlg
],
[
"Christian Bible, the Gospel of Mark, chapter five, verse nine. "
"We acknowledge this as an appropriate metaphor. "
"We are Legion, a terminal of the geth. "
"We will integrate into Normandy.",
"ME2_m-hench_geth",
"29_ME2-m_hench_geth-norgt_relationship03_h_dlg",
# 31_ME2-m_hench_geth-norgt_relationship00_h_dlg
],
[
"I enjoy the sight of humans on their knees. That is a joke.",
"ME2_m-hench_ai",
"25_ME2-m_hench_ai-endgm2_huddle_03a_d_dlg",
# 27_ME3-m_Owner-norhen_edi_investigate_d_dlg
],
[
"The problem is that war isn't orderly. "
"And the enemy is never predictable. "
"Even the most experienced veteran is going to find themselves in situations they haven't trained for.",
"ME2_m-cithub_anderson",
"14_ME2-m_cithub_anderson_citprs_anderson-citprs_council_d_dlg",
],
[
"I'm Garrus Vakarian, and this is now my favorite spot on the Citadel.",
"ME2_m-hench_garrus",
"27_ME2-m_hench_garrus-citgrl_window_tgr_dlg",
# 32_ME2-m_hench_garrus-norgr_relationship_03_h_dlg
# 37_ME2-m_hench_garrus-omggra_garrus_intro_d_dlg
# 35_ME2-m_hench_garrus-norgr_relationship_00_h_dlg
],
[
"We've seen these before, Shepard. Dragon's Teeth, your people call them. The geth used them on Eden Prime.",
"ME2_m-hench_tali",
"41_ME2-m_hench_tali-nortl_relationship_03_h_dlg",
# 35_ME2-m_hench_tali-nortl_relationship_04_h_dlg
# 32_ME2-m_hench_tali-nortl_loyalty_02_h_dlg
],
[
"I spent two years mourning you. So if we're going to try this, I need to know that you're always coming back.",
"ME3_m-hench_liara",
"121_ME2-m_liara-twrhub_main_contact_d_dlg",
# 58_ME3-m_hench_liara-cat002_monastery_objects_b_dlg
],
[
"That assassin should be embarrassed. A terminally ill drell kept him from reaching his target.",
"ME3_m-global_thane",
"66_ME2-m_hench_assassin-noras_starter_h_dlg",
# 40_ME2-m_hench_assassin-noras_loyalty01_h_dlg
],
[
"Stand amongst the ashes of a trillion dead souls and ask the ghosts if honor matters.",
"ME3_m-hench_prothean",
"32_ME3-m_hench_prothean-cat002_monastery_objects_b_dlg",
],
[
"Want me to call the Council and hang up on them? For old times' sake.",
"ME3_m-global_joker",
"78_ME3-m_Owner-nor_joker_bridge_d_dlg",
# 8_ME3-m_global_joker-cat003_nor_warning_v_dlg
],
]
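The diff does not show how `examples` is consumed; as an illustration only, a typical Gradio wiring (the component names and import path below are hypothetical, not taken from app.py) would pre-fill both inputs from each two-element row.

```python
import gradio as gr

from shepardtts.app_helpers import examples  # assumed import path for the list above

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Text prompt")       # hypothetical component names
    speaker_box = gr.Dropdown(label="Speaker embedding")
    # Each row is [line, speaker label]; clicking an example pre-fills both inputs.
    gr.Examples(examples=examples, inputs=[prompt_box, speaker_box])
```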
25 changes: 16 additions & 9 deletions shepardtts/create_character_embeddings.py
@@ -12,15 +12,22 @@ def main() -> None:
mapped_unique_characters: dict[str, list[Path]] = {}

for file in Path("ljspeech/wavs").glob("*.wav"):
# NOTE: Let's not unify things for now. More options for users.
# character = get_character(file.stem[:-7]) # noqa: ERA001
character = file.stem[:-7]
character_metadata = file.stem.split("-")

game = character_metadata[0]
character = character_metadata[1]
conversation = character_metadata[2]
string_id = character_metadata[3]

print(file)
print(character)
if character not in mapped_unique_characters:
mapped_unique_characters[character] = []
print(game, character, conversation, string_id)

character_conversation = f"{game}-{character}-{conversation}"

if character_conversation not in mapped_unique_characters:
mapped_unique_characters[character_conversation] = []

mapped_unique_characters[character].append(file)
mapped_unique_characters[character_conversation].append(file)

model = load_checkpoint()

@@ -29,8 +36,8 @@
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
audio_path=value, max_ref_length=30, gpt_cond_len=6, gpt_cond_chunk_len=3
)
torch.save(gpt_cond_latent, f"mean_character_embeddings/{key}_gpt_cond_latent.pt")
torch.save(speaker_embedding, f"mean_character_embeddings/{key}_speaker_embedding.pt")
torch.save(gpt_cond_latent, f"mean_character_embeddings/{len(value)}_{key}_gpt_cond_latent.pt")
torch.save(speaker_embedding, f"mean_character_embeddings/{len(value)}_{key}_speaker_embedding.pt")


if __name__ == "__main__":
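To make the new output naming concrete, a small sketch (with a hypothetical character group and file count) of how the grouping key above becomes the embedding filenames that app.py later parses:

```python
from pathlib import Path

# Hypothetical (game, character, conversation) group as built by the loop above,
# mapped to the wav files that belong to it.
key = "ME2-f_player_f_Shepard-twrhub_main_contact_d_dlg"
value = [Path(f"ljspeech/wavs/{key}-{i}.wav") for i in range(46)]  # pretend 46 samples

# The sample count is prefixed to the key; this is the leading number that
# get_available_speaker_embeddings() in app.py strips off, filters on, and sorts by.
print(f"mean_character_embeddings/{len(value)}_{key}_speaker_embedding.pt")
# -> mean_character_embeddings/46_ME2-f_player_f_Shepard-twrhub_main_contact_d_dlg_speaker_embedding.pt
```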
15 changes: 11 additions & 4 deletions shepardtts/create_dataset.py
@@ -5,6 +5,7 @@

import pandas as pd
import soundfile

from datasets import Audio, Dataset, concatenate_datasets

from .utils import normalize_line
@@ -37,8 +38,11 @@ def dataset_from_iterator(path_to_dialogue_dump: str, path_to_audio: str, game:
# Take character from dialogue dump
character_from_dialogue = row["Speaker"].values[0] # noqa: PD011

# Take conversation from dialogue dump
conversation = row["Conversation"].values[0] # noqa: PD011

# Prefix character with gender
probable_character = f"{gender}-"
probable_character = f"{gender}_"

# Strip quotes and normalize sentence
line = normalize_line(row["Line"].values[0][1:-1]) # noqa: PD011
@@ -51,7 +55,7 @@
if character_from_dialogue in (character_from_filename, "Owner"):
probable_character += character_from_filename
else:
probable_character += f"{character_from_filename}-{character_from_dialogue}"
probable_character += f"{character_from_filename}_{character_from_dialogue}"

elif game == "ME3":
# Take character from dialogue dump
@@ -62,6 +66,7 @@ def dataset_from_iterator(path_to_dialogue_dump: str, path_to_audio: str, game:
"game": game,
"string_id": string_id,
"character": probable_character,
"conversation": conversation,
"line": line,
"audio": str(file),
}
@@ -142,7 +147,7 @@ def main() -> None:
)

# Sort by line to get a good view of multiple speakers for the same line
complete_dataset = concatenate_datasets([me2_dataset, me3_dataset]).sort(["line", "character"])
complete_dataset = concatenate_datasets([me2_dataset, me3_dataset]).sort(["line", "character", "conversation"])

# Original sampling rate: 24000
# SpeechT5 requires 16000, xttsv2 requires 22050, and cast to mono just to be safe
@@ -154,7 +159,9 @@
# Write to ljspeech format
with open("ljspeech/metadata.csv", "w") as file: # noqa: PTH123
for sample in complete_dataset_with_valid_audio:
filename_without_extension = f"{sample['game']}_{sample['character']}_{sample['string_id']}"
filename_without_extension = (
f"{sample['game']}-{sample['character']}-{sample['conversation']}-{sample['string_id']}"
)
normalized_text = sample["line"]
soundfile.write(
f"ljspeech/wavs/{filename_without_extension}.wav",
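The switch from "-" to "_" inside `probable_character` matters because "-" now acts as the field separator in the wav filename; a short sketch (sample values are hypothetical) of the round trip that create_character_embeddings.py relies on:

```python
# Filename stem written above: {game}-{character}-{conversation}-{string_id}.
# Because probable_character now joins its parts with "_" instead of "-",
# the stem splits cleanly into exactly four fields on "-".
sample = {
    "game": "ME2",
    "character": "f_player_f_Shepard",            # underscores only, so it stays one field
    "conversation": "twrhub_main_contact_d_dlg",
    "string_id": "12345",                          # hypothetical string id
}
stem = f"{sample['game']}-{sample['character']}-{sample['conversation']}-{sample['string_id']}"

game, character, conversation, string_id = stem.split("-")  # as in create_character_embeddings.py
assert (game, character, conversation) == ("ME2", "f_player_f_Shepard", "twrhub_main_contact_d_dlg")
```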
3 changes: 2 additions & 1 deletion update.sh
@@ -10,4 +10,5 @@ then
docker compose stop
docker compose down
docker compose up -d
fi
docker image prune -f # Warning: may also remove some of your other old images
fi
