Skip to content

Commit

Permalink
Separate embeddings per scene (finally)
Browse files Browse the repository at this point in the history
  • Loading branch information
Darwinkel committed Mar 25, 2024
1 parent ec7c5ac commit c6714a7
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 29 deletions.
12 changes: 10 additions & 2 deletions shepardtts/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,22 @@
# A quarter second of silence: 24000 // 4 = 6000 float32 zeros.
# NOTE(review): presumably 24000 is the output sample rate in Hz — confirm.
QUARTER_SECOND_PAUSE = torch.tensor(np.zeros(24000 // 4), dtype=torch.float32)
# Lower and upper bounds on the prompt length.
# NOTE(review): presumably measured in characters — usage is not visible here, confirm.
MIN_PROMPT_LENGTH = 2
MAX_PROMPT_LENGTH = 2500
# Minimum sample count (the numeric prefix of an embedding name) required for
# an embedding to be listed by get_available_speaker_embeddings().
MIN_AMOUNT_OF_SAMPLES = 3


def get_available_speaker_embeddings() -> list[str]:
    """Return the names of all usable pre-generated speaker embeddings.

    Embedding files are expected to be named
    ``<sample count>_<name>_speaker_embedding.pt``. Only embeddings built from
    at least ``MIN_AMOUNT_OF_SAMPLES`` samples are returned, each exactly once.

    Returns:
        Embedding names (file stem without the ``_speaker_embedding`` suffix),
        sorted by sample count in descending order; ties are ordered by name
        so the result does not depend on directory listing order.
    """
    suffix = "_speaker_embedding"
    candidates: list[tuple[int, str]] = []

    for file in Path(settings.MEAN_CHARACTER_EMBEDDINGS_PATH).glob(f"*{suffix}.pt"):
        character = file.stem.removesuffix(suffix)
        count_prefix = character.split("_", 1)[0]

        # Skip files that do not follow the "<count>_<name>" scheme (for
        # example embeddings generated before the count prefix existed)
        # instead of crashing on int().
        if not count_prefix.isdecimal():
            continue

        no_samples = int(count_prefix)

        # Only keep embeddings which are based on enough samples
        if no_samples >= MIN_AMOUNT_OF_SAMPLES:
            candidates.append((no_samples, character))

    # Sort descending by amount of samples, then by name for determinism
    candidates.sort(key=lambda item: (-item[0], item[1]))

    return [character for _, character in candidates]

Expand Down Expand Up @@ -167,7 +175,7 @@ def main() -> None:
info="Select a reference voice for the synthesised speech.",
choices=get_available_speaker_embeddings(),
multiselect=False,
value="ME2_f-player_f-Shepard",
value="31_ME2-f_player_f_Shepard-twrasa_pinnacle_assassin_d_dlg",
)

language_gr = gr.Dropdown(
Expand Down
52 changes: 38 additions & 14 deletions shepardtts/app_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
<a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model.
It has been fine-tuned on game data from Mass Effect 2 and Mass Effect 3.
Speaker embeddings are generated from available game samples.
The number in front of each speaker indicates the amount of audio samples it was calibrated on.
See the GitHub repository for more information.
<br/>
Expand Down Expand Up @@ -44,65 +45,88 @@
# Example prompts shown in the UI. Every row is [text, speaker embedding name].
# Speaker names follow the "<sample count>_<game>-<character>-<conversation>"
# scheme, so they must start with a numeric sample count.
# The commented-out names below a row are alternative embeddings for the same
# character, kept for reference.
examples = [
    [
        "I'm Commander Shepard, and this is my favorite store on the Citadel.",
        "46_ME2-f_player_f_Shepard-twrhub_main_contact_d_dlg",
        # Alternatives:
        # 31_ME2-f_player_f_Shepard-twrasa_pinnacle_assassin_d_dlg
        # 35_ME2-f_player_f_Shepard-omgpra_mordin_d_dlg
        # 38_ME2-f_player_f_Shepard-norcv_starter_h_dlg
        # 24_ME2-f_player_f_Shepard-norlm_relationship_02_dlg
    ],
    [
        "I'm Commander Shepard, and this is my favorite store on the Citadel.",
        "70_ME2-m_player_m_Shepard-nor_yeoman_d_dlg",
        # Alternatives:
        # 32_ME3-m_Shepard-citprs_cat3_ashjoins_m_dlg
        # ME2-m_player_m_Shepard-quatll_admiraldove1_d_dlg (sample-count prefix missing)
        # 29_ME3-m_Shepard-kro002_salarianalt_bye_m_dlg
        # 14_ME2-m_player_m_Shepard-norlm_relationship_02_dlg
    ],
    [
        "I don't know if the Reapers understand fear, but you killed one of them. They have to respect that.",
        "31_ME2-m_global_illusive_man-norcr1_debriefing_d_dlg",
    ],
    [
        "Worried about my qualifications? "
        "I can crush a mech with my biotics or shoot its head off at a hundred yards. "
        "Take your pick.",
        "42_ME2-m_hench_vixen-norvx_relationship_03_h_dlg",
        # Alternatives:
        # 24_ME2-m_hench_vixen-procer_vixen_intro_d_dlg
        # 40_ME2-m_hench_vixen-norvx_starter_h_dlg
    ],
    [
        "Lots of ways to help people. Sometimes heal patients; sometimes execute dangerous people. Either way helps.",
        "28_ME2-m_hench_professor-norpr_relationship_00_h_dlg",
        # Alternatives:
        # 43_ME2-m_hench_professor-norpr_relationship_03_h_dlg
        # 41_ME2-m_hench_professor-norpr_loyalty_01_h_dlg
        # 41_ME2-m_hench_professor-kroprl_protege_d_dlg
        # 36_ME2-m_hench_professor-kroprl_deadkrogan_h_dlg
    ],
    [
        "Christian Bible, the Gospel of Mark, chapter five, verse nine. "
        "We acknowledge this as an appropriate metaphor. "
        "We are Legion, a terminal of the geth. "
        "We will integrate into Normandy.",
        "29_ME2-m_hench_geth-norgt_relationship03_h_dlg",
        # Alternatives:
        # 31_ME2-m_hench_geth-norgt_relationship00_h_dlg
    ],
    [
        "I enjoy the sight of humans on their knees. That is a joke.",
        "25_ME2-m_hench_ai-endgm2_huddle_03a_d_dlg",
    ],
    [
        "The problem is that war isn't orderly. "
        "And the enemy is never predictable. "
        "Even the most experienced veteran is going to find themselves in situations they haven't trained for.",
        "14_ME2-m_cithub_anderson_citprs_anderson-citprs_council_d_dlg",
    ],
    [
        "I'm Garrus Vakarian, and this is now my favorite spot on the Citadel.",
        "27_ME2-m_hench_garrus-citgrl_window_tgr_dlg",
        # Alternatives:
        # 32_ME2-m_hench_garrus-norgr_relationship_03_h_dlg
        # 37_ME2-m_hench_garrus-omggra_garrus_intro_d_dlg
        # 35_ME2-m_hench_garrus-norgr_relationship_00_h_dlg
    ],
    [
        "We've seen these before, Shepard. Dragon's Teeth, your people call them. The geth used them on Eden Prime.",
        "41_ME2-m_hench_tali-nortl_relationship_03_h_dlg",
        # Alternatives:
        # 35_ME2-m_hench_tali-nortl_relationship_04_h_dlg
        # 32_ME2-m_hench_tali-nortl_loyalty_02_h_dlg
    ],
    [
        "I spent two years mourning you. So if we're going to try this, I need to know that you're always coming back.",
        "58_ME3-m_hench_liara-cat002_monastery_objects_b_dlg",
    ],
    [
        "That assassin should be embarrassed. A terminally ill drell kept him from reaching his target.",
        "66_ME2-m_hench_assassin-noras_starter_h_dlg",
        # Alternatives:
        # 40_ME2-m_hench_assassin-noras_loyalty01_h_dlg
    ],
    [
        "Stand amongst the ashes of a trillion dead souls and ask the ghosts if honor matters.",
        "32_ME3-m_hench_prothean-cat002_monastery_objects_b_dlg",
    ],
    [
        "Want me to call the Council and hang up on them? For old times' sake.",
        "78_ME3-m_Owner-nor_joker_bridge_d_dlg",
        # Alternatives:
        # 8_ME3-m_global_joker-cat003_nor_warning_v_dlg
    ],
]
25 changes: 16 additions & 9 deletions shepardtts/create_character_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,22 @@ def main() -> None:
mapped_unique_characters: dict[str, list[Path]] = {}

for file in Path("ljspeech/wavs").glob("*.wav"):
# NOTE: Let's not unify things for now. More options for users.
# character = get_character(file.stem[:-7]) # noqa: ERA001
character = file.stem[:-7]
character_metadata = file.stem.split("-")

game = character_metadata[0]
character = character_metadata[1]
conversation = character_metadata[2]
string_id = character_metadata[3]

print(file)
print(character)
if character not in mapped_unique_characters:
mapped_unique_characters[character] = []
print(game, character, conversation, string_id)

character_conversation = f"{game}-{character}-{conversation}"

if character_conversation not in mapped_unique_characters:
mapped_unique_characters[character_conversation] = []

mapped_unique_characters[character].append(file)
mapped_unique_characters[character_conversation].append(file)

model = load_checkpoint()

Expand All @@ -29,8 +36,8 @@ def main() -> None:
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
audio_path=value, max_ref_length=30, gpt_cond_len=6, gpt_cond_chunk_len=3
)
torch.save(gpt_cond_latent, f"mean_character_embeddings/{key}_gpt_cond_latent.pt")
torch.save(speaker_embedding, f"mean_character_embeddings/{key}_speaker_embedding.pt")
torch.save(gpt_cond_latent, f"mean_character_embeddings/{len(value)}_{key}_gpt_cond_latent.pt")
torch.save(speaker_embedding, f"mean_character_embeddings/{len(value)}_{key}_speaker_embedding.pt")


if __name__ == "__main__":
Expand Down
15 changes: 11 additions & 4 deletions shepardtts/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pandas as pd
import soundfile

from datasets import Audio, Dataset, concatenate_datasets

from .utils import normalize_line
Expand Down Expand Up @@ -37,8 +38,11 @@ def dataset_from_iterator(path_to_dialogue_dump: str, path_to_audio: str, game:
# Take character from dialogue dump
character_from_dialogue = row["Speaker"].values[0] # noqa: PD011

# Take conversation from dialogue dump
conversation = row["Conversation"].values[0] # noqa: PD011

# Prefix character with gender
probable_character = f"{gender}-"
probable_character = f"{gender}_"

# Strip quotes and normalize sentence
line = normalize_line(row["Line"].values[0][1:-1]) # noqa: PD011
Expand All @@ -51,7 +55,7 @@ def dataset_from_iterator(path_to_dialogue_dump: str, path_to_audio: str, game:
if character_from_dialogue in (character_from_filename, "Owner"):
probable_character += character_from_filename
else:
probable_character += f"{character_from_filename}-{character_from_dialogue}"
probable_character += f"{character_from_filename}_{character_from_dialogue}"

elif game == "ME3":
# Take character from dialogue dump
Expand All @@ -62,6 +66,7 @@ def dataset_from_iterator(path_to_dialogue_dump: str, path_to_audio: str, game:
"game": game,
"string_id": string_id,
"character": probable_character,
"conversation": conversation,
"line": line,
"audio": str(file),
}
Expand Down Expand Up @@ -142,7 +147,7 @@ def main() -> None:
)

# Sort by line to get a good view of multiple speakers for the same line
complete_dataset = concatenate_datasets([me2_dataset, me3_dataset]).sort(["line", "character"])
complete_dataset = concatenate_datasets([me2_dataset, me3_dataset]).sort(["line", "character", "conversation"])

# Original sampling rate: 24000
# SpeechT5 requires 16000, xttsv2 requires 22050, and cast to mono just to be safe
Expand All @@ -154,7 +159,9 @@ def main() -> None:
# Write to ljspeech format
with open("ljspeech/metadata.csv", "w") as file: # noqa: PTH123
for sample in complete_dataset_with_valid_audio:
filename_without_extension = f"{sample['game']}_{sample['character']}_{sample['string_id']}"
filename_without_extension = (
f"{sample['game']}-{sample['character']}-{sample['conversation']}-{sample['string_id']}"
)
normalized_text = sample["line"]
soundfile.write(
f"ljspeech/wavs/{filename_without_extension}.wav",
Expand Down

0 comments on commit c6714a7

Please sign in to comment.