Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dom/voice switching #634

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 170 additions & 0 deletions examples/foundational/21a-switch-between-multiple-voices-cartesia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import asyncio
import os
import sys
from enum import Enum

import aiohttp
from dotenv import load_dotenv
from loguru import logger

from pipecat.frames.frames import (
Frame,
LLMMessagesFrame,
TextFrame,
TTSUpdateSettingsFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer

# Load variables from a .env file; override=True is passed so values from the
# file replace any that are already set in the process environment.
load_dotenv(override=True)

# Swap loguru's handler with id 0 for a stderr sink at DEBUG level.
# NOTE(review): handler id 0 is presumably loguru's pre-installed default sink;
# remove(0) fails if that handler has already been removed elsewhere.
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


class Character(Enum):
    """Speakers that can appear in the generated dialogue.

    Each member's value is the character's display name.
    """

    GIMLI = "Gimli"
    LEGOLAS = "Legolas"


class CustomFrame(TextFrame):
    """A ``TextFrame`` that also records which character speaks its text."""

    def __init__(self, text: str, character: Character, **kwargs):
        """Build the frame.

        Args:
            text: The text carried by the frame.
            character: The speaker the text belongs to.
            **kwargs: Forwarded unchanged to ``TextFrame.__init__``.
        """
        # Let the base frame initialise itself first, then attach the speaker.
        super().__init__(text=text, **kwargs)
        self.character = character


class FrameAndCharacterHandler(FrameProcessor):
    """Tag text frames with their speaker and switch the TTS voice to match.

    Incoming ``TextFrame`` text is scanned for a speaker marker (``#####`` for
    Gimli, ``-----`` for Legolas). The marker is removed, the remaining text is
    re-emitted as a ``CustomFrame`` carrying the current speaker, and whenever
    the speaker differs from the previous frame's speaker a
    ``TTSUpdateSettingsFrame`` with that speaker's voice id is pushed first.

    Text frames that arrive before any marker has been seen, or that are empty
    once the marker is removed, are not forwarded. All non-text frames are
    passed through unchanged.
    """

    # Marker -> speaker. Insertion order is the match priority ("#####" first).
    CHARACTER_TAGS = {
        "#####": Character.GIMLI,
        "-----": Character.LEGOLAS,
    }

    # Speaker -> voice id sent in the TTS settings update.
    VOICE_IDS = {
        Character.GIMLI: "79a125e8-cd45-4c13-8a67-188112f4dd22",
        Character.LEGOLAS: "95856005-0332-41b0-935f-352e296aa0df",
    }

    # Pause (seconds) right after the settings frame has been pushed.
    SETTINGS_APPLY_DELAY_SECS = 1.0
    # Extra pause (seconds) before the first text frame of a new speaker.
    VOICE_CHANGE_DELAY_SECS = 1.0

    def __init__(self):
        super().__init__()
        # Speaker of the last frame that was forwarded (None until the first one).
        self.previous_frame_character = None
        # Speaker selected by the most recent marker (None until the first one).
        self.frame_character = None
        # Serialises "switch voice, wait, push text" so the steps of two
        # frames cannot interleave.
        self.voice_change_lock = asyncio.Lock()

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Handle one frame: re-tag text frames, pass everything else through.

        Args:
            frame: The frame received from the neighbouring processor.
            direction: The direction the frame is travelling in.
        """
        # Give the base class the chance to do its own per-frame handling
        # before this processor acts on the frame.
        await super().process_frame(frame, direction)

        if not isinstance(frame, TextFrame):
            await self.push_frame(frame, direction)
            return

        custom_frame = self.create_custom_frame(frame)
        if custom_frame is None:
            # No speaker known yet, or nothing left after removing the marker.
            return

        async with self.voice_change_lock:
            voice_changed = await self.process_frame_character(custom_frame)
            if voice_changed:
                # Wait a bit for the voice change to take effect.
                await asyncio.sleep(self.VOICE_CHANGE_DELAY_SECS)
            await self.push_frame(custom_frame, direction)

    def create_custom_frame(self, frame: TextFrame):
        """Strip a speaker marker from ``frame`` and wrap its text.

        If the text contains a marker, the current speaker is updated and the
        marker plus surrounding whitespace is removed from ``frame.text``
        (the frame is modified in place). Text without a marker is attributed
        to the speaker selected by the last marker seen.

        Args:
            frame: The text frame to inspect.

        Returns:
            A ``CustomFrame`` for the current speaker, or ``None`` when no
            speaker has been selected yet or the remaining text is empty.
        """
        for tag, character in self.CHARACTER_TAGS.items():
            if tag in frame.text:
                self.frame_character = character
                frame.text = frame.text.replace(tag, "").strip()
                break

        if self.frame_character and frame.text:
            return CustomFrame(text=frame.text, character=self.frame_character)
        return None

    async def process_frame_character(self, frame: CustomFrame):
        """Switch the TTS voice if ``frame`` has a different speaker than the last.

        Args:
            frame: The tagged frame about to be forwarded.

        Returns:
            ``True`` when the speaker changed (a voice switch was requested),
            ``False`` when the speaker is the same as for the previous frame.
        """
        if self.previous_frame_character == frame.character:
            return False

        await self.change_tts_settings(frame.character)
        self.previous_frame_character = frame.character
        return True

    async def change_tts_settings(self, character: "Character | str"):
        """Push a TTS settings update selecting ``character``'s voice.

        Args:
            character: A ``Character`` member or its display name
                (e.g. ``"Gimli"``).

        An unknown character only produces a warning; no settings frame is
        pushed and no exception is raised.
        """
        try:
            # Accept both enum members and their string values.
            voice_id = self.VOICE_IDS.get(Character(character))
        except ValueError:
            voice_id = None

        if not voice_id:
            logger.warning(f"No voice ID found for character: {character}")
            return

        await self.push_frame(TTSUpdateSettingsFrame(settings={"voice": voice_id}))
        # Wait for the settings to be applied.
        await asyncio.sleep(self.SETTINGS_APPLY_DELAY_SECS)


async def main():
    """Run the two-voice dialogue example in a Daily room.

    Builds a pipeline of LLM -> FrameAndCharacterHandler -> Cartesia TTS ->
    Daily output -> assistant context aggregator, and queues the opening
    prompt when the first participant joins.

    Reads ``OPENAI_API_KEY`` and ``CARTESIA_API_KEY`` from the environment.
    ``room_url`` and ``token`` are placeholders that must be replaced with
    real values before running.
    """
    # NOTE(review): `session` is not used anywhere below.
    async with aiohttp.ClientSession() as session:
        room_url = "some url here"
        token = "some token here"

        transport = DailyTransport(
            room_url,
            token,
            "Pipecat",
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )

        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
        messages = [
            {
                "role": "system",
                "content": """You are writing a dialogue between two characters from Middle Earth: Gimli, a proud Dwarf, and Legolas, a skilled Elf. They are discussing who is better at craftsmanship: Dwarves or Elves. Make the conversation engaging and lively, with each character passionately defending their race's skills.

Separate Gimli's parts/responses with ##### and Legolas's with -----. Do not add any other text than the tags and the responses. They should argue for at least 10 turns before both deciding that dealing with Sauron is more important than their craftsmanship debate.

Example:
##### By Durin's beard, everyone knows that Dwarven craftsmanship is unmatched in all of Middle Earth! Our axes and armor are the stuff of legend!
----- While I respect your people's skills, Gimli, surely you jest. Elven craftsmanship has been refined over thousands of years. Our blades and bows are works of art!

Begin the dialogue immediately, starting with Gimli's perspective.""",
            }
        ]

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)
        frame_and_character_handler = FrameAndCharacterHandler()

        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            # Start with Gimli's voice. This is the voice id itself, not the
            # name of an environment variable, so it must not go through
            # os.getenv() (which would return None here).
            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",
        )

        pipeline = Pipeline(
            [
                # User input is intentionally left out of this example; add the
                # two lines below back in if you want to capture user input.
                # transport.input(),  # Transport user input
                # context_aggregator.user(),  # User responses
                llm,
                frame_and_character_handler,
                tts,
                transport.output(),
                context_aggregator.assistant(),
            ]
        )

        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            transport.capture_participant_transcription(participant["id"])
            # Kick off the conversation.
            messages.append(
                {
                    "role": "system",
                    "content": "Please begin the dialogue between Gimli and Legolas immediately. Start with Gimli's perspective on Dwarven craftsmanship. Do not add any introductions or additional text; begin directly with Gimli's first statement.",
                }
            )
            await task.queue_frames([LLMMessagesFrame(messages)])

        runner = PipelineRunner()

        await runner.run(task)


# Script entry point: run the async main() to completion when this file is
# executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())