From b8be7b57ceb6905bafdf94d3fa4f963df8e1aa05 Mon Sep 17 00:00:00 2001
From: Dominic
Date: Mon, 21 Oct 2024 18:58:51 +0900
Subject: [PATCH 1/3] Pushing first commit

---
 ...s-for-each-character-on-script-from-llm.py | 183 ++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 examples/foundational/switch-voices-for-each-character-on-script-from-llm.py

diff --git a/examples/foundational/switch-voices-for-each-character-on-script-from-llm.py b/examples/foundational/switch-voices-for-each-character-on-script-from-llm.py
new file mode 100644
index 000000000..381342f18
--- /dev/null
+++ b/examples/foundational/switch-voices-for-each-character-on-script-from-llm.py
@@ -0,0 +1,183 @@
+import asyncio
+import os
+import sys
+from enum import Enum
+from typing import Dict, List, Optional
+
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.frames.frames import (
+    TTSStoppedFrame,
+    TTSStartedFrame,
+    Frame,
+    LLMMessagesFrame,
+    TextFrame,
+    SystemFrame,
+    TTSUpdateSettingsFrame,
+    StopTaskFrame
+)
+from pipecat.pipeline.parallel_pipeline import ParallelPipeline
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.processors.aggregators.sentence import SentenceAggregator
+from pipecat.processors.filters.function_filter import FunctionFilter
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+from pipecat.services.elevenlabs import ElevenLabsTTSService
+from pipecat.services.cartesia import CartesiaTTSService
+from pipecat.services.openai import OpenAILLMService
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+from pipecat.vad.silero import SileroVADAnalyzer
+
+load_dotenv(override=True)
+
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+
+class Character(Enum):
+    GIMLI = "Gimli"
+    LEGOLAS = "Legolas"
+
+
+class CustomFrame(TextFrame):
+    def __init__(self, text: str, character: Character, **kwargs):
+        super().__init__(text=text, **kwargs)
+        self.character = character
+
+
+class FrameAndCharacterHandler(FrameProcessor):
+    def __init__(self):
+        super().__init__()
+        self.previous_frame_character = None
+        self.frame_character = None
+        self.voice_change_lock = asyncio.Lock()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        if isinstance(frame, TextFrame):
+            custom_frame = self.create_custom_frame(frame)
+            if custom_frame:
+                async with self.voice_change_lock:
+                    voice_changed = await self.process_frame_character(custom_frame)
+                    if voice_changed:
+                        # Wait a bit for the voice change to take effect
+                        await asyncio.sleep(1)  # Adjust this delay as needed
+                    await self.push_frame(custom_frame, direction)
+        else:
+            await self.push_frame(frame, direction)
+
+    def create_custom_frame(self, frame: TextFrame):
+        if "#####" in frame.text:
+            self.frame_character = "Gimli"
+            frame.text = frame.text.replace("#####", "").strip()
+        elif "-----" in frame.text:
+            self.frame_character = "Legolas"
+            frame.text = frame.text.replace("-----", "").strip()
+
+        if self.frame_character and frame.text:
+            return CustomFrame(text=frame.text, character=self.frame_character)
+        return None
+
+    async def process_frame_character(self, frame: CustomFrame):
+        if self.previous_frame_character != frame.character:
+            await self.change_tts_settings(frame.character)
+            self.previous_frame_character = frame.character
+            return True
+        return False
+
+    async def change_tts_settings(self, character: str):
+        upper_character_voice_id = character.upper() + "_VOICE_ID"
+        cartesia_voice_id = "CARTESIA_" + character.upper() + "_VOICE_ID"
+        logger.debug(f"Cartesia voice ID: {cartesia_voice_id}")
+        voice_id = os.getenv(cartesia_voice_id)
+        logger.debug(f"Changing TTS settings to {voice_id}")
+        if voice_id:
+            settingsFrame = TTSUpdateSettingsFrame(settings={"voice": voice_id})
+            await self.push_frame(settingsFrame)
+            # Wait for the settings to be applied
+            await asyncio.sleep(1)  # Adjust this delay as needed
+        else:
+            logger.warning(f"No voice ID found for character: {character}")
+
+
+async def main():
+    async with aiohttp.ClientSession() as session:
+        room_url = "some room url"  # https://bdom.daily.co/support for example
+        token = "room token here that you generate"
+
+        transport = DailyTransport(
+            room_url,
+            token,
+            "Pipecat",
+            DailyParams(
+                audio_out_enabled=True,
+                transcription_enabled=True,
+                vad_enabled=True,
+                vad_analyzer=SileroVADAnalyzer(),
+            ),
+        )
+
+        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
+        messages = [
+            {
+                "role": "system",
+                "content": """You are writing a dialogue between two characters from Middle Earth: Gimli, a proud Dwarf, and Legolas, a skilled Elf. They are discussing who is better at craftsmanship: Dwarves or Elves. Make the conversation engaging and lively, with each character passionately defending their race's skills.
+
+                Separate Gimli's parts/responses with ##### and Legolas's with -----. Do not add any other text than the tags and the responses. They should argue for at least 10 turns before both deciding that dealing with Sauron is more important than their craftsmanship debate.
+
+                Example:
+                ##### By Durin's beard, everyone knows that Dwarven craftsmanship is unmatched in all of Middle Earth! Our axes and armor are the stuff of legend!
+                ----- While I respect your people's skills, Gimli, surely you jest. Elven craftsmanship has been refined over thousands of years. Our blades and bows are works of art!
+
+                Begin the dialogue immediately, starting with Gimli's perspective.""",
+            }]
+
+        context = OpenAILLMContext(messages)
+        context_aggregator = llm.create_context_aggregator(context)
+        frame_and_character_handler = FrameAndCharacterHandler()
+
+        tts = ElevenLabsTTSService(
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id=os.getenv("GIMLI_VOICE_ID"),
+        )
+
+        tts2 = CartesiaTTSService(
+            api_key=os.getenv("CARTESIA_API_KEY"),
+            voice_id=os.getenv("CARTESIA_GIMLI_VOICE_ID"),
+        )
+
+        pipeline = Pipeline(
+            [
+                llm,
+                frame_and_character_handler,
+                # tts,
+                tts2,
+                transport.output(),
+                context_aggregator.assistant(),
+            ]
+        )
+
+        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
+
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            transport.capture_participant_transcription(participant["id"])
+            # Kick off the conversation.
+            messages.append(
+                {
+                    "role": "system",
+                    "content": "Please begin the dialogue between Gimli and Legolas immediately. Start with Gimli's perspective on Dwarven craftsmanship. Do not add any introductions or additional text; begin directly with Gimli's first statement.",
+                }
+            )
+            await task.queue_frames([LLMMessagesFrame(messages)])
+
+        runner = PipelineRunner()
+
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

From 13c383e19ebff245a7e0d515d066572103509af9 Mon Sep 17 00:00:00 2001
From: Dominic
Date: Mon, 21 Oct 2024 19:16:06 +0900
Subject: [PATCH 2/3] Added example of switching voices

---
 ...witch-between-multiple-voices-cartesia.py} | 35 ++++++-------------
 1 file changed, 11 insertions(+), 24 deletions(-)
 rename examples/foundational/{switch-voices-for-each-character-on-script-from-llm.py => 21a-switch-between-multiple-voices-cartesia.py} (85%)

diff --git a/examples/foundational/switch-voices-for-each-character-on-script-from-llm.py b/examples/foundational/21a-switch-between-multiple-voices-cartesia.py
similarity index 85%
rename from examples/foundational/switch-voices-for-each-character-on-script-from-llm.py
rename to examples/foundational/21a-switch-between-multiple-voices-cartesia.py
index 381342f18..3bbddde64 100644
--- a/examples/foundational/switch-voices-for-each-character-on-script-from-llm.py
+++ b/examples/foundational/21a-switch-between-multiple-voices-cartesia.py
@@ -2,31 +2,22 @@
 import os
 import sys
 from enum import Enum
-from typing import Dict, List, Optional
 
 import aiohttp
 from dotenv import load_dotenv
 from loguru import logger
 
 from pipecat.frames.frames import (
-    TTSStoppedFrame,
-    TTSStartedFrame,
     Frame,
     LLMMessagesFrame,
     TextFrame,
-    SystemFrame,
     TTSUpdateSettingsFrame,
-    StopTaskFrame
 )
-from pipecat.pipeline.parallel_pipeline import ParallelPipeline
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.processors.aggregators.sentence import SentenceAggregator
-from pipecat.processors.filters.function_filter import FunctionFilter
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
-from pipecat.services.elevenlabs import ElevenLabsTTSService
 from pipecat.services.cartesia import CartesiaTTSService
 from pipecat.services.openai import OpenAILLMService
 from pipecat.transports.services.daily import DailyParams, DailyTransport
@@ -89,11 +80,11 @@ async def process_frame_character(self, frame: CustomFrame):
         return False
 
     async def change_tts_settings(self, character: str):
-        upper_character_voice_id = character.upper() + "_VOICE_ID"
-        cartesia_voice_id = "CARTESIA_" + character.upper() + "_VOICE_ID"
-        logger.debug(f"Cartesia voice ID: {cartesia_voice_id}")
-        voice_id = os.getenv(cartesia_voice_id)
-        logger.debug(f"Changing TTS settings to {voice_id}")
+        voice_id_mapping = {
+            "Gimli": "79a125e8-cd45-4c13-8a67-188112f4dd22",
+            "Legolas": "95856005-0332-41b0-935f-352e296aa0df",
+        }
+        voice_id = voice_id_mapping[character]
         if voice_id:
             settingsFrame = TTSUpdateSettingsFrame(settings={"voice": voice_id})
             await self.push_frame(settingsFrame)
@@ -105,8 +96,8 @@ async def change_tts_settings(self, character: str):
 async def main():
     async with aiohttp.ClientSession() as session:
-        room_url = "some room url"  # https://bdom.daily.co/support for example
-        token = "room token here that you generate"
+        room_url = "some url here"
+        token = "some token here"
 
         transport = DailyTransport(
             room_url,
             token,
@@ -139,22 +130,18 @@ async def main():
         context_aggregator = llm.create_context_aggregator(context)
         frame_and_character_handler = FrameAndCharacterHandler()
 
-        tts = ElevenLabsTTSService(
-            api_key=os.getenv("ELEVENLABS_API_KEY"),
-            voice_id=os.getenv("GIMLI_VOICE_ID"),
-        )
-
-        tts2 = CartesiaTTSService(
+        tts = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"),
             voice_id=os.getenv("CARTESIA_GIMLI_VOICE_ID"),
         )
 
         pipeline = Pipeline(
             [
+                # transport.input(),  # Transport user input - commented out because this example does not capture user input; add it back in if you need it.
+                # context_aggregator.user(),  # User responses - commented out for the same reason; add it back in if you capture user input.
                 llm,
                 frame_and_character_handler,
-                # tts,
-                tts2,
+                tts,
                 transport.output(),
                 context_aggregator.assistant(),
             ]
         )

From c29de4ac3e60607e4efebf8f5978edad64feb6f1 Mon Sep 17 00:00:00 2001
From: Dominic
Date: Mon, 21 Oct 2024 19:17:23 +0900
Subject: [PATCH 3/3] Fixed Gimli's voice ID at the start

---
 .../foundational/21a-switch-between-multiple-voices-cartesia.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/foundational/21a-switch-between-multiple-voices-cartesia.py b/examples/foundational/21a-switch-between-multiple-voices-cartesia.py
index 3bbddde64..b8f6b3c7c 100644
--- a/examples/foundational/21a-switch-between-multiple-voices-cartesia.py
+++ b/examples/foundational/21a-switch-between-multiple-voices-cartesia.py
@@ -132,7 +132,7 @@ async def main():
 
         tts = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"),
-            voice_id=os.getenv("CARTESIA_GIMLI_VOICE_ID"),
+            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # Use Gimli's voice here
         )
 
         pipeline = Pipeline(
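
The only logic this example layers on top of stock Pipecat is the delimiter handling in create_custom_frame and the voice lookup added by patch 2 in change_tts_settings, so that part can be sanity-checked without a Daily room or a Cartesia key. The following is a rough standalone sketch of the final behaviour in plain Python: the helper name parse_character_line, the VOICE_IDS and DELIMITERS constants, and the sample script lines are illustrative only and not part of the patches, while the two voice IDs are the same ones hard-coded in voice_id_mapping.

# Standalone sketch of the character-detection and voice-lookup logic used by
# FrameAndCharacterHandler (illustrative names; not part of the patch series).
from typing import Optional, Tuple

# Same hard-coded Cartesia voice IDs as voice_id_mapping in the example.
VOICE_IDS = {
    "Gimli": "79a125e8-cd45-4c13-8a67-188112f4dd22",
    "Legolas": "95856005-0332-41b0-935f-352e296aa0df",
}

# Delimiters the system prompt asks the LLM to put in front of each character's lines.
DELIMITERS = {"#####": "Gimli", "-----": "Legolas"}


def parse_character_line(text: str, current: Optional[str]) -> Tuple[Optional[str], str]:
    """Mirror create_custom_frame: a delimiter switches the active character and is
    stripped from the text; chunks without a delimiter keep the current character."""
    for delimiter, character in DELIMITERS.items():
        if delimiter in text:
            return character, text.replace(delimiter, "").strip()
    return current, text.strip()


if __name__ == "__main__":
    script = [
        "##### By Durin's beard, Dwarven craftsmanship is unmatched in all of Middle Earth!",
        "----- While I respect your people's skills, Gimli, surely you jest.",
        "Our blades and bows are works of art!",  # no delimiter: still Legolas
    ]
    character = None
    for chunk in script:
        character, cleaned = parse_character_line(chunk, character)
        voice_id = VOICE_IDS.get(character)
        # In the pipeline, a change of character is the point where FrameAndCharacterHandler
        # pushes a TTSUpdateSettingsFrame(settings={"voice": voice_id}) before the text frame.
        print(f"{character} -> {voice_id}: {cleaned}")

In the full pipeline this decision point is where a TTSUpdateSettingsFrame is pushed ahead of the text, which is what makes the single CartesiaTTSService switch between Gimli's and Legolas's voices mid-conversation instead of running two TTS services in parallel.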