From 2846d6f4613a8f8ca85c70037997fca95440998f Mon Sep 17 00:00:00 2001
From: Mark Backman
Date: Tue, 10 Dec 2024 22:59:24 -0500
Subject: [PATCH] Update READMEs and comment files

---
 examples/simple-chatbot/README.md            | 53 +++++++++++-------
 .../examples/javascript/README.md            |  4 +-
 .../simple-chatbot/examples/react/README.md  |  2 +-
 .../simple-chatbot/examples/react/index.html |  2 +-
 examples/simple-chatbot/server/bot-gemini.py | 51 +++++++++++++++---
 examples/simple-chatbot/server/bot-openai.py | 54 ++++++++++++++++----
 6 files changed, 128 insertions(+), 38 deletions(-)

diff --git a/examples/simple-chatbot/README.md b/examples/simple-chatbot/README.md
index 9254138e8..e9b06855d 100644
--- a/examples/simple-chatbot/README.md
+++ b/examples/simple-chatbot/README.md
@@ -2,7 +2,18 @@
 
 
 
-This repository demonstrates a simple AI chatbot with real-time audio/video interaction, implemented in three different ways. The bot server remains the same, but you can connect to it using three different client approaches.
+This repository demonstrates a simple AI chatbot with real-time audio/video interaction, implemented in three different ways. The bot server supports multiple AI backends, and you can connect to it using three different client approaches.
+
+## Two Bot Options
+
+1. **OpenAI Bot** (Default)
+
+   - Uses gpt-4o for conversation
+   - Requires OpenAI API key
+
+2. **Gemini Bot**
+   - Uses Google's Gemini Multimodal Live model
+   - Requires Gemini API key
 
 ## Three Ways to Connect
 
@@ -13,13 +24,13 @@ This repository demonstrates a simple AI chatbot with real-time audio/video inte
 
 2. **JavaScript**
 
-   - Basic implementation using RTVI JavaScript SDK
+   - Basic implementation using [Pipecat JavaScript SDK](https://docs.pipecat.ai/client/reference/js/introduction)
    - No framework dependencies
    - Good for learning the fundamentals
 
 3. **React**
-   - Basic impelmentation using RTVI React SDK
-   - Demonstrates the basic client principles with RTVI React
+   - Basic implementation using [Pipecat React SDK](https://docs.pipecat.ai/client/reference/react/introduction)
+   - Demonstrates the basic client principles with Pipecat React
 
 ## Quick Start
 
@@ -38,8 +49,12 @@ This repository demonstrates a simple AI chatbot with real-time audio/video inte
    ```bash
   pip install -r requirements.txt
   ```
-4. Copy env.example to .env and add your credentials
-
+4. Copy env.example to .env and configure:
+   - Add your API keys
+   - Choose your bot implementation:
+     ```ini
+     BOT_IMPLEMENTATION= # Options: 'openai' (default) or 'gemini'
+     ```
 5. Start the server:
    ```bash
    python server.py
@@ -48,7 +63,7 @@ This repository demonstrates a simple AI chatbot with real-time audio/video inte
 ### Next, connect using your preferred client app:
 
 - [Daily Prebuilt](examples/prebuilt/README.md)
-- [Vanilla JavaScript Guide](examples/javascript/README.md)
+- [JavaScript Guide](examples/javascript/README.md)
 - [React Guide](examples/react/README.md)
 
 ## Important Note
@@ -60,21 +75,23 @@ The bot server must be running for any of the client implementations to work. St
 - Python 3.10+
 - Node.js 16+ (for JavaScript and React implementations)
 - Daily API key
-- OpenAI API key
-- Cartesia API key
+- OpenAI API key (for OpenAI bot)
+- Gemini API key (for Gemini bot)
+- ElevenLabs API key
 - Modern web browser with WebRTC support
 
 ## Project Structure
 
 ```
-simple-chatbot-full-stack/
-├── server/              # Bot server implementation
-│   ├── bot.py           # Bot logic and media handling
-│   ├── runner.py        # Server runner utilities
-│   ├── server.py        # FastAPI server
+simple-chatbot/
+├── server/              # Bot server implementation
+│   ├── bot-openai.py    # OpenAI bot implementation
+│   ├── bot-gemini.py    # Gemini bot implementation
+│   ├── runner.py        # Server runner utilities
+│   ├── server.py        # FastAPI server
 │   └── requirements.txt
-└── examples/            # Client implementations
-    ├── prebuilt/        # Daily Prebuilt connection
-    ├── javascript/      # JavaScript RTVI client
-    └── react/           # React RTVI client
+└── examples/            # Client implementations
+    ├── prebuilt/        # Daily Prebuilt connection
+    ├── javascript/      # Pipecat JavaScript client
+    └── react/           # Pipecat React client
 ```
diff --git a/examples/simple-chatbot/examples/javascript/README.md b/examples/simple-chatbot/examples/javascript/README.md
index f07c9d243..74525c1c1 100644
--- a/examples/simple-chatbot/examples/javascript/README.md
+++ b/examples/simple-chatbot/examples/javascript/README.md
@@ -1,10 +1,10 @@
 # JavaScript Implementation
 
-Basic implementation using the RTVI JavaScript SDK.
+Basic implementation using the [Pipecat JavaScript SDK](https://docs.pipecat.ai/client/reference/js/introduction).
 
 ## Setup
 
-1. Run the bot server; see [README](../../README).
+1. Run the bot server. See the [server README](../../README).
 
 2. Navigate to the `examples/javascript` directory:
 
diff --git a/examples/simple-chatbot/examples/react/README.md b/examples/simple-chatbot/examples/react/README.md
index a7ff4be09..44775a083 100644
--- a/examples/simple-chatbot/examples/react/README.md
+++ b/examples/simple-chatbot/examples/react/README.md
@@ -1,6 +1,6 @@
 # React Implementation
 
-Basic implementation using the RTVI React SDK.
+Basic implementation using the [Pipecat React SDK](https://docs.pipecat.ai/client/reference/react/introduction).
 
 ## Setup
 
diff --git a/examples/simple-chatbot/examples/react/index.html b/examples/simple-chatbot/examples/react/index.html
index 38ce1ffd7..154e0a75a 100644
--- a/examples/simple-chatbot/examples/react/index.html
+++ b/examples/simple-chatbot/examples/react/index.html
@@ -4,7 +4,7 @@
 
 
 
-    RTVI React Client
+    Pipecat React Client
 
 
 
diff --git a/examples/simple-chatbot/server/bot-gemini.py b/examples/simple-chatbot/server/bot-gemini.py
index a81e213c9..991df1cd1 100644
--- a/examples/simple-chatbot/server/bot-gemini.py
+++ b/examples/simple-chatbot/server/bot-gemini.py
@@ -4,6 +4,18 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Gemini Bot Implementation.
+
+This module implements a chatbot using Google's Gemini Multimodal Live model.
+It includes:
+- Real-time audio/video interaction through Daily
+- Animated robot avatar
+- Speech-to-speech model
+
+The bot runs as part of a pipeline that processes audio/video frames and manages
+the conversation flow using Gemini's streaming capabilities.
+"""
+
 import asyncio
 import os
 import sys
@@ -21,7 +33,6 @@
     BotStoppedSpeakingFrame,
     EndFrame,
     Frame,
-    LLMMessagesFrame,
     OutputImageRawFrame,
     SpriteFrame,
 )
@@ -47,7 +58,6 @@
 logger.add(sys.stderr, level="DEBUG")
 
 sprites = []
-
 script_dir = os.path.dirname(__file__)
 
 for i in range(1, 26):
@@ -58,18 +68,20 @@
     with Image.open(full_path) as img:
         sprites.append(OutputImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
 
+# Create a smooth animation by adding reversed frames
 flipped = sprites[::-1]
 sprites.extend(flipped)
 
-# When the bot isn't talking, show a static image of the cat listening
-quiet_frame = sprites[0]
-talking_frame = SpriteFrame(images=sprites)
+# Define static and animated states
+quiet_frame = sprites[0]  # Static frame for when bot is listening
+talking_frame = SpriteFrame(images=sprites)  # Animation sequence for when bot is talking
 
 
 class TalkingAnimation(FrameProcessor):
-    """This class starts a talking animation when it receives an first AudioFrame.
+    """Manages the bot's visual animation states.
 
-    It then returns to a "quiet" sprite when it sees a TTSStoppedFrame.
+    Switches between static (listening) and animated (talking) states based on
+    the bot's current speaking status.
     """
 
     def __init__(self):
@@ -77,12 +89,20 @@ def __init__(self):
         self._is_talking = False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and update animation state.
+
+        Args:
+            frame: The incoming frame to process
+            direction: The direction of frame flow in the pipeline
+        """
         await super().process_frame(frame, direction)
 
+        # Switch to talking animation when bot starts speaking
         if isinstance(frame, BotStartedSpeakingFrame):
             if not self._is_talking:
                 await self.push_frame(talking_frame)
                 self._is_talking = True
+        # Return to static frame when bot stops speaking
         elif isinstance(frame, BotStoppedSpeakingFrame):
             await self.push_frame(quiet_frame)
             self._is_talking = False
@@ -91,9 +111,19 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
 
 
 async def main():
+    """Main bot execution function.
+
+    Sets up and runs the bot pipeline including:
+    - Daily video transport with specific audio parameters
+    - Gemini Live multimodal model integration
+    - Voice activity detection
+    - Animation processing
+    - RTVI event handling
+    """
     async with aiohttp.ClientSession() as session:
         (room_url, token) = await configure(session)
 
+        # Set up Daily transport with specific audio/video parameters for Gemini
         transport = DailyTransport(
             room_url,
             token,
@@ -111,6 +141,7 @@ async def main():
             ),
         )
 
+        # Initialize the Gemini Multimodal Live model
         llm = GeminiMultimodalLiveLLMService(
             api_key=os.getenv("GEMINI_API_KEY"),
             voice_id="Puck",  # Aoede, Charon, Fenrir, Kore, Puck
@@ -125,12 +156,16 @@ async def main():
             },
         ]
 
+        # Set up conversation context and management
+        # The context_aggregator will automatically collect conversation context
         context = OpenAILLMContext(messages)
         context_aggregator = llm.create_context_aggregator(context)
 
         ta = TalkingAnimation()
 
-        # RTVI
+        #
+        # RTVI events for Pipecat client UI
+        #
 
         # This will send `user-*-speaking` and `bot-*-speaking` messages.
         rtvi_speaking = RTVISpeakingProcessor()
diff --git a/examples/simple-chatbot/server/bot-openai.py b/examples/simple-chatbot/server/bot-openai.py
index 4db90c932..a3a68c839 100644
--- a/examples/simple-chatbot/server/bot-openai.py
+++ b/examples/simple-chatbot/server/bot-openai.py
@@ -4,6 +4,19 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""OpenAI Bot Implementation.
+
+This module implements a chatbot using OpenAI's GPT-4o model for natural language
+processing. It includes:
+- Real-time audio/video interaction through Daily
+- Animated robot avatar
+- Text-to-speech using ElevenLabs
+- Support for both English and Spanish
+
+The bot runs as part of a pipeline that processes audio/video frames and manages
+the conversation flow.
+"""
+
 import asyncio
 import os
 import sys
@@ -40,14 +53,13 @@
 from pipecat.transports.services.daily import DailyParams, DailyTransport
 
 load_dotenv(override=True)
-
 logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")
 
 sprites = []
-
 script_dir = os.path.dirname(__file__)
 
+# Load sequential animation frames
 for i in range(1, 26):
     # Build the full path to the image file
     full_path = os.path.join(script_dir, f"assets/robot0{i}.png")
@@ -56,18 +68,20 @@
     with Image.open(full_path) as img:
         sprites.append(OutputImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
 
+# Create a smooth animation by adding reversed frames
 flipped = sprites[::-1]
 sprites.extend(flipped)
 
-# When the bot isn't talking, show a static image of the cat listening
-quiet_frame = sprites[0]
-talking_frame = SpriteFrame(images=sprites)
+# Define static and animated states
+quiet_frame = sprites[0]  # Static frame for when bot is listening
+talking_frame = SpriteFrame(images=sprites)  # Animation sequence for when bot is talking
 
 
 class TalkingAnimation(FrameProcessor):
-    """This class starts a talking animation when it receives an first AudioFrame.
+    """Manages the bot's visual animation states.
 
-    It then returns to a "quiet" sprite when it sees a TTSStoppedFrame.
+    Switches between static (listening) and animated (talking) states based on
+    the bot's current speaking status.
     """
 
     def __init__(self):
@@ -75,12 +89,20 @@ def __init__(self):
         self._is_talking = False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and update animation state.
+
+        Args:
+            frame: The incoming frame to process
+            direction: The direction of frame flow in the pipeline
+        """
         await super().process_frame(frame, direction)
 
+        # Switch to talking animation when bot starts speaking
         if isinstance(frame, BotStartedSpeakingFrame):
             if not self._is_talking:
                 await self.push_frame(talking_frame)
                 self._is_talking = True
+        # Return to static frame when bot stops speaking
         elif isinstance(frame, BotStoppedSpeakingFrame):
             await self.push_frame(quiet_frame)
             self._is_talking = False
@@ -89,9 +111,19 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
 
 
 async def main():
+    """Main bot execution function.
+
+    Sets up and runs the bot pipeline including:
+    - Daily video transport
+    - Speech-to-text and text-to-speech services
+    - Language model integration
+    - Animation processing
+    - RTVI event handling
+    """
     async with aiohttp.ClientSession() as session:
         (room_url, token) = await configure(session)
 
+        # Set up Daily transport with video/audio parameters
         transport = DailyTransport(
             room_url,
             token,
@@ -115,6 +147,7 @@ async def main():
             ),
         )
 
+        # Initialize text-to-speech service
         tts = ElevenLabsTTSService(
             api_key=os.getenv("ELEVENLABS_API_KEY"),
             #
@@ -128,6 +161,7 @@ async def main():
             # voice_id="gD1IexrzCvsXPHUuT0s3",
         )
 
+        # Initialize LLM service
         llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
 
         messages = [
@@ -144,12 +178,16 @@ async def main():
             },
         ]
 
+        # Set up conversation context and management
+        # The context_aggregator will automatically collect conversation context
         context = OpenAILLMContext(messages)
         context_aggregator = llm.create_context_aggregator(context)
 
         ta = TalkingAnimation()
 
-        # RTVI
+        #
+        # RTVI events for Pipecat client UI
+        #
 
         # This will send `user-*-speaking` and `bot-*-speaking` messages.
         rtvi_speaking = RTVISpeakingProcessor()
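The `BOT_IMPLEMENTATION` variable documented in the README hunk above is what lets a single server front both of the bot files added here. As a rough sketch of how that selection could work (the helper name `get_bot_file` and its error handling are illustrative assumptions, not code from this patch), the server only needs a small mapping from the variable to the script it launches:

```python
import os


def get_bot_file() -> str:
    """Map BOT_IMPLEMENTATION to the bot script the server should launch."""
    bot_implementation = os.getenv("BOT_IMPLEMENTATION", "openai").lower().strip()
    # An unset or empty variable falls back to the documented default: the OpenAI bot.
    if not bot_implementation:
        bot_implementation = "openai"
    if bot_implementation not in ("openai", "gemini"):
        raise ValueError(
            f"Invalid BOT_IMPLEMENTATION: {bot_implementation}. Use 'openai' or 'gemini'."
        )
    return f"bot-{bot_implementation}.py"
```

Keeping the lookup in one helper means adding a third bot later would only touch this mapping and the README, not the client examples.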