From eeb8338dce52d8539985ff5d5107d76b51badcdd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?=
Date: Mon, 23 Sep 2024 09:11:37 -0700
Subject: [PATCH 1/4] introduce Ruff formatting

---
 .github/workflows/{lint.yaml => format.yaml} | 19 +-
 dev-requirements.txt | 2 +-
 examples/deployment/flyio-example/bot.py | 30 ++-
 .../deployment/flyio-example/bot_runner.py | 118 ++++-----
 examples/dialin-chatbot/bot_daily.py | 39 ++-
 examples/dialin-chatbot/bot_runner.py | 70 ++---
 examples/dialin-chatbot/bot_twilio.py | 40 +--
 examples/foundational/01-say-one-thing.py | 7 +-
 examples/foundational/01a-local-audio.py | 1 +
 examples/foundational/02-llm-say-one-thing.py | 14 +-
 examples/foundational/03-still-frame.py | 11 +-
 .../foundational/03a-local-still-frame.py | 11 +-
 .../foundational/04-utterance-and-speech.py | 7 +-
 .../foundational/05-sync-speech-and-image.py | 35 ++-
 .../05a-local-sync-speech-and-image.py | 48 ++--
 .../foundational/06-listen-and-respond.py | 43 +--
 examples/foundational/06a-image-sync.py | 50 ++--
 examples/foundational/07-interruptible.py | 51 ++--
 .../07a-interruptible-anthropic.py | 31 ++-
 .../07b-interruptible-langchain.py | 38 +--
 .../07c-interruptible-deepgram.py | 40 +--
 .../foundational/07e-interruptible-playht.py | 34 +--
 .../foundational/07f-interruptible-azure.py | 30 ++-
 .../07g-interruptible-openai-tts.py | 39 ++-
 .../07h-interruptible-openpipe.py | 30 +--
 .../foundational/07i-interruptible-xtts.py | 34 +--
 .../foundational/07j-interruptible-gladia.py | 34 +--
 .../foundational/07k-interruptible-lmnt.py | 39 ++-
 .../07l-interruptible-together.py | 31 ++-
 examples/foundational/08-bots-arguing.py | 11 +-
 examples/foundational/09-mirror.py | 34 ++-
 examples/foundational/09a-local-mirror.py | 36 ++-
 examples/foundational/10-wake-phrase.py | 33 +--
 examples/foundational/11-sound-effects.py | 42 +--
 examples/foundational/12-describe-video.py | 31 ++-
 .../12a-describe-video-gemini-flash.py | 35 +--
 .../foundational/12b-describe-video-gpt-4o.py | 36 +--
 .../12c-describe-video-anthropic.py | 35 +--
 .../foundational/13-whisper-transcription.py | 7 +-
 examples/foundational/13a-whisper-local.py | 2 +-
 .../13b-deepgram-transcription.py | 7 +-
 examples/foundational/14-function-calling.py | 49 ++--
 examples/foundational/15-switch-voices.py | 49 ++--
 examples/foundational/15a-switch-languages.py | 43 +--
 .../16-gpu-container-local-bot.py | 52 ++--
 examples/foundational/17-detect-user-idle.py | 57 ++--
 examples/foundational/18-gstreamer-filesrc.py | 22 +-
 .../18a-gstreamer-videotestsrc.py | 23 +-
 examples/foundational/19a-tools-anthropic.py | 28 +-
 .../foundational/19b-tools-video-anthropic.py | 37 +--
 examples/foundational/19c-tools-togetherai.py | 41 ++-
 examples/foundational/runner.py | 21 +-
 examples/moondream-chatbot/bot.py | 41 ++-
 examples/moondream-chatbot/runner.py | 16 +-
 examples/moondream-chatbot/server.py | 41 ++-
 examples/patient-intake/bot.py | 173 +++++++-----
 examples/patient-intake/runner.py | 17 +-
 examples/patient-intake/server.py | 41 ++-
 examples/simple-chatbot/bot.py | 42 ++-
 examples/simple-chatbot/runner.py | 16 +-
 examples/simple-chatbot/server.py | 41 ++-
 examples/storytelling-chatbot/src/bot.py | 54 ++--
 .../storytelling-chatbot/src/bot_runner.py | 110 ++++----
 .../storytelling-chatbot/src/processors.py | 9 +-
 examples/storytelling-chatbot/src/prompts.py | 4 +-
 .../storytelling-chatbot/src/utils/helpers.py | 11 +-
 examples/studypal/runner.py | 21 +-
 examples/studypal/studypal.py | 64 +++--
 examples/translation-chatbot/bot.py | 32 +--
 examples/translation-chatbot/runner.py | 16 +-
 examples/translation-chatbot/server.py | 41 ++-
 examples/twilio-chatbot/bot.py | 41 +--
 examples/twilio-chatbot/server.py | 4 +-
 examples/websocket-server/bot.py | 38 +--
 src/pipecat/clocks/base_clock.py | 1 -
 src/pipecat/clocks/system_clock.py | 1 -
 src/pipecat/frames/frames.py | 111 +++++---
 src/pipecat/pipeline/base_pipeline.py | 1 -
 src/pipecat/pipeline/parallel_pipeline.py | 2 -
 src/pipecat/pipeline/pipeline.py | 3 -
 src/pipecat/pipeline/runner.py | 7 +-
 .../pipeline/sync_parallel_pipeline.py | 2 -
 src/pipecat/pipeline/task.py | 20 +-
 .../pipeline/to_be_updated/merge_pipeline.py | 4 +-
 src/pipecat/processors/aggregators/gated.py | 11 +-
 .../processors/aggregators/llm_response.py | 19 +-
 .../aggregators/openai_llm_context.py | 91 +++----
 .../processors/aggregators/user_response.py | 5 +-
 .../aggregators/vision_image_frame.py | 10 +-
 .../processors/filters/frame_filter.py | 9 +-
 .../processors/filters/function_filter.py | 1 -
 .../processors/filters/wake_check_filter.py | 11 +-
 src/pipecat/processors/frame_processor.py | 26 +-
 .../processors/frameworks/langchain.py | 7 +-
 src/pipecat/processors/frameworks/rtvi.py | 87 +++---
 .../processors/gstreamer/pipeline_source.py | 28 +-
 .../processors/idle_frame_processor.py | 13 +-
 src/pipecat/processors/logger.py | 5 +-
 .../metrics/frame_processor_metrics.py | 24 +-
 src/pipecat/processors/metrics/sentry.py | 11 +-
 src/pipecat/processors/user_idle_processor.py | 14 +-
 src/pipecat/serializers/base_serializer.py | 1 -
 src/pipecat/serializers/livekit.py | 10 +-
 src/pipecat/serializers/protobuf.py | 8 +-
 src/pipecat/serializers/twilio.py | 22 +-
 src/pipecat/services/ai_services.py | 92 +++----
 src/pipecat/services/anthropic.py | 250 ++++++++++--------
 src/pipecat/services/azure.py | 67 +++--
 src/pipecat/services/cartesia.py | 71 +++--
 src/pipecat/services/deepgram.py | 94 ++++---
 src/pipecat/services/fal.py | 13 +-
 src/pipecat/services/fireworks.py | 13 +-
 src/pipecat/services/gladia.py | 36 +--
 src/pipecat/services/google.py | 26 +-
 src/pipecat/services/lmnt.py | 24 +-
 src/pipecat/services/moondream.py | 13 +-
 src/pipecat/services/ollama.py | 1 -
 src/pipecat/services/openai.py | 191 ++++++-------
 src/pipecat/services/openpipe.py | 43 ++-
 src/pipecat/services/playht.py | 36 +--
 .../to_be_updated/cloudflare_ai_service.py | 27 +-
 .../to_be_updated/google_ai_service.py | 8 +-
 .../to_be_updated/huggingface_ai_service.py | 4 +-
 src/pipecat/services/together.py | 87 +++---
 src/pipecat/services/whisper.py | 26 +-
 src/pipecat/services/xtts.py | 33 ++-
 src/pipecat/transcriptions/language.py | 87 +++---
 src/pipecat/transports/base_input.py | 13 +-
 src/pipecat/transports/base_output.py | 28 +-
 src/pipecat/transports/base_transport.py | 12 +-
 src/pipecat/transports/local/audio.py | 20 +-
 src/pipecat/transports/local/tk.py | 26 +-
 .../transports/network/fastapi_websocket.py | 63 ++---
 .../transports/network/websocket_server.py | 67 +++--
 src/pipecat/transports/services/daily.py | 194 +++++++-------
 .../transports/services/helpers/daily_rest.py | 49 ++--
 src/pipecat/utils/test_frame_processor.py | 4 +-
 src/pipecat/vad/silero.py | 64 +++--
 src/pipecat/vad/vad_analyzer.py | 1 -
 tests/integration/integration_azure_llm.py | 6 +-
 tests/integration/integration_ollama_llm.py | 6 +-
 tests/integration/integration_openai_llm.py | 49 ++--
 tests/test_aggregators.py | 35 +--
 tests/test_ai_services.py | 5 +-
 tests/test_daily_transport_service.py | 1 -
 tests/test_langchain.py | 20 +-
 tests/test_openai_tts.py | 7 +-
 tests/test_pipeline.py | 47 ++--
 tests/test_protobuf_serializer.py | 18 +-
 149 files changed, 2657 insertions(+), 2465 deletions(-)
 rename .github/workflows/{lint.yaml => format.yaml} (63%)

diff --git a/.github/workflows/lint.yaml b/.github/workflows/format.yaml
similarity index 63%
rename from .github/workflows/lint.yaml
rename to .github/workflows/format.yaml
index ad5b160f1..1100ea394 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/format.yaml
@@ -1,4 +1,4 @@
-name: lint
+name: format
 
 on:
   workflow_dispatch:
@@ -12,12 +12,12 @@ on:
       - "docs/**"
 
 concurrency:
-  group: build-lint-${{ github.event.pull_request.number || github.ref }}
+  group: build-format-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
 jobs:
-  autopep8:
-    name: "Formatting lints"
+  ruff-format:
+    name: "Formatting checker"
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repo
@@ -25,7 +25,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: '3.10'
+          python-version: "3.10"
      - name: Setup virtual environment
         run: |
           python -m venv .venv
@@ -34,11 +34,8 @@
           source .venv/bin/activate
           python -m pip install --upgrade pip
           pip install -r dev-requirements.txt
-      - name: autopep8
-        id: autopep8
+      - name: Ruff formatter
+        id: ruff
         run: |
           source .venv/bin/activate
-          autopep8 --max-line-length 100 --exit-code -r -d --exclude "*_pb2.py" -a -a src/
-      - name: Fail if autopep8 requires changes
-        if: steps.autopep8.outputs.exit-code == 2
-        run: exit 1
+          ruff format --config line-length=100 --diff --exclude "*_pb2.py"
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 6ce9ffcb4..cce356b14 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,8 +1,8 @@
-autopep8~=2.3.1
 build~=1.2.1
 grpcio-tools~=1.62.2
 pip-tools~=7.4.1
 pyright~=1.1.376
 pytest~=8.3.2
+ruff~=0.6.6
 setuptools~=72.2.0
 setuptools_scm~=8.1.0
diff --git a/examples/deployment/flyio-example/bot.py b/examples/deployment/flyio-example/bot.py
index c6380f6f3..b7378c0ff 100644
--- a/examples/deployment/flyio-example/bot.py
+++ b/examples/deployment/flyio-example/bot.py
@@ -6,7 +6,10 @@
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
-from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
+from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantResponseAggregator,
+    LLMUserResponseAggregator,
+)
 from pipecat.frames.frames import LLMMessagesFrame, EndFrame
 from pipecat.services.openai import OpenAILLMService
 from pipecat.services.elevenlabs import ElevenLabsTTSService
@@ -16,6 +19,7 @@
 from loguru import logger
 
 from dotenv import load_dotenv
+
 load_dotenv(override=True)
 
 logger.remove(0)
@@ -39,7 +43,7 @@ async def main(room_url: str, token: str):
             vad_enabled=True,
             vad_analyzer=SileroVADAnalyzer(),
             transcription_enabled=True,
-        )
+        ),
     )
 
     tts = ElevenLabsTTSService(
@@ -47,9 +51,7 @@
         api_key=os.getenv("ELEVENLABS_API_KEY"),
         voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
     )
-    llm = OpenAILLMService(
-        api_key=os.getenv("OPENAI_API_KEY"),
-        model="gpt-4o")
+    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
 
     messages = [
         {
@@ -61,14 +63,16 @@
     tma_in = LLMUserResponseAggregator(messages)
     tma_out = LLMAssistantResponseAggregator(messages)
 
-    pipeline = Pipeline([
-        transport.input(),
-        tma_in,
-        llm,
-        tts,
-        transport.output(),
-        tma_out,
-    ])
+ pipeline = Pipeline( + [ + transport.input(), + tma_in, + llm, + tts, + transport.output(), + tma_out, + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) diff --git a/examples/deployment/flyio-example/bot_runner.py b/examples/deployment/flyio-example/bot_runner.py index 2c2ee43cc..7c76d26f4 100644 --- a/examples/deployment/flyio-example/bot_runner.py +++ b/examples/deployment/flyio-example/bot_runner.py @@ -16,9 +16,14 @@ from fastapi.responses import JSONResponse from pipecat.transports.services.helpers.daily_rest import ( - DailyRESTHelper, DailyRoomObject, DailyRoomProperties, DailyRoomParams) + DailyRESTHelper, + DailyRoomObject, + DailyRoomProperties, + DailyRoomParams, +) from dotenv import load_dotenv + load_dotenv(override=True) @@ -26,37 +31,37 @@ MAX_SESSION_TIME = 5 * 60 # 5 minutes REQUIRED_ENV_VARS = [ - 'DAILY_API_KEY', - 'OPENAI_API_KEY', - 'ELEVENLABS_API_KEY', - 'ELEVENLABS_VOICE_ID', - 'FLY_API_KEY', - 'FLY_APP_NAME',] + "DAILY_API_KEY", + "OPENAI_API_KEY", + "ELEVENLABS_API_KEY", + "ELEVENLABS_VOICE_ID", + "FLY_API_KEY", + "FLY_APP_NAME", +] FLY_API_HOST = os.getenv("FLY_API_HOST", "https://api.machines.dev/v1") FLY_APP_NAME = os.getenv("FLY_APP_NAME", "pipecat-fly-example") FLY_API_KEY = os.getenv("FLY_API_KEY", "") -FLY_HEADERS = { - 'Authorization': f"Bearer {FLY_API_KEY}", - 'Content-Type': 'application/json' -} +FLY_HEADERS = {"Authorization": f"Bearer {FLY_API_KEY}", "Content-Type": "application/json"} daily_helpers = {} # ----------------- API ----------------- # + @asynccontextmanager async def lifespan(app: FastAPI): aiohttp_session = aiohttp.ClientSession() daily_helpers["rest"] = DailyRESTHelper( daily_api_key=os.getenv("DAILY_API_KEY", ""), - daily_api_url=os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'), - aiohttp_session=aiohttp_session + daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), + aiohttp_session=aiohttp_session, ) yield await aiohttp_session.close() + app = FastAPI(lifespan=lifespan) app.add_middleware( @@ -64,7 +69,7 @@ async def lifespan(app: FastAPI): allow_origins=["*"], allow_credentials=True, allow_methods=["*"], - allow_headers=["*"] + allow_headers=["*"], ) # ----------------- Main ----------------- # @@ -73,13 +78,15 @@ async def lifespan(app: FastAPI): async def spawn_fly_machine(room_url: str, token: str): async with aiohttp.ClientSession() as session: # Use the same image as the bot runner - async with session.get(f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", headers=FLY_HEADERS) as r: + async with session.get( + f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", headers=FLY_HEADERS + ) as r: if r.status != 200: text = await r.text() raise Exception(f"Unable to get machine info from Fly: {text}") data = await r.json() - image = data[0]['config']['image'] + image = data[0]["config"]["image"] # Machine configuration cmd = f"python3 bot.py -u {room_url} -t {token}" @@ -88,31 +95,28 @@ async def spawn_fly_machine(room_url: str, token: str): "config": { "image": image, "auto_destroy": True, - "init": { - "cmd": cmd - }, - "restart": { - "policy": "no" - }, - "guest": { - "cpu_kind": "shared", - "cpus": 1, - "memory_mb": 1024 - } + "init": {"cmd": cmd}, + "restart": {"policy": "no"}, + "guest": {"cpu_kind": "shared", "cpus": 1, "memory_mb": 1024}, }, } # Spawn a new machine instance - async with session.post(f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", headers=FLY_HEADERS, json=worker_props) as r: + async with session.post( + f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", 
headers=FLY_HEADERS, json=worker_props + ) as r: if r.status != 200: text = await r.text() raise Exception(f"Problem starting a bot worker: {text}") data = await r.json() # Wait for the machine to enter the started state - vm_id = data['id'] + vm_id = data["id"] - async with session.get(f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines/{vm_id}/wait?state=started", headers=FLY_HEADERS) as r: + async with session.get( + f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines/{vm_id}/wait?state=started", + headers=FLY_HEADERS, + ) as r: if r.status != 200: text = await r.text() raise Exception(f"Bot was unable to enter started state: {text}") @@ -134,29 +138,23 @@ async def start_bot(request: Request) -> JSONResponse: room_url = os.getenv("DAILY_SAMPLE_ROOM_URL", "") if not room_url: - params = DailyRoomParams( - properties=DailyRoomProperties() - ) + params = DailyRoomParams(properties=DailyRoomProperties()) try: room: DailyRoomObject = await daily_helpers["rest"].create_room(params=params) except Exception as e: - raise HTTPException( - status_code=500, - detail=f"Unable to provision room {e}") + raise HTTPException(status_code=500, detail=f"Unable to provision room {e}") else: # Check passed room URL exists, we should assume that it already has a sip set up try: room: DailyRoomObject = await daily_helpers["rest"].get_room_from_url(room_url) except Exception: - raise HTTPException( - status_code=500, detail=f"Room not found: {room_url}") + raise HTTPException(status_code=500, detail=f"Room not found: {room_url}") # Give the agent a token to join the session token = await daily_helpers["rest"].get_token(room.url, MAX_SESSION_TIME) if not room or not token: - raise HTTPException( - status_code=500, detail=f"Failed to get token for room: {room_url}") + raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room_url}") # Launch a new fly.io machine, or run as a shell process (not recommended) run_as_process = os.getenv("RUN_AS_PROCESS", False) @@ -167,24 +165,26 @@ async def start_bot(request: Request) -> JSONResponse: [f"python3 -m bot -u {room.url} -t {token}"], shell=True, bufsize=1, - cwd=os.path.dirname(os.path.abspath(__file__))) + cwd=os.path.dirname(os.path.abspath(__file__)), + ) except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to start subprocess: {e}") + raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}") else: try: await spawn_fly_machine(room.url, token) except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to spawn VM: {e}") + raise HTTPException(status_code=500, detail=f"Failed to spawn VM: {e}") # Grab a token for the user to join with user_token = await daily_helpers["rest"].get_token(room.url, MAX_SESSION_TIME) - return JSONResponse({ - "room_url": room.url, - "token": user_token, - }) + return JSONResponse( + { + "room_url": room.url, + "token": user_token, + } + ) + if __name__ == "__main__": # Check environment variables @@ -193,23 +193,19 @@ async def start_bot(request: Request) -> JSONResponse: raise Exception(f"Missing environment variable: {env_var}.") parser = argparse.ArgumentParser(description="Pipecat Bot Runner") - parser.add_argument("--host", type=str, - default=os.getenv("HOST", "0.0.0.0"), help="Host address") - parser.add_argument("--port", type=int, - default=os.getenv("PORT", 7860), help="Port number") - parser.add_argument("--reload", action="store_true", - default=False, help="Reload code on change") + parser.add_argument( + "--host", type=str, 
default=os.getenv("HOST", "0.0.0.0"), help="Host address" + ) + parser.add_argument("--port", type=int, default=os.getenv("PORT", 7860), help="Port number") + parser.add_argument( + "--reload", action="store_true", default=False, help="Reload code on change" + ) config = parser.parse_args() try: import uvicorn - uvicorn.run( - "bot_runner:app", - host=config.host, - port=config.port, - reload=config.reload - ) + uvicorn.run("bot_runner:app", host=config.host, port=config.port, reload=config.reload) except KeyboardInterrupt: print("Pipecat runner shutting down...") diff --git a/examples/dialin-chatbot/bot_daily.py b/examples/dialin-chatbot/bot_daily.py index cd6afdad0..2645c65a0 100644 --- a/examples/dialin-chatbot/bot_daily.py +++ b/examples/dialin-chatbot/bot_daily.py @@ -6,11 +6,11 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator -from pipecat.frames.frames import ( - LLMMessagesFrame, - EndFrame +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, ) +from pipecat.frames.frames import LLMMessagesFrame, EndFrame from pipecat.services.elevenlabs import ElevenLabsTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyDialinSettings @@ -18,6 +18,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -31,10 +32,7 @@ async def main(room_url: str, token: str, callId: str, callDomain: str): # diallin_settings are only needed if Daily's SIP URI is used # If you are handling this via Twilio, Telnyx, set this to None # and handle call-forwarding when on_dialin_ready fires. - diallin_settings = DailyDialinSettings( - call_id=callId, - call_domain=callDomain - ) + diallin_settings = DailyDialinSettings(call_id=callId, call_domain=callDomain) transport = DailyTransport( room_url, @@ -50,7 +48,7 @@ async def main(room_url: str, token: str, callId: str, callDomain: str): vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), transcription_enabled=True, - ) + ), ) tts = ElevenLabsTTSService( @@ -58,10 +56,7 @@ async def main(room_url: str, token: str, callId: str, callDomain: str): voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""), ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o" - ) + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -73,14 +68,16 @@ async def main(room_url: str, token: str, callId: str, callDomain: str): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), - tma_in, - llm, - tts, - transport.output(), - tma_out, - ]) + pipeline = Pipeline( + [ + transport.input(), + tma_in, + llm, + tts, + transport.output(), + tma_out, + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) diff --git a/examples/dialin-chatbot/bot_runner.py b/examples/dialin-chatbot/bot_runner.py index 3b6e12eec..d29adc34e 100644 --- a/examples/dialin-chatbot/bot_runner.py +++ b/examples/dialin-chatbot/bot_runner.py @@ -7,7 +7,6 @@ Refer to README for more information. 
""" - import aiohttp import os import argparse @@ -25,17 +24,18 @@ DailyRoomObject, DailyRoomProperties, DailyRoomSipParams, - DailyRoomParams) + DailyRoomParams, +) from dotenv import load_dotenv + load_dotenv(override=True) # ------------ Configuration ------------ # MAX_SESSION_TIME = 5 * 60 # 5 minutes -REQUIRED_ENV_VARS = ['OPENAI_API_KEY', 'DAILY_API_KEY', - 'ELEVENLABS_API_KEY', 'ELEVENLABS_VOICE_ID'] +REQUIRED_ENV_VARS = ["OPENAI_API_KEY", "DAILY_API_KEY", "ELEVENLABS_API_KEY", "ELEVENLABS_VOICE_ID"] daily_helpers = {} @@ -47,12 +47,13 @@ async def lifespan(app: FastAPI): aiohttp_session = aiohttp.ClientSession() daily_helpers["rest"] = DailyRESTHelper( daily_api_key=os.getenv("DAILY_API_KEY", ""), - daily_api_url=os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'), - aiohttp_session=aiohttp_session + daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), + aiohttp_session=aiohttp_session, ) yield await aiohttp_session.close() + app = FastAPI(lifespan=lifespan) app.add_middleware( @@ -60,7 +61,7 @@ async def lifespan(app: FastAPI): allow_origins=["*"], allow_credentials=True, allow_methods=["*"], - allow_headers=["*"] + allow_headers=["*"], ) """ @@ -80,10 +81,7 @@ async def _create_daily_room(room_url, callId, callDomain=None, vendor="daily"): properties=DailyRoomProperties( # Note: these are the default values, except for the display name sip=DailyRoomSipParams( - display_name="dialin-user", - video=False, - sip_mode="dial-in", - num_endpoints=1 + display_name="dialin-user", video=False, sip_mode="dial-in", num_endpoints=1 ) ) ) @@ -97,8 +95,7 @@ async def _create_daily_room(room_url, callId, callDomain=None, vendor="daily"): print(f"Joining existing room: {room_url}") room: DailyRoomObject = await daily_helpers["rest"].get_room_from_url(room_url) except Exception: - raise HTTPException( - status_code=500, detail=f"Room not found: {room_url}") + raise HTTPException(status_code=500, detail=f"Room not found: {room_url}") print(f"Daily room: {room.url} {room.config.sip_endpoint}") @@ -106,8 +103,7 @@ async def _create_daily_room(room_url, callId, callDomain=None, vendor="daily"): token = await daily_helpers["rest"].get_token(room.url, MAX_SESSION_TIME) if not room or not token: - raise HTTPException( - status_code=500, detail=f"Failed to get room or token token") + raise HTTPException(status_code=500, detail=f"Failed to get room or token token") # Spawn a new agent, and join the user session # Note: this is mostly for demonstration purposes (refer to 'deployment' in docs) @@ -120,14 +116,10 @@ async def _create_daily_room(room_url, callId, callDomain=None, vendor="daily"): try: subprocess.Popen( - [bot_proc], - shell=True, - bufsize=1, - cwd=os.path.dirname(os.path.abspath(__file__)) + [bot_proc], shell=True, bufsize=1, cwd=os.path.dirname(os.path.abspath(__file__)) ) except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to start subprocess: {e}") + raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}") return room @@ -150,11 +142,10 @@ async def twilio_start_bot(request: Request): pass room_url = os.getenv("DAILY_SAMPLE_ROOM_URL", None) - callId = data.get('CallSid') + callId = data.get("CallSid") if not callId: - raise HTTPException( - status_code=500, detail="Missing 'CallSid' in request") + raise HTTPException(status_code=500, detail="Missing 'CallSid' in request") print("CallId: %s" % callId) @@ -170,7 +161,8 @@ async def twilio_start_bot(request: Request): # 
http://com.twilio.music.classical.s3.amazonaws.com/BusyStrings.mp3 resp = VoiceResponse() resp.play( - url="http://com.twilio.sounds.music.s3.amazonaws.com/MARKOVICHAMP-Borghestral.mp3", loop=10) + url="http://com.twilio.sounds.music.s3.amazonaws.com/MARKOVICHAMP-Borghestral.mp3", loop=10 + ) return str(resp) @@ -192,18 +184,14 @@ async def daily_start_bot(request: Request) -> JSONResponse: callId = data.get("callId", None) callDomain = data.get("callDomain", None) except Exception: - raise HTTPException( - status_code=500, - detail="Missing properties 'callId' or 'callDomain'") + raise HTTPException(status_code=500, detail="Missing properties 'callId' or 'callDomain'") print(f"CallId: {callId}, CallDomain: {callDomain}") room: DailyRoomObject = await _create_daily_room(room_url, callId, callDomain, "daily") # Grab a token for the user to join with - return JSONResponse({ - "room_url": room.url, - "sipUri": room.config.sip_endpoint - }) + return JSONResponse({"room_url": room.url, "sipUri": room.config.sip_endpoint}) + # ----------------- Main ----------------- # @@ -215,24 +203,18 @@ async def daily_start_bot(request: Request) -> JSONResponse: raise Exception(f"Missing environment variable: {env_var}.") parser = argparse.ArgumentParser(description="Pipecat Bot Runner") - parser.add_argument("--host", type=str, - default=os.getenv("HOST", "0.0.0.0"), help="Host address") - parser.add_argument("--port", type=int, - default=os.getenv("PORT", 7860), help="Port number") - parser.add_argument("--reload", action="store_true", - default=True, help="Reload code on change") + parser.add_argument( + "--host", type=str, default=os.getenv("HOST", "0.0.0.0"), help="Host address" + ) + parser.add_argument("--port", type=int, default=os.getenv("PORT", 7860), help="Port number") + parser.add_argument("--reload", action="store_true", default=True, help="Reload code on change") config = parser.parse_args() try: import uvicorn - uvicorn.run( - "bot_runner:app", - host=config.host, - port=config.port, - reload=config.reload - ) + uvicorn.run("bot_runner:app", host=config.host, port=config.port, reload=config.reload) except KeyboardInterrupt: print("Pipecat runner shutting down...") diff --git a/examples/dialin-chatbot/bot_twilio.py b/examples/dialin-chatbot/bot_twilio.py index e6653babd..c2fe144a6 100644 --- a/examples/dialin-chatbot/bot_twilio.py +++ b/examples/dialin-chatbot/bot_twilio.py @@ -6,11 +6,11 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator -from pipecat.frames.frames import ( - LLMMessagesFrame, - EndFrame +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, ) +from pipecat.frames.frames import LLMMessagesFrame, EndFrame from pipecat.services.elevenlabs import ElevenLabsTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -21,14 +21,15 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) logger.add(sys.stderr, level="DEBUG") -twilio_account_sid = os.getenv('TWILIO_ACCOUNT_SID') -twilio_auth_token = os.getenv('TWILIO_AUTH_TOKEN') +twilio_account_sid = os.getenv("TWILIO_ACCOUNT_SID") +twilio_auth_token = os.getenv("TWILIO_AUTH_TOKEN") twilioclient = 
Client(twilio_account_sid, twilio_auth_token) daily_api_key = os.getenv("DAILY_API_KEY", "") @@ -51,7 +52,7 @@ async def main(room_url: str, token: str, callId: str, sipUri: str): vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), transcription_enabled=True, - ) + ), ) tts = ElevenLabsTTSService( @@ -59,10 +60,7 @@ async def main(room_url: str, token: str, callId: str, sipUri: str): voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""), ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o" - ) + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -74,14 +72,16 @@ async def main(room_url: str, token: str, callId: str, sipUri: str): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), - tma_in, - llm, - tts, - transport.output(), - tma_out, - ]) + pipeline = Pipeline( + [ + transport.input(), + tma_in, + llm, + tts, + transport.output(), + tma_out, + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) @@ -103,7 +103,7 @@ async def on_dialin_ready(transport, cdata): try: # The TwiML is updated using Twilio's client library call = twilioclient.calls(callId).update( - twiml=f'{sipUri}' + twiml=f"{sipUri}" ) except Exception as e: raise Exception(f"Failed to forward call: {str(e)}") diff --git a/examples/foundational/01-say-one-thing.py b/examples/foundational/01-say-one-thing.py index fce774822..288fcefc3 100644 --- a/examples/foundational/01-say-one-thing.py +++ b/examples/foundational/01-say-one-thing.py @@ -21,6 +21,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -32,7 +33,8 @@ async def main(): (room_url, _) = await configure(session) transport = DailyTransport( - room_url, None, "Say One Thing", DailyParams(audio_out_enabled=True)) + room_url, None, "Say One Thing", DailyParams(audio_out_enabled=True) + ) tts = CartesiaHttpTTSService( api_key=os.getenv("CARTESIA_API_KEY"), @@ -47,10 +49,11 @@ async def main(): # participant joins. 
@transport.event_handler("on_participant_joined") async def on_new_participant_joined(transport, participant): - participant_name = participant["info"]["userName"] or '' + participant_name = participant["info"]["userName"] or "" await task.queue_frames([TextFrame(f"Hello there, {participant_name}!"), EndFrame()]) await runner.run(task) + if __name__ == "__main__": asyncio.run(main()) diff --git a/examples/foundational/01a-local-audio.py b/examples/foundational/01a-local-audio.py index df63bca99..d39e922d7 100644 --- a/examples/foundational/01a-local-audio.py +++ b/examples/foundational/01a-local-audio.py @@ -20,6 +20,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) diff --git a/examples/foundational/02-llm-say-one-thing.py b/examples/foundational/02-llm-say-one-thing.py index 00a1e9e51..8cce7a017 100644 --- a/examples/foundational/02-llm-say-one-thing.py +++ b/examples/foundational/02-llm-say-one-thing.py @@ -22,6 +22,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -33,25 +34,22 @@ async def main(): (room_url, _) = await configure(session) transport = DailyTransport( - room_url, - None, - "Say One Thing From an LLM", - DailyParams(audio_out_enabled=True)) + room_url, None, "Say One Thing From an LLM", DailyParams(audio_out_enabled=True) + ) tts = CartesiaHttpTTSService( api_key=os.getenv("CARTESIA_API_KEY"), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { "role": "system", "content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. 
Say hello to the world.", - }] + } + ] runner = PipelineRunner() diff --git a/examples/foundational/03-still-frame.py b/examples/foundational/03-still-frame.py index 1ad36dfcc..46e333ba4 100644 --- a/examples/foundational/03-still-frame.py +++ b/examples/foundational/03-still-frame.py @@ -21,6 +21,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -35,17 +36,11 @@ async def main(): room_url, None, "Show a still frame image", - DailyParams( - camera_out_enabled=True, - camera_out_width=1024, - camera_out_height=1024 - ) + DailyParams(camera_out_enabled=True, camera_out_width=1024, camera_out_height=1024), ) imagegen = FalImageGenService( - params=FalImageGenService.InputParams( - image_size="square_hd" - ), + params=FalImageGenService.InputParams(image_size="square_hd"), aiohttp_session=session, key=os.getenv("FAL_KEY"), ) diff --git a/examples/foundational/03a-local-still-frame.py b/examples/foundational/03a-local-still-frame.py index 14e092508..c06834d90 100644 --- a/examples/foundational/03a-local-still-frame.py +++ b/examples/foundational/03a-local-still-frame.py @@ -22,6 +22,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -35,15 +36,11 @@ async def main(): transport = TkLocalTransport( tk_root, - TransportParams( - camera_out_enabled=True, - camera_out_width=1024, - camera_out_height=1024)) + TransportParams(camera_out_enabled=True, camera_out_width=1024, camera_out_height=1024), + ) imagegen = FalImageGenService( - params=FalImageGenService.InputParams( - image_size="square_hd" - ), + params=FalImageGenService.InputParams(image_size="square_hd"), aiohttp_session=session, key=os.getenv("FAL_KEY"), ) diff --git a/examples/foundational/04-utterance-and-speech.py b/examples/foundational/04-utterance-and-speech.py index 10a1dcf1c..7f63757d6 100644 --- a/examples/foundational/04-utterance-and-speech.py +++ b/examples/foundational/04-utterance-and-speech.py @@ -28,6 +28,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -58,8 +59,7 @@ async def main(): voice_id=os.getenv("ELEVENLABS_VOICE_ID"), ) - messages = [{"role": "system", - "content": "tell the user a joke about llamas"}] + messages = [{"role": "system", "content": "tell the user a joke about llamas"}] # Start a task to run the LLM to create a joke, and convert the LLM # output to audio frames. 
This task will run in parallel with generating @@ -77,8 +77,7 @@ async def main(): ] ) - merge_pipeline = SequentialMergePipeline( - [simple_tts_pipeline, llm_pipeline]) + merge_pipeline = SequentialMergePipeline([simple_tts_pipeline, llm_pipeline]) await asyncio.gather( transport.run(merge_pipeline), diff --git a/examples/foundational/05-sync-speech-and-image.py b/examples/foundational/05-sync-speech-and-image.py index 07e54ab8a..dae860a92 100644 --- a/examples/foundational/05-sync-speech-and-image.py +++ b/examples/foundational/05-sync-speech-and-image.py @@ -16,7 +16,7 @@ Frame, LLMFullResponseStartFrame, LLMMessagesFrame, - TextFrame + TextFrame, ) from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -34,6 +34,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -81,8 +82,8 @@ async def main(): audio_out_enabled=True, camera_out_enabled=True, camera_out_width=1024, - camera_out_height=1024 - ) + camera_out_height=1024, + ), ) tts = CartesiaHttpTTSService( @@ -90,14 +91,10 @@ async def main(): voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") imagegen = FalImageGenService( - params=FalImageGenService.InputParams( - image_size="square_hd" - ), + params=FalImageGenService.InputParams(image_size="square_hd"), aiohttp_session=session, key=os.getenv("FAL_KEY"), ) @@ -112,15 +109,17 @@ async def main(): # # Note that `SyncParallelPipeline` requires all processors in it to be # synchronous (which is the default for most processors). - pipeline = Pipeline([ - llm, # LLM - sentence_aggregator, # Aggregates LLM output into full sentences - SyncParallelPipeline( # Run pipelines in parallel aggregating the result - [month_prepender, tts], # Create "Month: sentence" and output audio - [imagegen] # Generate image - ), - transport.output() # Transport output - ]) + pipeline = Pipeline( + [ + llm, # LLM + sentence_aggregator, # Aggregates LLM output into full sentences + SyncParallelPipeline( # Run pipelines in parallel aggregating the result + [month_prepender, tts], # Create "Month: sentence" and output audio + [imagegen], # Generate image + ), + transport.output(), # Transport output + ] + ) frames = [] for month in [ diff --git a/examples/foundational/05a-local-sync-speech-and-image.py b/examples/foundational/05a-local-sync-speech-and-image.py index d9a0e792e..27c36f6ce 100644 --- a/examples/foundational/05a-local-sync-speech-and-image.py +++ b/examples/foundational/05a-local-sync-speech-and-image.py @@ -17,7 +17,8 @@ TTSAudioRawFrame, URLImageRawFrame, LLMMessagesFrame, - TextFrame) + TextFrame, +) from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline @@ -48,7 +49,12 @@ async def main(): runner = PipelineRunner() async def get_month_data(month): - messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }] + messages = [ + { + "role": "system", + "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. 
Limit the description to one sentence, please.", + } + ] class ImageDescription(FrameProcessor): def __init__(self): @@ -74,7 +80,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if isinstance(frame, TTSAudioRawFrame): self.audio.extend(frame.audio) self.frame = OutputAudioRawFrame( - bytes(self.audio), frame.sample_rate, frame.num_channels) + bytes(self.audio), frame.sample_rate, frame.num_channels + ) class ImageGrabber(FrameProcessor): def __init__(self): @@ -87,9 +94,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if isinstance(frame, URLImageRawFrame): self.frame = frame - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") tts = CartesiaHttpTTSService( api_key=os.getenv("CARTESIA_API_KEY"), @@ -97,11 +102,10 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): ) imagegen = FalImageGenService( - params=FalImageGenService.InputParams( - image_size="square_hd" - ), + params=FalImageGenService.InputParams(image_size="square_hd"), aiohttp_session=session, - key=os.getenv("FAL_KEY")) + key=os.getenv("FAL_KEY"), + ) sentence_aggregator = SentenceAggregator() @@ -119,15 +123,17 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # # Note that `SyncParallelPipeline` requires all processors in it to # be synchronous (which is the default for most processors). - pipeline = Pipeline([ - llm, # LLM - sentence_aggregator, # Aggregates LLM output into full sentences - description, # Store sentence - SyncParallelPipeline( - [tts, audio_grabber], # Generate and store audio for the given sentence - [imagegen, image_grabber] # Generate and storeimage for the given sentence - ) - ]) + pipeline = Pipeline( + [ + llm, # LLM + sentence_aggregator, # Aggregates LLM output into full sentences + description, # Store sentence + SyncParallelPipeline( + [tts, audio_grabber], # Generate and store audio for the given sentence + [imagegen, image_grabber], # Generate and storeimage for the given sentence + ), + ] + ) task = PipelineTask(pipeline) await task.queue_frame(LLMMessagesFrame(messages)) @@ -148,7 +154,9 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): audio_out_enabled=True, camera_out_enabled=True, camera_out_width=1024, - camera_out_height=1024)) + camera_out_height=1024, + ), + ) pipeline = Pipeline([transport.output()]) diff --git a/examples/foundational/06-listen-and-respond.py b/examples/foundational/06-listen-and-respond.py index 6a10f927c..ce9e235f5 100644 --- a/examples/foundational/06-listen-and-respond.py +++ b/examples/foundational/06-listen-and-respond.py @@ -10,7 +10,12 @@ import sys from pipecat.frames.frames import Frame, LLMMessagesFrame, MetricsFrame -from pipecat.metrics.metrics import TTFBMetricsData, ProcessingMetricsData, LLMUsageMetricsData, TTSUsageMetricsData +from pipecat.metrics.metrics import ( + TTFBMetricsData, + ProcessingMetricsData, + LLMUsageMetricsData, + TTSUsageMetricsData, +) from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -29,6 +34,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -48,7 +54,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): print( f"!!! 
MetricsFrame: {frame}, tokens: { tokens.prompt_tokens}, characters: { - tokens.completion_tokens}") + tokens.completion_tokens}" + ) elif isinstance(d, TTSUsageMetricsData): print(f"!!! MetricsFrame: {frame}, characters: {d.value}") await self.push_frame(frame, direction) @@ -66,8 +73,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -75,10 +82,7 @@ async def main(): voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o" - ) + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") ml = MetricsLogger() @@ -91,15 +95,17 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), - tma_in, - llm, - tts, - ml, - transport.output(), - tma_out, - ]) + pipeline = Pipeline( + [ + transport.input(), + tma_in, + llm, + tts, + ml, + transport.output(), + tma_out, + ] + ) task = PipelineTask(pipeline) @@ -107,8 +113,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. - messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/06a-image-sync.py b/examples/foundational/06a-image-sync.py index db48b709e..30bd8dc64 100644 --- a/examples/foundational/06a-image-sync.py +++ b/examples/foundational/06a-image-sync.py @@ -31,6 +31,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -52,16 +53,21 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if not isinstance(frame, SystemFrame) and direction == FrameDirection.DOWNSTREAM: - await self.push_frame(OutputImageRawFrame( - image=self._speaking_image_bytes, - size=(1024, 1024), - format=self._speaking_image_format) + await self.push_frame( + OutputImageRawFrame( + image=self._speaking_image_bytes, + size=(1024, 1024), + format=self._speaking_image_format, + ) ) await self.push_frame(frame) - await self.push_frame(OutputImageRawFrame( - image=self._waiting_image_bytes, - size=(1024, 1024), - format=self._waiting_image_format)) + await self.push_frame( + OutputImageRawFrame( + image=self._waiting_image_bytes, + size=(1024, 1024), + format=self._waiting_image_format, + ) + ) else: await self.push_frame(frame) @@ -82,7 +88,7 @@ async def main(): transcription_enabled=True, vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), - ) + ), ) tts = CartesiaHttpTTSService( @@ -90,9 +96,7 @@ async def main(): voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -109,21 +113,23 @@ async def main(): os.path.join(os.path.dirname(__file__), "assets", "waiting.png"), ) - pipeline = Pipeline([ - transport.input(), - image_sync_aggregator, - tma_in, - llm, - tts, - transport.output(), - tma_out - ]) + pipeline = Pipeline( + [ + transport.input(), 
+ image_sync_aggregator, + tma_in, + llm, + tts, + transport.output(), + tma_out, + ] + ) task = PipelineTask(pipeline) @transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): - participant_name = participant["info"]["userName"] or '' + participant_name = participant["info"]["userName"] or "" transport.capture_participant_transcription(participant["id"]) await task.queue_frames([TextFrame(f"Hi there {participant_name}!")]) diff --git a/examples/foundational/07-interruptible.py b/examples/foundational/07-interruptible.py index 90c10c76d..8026940f8 100644 --- a/examples/foundational/07-interruptible.py +++ b/examples/foundational/07-interruptible.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -25,6 +27,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -43,8 +46,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -52,9 +55,7 @@ async def main(): voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -66,28 +67,32 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) - - task = PipelineTask(pipeline, PipelineParams( - allow_interruptions=True, - enable_metrics=True, - enable_usage_metrics=True, - report_only_initial_ttfb=True, - )) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + PipelineParams( + allow_interruptions=True, + enable_metrics=True, + enable_usage_metrics=True, + report_only_initial_ttfb=True, + ), + ) @transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
- messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/07a-interruptible-anthropic.py b/examples/foundational/07a-interruptible-anthropic.py index a8d90f087..2bded2480 100644 --- a/examples/foundational/07a-interruptible-anthropic.py +++ b/examples/foundational/07a-interruptible-anthropic.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.anthropic import AnthropicLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -25,6 +27,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -43,8 +46,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -53,8 +56,8 @@ async def main(): ) llm = AnthropicLLMService( - api_key=os.getenv("ANTHROPIC_API_KEY"), - model="claude-3-opus-20240229") + api_key=os.getenv("ANTHROPIC_API_KEY"), model="claude-3-opus-20240229" + ) # todo: think more about how to handle system prompts in a more general way. OpenAI, # Google, and Anthropic all have slightly different approaches to providing a system @@ -69,14 +72,16 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) diff --git a/examples/foundational/07b-interruptible-langchain.py b/examples/foundational/07b-interruptible-langchain.py index 872dbf9bb..5ebfd3388 100644 --- a/examples/foundational/07b-interruptible-langchain.py +++ b/examples/foundational/07b-interruptible-langchain.py @@ -15,7 +15,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.processors.frameworks.langchain import LangchainProcessor from pipecat.services.cartesia import CartesiaTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -32,6 +34,7 @@ from runner import configure from dotenv import load_dotenv + load_dotenv(override=True) @@ -70,19 +73,22 @@ async def main(): prompt = ChatPromptTemplate.from_messages( [ - ("system", - "Be nice and helpful. Answer very briefly and without special characters like `#` or `*`. 
" - "Your response will be synthesized to voice and those characters will create unnatural sounds.", - ), + ( + "system", + "Be nice and helpful. Answer very briefly and without special characters like `#` or `*`. " + "Your response will be synthesized to voice and those characters will create unnatural sounds.", + ), MessagesPlaceholder("chat_history"), ("human", "{input}"), - ]) + ] + ) chain = prompt | ChatOpenAI(model="gpt-4o", temperature=0.7) history_chain = RunnableWithMessageHistory( chain, get_session_history, history_messages_key="chat_history", - input_messages_key="input") + input_messages_key="input", + ) lc = LangchainProcessor(history_chain) tma_in = LLMUserResponseAggregator() @@ -90,12 +96,12 @@ async def main(): pipeline = Pipeline( [ - transport.input(), # Transport user input - tma_in, # User responses - lc, # Langchain - tts, # TTS - transport.output(), # Transport bot output - tma_out, # Assistant spoken responses + transport.input(), # Transport user input + tma_in, # User responses + lc, # Langchain + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses ] ) @@ -109,11 +115,7 @@ async def on_first_participant_joined(transport, participant): # the `LLMMessagesFrame` will be picked up by the LangchainProcessor using # only the content of the last message to inject it in the prompt defined # above. So no role is required here. - messages = [( - { - "content": "Please briefly introduce yourself to the user." - } - )] + messages = [({"content": "Please briefly introduce yourself to the user."})] await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py index dad6834ec..41bef8a47 100644 --- a/examples/foundational/07c-interruptible-deepgram.py +++ b/examples/foundational/07c-interruptible-deepgram.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -25,6 +27,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -43,21 +46,17 @@ async def main(): audio_out_enabled=True, vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), - vad_audio_passthrough=True - ) + vad_audio_passthrough=True, + ), ) stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) tts = DeepgramTTSService( - aiohttp_session=session, - api_key=os.getenv("DEEPGRAM_API_KEY"), - voice="aura-helios-en" + aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en" ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -69,15 +68,17 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - stt, # STT - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken 
responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) @@ -85,8 +86,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. - messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/07e-interruptible-playht.py b/examples/foundational/07e-interruptible-playht.py index 1ad61dc5e..9c48df93a 100644 --- a/examples/foundational/07e-interruptible-playht.py +++ b/examples/foundational/07e-interruptible-playht.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.playht import PlayHTTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -25,6 +27,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -44,8 +47,8 @@ async def main(): audio_out_sample_rate=16000, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = PlayHTTTSService( @@ -54,9 +57,7 @@ async def main(): voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json", ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -68,14 +69,16 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) @@ -83,8 +86,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
- messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/07f-interruptible-azure.py b/examples/foundational/07f-interruptible-azure.py index 50f67f94c..11bfebe53 100644 --- a/examples/foundational/07f-interruptible-azure.py +++ b/examples/foundational/07f-interruptible-azure.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.azure import AzureLLMService, AzureSTTService, AzureTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer @@ -25,6 +27,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -45,7 +48,7 @@ async def main(): vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), vad_audio_passthrough=True, - ) + ), ) stt = AzureSTTService( @@ -74,15 +77,17 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - stt, # STT - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) @@ -90,8 +95,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
- messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/07g-interruptible-openai-tts.py b/examples/foundational/07g-interruptible-openai-tts.py index 2b27f7c0b..70576c97a 100644 --- a/examples/foundational/07g-interruptible-openai-tts.py +++ b/examples/foundational/07g-interruptible-openai-tts.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.openai import OpenAITTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -25,6 +27,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -44,18 +47,13 @@ async def main(): audio_out_sample_rate=24000, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) - tts = OpenAITTSService( - api_key=os.getenv("OPENAI_API_KEY"), - voice="alloy" - ) + tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice="alloy") - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -67,14 +65,16 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) @@ -82,8 +82,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
- messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/07h-interruptible-openpipe.py b/examples/foundational/07h-interruptible-openpipe.py index 489015f21..b87563bd3 100644 --- a/examples/foundational/07h-interruptible-openpipe.py +++ b/examples/foundational/07h-interruptible-openpipe.py @@ -28,6 +28,7 @@ import time from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -46,8 +47,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -60,9 +61,7 @@ async def main(): api_key=os.getenv("OPENAI_API_KEY"), openpipe_api_key=os.getenv("OPENPIPE_API_KEY"), model="gpt-4o", - tags={ - "conversation_id": f"pipecat-{timestamp}" - } + tags={"conversation_id": f"pipecat-{timestamp}"}, ) messages = [ @@ -74,14 +73,16 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True)) @@ -89,8 +90,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
- messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/07i-interruptible-xtts.py b/examples/foundational/07i-interruptible-xtts.py index e892651e0..2e6f95433 100644 --- a/examples/foundational/07i-interruptible-xtts.py +++ b/examples/foundational/07i-interruptible-xtts.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService from pipecat.services.openai import OpenAILLMService from pipecat.services.xtts import XTTSService @@ -26,6 +28,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -45,19 +48,17 @@ async def main(): transcription_enabled=True, vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), - ) + ), ) tts = XTTSService( aiohttp_session=session, voice_id="Claribel Dervla", language="en", - base_url="http://localhost:8000" + base_url="http://localhost:8000", ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -69,14 +70,16 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) @@ -84,8 +87,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
- messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/07j-interruptible-gladia.py b/examples/foundational/07j-interruptible-gladia.py index aff975e29..dc07ec7ba 100644 --- a/examples/foundational/07j-interruptible-gladia.py +++ b/examples/foundational/07j-interruptible-gladia.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.gladia import GladiaSTTService from pipecat.services.openai import OpenAILLMService @@ -26,6 +28,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -45,7 +48,7 @@ async def main(): vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), vad_audio_passthrough=True, - ) + ), ) stt = GladiaSTTService( @@ -57,9 +60,7 @@ async def main(): voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -71,15 +72,17 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - stt, # STT - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) @@ -87,8 +90,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
- messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/07k-interruptible-lmnt.py b/examples/foundational/07k-interruptible-lmnt.py index 6e68564ea..fb231c7bc 100644 --- a/examples/foundational/07k-interruptible-lmnt.py +++ b/examples/foundational/07k-interruptible-lmnt.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.lmnt import LmntTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -25,6 +27,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -44,18 +47,13 @@ async def main(): audio_out_sample_rate=24000, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) - tts = LmntTTSService( - api_key=os.getenv("LMNT_API_KEY"), - voice_id="morgan" - ) + tts = LmntTTSService(api_key=os.getenv("LMNT_API_KEY"), voice_id="morgan") - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -67,14 +65,16 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) @@ -82,8 +82,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
- messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/07l-interruptible-together.py b/examples/foundational/07l-interruptible-together.py index d5afa6d0d..e2cb55fed 100644 --- a/examples/foundational/07l-interruptible-together.py +++ b/examples/foundational/07l-interruptible-together.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.together import TogetherLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -25,6 +27,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -43,8 +46,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -62,8 +65,8 @@ async def main(): extra={ "frequency_penalty": 2.0, "presence_penalty": 0.0, - } - ) + }, + ), ) messages = [ @@ -76,14 +79,16 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) diff --git a/examples/foundational/08-bots-arguing.py b/examples/foundational/08-bots-arguing.py index abf5a1d54..150fbfc0a 100644 --- a/examples/foundational/08-bots-arguing.py +++ b/examples/foundational/08-bots-arguing.py @@ -15,6 +15,7 @@ from runner import configure from dotenv import load_dotenv + load_dotenv(override=True) logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") @@ -53,9 +54,7 @@ async def main(): voice_id="jBpfuIE2acCO8z3wKNLl", ) dalle = FalImageGenService( - params=FalImageGenService.InputParams( - image_size="1024x1024" - ), + params=FalImageGenService.InputParams(image_size="1024x1024"), aiohttp_session=session, key=os.getenv("FAL_KEY"), ) @@ -75,13 +74,11 @@ async def main(): async def get_text_and_audio(messages) -> Tuple[str, bytearray]: """This function streams text from the LLM and uses the TTS service to convert - that text to speech as it's received. 
""" + that text to speech as it's received.""" source_queue = asyncio.Queue() sink_queue = asyncio.Queue() sentence_aggregator = SentenceAggregator() - pipeline = Pipeline( - [llm, sentence_aggregator, tts1], source_queue, sink_queue - ) + pipeline = Pipeline([llm, sentence_aggregator, tts1], source_queue, sink_queue) await source_queue.put(LLMMessagesFrame(messages)) await source_queue.put(EndFrame()) diff --git a/examples/foundational/09-mirror.py b/examples/foundational/09-mirror.py index bb6253deb..ff71c60d6 100644 --- a/examples/foundational/09-mirror.py +++ b/examples/foundational/09-mirror.py @@ -8,7 +8,13 @@ import asyncio import sys -from pipecat.frames.frames import Frame, InputAudioRawFrame, InputImageRawFrame, OutputAudioRawFrame, OutputImageRawFrame +from pipecat.frames.frames import ( + Frame, + InputAudioRawFrame, + InputImageRawFrame, + OutputAudioRawFrame, + OutputImageRawFrame, +) from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineTask @@ -20,6 +26,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -27,21 +34,20 @@ class MirrorProcessor(FrameProcessor): - async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if isinstance(frame, InputAudioRawFrame): - await self.push_frame(OutputAudioRawFrame( - audio=frame.audio, - sample_rate=frame.sample_rate, - num_channels=frame.num_channels) + await self.push_frame( + OutputAudioRawFrame( + audio=frame.audio, + sample_rate=frame.sample_rate, + num_channels=frame.num_channels, + ) ) elif isinstance(frame, InputImageRawFrame): - await self.push_frame(OutputImageRawFrame( - image=frame.image, - size=frame.size, - format=frame.format) + await self.push_frame( + OutputImageRawFrame(image=frame.image, size=frame.size, format=frame.format) ) else: await self.push_frame(frame, direction) @@ -52,15 +58,17 @@ async def main(): (room_url, token) = await configure(session) transport = DailyTransport( - room_url, token, "Test", + room_url, + token, + "Test", DailyParams( audio_in_enabled=True, audio_out_enabled=True, camera_out_enabled=True, camera_out_is_live=True, camera_out_width=1280, - camera_out_height=720 - ) + camera_out_height=720, + ), ) @transport.event_handler("on_first_participant_joined") diff --git a/examples/foundational/09a-local-mirror.py b/examples/foundational/09a-local-mirror.py index afc77470d..c3c66569b 100644 --- a/examples/foundational/09a-local-mirror.py +++ b/examples/foundational/09a-local-mirror.py @@ -10,7 +10,13 @@ import tkinter as tk -from pipecat.frames.frames import Frame, InputAudioRawFrame, InputImageRawFrame, OutputAudioRawFrame, OutputImageRawFrame +from pipecat.frames.frames import ( + Frame, + InputAudioRawFrame, + InputImageRawFrame, + OutputAudioRawFrame, + OutputImageRawFrame, +) from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineTask @@ -24,31 +30,33 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) logger.add(sys.stderr, level="DEBUG") -class MirrorProcessor(FrameProcessor): +class MirrorProcessor(FrameProcessor): async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if isinstance(frame, InputAudioRawFrame): - await self.push_frame(OutputAudioRawFrame( - audio=frame.audio, - 
sample_rate=frame.sample_rate, - num_channels=frame.num_channels) + await self.push_frame( + OutputAudioRawFrame( + audio=frame.audio, + sample_rate=frame.sample_rate, + num_channels=frame.num_channels, + ) ) elif isinstance(frame, InputImageRawFrame): - await self.push_frame(OutputImageRawFrame( - image=frame.image, - size=frame.size, - format=frame.format) + await self.push_frame( + OutputImageRawFrame(image=frame.image, size=frame.size, format=frame.format) ) else: await self.push_frame(frame, direction) + async def main(): async with aiohttp.ClientSession() as session: (room_url, token) = await configure(session) @@ -57,8 +65,8 @@ async def main(): tk_root.title("Local Mirror") daily_transport = DailyTransport( - room_url, token, "Test", DailyParams( - audio_in_enabled=True)) + room_url, token, "Test", DailyParams(audio_in_enabled=True) + ) tk_transport = TkLocalTransport( tk_root, @@ -67,7 +75,9 @@ async def main(): camera_out_enabled=True, camera_out_is_live=True, camera_out_width=1280, - camera_out_height=720)) + camera_out_height=720, + ), + ) @daily_transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): diff --git a/examples/foundational/10-wake-phrase.py b/examples/foundational/10-wake-phrase.py index 6e9e106b8..860cda7d0 100644 --- a/examples/foundational/10-wake-phrase.py +++ b/examples/foundational/10-wake-phrase.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -25,6 +27,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -43,8 +46,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -52,9 +55,7 @@ async def main(): voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -67,15 +68,17 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - hey_robot_filter, # Filter out speech not directed at the robot - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + hey_robot_filter, # Filter out speech not directed at the robot + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) diff --git a/examples/foundational/11-sound-effects.py b/examples/foundational/11-sound-effects.py index 21b03bedf..89b7ea93c 100644 --- a/examples/foundational/11-sound-effects.py +++ b/examples/foundational/11-sound-effects.py @@ -35,6 +35,7 @@ from loguru 
import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -53,12 +54,12 @@ filename = os.path.splitext(os.path.basename(full_path))[0] # Open the image and convert it to bytes with wave.open(full_path) as audio_file: - sounds[file] = OutputAudioRawFrame(audio_file.readframes(-1), - audio_file.getframerate(), audio_file.getnchannels()) + sounds[file] = OutputAudioRawFrame( + audio_file.readframes(-1), audio_file.getframerate(), audio_file.getnchannels() + ) class OutboundSoundEffectWrapper(FrameProcessor): - async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -71,7 +72,6 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): class InboundSoundEffectWrapper(FrameProcessor): - async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -95,13 +95,11 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") tts = CartesiaHttpTTSService( api_key=os.getenv("CARTESIA_API_KEY"), @@ -122,18 +120,20 @@ async def main(): fl = FrameLogger("LLM Out") fl2 = FrameLogger("Transcription In") - pipeline = Pipeline([ - transport.input(), - tma_in, - in_sound, - fl2, - llm, - fl, - tts, - out_sound, - transport.output(), - tma_out - ]) + pipeline = Pipeline( + [ + transport.input(), + tma_in, + in_sound, + fl2, + llm, + fl, + tts, + out_sound, + transport.output(), + tma_out, + ] + ) @transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): diff --git a/examples/foundational/12-describe-video.py b/examples/foundational/12-describe-video.py index 11240e8de..6b24190d0 100644 --- a/examples/foundational/12-describe-video.py +++ b/examples/foundational/12-describe-video.py @@ -26,6 +26,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -33,7 +34,6 @@ class UserImageRequester(FrameProcessor): - def __init__(self, participant_id: str | None = None): super().__init__() self._participant_id = participant_id @@ -45,7 +45,9 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if self._participant_id and isinstance(frame, TextFrame): - await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM) + await self.push_frame( + UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM + ) await self.push_frame(frame, direction) @@ -61,8 +63,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) user_response = UserResponseAggregator() @@ -86,15 +88,17 @@ async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) image_requester.set_participant_id(participant["id"]) - pipeline = Pipeline([ - transport.input(), - user_response, - image_requester, - vision_aggregator, - moondream, - tts, - transport.output() - ]) + pipeline = Pipeline( + [ + transport.input(), + user_response, + image_requester, + vision_aggregator, + moondream, + tts, + transport.output(), + ] + ) 
task = PipelineTask(pipeline) @@ -102,5 +106,6 @@ async def on_first_participant_joined(transport, participant): await runner.run(task) + if __name__ == "__main__": asyncio.run(main()) diff --git a/examples/foundational/12a-describe-video-gemini-flash.py b/examples/foundational/12a-describe-video-gemini-flash.py index 395abdbdc..440564d23 100644 --- a/examples/foundational/12a-describe-video-gemini-flash.py +++ b/examples/foundational/12a-describe-video-gemini-flash.py @@ -26,6 +26,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -33,7 +34,6 @@ class UserImageRequester(FrameProcessor): - def __init__(self, participant_id: str | None = None): super().__init__() self._participant_id = participant_id @@ -45,7 +45,9 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if self._participant_id and isinstance(frame, TextFrame): - await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM) + await self.push_frame( + UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM + ) await self.push_frame(frame, direction) @@ -62,8 +64,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) user_response = UserResponseAggregator() @@ -73,8 +75,8 @@ async def main(): vision_aggregator = VisionImageFrameAggregator() google = GoogleLLMService( - model="gemini-1.5-flash-latest", - api_key=os.getenv("GOOGLE_API_KEY")) + model="gemini-1.5-flash-latest", api_key=os.getenv("GOOGLE_API_KEY") + ) tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), @@ -88,15 +90,17 @@ async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) image_requester.set_participant_id(participant["id"]) - pipeline = Pipeline([ - transport.input(), - user_response, - image_requester, - vision_aggregator, - google, - tts, - transport.output() - ]) + pipeline = Pipeline( + [ + transport.input(), + user_response, + image_requester, + vision_aggregator, + google, + tts, + transport.output(), + ] + ) task = PipelineTask(pipeline) @@ -104,5 +108,6 @@ async def on_first_participant_joined(transport, participant): await runner.run(task) + if __name__ == "__main__": asyncio.run(main()) diff --git a/examples/foundational/12b-describe-video-gpt-4o.py b/examples/foundational/12b-describe-video-gpt-4o.py index 384c9aa0c..1d2865004 100644 --- a/examples/foundational/12b-describe-video-gpt-4o.py +++ b/examples/foundational/12b-describe-video-gpt-4o.py @@ -26,6 +26,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -33,7 +34,6 @@ class UserImageRequester(FrameProcessor): - def __init__(self, participant_id: str | None = None): super().__init__() self._participant_id = participant_id @@ -45,7 +45,9 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if self._participant_id and isinstance(frame, TextFrame): - await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM) + await self.push_frame( + UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM + ) await self.push_frame(frame, direction) @@ -61,8 +63,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - 
vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) user_response = UserResponseAggregator() @@ -71,10 +73,7 @@ async def main(): vision_aggregator = VisionImageFrameAggregator() - openai = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o" - ) + openai = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), @@ -88,15 +87,17 @@ async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) image_requester.set_participant_id(participant["id"]) - pipeline = Pipeline([ - transport.input(), - user_response, - image_requester, - vision_aggregator, - openai, - tts, - transport.output() - ]) + pipeline = Pipeline( + [ + transport.input(), + user_response, + image_requester, + vision_aggregator, + openai, + tts, + transport.output(), + ] + ) task = PipelineTask(pipeline) @@ -104,5 +105,6 @@ async def on_first_participant_joined(transport, participant): await runner.run(task) + if __name__ == "__main__": asyncio.run(main()) diff --git a/examples/foundational/12c-describe-video-anthropic.py b/examples/foundational/12c-describe-video-anthropic.py index cc1f14c92..7458adf69 100644 --- a/examples/foundational/12c-describe-video-anthropic.py +++ b/examples/foundational/12c-describe-video-anthropic.py @@ -26,6 +26,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -33,7 +34,6 @@ class UserImageRequester(FrameProcessor): - def __init__(self, participant_id: str | None = None): super().__init__() self._participant_id = participant_id @@ -45,7 +45,9 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if self._participant_id and isinstance(frame, TextFrame): - await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM) + await self.push_frame( + UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM + ) await self.push_frame(frame, direction) @@ -61,8 +63,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) user_response = UserResponseAggregator() @@ -71,9 +73,7 @@ async def main(): vision_aggregator = VisionImageFrameAggregator() - anthropic = AnthropicLLMService( - api_key=os.getenv("ANTHROPIC_API_KEY") - ) + anthropic = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY")) tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), @@ -88,15 +88,17 @@ async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) image_requester.set_participant_id(participant["id"]) - pipeline = Pipeline([ - transport.input(), - user_response, - image_requester, - vision_aggregator, - anthropic, - tts, - transport.output() - ]) + pipeline = Pipeline( + [ + transport.input(), + user_response, + image_requester, + vision_aggregator, + anthropic, + tts, + transport.output(), + ] + ) task = PipelineTask(pipeline) @@ -104,5 +106,6 @@ async def on_first_participant_joined(transport, participant): await runner.run(task) + if __name__ == "__main__": asyncio.run(main()) diff --git a/examples/foundational/13-whisper-transcription.py b/examples/foundational/13-whisper-transcription.py index bb24a80bb..c895cb944 100644 --- a/examples/foundational/13-whisper-transcription.py +++ 
b/examples/foundational/13-whisper-transcription.py @@ -21,6 +21,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -28,7 +29,6 @@ class TranscriptionLogger(FrameProcessor): - async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -40,8 +40,9 @@ async def main(): async with aiohttp.ClientSession() as session: (room_url, _) = await configure(session) - transport = DailyTransport(room_url, None, "Transcription bot", - DailyParams(audio_in_enabled=True)) + transport = DailyTransport( + room_url, None, "Transcription bot", DailyParams(audio_in_enabled=True) + ) stt = WhisperSTTService() diff --git a/examples/foundational/13a-whisper-local.py b/examples/foundational/13a-whisper-local.py index 6bf27aa0a..c1ba37ca9 100644 --- a/examples/foundational/13a-whisper-local.py +++ b/examples/foundational/13a-whisper-local.py @@ -19,6 +19,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -26,7 +27,6 @@ class TranscriptionLogger(FrameProcessor): - async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) diff --git a/examples/foundational/13b-deepgram-transcription.py b/examples/foundational/13b-deepgram-transcription.py index c5961109b..6af3237db 100644 --- a/examples/foundational/13b-deepgram-transcription.py +++ b/examples/foundational/13b-deepgram-transcription.py @@ -22,6 +22,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -29,7 +30,6 @@ class TranscriptionLogger(FrameProcessor): - async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -41,8 +41,9 @@ async def main(): async with aiohttp.ClientSession() as session: (room_url, _) = await configure(session) - transport = DailyTransport(room_url, None, "Transcription bot", - DailyParams(audio_in_enabled=True)) + transport = DailyTransport( + room_url, None, "Transcription bot", DailyParams(audio_in_enabled=True) + ) stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) diff --git a/examples/foundational/14-function-calling.py b/examples/foundational/14-function-calling.py index e4bdd5797..b5aba449c 100644 --- a/examples/foundational/14-function-calling.py +++ b/examples/foundational/14-function-calling.py @@ -26,6 +26,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -52,8 +53,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -61,15 +62,10 @@ async def main(): voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") # Register a function_name of None to get all functions # sent to the same callback with an additional function_name parameter. 
- llm.register_function( - None, - fetch_weather_from_api, - start_callback=start_fetch_weather) + llm.register_function(None, fetch_weather_from_api, start_callback=start_fetch_weather) fl_in = FrameLogger("Inner") fl_out = FrameLogger("Outer") @@ -89,17 +85,15 @@ async def main(): }, "format": { "type": "string", - "enum": [ - "celsius", - "fahrenheit"], + "enum": ["celsius", "fahrenheit"], "description": "The temperature unit to use. Infer this from the users location.", }, }, - "required": [ - "location", - "format"], + "required": ["location", "format"], }, - })] + }, + ) + ] messages = [ { "role": "system", @@ -110,16 +104,18 @@ async def main(): context = OpenAILLMContext(messages, tools) context_aggregator = llm.create_context_aggregator(context) - pipeline = Pipeline([ - fl_in, - transport.input(), - context_aggregator.user(), - llm, - fl_out, - tts, - transport.output(), - context_aggregator.assistant(), - ]) + pipeline = Pipeline( + [ + fl_in, + transport.input(), + context_aggregator.user(), + llm, + fl_out, + tts, + transport.output(), + context_aggregator.assistant(), + ] + ) task = PipelineTask(pipeline) @@ -133,5 +129,6 @@ async def on_first_participant_joined(transport, participant): await runner.run(task) + if __name__ == "__main__": asyncio.run(main()) diff --git a/examples/foundational/15-switch-voices.py b/examples/foundational/15-switch-voices.py index a55dedc83..4feaa4bbf 100644 --- a/examples/foundational/15-switch-voices.py +++ b/examples/foundational/15-switch-voices.py @@ -28,6 +28,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -39,7 +40,11 @@ async def switch_voice(function_name, tool_call_id, args, llm, context, result_callback): global current_voice current_voice = args["voice"] - await result_callback({"voice": f"You are now using your {current_voice} voice. Your responses should now be as if you were a {current_voice}."}) + await result_callback( + { + "voice": f"You are now using your {current_voice} voice. Your responses should now be as if you were a {current_voice}." 
+ } + ) async def news_lady_filter(frame) -> bool: @@ -66,8 +71,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) news_lady = CartesiaTTSService( @@ -85,9 +90,7 @@ async def main(): voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") llm.register_function("switch_voice", switch_voice) tools = [ @@ -106,7 +109,9 @@ async def main(): }, "required": ["voice"], }, - })] + }, + ) + ] messages = [ { "role": "system", @@ -117,18 +122,20 @@ async def main(): context = OpenAILLMContext(messages, tools) context_aggregator = llm.create_context_aggregator(context) - pipeline = Pipeline([ - transport.input(), # Transport user input - context_aggregator.user(), # User responses - llm, # LLM - ParallelPipeline( # TTS (one of the following vocies) - [FunctionFilter(news_lady_filter), news_lady], # News Lady voice - [FunctionFilter(british_lady_filter), british_lady], # British Lady voice - [FunctionFilter(barbershop_man_filter), barbershop_man], # Barbershop Man voice - ), - transport.output(), # Transport bot output - context_aggregator.assistant(), # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + context_aggregator.user(), # User responses + llm, # LLM + ParallelPipeline( # TTS (one of the following vocies) + [FunctionFilter(news_lady_filter), news_lady], # News Lady voice + [FunctionFilter(british_lady_filter), british_lady], # British Lady voice + [FunctionFilter(barbershop_man_filter), barbershop_man], # Barbershop Man voice + ), + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) @@ -139,7 +146,9 @@ async def on_first_participant_joined(transport, participant): messages.append( { "role": "system", - "content": f"Please introduce yourself to the user and let them know the voices you can do. Your initial responses should be as if you were a {current_voice}."}) + "content": f"Please introduce yourself to the user and let them know the voices you can do. 
Your initial responses should be as if you were a {current_voice}.", + } + ) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/15a-switch-languages.py b/examples/foundational/15a-switch-languages.py index 0dde985ef..8c47ad963 100644 --- a/examples/foundational/15a-switch-languages.py +++ b/examples/foundational/15a-switch-languages.py @@ -29,6 +29,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -64,8 +65,8 @@ async def main(): audio_out_enabled=True, vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), - vad_audio_passthrough=True - ) + vad_audio_passthrough=True, + ), ) stt = WhisperSTTService(model=Model.LARGE) @@ -80,9 +81,7 @@ async def main(): voice_id="846d6cb0-2301-48b6-9683-48f5618ea2f6", # Spanish-speaking Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") llm.register_function("switch_language", switch_language) tools = [ @@ -101,7 +100,9 @@ async def main(): }, "required": ["language"], }, - })] + }, + ) + ] messages = [ { "role": "system", @@ -112,18 +113,20 @@ async def main(): context = OpenAILLMContext(messages, tools) context_aggregator = llm.create_context_aggregator(context) - pipeline = Pipeline([ - transport.input(), # Transport user input - stt, # STT - context_aggregator.user(), # User responses - llm, # LLM - ParallelPipeline( # TTS (bot will speak the chosen language) - [FunctionFilter(english_filter), english_tts], # English - [FunctionFilter(spanish_filter), spanish_tts], # Spanish - ), - transport.output(), # Transport bot output - context_aggregator.assistant() # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + context_aggregator.user(), # User responses + llm, # LLM + ParallelPipeline( # TTS (bot will speak the chosen language) + [FunctionFilter(english_filter), english_tts], # English + [FunctionFilter(spanish_filter), spanish_tts], # Spanish + ), + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) @@ -134,7 +137,9 @@ async def on_first_participant_joined(transport, participant): messages.append( { "role": "system", - "content": f"Please introduce yourself to the user and let them know the languages you speak. Your initial responses should be in {current_language}."}) + "content": f"Please introduce yourself to the user and let them know the languages you speak. 
Your initial responses should be in {current_language}.", + } + ) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/16-gpu-container-local-bot.py b/examples/foundational/16-gpu-container-local-bot.py index 7c0af45f7..06bf45195 100644 --- a/examples/foundational/16-gpu-container-local-bot.py +++ b/examples/foundational/16-gpu-container-local-bot.py @@ -14,10 +14,16 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.deepgram import DeepgramTTSService from pipecat.services.openai import OpenAILLMService -from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportMessageFrame +from pipecat.transports.services.daily import ( + DailyParams, + DailyTransport, + DailyTransportMessageFrame, +) from pipecat.vad.silero import SileroVADAnalyzer from runner import configure @@ -25,6 +31,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -43,15 +50,15 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = DeepgramTTSService( aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-asteria-en", - base_url="http://0.0.0.0:8080/v1/speak" + base_url="http://0.0.0.0:8080/v1/speak", ) llm = OpenAILLMService( @@ -60,7 +67,7 @@ async def main(): # model="gpt-4o" # Or, to use a local vLLM (or similar) api server model="meta-llama/Meta-Llama-3-8B-Instruct", - base_url="http://0.0.0.0:8000/v1" + base_url="http://0.0.0.0:8000/v1", ) messages = [ @@ -73,14 +80,16 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Transport user input - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True)) @@ -93,8 +102,7 @@ async def on_participant_joined(transport, participant): # When the first participant joins, the bot should introduce itself. @transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): - messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) # Handle "latency-ping" messages. 
The client will send app messages that look like @@ -111,14 +119,18 @@ async def on_app_message(transport, message, sender): logger.debug(f"Received latency ping app message: {message}") ts = message["latency-ping"]["ts"] # Send immediately - transport.output().send_message(DailyTransportMessageFrame( - message={"latency-pong-msg-handler": {"ts": ts}}, - participant_id=sender)) + transport.output().send_message( + DailyTransportMessageFrame( + message={"latency-pong-msg-handler": {"ts": ts}}, participant_id=sender + ) + ) # And push to the pipeline for the Daily transport.output to send await tma_in.push_frame( DailyTransportMessageFrame( message={"latency-pong-pipeline-delivery": {"ts": ts}}, - participant_id=sender)) + participant_id=sender, + ) + ) except Exception as e: logger.debug(f"message handling error: {e} - {message}") diff --git a/examples/foundational/17-detect-user-idle.py b/examples/foundational/17-detect-user-idle.py index 66fcfb200..91835f8b3 100644 --- a/examples/foundational/17-detect-user-idle.py +++ b/examples/foundational/17-detect-user-idle.py @@ -14,7 +14,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.processors.user_idle_processor import UserIdleProcessor from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService @@ -26,6 +28,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -44,8 +47,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -53,9 +56,7 @@ async def main(): voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -69,33 +70,41 @@ async def main(): async def user_idle_callback(user_idle: UserIdleProcessor): messages.append( - {"role": "system", "content": "Ask the user if they are still there and try to prompt for some input, but be short."}) + { + "role": "system", + "content": "Ask the user if they are still there and try to prompt for some input, but be short.", + } + ) await user_idle.push_frame(LLMMessagesFrame(messages)) user_idle = UserIdleProcessor(callback=user_idle_callback, timeout=5.0) - pipeline = Pipeline([ - transport.input(), # Transport user input - user_idle, # Idle user check-in - tma_in, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - tma_out # Assistant spoken responses - ]) - - task = PipelineTask(pipeline, PipelineParams( - allow_interruptions=True, - enable_metrics=True, - report_only_initial_ttfb=True, - )) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + user_idle, # Idle user check-in + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + PipelineParams( + allow_interruptions=True, + enable_metrics=True, + report_only_initial_ttfb=True, + ), + ) @transport.event_handler("on_first_participant_joined") async def 
on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. - messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() diff --git a/examples/foundational/18-gstreamer-filesrc.py b/examples/foundational/18-gstreamer-filesrc.py index 4b04dcf92..cdb187f66 100644 --- a/examples/foundational/18-gstreamer-filesrc.py +++ b/examples/foundational/18-gstreamer-filesrc.py @@ -20,6 +20,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -29,12 +30,7 @@ async def main(): async with aiohttp.ClientSession() as session: parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample") - parser.add_argument( - "-i", - "--input", - type=str, - required=True, - help="Input video file") + parser.add_argument("-i", "--input", type=str, required=True, help="Input video file") (room_url, _, args) = await configure_with_args(session, parser) @@ -49,7 +45,7 @@ async def main(): camera_out_width=1280, camera_out_height=720, camera_out_is_live=True, - ) + ), ) gst = GStreamerPipelineSource( @@ -59,13 +55,15 @@ async def main(): video_height=720, audio_sample_rate=16000, audio_channels=1, - ) + ), ) - pipeline = Pipeline([ - gst, # GStreamer file source - transport.output(), # Transport bot output - ]) + pipeline = Pipeline( + [ + gst, # GStreamer file source + transport.output(), # Transport bot output + ] + ) task = PipelineTask(pipeline) diff --git a/examples/foundational/18a-gstreamer-videotestsrc.py b/examples/foundational/18a-gstreamer-videotestsrc.py index 7c71e06ce..9e5977348 100644 --- a/examples/foundational/18a-gstreamer-videotestsrc.py +++ b/examples/foundational/18a-gstreamer-videotestsrc.py @@ -19,6 +19,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -38,20 +39,22 @@ async def main(): camera_out_width=1280, camera_out_height=720, camera_out_is_live=True, - ) + ), ) gst = GStreamerPipelineSource( - pipeline="videotestsrc ! capsfilter caps=\"video/x-raw,width=1280,height=720,framerate=30/1\"", + pipeline='videotestsrc ! 
capsfilter caps="video/x-raw,width=1280,height=720,framerate=30/1"', out_params=GStreamerPipelineSource.OutputParams( - video_width=1280, - video_height=720, - clock_sync=False)) - - pipeline = Pipeline([ - gst, # GStreamer file source - transport.output(), # Transport bot output - ]) + video_width=1280, video_height=720, clock_sync=False + ), + ) + + pipeline = Pipeline( + [ + gst, # GStreamer file source + transport.output(), # Transport bot output + ] + ) task = PipelineTask(pipeline) diff --git a/examples/foundational/19a-tools-anthropic.py b/examples/foundational/19a-tools-anthropic.py index 4cf42c2a2..05042c65b 100644 --- a/examples/foundational/19a-tools-anthropic.py +++ b/examples/foundational/19a-tools-anthropic.py @@ -23,6 +23,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -46,8 +47,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -56,8 +57,7 @@ async def main(): ) llm = AnthropicLLMService( - api_key=os.getenv("ANTHROPIC_API_KEY"), - model="claude-3-5-sonnet-20240620" + api_key=os.getenv("ANTHROPIC_API_KEY"), model="claude-3-5-sonnet-20240620" ) llm.register_function("get_weather", get_weather) @@ -90,18 +90,20 @@ async def main(): context = OpenAILLMContext(messages, tools) context_aggregator = llm.create_context_aggregator(context) - pipeline = Pipeline([ - transport.input(), # Transport user input - context_aggregator.user(), # User spoken responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - context_aggregator.assistant(), # Assistant spoken responses and tool context - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + context_aggregator.user(), # User spoken responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses and tool context + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True)) - @ transport.event_handler("on_first_participant_joined") + @transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
diff --git a/examples/foundational/19b-tools-video-anthropic.py b/examples/foundational/19b-tools-video-anthropic.py index d9446d8e2..8a8110487 100644 --- a/examples/foundational/19b-tools-video-anthropic.py +++ b/examples/foundational/19b-tools-video-anthropic.py @@ -23,6 +23,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -55,8 +56,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -67,7 +68,7 @@ async def main(): llm = AnthropicLLMService( api_key=os.getenv("ANTHROPIC_API_KEY"), model="claude-3-5-sonnet-20240620", - enable_prompt_caching_beta=True + enable_prompt_caching_beta=True, ) llm.register_function("get_weather", get_weather) llm.register_function("get_image", get_image) @@ -100,7 +101,7 @@ async def main(): }, "required": ["question"], }, - } + }, ] # todo: test with very short initial user message @@ -134,28 +135,28 @@ async def main(): "type": "text", "text": system_prompt, } - ] + ], }, - { - "role": "user", - "content": "Start the conversation by introducing yourself." - }] + {"role": "user", "content": "Start the conversation by introducing yourself."}, + ] context = OpenAILLMContext(messages, tools) context_aggregator = llm.create_context_aggregator(context) - pipeline = Pipeline([ - transport.input(), # Transport user input - context_aggregator.user(), # User speech to text - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - context_aggregator.assistant(), # Assistant spoken responses and tool context - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + context_aggregator.user(), # User speech to text + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses and tool context + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True)) - @ transport.event_handler("on_first_participant_joined") + @transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): global video_participant_id video_participant_id = participant["id"] diff --git a/examples/foundational/19c-tools-togetherai.py b/examples/foundational/19c-tools-togetherai.py index 329ecce68..f8e63ef75 100644 --- a/examples/foundational/19c-tools-togetherai.py +++ b/examples/foundational/19c-tools-togetherai.py @@ -25,6 +25,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -32,12 +33,8 @@ async def get_current_weather( - function_name, - tool_call_id, - arguments, - llm, - context, - result_callback): + function_name, tool_call_id, arguments, llm, context, result_callback +): logger.debug("IN get_current_weather") location = arguments["location"] await result_callback(f"The weather in {location} is currently 72 degrees and sunny.") @@ -55,8 +52,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -104,26 +101,28 @@ async def main(): """ - messages = [{"role": "system", - "content": system_prompt}, - {"role": "user", - "content": "Wait for the user to say something."}] + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": "Wait for the user to say 
something."}, + ] context = OpenAILLMContext(messages) context_aggregator = llm.create_context_aggregator(context) - pipeline = Pipeline([ - transport.input(), # Transport user input - context_aggregator.user(), # User speech to text - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - context_aggregator.assistant(), # Assistant spoken responses and tool context - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport user input + context_aggregator.user(), # User speech to text + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses and tool context + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True)) - @ transport.event_handler("on_first_participant_joined") + @transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. diff --git a/examples/foundational/runner.py b/examples/foundational/runner.py index 068174eec..13c4ff076 100644 --- a/examples/foundational/runner.py +++ b/examples/foundational/runner.py @@ -17,16 +17,13 @@ async def configure(aiohttp_session: aiohttp.ClientSession): async def configure_with_args( - aiohttp_session: aiohttp.ClientSession, - parser: argparse.ArgumentParser | None = None): + aiohttp_session: aiohttp.ClientSession, parser: argparse.ArgumentParser | None = None +): if not parser: parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample") parser.add_argument( - "-u", - "--url", - type=str, - required=False, - help="URL of the Daily room to join") + "-u", "--url", type=str, required=False, help="URL of the Daily room to join" + ) parser.add_argument( "-k", "--apikey", @@ -42,15 +39,19 @@ async def configure_with_args( if not url: raise Exception( - "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.") + "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL." + ) if not key: - raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.") + raise Exception( + "No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers." + ) daily_rest_helper = DailyRESTHelper( daily_api_key=key, daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), - aiohttp_session=aiohttp_session) + aiohttp_session=aiohttp_session, + ) # Create a meeting token for the given room with an expiration 1 hour in # the future. 
diff --git a/examples/moondream-chatbot/bot.py b/examples/moondream-chatbot/bot.py index d14a5f016..86456d40f 100644 --- a/examples/moondream-chatbot/bot.py +++ b/examples/moondream-chatbot/bot.py @@ -43,6 +43,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -60,11 +61,7 @@ # Get the filename without the extension to use as the dictionary key # Open the image and convert it to bytes with Image.open(full_path) as img: - sprites.append(OutputImageRawFrame( - image=img.tobytes(), - size=img.size, - format=img.format) - ) + sprites.append(OutputImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)) flipped = sprites[::-1] sprites.extend(flipped) @@ -110,7 +107,9 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if self.participant_id and isinstance(frame, TextFrame): if frame.text == user_request_answer: - await self.push_frame(UserImageRequestFrame(self.participant_id), FrameDirection.UPSTREAM) + await self.push_frame( + UserImageRequestFrame(self.participant_id), FrameDirection.UPSTREAM + ) await self.push_frame(TextFrame("Describe the image in a short sentence.")) elif isinstance(frame, UserImageRawFrame): await self.push_frame(frame) @@ -154,8 +153,8 @@ async def main(): camera_out_height=576, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -163,9 +162,7 @@ async def main(): voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") ta = TalkingAnimation() @@ -188,17 +185,17 @@ async def main(): ura = LLMUserResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), - ura, - llm, - ParallelPipeline( - [sa, ir, va, moondream], - [tf, imgf]), - tts, - ta, - transport.output() - ]) + pipeline = Pipeline( + [ + transport.input(), + ura, + llm, + ParallelPipeline([sa, ir, va, moondream], [tf, imgf]), + tts, + ta, + transport.output(), + ] + ) task = PipelineTask(pipeline) await task.queue_frame(quiet_frame) diff --git a/examples/moondream-chatbot/runner.py b/examples/moondream-chatbot/runner.py index 7507d28d6..3df3ee81f 100644 --- a/examples/moondream-chatbot/runner.py +++ b/examples/moondream-chatbot/runner.py @@ -14,11 +14,8 @@ async def configure(aiohttp_session: aiohttp.ClientSession): parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample") parser.add_argument( - "-u", - "--url", - type=str, - required=False, - help="URL of the Daily room to join") + "-u", "--url", type=str, required=False, help="URL of the Daily room to join" + ) parser.add_argument( "-k", "--apikey", @@ -34,15 +31,18 @@ async def configure(aiohttp_session: aiohttp.ClientSession): if not url: raise Exception( - "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.") + "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL." + ) if not key: - raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.") + raise Exception( + "No Daily API key specified. 
use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers." + ) daily_rest_helper = DailyRESTHelper( daily_api_key=key, daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), - aiohttp_session=aiohttp_session + aiohttp_session=aiohttp_session, ) # Create a meeting token for the given room with an expiration 1 hour in diff --git a/examples/moondream-chatbot/server.py b/examples/moondream-chatbot/server.py index d758e67f9..e3523851e 100644 --- a/examples/moondream-chatbot/server.py +++ b/examples/moondream-chatbot/server.py @@ -38,13 +38,14 @@ async def lifespan(app: FastAPI): aiohttp_session = aiohttp.ClientSession() daily_helpers["rest"] = DailyRESTHelper( daily_api_key=os.getenv("DAILY_API_KEY", ""), - daily_api_url=os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'), - aiohttp_session=aiohttp_session + daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), + aiohttp_session=aiohttp_session, ) yield await aiohttp_session.close() cleanup() + app = FastAPI(lifespan=lifespan) app.add_middleware( @@ -65,37 +66,34 @@ async def start_agent(request: Request): if not room.url: raise HTTPException( status_code=500, - detail="Missing 'room' property in request data. Cannot start agent without a target room!") + detail="Missing 'room' property in request data. Cannot start agent without a target room!", + ) # Check if there is already an existing process running in this room num_bots_in_room = sum( - 1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None) + 1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None + ) if num_bots_in_room >= MAX_BOTS_PER_ROOM: - raise HTTPException( - status_code=500, detail=f"Max bot limited reach for room: {room.url}") + raise HTTPException(status_code=500, detail=f"Max bot limited reach for room: {room.url}") # Get the token for the room token = await daily_helpers["rest"].get_token(room.url) if not token: - raise HTTPException( - status_code=500, detail=f"Failed to get token for room: {room.url}") + raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room.url}") # Spawn a new agent, and join the user session # Note: this is mostly for demonstration purposes (refer to 'deployment' in README) try: proc = subprocess.Popen( - [ - f"python3 -m bot -u {room.url} -t {token}" - ], + [f"python3 -m bot -u {room.url} -t {token}"], shell=True, bufsize=1, - cwd=os.path.dirname(os.path.abspath(__file__)) + cwd=os.path.dirname(os.path.abspath(__file__)), ) bot_procs[proc.pid] = (proc, room.url) except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to start subprocess: {e}") + raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}") return RedirectResponse(room.url) @@ -107,8 +105,7 @@ def get_status(pid: int): # If the subprocess doesn't exist, return an error if not proc: - raise HTTPException( - status_code=404, detail=f"Bot with process id: {pid} not found") + raise HTTPException(status_code=404, detail=f"Bot with process id: {pid} not found") # Check the status of the subprocess if proc[0].poll() is None: @@ -125,14 +122,10 @@ def get_status(pid: int): default_host = os.getenv("HOST", "0.0.0.0") default_port = int(os.getenv("FAST_API_PORT", "7860")) - parser = argparse.ArgumentParser( - description="Daily Moondream FastAPI server") - parser.add_argument("--host", type=str, - default=default_host, help="Host 
address") - parser.add_argument("--port", type=int, - default=default_port, help="Port number") - parser.add_argument("--reload", action="store_true", - help="Reload code on change") + parser = argparse.ArgumentParser(description="Daily Moondream FastAPI server") + parser.add_argument("--host", type=str, default=default_host, help="Host address") + parser.add_argument("--port", type=int, default=default_port, help="Port number") + parser.add_argument("--reload", action="store_true", help="Reload code on change") config = parser.parse_args() diff --git a/examples/patient-intake/bot.py b/examples/patient-intake/bot.py index 33ca9e26d..52f45f75e 100644 --- a/examples/patient-intake/bot.py +++ b/examples/patient-intake/bot.py @@ -26,6 +26,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -49,41 +50,44 @@ filename = os.path.splitext(os.path.basename(full_path))[0] # Open the sound and convert it to bytes with wave.open(full_path) as audio_file: - sounds[file] = OutputAudioRawFrame(audio_file.readframes(-1), - audio_file.getframerate(), - audio_file.getnchannels()) + sounds[file] = OutputAudioRawFrame( + audio_file.readframes(-1), audio_file.getframerate(), audio_file.getnchannels() + ) class IntakeProcessor: - def __init__(self, context: OpenAILLMContext): print(f"Initializing context from IntakeProcessor") - context.add_message({"role": "system", "content": "You are Jessica, an agent for a company called Tri-County Health Services. Your job is to collect important information from the user before their doctor visit. You're talking to Chad Bailey. You should address the user by their first name and be polite and professional. You're not a medical professional, so you shouldn't provide any advice. Keep your responses short. Your job is to collect information to give to a doctor. Don't make assumptions about what values to plug into functions. Ask for clarification if a user response is ambiguous. Start by introducing yourself. Then, ask the user to confirm their identity by telling you their birthday, including the year. When they answer with their birthday, call the verify_birthday function."}) - context.set_tools([ + context.add_message( { - "type": "function", - "function": { - "name": "verify_birthday", - "description": "Use this function to verify the user has provided their correct birthday.", - "parameters": { - "type": "object", - "properties": { - "birthday": { - "type": "string", - "description": "The user's birthdate, including the year. The user can provide it in any format, but convert it to YYYY-MM-DD format to call this function.", - }}, + "role": "system", + "content": "You are Jessica, an agent for a company called Tri-County Health Services. Your job is to collect important information from the user before their doctor visit. You're talking to Chad Bailey. You should address the user by their first name and be polite and professional. You're not a medical professional, so you shouldn't provide any advice. Keep your responses short. Your job is to collect information to give to a doctor. Don't make assumptions about what values to plug into functions. Ask for clarification if a user response is ambiguous. Start by introducing yourself. Then, ask the user to confirm their identity by telling you their birthday, including the year. 
When they answer with their birthday, call the verify_birthday function.", + } + ) + context.set_tools( + [ + { + "type": "function", + "function": { + "name": "verify_birthday", + "description": "Use this function to verify the user has provided their correct birthday.", + "parameters": { + "type": "object", + "properties": { + "birthday": { + "type": "string", + "description": "The user's birthdate, including the year. The user can provide it in any format, but convert it to YYYY-MM-DD format to call this function.", + } + }, + }, }, - }, - }]) + } + ] + ) async def verify_birthday( - self, - function_name, - tool_call_id, - args, - llm, - context, - result_callback): + self, function_name, tool_call_id, args, llm, context, result_callback + ): if args["birthday"] == "1983-01-01": context.set_tools( [ @@ -110,18 +114,35 @@ async def verify_birthday( }, }, }, - }}, + } + }, }, }, - }]) + } + ] + ) # It's a bit weird to push this to the LLM, but it gets it into the pipeline # await llm.push_frame(sounds["ding2.wav"], FrameDirection.DOWNSTREAM) # We don't need the function call in the context, so just return a new # system message and let the framework re-prompt - await result_callback([{"role": "system", "content": "Next, thank the user for confirming their identity, then ask the user to list their current prescriptions. Each prescription needs to have a medication name and a dosage. Do not call the list_prescriptions function with any unknown dosages."}]) + await result_callback( + [ + { + "role": "system", + "content": "Next, thank the user for confirming their identity, then ask the user to list their current prescriptions. Each prescription needs to have a medication name and a dosage. Do not call the list_prescriptions function with any unknown dosages.", + } + ] + ) else: # The user provided an incorrect birthday; ask them to try again - await result_callback([{"role": "system", "content": "The user provided an incorrect birthday. Ask them for their birthday again. When they answer, call the verify_birthday function."}]) + await result_callback( + [ + { + "role": "system", + "content": "The user provided an incorrect birthday. Ask them for their birthday again. When they answer, call the verify_birthday function.", + } + ] + ) async def start_prescriptions(self, function_name, llm, context): print(f"!!! doing start prescriptions") @@ -144,16 +165,22 @@ async def start_prescriptions(self, function_name, llm, context): "name": { "type": "string", "description": "What the user is allergic to", - }}, + } + }, }, - }}, + } + }, }, }, - }]) + } + ] + ) context.add_message( { "role": "system", - "content": "Next, ask the user if they have any allergies. Once they have listed their allergies or confirmed they don't have any, call the list_allergies function."}) + "content": "Next, ask the user if they have any allergies. Once they have listed their allergies or confirmed they don't have any, call the list_allergies function.", + } + ) print(f"!!! about to await llm process frame in start prescrpitions") await llm.process_frame(OpenAILLMContextFrame(context), FrameDirection.DOWNSTREAM) print(f"!!! past await process frame in start prescriptions") @@ -179,17 +206,22 @@ async def start_allergies(self, function_name, llm, context): "name": { "type": "string", "description": "The user's medical condition", - }}, + } + }, }, - }}, + } + }, }, }, }, - ]) + ] + ) context.add_message( { "role": "system", - "content": "Now ask the user if they have any medical conditions the doctor should know about. 
Once they've answered the question, call the list_conditions function."}) + "content": "Now ask the user if they have any medical conditions the doctor should know about. Once they've answered the question, call the list_conditions function.", + } + ) await llm.process_frame(OpenAILLMContextFrame(context), FrameDirection.DOWNSTREAM) async def start_conditions(self, function_name, llm, context): @@ -213,24 +245,31 @@ async def start_conditions(self, function_name, llm, context): "name": { "type": "string", "description": "The user's reason for visiting the doctor", - }}, + } + }, }, - }}, + } + }, }, }, - }]) + } + ] + ) context.add_message( { "role": "system", - "content": "Finally, ask the user the reason for their doctor visit today. Once they answer, call the list_visit_reasons function."}) + "content": "Finally, ask the user the reason for their doctor visit today. Once they answer, call the list_visit_reasons function.", + } + ) await llm.process_frame(OpenAILLMContextFrame(context), FrameDirection.DOWNSTREAM) async def start_visit_reasons(self, function_name, llm, context): print("!!! doing start visit reasons") # move to finish call context.set_tools([]) - context.add_message({"role": "system", - "content": "Now, thank the user and end the conversation."}) + context.add_message( + {"role": "system", "content": "Now, thank the user and end the conversation."} + ) await llm.process_frame(OpenAILLMContextFrame(context), FrameDirection.DOWNSTREAM) async def save_data(self, function_name, tool_call_id, args, llm, context, result_callback): @@ -261,7 +300,7 @@ async def main(): # tier="nova", # model="2-general" # ) - ) + ), ) tts = CartesiaTTSService( @@ -274,9 +313,7 @@ async def main(): # voice_id="846d6cb0-2301-48b6-9683-48f5618ea2f6", # Spanish-speaking Lady # ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [] context = OpenAILLMContext(messages=messages) @@ -285,33 +322,31 @@ async def main(): intake = IntakeProcessor(context) llm.register_function("verify_birthday", intake.verify_birthday) llm.register_function( - "list_prescriptions", - intake.save_data, - start_callback=intake.start_prescriptions) + "list_prescriptions", intake.save_data, start_callback=intake.start_prescriptions + ) llm.register_function( - "list_allergies", - intake.save_data, - start_callback=intake.start_allergies) + "list_allergies", intake.save_data, start_callback=intake.start_allergies + ) llm.register_function( - "list_conditions", - intake.save_data, - start_callback=intake.start_conditions) + "list_conditions", intake.save_data, start_callback=intake.start_conditions + ) llm.register_function( - "list_visit_reasons", - intake.save_data, - start_callback=intake.start_visit_reasons) + "list_visit_reasons", intake.save_data, start_callback=intake.start_visit_reasons + ) fl = FrameLogger("LLM Output") - pipeline = Pipeline([ - transport.input(), # Transport input - context_aggregator.user(), # User responses - llm, # LLM - fl, # Frame logger - tts, # TTS - transport.output(), # Transport output - context_aggregator.assistant(), # Assistant responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Transport input + context_aggregator.user(), # User responses + llm, # LLM + fl, # Frame logger + tts, # TTS + transport.output(), # Transport output + context_aggregator.assistant(), # Assistant responses + ] + ) task = PipelineTask(pipeline, 
PipelineParams(allow_interruptions=False)) diff --git a/examples/patient-intake/runner.py b/examples/patient-intake/runner.py index 7242c4f27..3df3ee81f 100644 --- a/examples/patient-intake/runner.py +++ b/examples/patient-intake/runner.py @@ -14,11 +14,8 @@ async def configure(aiohttp_session: aiohttp.ClientSession): parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample") parser.add_argument( - "-u", - "--url", - type=str, - required=False, - help="URL of the Daily room to join") + "-u", "--url", type=str, required=False, help="URL of the Daily room to join" + ) parser.add_argument( "-k", "--apikey", @@ -34,15 +31,19 @@ async def configure(aiohttp_session: aiohttp.ClientSession): if not url: raise Exception( - "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.") + "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL." + ) if not key: - raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.") + raise Exception( + "No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers." + ) daily_rest_helper = DailyRESTHelper( daily_api_key=key, daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), - aiohttp_session=aiohttp_session) + aiohttp_session=aiohttp_session, + ) # Create a meeting token for the given room with an expiration 1 hour in # the future. diff --git a/examples/patient-intake/server.py b/examples/patient-intake/server.py index 639587894..c0fc9c97f 100644 --- a/examples/patient-intake/server.py +++ b/examples/patient-intake/server.py @@ -38,13 +38,14 @@ async def lifespan(app: FastAPI): aiohttp_session = aiohttp.ClientSession() daily_helpers["rest"] = DailyRESTHelper( daily_api_key=os.getenv("DAILY_API_KEY", ""), - daily_api_url=os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'), - aiohttp_session=aiohttp_session + daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), + aiohttp_session=aiohttp_session, ) yield await aiohttp_session.close() cleanup() + app = FastAPI(lifespan=lifespan) app.add_middleware( @@ -65,37 +66,34 @@ async def start_agent(request: Request): if not room.url: raise HTTPException( status_code=500, - detail="Missing 'room' property in request data. Cannot start agent without a target room!") + detail="Missing 'room' property in request data. 
Cannot start agent without a target room!", + ) # Check if there is already an existing process running in this room num_bots_in_room = sum( - 1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None) + 1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None + ) if num_bots_in_room >= MAX_BOTS_PER_ROOM: - raise HTTPException( - status_code=500, detail=f"Max bot limited reach for room: {room.url}") + raise HTTPException(status_code=500, detail=f"Max bot limited reach for room: {room.url}") # Get the token for the room token = await daily_helpers["rest"].get_token(room.url) if not token: - raise HTTPException( - status_code=500, detail=f"Failed to get token for room: {room.url}") + raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room.url}") # Spawn a new agent, and join the user session # Note: this is mostly for demonstration purposes (refer to 'deployment' in README) try: proc = subprocess.Popen( - [ - f"python3 -m bot -u {room.url} -t {token}" - ], + [f"python3 -m bot -u {room.url} -t {token}"], shell=True, bufsize=1, - cwd=os.path.dirname(os.path.abspath(__file__)) + cwd=os.path.dirname(os.path.abspath(__file__)), ) bot_procs[proc.pid] = (proc, room.url) except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to start subprocess: {e}") + raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}") return RedirectResponse(room.url) @@ -107,8 +105,7 @@ def get_status(pid: int): # If the subprocess doesn't exist, return an error if not proc: - raise HTTPException( - status_code=404, detail=f"Bot with process id: {pid} not found") + raise HTTPException(status_code=404, detail=f"Bot with process id: {pid} not found") # Check the status of the subprocess if proc[0].poll() is None: @@ -125,14 +122,10 @@ def get_status(pid: int): default_host = os.getenv("HOST", "0.0.0.0") default_port = int(os.getenv("FAST_API_PORT", "7860")) - parser = argparse.ArgumentParser( - description="Daily Storyteller FastAPI server") - parser.add_argument("--host", type=str, - default=default_host, help="Host address") - parser.add_argument("--port", type=int, - default=default_port, help="Port number") - parser.add_argument("--reload", action="store_true", - help="Reload code on change") + parser = argparse.ArgumentParser(description="Daily Storyteller FastAPI server") + parser.add_argument("--host", type=str, default=default_host, help="Host address") + parser.add_argument("--port", type=int, default=default_port, help="Port number") + parser.add_argument("--reload", action="store_true", help="Reload code on change") config = parser.parse_args() print(f"to join a test room, visit http://localhost:{config.port}/start") diff --git a/examples/simple-chatbot/bot.py b/examples/simple-chatbot/bot.py index f179dfeb5..b06721d4c 100644 --- a/examples/simple-chatbot/bot.py +++ b/examples/simple-chatbot/bot.py @@ -14,14 +14,17 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.frames.frames import ( OutputImageRawFrame, SpriteFrame, Frame, LLMMessagesFrame, TTSAudioRawFrame, - TTSStoppedFrame + TTSStoppedFrame, ) from 
pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.elevenlabs import ElevenLabsTTSService @@ -34,6 +37,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -49,11 +53,7 @@ # Get the filename without the extension to use as the dictionary key # Open the image and convert it to bytes with Image.open(full_path) as img: - sprites.append(OutputImageRawFrame( - image=img.tobytes(), - size=img.size, - format=img.format) - ) + sprites.append(OutputImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)) flipped = sprites[::-1] sprites.extend(flipped) @@ -111,7 +111,7 @@ async def main(): # tier="nova", # model="2-general" # ) - ) + ), ) tts = ElevenLabsTTSService( @@ -120,7 +120,6 @@ async def main(): # English # voice_id="pNInz6obpgDQGcFmaJgB", - # # Spanish # @@ -128,9 +127,7 @@ async def main(): # voice_id="gD1IexrzCvsXPHUuT0s3", ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") messages = [ { @@ -139,7 +136,6 @@ async def main(): # English # "content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.", - # # Spanish # @@ -152,15 +148,17 @@ async def main(): ta = TalkingAnimation() - pipeline = Pipeline([ - transport.input(), - user_response, - llm, - tts, - ta, - transport.output(), - assistant_response, - ]) + pipeline = Pipeline( + [ + transport.input(), + user_response, + llm, + tts, + ta, + transport.output(), + assistant_response, + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) await task.queue_frame(quiet_frame) diff --git a/examples/simple-chatbot/runner.py b/examples/simple-chatbot/runner.py index 7507d28d6..3df3ee81f 100644 --- a/examples/simple-chatbot/runner.py +++ b/examples/simple-chatbot/runner.py @@ -14,11 +14,8 @@ async def configure(aiohttp_session: aiohttp.ClientSession): parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample") parser.add_argument( - "-u", - "--url", - type=str, - required=False, - help="URL of the Daily room to join") + "-u", "--url", type=str, required=False, help="URL of the Daily room to join" + ) parser.add_argument( "-k", "--apikey", @@ -34,15 +31,18 @@ async def configure(aiohttp_session: aiohttp.ClientSession): if not url: raise Exception( - "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.") + "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL." + ) if not key: - raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.") + raise Exception( + "No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers." 
+ ) daily_rest_helper = DailyRESTHelper( daily_api_key=key, daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), - aiohttp_session=aiohttp_session + aiohttp_session=aiohttp_session, ) # Create a meeting token for the given room with an expiration 1 hour in diff --git a/examples/simple-chatbot/server.py b/examples/simple-chatbot/server.py index d54452d10..5240c254f 100644 --- a/examples/simple-chatbot/server.py +++ b/examples/simple-chatbot/server.py @@ -38,13 +38,14 @@ async def lifespan(app: FastAPI): aiohttp_session = aiohttp.ClientSession() daily_helpers["rest"] = DailyRESTHelper( daily_api_key=os.getenv("DAILY_API_KEY", ""), - daily_api_url=os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'), - aiohttp_session=aiohttp_session + daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), + aiohttp_session=aiohttp_session, ) yield await aiohttp_session.close() cleanup() + app = FastAPI(lifespan=lifespan) app.add_middleware( @@ -65,37 +66,34 @@ async def start_agent(request: Request): if not room.url: raise HTTPException( status_code=500, - detail="Missing 'room' property in request data. Cannot start agent without a target room!") + detail="Missing 'room' property in request data. Cannot start agent without a target room!", + ) # Check if there is already an existing process running in this room num_bots_in_room = sum( - 1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None) + 1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None + ) if num_bots_in_room >= MAX_BOTS_PER_ROOM: - raise HTTPException( - status_code=500, detail=f"Max bot limited reach for room: {room.url}") + raise HTTPException(status_code=500, detail=f"Max bot limited reach for room: {room.url}") # Get the token for the room token = await daily_helpers["rest"].get_token(room.url) if not token: - raise HTTPException( - status_code=500, detail=f"Failed to get token for room: {room.url}") + raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room.url}") # Spawn a new agent, and join the user session # Note: this is mostly for demonstration purposes (refer to 'deployment' in README) try: proc = subprocess.Popen( - [ - f"python3 -m bot -u {room.url} -t {token}" - ], + [f"python3 -m bot -u {room.url} -t {token}"], shell=True, bufsize=1, - cwd=os.path.dirname(os.path.abspath(__file__)) + cwd=os.path.dirname(os.path.abspath(__file__)), ) bot_procs[proc.pid] = (proc, room.url) except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to start subprocess: {e}") + raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}") return RedirectResponse(room.url) @@ -107,8 +105,7 @@ def get_status(pid: int): # If the subprocess doesn't exist, return an error if not proc: - raise HTTPException( - status_code=404, detail=f"Bot with process id: {pid} not found") + raise HTTPException(status_code=404, detail=f"Bot with process id: {pid} not found") # Check the status of the subprocess if proc[0].poll() is None: @@ -125,14 +122,10 @@ def get_status(pid: int): default_host = os.getenv("HOST", "0.0.0.0") default_port = int(os.getenv("FAST_API_PORT", "7860")) - parser = argparse.ArgumentParser( - description="Daily Storyteller FastAPI server") - parser.add_argument("--host", type=str, - default=default_host, help="Host address") - parser.add_argument("--port", type=int, - default=default_port, help="Port number") - parser.add_argument("--reload", action="store_true", - help="Reload code on change") 
+ parser = argparse.ArgumentParser(description="Daily Storyteller FastAPI server") + parser.add_argument("--host", type=str, default=default_host, help="Host address") + parser.add_argument("--port", type=int, default=default_port, help="Port number") + parser.add_argument("--reload", action="store_true", help="Reload code on change") config = parser.parse_args() diff --git a/examples/storytelling-chatbot/src/bot.py b/examples/storytelling-chatbot/src/bot.py index 91452dd75..e67af8d7d 100644 --- a/examples/storytelling-chatbot/src/bot.py +++ b/examples/storytelling-chatbot/src/bot.py @@ -9,11 +9,18 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineTask -from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.elevenlabs import ElevenLabsTTSService from pipecat.services.fal import FalImageGenService from pipecat.services.openai import OpenAILLMService -from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportMessageFrame +from pipecat.transports.services.daily import ( + DailyParams, + DailyTransport, + DailyTransportMessageFrame, +) from processors import StoryProcessor, StoryImageProcessor from prompts import LLM_BASE_PROMPT, LLM_INTRO_PROMPT, CUE_USER_TURN @@ -22,6 +29,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -33,7 +41,6 @@ async def main(room_url, token=None): async with aiohttp.ClientSession() as session: - # -------------- Transport --------------- # transport = DailyTransport( @@ -47,17 +54,14 @@ async def main(room_url, token=None): camera_out_height=768, transcription_enabled=True, vad_enabled=True, - ) + ), ) logger.debug("Transport created for room:" + room_url) # -------------- Services --------------- # - llm_service = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o" - ) + llm_service = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") tts_service = ElevenLabsTTSService( api_key=os.getenv("ELEVENLABS_API_KEY"), @@ -65,10 +69,7 @@ async def main(room_url, token=None): ) fal_service_params = FalImageGenService.InputParams( - image_size={ - "width": 768, - "height": 768 - } + image_size={"width": 768, "height": 768} ) fal_service = FalImageGenService( @@ -110,12 +111,12 @@ async def on_first_participant_joined(transport, participant): transport.capture_participant_transcription(participant["id"]) await intro_task.queue_frames( [ - images['book1'], + images["book1"], LLMMessagesFrame([LLM_INTRO_PROMPT]), DailyTransportMessageFrame(CUE_USER_TURN), sounds["listening"], - images['book2'], - StopTaskFrame() + images["book2"], + StopTaskFrame(), ] ) @@ -125,16 +126,18 @@ async def on_first_participant_joined(transport, participant): # The main story pipeline is used to continue the story based on user # input. 
- main_pipeline = Pipeline([ - transport.input(), - user_responses, - llm_service, - story_processor, - image_processor, - tts_service, - transport.output(), - llm_responses - ]) + main_pipeline = Pipeline( + [ + transport.input(), + user_responses, + llm_service, + story_processor, + image_processor, + tts_service, + transport.output(), + llm_responses, + ] + ) main_task = PipelineTask(main_pipeline) @@ -150,6 +153,7 @@ async def on_call_state_updated(transport, state): await runner.run(main_task) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Daily Storyteller Bot") parser.add_argument("-u", type=str, help="Room URL") diff --git a/examples/storytelling-chatbot/src/bot_runner.py b/examples/storytelling-chatbot/src/bot_runner.py index 97e933c25..13ce49834 100644 --- a/examples/storytelling-chatbot/src/bot_runner.py +++ b/examples/storytelling-chatbot/src/bot_runner.py @@ -20,10 +20,15 @@ from fastapi.responses import FileResponse, JSONResponse from pipecat.transports.services.helpers.daily_rest import ( - DailyRESTHelper, DailyRoomObject, DailyRoomProperties, DailyRoomParams) + DailyRESTHelper, + DailyRoomObject, + DailyRoomProperties, + DailyRoomParams, +) from dotenv import load_dotenv + load_dotenv(override=True) # ------------ Fast API Config ------------ # @@ -38,12 +43,13 @@ async def lifespan(app: FastAPI): aiohttp_session = aiohttp.ClientSession() daily_helpers["rest"] = DailyRESTHelper( daily_api_key=os.getenv("DAILY_API_KEY", ""), - daily_api_url=os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'), - aiohttp_session=aiohttp_session + daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), + aiohttp_session=aiohttp_session, ) yield await aiohttp_session.close() + app = FastAPI(lifespan=lifespan) app.add_middleware( @@ -85,55 +91,50 @@ async def start_bot(request: Request) -> JSONResponse: room_url = os.getenv("DAILY_SAMPLE_ROOM_URL", "") if not room_url: - params = DailyRoomParams( - properties=DailyRoomProperties() - ) + params = DailyRoomParams(properties=DailyRoomProperties()) try: room: DailyRoomObject = await daily_helpers["rest"].create_room(params=params) except Exception as e: - raise HTTPException( - status_code=500, - detail=f"Unable to provision room {e}") + raise HTTPException(status_code=500, detail=f"Unable to provision room {e}") else: # Check passed room URL exists, we should assume that it already has a sip set up try: room: DailyRoomObject = await daily_helpers["rest"].get_room_from_url(room_url) except Exception: - raise HTTPException( - status_code=500, detail=f"Room not found: {room_url}") + raise HTTPException(status_code=500, detail=f"Room not found: {room_url}") # Give the agent a token to join the session token = await daily_helpers["rest"].get_token(room.url, MAX_SESSION_TIME) if not room or not token: - raise HTTPException( - status_code=500, detail=f"Failed to get token for room: {room_url}") + raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room_url}") # Launch a new VM, or run as a shell process (not recommended) if os.getenv("RUN_AS_VM", False): try: await virtualize_bot(room.url, token) except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to spawn VM: {e}") + raise HTTPException(status_code=500, detail=f"Failed to spawn VM: {e}") else: try: subprocess.Popen( [f"python3 -m bot -u {room.url} -t {token}"], shell=True, bufsize=1, - cwd=os.path.dirname(os.path.abspath(__file__))) + cwd=os.path.dirname(os.path.abspath(__file__)), + ) except Exception as e: 
- raise HTTPException( - status_code=500, detail=f"Failed to start subprocess: {e}") + raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}") # Grab a token for the user to join with user_token = await daily_helpers["rest"].get_token(room.url, MAX_SESSION_TIME) - return JSONResponse({ - "room_url": room.url, - "token": user_token, - }) + return JSONResponse( + { + "room_url": room.url, + "token": user_token, + } + ) @app.get("/{path_name:path}", response_class=FileResponse) @@ -155,6 +156,7 @@ async def catch_all(path_name: Optional[str] = ""): # ------------ Virtualization ------------ # + async def virtualize_bot(room_url: str, token: str): """ This is an example of how to virtualize the bot using Fly.io @@ -163,20 +165,19 @@ async def virtualize_bot(room_url: str, token: str): FLY_API_HOST = os.getenv("FLY_API_HOST", "https://api.machines.dev/v1") FLY_APP_NAME = os.getenv("FLY_APP_NAME", "storytelling-chatbot") FLY_API_KEY = os.getenv("FLY_API_KEY", "") - FLY_HEADERS = { - 'Authorization': f"Bearer {FLY_API_KEY}", - 'Content-Type': 'application/json' - } + FLY_HEADERS = {"Authorization": f"Bearer {FLY_API_KEY}", "Content-Type": "application/json"} async with aiohttp.ClientSession() as session: # Use the same image as the bot runner - async with session.get(f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", headers=FLY_HEADERS) as r: + async with session.get( + f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", headers=FLY_HEADERS + ) as r: if r.status != 200: text = await r.text() raise Exception(f"Unable to get machine info from Fly: {text}") data = await r.json() - image = data[0]['config']['image'] + image = data[0]["config"]["image"] # Machine configuration cmd = f"python3 src/bot.py -u {room_url} -t {token}" @@ -185,31 +186,28 @@ async def virtualize_bot(room_url: str, token: str): "config": { "image": image, "auto_destroy": True, - "init": { - "cmd": cmd - }, - "restart": { - "policy": "no" - }, - "guest": { - "cpu_kind": "shared", - "cpus": 1, - "memory_mb": 512 - } + "init": {"cmd": cmd}, + "restart": {"policy": "no"}, + "guest": {"cpu_kind": "shared", "cpus": 1, "memory_mb": 512}, }, } # Spawn a new machine instance - async with session.post(f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", headers=FLY_HEADERS, json=worker_props) as r: + async with session.post( + f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", headers=FLY_HEADERS, json=worker_props + ) as r: if r.status != 200: text = await r.text() raise Exception(f"Problem starting a bot worker: {text}") data = await r.json() # Wait for the machine to enter the started state - vm_id = data['id'] + vm_id = data["id"] - async with session.get(f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines/{vm_id}/wait?state=started", headers=FLY_HEADERS) as r: + async with session.get( + f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines/{vm_id}/wait?state=started", + headers=FLY_HEADERS, + ) as r: if r.status != 200: text = await r.text() raise Exception(f"Bot was unable to enter started state: {text}") @@ -221,8 +219,13 @@ async def virtualize_bot(room_url: str, token: str): if __name__ == "__main__": # Check environment variables - required_env_vars = ['OPENAI_API_KEY', 'DAILY_API_KEY', - 'FAL_KEY', 'ELEVENLABS_VOICE_ID', 'ELEVENLABS_API_KEY'] + required_env_vars = [ + "OPENAI_API_KEY", + "DAILY_API_KEY", + "FAL_KEY", + "ELEVENLABS_VOICE_ID", + "ELEVENLABS_API_KEY", + ] for env_var in required_env_vars: if env_var not in os.environ: raise Exception(f"Missing environment variable: {env_var}.") @@ -232,20 +235,11 @@ async def 
virtualize_bot(room_url: str, token: str): default_host = os.getenv("HOST", "0.0.0.0") default_port = int(os.getenv("FAST_API_PORT", "7860")) - parser = argparse.ArgumentParser( - description="Daily Storyteller FastAPI server") - parser.add_argument("--host", type=str, - default=default_host, help="Host address") - parser.add_argument("--port", type=int, - default=default_port, help="Port number") - parser.add_argument("--reload", action="store_true", - help="Reload code on change") + parser = argparse.ArgumentParser(description="Daily Storyteller FastAPI server") + parser.add_argument("--host", type=str, default=default_host, help="Host address") + parser.add_argument("--port", type=int, default=default_port, help="Port number") + parser.add_argument("--reload", action="store_true", help="Reload code on change") config = parser.parse_args() - uvicorn.run( - "bot_runner:app", - host=config.host, - port=config.port, - reload=config.reload - ) + uvicorn.run("bot_runner:app", host=config.host, port=config.port, reload=config.reload) diff --git a/examples/storytelling-chatbot/src/processors.py b/examples/storytelling-chatbot/src/processors.py index a8b2a0980..6aa9ad7ab 100644 --- a/examples/storytelling-chatbot/src/processors.py +++ b/examples/storytelling-chatbot/src/processors.py @@ -6,7 +6,8 @@ Frame, LLMFullResponseEndFrame, TextFrame, - UserStoppedSpeakingFrame) + UserStoppedSpeakingFrame, +) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transports.services.daily import DailyTransportMessageFrame @@ -35,6 +36,7 @@ class StoryPromptFrame(TextFrame): # ------------ Frame Processors ----------- # + class StoryImageProcessor(FrameProcessor): """ Processor for image prompt frames that will be sent to the FAL service. @@ -113,7 +115,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # Extract the image prompt from the text using regex image_prompt = re.search(r"<(.*?)>", self._text).group(1) # Remove the image prompt from the text - self._text = re.sub(r"<.*?>", '', self._text, count=1) + self._text = re.sub(r"<.*?>", "", self._text, count=1) # Process the image prompt frame await self.push_frame(StoryImageFrame(image_prompt)) @@ -124,8 +126,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if re.search(r".*\[[bB]reak\].*", self._text): # Remove the [break] token from the text # so it isn't spoken out loud by the TTS - self._text = re.sub(r'\[[bB]reak\]', '', - self._text, flags=re.IGNORECASE) + self._text = re.sub(r"\[[bB]reak\]", "", self._text, flags=re.IGNORECASE) self._text = self._text.replace("\n", " ") if len(self._text) > 2: # Append the sentence to the story diff --git a/examples/storytelling-chatbot/src/prompts.py b/examples/storytelling-chatbot/src/prompts.py index 551a7c4f2..08abbc93c 100644 --- a/examples/storytelling-chatbot/src/prompts.py +++ b/examples/storytelling-chatbot/src/prompts.py @@ -3,7 +3,7 @@ "content": "You are a creative storyteller who loves to tell whimsical, fantastical stories. \ Your goal is to craft an engaging and fun story. \ Start by asking the user what kind of story they'd like to hear. Don't provide any examples. \ - Keep your response to only a few sentences." + Keep your response to only a few sentences.", } @@ -25,7 +25,7 @@ Responses should use the format: <...> story sentence [break] <...> story sentence [break] ... \ After each response, ask me how I'd like the story to continue and wait for my input. 
\ Please ensure your responses are less than 3-4 sentences long. \ - Please refrain from using any explicit language or content. Do not tell scary stories." + Please refrain from using any explicit language or content. Do not tell scary stories.", } diff --git a/examples/storytelling-chatbot/src/utils/helpers.py b/examples/storytelling-chatbot/src/utils/helpers.py index 743a04c97..36ba3e609 100644 --- a/examples/storytelling-chatbot/src/utils/helpers.py +++ b/examples/storytelling-chatbot/src/utils/helpers.py @@ -17,7 +17,8 @@ def load_images(image_files): # Open the image and convert it to bytes with Image.open(full_path) as img: images[filename] = OutputImageRawFrame( - image=img.tobytes(), size=img.size, format=img.format) + image=img.tobytes(), size=img.size, format=img.format + ) return images @@ -31,8 +32,10 @@ def load_sounds(sound_files): filename = os.path.splitext(os.path.basename(full_path))[0] # Open the sound and convert it to bytes with wave.open(full_path) as audio_file: - sounds[filename] = OutputAudioRawFrame(audio=audio_file.readframes(-1), - sample_rate=audio_file.getframerate(), - num_channels=audio_file.getnchannels()) + sounds[filename] = OutputAudioRawFrame( + audio=audio_file.readframes(-1), + sample_rate=audio_file.getframerate(), + num_channels=audio_file.getnchannels(), + ) return sounds diff --git a/examples/studypal/runner.py b/examples/studypal/runner.py index 068174eec..13c4ff076 100644 --- a/examples/studypal/runner.py +++ b/examples/studypal/runner.py @@ -17,16 +17,13 @@ async def configure(aiohttp_session: aiohttp.ClientSession): async def configure_with_args( - aiohttp_session: aiohttp.ClientSession, - parser: argparse.ArgumentParser | None = None): + aiohttp_session: aiohttp.ClientSession, parser: argparse.ArgumentParser | None = None +): if not parser: parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample") parser.add_argument( - "-u", - "--url", - type=str, - required=False, - help="URL of the Daily room to join") + "-u", "--url", type=str, required=False, help="URL of the Daily room to join" + ) parser.add_argument( "-k", "--apikey", @@ -42,15 +39,19 @@ async def configure_with_args( if not url: raise Exception( - "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.") + "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL." + ) if not key: - raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.") + raise Exception( + "No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers." + ) daily_rest_helper = DailyRESTHelper( daily_api_key=key, daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), - aiohttp_session=aiohttp_session) + aiohttp_session=aiohttp_session, + ) # Create a meeting token for the given room with an expiration 1 hour in # the future. 
diff --git a/examples/studypal/studypal.py b/examples/studypal/studypal.py index 368a9b072..2364c65cf 100644 --- a/examples/studypal/studypal.py +++ b/examples/studypal/studypal.py @@ -13,7 +13,9 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -24,6 +26,7 @@ from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) # Run this script directly from your command line. @@ -45,15 +48,17 @@ def truncate_content(content, model_name): return encoding.decode(truncated_tokens) return content + # Main function to extract content from url async def get_article_content(url: str, aiohttp_session: aiohttp.ClientSession): - if 'arxiv.org' in url: + if "arxiv.org" in url: return await get_arxiv_content(url, aiohttp_session) else: return await get_wikipedia_content(url, aiohttp_session) + # Helper function to extract content from Wikipedia url (this is # technically agnostic to URL type but will work best with Wikipedia # articles) @@ -65,23 +70,24 @@ async def get_wikipedia_content(url: str, aiohttp_session: aiohttp.ClientSession return "Failed to download Wikipedia article." text = await response.text() - soup = BeautifulSoup(text, 'html.parser') + soup = BeautifulSoup(text, "html.parser") - content = soup.find('div', {'class': 'mw-parser-output'}) + content = soup.find("div", {"class": "mw-parser-output"}) if content: return content.get_text() else: return "Failed to extract Wikipedia article content." + # Helper function to extract content from arXiv url async def get_arxiv_content(url: str, aiohttp_session: aiohttp.ClientSession): - if '/abs/' in url: - url = url.replace('/abs/', '/pdf/') - if not url.endswith('.pdf'): - url += '.pdf' + if "/abs/" in url: + url = url.replace("/abs/", "/pdf/") + if not url.endswith(".pdf"): + url += ".pdf" async with aiohttp_session.get(url) as response: if response.status != 200: @@ -95,6 +101,7 @@ async def get_arxiv_content(url: str, aiohttp_session: aiohttp.ClientSession): text += page.extract_text() return text + # This is the main function that handles STT -> LLM -> TTS @@ -116,8 +123,8 @@ async def main(): audio_out_enabled=True, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() - ) + vad_analyzer=SileroVADAnalyzer(), + ), ) tts = CartesiaTTSService( @@ -127,29 +134,33 @@ async def main(): sample_rate=44100, ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o-mini") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o-mini") - messages = [{ - "role": "system", "content": f"""You are an AI study partner. You have been given the following article content: + messages = [ + { + "role": "system", + "content": f"""You are an AI study partner. You have been given the following article content: {article_content} Your task is to help the user understand and learn from this article in 2 sentences. THESE RESPONSES SHOULD BE ONLY MAX 2 SENTENCES. THIS INSTRUCTION IS VERY IMPORTANT. RESPONSES SHOULDN'T BE LONG. 
-""", }, ] +""", + }, + ] tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), - tma_in, - llm, - tts, - transport.output(), - tma_out, - ]) + pipeline = Pipeline( + [ + transport.input(), + tma_in, + llm, + tts, + transport.output(), + tma_out, + ] + ) task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True)) @@ -159,12 +170,15 @@ async def on_first_participant_joined(transport, participant): messages.append( { "role": "system", - "content": "Hello! I'm ready to discuss the article with you. What would you like to learn about?"}) + "content": "Hello! I'm ready to discuss the article with you. What would you like to learn about?", + } + ) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() await runner.run(task) + if __name__ == "__main__": asyncio.run(main()) diff --git a/examples/translation-chatbot/bot.py b/examples/translation-chatbot/bot.py index 1dbe802b9..55302b392 100644 --- a/examples/translation-chatbot/bot.py +++ b/examples/translation-chatbot/bot.py @@ -22,13 +22,15 @@ DailyParams, DailyTranscriptionSettings, DailyTransport, - DailyTransportMessageFrame) + DailyTransportMessageFrame, +) from runner import configure from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -44,7 +46,6 @@ # We need to use a custom service here to yield LLM frames without saving # any context class TranslationProcessor(FrameProcessor): - def __init__(self, language): super().__init__() self._language = language @@ -80,10 +81,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if isinstance(frame, TextFrame): - message = { - "language": self._language, - "text": frame.text - } + message = {"language": self._language, "text": frame.text} await self.push_frame(DailyTransportMessageFrame(message)) await self.push_frame(frame) @@ -100,10 +98,8 @@ async def main(): DailyParams( audio_out_enabled=True, transcription_enabled=True, - transcription_settings=DailyTranscriptionSettings(extra={ - "interim_results": False - }) - ) + transcription_settings=DailyTranscriptionSettings(extra={"interim_results": False}), + ), ) tts = AzureTTSService( @@ -112,26 +108,14 @@ async def main(): voice="es-ES-AlvaroNeural", ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o" - ) + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") sa = SentenceAggregator() tp = TranslationProcessor("Spanish") lfra = LLMFullResponseAggregator() ts = TranslationSubtitles("spanish") - pipeline = Pipeline([ - transport.input(), - sa, - tp, - llm, - lfra, - ts, - tts, - transport.output() - ]) + pipeline = Pipeline([transport.input(), sa, tp, llm, lfra, ts, tts, transport.output()]) task = PipelineTask(pipeline) diff --git a/examples/translation-chatbot/runner.py b/examples/translation-chatbot/runner.py index 5f0e41795..f19fcf211 100644 --- a/examples/translation-chatbot/runner.py +++ b/examples/translation-chatbot/runner.py @@ -15,11 +15,8 @@ async def configure(aiohttp_session: aiohttp.ClientSession): parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample") parser.add_argument( - "-u", - "--url", - type=str, - required=False, - help="URL of the Daily room to join") + "-u", "--url", type=str, required=False, help="URL of the Daily room to join" + ) parser.add_argument( "-k", "--apikey", @@ -35,15 +32,18 @@ 
async def configure(aiohttp_session: aiohttp.ClientSession): if not url: raise Exception( - "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.") + "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL." + ) if not key: - raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.") + raise Exception( + "No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers." + ) daily_rest_helper = DailyRESTHelper( daily_api_key=key, daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), - aiohttp_session=aiohttp_session + aiohttp_session=aiohttp_session, ) # Create a meeting token for the given room with an expiration 1 hour in diff --git a/examples/translation-chatbot/server.py b/examples/translation-chatbot/server.py index d54452d10..5240c254f 100644 --- a/examples/translation-chatbot/server.py +++ b/examples/translation-chatbot/server.py @@ -38,13 +38,14 @@ async def lifespan(app: FastAPI): aiohttp_session = aiohttp.ClientSession() daily_helpers["rest"] = DailyRESTHelper( daily_api_key=os.getenv("DAILY_API_KEY", ""), - daily_api_url=os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'), - aiohttp_session=aiohttp_session + daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), + aiohttp_session=aiohttp_session, ) yield await aiohttp_session.close() cleanup() + app = FastAPI(lifespan=lifespan) app.add_middleware( @@ -65,37 +66,34 @@ async def start_agent(request: Request): if not room.url: raise HTTPException( status_code=500, - detail="Missing 'room' property in request data. Cannot start agent without a target room!") + detail="Missing 'room' property in request data. 
Cannot start agent without a target room!", + ) # Check if there is already an existing process running in this room num_bots_in_room = sum( - 1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None) + 1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None + ) if num_bots_in_room >= MAX_BOTS_PER_ROOM: - raise HTTPException( - status_code=500, detail=f"Max bot limited reach for room: {room.url}") + raise HTTPException(status_code=500, detail=f"Max bot limited reach for room: {room.url}") # Get the token for the room token = await daily_helpers["rest"].get_token(room.url) if not token: - raise HTTPException( - status_code=500, detail=f"Failed to get token for room: {room.url}") + raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room.url}") # Spawn a new agent, and join the user session # Note: this is mostly for demonstration purposes (refer to 'deployment' in README) try: proc = subprocess.Popen( - [ - f"python3 -m bot -u {room.url} -t {token}" - ], + [f"python3 -m bot -u {room.url} -t {token}"], shell=True, bufsize=1, - cwd=os.path.dirname(os.path.abspath(__file__)) + cwd=os.path.dirname(os.path.abspath(__file__)), ) bot_procs[proc.pid] = (proc, room.url) except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to start subprocess: {e}") + raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}") return RedirectResponse(room.url) @@ -107,8 +105,7 @@ def get_status(pid: int): # If the subprocess doesn't exist, return an error if not proc: - raise HTTPException( - status_code=404, detail=f"Bot with process id: {pid} not found") + raise HTTPException(status_code=404, detail=f"Bot with process id: {pid} not found") # Check the status of the subprocess if proc[0].poll() is None: @@ -125,14 +122,10 @@ def get_status(pid: int): default_host = os.getenv("HOST", "0.0.0.0") default_port = int(os.getenv("FAST_API_PORT", "7860")) - parser = argparse.ArgumentParser( - description="Daily Storyteller FastAPI server") - parser.add_argument("--host", type=str, - default=default_host, help="Host address") - parser.add_argument("--port", type=int, - default=default_port, help="Port number") - parser.add_argument("--reload", action="store_true", - help="Reload code on change") + parser = argparse.ArgumentParser(description="Daily Storyteller FastAPI server") + parser.add_argument("--host", type=str, default=default_host, help="Host address") + parser.add_argument("--port", type=int, default=default_port, help="Port number") + parser.add_argument("--reload", action="store_true", help="Reload code on change") config = parser.parse_args() diff --git a/examples/twilio-chatbot/bot.py b/examples/twilio-chatbot/bot.py index 5b83139f9..de9e395c4 100644 --- a/examples/twilio-chatbot/bot.py +++ b/examples/twilio-chatbot/bot.py @@ -7,18 +7,22 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( LLMAssistantResponseAggregator, - LLMUserResponseAggregator + LLMUserResponseAggregator, ) from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.services.deepgram import DeepgramSTTService -from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketTransport, FastAPIWebsocketParams +from pipecat.transports.network.fastapi_websocket import ( + FastAPIWebsocketTransport, + FastAPIWebsocketParams, +) from pipecat.vad.silero import SileroVADAnalyzer from 
pipecat.serializers.twilio import TwilioFrameSerializer from loguru import logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -34,15 +38,13 @@ async def run_bot(websocket_client, stream_sid): vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), vad_audio_passthrough=True, - serializer=TwilioFrameSerializer(stream_sid) - ) + serializer=TwilioFrameSerializer(stream_sid), + ), ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") - stt = DeepgramSTTService(api_key=os.getenv('DEEPGRAM_API_KEY')) + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), @@ -59,23 +61,24 @@ async def run_bot(websocket_client, stream_sid): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Websocket input from client - stt, # Speech-To-Text - tma_in, # User responses - llm, # LLM - tts, # Text-To-Speech - transport.output(), # Websocket output to client - tma_out # LLM responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Websocket input from client + stt, # Speech-To-Text + tma_in, # User responses + llm, # LLM + tts, # Text-To-Speech + transport.output(), # Websocket output to client + tma_out, # LLM responses + ] + ) task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True)) @transport.event_handler("on_client_connected") async def on_client_connected(transport, client): # Kick off the conversation. - messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) @transport.event_handler("on_client_disconnected") diff --git a/examples/twilio-chatbot/server.py b/examples/twilio-chatbot/server.py index f64e7f309..9656875ec 100644 --- a/examples/twilio-chatbot/server.py +++ b/examples/twilio-chatbot/server.py @@ -19,7 +19,7 @@ ) -@app.post('/start_call') +@app.post("/start_call") async def start_call(): print("POST TwiML") return HTMLResponse(content=open("templates/streams.xml").read(), media_type="application/xml") @@ -32,7 +32,7 @@ async def websocket_endpoint(websocket: WebSocket): await start_data.__anext__() call_data = json.loads(await start_data.__anext__()) print(call_data, flush=True) - stream_sid = call_data['start']['streamSid'] + stream_sid = call_data["start"]["streamSid"] print("WebSocket connection accepted") await run_bot(websocket, stream_sid) diff --git a/examples/websocket-server/bot.py b/examples/websocket-server/bot.py index 61d285fa8..e223d4e3f 100644 --- a/examples/websocket-server/bot.py +++ b/examples/websocket-server/bot.py @@ -14,17 +14,21 @@ from pipecat.pipeline.task import PipelineTask from pipecat.processors.aggregators.llm_response import ( LLMAssistantResponseAggregator, - LLMUserResponseAggregator + LLMUserResponseAggregator, ) from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.deepgram import DeepgramSTTService from pipecat.services.openai import OpenAILLMService -from pipecat.transports.network.websocket_server import WebsocketServerParams, WebsocketServerTransport +from pipecat.transports.network.websocket_server import ( + WebsocketServerParams, + WebsocketServerTransport, +) from pipecat.vad.silero import SileroVADAnalyzer from loguru import 
logger from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) @@ -38,13 +42,11 @@ async def main(): add_wav_header=True, vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), - vad_audio_passthrough=True + vad_audio_passthrough=True, ) ) - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) @@ -63,28 +65,30 @@ async def main(): tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline([ - transport.input(), # Websocket input from client - stt, # Speech-To-Text - tma_in, # User responses - llm, # LLM - tts, # Text-To-Speech - transport.output(), # Websocket output to client - tma_out # LLM responses - ]) + pipeline = Pipeline( + [ + transport.input(), # Websocket input from client + stt, # Speech-To-Text + tma_in, # User responses + llm, # LLM + tts, # Text-To-Speech + transport.output(), # Websocket output to client + tma_out, # LLM responses + ] + ) task = PipelineTask(pipeline) @transport.event_handler("on_client_connected") async def on_client_connected(transport, client): # Kick off the conversation. - messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) runner = PipelineRunner() await runner.run(task) + if __name__ == "__main__": asyncio.run(main()) diff --git a/src/pipecat/clocks/base_clock.py b/src/pipecat/clocks/base_clock.py index aa7b7b806..79e17d5ba 100644 --- a/src/pipecat/clocks/base_clock.py +++ b/src/pipecat/clocks/base_clock.py @@ -8,7 +8,6 @@ class BaseClock(ABC): - @abstractmethod def get_time(self) -> int: pass diff --git a/src/pipecat/clocks/system_clock.py b/src/pipecat/clocks/system_clock.py index 20319cff6..d919b6acd 100644 --- a/src/pipecat/clocks/system_clock.py +++ b/src/pipecat/clocks/system_clock.py @@ -10,7 +10,6 @@ class SystemClock(BaseClock): - def __init__(self): self._time = 0 diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 3211bd266..e4495098b 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -43,6 +43,7 @@ class DataFrame(Frame): @dataclass class AudioRawFrame(DataFrame): """A chunk of audio.""" + audio: bytes sample_rate: int num_channels: int @@ -58,9 +59,8 @@ def __str__(self): @dataclass class InputAudioRawFrame(AudioRawFrame): - """A chunk of audio usually coming from an input transport. + """A chunk of audio usually coming from an input transport.""" - """ pass @@ -70,14 +70,14 @@ class OutputAudioRawFrame(AudioRawFrame): transport's microphone has been enabled. """ + pass @dataclass class TTSAudioRawFrame(OutputAudioRawFrame): - """A chunk of output audio generated by a TTS service. + """A chunk of output audio generated by a TTS service.""" - """ pass @@ -87,6 +87,7 @@ class ImageRawFrame(DataFrame): enabled. """ + image: bytes size: Tuple[int, int] format: str | None @@ -112,6 +113,7 @@ class UserImageRawFrame(InputImageRawFrame): transport's camera is enabled. """ + user_id: str def __str__(self): @@ -125,11 +127,14 @@ class VisionImageRawFrame(InputImageRawFrame): shown by the transport if the transport's camera is enabled. 
""" + text: str | None def __str__(self): pts = format_pts(self.pts) - return f"{self.name}(pts: {pts}, text: {self.text}, size: {self.size}, format: {self.format})" + return ( + f"{self.name}(pts: {pts}, text: {self.text}, size: {self.size}, format: {self.format})" + ) @dataclass @@ -138,6 +143,7 @@ class URLImageRawFrame(OutputImageRawFrame): transport's camera is enabled. """ + url: str | None def __str__(self): @@ -152,6 +158,7 @@ class SpriteFrame(Frame): `camera_out_framerate` constructor parameter. """ + images: List[ImageRawFrame] def __str__(self): @@ -165,6 +172,7 @@ class TextFrame(DataFrame): be used to send text through pipelines. """ + text: str def __str__(self): @@ -178,6 +186,7 @@ class TranscriptionFrame(TextFrame): transport's receive queue when a participant speaks. """ + user_id: str timestamp: str language: Language | None = None @@ -190,6 +199,7 @@ def __str__(self): class InterimTranscriptionFrame(TextFrame): """A text frame with interim transcription-specific data. Will be placed in the transport's receive queue when a participant speaks.""" + user_id: str timestamp: str language: Language | None = None @@ -207,6 +217,7 @@ class LLMMessagesFrame(DataFrame): processors. """ + messages: List[dict] @@ -216,6 +227,7 @@ class LLMMessagesAppendFrame(DataFrame): current context. """ + messages: List[dict] @@ -226,6 +238,7 @@ class LLMMessagesUpdateFrame(DataFrame): LLMMessagesFrame. """ + messages: List[dict] @@ -235,13 +248,14 @@ class LLMSetToolsFrame(DataFrame): The specific format depends on the LLM being used, but it should typically contain JSON Schema objects. """ + tools: List[dict] @dataclass class LLMEnablePromptCachingFrame(DataFrame): - """A frame to enable/disable prompt caching in certain LLMs. - """ + """A frame to enable/disable prompt caching in certain LLMs.""" + enable: bool @@ -251,6 +265,7 @@ class TTSSpeakFrame(DataFrame): pipeline (if any). """ + text: str @@ -262,6 +277,7 @@ class TransportMessageFrame(DataFrame): def __str__(self): return f"{self.name}(message: {self.message})" + # # App frames. Application user-defined frames. # @@ -271,6 +287,7 @@ def __str__(self): class AppFrame(Frame): pass + # # System frames # @@ -284,6 +301,7 @@ class SystemFrame(Frame): @dataclass class StartFrame(SystemFrame): """This is the first frame that should be pushed down a pipeline.""" + clock: BaseClock allow_interruptions: bool = False enable_metrics: bool = False @@ -294,6 +312,7 @@ class StartFrame(SystemFrame): @dataclass class CancelFrame(SystemFrame): """Indicates that a pipeline needs to stop right away.""" + pass @@ -304,6 +323,7 @@ class ErrorFrame(SystemFrame): bot should exit. """ + error: str fatal: bool = False @@ -317,6 +337,7 @@ class FatalErrorFrame(ErrorFrame): that the bot should exit. """ + fatal: bool = field(default=True, init=False) @@ -327,6 +348,7 @@ class StopTaskFrame(SystemFrame): the pipeline task. """ + pass @@ -338,6 +360,7 @@ class StartInterruptionFrame(SystemFrame): guaranteed). """ + pass @@ -349,6 +372,7 @@ class StopInterruptionFrame(SystemFrame): guaranteed). """ + pass @@ -359,13 +383,14 @@ class BotInterruptionFrame(SystemFrame): UserStartedSpeakingFrame and UserStoppedSpeakingFrame won't be generated. """ + pass @dataclass class MetricsFrame(SystemFrame): - """Emitted by processor that can compute metrics like latencies. - """ + """Emitted by processor that can compute metrics like latencies.""" + data: List[MetricsData] @@ -388,6 +413,7 @@ class EndFrame(ControlFrame): was sent (unline system frames). 
""" + pass @@ -395,12 +421,14 @@ class EndFrame(ControlFrame): class LLMFullResponseStartFrame(ControlFrame): """Used to indicate the beginning of an LLM response. Following by one or more TextFrame and a final LLMFullResponseEndFrame.""" + pass @dataclass class LLMFullResponseEndFrame(ControlFrame): """Indicates the end of an LLM response.""" + pass @@ -412,28 +440,28 @@ class UserStartedSpeakingFrame(ControlFrame): with a TranscriptionFrame) """ + pass @dataclass class UserStoppedSpeakingFrame(ControlFrame): """Emitted by the VAD to indicate that a user stopped speaking.""" + pass @dataclass class BotStartedSpeakingFrame(ControlFrame): - """Emitted upstream by transport outputs to indicate the bot started speaking. + """Emitted upstream by transport outputs to indicate the bot started speaking.""" - """ pass @dataclass class BotStoppedSpeakingFrame(ControlFrame): - """Emitted upstream by transport outputs to indicate the bot stopped speaking. + """Emitted upstream by transport outputs to indicate the bot stopped speaking.""" - """ pass @@ -445,6 +473,7 @@ class BotSpeakingFrame(ControlFrame): since the user might be listening. """ + pass @@ -457,18 +486,21 @@ class TTSStartedFrame(ControlFrame): needing to control this in the TTS service. """ + pass @dataclass class TTSStoppedFrame(ControlFrame): """Indicates the end of a TTS response.""" + pass @dataclass class UserImageRequestFrame(ControlFrame): """A frame user to request an image from the given user.""" + user_id: str context: Optional[Any] = None @@ -478,29 +510,29 @@ def __str__(self): @dataclass class LLMModelUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM model. - """ + """A control frame containing a request to update to a new LLM model.""" + model: str @dataclass class LLMTemperatureUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM temperature. - """ + """A control frame containing a request to update to a new LLM temperature.""" + temperature: float @dataclass class LLMTopKUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM top_k. - """ + """A control frame containing a request to update to a new LLM top_k.""" + top_k: int @dataclass class LLMTopPUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM top_p. - """ + """A control frame containing a request to update to a new LLM top_p.""" + top_p: float @@ -510,6 +542,7 @@ class LLMFrequencyPenaltyUpdateFrame(ControlFrame): penalty. """ + frequency_penalty: float @@ -519,41 +552,42 @@ class LLMPresencePenaltyUpdateFrame(ControlFrame): penalty. """ + presence_penalty: float @dataclass class LLMMaxTokensUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM max tokens. - """ + """A control frame containing a request to update to a new LLM max tokens.""" + max_tokens: int @dataclass class LLMSeedUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM seed. - """ + """A control frame containing a request to update to a new LLM seed.""" + seed: int @dataclass class LLMExtraUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM extra params. - """ + """A control frame containing a request to update to a new LLM extra params.""" + extra: dict @dataclass class TTSModelUpdateFrame(ControlFrame): - """A control frame containing a request to update the TTS model. 
- """ + """A control frame containing a request to update the TTS model.""" + model: str @dataclass class TTSVoiceUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new TTS voice. - """ + """A control frame containing a request to update to a new TTS voice.""" + voice: str @@ -563,6 +597,7 @@ class TTSLanguageUpdateFrame(ControlFrame): optional voice. """ + language: Language @@ -572,20 +607,21 @@ class STTModelUpdateFrame(ControlFrame): language. """ + model: str @dataclass class STTLanguageUpdateFrame(ControlFrame): - """A control frame containing a request to update to STT language. - """ + """A control frame containing a request to update to STT language.""" + language: Language @dataclass class FunctionCallInProgressFrame(SystemFrame): - """A frame signaling that a function call is in progress. - """ + """A frame signaling that a function call is in progress.""" + function_name: str tool_call_id: str arguments: str @@ -593,8 +629,8 @@ class FunctionCallInProgressFrame(SystemFrame): @dataclass class FunctionCallResultFrame(DataFrame): - """A frame containing the result of an LLM function (tool) call. - """ + """A frame containing the result of an LLM function (tool) call.""" + function_name: str tool_call_id: str arguments: str @@ -606,4 +642,5 @@ class VADParamsUpdateFrame(ControlFrame): """A control frame containing a request to update VAD params. Intended to be pushed upstream from RTVI processor. """ + params: VADParams diff --git a/src/pipecat/pipeline/base_pipeline.py b/src/pipecat/pipeline/base_pipeline.py index 54f6499a9..393914684 100644 --- a/src/pipecat/pipeline/base_pipeline.py +++ b/src/pipecat/pipeline/base_pipeline.py @@ -12,7 +12,6 @@ class BasePipeline(FrameProcessor): - def __init__(self): super().__init__() diff --git a/src/pipecat/pipeline/parallel_pipeline.py b/src/pipecat/pipeline/parallel_pipeline.py index d045c3493..1c2eeabde 100644 --- a/src/pipecat/pipeline/parallel_pipeline.py +++ b/src/pipecat/pipeline/parallel_pipeline.py @@ -18,7 +18,6 @@ class Source(FrameProcessor): - def __init__(self, upstream_queue: asyncio.Queue): super().__init__() self._up_queue = upstream_queue @@ -34,7 +33,6 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): class Sink(FrameProcessor): - def __init__(self, downstream_queue: asyncio.Queue): super().__init__() self._down_queue = downstream_queue diff --git a/src/pipecat/pipeline/pipeline.py b/src/pipecat/pipeline/pipeline.py index 6805cfad0..a1715570e 100644 --- a/src/pipecat/pipeline/pipeline.py +++ b/src/pipecat/pipeline/pipeline.py @@ -12,7 +12,6 @@ class PipelineSource(FrameProcessor): - def __init__(self, upstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]): super().__init__() self._upstream_push_frame = upstream_push_frame @@ -28,7 +27,6 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): class PipelineSink(FrameProcessor): - def __init__(self, downstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]): super().__init__() self._downstream_push_frame = downstream_push_frame @@ -44,7 +42,6 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): class Pipeline(BasePipeline): - def __init__(self, processors: List[FrameProcessor]): super().__init__() diff --git a/src/pipecat/pipeline/runner.py b/src/pipecat/pipeline/runner.py index 3237c3904..57b818487 100644 --- a/src/pipecat/pipeline/runner.py +++ b/src/pipecat/pipeline/runner.py @@ -14,7 +14,6 @@ class PipelineRunner: - def __init__(self, *, name: str | 
None = None, handle_sigint: bool = True): self.id: int = obj_id() self.name: str = name or f"{self.__class__.__name__}#{obj_count(self)}" @@ -42,12 +41,10 @@ async def cancel(self): def _setup_sigint(self): loop = asyncio.get_running_loop() loop.add_signal_handler( - signal.SIGINT, - lambda *args: asyncio.create_task(self._sig_handler()) + signal.SIGINT, lambda *args: asyncio.create_task(self._sig_handler()) ) loop.add_signal_handler( - signal.SIGTERM, - lambda *args: asyncio.create_task(self._sig_handler()) + signal.SIGTERM, lambda *args: asyncio.create_task(self._sig_handler()) ) async def _sig_handler(self): diff --git a/src/pipecat/pipeline/sync_parallel_pipeline.py b/src/pipecat/pipeline/sync_parallel_pipeline.py index d922134f4..854cea89d 100644 --- a/src/pipecat/pipeline/sync_parallel_pipeline.py +++ b/src/pipecat/pipeline/sync_parallel_pipeline.py @@ -18,7 +18,6 @@ class Source(FrameProcessor): - def __init__(self, upstream_queue: asyncio.Queue): super().__init__() self._up_queue = upstream_queue @@ -34,7 +33,6 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): class Sink(FrameProcessor): - def __init__(self, downstream_queue: asyncio.Queue): super().__init__() self._down_queue = downstream_queue diff --git a/src/pipecat/pipeline/task.py b/src/pipecat/pipeline/task.py index 26e6e9f4f..2b46c47c2 100644 --- a/src/pipecat/pipeline/task.py +++ b/src/pipecat/pipeline/task.py @@ -19,7 +19,8 @@ Frame, MetricsFrame, StartFrame, - StopTaskFrame) + StopTaskFrame, +) from pipecat.metrics.metrics import TTFBMetricsData, ProcessingMetricsData from pipecat.pipeline.base_pipeline import BasePipeline from pipecat.processors.frame_processor import FrameDirection, FrameProcessor @@ -37,7 +38,6 @@ class PipelineParams(BaseModel): class Source(FrameProcessor): - def __init__(self, up_queue: asyncio.Queue): super().__init__() self._up_queue = up_queue @@ -62,12 +62,12 @@ async def _handle_upstream_frame(self, frame: Frame): class PipelineTask: - def __init__( - self, - pipeline: BasePipeline, - params: PipelineParams = PipelineParams(), - clock: BaseClock = SystemClock()): + self, + pipeline: BasePipeline, + params: PipelineParams = PipelineParams(), + clock: BaseClock = SystemClock(), + ): self.id: int = obj_id() self.name: str = f"{self.__class__.__name__}#{obj_count(self)}" @@ -133,12 +133,14 @@ async def _process_down_queue(self): enable_metrics=self._params.enable_metrics, enable_usage_metrics=self._params.enable_metrics, report_only_initial_ttfb=self._params.report_only_initial_ttfb, - clock=self._clock + clock=self._clock, ) await self._source.process_frame(start_frame, FrameDirection.DOWNSTREAM) if self._params.enable_metrics and self._params.send_initial_empty_metrics: - await self._source.process_frame(self._initial_metrics_frame(), FrameDirection.DOWNSTREAM) + await self._source.process_frame( + self._initial_metrics_frame(), FrameDirection.DOWNSTREAM + ) running = True should_cleanup = True diff --git a/src/pipecat/pipeline/to_be_updated/merge_pipeline.py b/src/pipecat/pipeline/to_be_updated/merge_pipeline.py index f6f9a5ebd..6142a55ea 100644 --- a/src/pipecat/pipeline/to_be_updated/merge_pipeline.py +++ b/src/pipecat/pipeline/to_be_updated/merge_pipeline.py @@ -15,9 +15,7 @@ async def run_pipeline(self): for idx, pipeline in enumerate(self.pipelines): while True: frame = await pipeline.sink.get() - if isinstance( - frame, EndFrame) or isinstance( - frame, EndPipeFrame): + if isinstance(frame, EndFrame) or isinstance(frame, EndPipeFrame): break await 
self.sink.put(frame) diff --git a/src/pipecat/processors/aggregators/gated.py b/src/pipecat/processors/aggregators/gated.py index 7d784b14c..c39a35c82 100644 --- a/src/pipecat/processors/aggregators/gated.py +++ b/src/pipecat/processors/aggregators/gated.py @@ -41,8 +41,13 @@ class GatedAggregator(FrameProcessor): Goodbye. """ - def __init__(self, gate_open_fn, gate_close_fn, start_open, - direction: FrameDirection = FrameDirection.DOWNSTREAM): + def __init__( + self, + gate_open_fn, + gate_close_fn, + start_open, + direction: FrameDirection = FrameDirection.DOWNSTREAM, + ): super().__init__() self._gate_open_fn = gate_open_fn self._gate_close_fn = gate_close_fn @@ -75,7 +80,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if self._gate_open: await self.push_frame(frame, direction) - for (f, d) in self._accumulator: + for f, d in self._accumulator: await self.push_frame(f, d) self._accumulator = [] else: diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index 13920c59b..036f5fe47 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -6,7 +6,10 @@ from typing import List, Type -from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame, OpenAILLMContext +from pipecat.processors.aggregators.openai_llm_context import ( + OpenAILLMContextFrame, + OpenAILLMContext, +) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.frames.frames import ( @@ -22,11 +25,11 @@ TranscriptionFrame, TextFrame, UserStartedSpeakingFrame, - UserStoppedSpeakingFrame) + UserStoppedSpeakingFrame, +) class LLMResponseAggregator(FrameProcessor): - def __init__( self, *, @@ -36,7 +39,7 @@ def __init__( end_frame, accumulator_frame: Type[TextFrame], interim_accumulator_frame: Type[TextFrame] | None = None, - handle_interruptions: bool = False + handle_interruptions: bool = False, ): super().__init__() @@ -175,7 +178,7 @@ def __init__(self, messages: List[dict] = []): start_frame=LLMFullResponseStartFrame, end_frame=LLMFullResponseEndFrame, accumulator_frame=TextFrame, - handle_interruptions=True + handle_interruptions=True, ) @@ -187,7 +190,7 @@ def __init__(self, messages: List[dict] = []): start_frame=UserStartedSpeakingFrame, end_frame=UserStoppedSpeakingFrame, accumulator_frame=TranscriptionFrame, - interim_accumulator_frame=InterimTranscriptionFrame + interim_accumulator_frame=InterimTranscriptionFrame, ) @@ -295,7 +298,7 @@ def __init__(self, context: OpenAILLMContext): start_frame=LLMFullResponseStartFrame, end_frame=LLMFullResponseEndFrame, accumulator_frame=TextFrame, - handle_interruptions=True + handle_interruptions=True, ) @@ -308,5 +311,5 @@ def __init__(self, context: OpenAILLMContext): start_frame=UserStartedSpeakingFrame, end_frame=UserStoppedSpeakingFrame, accumulator_frame=TranscriptionFrame, - interim_accumulator_frame=InterimTranscriptionFrame + interim_accumulator_frame=InterimTranscriptionFrame, ) diff --git a/src/pipecat/processors/aggregators/openai_llm_context.py b/src/pipecat/processors/aggregators/openai_llm_context.py index 3d1acf32e..83ec3e57f 100644 --- a/src/pipecat/processors/aggregators/openai_llm_context.py +++ b/src/pipecat/processors/aggregators/openai_llm_context.py @@ -17,7 +17,8 @@ Frame, VisionImageRawFrame, FunctionCallInProgressFrame, - FunctionCallResultFrame) + FunctionCallResultFrame, +) from pipecat.processors.frame_processor import FrameProcessor from 
loguru import logger @@ -28,12 +29,13 @@ from openai.types.chat import ( ChatCompletionToolParam, ChatCompletionToolChoiceOptionParam, - ChatCompletionMessageParam + ChatCompletionMessageParam, ) except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use OpenAI, you need to `pip install pipecat-ai[openai]`. Also, set `OPENAI_API_KEY` environment variable.") + "In order to use OpenAI, you need to `pip install pipecat-ai[openai]`. Also, set `OPENAI_API_KEY` environment variable." + ) raise Exception(f"Missing module: {e}") # JSON custom encoder to handle bytes arrays so that we can log contexts @@ -44,20 +46,18 @@ class CustomEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, io.BytesIO): # Convert the first 8 bytes to an ASCII hex string - return (f"{obj.getbuffer()[0:8].hex()}...") + return f"{obj.getbuffer()[0:8].hex()}..." return super().default(obj) class OpenAILLMContext: - def __init__( self, messages: List[ChatCompletionMessageParam] | None = None, tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN, - tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN + tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN, ): - self._messages: List[ChatCompletionMessageParam] = messages if messages else [ - ] + self._messages: List[ChatCompletionMessageParam] = messages if messages else [] self._tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = tool_choice self._tools: List[ChatCompletionToolParam] | NotGiven = tools @@ -81,19 +81,10 @@ def from_image_frame(frame: VisionImageRawFrame) -> "OpenAILLMContext": """ context = OpenAILLMContext() buffer = io.BytesIO() - Image.frombytes( - frame.format, - frame.size, - frame.image - ).save( - buffer, - format="JPEG") - context.add_message({ - "content": frame.text, - "role": "user", - "data": buffer, - "mime_type": "image/jpeg" - }) + Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG") + context.add_message( + {"content": frame.text, "role": "user", "data": buffer, "mime_type": "image/jpeg"} + ) return context @property @@ -123,9 +114,7 @@ def get_messages(self) -> List[ChatCompletionMessageParam]: def get_messages_json(self) -> str: return json.dumps(self._messages, cls=CustomEncoder) - def set_tool_choice( - self, tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven - ): + def set_tool_choice(self, tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven): self._tool_choice = tool_choice def set_tools(self, tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN): @@ -133,37 +122,40 @@ def set_tools(self, tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN) tools = NOT_GIVEN self._tools = tools - async def call_function(self, - f: Callable[[str, - str, - Any, - FrameProcessor, - 'OpenAILLMContext', - Callable[[Any], - Awaitable[None]]], - Awaitable[None]], - *, - function_name: str, - tool_call_id: str, - arguments: str, - llm: FrameProcessor) -> None: - + async def call_function( + self, + f: Callable[ + [str, str, Any, FrameProcessor, "OpenAILLMContext", Callable[[Any], Awaitable[None]]], + Awaitable[None], + ], + *, + function_name: str, + tool_call_id: str, + arguments: str, + llm: FrameProcessor, + ) -> None: # Push a SystemFrame downstream. This frame will let our assistant context aggregator # know that we are in the middle of a function call. Some contexts/aggregators may # not need this. But some definitely do (Anthropic, for example). 
- await llm.push_frame(FunctionCallInProgressFrame( - function_name=function_name, - tool_call_id=tool_call_id, - arguments=arguments, - )) - - # Define a callback function that pushes a FunctionCallResultFrame downstream. - async def function_call_result_callback(result): - await llm.push_frame(FunctionCallResultFrame( + await llm.push_frame( + FunctionCallInProgressFrame( function_name=function_name, tool_call_id=tool_call_id, arguments=arguments, - result=result)) + ) + ) + + # Define a callback function that pushes a FunctionCallResultFrame downstream. + async def function_call_result_callback(result): + await llm.push_frame( + FunctionCallResultFrame( + function_name=function_name, + tool_call_id=tool_call_id, + arguments=arguments, + result=result, + ) + ) + await f(function_name, tool_call_id, arguments, llm, self, function_call_result_callback) @@ -174,4 +166,5 @@ class OpenAILLMContextFrame(Frame): OpenAIContextAggregator frame processor. """ + context: OpenAILLMContext diff --git a/src/pipecat/processors/aggregators/user_response.py b/src/pipecat/processors/aggregators/user_response.py index 002b6dd95..903019059 100644 --- a/src/pipecat/processors/aggregators/user_response.py +++ b/src/pipecat/processors/aggregators/user_response.py @@ -12,7 +12,8 @@ TextFrame, TranscriptionFrame, UserStartedSpeakingFrame, - UserStoppedSpeakingFrame) + UserStoppedSpeakingFrame, +) class ResponseAggregator(FrameProcessor): @@ -49,7 +50,7 @@ def __init__( start_frame, end_frame, accumulator_frame: TextFrame, - interim_accumulator_frame: TextFrame | None = None + interim_accumulator_frame: TextFrame | None = None, ): super().__init__() diff --git a/src/pipecat/processors/aggregators/vision_image_frame.py b/src/pipecat/processors/aggregators/vision_image_frame.py index 97f6b5ec8..d07337f06 100644 --- a/src/pipecat/processors/aggregators/vision_image_frame.py +++ b/src/pipecat/processors/aggregators/vision_image_frame.py @@ -4,12 +4,7 @@ # SPDX-License-Identifier: BSD 2-Clause License # -from pipecat.frames.frames import ( - Frame, - InputImageRawFrame, - TextFrame, - VisionImageRawFrame -) +from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame, VisionImageRawFrame from pipecat.processors.frame_processor import FrameDirection, FrameProcessor @@ -46,7 +41,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): text=self._describe_text, image=frame.image, size=frame.size, - format=frame.format) + format=frame.format, + ) await self.push_frame(frame) self._describe_text = None else: diff --git a/src/pipecat/processors/filters/frame_filter.py b/src/pipecat/processors/filters/frame_filter.py index 9f2eb98c4..45927a604 100644 --- a/src/pipecat/processors/filters/frame_filter.py +++ b/src/pipecat/processors/filters/frame_filter.py @@ -11,7 +11,6 @@ class FrameFilter(FrameProcessor): - def __init__(self, types: List[type]): super().__init__() self._types = types @@ -25,9 +24,11 @@ def _should_passthrough_frame(self, frame): if isinstance(frame, t): return True - return (isinstance(frame, AppFrame) - or isinstance(frame, ControlFrame) - or isinstance(frame, SystemFrame)) + return ( + isinstance(frame, AppFrame) + or isinstance(frame, ControlFrame) + or isinstance(frame, SystemFrame) + ) async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) diff --git a/src/pipecat/processors/filters/function_filter.py b/src/pipecat/processors/filters/function_filter.py index 421fcc80c..ba1f706a7 100644 --- 
a/src/pipecat/processors/filters/function_filter.py +++ b/src/pipecat/processors/filters/function_filter.py @@ -11,7 +11,6 @@ class FunctionFilter(FrameProcessor): - def __init__(self, filter: Callable[[Frame], Awaitable[bool]]): super().__init__() self._filter = filter diff --git a/src/pipecat/processors/filters/wake_check_filter.py b/src/pipecat/processors/filters/wake_check_filter.py index c3e0942ea..f1a7afbef 100644 --- a/src/pipecat/processors/filters/wake_check_filter.py +++ b/src/pipecat/processors/filters/wake_check_filter.py @@ -21,6 +21,7 @@ class WakeCheckFilter(FrameProcessor): after a wake phrase has been detected. It also has a keepalive timeout to allow for a brief period of continued conversation after a wake phrase has been detected. """ + class WakeState(Enum): IDLE = 1 AWAKE = 2 @@ -38,8 +39,9 @@ def __init__(self, wake_phrases: list[str], keepalive_timeout: float = 3): self._keepalive_timeout = keepalive_timeout self._wake_patterns = [] for name in wake_phrases: - pattern = re.compile(r'\b' + r'\s*'.join(re.escape(word) - for word in name.split()) + r'\b', re.IGNORECASE) + pattern = re.compile( + r"\b" + r"\s*".join(re.escape(word) for word in name.split()) + r"\b", re.IGNORECASE + ) self._wake_patterns.append(pattern) async def process_frame(self, frame: Frame, direction: FrameDirection): @@ -57,7 +59,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if p.state == WakeCheckFilter.WakeState.AWAKE: if time.time() - p.wake_timer < self._keepalive_timeout: logger.debug( - f"Wake phrase keepalive timeout has not expired. Pushing {frame}") + f"Wake phrase keepalive timeout has not expired. Pushing {frame}" + ) p.wake_timer = time.time() await self.push_frame(frame) return @@ -73,7 +76,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # and modify the frame in place. 
p.state = WakeCheckFilter.WakeState.AWAKE p.wake_timer = time.time() - frame.text = p.accumulator[match.start():] + frame.text = p.accumulator[match.start() :] p.accumulator = "" await self.push_frame(frame) else: diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py index 69c957c97..9dd92599e 100644 --- a/src/pipecat/processors/frame_processor.py +++ b/src/pipecat/processors/frame_processor.py @@ -17,10 +17,9 @@ StartFrame, StartInterruptionFrame, StopInterruptionFrame, - SystemFrame) -from pipecat.metrics.metrics import ( - LLMTokenUsage, - MetricsData) + SystemFrame, +) +from pipecat.metrics.metrics import LLMTokenUsage, MetricsData from pipecat.processors.metrics.frame_processor_metrics import FrameProcessorMetrics from pipecat.utils.utils import obj_count, obj_id @@ -33,15 +32,15 @@ class FrameDirection(Enum): class FrameProcessor: - def __init__( - self, - *, - name: str | None = None, - metrics: FrameProcessorMetrics | None = None, - sync: bool = True, - loop: asyncio.AbstractEventLoop | None = None, - **kwargs): + self, + *, + name: str | None = None, + metrics: FrameProcessorMetrics | None = None, + sync: bool = True, + loop: asyncio.AbstractEventLoop | None = None, + **kwargs, + ): self.id: int = obj_id() self.name = name or f"{self.__class__.__name__}#{obj_count(self)}" self._parent: "FrameProcessor" | None = None @@ -202,8 +201,7 @@ async def __internal_push_frame(self, frame: Frame, direction: FrameDirection): def __create_push_task(self): self.__push_queue = asyncio.Queue() - self.__push_frame_task = self.get_event_loop( - ).create_task(self.__push_frame_task_handler()) + self.__push_frame_task = self.get_event_loop().create_task(self.__push_frame_task_handler()) async def __push_frame_task_handler(self): running = True diff --git a/src/pipecat/processors/frameworks/langchain.py b/src/pipecat/processors/frameworks/langchain.py index b6a24cfd2..c49dbaa76 100644 --- a/src/pipecat/processors/frameworks/langchain.py +++ b/src/pipecat/processors/frameworks/langchain.py @@ -11,7 +11,8 @@ LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, - TextFrame) + TextFrame, +) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from loguru import logger @@ -20,9 +21,7 @@ from langchain_core.messages import AIMessageChunk from langchain_core.runnables import Runnable except ModuleNotFoundError as e: - logger.exception( - "In order to use Langchain, you need to `pip install pipecat-ai[langchain]`. " - ) + logger.exception("In order to use Langchain, you need to `pip install pipecat-ai[langchain]`. 
") raise Exception(f"Missing module: {e}") diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 66adb9ad0..0450102a7 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -24,7 +24,8 @@ TransportMessageFrame, UserStartedSpeakingFrame, FunctionCallResultFrame, - UserStoppedSpeakingFrame) + UserStoppedSpeakingFrame, +) from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.processors.frame_processor import FrameDirection, FrameProcessor @@ -39,8 +40,9 @@ class RTVIServiceOption(BaseModel): name: str type: Literal["bool", "number", "string", "array", "object"] - handler: Callable[["RTVIProcessor", str, "RTVIServiceOptionConfig"], - Awaitable[None]] = Field(exclude=True) + handler: Callable[["RTVIProcessor", str, "RTVIServiceOptionConfig"], Awaitable[None]] = Field( + exclude=True + ) class RTVIService(BaseModel): @@ -70,8 +72,9 @@ class RTVIAction(BaseModel): action: str arguments: List[RTVIActionArgument] = [] result: Literal["bool", "number", "string", "array", "object"] - handler: Callable[["RTVIProcessor", str, Dict[str, Any]], - Awaitable[ActionResult]] = Field(exclude=True) + handler: Callable[["RTVIProcessor", str, Dict[str, Any]], Awaitable[ActionResult]] = Field( + exclude=True + ) _arguments_dict: Dict[str, RTVIActionArgument] = PrivateAttr(default={}) def model_post_init(self, __context: Any) -> None: @@ -122,6 +125,7 @@ class RTVIMessage(BaseModel): id: str data: Optional[Dict[str, Any]] = None + # # Pipecat -> Client responses and messages. # @@ -268,12 +272,13 @@ class RTVIProcessorParams(BaseModel): class RTVIProcessor(FrameProcessor): - - def __init__(self, - *, - config: RTVIConfig = RTVIConfig(config=[]), - params: RTVIProcessorParams = RTVIProcessorParams(), - **kwargs): + def __init__( + self, + *, + config: RTVIConfig = RTVIConfig(config=[]), + params: RTVIProcessorParams = RTVIProcessorParams(), + **kwargs, + ): super().__init__(sync=False, **kwargs) self._config = config self._params = params @@ -310,25 +315,23 @@ async def set_client_ready(self): await self._maybe_send_bot_ready() async def handle_function_call( - self, - function_name: str, - tool_call_id: str, - arguments: dict, - llm: FrameProcessor, - context: OpenAILLMContext, - result_callback): + self, + function_name: str, + tool_call_id: str, + arguments: dict, + llm: FrameProcessor, + context: OpenAILLMContext, + result_callback, + ): fn = RTVILLMFunctionCallMessageData( - function_name=function_name, - tool_call_id=tool_call_id, - args=arguments) + function_name=function_name, tool_call_id=tool_call_id, args=arguments + ) message = RTVILLMFunctionCallMessage(data=fn) await self._push_transport_message(message, exclude_none=False) async def handle_function_call_start( - self, - function_name: str, - llm: FrameProcessor, - context: OpenAILLMContext): + self, function_name: str, llm: FrameProcessor, context: OpenAILLMContext + ): fn = RTVILLMFunctionCallStartMessageData(function_name=function_name) message = RTVILLMFunctionCallStartMessage(data=fn) await self._push_transport_message(message, exclude_none=False) @@ -357,10 +360,14 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # finish and the task finishes when EndFrame is processed. 
await self.push_frame(frame, direction) await self._stop(frame) - elif isinstance(frame, UserStartedSpeakingFrame) or isinstance(frame, UserStoppedSpeakingFrame): + elif isinstance(frame, UserStartedSpeakingFrame) or isinstance( + frame, UserStoppedSpeakingFrame + ): await self._handle_interruptions(frame) await self.push_frame(frame, direction) - elif isinstance(frame, BotStartedSpeakingFrame) or isinstance(frame, BotStoppedSpeakingFrame): + elif isinstance(frame, BotStartedSpeakingFrame) or isinstance( + frame, BotStoppedSpeakingFrame + ): await self._handle_bot_speaking(frame) await self.push_frame(frame, direction) # Data frames @@ -393,8 +400,8 @@ async def _cancel(self, frame: CancelFrame): async def _push_transport_message(self, model: BaseModel, exclude_none: bool = True): frame = TransportMessageFrame( - message=model.model_dump(exclude_none=exclude_none), - urgent=True) + message=model.model_dump(exclude_none=exclude_none), urgent=True + ) await self.push_frame(frame) async def _handle_transcriptions(self, frame: Frame): @@ -405,17 +412,15 @@ async def _handle_transcriptions(self, frame: Frame): if isinstance(frame, TranscriptionFrame): message = RTVITranscriptionMessage( data=RTVITranscriptionMessageData( - text=frame.text, - user_id=frame.user_id, - timestamp=frame.timestamp, - final=True)) + text=frame.text, user_id=frame.user_id, timestamp=frame.timestamp, final=True + ) + ) elif isinstance(frame, InterimTranscriptionFrame): message = RTVITranscriptionMessage( data=RTVITranscriptionMessageData( - text=frame.text, - user_id=frame.user_id, - timestamp=frame.timestamp, - final=False)) + text=frame.text, user_id=frame.user_id, timestamp=frame.timestamp, final=False + ) + ) if message: await self._push_transport_message(message) @@ -539,7 +544,8 @@ async def _handle_function_call_result(self, data): function_name=data.function_name, tool_call_id=data.tool_call_id, arguments=data.arguments, - result=data.result) + result=data.result, + ) await self.push_frame(frame) async def _handle_action(self, request_id: str, data: RTVIActionRun): @@ -567,9 +573,8 @@ async def _send_bot_ready(self): message = RTVIBotReady( id=self._client_ready_id, - data=RTVIBotReadyData( - version=RTVI_PROTOCOL_VERSION, - config=self._config.config)) + data=RTVIBotReadyData(version=RTVI_PROTOCOL_VERSION, config=self._config.config), + ) await self._push_transport_message(message) async def _send_error_frame(self, frame: ErrorFrame): diff --git a/src/pipecat/processors/gstreamer/pipeline_source.py b/src/pipecat/processors/gstreamer/pipeline_source.py index f852dd641..9f8471153 100644 --- a/src/pipecat/processors/gstreamer/pipeline_source.py +++ b/src/pipecat/processors/gstreamer/pipeline_source.py @@ -15,20 +15,23 @@ OutputAudioRawFrame, OutputImageRawFrame, StartFrame, - SystemFrame) + SystemFrame, +) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from loguru import logger try: import gi - gi.require_version('Gst', '1.0') - gi.require_version('GstApp', '1.0') + + gi.require_version("Gst", "1.0") + gi.require_version("GstApp", "1.0") from gi.repository import Gst, GstApp except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use GStreamer, you need to `pip install pipecat-ai[gstreamer]`. Also, you need to install GStreamer in your system.") + "In order to use GStreamer, you need to `pip install pipecat-ai[gstreamer]`. Also, you need to install GStreamer in your system." 
+ ) raise Exception(f"Missing module: {e}") @@ -120,7 +123,8 @@ def _decodebin_audio(self, pad: Gst.Pad): audioresample = Gst.ElementFactory.make("audioresample", None) audiocapsfilter = Gst.ElementFactory.make("capsfilter", None) audiocaps = Gst.Caps.from_string( - f"audio/x-raw,format=S16LE,rate={self._out_params.audio_sample_rate},channels={self._out_params.audio_channels},layout=interleaved") + f"audio/x-raw,format=S16LE,rate={self._out_params.audio_sample_rate},channels={self._out_params.audio_channels},layout=interleaved" + ) audiocapsfilter.set_property("caps", audiocaps) appsink_audio = Gst.ElementFactory.make("appsink", None) appsink_audio.set_property("emit-signals", True) @@ -152,7 +156,8 @@ def _decodebin_video(self, pad: Gst.Pad): videoscale = Gst.ElementFactory.make("videoscale", None) videocapsfilter = Gst.ElementFactory.make("capsfilter", None) videocaps = Gst.Caps.from_string( - f"video/x-raw,format=RGB,width={self._out_params.video_width},height={self._out_params.video_height}") + f"video/x-raw,format=RGB,width={self._out_params.video_width},height={self._out_params.video_height}" + ) videocapsfilter.set_property("caps", videocaps) appsink_video = Gst.ElementFactory.make("appsink", None) @@ -182,9 +187,11 @@ def _decodebin_video(self, pad: Gst.Pad): def _appsink_audio_new_sample(self, appsink: GstApp.AppSink): buffer = appsink.pull_sample().get_buffer() (_, info) = buffer.map(Gst.MapFlags.READ) - frame = OutputAudioRawFrame(audio=info.data, - sample_rate=self._out_params.audio_sample_rate, - num_channels=self._out_params.audio_channels) + frame = OutputAudioRawFrame( + audio=info.data, + sample_rate=self._out_params.audio_sample_rate, + num_channels=self._out_params.audio_channels, + ) asyncio.run_coroutine_threadsafe(self.push_frame(frame), self.get_event_loop()) buffer.unmap(info) return Gst.FlowReturn.OK @@ -195,7 +202,8 @@ def _appsink_video_new_sample(self, appsink: GstApp.AppSink): frame = OutputImageRawFrame( image=info.data, size=(self._out_params.video_width, self._out_params.video_height), - format="RGB") + format="RGB", + ) asyncio.run_coroutine_threadsafe(self.push_frame(frame), self.get_event_loop()) buffer.unmap(info) return Gst.FlowReturn.OK diff --git a/src/pipecat/processors/idle_frame_processor.py b/src/pipecat/processors/idle_frame_processor.py index 42b81517e..576cb9087 100644 --- a/src/pipecat/processors/idle_frame_processor.py +++ b/src/pipecat/processors/idle_frame_processor.py @@ -19,12 +19,13 @@ class IdleFrameProcessor(FrameProcessor): """ def __init__( - self, - *, - callback: Callable[["IdleFrameProcessor"], Awaitable[None]], - timeout: float, - types: List[type] = [], - **kwargs): + self, + *, + callback: Callable[["IdleFrameProcessor"], Awaitable[None]], + timeout: float, + types: List[type] = [], + **kwargs, + ): super().__init__(sync=False, **kwargs) self._callback = callback diff --git a/src/pipecat/processors/logger.py b/src/pipecat/processors/logger.py index 79334ba73..a26c67014 100644 --- a/src/pipecat/processors/logger.py +++ b/src/pipecat/processors/logger.py @@ -8,6 +8,7 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from loguru import logger from typing import Optional + logger = logger.opt(ansi=True) @@ -19,7 +20,9 @@ def __init__( ignored_frame_types: Optional[list] = [ BotSpeakingFrame, AudioRawFrame, - TransportMessageFrame]): + TransportMessageFrame, + ], + ): super().__init__() self._prefix = prefix self._color = color diff --git a/src/pipecat/processors/metrics/frame_processor_metrics.py 
b/src/pipecat/processors/metrics/frame_processor_metrics.py index b9de3c2b4..52ea7e34c 100644 --- a/src/pipecat/processors/metrics/frame_processor_metrics.py +++ b/src/pipecat/processors/metrics/frame_processor_metrics.py @@ -7,7 +7,8 @@ MetricsData, ProcessingMetricsData, TTFBMetricsData, - TTSUsageMetricsData) + TTSUsageMetricsData, +) from loguru import logger @@ -42,9 +43,8 @@ async def stop_ttfb_metrics(self): value = time.time() - self._start_ttfb_time logger.debug(f"{self._processor_name()} TTFB: {value}") ttfb = TTFBMetricsData( - processor=self._processor_name(), - value=value, - model=self._model_name()) + processor=self._processor_name(), value=value, model=self._model_name() + ) self._start_ttfb_time = 0 return MetricsFrame(data=[ttfb]) @@ -58,24 +58,24 @@ async def stop_processing_metrics(self): value = time.time() - self._start_processing_time logger.debug(f"{self._processor_name()} processing time: {value}") processing = ProcessingMetricsData( - processor=self._processor_name(), value=value, model=self._model_name()) + processor=self._processor_name(), value=value, model=self._model_name() + ) self._start_processing_time = 0 return MetricsFrame(data=[processing]) async def start_llm_usage_metrics(self, tokens: LLMTokenUsage): logger.debug( - f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}") + f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}" + ) value = LLMUsageMetricsData( - processor=self._processor_name(), - model=self._model_name(), - value=tokens) + processor=self._processor_name(), model=self._model_name(), value=tokens + ) return MetricsFrame(data=[value]) async def start_tts_usage_metrics(self, text: str): characters = TTSUsageMetricsData( - processor=self._processor_name(), - model=self._model_name(), - value=len(text)) + processor=self._processor_name(), model=self._model_name(), value=len(text) + ) logger.debug(f"{self._processor_name()} usage characters: { characters.value}") return MetricsFrame(data=[characters]) diff --git a/src/pipecat/processors/metrics/sentry.py b/src/pipecat/processors/metrics/sentry.py index da3057e76..e37dd9d44 100644 --- a/src/pipecat/processors/metrics/sentry.py +++ b/src/pipecat/processors/metrics/sentry.py @@ -3,14 +3,13 @@ try: import sentry_sdk + sentry_available = sentry_sdk.is_initialized() if not sentry_available: - logger.warning( - "Sentry SDK not initialized. Sentry features will be disabled.") + logger.warning("Sentry SDK not initialized. Sentry features will be disabled.") except ImportError: sentry_available = False - logger.warning( - "Sentry SDK not installed. Sentry features will be disabled.") + logger.warning("Sentry SDK not installed. 
Sentry features will be disabled.") from pipecat.processors.metrics.frame_processor_metrics import FrameProcessorMetrics @@ -28,7 +27,7 @@ async def start_ttfb_metrics(self, report_only_initial_ttfb): self._ttfb_metrics_span = sentry_sdk.start_span( op="ttfb", description=f"TTFB for {self._processor_name()}", - start_timestamp=self._start_ttfb_time + start_timestamp=self._start_ttfb_time, ) logger.debug(f"Sentry Span ID: {self._ttfb_metrics_span.span_id} Description: { self._ttfb_metrics_span.description} started.") @@ -45,7 +44,7 @@ async def start_processing_metrics(self): self._processing_metrics_span = sentry_sdk.start_span( op="processing", description=f"Processing for {self._processor_name()}", - start_timestamp=self._start_processing_time + start_timestamp=self._start_processing_time, ) logger.debug(f"Sentry Span ID: {self._processing_metrics_span.span_id} Description: { self._processing_metrics_span.description} started.") diff --git a/src/pipecat/processors/user_idle_processor.py b/src/pipecat/processors/user_idle_processor.py index 36c394a5d..31d49cf5a 100644 --- a/src/pipecat/processors/user_idle_processor.py +++ b/src/pipecat/processors/user_idle_processor.py @@ -12,7 +12,8 @@ BotSpeakingFrame, Frame, UserStartedSpeakingFrame, - UserStoppedSpeakingFrame) + UserStoppedSpeakingFrame, +) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor @@ -24,11 +25,12 @@ class UserIdleProcessor(FrameProcessor): """ def __init__( - self, - *, - callback: Callable[["UserIdleProcessor"], Awaitable[None]], - timeout: float, - **kwargs): + self, + *, + callback: Callable[["UserIdleProcessor"], Awaitable[None]], + timeout: float, + **kwargs, + ): super().__init__(sync=False, **kwargs) self._callback = callback diff --git a/src/pipecat/serializers/base_serializer.py b/src/pipecat/serializers/base_serializer.py index 83613d9ce..96f5fd214 100644 --- a/src/pipecat/serializers/base_serializer.py +++ b/src/pipecat/serializers/base_serializer.py @@ -10,7 +10,6 @@ class FrameSerializer(ABC): - @abstractmethod def serialize(self, frame: Frame) -> str | bytes | None: pass diff --git a/src/pipecat/serializers/livekit.py b/src/pipecat/serializers/livekit.py index fec5243f5..29d32b861 100644 --- a/src/pipecat/serializers/livekit.py +++ b/src/pipecat/serializers/livekit.py @@ -7,10 +7,7 @@ import ctypes import pickle -from pipecat.frames.frames import ( - Frame, - InputAudioRawFrame, - OutputAudioRawFrame) +from pipecat.frames.frames import Frame, InputAudioRawFrame, OutputAudioRawFrame from pipecat.serializers.base_serializer import FrameSerializer from loguru import logger @@ -19,8 +16,7 @@ from livekit.rtc import AudioFrame except ModuleNotFoundError as e: logger.error(f"Exception: {e}") - logger.error( - "In order to use LiveKit, you need to `pip install pipecat-ai[livekit]`.") + logger.error("In order to use LiveKit, you need to `pip install pipecat-ai[livekit]`.") raise Exception(f"Missing module: {e}") @@ -37,7 +33,7 @@ def serialize(self, frame: Frame) -> str | bytes | None: return pickle.dumps(audio_frame) def deserialize(self, data: str | bytes) -> Frame | None: - audio_frame: AudioFrame = pickle.loads(data)['frame'] + audio_frame: AudioFrame = pickle.loads(data)["frame"] return InputAudioRawFrame( audio=bytes(audio_frame.data), sample_rate=audio_frame.sample_rate, diff --git a/src/pipecat/serializers/protobuf.py b/src/pipecat/serializers/protobuf.py index 6ae1b0c03..2adf403a5 100644 --- a/src/pipecat/serializers/protobuf.py +++ b/src/pipecat/serializers/protobuf.py @@ -8,11 
+8,7 @@ import pipecat.frames.protobufs.frames_pb2 as frame_protos -from pipecat.frames.frames import ( - AudioRawFrame, - Frame, - TextFrame, - TranscriptionFrame) +from pipecat.frames.frames import AudioRawFrame, Frame, TextFrame, TranscriptionFrame from pipecat.serializers.base_serializer import FrameSerializer from loguru import logger @@ -22,7 +18,7 @@ class ProtobufFrameSerializer(FrameSerializer): SERIALIZABLE_TYPES = { TextFrame: "text", AudioRawFrame: "audio", - TranscriptionFrame: "transcription" + TranscriptionFrame: "transcription", } SERIALIZABLE_FIELDS = {v: k for k, v in SERIALIZABLE_TYPES.items()} diff --git a/src/pipecat/serializers/twilio.py b/src/pipecat/serializers/twilio.py index ed2905a40..c0d4c0c47 100644 --- a/src/pipecat/serializers/twilio.py +++ b/src/pipecat/serializers/twilio.py @@ -9,10 +9,7 @@ from pydantic import BaseModel -from pipecat.frames.frames import ( - AudioRawFrame, - Frame, - StartInterruptionFrame) +from pipecat.frames.frames import AudioRawFrame, Frame, StartInterruptionFrame from pipecat.serializers.base_serializer import FrameSerializer from pipecat.utils.audio import ulaw_to_pcm, pcm_to_ulaw @@ -30,15 +27,12 @@ def serialize(self, frame: Frame) -> str | bytes | None: if isinstance(frame, AudioRawFrame): data = frame.audio - serialized_data = pcm_to_ulaw( - data, frame.sample_rate, self._params.twilio_sample_rate) + serialized_data = pcm_to_ulaw(data, frame.sample_rate, self._params.twilio_sample_rate) payload = base64.b64encode(serialized_data).decode("utf-8") answer = { "event": "media", "streamSid": self._stream_sid, - "media": { - "payload": payload - } + "media": {"payload": payload}, } return json.dumps(answer) @@ -57,11 +51,9 @@ def deserialize(self, data: str | bytes) -> Frame | None: payload = base64.b64decode(payload_base64) deserialized_data = ulaw_to_pcm( - payload, - self._params.twilio_sample_rate, - self._params.sample_rate) + payload, self._params.twilio_sample_rate, self._params.sample_rate + ) audio_frame = AudioRawFrame( - audio=deserialized_data, - num_channels=1, - sample_rate=self._params.sample_rate) + audio=deserialized_data, num_channels=1, sample_rate=self._params.sample_rate + ) return audio_frame diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index dc75b9793..cdad3de52 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -31,7 +31,7 @@ TTSVoiceUpdateFrame, TextFrame, UserImageRequestFrame, - VisionImageRawFrame + VisionImageRawFrame, ) from pipecat.metrics.metrics import MetricsData from pipecat.processors.frame_processor import FrameDirection, FrameProcessor @@ -114,12 +114,8 @@ def has_function(self, function_name: str): return function_name in self._callbacks.keys() async def call_function( - self, - *, - context: OpenAILLMContext, - tool_call_id: str, - function_name: str, - arguments: str) -> None: + self, *, context: OpenAILLMContext, tool_call_id: str, function_name: str, arguments: str + ) -> None: f = None if function_name in self._callbacks.keys(): f = self._callbacks[function_name] @@ -128,11 +124,8 @@ async def call_function( else: return None await context.call_function( - f, - function_name=function_name, - tool_call_id=tool_call_id, - arguments=arguments, - llm=self) + f, function_name=function_name, tool_call_id=tool_call_id, arguments=arguments, llm=self + ) # QUESTION FOR CB: maybe this isn't needed anymore? 
async def call_start_function(self, context: OpenAILLMContext, function_name: str): @@ -142,21 +135,23 @@ async def call_start_function(self, context: OpenAILLMContext, function_name: st return await self._start_callbacks[None](function_name, self, context) async def request_image_frame(self, user_id: str, *, text_content: str | None = None): - await self.push_frame(UserImageRequestFrame(user_id=user_id, context=text_content), - FrameDirection.UPSTREAM) + await self.push_frame( + UserImageRequestFrame(user_id=user_id, context=text_content), FrameDirection.UPSTREAM + ) class TTSService(AIService): def __init__( - self, - *, - aggregate_sentences: bool = True, - # if True, TTSService will push TextFrames and LLMFullResponseEndFrames, - # otherwise subclass must do it - push_text_frames: bool = True, - # TTS output sample rate - sample_rate: int = 16000, - **kwargs): + self, + *, + aggregate_sentences: bool = True, + # if True, TTSService will push TextFrames and LLMFullResponseEndFrames, + # otherwise subclass must do it + push_text_frames: bool = True, + # TTS output sample rate + sample_rate: int = 16000, + **kwargs, + ): super().__init__(**kwargs) self._aggregate_sentences: bool = aggregate_sentences self._push_text_frames: bool = push_text_frames @@ -247,12 +242,13 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): class AsyncTTSService(TTSService): def __init__( - self, - # if True, TTSService will push TTSStoppedFrames, otherwise subclass must do it - push_stop_frames: bool = False, - # if push_stop_frames is True, wait for this idle period before pushing TTSStoppedFrame - stop_frame_timeout_s: float = 1.0, - **kwargs): + self, + # if True, TTSService will push TTSStoppedFrames, otherwise subclass must do it + push_stop_frames: bool = False, + # if push_stop_frames is True, wait for this idle period before pushing TTSStoppedFrame + stop_frame_timeout_s: float = 1.0, + **kwargs, + ): super().__init__(sync=False, **kwargs) self._push_stop_frames: bool = push_stop_frames self._stop_frame_timeout_s: float = stop_frame_timeout_s @@ -286,10 +282,11 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect await super().push_frame(frame, direction) if self._push_stop_frames and ( - isinstance(frame, StartInterruptionFrame) or - isinstance(frame, TTSStartedFrame) or - isinstance(frame, TTSAudioRawFrame) or - isinstance(frame, TTSStoppedFrame)): + isinstance(frame, StartInterruptionFrame) + or isinstance(frame, TTSStartedFrame) + or isinstance(frame, TTSAudioRawFrame) + or isinstance(frame, TTSStoppedFrame) + ): await self._stop_frame_queue.put(frame) async def _stop_frame_handler(self): @@ -297,8 +294,9 @@ async def _stop_frame_handler(self): has_started = False while True: try: - frame = await asyncio.wait_for(self._stop_frame_queue.get(), - self._stop_frame_timeout_s) + frame = await asyncio.wait_for( + self._stop_frame_queue.get(), self._stop_frame_timeout_s + ) if isinstance(frame, TTSStartedFrame): has_started = True elif isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)): @@ -327,7 +325,7 @@ def reset_word_timestamps(self): self._word_timestamps = [] async def add_word_timestamps(self, word_times: List[Tuple[str, float]]): - for (word, timestamp) in word_times: + for word, timestamp in word_times: await self._words_queue.put((word, seconds_to_nanoseconds(timestamp))) async def stop(self, frame: EndFrame): @@ -414,14 +412,16 @@ class SegmentedSTTService(STTService): """ - def __init__(self, - *, - min_volume: float = 0.6, - 
max_silence_secs: float = 0.3, - max_buffer_secs: float = 1.5, - sample_rate: int = 16000, - num_channels: int = 1, - **kwargs): + def __init__( + self, + *, + min_volume: float = 0.6, + max_silence_secs: float = 0.3, + max_buffer_secs: float = 1.5, + sample_rate: int = 16000, + num_channels: int = 1, + **kwargs, + ): super().__init__(**kwargs) self._min_volume = min_volume self._max_silence_secs = max_silence_secs @@ -450,7 +450,8 @@ async def process_audio_frame(self, frame: AudioRawFrame): silence_secs = self._silence_num_frames / self._sample_rate buffer_secs = self._wave.getnframes() / self._sample_rate if self._content.tell() > 0 and ( - buffer_secs > self._max_buffer_secs or silence_secs > self._max_silence_secs): + buffer_secs > self._max_buffer_secs or silence_secs > self._max_silence_secs + ): self._silence_num_frames = 0 self._wave.close() self._content.seek(0) @@ -477,7 +478,6 @@ def _get_smoothed_volume(self, frame: AudioRawFrame) -> float: class ImageGenService(AIService): - def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 421196e2c..8b8e187ea 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -28,18 +28,18 @@ LLMFullResponseEndFrame, FunctionCallResultFrame, FunctionCallInProgressFrame, - StartInterruptionFrame + StartInterruptionFrame, ) from pipecat.metrics.metrics import LLMTokenUsage from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import LLMService from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, - OpenAILLMContextFrame + OpenAILLMContextFrame, ) from pipecat.processors.aggregators.llm_response import ( LLMUserContextAggregator, - LLMAssistantContextAggregator + LLMAssistantContextAggregator, ) from loguru import logger @@ -49,8 +49,9 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`. " + - "Also, set `ANTHROPIC_API_KEY` environment variable.") + "In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`. " + + "Also, set `ANTHROPIC_API_KEY` environment variable." 
+ ) raise Exception(f"Missing module: {e}") @@ -62,19 +63,19 @@ class AnthropicImageMessageFrame(Frame): @dataclass class AnthropicContextAggregatorPair: - _user: 'AnthropicUserContextAggregator' - _assistant: 'AnthropicAssistantContextAggregator' + _user: "AnthropicUserContextAggregator" + _assistant: "AnthropicAssistantContextAggregator" - def user(self) -> 'AnthropicUserContextAggregator': + def user(self) -> "AnthropicUserContextAggregator": return self._user - def assistant(self) -> 'AnthropicAssistantContextAggregator': + def assistant(self) -> "AnthropicAssistantContextAggregator": return self._assistant class AnthropicLLMService(LLMService): - """This class implements inference with Anthropic's AI models - """ + """This class implements inference with Anthropic's AI models""" + class InputParams(BaseModel): enable_prompt_caching_beta: Optional[bool] = False max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1) @@ -84,12 +85,13 @@ class InputParams(BaseModel): extra: Optional[Dict[str, Any]] = Field(default_factory=dict) def __init__( - self, - *, - api_key: str, - model: str = "claude-3-5-sonnet-20240620", - params: InputParams = InputParams(), - **kwargs): + self, + *, + api_key: str, + model: str = "claude-3-5-sonnet-20240620", + params: InputParams = InputParams(), + **kwargs, + ): super().__init__(**kwargs) self._client = AsyncAnthropic(api_key=api_key) self.set_model_name(model) @@ -111,10 +113,7 @@ def enable_prompt_caching_beta(self) -> bool: def create_context_aggregator(context: OpenAILLMContext) -> AnthropicContextAggregatorPair: user = AnthropicUserContextAggregator(context) assistant = AnthropicAssistantContextAggregator(user) - return AnthropicContextAggregatorPair( - _user=user, - _assistant=assistant - ) + return AnthropicContextAggregatorPair(_user=user, _assistant=assistant) async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool): logger.debug(f"Switching LLM enable_prompt_caching_beta to: [{enable_prompt_caching_beta}]") @@ -157,7 +156,8 @@ async def _process_context(self, context: OpenAILLMContext): await self.start_processing_metrics() logger.debug( - f"Generating chat: {context.system} | {context.get_messages_for_logging()}") + f"Generating chat: {context.system} | {context.get_messages_for_logging()}" + ) messages = context.messages if self._enable_prompt_caching_beta: @@ -178,7 +178,7 @@ async def _process_context(self, context: OpenAILLMContext): "stream": True, "temperature": self._temperature, "top_k": self._top_k, - "top_p": self._top_p + "top_p": self._top_p, } params.update(self._extra) @@ -189,54 +189,70 @@ async def _process_context(self, context: OpenAILLMContext): # Function calling tool_use_block = None - json_accumulator = '' + json_accumulator = "" async for event in response: # logger.debug(f"Anthropic LLM event: {event}") # Aggregate streaming content, create frames, trigger events - if (event.type == "content_block_delta"): - if hasattr(event.delta, 'text'): + if event.type == "content_block_delta": + if hasattr(event.delta, "text"): await self.push_frame(TextFrame(event.delta.text)) completion_tokens_estimate += self._estimate_tokens(event.delta.text) - elif hasattr(event.delta, 'partial_json') and tool_use_block: + elif hasattr(event.delta, "partial_json") and tool_use_block: json_accumulator += event.delta.partial_json completion_tokens_estimate += self._estimate_tokens( - event.delta.partial_json) - elif (event.type == "content_block_start"): + event.delta.partial_json + ) + elif event.type == 
"content_block_start": if event.content_block.type == "tool_use": tool_use_block = event.content_block - json_accumulator = '' - elif ((event.type == "message_delta" and - hasattr(event.delta, 'stop_reason') - and event.delta.stop_reason == 'tool_use')): + json_accumulator = "" + elif ( + event.type == "message_delta" + and hasattr(event.delta, "stop_reason") + and event.delta.stop_reason == "tool_use" + ): if tool_use_block: - await self.call_function(context=context, - tool_call_id=tool_use_block.id, - function_name=tool_use_block.name, - arguments=json.loads(json_accumulator) if json_accumulator else dict() - ) + await self.call_function( + context=context, + tool_call_id=tool_use_block.id, + function_name=tool_use_block.name, + arguments=json.loads(json_accumulator) if json_accumulator else dict(), + ) # Calculate usage. Do this here in its own if statement, because there may be usage # data embedded in messages that we do other processing for, above. if hasattr(event, "usage"): - prompt_tokens += event.usage.input_tokens if hasattr( - event.usage, "input_tokens") else 0 - completion_tokens += event.usage.output_tokens if hasattr( - event.usage, "output_tokens") else 0 + prompt_tokens += ( + event.usage.input_tokens if hasattr(event.usage, "input_tokens") else 0 + ) + completion_tokens += ( + event.usage.output_tokens if hasattr(event.usage, "output_tokens") else 0 + ) elif hasattr(event, "message") and hasattr(event.message, "usage"): - prompt_tokens += event.message.usage.input_tokens if hasattr( - event.message.usage, "input_tokens") else 0 - completion_tokens += event.message.usage.output_tokens if hasattr( - event.message.usage, "output_tokens") else 0 + prompt_tokens += ( + event.message.usage.input_tokens + if hasattr(event.message.usage, "input_tokens") + else 0 + ) + completion_tokens += ( + event.message.usage.output_tokens + if hasattr(event.message.usage, "output_tokens") + else 0 + ) if hasattr(event.message.usage, "cache_creation_input_tokens"): - cache_creation_input_tokens += event.message.usage.cache_creation_input_tokens + cache_creation_input_tokens += ( + event.message.usage.cache_creation_input_tokens + ) logger.debug(f"Cache creation input tokens: {cache_creation_input_tokens}") if hasattr(event.message.usage, "cache_read_input_tokens"): cache_read_input_tokens += event.message.usage.cache_read_input_tokens logger.debug(f"Cache read input tokens: {cache_read_input_tokens}") - total_input_tokens = prompt_tokens + cache_creation_input_tokens + cache_read_input_tokens + total_input_tokens = ( + prompt_tokens + cache_creation_input_tokens + cache_read_input_tokens + ) if total_input_tokens >= 1024: context.turns_above_cache_threshold += 1 @@ -251,12 +267,16 @@ async def _process_context(self, context: OpenAILLMContext): finally: await self.stop_processing_metrics() await self.push_frame(LLMFullResponseEndFrame()) - comp_tokens = completion_tokens if not use_completion_tokens_estimate else completion_tokens_estimate + comp_tokens = ( + completion_tokens + if not use_completion_tokens_estimate + else completion_tokens_estimate + ) await self._report_usage_metrics( prompt_tokens=prompt_tokens, completion_tokens=comp_tokens, cache_creation_input_tokens=cache_creation_input_tokens, - cache_read_input_tokens=cache_read_input_tokens + cache_read_input_tokens=cache_read_input_tokens, ) async def process_frame(self, frame: Frame, direction: FrameDirection): @@ -286,21 +306,27 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await 
self._process_context(context) def _estimate_tokens(self, text: str) -> int: - return int(len(re.split(r'[^\w]+', text)) * 1.3) + return int(len(re.split(r"[^\w]+", text)) * 1.3) async def _report_usage_metrics( - self, - prompt_tokens: int, - completion_tokens: int, - cache_creation_input_tokens: int, - cache_read_input_tokens: int): - if prompt_tokens or completion_tokens or cache_creation_input_tokens or cache_read_input_tokens: + self, + prompt_tokens: int, + completion_tokens: int, + cache_creation_input_tokens: int, + cache_read_input_tokens: int, + ): + if ( + prompt_tokens + or completion_tokens + or cache_creation_input_tokens + or cache_read_input_tokens + ): tokens = LLMTokenUsage( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, cache_creation_input_tokens=cache_creation_input_tokens, cache_read_input_tokens=cache_read_input_tokens, - total_tokens=prompt_tokens + completion_tokens + total_tokens=prompt_tokens + completion_tokens, ) await self.start_llm_usage_metrics(tokens) @@ -312,7 +338,7 @@ def __init__( tools: list[dict] | None = None, tool_choice: dict | None = None, *, - system: str | NotGiven = NOT_GIVEN + system: str | NotGiven = NOT_GIVEN, ): super().__init__(messages=messages, tools=tools, tool_choice=tool_choice) self._user_image_request_context = {} @@ -345,10 +371,8 @@ def from_messages(cls, messages: List[dict]) -> "AnthropicLLMContext": def from_image_frame(cls, frame: VisionImageRawFrame) -> "AnthropicLLMContext": context = cls() context.add_image_frame_message( - format=frame.format, - size=frame.size, - image=frame.image, - text=frame.text) + format=frame.format, size=frame.size, image=frame.image, text=frame.text + ) return context def set_messages(self, messages: List): @@ -357,18 +381,23 @@ def set_messages(self, messages: List): self._restructure_from_openai_messages() def add_image_frame_message( - self, *, format: str, size: tuple[int, int], image: bytes, text: str = None): + self, *, format: str, size: tuple[int, int], image: bytes, text: str = None + ): buffer = io.BytesIO() Image.frombytes(format, size, image).save(buffer, format="JPEG") encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8") # Anthropic docs say that the image should be the first content block in the message. 
- content = [{"type": "image", - "source": { - "type": "base64", - "media_type": "image/jpeg", - "data": encoded_image, - }}] + content = [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": encoded_image, + }, + } + ] if text: content.append({"type": "text", "text": text}) self.add_message({"role": "user", "content": content}) @@ -382,8 +411,9 @@ def add_message(self, message): # if the last message has just a content string, convert it to a list # in the proper format if isinstance(self.messages[-1]["content"], str): - self.messages[-1]["content"] = [{"type": "text", - "text": self.messages[-1]["content"]}] + self.messages[-1]["content"] = [ + {"type": "text", "text": self.messages[-1]["content"]} + ] # if this message has just a content string, convert it to a list # in the proper format if isinstance(message["content"], str): @@ -404,8 +434,11 @@ def get_messages_with_cache_control_markers(self) -> List[dict]: if isinstance(messages[-1]["content"], str): messages[-1]["content"] = [{"type": "text", "text": messages[-1]["content"]}] messages[-1]["content"][-1]["cache_control"] = {"type": "ephemeral"} - if (self.turns_above_cache_threshold >= 2 and - len(messages) > 2 and messages[-3]["role"] == "user"): + if ( + self.turns_above_cache_threshold >= 2 + and len(messages) > 2 + and messages[-3]["role"] == "user" + ): if isinstance(messages[-3]["content"], str): messages[-3]["content"] = [{"type": "text", "text": messages[-3]["content"]}] messages[-3]["content"][-1]["cache_control"] = {"type": "ephemeral"} @@ -459,12 +492,13 @@ async def process_frame(self, frame, direction): # The LLM sends a UserImageRequestFrame upstream. Cache any context provided with # that frame so we can use it when we assemble the image message in the assistant # context aggregator. - if (frame.context): + if frame.context: if isinstance(frame.context, str): self._context._user_image_request_context[frame.user_id] = frame.context else: logger.error( - f"Unexpected UserImageRequestFrame context type: {type(frame.context)}") + f"Unexpected UserImageRequestFrame context type: {type(frame.context)}" + ) del self._context._user_image_request_context[frame.user_id] else: if frame.user_id in self._context._user_image_request_context: @@ -481,6 +515,7 @@ async def process_frame(self, frame, direction): except Exception as e: logger.error(f"Error processing frame: {e}") + # # Claude returns a text content block along with a tool use content block. This works quite nicely # with streaming. We get the text first, so we can start streaming it right away. 
Then we get the @@ -508,13 +543,16 @@ async def process_frame(self, frame, direction): elif isinstance(frame, FunctionCallInProgressFrame): self._function_call_in_progress = frame elif isinstance(frame, FunctionCallResultFrame): - if (self._function_call_in_progress and self._function_call_in_progress.tool_call_id == - frame.tool_call_id): + if ( + self._function_call_in_progress + and self._function_call_in_progress.tool_call_id == frame.tool_call_id + ): self._function_call_in_progress = None self._function_call_result = frame else: logger.warning( - "FunctionCallResultFrame tool_call_id != InProgressFrame tool_call_id") + "FunctionCallResultFrame tool_call_id != InProgressFrame tool_call_id" + ) self._function_call_in_progress = None self._function_call_result = None elif isinstance(frame, AnthropicImageMessageFrame): @@ -534,31 +572,32 @@ async def _push_aggregation(self): frame = self._function_call_result self._function_call_result = None if frame.result: - self._context.add_message({ - "role": "assistant", - "content": [ - { - "type": "text", - "text": aggregation - }, - { - "type": "tool_use", - "id": frame.tool_call_id, - "name": frame.function_name, - "input": frame.arguments - } - ] - }) - self._context.add_message({ - "role": "user", - "content": [ - { - "type": "tool_result", - "tool_use_id": frame.tool_call_id, - "content": json.dumps(frame.result) - } - ] - }) + self._context.add_message( + { + "role": "assistant", + "content": [ + {"type": "text", "text": aggregation}, + { + "type": "tool_use", + "id": frame.tool_call_id, + "name": frame.function_name, + "input": frame.arguments, + }, + ], + } + ) + self._context.add_message( + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": frame.tool_call_id, + "content": json.dumps(frame.result), + } + ], + } + ) run_llm = True else: self._context.add_message({"role": "assistant", "content": aggregation}) @@ -570,7 +609,8 @@ async def _push_aggregation(self): format=frame.user_image_raw_frame.format, size=frame.user_image_raw_frame.size, image=frame.user_image_raw_frame.image, - text=frame.text) + text=frame.text, + ) run_llm = True if run_llm: diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 36c8bc1bb..24e73cd2a 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -21,7 +21,8 @@ TTSStartedFrame, TTSStoppedFrame, TranscriptionFrame, - URLImageRawFrame) + URLImageRawFrame, +) from pipecat.metrics.metrics import TTSUsageMetricsData from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import STTService, TTSService, ImageGenService @@ -45,18 +46,15 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use Azure, you need to `pip install pipecat-ai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.") + "In order to use Azure, you need to `pip install pipecat-ai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables." + ) raise Exception(f"Missing module: {e}") class AzureLLMService(BaseOpenAILLMService): def __init__( - self, - *, - api_key: str, - endpoint: str, - model: str, - api_version: str = "2023-12-01-preview"): + self, *, api_key: str, endpoint: str, model: str, api_version: str = "2023-12-01-preview" + ): # Initialize variables before calling parent __init__() because that # will call create_client() and we need those values there. 
self._endpoint = endpoint @@ -73,13 +71,14 @@ def create_client(self, api_key=None, base_url=None, **kwargs): class AzureTTSService(TTSService): def __init__( - self, - *, - api_key: str, - region: str, - voice="en-US-SaraNeural", - sample_rate: int = 16000, - **kwargs): + self, + *, + api_key: str, + region: str, + voice="en-US-SaraNeural", + sample_rate: int = 16000, + **kwargs, + ): super().__init__(sample_rate=sample_rate, **kwargs) speech_config = SpeechConfig(subscription=api_key, region=region) @@ -108,7 +107,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: "" "" f"{text}" - " ") + " " + ) result = await asyncio.to_thread(self._speech_synthesizer.speak_ssml, (ssml)) @@ -117,7 +117,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.stop_ttfb_metrics() await self.push_frame(TTSStartedFrame()) # Azure always sends a 44-byte header. Strip it off. - yield TTSAudioRawFrame(audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1) + yield TTSAudioRawFrame( + audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1 + ) await self.push_frame(TTSStoppedFrame()) elif result.reason == ResultReason.Canceled: cancellation_details = result.cancellation_details @@ -128,14 +130,15 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: class AzureSTTService(STTService): def __init__( - self, - *, - api_key: str, - region: str, - language="en-US", - sample_rate=16000, - channels=1, - **kwargs): + self, + *, + api_key: str, + region: str, + language="en-US", + sample_rate=16000, + channels=1, + **kwargs, + ): super().__init__(**kwargs) speech_config = SpeechConfig(subscription=api_key, region=region) @@ -146,7 +149,8 @@ def __init__( audio_config = AudioConfig(stream=self._audio_stream) self._speech_recognizer = SpeechRecognizer( - speech_config=speech_config, audio_config=audio_config) + speech_config=speech_config, audio_config=audio_config + ) self._speech_recognizer.recognized.connect(self._on_handle_recognized) async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: @@ -176,7 +180,6 @@ def _on_handle_recognized(self, event): class AzureImageGenServiceREST(ImageGenService): - def __init__( self, *, @@ -199,9 +202,7 @@ def __init__( async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]: url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}" - headers = { - "api-key": self._api_key, - "Content-Type": "application/json"} + headers = {"api-key": self._api_key, "Content-Type": "application/json"} body = { # Enter your prompt text here @@ -243,8 +244,6 @@ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]: image_stream = io.BytesIO(await response.content.read()) image = Image.open(image_stream) frame = URLImageRawFrame( - url=image_url, - image=image.tobytes(), - size=image.size, - format=image.format) + url=image_url, image=image.tobytes(), size=image.size, format=image.format + ) yield frame diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index a44daf70c..90deeda15 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -21,7 +21,7 @@ TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, - LLMFullResponseEndFrame + LLMFullResponseEndFrame, ) from pipecat.processors.frame_processor import FrameDirection from pipecat.transcriptions.language import Language @@ -36,7 +36,8 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") 
logger.error( - "In order to use Cartesia, you need to `pip install pipecat-ai[cartesia]`. Also, set `CARTESIA_API_KEY` environment variable.") + "In order to use Cartesia, you need to `pip install pipecat-ai[cartesia]`. Also, set `CARTESIA_API_KEY` environment variable." + ) raise Exception(f"Missing module: {e}") @@ -60,19 +61,19 @@ def language_to_cartesia_language(language: Language) -> str | None: class CartesiaTTSService(AsyncWordTTSService): - def __init__( - self, - *, - api_key: str, - voice_id: str, - cartesia_version: str = "2024-06-10", - url: str = "wss://api.cartesia.ai/tts/websocket", - model_id: str = "sonic-english", - encoding: str = "pcm_s16le", - sample_rate: int = 16000, - language: str = "en", - **kwargs): + self, + *, + api_key: str, + voice_id: str, + cartesia_version: str = "2024-06-10", + url: str = "wss://api.cartesia.ai/tts/websocket", + model_id: str = "sonic-english", + encoding: str = "pcm_s16le", + sample_rate: int = 16000, + language: str = "en", + **kwargs, + ): # Aggregating sentences still gives cleaner-sounding results and fewer # artifacts than streaming one word at a time. On average, waiting for a # full sentence should only "cost" us 15ms or so with GPT-4o or a Llama @@ -83,7 +84,9 @@ def __init__( # if we're interrupted. Cartesia gives us word-by-word timestamps. We # can use those to generate text frames ourselves aligned with the # playout timing of the audio! - super().__init__(aggregate_sentences=True, push_text_frames=False, sample_rate=sample_rate, **kwargs) + super().__init__( + aggregate_sentences=True, push_text_frames=False, sample_rate=sample_rate, **kwargs + ) self._api_key = api_key self._cartesia_version = cartesia_version @@ -175,10 +178,7 @@ async def flush_audio(self): "continue": False, "context_id": self._context_id, "model_id": self.model_name, - "voice": { - "mode": "id", - "id": self._voice_id - }, + "voice": {"mode": "id", "id": self._voice_id}, "output_format": self._output_format, "language": self._language, "add_timestamps": True, @@ -209,7 +209,7 @@ async def _receive_task_handler(self): frame = TTSAudioRawFrame( audio=base64.b64decode(msg["data"]), sample_rate=self._output_format["sample_rate"], - num_channels=1 + num_channels=1, ) await self.push_frame(frame) elif msg["type"] == "error": @@ -241,10 +241,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: "continue": True, "context_id": self._context_id, "model_id": self.model_name, - "voice": { - "mode": "id", - "id": self._voice_id - }, + "voice": {"mode": "id", "id": self._voice_id}, "output_format": self._output_format, "language": self._language, "add_timestamps": True, @@ -264,18 +261,18 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: class CartesiaHttpTTSService(TTSService): - def __init__( - self, - *, - api_key: str, - voice_id: str, - model_id: str = "sonic-english", - base_url: str = "https://api.cartesia.ai", - encoding: str = "pcm_s16le", - sample_rate: int = 16000, - language: str = "en", - **kwargs): + self, + *, + api_key: str, + voice_id: str, + model_id: str = "sonic-english", + base_url: str = "https://api.cartesia.ai", + encoding: str = "pcm_s16le", + sample_rate: int = 16000, + language: str = "en", + **kwargs, + ): super().__init__(**kwargs) self._api_key = api_key @@ -326,7 +323,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: voice_id=self._voice_id, output_format=self._output_format, language=self._language, - stream=False + stream=False, ) await self.stop_ttfb_metrics() @@ -334,7 
+331,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: frame = TTSAudioRawFrame( audio=output["audio"], sample_rate=self._output_format["sample_rate"], - num_channels=1 + num_channels=1, ) yield frame except Exception as e: diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 25f7b7a56..fab12e080 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -18,7 +18,8 @@ TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, - TranscriptionFrame) + TranscriptionFrame, +) from pipecat.services.ai_services import STTService, TTSService from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 @@ -34,27 +35,28 @@ DeepgramClientOptions, LiveTranscriptionEvents, LiveOptions, - LiveResultResponse + LiveResultResponse, ) except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use Deepgram, you need to `pip install pipecat-ai[deepgram]`. Also, set `DEEPGRAM_API_KEY` environment variable.") + "In order to use Deepgram, you need to `pip install pipecat-ai[deepgram]`. Also, set `DEEPGRAM_API_KEY` environment variable." + ) raise Exception(f"Missing module: {e}") class DeepgramTTSService(TTSService): - def __init__( - self, - *, - api_key: str, - aiohttp_session: aiohttp.ClientSession, - voice: str = "aura-helios-en", - base_url: str = "https://api.deepgram.com/v1/speak", - sample_rate: int = 16000, - encoding: str = "linear16", - **kwargs): + self, + *, + api_key: str, + aiohttp_session: aiohttp.ClientSession, + voice: str = "aura-helios-en", + base_url: str = "https://api.deepgram.com/v1/speak", + sample_rate: int = 16000, + encoding: str = "linear16", + **kwargs, + ): super().__init__(**kwargs) self._voice = voice @@ -93,8 +95,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: return logger.error( - f"{self} error getting audio (status: {r.status}, error: {response_text})") - yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {response_text})") + f"{self} error getting audio (status: {r.status}, error: {response_text})" + ) + yield ErrorFrame( + f"Error getting audio (status: {r.status}, error: {response_text})" + ) return await self.start_tts_usage_metrics(text) @@ -103,7 +108,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: async for data in r.content: await self.stop_ttfb_metrics() frame = TTSAudioRawFrame( - audio=data, sample_rate=self._sample_rate, num_channels=1) + audio=data, sample_rate=self._sample_rate, num_channels=1 + ) yield frame await self.push_frame(TTSStoppedFrame()) except Exception as e: @@ -111,36 +117,36 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: class DeepgramSTTService(STTService): - def __init__(self, - *, - api_key: str, - url: str = "", - live_options: LiveOptions = LiveOptions( - encoding="linear16", - language="en-US", - model="nova-2-conversationalai", - sample_rate=16000, - channels=1, - interim_results=True, - smart_format=True, - punctuate=True, - profanity_filter=True, - vad_events=False, - ), - **kwargs): + def __init__( + self, + *, + api_key: str, + url: str = "", + live_options: LiveOptions = LiveOptions( + encoding="linear16", + language="en-US", + model="nova-2-conversationalai", + sample_rate=16000, + channels=1, + interim_results=True, + smart_format=True, + punctuate=True, + profanity_filter=True, + vad_events=False, + ), + **kwargs, + ): super().__init__(**kwargs) self._live_options = live_options self._client 
= DeepgramClient( - api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"})) - self._connection: AsyncListenWebSocketClient = self._client.listen.asyncwebsocket.v( - "1") - self._connection.on( - LiveTranscriptionEvents.Transcript, self._on_message) + api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"}) + ) + self._connection: AsyncListenWebSocketClient = self._client.listen.asyncwebsocket.v("1") + self._connection.on(LiveTranscriptionEvents.Transcript, self._on_message) if self.vad_enabled: - self._connection.on( - LiveTranscriptionEvents.SpeechStarted, self._on_speech_started) + self._connection.on(LiveTranscriptionEvents.SpeechStarted, self._on_speech_started) @property def vad_enabled(self): @@ -206,7 +212,11 @@ async def _on_message(self, *args, **kwargs): if len(transcript) > 0: await self.stop_ttfb_metrics() if is_final: - await self.push_frame(TranscriptionFrame(transcript, "", time_now_iso8601(), language)) + await self.push_frame( + TranscriptionFrame(transcript, "", time_now_iso8601(), language) + ) await self.stop_processing_metrics() else: - await self.push_frame(InterimTranscriptionFrame(transcript, "", time_now_iso8601(), language)) + await self.push_frame( + InterimTranscriptionFrame(transcript, "", time_now_iso8601(), language) + ) diff --git a/src/pipecat/services/fal.py b/src/pipecat/services/fal.py index 58768180f..bb7b47dfc 100644 --- a/src/pipecat/services/fal.py +++ b/src/pipecat/services/fal.py @@ -22,7 +22,8 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use Fal, you need to `pip install pipecat-ai[fal]`. Also, set `FAL_KEY` environment variable.") + "In order to use Fal, you need to `pip install pipecat-ai[fal]`. Also, set `FAL_KEY` environment variable." + ) raise Exception(f"Missing module: {e}") @@ -43,7 +44,7 @@ def __init__( aiohttp_session: aiohttp.ClientSession, model: str = "fal-ai/fast-sdxl", key: str | None = None, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.set_model_name(model) @@ -57,7 +58,7 @@ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]: response = await fal_client.run_async( self.model_name, - arguments={"prompt": prompt, **self._params.model_dump(exclude_none=True)} + arguments={"prompt": prompt, **self._params.model_dump(exclude_none=True)}, ) image_url = response["images"][0]["url"] if response else None @@ -77,8 +78,6 @@ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]: image = Image.open(image_stream) frame = URLImageRawFrame( - url=image_url, - image=image.tobytes(), - size=image.size, - format=image.format) + url=image_url, image=image.tobytes(), size=image.size, format=image.format + ) yield frame diff --git a/src/pipecat/services/fireworks.py b/src/pipecat/services/fireworks.py index 87fddd838..a6e826c12 100644 --- a/src/pipecat/services/fireworks.py +++ b/src/pipecat/services/fireworks.py @@ -13,13 +13,16 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use Fireworks, you need to `pip install pipecat-ai[fireworks]`. Also, set the `FIREWORKS_API_KEY` environment variable.") + "In order to use Fireworks, you need to `pip install pipecat-ai[fireworks]`. Also, set the `FIREWORKS_API_KEY` environment variable." 
+ ) raise Exception(f"Missing module: {e}") class FireworksLLMService(BaseOpenAILLMService): - def __init__(self, - *, - model: str = "accounts/fireworks/models/firefunction-v1", - base_url: str = "https://api.fireworks.ai/inference/v1"): + def __init__( + self, + *, + model: str = "accounts/fireworks/models/firefunction-v1", + base_url: str = "https://api.fireworks.ai/inference/v1", + ): super().__init__(model=model, base_url=base_url) diff --git a/src/pipecat/services/gladia.py b/src/pipecat/services/gladia.py index ead8f63dc..12183adde 100644 --- a/src/pipecat/services/gladia.py +++ b/src/pipecat/services/gladia.py @@ -16,7 +16,8 @@ Frame, InterimTranscriptionFrame, StartFrame, - TranscriptionFrame) + TranscriptionFrame, +) from pipecat.services.ai_services import STTService from pipecat.utils.time import time_now_iso8601 @@ -28,7 +29,8 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use Gladia, you need to `pip install pipecat-ai[gladia]`. Also, set `GLADIA_API_KEY` environment variable.") + "In order to use Gladia, you need to `pip install pipecat-ai[gladia]`. Also, set `GLADIA_API_KEY` environment variable." + ) raise Exception(f"Missing module: {e}") @@ -40,13 +42,15 @@ class InputParams(BaseModel): endpointing: Optional[int] = 200 prosody: Optional[bool] = None - def __init__(self, - *, - api_key: str, - url: str = "wss://api.gladia.io/audio/text/audio-transcription", - confidence: float = 0.5, - params: InputParams = InputParams(), - **kwargs): + def __init__( + self, + *, + api_key: str, + url: str = "wss://api.gladia.io/audio/text/audio-transcription", + confidence: float = 0.5, + params: InputParams = InputParams(), + **kwargs, + ): super().__init__(sync=False, **kwargs) self._api_key = api_key @@ -80,15 +84,13 @@ async def _setup_gladia(self): "encoding": "WAV/PCM", "model_type": "fast", "language_behaviour": "manual", - **self._params.model_dump(exclude_none=True) + **self._params.model_dump(exclude_none=True), } await self._websocket.send(json.dumps(configuration)) async def _send_audio(self, audio: bytes): - message = { - 'frames': base64.b64encode(audio).decode("utf-8") - } + message = {"frames": base64.b64encode(audio).decode("utf-8")} await self._websocket.send(json.dumps(message)) async def _receive_task_handler(self): @@ -106,6 +108,10 @@ async def _receive_task_handler(self): transcript = utterance["transcription"] if confidence >= self._confidence: if type == "final": - await self.push_frame(TranscriptionFrame(transcript, "", time_now_iso8601())) + await self.push_frame( + TranscriptionFrame(transcript, "", time_now_iso8601()) + ) else: - await self.push_frame(InterimTranscriptionFrame(transcript, "", time_now_iso8601())) + await self.push_frame( + InterimTranscriptionFrame(transcript, "", time_now_iso8601()) + ) diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index b72169b70..4de6b77fa 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -15,11 +15,14 @@ VisionImageRawFrame, LLMMessagesFrame, LLMFullResponseStartFrame, - LLMFullResponseEndFrame + LLMFullResponseEndFrame, ) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import LLMService -from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame +from pipecat.processors.aggregators.openai_llm_context import ( + OpenAILLMContext, + OpenAILLMContextFrame, +) from loguru import logger @@ -29,7 +32,8 @@ except 
ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use Google AI, you need to `pip install pipecat-ai[google]`. Also, set `GOOGLE_API_KEY` environment variable.") + "In order to use Google AI, you need to `pip install pipecat-ai[google]`. Also, set `GOOGLE_API_KEY` environment variable." + ) raise Exception(f"Missing module: {e}") @@ -53,8 +57,7 @@ def _create_client(self, model: str): self.set_model_name(model) self._client = gai.GenerativeModel(model) - def _get_messages_from_openai_context( - self, context: OpenAILLMContext) -> List[glm.Content]: + def _get_messages_from_openai_context(self, context: OpenAILLMContext) -> List[glm.Content]: openai_messages = context.get_messages() google_messages = [] @@ -69,10 +72,12 @@ def _get_messages_from_openai_context( parts = [glm.Part(text=content)] if "mime_type" in message: parts.append( - glm.Part(inline_data=glm.Blob( - mime_type=message["mime_type"], - data=message["data"].getvalue() - ))) + glm.Part( + inline_data=glm.Blob( + mime_type=message["mime_type"], data=message["data"].getvalue() + ) + ) + ) google_messages.append({"role": role, "parts": parts}) return google_messages @@ -103,7 +108,8 @@ async def _process_context(self, context: OpenAILLMContext): # Google LLMs seem to flag safety issues a lot! if chunk.candidates[0].finish_reason == 3: logger.debug( - f"LLM refused to generate content for safety reasons - {messages}.") + f"LLM refused to generate content for safety reasons - {messages}." + ) else: logger.exception(f"{self} error: {e}") diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py index 9285f1583..1ac24d731 100644 --- a/src/pipecat/services/lmnt.py +++ b/src/pipecat/services/lmnt.py @@ -30,20 +30,21 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use LMNT, you need to `pip install pipecat-ai[lmnt]`. Also, set `LMNT_API_KEY` environment variable.") + "In order to use LMNT, you need to `pip install pipecat-ai[lmnt]`. Also, set `LMNT_API_KEY` environment variable." + ) raise Exception(f"Missing module: {e}") class LmntTTSService(AsyncTTSService): - def __init__( - self, - *, - api_key: str, - voice_id: str, - sample_rate: int = 24000, - language: str = "en", - **kwargs): + self, + *, + api_key: str, + voice_id: str, + sample_rate: int = 24000, + language: str = "en", + **kwargs, + ): # Let TTSService produce TTSStoppedFrames after a short delay of # no activity. 
super().__init__(sync=False, push_stop_frames=True, sample_rate=sample_rate, **kwargs) @@ -92,7 +93,8 @@ async def _connect(self): try: self._speech = Speech() self._connection = await self._speech.synthesize_streaming( - self._voice_id, format="raw", sample_rate=self._output_format["sample_rate"]) + self._voice_id, format="raw", sample_rate=self._output_format["sample_rate"] + ) self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) except Exception as e: logger.exception(f"{self} initialization error: {e}") @@ -129,7 +131,7 @@ async def _receive_task_handler(self): frame = TTSAudioRawFrame( audio=msg["audio"], sample_rate=self._output_format["sample_rate"], - num_channels=1 + num_channels=1, ) await self.push_frame(frame) else: diff --git a/src/pipecat/services/moondream.py b/src/pipecat/services/moondream.py index b6391cc93..74442dfee 100644 --- a/src/pipecat/services/moondream.py +++ b/src/pipecat/services/moondream.py @@ -31,6 +31,7 @@ def detect_device(): """ try: import intel_extension_for_pytorch + if torch.xpu.is_available(): return torch.device("xpu"), torch.float32 except ImportError: @@ -45,12 +46,7 @@ def detect_device(): class MoondreamService(VisionService): def __init__( - self, - *, - model="vikhyatk/moondream2", - revision="2024-08-26", - use_cpu=False, - **kwargs + self, *, model="vikhyatk/moondream2", revision="2024-08-26", use_cpu=False, **kwargs ): super().__init__(**kwargs) @@ -85,9 +81,8 @@ def get_image_description(frame: VisionImageRawFrame): image = Image.frombytes(frame.format, frame.size, frame.image) image_embeds = self._model.encode_image(image) description = self._model.answer_question( - image_embeds=image_embeds, - question=frame.text, - tokenizer=self._tokenizer) + image_embeds=image_embeds, question=frame.text, tokenizer=self._tokenizer + ) return description description = await asyncio.to_thread(get_image_description, frame) diff --git a/src/pipecat/services/ollama.py b/src/pipecat/services/ollama.py index 8fa3fc2de..0a6a4ce6a 100644 --- a/src/pipecat/services/ollama.py +++ b/src/pipecat/services/ollama.py @@ -8,6 +8,5 @@ class OLLamaLLMService(BaseOpenAILLMService): - def __init__(self, *, model: str = "llama2", base_url: str = "http://localhost:11434/v1"): super().__init__(model=model, base_url=base_url, api_key="ollama") diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 4203f8194..e54898525 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -32,21 +32,20 @@ VisionImageRawFrame, FunctionCallResultFrame, FunctionCallInProgressFrame, - StartInterruptionFrame + StartInterruptionFrame, ) from pipecat.metrics.metrics import LLMTokenUsage -from pipecat.processors.aggregators.llm_response import LLMUserContextAggregator, LLMAssistantContextAggregator +from pipecat.processors.aggregators.llm_response import ( + LLMUserContextAggregator, + LLMAssistantContextAggregator, +) from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, - OpenAILLMContextFrame + OpenAILLMContextFrame, ) from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import ( - ImageGenService, - LLMService, - TTSService -) +from pipecat.services.ai_services import ImageGenService, LLMService, TTSService try: from openai import AsyncOpenAI, AsyncStream, DefaultAsyncHttpxClient, BadRequestError, NOT_GIVEN @@ -54,7 +53,8 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use OpenAI, you need 
to `pip install pipecat-ai[openai]`. Also, set `OPENAI_API_KEY` environment variable.") + "In order to use OpenAI, you need to `pip install pipecat-ai[openai]`. Also, set `OPENAI_API_KEY` environment variable." + ) raise Exception(f"Missing module: {e}") ValidVoice = Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"] @@ -82,24 +82,28 @@ class BaseOpenAILLMService(LLMService): as well as tool choices and the tool, which is used if requesting function calls from the LLM. """ + class InputParams(BaseModel): frequency_penalty: Optional[float] = Field( - default_factory=lambda: NOT_GIVEN, ge=-2.0, le=2.0) + default_factory=lambda: NOT_GIVEN, ge=-2.0, le=2.0 + ) presence_penalty: Optional[float] = Field( - default_factory=lambda: NOT_GIVEN, ge=-2.0, le=2.0) + default_factory=lambda: NOT_GIVEN, ge=-2.0, le=2.0 + ) seed: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=0) temperature: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=2.0) top_p: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=1.0) extra: Optional[Dict[str, Any]] = Field(default_factory=dict) def __init__( - self, - *, - model: str, - api_key=None, - base_url=None, - params: InputParams = InputParams(), - **kwargs): + self, + *, + model: str, + api_key=None, + base_url=None, + params: InputParams = InputParams(), + **kwargs, + ): super().__init__(**kwargs) self.set_model_name(model) self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs) @@ -116,9 +120,10 @@ def create_client(self, api_key=None, base_url=None, **kwargs): base_url=base_url, http_client=DefaultAsyncHttpxClient( limits=httpx.Limits( - max_keepalive_connections=100, - max_connections=1000, - keepalive_expiry=None))) + max_keepalive_connections=100, max_connections=1000, keepalive_expiry=None + ) + ), + ) def can_generate_metrics(self) -> bool: return True @@ -148,10 +153,8 @@ async def set_extra(self, extra: Dict[str, Any]): self._extra = extra async def get_chat_completions( - self, - context: OpenAILLMContext, - messages: List[ChatCompletionMessageParam]) -> AsyncStream[ChatCompletionChunk]: - + self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam] + ) -> AsyncStream[ChatCompletionChunk]: params = { "model": self.model_name, "stream": True, @@ -172,7 +175,8 @@ async def get_chat_completions( return chunks async def _stream_chat_completions( - self, context: OpenAILLMContext) -> AsyncStream[ChatCompletionChunk]: + self, context: OpenAILLMContext + ) -> AsyncStream[ChatCompletionChunk]: logger.debug(f"Generating chat: {context.get_messages_json()}") messages: List[ChatCompletionMessageParam] = context.get_messages() @@ -184,7 +188,10 @@ async def _stream_chat_completions( text = message["content"] message["content"] = [ {"type": "text", "text": text}, - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}} + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}, + }, ] del message["data"] del message["mime_type"] @@ -200,8 +207,8 @@ async def _process_context(self, context: OpenAILLMContext): await self.start_ttfb_metrics() - chunk_stream: AsyncStream[ChatCompletionChunk] = ( - await self._stream_chat_completions(context) + chunk_stream: AsyncStream[ChatCompletionChunk] = await self._stream_chat_completions( + context ) async for chunk in chunk_stream: @@ -209,7 +216,7 @@ async def _process_context(self, context: OpenAILLMContext): tokens = LLMTokenUsage( 
prompt_tokens=chunk.usage.prompt_tokens, completion_tokens=chunk.usage.completion_tokens, - total_tokens=chunk.usage.total_tokens + total_tokens=chunk.usage.total_tokens, ) await self.start_llm_usage_metrics(tokens) @@ -250,21 +257,16 @@ async def _process_context(self, context: OpenAILLMContext): await self._handle_function_call(context, tool_call_id, function_name, arguments) else: raise OpenAIUnhandledFunctionException( - f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function.") - - async def _handle_function_call( - self, - context, - tool_call_id, - function_name, - arguments - ): + f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function." + ) + + async def _handle_function_call(self, context, tool_call_id, function_name, arguments): arguments = json.loads(arguments) await self.call_function( context=context, tool_call_id=tool_call_id, function_name=function_name, - arguments=arguments + arguments=arguments, ) async def process_frame(self, frame: Frame, direction: FrameDirection): @@ -293,38 +295,34 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): @dataclass class OpenAIContextAggregatorPair: - _user: 'OpenAIUserContextAggregator' - _assistant: 'OpenAIAssistantContextAggregator' + _user: "OpenAIUserContextAggregator" + _assistant: "OpenAIAssistantContextAggregator" - def user(self) -> 'OpenAIUserContextAggregator': + def user(self) -> "OpenAIUserContextAggregator": return self._user - def assistant(self) -> 'OpenAIAssistantContextAggregator': + def assistant(self) -> "OpenAIAssistantContextAggregator": return self._assistant class OpenAILLMService(BaseOpenAILLMService): - def __init__( - self, - *, - model: str = "gpt-4o", - params: BaseOpenAILLMService.InputParams = BaseOpenAILLMService.InputParams(), - **kwargs): + self, + *, + model: str = "gpt-4o", + params: BaseOpenAILLMService.InputParams = BaseOpenAILLMService.InputParams(), + **kwargs, + ): super().__init__(model=model, params=params, **kwargs) @staticmethod def create_context_aggregator(context: OpenAILLMContext) -> OpenAIContextAggregatorPair: user = OpenAIUserContextAggregator(context) assistant = OpenAIAssistantContextAggregator(user) - return OpenAIContextAggregatorPair( - _user=user, - _assistant=assistant - ) + return OpenAIContextAggregatorPair(_user=user, _assistant=assistant) class OpenAIImageGenService(ImageGenService): - def __init__( self, *, @@ -343,10 +341,7 @@ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating image from prompt: {prompt}") image = await self._client.images.generate( - prompt=prompt, - model=self.model_name, - n=1, - size=self._image_size + prompt=prompt, model=self.model_name, n=1, size=self._image_size ) image_url = image.data[0].url @@ -376,13 +371,14 @@ class OpenAITTSService(TTSService): """ def __init__( - self, - *, - api_key: str | None = None, - voice: str = "alloy", - model: Literal["tts-1", "tts-1-hd"] = "tts-1", - sample_rate: int = 24000, - **kwargs): + self, + *, + api_key: str | None = None, + voice: str = "alloy", + model: Literal["tts-1", "tts-1-hd"] = "tts-1", + sample_rate: int = 24000, + **kwargs, + ): super().__init__(sample_rate=sample_rate, **kwargs) self._voice: ValidVoice = VALID_VOICES.get(voice, "alloy") @@ -408,16 +404,19 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() async with 
self._client.audio.speech.with_streaming_response.create( - input=text, - model=self.model_name, - voice=self._voice, - response_format="pcm", + input=text, + model=self.model_name, + voice=self._voice, + response_format="pcm", ) as r: if r.status_code != 200: error = await r.text() logger.error( - f"{self} error getting audio (status: {r.status_code}, error: {error})") - yield ErrorFrame(f"Error getting audio (status: {r.status_code}, error: {error})") + f"{self} error getting audio (status: {r.status_code}, error: {error})" + ) + yield ErrorFrame( + f"Error getting audio (status: {r.status_code}, error: {error})" + ) return await self.start_tts_usage_metrics(text) @@ -454,14 +453,18 @@ async def process_frame(self, frame, direction): elif isinstance(frame, FunctionCallInProgressFrame): self._function_call_in_progress = frame elif isinstance(frame, FunctionCallResultFrame): - if self._function_call_in_progress and self._function_call_in_progress.tool_call_id == frame.tool_call_id: + if ( + self._function_call_in_progress + and self._function_call_in_progress.tool_call_id == frame.tool_call_id + ): self._function_call_in_progress = None self._function_call_result = frame # TODO-CB: Kwin wants us to refactor this out of here but I REFUSE await self._push_aggregation() else: logger.warning( - f"FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id") + f"FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" + ) self._function_call_in_progress = None self._function_call_result = None @@ -479,24 +482,28 @@ async def _push_aggregation(self): frame = self._function_call_result self._function_call_result = None if frame.result: - self._context.add_message({ - "role": "assistant", - "tool_calls": [ - { - "id": frame.tool_call_id, - "function": { - "name": frame.function_name, - "arguments": json.dumps(frame.arguments) - }, - "type": "function" - } - ] - }) - self._context.add_message({ - "role": "tool", - "content": json.dumps(frame.result), - "tool_call_id": frame.tool_call_id - }) + self._context.add_message( + { + "role": "assistant", + "tool_calls": [ + { + "id": frame.tool_call_id, + "function": { + "name": frame.function_name, + "arguments": json.dumps(frame.arguments), + }, + "type": "function", + } + ], + } + ) + self._context.add_message( + { + "role": "tool", + "content": json.dumps(frame.result), + "tool_call_id": frame.tool_call_id, + } + ) run_llm = True else: self._context.add_message({"role": "assistant", "content": aggregation}) diff --git a/src/pipecat/services/openpipe.py b/src/pipecat/services/openpipe.py index e4e14dc15..1f28a85b1 100644 --- a/src/pipecat/services/openpipe.py +++ b/src/pipecat/services/openpipe.py @@ -13,33 +13,35 @@ try: from openpipe import AsyncOpenAI as OpenPipeAI, AsyncStream - from openai.types.chat import (ChatCompletionMessageParam, ChatCompletionChunk) + from openai.types.chat import ChatCompletionMessageParam, ChatCompletionChunk except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use OpenPipe, you need to `pip install pipecat-ai[openpipe]`. Also, set `OPENPIPE_API_KEY` and `OPENAI_API_KEY` environment variables.") + "In order to use OpenPipe, you need to `pip install pipecat-ai[openpipe]`. Also, set `OPENPIPE_API_KEY` and `OPENAI_API_KEY` environment variables." 
+ ) raise Exception(f"Missing module: {e}") class OpenPipeLLMService(BaseOpenAILLMService): - def __init__( - self, - *, - model: str = "gpt-4o", - api_key: str | None = None, - base_url: str | None = None, - openpipe_api_key: str | None = None, - openpipe_base_url: str = "https://app.openpipe.ai/api/v1", - tags: Dict[str, str] | None = None, - **kwargs): + self, + *, + model: str = "gpt-4o", + api_key: str | None = None, + base_url: str | None = None, + openpipe_api_key: str | None = None, + openpipe_base_url: str = "https://app.openpipe.ai/api/v1", + tags: Dict[str, str] | None = None, + **kwargs, + ): super().__init__( model=model, api_key=api_key, base_url=base_url, openpipe_api_key=openpipe_api_key, openpipe_base_url=openpipe_base_url, - **kwargs) + **kwargs, + ) self._tags = tags def create_client(self, api_key=None, base_url=None, **kwargs): @@ -48,24 +50,17 @@ def create_client(self, api_key=None, base_url=None, **kwargs): client = OpenPipeAI( api_key=api_key, base_url=base_url, - openpipe={ - "api_key": openpipe_api_key, - "base_url": openpipe_base_url - } + openpipe={"api_key": openpipe_api_key, "base_url": openpipe_base_url}, ) return client async def get_chat_completions( - self, - context: OpenAILLMContext, - messages: List[ChatCompletionMessageParam]) -> AsyncStream[ChatCompletionChunk]: + self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam] + ) -> AsyncStream[ChatCompletionChunk]: chunks = await self._client.chat.completions.create( model=self.model_name, stream=True, messages=messages, - openpipe={ - "tags": self._tags, - "log_request": True - } + openpipe={"tags": self._tags, "log_request": True}, ) return chunks diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py index ae8606e91..2ffa3a419 100644 --- a/src/pipecat/services/playht.py +++ b/src/pipecat/services/playht.py @@ -9,11 +9,7 @@ from typing import AsyncGenerator -from pipecat.frames.frames import ( - Frame, - TTSAudioRawFrame, - TTSStartedFrame, - TTSStoppedFrame) +from pipecat.frames.frames import Frame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame from pipecat.services.ai_services import TTSService from loguru import logger @@ -25,20 +21,15 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use PlayHT, you need to `pip install pipecat-ai[playht]`. Also, set `PLAY_HT_USER_ID` and `PLAY_HT_API_KEY` environment variables.") + "In order to use PlayHT, you need to `pip install pipecat-ai[playht]`. Also, set `PLAY_HT_USER_ID` and `PLAY_HT_API_KEY` environment variables." 
+ ) raise Exception(f"Missing module: {e}") class PlayHTTTSService(TTSService): - def __init__( - self, - *, - api_key: str, - user_id: str, - voice_url: str, - sample_rate: int = 16000, - **kwargs): + self, *, api_key: str, user_id: str, voice_url: str, sample_rate: int = 16000, **kwargs + ): super().__init__(sample_rate=sample_rate, **kwargs) self._user_id = user_id @@ -49,10 +40,8 @@ def __init__( api_key=self._speech_key, ) self._options = TTSOptions( - voice=voice_url, - sample_rate=sample_rate, - quality="higher", - format=Format.FORMAT_WAV) + voice=voice_url, sample_rate=sample_rate, quality="higher", format=Format.FORMAT_WAV + ) def can_generate_metrics(self) -> bool: return True @@ -71,9 +60,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() playht_gen = self._client.tts( - text, - voice_engine="PlayHT2.0-turbo", - options=self._options) + text, voice_engine="PlayHT2.0-turbo", options=self._options + ) await self.start_tts_usage_metrics(text) @@ -87,10 +75,10 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: else: fh = io.BytesIO(b) fh.seek(36) - (data, size) = struct.unpack('<4sI', fh.read(8)) - while data != b'data': + (data, size) = struct.unpack("<4sI", fh.read(8)) + while data != b"data": fh.read(size) - (data, size) = struct.unpack('<4sI', fh.read(8)) + (data, size) = struct.unpack("<4sI", fh.read(8)) in_header = False else: if len(chunk): diff --git a/src/pipecat/services/to_be_updated/cloudflare_ai_service.py b/src/pipecat/services/to_be_updated/cloudflare_ai_service.py index 058e2212c..1329f9c79 100644 --- a/src/pipecat/services/to_be_updated/cloudflare_ai_service.py +++ b/src/pipecat/services/to_be_updated/cloudflare_ai_service.py @@ -12,15 +12,14 @@ def __init__(self): self.cloudflare_account_id = os.getenv("CLOUDFLARE_ACCOUNT_ID") self.cloudflare_api_token = os.getenv("CLOUDFLARE_API_TOKEN") - self.api_base_url = f'https://api.cloudflare.com/client/v4/accounts/{self.cloudflare_account_id}/ai/run/' - self.headers = {"Authorization": f'Bearer {self.cloudflare_api_token}'} + self.api_base_url = ( + f"https://api.cloudflare.com/client/v4/accounts/{self.cloudflare_account_id}/ai/run/" + ) + self.headers = {"Authorization": f"Bearer {self.cloudflare_api_token}"} # base endpoint, used by the others def run(self, model, input): - response = requests.post( - f"{self.api_base_url}{model}", - headers=self.headers, - json=input) + response = requests.post(f"{self.api_base_url}{model}", headers=self.headers, json=input) return response.json() # https://developers.cloudflare.com/workers-ai/models/llm/ @@ -28,7 +27,7 @@ def run_llm(self, messages, latest_user_message=None, stream=True): input = { "messages": [ {"role": "system", "content": "You are a friendly assistant"}, - {"role": "user", "content": sentence} + {"role": "user", "content": sentence}, ] } @@ -36,16 +35,14 @@ def run_llm(self, messages, latest_user_message=None, stream=True): # https://developers.cloudflare.com/workers-ai/models/translation/ def run_text_translation(self, sentence, source_language, target_language): - return self.run('@cf/meta/m2m100-1.2b', { - "text": sentence, - "source_lang": source_language, - "target_lang": target_language - }) + return self.run( + "@cf/meta/m2m100-1.2b", + {"text": sentence, "source_lang": source_language, "target_lang": target_language}, + ) # https://developers.cloudflare.com/workers-ai/models/sentiment-analysis/ def run_text_sentiment(self, sentence): - return 
self.run("@cf/huggingface/distilbert-sst-2-int8", - {"text": sentence}) + return self.run("@cf/huggingface/distilbert-sst-2-int8", {"text": sentence}) # https://developers.cloudflare.com/workers-ai/models/image-classification/ def run_image_classification(self, image_url): @@ -65,7 +62,7 @@ def run_embeddings(self, texts, size="medium"): models = { "small": "@cf/baai/bge-small-en-v1.5", # 384 output dimensions "medium": "@cf/baai/bge-base-en-v1.5", # 768 output dimensions - "large": "@cf/baai/bge-large-en-v1.5" # 1024 output dimensions + "large": "@cf/baai/bge-large-en-v1.5", # 1024 output dimensions } return self.run(models[size], {"text": texts}) diff --git a/src/pipecat/services/to_be_updated/google_ai_service.py b/src/pipecat/services/to_be_updated/google_ai_service.py index 7272964f4..25668ca0a 100644 --- a/src/pipecat/services/to_be_updated/google_ai_service.py +++ b/src/pipecat/services/to_be_updated/google_ai_service.py @@ -18,14 +18,12 @@ def __init__(self): ) self.audio_config = texttospeech.AudioConfig( - audio_encoding=texttospeech.AudioEncoding.LINEAR16, - sample_rate_hertz=16000 + audio_encoding=texttospeech.AudioEncoding.LINEAR16, sample_rate_hertz=16000 ) def run_tts(self, sentence): synthesis_input = texttospeech.SynthesisInput(text=sentence.strip()) result = self.client.synthesize_speech( - input=synthesis_input, - voice=self.voice, - audio_config=self.audio_config) + input=synthesis_input, voice=self.voice, audio_config=self.audio_config + ) return result diff --git a/src/pipecat/services/to_be_updated/huggingface_ai_service.py b/src/pipecat/services/to_be_updated/huggingface_ai_service.py index 7c4984067..09f0b8248 100644 --- a/src/pipecat/services/to_be_updated/huggingface_ai_service.py +++ b/src/pipecat/services/to_be_updated/huggingface_ai_service.py @@ -19,8 +19,8 @@ def run_text_sentiment(self, sentence): # models use 2-character language codes**) def run_text_translation(self, sentence, source_language, target_language): translator = pipeline( - f"translation", - model=f"Helsinki-NLP/opus-mt-{source_language}-{target_language}") + f"translation", model=f"Helsinki-NLP/opus-mt-{source_language}-{target_language}" + ) return translator(sentence)[0]["translation_text"] diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index ce8c62730..b1365bc69 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -23,13 +23,19 @@ LLMFullResponseEndFrame, FunctionCallResultFrame, FunctionCallInProgressFrame, - StartInterruptionFrame + StartInterruptionFrame, ) from pipecat.metrics.metrics import LLMTokenUsage from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import LLMService -from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame -from pipecat.processors.aggregators.llm_response import LLMUserContextAggregator, LLMAssistantContextAggregator +from pipecat.processors.aggregators.openai_llm_context import ( + OpenAILLMContext, + OpenAILLMContextFrame, +) +from pipecat.processors.aggregators.llm_response import ( + LLMUserContextAggregator, + LLMAssistantContextAggregator, +) from loguru import logger @@ -38,25 +44,26 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use Together.ai, you need to `pip install pipecat-ai[together]`. Also, set `TOGETHER_API_KEY` environment variable.") + "In order to use Together.ai, you need to `pip install pipecat-ai[together]`. 
Also, set `TOGETHER_API_KEY` environment variable." + ) raise Exception(f"Missing module: {e}") @dataclass class TogetherContextAggregatorPair: - _user: 'TogetherUserContextAggregator' - _assistant: 'TogetherAssistantContextAggregator' + _user: "TogetherUserContextAggregator" + _assistant: "TogetherAssistantContextAggregator" - def user(self) -> 'TogetherUserContextAggregator': + def user(self) -> "TogetherUserContextAggregator": return self._user - def assistant(self) -> 'TogetherAssistantContextAggregator': + def assistant(self) -> "TogetherAssistantContextAggregator": return self._assistant class TogetherLLMService(LLMService): - """This class implements inference with Together's Llama 3.1 models - """ + """This class implements inference with Together's Llama 3.1 models""" + class InputParams(BaseModel): frequency_penalty: Optional[float] = Field(default=None, ge=-2.0, le=2.0) max_tokens: Optional[int] = Field(default=4096, ge=1) @@ -67,12 +74,13 @@ class InputParams(BaseModel): extra: Optional[Dict[str, Any]] = Field(default_factory=dict) def __init__( - self, - *, - api_key: str, - model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - params: InputParams = InputParams(), - **kwargs): + self, + *, + api_key: str, + model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + params: InputParams = InputParams(), + **kwargs, + ): super().__init__(**kwargs) self._client = AsyncTogether(api_key=api_key) self.set_model_name(model) @@ -91,10 +99,7 @@ def can_generate_metrics(self) -> bool: def create_context_aggregator(context: OpenAILLMContext) -> TogetherContextAggregatorPair: user = TogetherUserContextAggregator(context) assistant = TogetherAssistantContextAggregator(user) - return TogetherContextAggregatorPair( - _user=user, - _assistant=assistant - ) + return TogetherContextAggregatorPair(_user=user, _assistant=assistant) async def set_frequency_penalty(self, frequency_penalty: float): logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]") @@ -142,7 +147,7 @@ async def _process_context(self, context: OpenAILLMContext): "presence_penalty": self._presence_penalty, "temperature": self._temperature, "top_k": self._top_k, - "top_p": self._top_p + "top_p": self._top_p, } params.update(self._extra) @@ -160,7 +165,7 @@ async def _process_context(self, context: OpenAILLMContext): tokens = LLMTokenUsage( prompt_tokens=chunk.usage.prompt_tokens, completion_tokens=chunk.usage.completion_tokens, - total_tokens=chunk.usage.total_tokens + total_tokens=chunk.usage.total_tokens, ) await self.start_llm_usage_metrics(tokens) @@ -180,7 +185,7 @@ async def _process_context(self, context: OpenAILLMContext): else: await self.push_frame(TextFrame(chunk.choices[0].delta.content)) - if chunk.choices[0].finish_reason == 'eos' and accumulating_function_call: + if chunk.choices[0].finish_reason == "eos" and accumulating_function_call: await self._extract_function_call(context, function_call_accumulator) except CancelledError as e: @@ -219,10 +224,12 @@ async def _extract_function_call(self, context, function_call_accumulator): function_name, args_string = match.groups() try: arguments = json.loads(args_string) - await self.call_function(context=context, - tool_call_id=str(uuid.uuid4()), - function_name=function_name, - arguments=arguments) + await self.call_function( + context=context, + tool_call_id=str(uuid.uuid4()), + function_name=function_name, + arguments=arguments, + ) return except json.JSONDecodeError as error: # We get here if the LLM returns a function call with invalid JSON 
arguments. This could happen @@ -281,12 +288,13 @@ async def process_frame(self, frame, direction): # The LLM sends a UserImageRequestFrame upstream. Cache any context provided with # that frame so we can use it when we assemble the image message in the assistant # context aggregator. - if (frame.context): + if frame.context: if isinstance(frame.context, str): self._context._user_image_request_context[frame.user_id] = frame.context else: logger.error( - f"Unexpected UserImageRequestFrame context type: {type(frame.context)}") + f"Unexpected UserImageRequestFrame context type: {type(frame.context)}" + ) del self._context._user_image_request_context[frame.user_id] else: if frame.user_id in self._context._user_image_request_context: @@ -294,6 +302,7 @@ async def process_frame(self, frame, direction): except Exception as e: logger.error(f"Error processing frame: {e}") + # # Claude returns a text content block along with a tool use content block. This works quite nicely # with streaming. We get the text first, so we can start streaming it right away. Then we get the @@ -320,13 +329,17 @@ async def process_frame(self, frame, direction): elif isinstance(frame, FunctionCallInProgressFrame): self._function_call_in_progress = frame elif isinstance(frame, FunctionCallResultFrame): - if self._function_call_in_progress and self._function_call_in_progress.tool_call_id == frame.tool_call_id: + if ( + self._function_call_in_progress + and self._function_call_in_progress.tool_call_id == frame.tool_call_id + ): self._function_call_in_progress = None self._function_call_result = frame await self._push_aggregation() else: logger.warning( - f"FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id") + f"FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" + ) self._function_call_in_progress = None self._function_call_result = None @@ -346,11 +359,13 @@ async def _push_aggregation(self): if self._function_call_result: frame = self._function_call_result self._function_call_result = None - self._context.add_message({ - "role": "tool", - # Together expects the content here to be a string, so stringify it - "content": str(frame.result) - }) + self._context.add_message( + { + "role": "tool", + # Together expects the content here to be a string, so stringify it + "content": str(frame.result), + } + ) run_llm = True else: self._context.add_message({"role": "assistant", "content": aggregation}) diff --git a/src/pipecat/services/whisper.py b/src/pipecat/services/whisper.py index 9f54f9ca0..a4635c6cb 100644 --- a/src/pipecat/services/whisper.py +++ b/src/pipecat/services/whisper.py @@ -23,13 +23,13 @@ from faster_whisper import WhisperModel except ModuleNotFoundError as e: logger.error(f"Exception: {e}") - logger.error( - "In order to use Whisper, you need to `pip install pipecat-ai[whisper]`.") + logger.error("In order to use Whisper, you need to `pip install pipecat-ai[whisper]`.") raise Exception(f"Missing module: {e}") class Model(Enum): """Class of basic Whisper model selection options""" + TINY = "tiny" BASE = "base" MEDIUM = "medium" @@ -41,14 +41,15 @@ class Model(Enum): class WhisperSTTService(SegmentedSTTService): """Class to transcribe audio with a locally-downloaded Whisper model""" - def __init__(self, - *, - model: str | Model = Model.DISTIL_MEDIUM_EN, - device: str = "auto", - compute_type: str = "default", - no_speech_prob: float = 0.4, - **kwargs): - + def __init__( + self, + *, + model: str | Model = Model.DISTIL_MEDIUM_EN, + 
device: str = "auto", + compute_type: str = "default", + no_speech_prob: float = 0.4, + **kwargs, + ): super().__init__(**kwargs) self._device: str = device self._compute_type = compute_type @@ -65,9 +66,8 @@ def _load(self): this model is being run, it will take time to download.""" logger.debug("Loading Whisper model...") self._model = WhisperModel( - self.model_name, - device=self._device, - compute_type=self._compute_type) + self.model_name, device=self._device, compute_type=self._compute_type + ) logger.debug("Loaded Whisper model") async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py index 69b754f55..5161efcf6 100644 --- a/src/pipecat/services/xtts.py +++ b/src/pipecat/services/xtts.py @@ -14,7 +14,8 @@ StartFrame, TTSAudioRawFrame, TTSStartedFrame, - TTSStoppedFrame) + TTSStoppedFrame, +) from pipecat.services.ai_services import TTSService from loguru import logger @@ -38,15 +39,15 @@ class XTTSService(TTSService): - def __init__( - self, - *, - voice_id: str, - language: str, - base_url: str, - aiohttp_session: aiohttp.ClientSession, - **kwargs): + self, + *, + voice_id: str, + language: str, + base_url: str, + aiohttp_session: aiohttp.ClientSession, + **kwargs, + ): super().__init__(**kwargs) self._voice_id = voice_id @@ -64,9 +65,13 @@ async def start(self, frame: StartFrame): if r.status != 200: text = await r.text() logger.error( - f"{self} error getting studio speakers (status: {r.status}, error: {text})") + f"{self} error getting studio speakers (status: {r.status}, error: {text})" + ) await self.push_error( - ErrorFrame(f"Error error getting studio speakers (status: {r.status}, error: {text})")) + ErrorFrame( + f"Error error getting studio speakers (status: {r.status}, error: {text})" + ) + ) return self._studio_speakers = await r.json() @@ -86,7 +91,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: url = self._base_url + "/tts_stream" payload = { - "text": text.replace('.', '').replace('*', ''), + "text": text.replace(".", "").replace("*", ""), "language": self._language, "speaker_embedding": embeddings["speaker_embedding"], "gpt_cond_latent": embeddings["gpt_cond_latent"], @@ -115,7 +120,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: buffer.extend(chunk) # Check if buffer has enough data for processing - while len(buffer) >= 48000: # Assuming at least 0.5 seconds of audio data at 24000 Hz + while ( + len(buffer) >= 48000 + ): # Assuming at least 0.5 seconds of audio data at 24000 Hz # Process the buffer up to a safe size for resampling process_data = buffer[:48000] # Remove processed data from buffer diff --git a/src/pipecat/transcriptions/language.py b/src/pipecat/transcriptions/language.py index f9e98104b..2ee3d9e95 100644 --- a/src/pipecat/transcriptions/language.py +++ b/src/pipecat/transcriptions/language.py @@ -9,6 +9,7 @@ from enum import Enum if sys.version_info < (3, 11): + class StrEnum(str, Enum): def __new__(cls, value): obj = str.__new__(cls, value) @@ -19,46 +20,46 @@ def __new__(cls, value): class Language(StrEnum): - BG = "bg" # Bulgarian - CA = "ca" # Catalan - ZH = "zh" # Chinese simplified - ZH_TW = "zh-TW" # Chinese traditional - CS = "cs" # Czech - DA = "da" # Danish - NL = "nl" # Dutch - EN = "en" # English - EN_US = "en-US" # English (USA) - EN_AU = "en-AU" # English (Australia) - EN_GB = "en-GB" # English (Great Britain) - EN_NZ = "en-NZ" # English (New Zealand) - EN_IN = "en-IN" # English (India) - ET = "et" # 
Estonian - FI = "fi" # Finnish - NL_BE = "nl-BE" # Flemmish - FR = "fr" # French - FR_CA = "fr-CA" # French (Canada) - DE = "de" # German - DE_CH = "de-CH" # German (Switzerland) - EL = "el" # Greek - HI = "hi" # Hindi - HU = "hu" # Hungarian - ID = "id" # Indonesian - IT = "it" # Italian - JA = "ja" # Japanese - KO = "ko" # Korean - LV = "lv" # Latvian - LT = "lt" # Lithuanian - MS = "ms" # Malay - NO = "no" # Norwegian - PL = "pl" # Polish - PT = "pt" # Portuguese - PT_BR = "pt-BR" # Portuguese (Brazil) - RO = "ro" # Romanian - RU = "ru" # Russian - SK = "sk" # Slovak - ES = "es" # Spanish - SV = "sv" # Swedish - TH = "th" # Thai - TR = "tr" # Turkish - UK = "uk" # Ukrainian - VI = "vi" # Vietnamese + BG = "bg" # Bulgarian + CA = "ca" # Catalan + ZH = "zh" # Chinese simplified + ZH_TW = "zh-TW" # Chinese traditional + CS = "cs" # Czech + DA = "da" # Danish + NL = "nl" # Dutch + EN = "en" # English + EN_US = "en-US" # English (USA) + EN_AU = "en-AU" # English (Australia) + EN_GB = "en-GB" # English (Great Britain) + EN_NZ = "en-NZ" # English (New Zealand) + EN_IN = "en-IN" # English (India) + ET = "et" # Estonian + FI = "fi" # Finnish + NL_BE = "nl-BE" # Flemmish + FR = "fr" # French + FR_CA = "fr-CA" # French (Canada) + DE = "de" # German + DE_CH = "de-CH" # German (Switzerland) + EL = "el" # Greek + HI = "hi" # Hindi + HU = "hu" # Hungarian + ID = "id" # Indonesian + IT = "it" # Italian + JA = "ja" # Japanese + KO = "ko" # Korean + LV = "lv" # Latvian + LT = "lt" # Lithuanian + MS = "ms" # Malay + NO = "no" # Norwegian + PL = "pl" # Polish + PT = "pt" # Portuguese + PT_BR = "pt-BR" # Portuguese (Brazil) + RO = "ro" # Romanian + RU = "ru" # Russian + SK = "sk" # Slovak + ES = "es" # Spanish + SV = "sv" # Swedish + TH = "th" # Thai + TR = "tr" # Turkish + UK = "uk" # Ukrainian + VI = "vi" # Vietnamese diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py index 4e398e779..73ad3f5e3 100644 --- a/src/pipecat/transports/base_input.py +++ b/src/pipecat/transports/base_input.py @@ -21,7 +21,8 @@ SystemFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame, - VADParamsUpdateFrame) + VADParamsUpdateFrame, +) from pipecat.transports.base_transport import TransportParams from pipecat.vad.vad_analyzer import VADAnalyzer, VADState @@ -29,7 +30,6 @@ class BaseInputTransport(FrameProcessor): - def __init__(self, params: TransportParams, **kwargs): super().__init__(sync=False, **kwargs) @@ -129,12 +129,17 @@ async def _vad_analyze(self, audio_frames: bytes) -> VADState: vad_analyzer = self.vad_analyzer() if vad_analyzer: state = await self.get_event_loop().run_in_executor( - self._executor, vad_analyzer.analyze_audio, audio_frames) + self._executor, vad_analyzer.analyze_audio, audio_frames + ) return state async def _handle_vad(self, audio_frames: bytes, vad_state: VADState): new_vad_state = await self._vad_analyze(audio_frames) - if new_vad_state != vad_state and new_vad_state != VADState.STARTING and new_vad_state != VADState.STOPPING: + if ( + new_vad_state != vad_state + and new_vad_state != VADState.STARTING + and new_vad_state != VADState.STOPPING + ): frame = None if new_vad_state == VADState.SPEAKING: frame = UserStartedSpeakingFrame() diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index 263bb64f4..5423b122f 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -32,7 +32,8 @@ TTSStartedFrame, TTSStoppedFrame, TextFrame, - TransportMessageFrame) + TransportMessageFrame, +) 
from pipecat.transports.base_transport import TransportParams from loguru import logger @@ -41,7 +42,6 @@ class BaseOutputTransport(FrameProcessor): - def __init__(self, params: TransportParams, **kwargs): super().__init__(sync=False, **kwargs) @@ -53,8 +53,9 @@ def __init__(self, params: TransportParams, **kwargs): # We will write 20ms audio at a time. If we receive long audio frames we # will chunk them. This will help with interruption handling. - audio_bytes_10ms = int(self._params.audio_out_sample_rate / 100) * \ - self._params.audio_out_channels * 2 + audio_bytes_10ms = ( + int(self._params.audio_out_sample_rate / 100) * self._params.audio_out_channels * 2 + ) self._audio_chunk_size = audio_bytes_10ms * 2 self._audio_buffer = bytearray() @@ -74,7 +75,9 @@ async def start(self, frame: StartFrame): # Create camera output queue and task if needed. if self._params.camera_out_enabled: self._camera_out_queue = asyncio.Queue() - self._camera_out_task = self.get_event_loop().create_task(self._camera_out_task_handler()) + self._camera_out_task = self.get_event_loop().create_task( + self._camera_out_task_handler() + ) # Create audio output queue and task if needed. if self._params.audio_out_enabled and self._params.audio_out_is_live: self._audio_out_queue = asyncio.Queue() @@ -201,11 +204,12 @@ async def _handle_audio(self, frame: OutputAudioRawFrame): self._audio_buffer.extend(frame.audio) while len(self._audio_buffer) >= self._audio_chunk_size: chunk = OutputAudioRawFrame( - bytes(self._audio_buffer[:self._audio_chunk_size]), - sample_rate=frame.sample_rate, num_channels=frame.num_channels + bytes(self._audio_buffer[: self._audio_chunk_size]), + sample_rate=frame.sample_rate, + num_channels=frame.num_channels, ) await self._sink_queue.put(chunk) - self._audio_buffer = self._audio_buffer[self._audio_chunk_size:] + self._audio_buffer = self._audio_buffer[self._audio_chunk_size :] async def _handle_image(self, frame: OutputImageRawFrame | SpriteFrame): if not self._params.camera_out_enabled: @@ -316,12 +320,10 @@ async def _draw_image(self, frame: OutputImageRawFrame): if frame.size != desired_size: image = Image.frombytes(frame.format, frame.size, frame.image) resized_image = image.resize(desired_size) - logger.warning( - f"{frame} does not have the expected size {desired_size}, resizing") + logger.warning(f"{frame} does not have the expected size {desired_size}, resizing") frame = OutputImageRawFrame( - resized_image.tobytes(), - resized_image.size, - resized_image.format) + resized_image.tobytes(), resized_image.size, resized_image.format + ) await self.write_frame_to_camera(frame) diff --git a/src/pipecat/transports/base_transport.py b/src/pipecat/transports/base_transport.py index 083aeac37..5802993fa 100644 --- a/src/pipecat/transports/base_transport.py +++ b/src/pipecat/transports/base_transport.py @@ -42,11 +42,12 @@ class TransportParams(BaseModel): class BaseTransport(ABC): - - def __init__(self, - input_name: str | None = None, - output_name: str | None = None, - loop: asyncio.AbstractEventLoop | None = None): + def __init__( + self, + input_name: str | None = None, + output_name: str | None = None, + loop: asyncio.AbstractEventLoop | None = None, + ): self._input_name = input_name self._output_name = output_name self._loop = loop or asyncio.get_running_loop() @@ -64,6 +65,7 @@ def event_handler(self, event_name: str): def decorator(handler): self.add_event_handler(event_name, handler) return handler + return decorator def add_event_handler(self, event_name: str, handler): diff --git 
a/src/pipecat/transports/local/audio.py b/src/pipecat/transports/local/audio.py index 45d18db52..e1ccefec2 100644 --- a/src/pipecat/transports/local/audio.py +++ b/src/pipecat/transports/local/audio.py @@ -21,12 +21,12 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use local audio, you need to `pip install pipecat-ai[local]`. On MacOS, you also need to `brew install portaudio`.") + "In order to use local audio, you need to `pip install pipecat-ai[local]`. On MacOS, you also need to `brew install portaudio`." + ) raise Exception(f"Missing module: {e}") class LocalAudioInputTransport(BaseInputTransport): - def __init__(self, py_audio: pyaudio.PyAudio, params: TransportParams): super().__init__(params) @@ -39,7 +39,8 @@ def __init__(self, py_audio: pyaudio.PyAudio, params: TransportParams): rate=params.audio_in_sample_rate, frames_per_buffer=num_frames, stream_callback=self._audio_in_callback, - input=True) + input=True, + ) async def start(self, frame: StartFrame): await super().start(frame) @@ -54,9 +55,11 @@ async def cleanup(self): self._in_stream.close() def _audio_in_callback(self, in_data, frame_count, time_info, status): - frame = InputAudioRawFrame(audio=in_data, - sample_rate=self._params.audio_in_sample_rate, - num_channels=self._params.audio_in_channels) + frame = InputAudioRawFrame( + audio=in_data, + sample_rate=self._params.audio_in_sample_rate, + num_channels=self._params.audio_in_channels, + ) asyncio.run_coroutine_threadsafe(self.push_audio_frame(frame), self.get_event_loop()) @@ -64,7 +67,6 @@ def _audio_in_callback(self, in_data, frame_count, time_info, status): class LocalAudioOutputTransport(BaseOutputTransport): - def __init__(self, py_audio: pyaudio.PyAudio, params: TransportParams): super().__init__(params) @@ -74,7 +76,8 @@ def __init__(self, py_audio: pyaudio.PyAudio, params: TransportParams): format=py_audio.get_format_from_width(2), channels=params.audio_out_channels, rate=params.audio_out_sample_rate, - output=True) + output=True, + ) async def start(self, frame: StartFrame): await super().start(frame) @@ -93,7 +96,6 @@ async def write_raw_audio_frames(self, frames: bytes): class LocalAudioTransport(BaseTransport): - def __init__(self, params: TransportParams): self._params = params self._pyaudio = pyaudio.PyAudio() diff --git a/src/pipecat/transports/local/tk.py b/src/pipecat/transports/local/tk.py index 75dd30331..ed7cdbea6 100644 --- a/src/pipecat/transports/local/tk.py +++ b/src/pipecat/transports/local/tk.py @@ -23,7 +23,8 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use local audio, you need to `pip install pipecat-ai[local]`. On MacOS, you also need to `brew install portaudio`.") + "In order to use local audio, you need to `pip install pipecat-ai[local]`. On MacOS, you also need to `brew install portaudio`." 
+ ) raise Exception(f"Missing module: {e}") try: @@ -35,7 +36,6 @@ class TkInputTransport(BaseInputTransport): - def __init__(self, py_audio: pyaudio.PyAudio, params: TransportParams): super().__init__(params) @@ -48,7 +48,8 @@ def __init__(self, py_audio: pyaudio.PyAudio, params: TransportParams): rate=params.audio_in_sample_rate, frames_per_buffer=num_frames, stream_callback=self._audio_in_callback, - input=True) + input=True, + ) async def start(self, frame: StartFrame): await super().start(frame) @@ -63,9 +64,11 @@ async def cleanup(self): self._in_stream.close() def _audio_in_callback(self, in_data, frame_count, time_info, status): - frame = InputAudioRawFrame(audio=in_data, - sample_rate=self._params.audio_in_sample_rate, - num_channels=self._params.audio_in_channels) + frame = InputAudioRawFrame( + audio=in_data, + sample_rate=self._params.audio_in_sample_rate, + num_channels=self._params.audio_in_channels, + ) asyncio.run_coroutine_threadsafe(self.push_audio_frame(frame), self.get_event_loop()) @@ -73,7 +76,6 @@ def _audio_in_callback(self, in_data, frame_count, time_info, status): class TkOutputTransport(BaseOutputTransport): - def __init__(self, tk_root: tk.Tk, py_audio: pyaudio.PyAudio, params: TransportParams): super().__init__(params) @@ -83,7 +85,8 @@ def __init__(self, tk_root: tk.Tk, py_audio: pyaudio.PyAudio, params: TransportP format=py_audio.get_format_from_width(2), channels=params.audio_out_channels, rate=params.audio_out_sample_rate, - output=True) + output=True, + ) # Start with a neutral gray background. array = np.ones((1024, 1024, 3)) * 128 @@ -114,11 +117,7 @@ def _write_frame_to_tk(self, frame: OutputImageRawFrame): width = frame.size[0] height = frame.size[1] data = f"P6 {width} {height} 255 ".encode() + frame.image - photo = tk.PhotoImage( - width=width, - height=height, - data=data, - format="PPM") + photo = tk.PhotoImage(width=width, height=height, data=data, format="PPM") self._image_label.config(image=photo) # This holds a reference to the photo, preventing it from being garbage @@ -127,7 +126,6 @@ def _write_frame_to_tk(self, frame: OutputImageRawFrame): class TkLocalTransport(BaseTransport): - def __init__(self, tk_root: tk.Tk, params: TransportParams): self._tk_root = tk_root self._params = params diff --git a/src/pipecat/transports/network/fastapi_websocket.py b/src/pipecat/transports/network/fastapi_websocket.py index 815d7c2ef..dac162530 100644 --- a/src/pipecat/transports/network/fastapi_websocket.py +++ b/src/pipecat/transports/network/fastapi_websocket.py @@ -19,7 +19,7 @@ Frame, InputAudioRawFrame, StartFrame, - StartInterruptionFrame + StartInterruptionFrame, ) from pipecat.processors.frame_processor import FrameDirection from pipecat.serializers.base_serializer import FrameSerializer @@ -35,7 +35,8 @@ except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use FastAPI websockets, you need to `pip install pipecat-ai[websocket]`.") + "In order to use FastAPI websockets, you need to `pip install pipecat-ai[websocket]`." 
+ ) raise Exception(f"Missing module: {e}") @@ -51,13 +52,13 @@ class FastAPIWebsocketCallbacks(BaseModel): class FastAPIWebsocketInputTransport(BaseInputTransport): - def __init__( - self, - websocket: WebSocket, - params: FastAPIWebsocketParams, - callbacks: FastAPIWebsocketCallbacks, - **kwargs): + self, + websocket: WebSocket, + params: FastAPIWebsocketParams, + callbacks: FastAPIWebsocketCallbacks, + **kwargs, + ): super().__init__(params, **kwargs) self._websocket = websocket @@ -87,17 +88,18 @@ async def _receive_messages(self): continue if isinstance(frame, AudioRawFrame): - await self.push_audio_frame(InputAudioRawFrame( - audio=frame.audio, - sample_rate=frame.sample_rate, - num_channels=frame.num_channels) + await self.push_audio_frame( + InputAudioRawFrame( + audio=frame.audio, + sample_rate=frame.sample_rate, + num_channels=frame.num_channels, + ) ) await self._callbacks.on_client_disconnected(self._websocket) class FastAPIWebsocketOutputTransport(BaseOutputTransport): - def __init__(self, websocket: WebSocket, params: FastAPIWebsocketParams, **kwargs): super().__init__(params, **kwargs) @@ -115,10 +117,9 @@ async def write_raw_audio_frames(self, frames: bytes): self._websocket_audio_buffer += frames while len(self._websocket_audio_buffer): frame = AudioRawFrame( - audio=self._websocket_audio_buffer[: - self._params.audio_frame_size], + audio=self._websocket_audio_buffer[: self._params.audio_frame_size], sample_rate=self._params.audio_out_sample_rate, - num_channels=self._params.audio_out_channels + num_channels=self._params.audio_out_channels, ) if self._params.add_wav_header: @@ -131,9 +132,8 @@ async def write_raw_audio_frames(self, frames: bytes): ww.close() content.seek(0) wav_frame = AudioRawFrame( - content.read(), - sample_rate=frame.sample_rate, - num_channels=frame.num_channels) + content.read(), sample_rate=frame.sample_rate, num_channels=frame.num_channels + ) frame = wav_frame payload = self._params.serializer.serialize(frame) @@ -141,7 +141,8 @@ async def write_raw_audio_frames(self, frames: bytes): await self._websocket.send_text(payload) self._websocket_audio_buffer = self._websocket_audio_buffer[ - self._params.audio_frame_size:] + self._params.audio_frame_size : + ] async def _write_frame(self, frame: Frame): payload = self._params.serializer.serialize(frame) @@ -150,26 +151,28 @@ async def _write_frame(self, frame: Frame): class FastAPIWebsocketTransport(BaseTransport): - def __init__( - self, - websocket: WebSocket, - params: FastAPIWebsocketParams, - input_name: str | None = None, - output_name: str | None = None, - loop: asyncio.AbstractEventLoop | None = None): + self, + websocket: WebSocket, + params: FastAPIWebsocketParams, + input_name: str | None = None, + output_name: str | None = None, + loop: asyncio.AbstractEventLoop | None = None, + ): super().__init__(input_name=input_name, output_name=output_name, loop=loop) self._params = params self._callbacks = FastAPIWebsocketCallbacks( on_client_connected=self._on_client_connected, - on_client_disconnected=self._on_client_disconnected + on_client_disconnected=self._on_client_disconnected, ) self._input = FastAPIWebsocketInputTransport( - websocket, self._params, self._callbacks, name=self._input_name) + websocket, self._params, self._callbacks, name=self._input_name + ) self._output = FastAPIWebsocketOutputTransport( - websocket, self._params, name=self._output_name) + websocket, self._params, name=self._output_name + ) # Register supported handlers. 
The user will only be able to register # these handlers. diff --git a/src/pipecat/transports/network/websocket_server.py b/src/pipecat/transports/network/websocket_server.py index 329ae8994..b5d38f60e 100644 --- a/src/pipecat/transports/network/websocket_server.py +++ b/src/pipecat/transports/network/websocket_server.py @@ -11,7 +11,13 @@ from typing import Awaitable, Callable from pydantic.main import BaseModel -from pipecat.frames.frames import AudioRawFrame, CancelFrame, EndFrame, InputAudioRawFrame, StartFrame +from pipecat.frames.frames import ( + AudioRawFrame, + CancelFrame, + EndFrame, + InputAudioRawFrame, + StartFrame, +) from pipecat.serializers.base_serializer import FrameSerializer from pipecat.serializers.protobuf import ProtobufFrameSerializer from pipecat.transports.base_input import BaseInputTransport @@ -40,14 +46,14 @@ class WebsocketServerCallbacks(BaseModel): class WebsocketServerInputTransport(BaseInputTransport): - def __init__( - self, - host: str, - port: int, - params: WebsocketServerParams, - callbacks: WebsocketServerCallbacks, - **kwargs): + self, + host: str, + port: int, + params: WebsocketServerParams, + callbacks: WebsocketServerCallbacks, + **kwargs, + ): super().__init__(params, **kwargs) self._host = host @@ -97,10 +103,12 @@ async def _client_handler(self, websocket: websockets.WebSocketServerProtocol, p continue if isinstance(frame, AudioRawFrame): - await self.push_audio_frame(InputAudioRawFrame( - audio=frame.audio, - sample_rate=frame.sample_rate, - num_channels=frame.num_channels) + await self.push_audio_frame( + InputAudioRawFrame( + audio=frame.audio, + sample_rate=frame.sample_rate, + num_channels=frame.num_channels, + ) ) else: await self.push_frame(frame) @@ -115,7 +123,6 @@ async def _client_handler(self, websocket: websockets.WebSocketServerProtocol, p class WebsocketServerOutputTransport(BaseOutputTransport): - def __init__(self, params: WebsocketServerParams, **kwargs): super().__init__(params, **kwargs) @@ -138,9 +145,9 @@ async def write_raw_audio_frames(self, frames: bytes): self._websocket_audio_buffer += frames while len(self._websocket_audio_buffer) >= self._params.audio_frame_size: frame = AudioRawFrame( - audio=self._websocket_audio_buffer[:self._params.audio_frame_size], + audio=self._websocket_audio_buffer[: self._params.audio_frame_size], sample_rate=self._params.audio_out_sample_rate, - num_channels=self._params.audio_out_channels + num_channels=self._params.audio_out_channels, ) if self._params.add_wav_header: @@ -153,28 +160,29 @@ async def write_raw_audio_frames(self, frames: bytes): ww.close() content.seek(0) wav_frame = AudioRawFrame( - content.read(), - sample_rate=frame.sample_rate, - num_channels=frame.num_channels) + content.read(), sample_rate=frame.sample_rate, num_channels=frame.num_channels + ) frame = wav_frame proto = self._params.serializer.serialize(frame) if proto: await self._websocket.send(proto) - self._websocket_audio_buffer = self._websocket_audio_buffer[self._params.audio_frame_size:] + self._websocket_audio_buffer = self._websocket_audio_buffer[ + self._params.audio_frame_size : + ] class WebsocketServerTransport(BaseTransport): - def __init__( - self, - host: str = "localhost", - port: int = 8765, - params: WebsocketServerParams = WebsocketServerParams(), - input_name: str | None = None, - output_name: str | None = None, - loop: asyncio.AbstractEventLoop | None = None): + self, + host: str = "localhost", + port: int = 8765, + params: WebsocketServerParams = WebsocketServerParams(), + input_name: str | 
None = None, + output_name: str | None = None, + loop: asyncio.AbstractEventLoop | None = None, + ): super().__init__(input_name=input_name, output_name=output_name, loop=loop) self._host = host self._port = port @@ -182,7 +190,7 @@ def __init__( self._callbacks = WebsocketServerCallbacks( on_client_connected=self._on_client_connected, - on_client_disconnected=self._on_client_disconnected + on_client_disconnected=self._on_client_disconnected, ) self._input: WebsocketServerInputTransport | None = None self._output: WebsocketServerOutputTransport | None = None @@ -196,7 +204,8 @@ def __init__( def input(self) -> WebsocketServerInputTransport: if not self._input: self._input = WebsocketServerInputTransport( - self._host, self._port, self._params, self._callbacks, name=self._input_name) + self._host, self._port, self._params, self._callbacks, name=self._input_name + ) return self._input def output(self) -> WebsocketServerOutputTransport: diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index eb2d6da7a..48b59d8ff 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -18,7 +18,8 @@ EventHandler, VirtualCameraDevice, VirtualMicrophoneDevice, - VirtualSpeakerDevice) + VirtualSpeakerDevice, +) from pydantic.main import BaseModel from pipecat.frames.frames import ( @@ -35,8 +36,14 @@ TranscriptionFrame, TransportMessageFrame, UserImageRawFrame, - UserImageRequestFrame) -from pipecat.metrics.metrics import LLMUsageMetricsData, ProcessingMetricsData, TTFBMetricsData, TTSUsageMetricsData + UserImageRequestFrame, +) +from pipecat.metrics.metrics import ( + LLMUsageMetricsData, + ProcessingMetricsData, + TTFBMetricsData, + TTSUsageMetricsData, +) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transcriptions.language import Language from pipecat.transports.base_input import BaseInputTransport @@ -47,11 +54,12 @@ from loguru import logger try: - from daily import (EventHandler, CallClient, Daily) + from daily import EventHandler, CallClient, Daily except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use the Daily transport, you need to `pip install pipecat-ai[daily]`.") + "In order to use the Daily transport, you need to `pip install pipecat-ai[daily]`." 
+ ) raise Exception(f"Missing module: {e}") VAD_RESET_PERIOD_MS = 2000 @@ -63,14 +71,11 @@ class DailyTransportMessageFrame(TransportMessageFrame): class WebRTCVADAnalyzer(VADAnalyzer): - def __init__(self, *, sample_rate=16000, num_channels=1, params: VADParams = VADParams()): super().__init__(sample_rate=sample_rate, num_channels=num_channels, params=params) self._webrtc_vad = Daily.create_native_vad( - reset_period_ms=VAD_RESET_PERIOD_MS, - sample_rate=sample_rate, - channels=num_channels + reset_period_ms=VAD_RESET_PERIOD_MS, sample_rate=sample_rate, channels=num_channels ) logger.debug("Loaded native WebRTC VAD") @@ -98,9 +103,7 @@ class DailyTranscriptionSettings(BaseModel): endpointing: bool = True punctuate: bool = True includeRawResponse: bool = True - extra: Mapping[str, Any] = { - "interim_results": True - } + extra: Mapping[str, Any] = {"interim_results": True} class DailyParams(TransportParams): @@ -139,12 +142,13 @@ def set_result(future, *args): future.set_result(*args) except asyncio.InvalidStateError: pass + future.get_loop().call_soon_threadsafe(set_result, future, *args) + return _callback class DailyTransportClient(EventHandler): - _daily_initialized: bool = False # This is necessary to override EventHandler's __new__ method. @@ -152,13 +156,14 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls) def __init__( - self, - room_url: str, - token: str | None, - bot_name: str, - params: DailyParams, - callbacks: DailyCallbacks, - loop: asyncio.AbstractEventLoop): + self, + room_url: str, + token: str | None, + bot_name: str, + params: DailyParams, + callbacks: DailyCallbacks, + loop: asyncio.AbstractEventLoop, + ): super().__init__() if not DailyTransportClient._daily_initialized: @@ -191,7 +196,8 @@ def __init__( self._camera_name(), width=self._params.camera_out_width, height=self._params.camera_out_height, - color_format=self._params.camera_out_color_format) + color_format=self._params.camera_out_color_format, + ) self._mic: VirtualMicrophoneDevice | None = None if self._params.audio_out_enabled: @@ -199,7 +205,8 @@ def __init__( self._mic_name(), sample_rate=self._params.audio_out_sample_rate, channels=self._params.audio_out_channels, - non_blocking=True) + non_blocking=True, + ) self._speaker: VirtualSpeakerDevice | None = None if self._params.audio_in_enabled or self._params.vad_enabled: @@ -207,7 +214,8 @@ def __init__( self._speaker_name(), sample_rate=self._params.audio_in_sample_rate, channels=self._params.audio_in_channels, - non_blocking=True) + non_blocking=True, + ) Daily.select_speaker_device(self._speaker_name()) def _camera_name(self): @@ -236,9 +244,8 @@ async def send_message(self, frame: TransportMessageFrame): future = self._loop.create_future() self._client.send_app_message( - frame.message, - participant_id, - completion=completion_callback(future)) + frame.message, participant_id, completion=completion_callback(future) + ) await future async def read_next_audio_frame(self) -> InputAudioRawFrame | None: @@ -255,9 +262,8 @@ async def read_next_audio_frame(self) -> InputAudioRawFrame | None: if len(audio) > 0: return InputAudioRawFrame( - audio=audio, - sample_rate=sample_rate, - num_channels=num_channels) + audio=audio, sample_rate=sample_rate, num_channels=num_channels + ) else: # If we don't read any audio it could be there's no participant # connected. 
daily-python will return immediately if that's the @@ -290,12 +296,9 @@ async def join(self): # For performance reasons, never subscribe to video streams (unless a # video renderer is registered). - self._client.update_subscription_profiles({ - "base": { - "camera": "unsubscribed", - "screenVideo": "unsubscribed" - } - }) + self._client.update_subscription_profiles( + {"base": {"camera": "unsubscribed", "screenVideo": "unsubscribed"}} + ) self._client.set_user_name(self._bot_name) @@ -327,7 +330,7 @@ async def _start_transcription(self): future = self._loop.create_future() self._client.start_transcription( settings=self._params.transcription_settings.model_dump(exclude_none=True), - completion=completion_callback(future) + completion=completion_callback(future), ) error = await future if error: @@ -374,12 +377,15 @@ async def _join(self): }, "microphone": { "sendSettings": { - "channelConfig": "stereo" if self._params.audio_out_channels == 2 else "mono", + "channelConfig": "stereo" + if self._params.audio_out_channels == 2 + else "mono", "bitrate": self._params.audio_out_bitrate, } - } + }, }, - }) + }, + ) return await asyncio.wait_for(future, timeout=10) @@ -456,18 +462,17 @@ def capture_participant_transcription(self, participant_id: str, callback: Calla self._transcription_renderers[participant_id] = callback def capture_participant_video( - self, - participant_id: str, - callback: Callable, - framerate: int = 30, - video_source: str = "camera", - color_format: str = "RGB"): + self, + participant_id: str, + callback: Callable, + framerate: int = 30, + video_source: str = "camera", + color_format: str = "RGB", + ): # Only enable camera subscription on this participant - self._client.update_subscriptions(participant_settings={ - participant_id: { - "media": "subscribed" - } - }) + self._client.update_subscriptions( + participant_settings={participant_id: {"media": "subscribed"}} + ) self._video_renderers[participant_id] = callback @@ -475,7 +480,8 @@ def capture_participant_video( participant_id, self._video_frame_received, video_source=video_source, - color_format=color_format) + color_format=color_format, + ) # # @@ -553,9 +559,9 @@ def _video_frame_received(self, participant_id, video_frame): callback, participant_id, video_frame.buffer, - (video_frame.width, - video_frame.height), - video_frame.color_format) + (video_frame.width, video_frame.height), + video_frame.color_format, + ) def _call_async_callback(self, callback, *args): future = asyncio.run_coroutine_threadsafe(callback(*args), self._loop) @@ -563,7 +569,6 @@ def _call_async_callback(self, callback, *args): class DailyInputTransport(BaseInputTransport): - def __init__(self, client: DailyTransportClient, params: DailyParams, **kwargs): super().__init__(params, **kwargs) @@ -576,7 +581,8 @@ def __init__(self, client: DailyTransportClient, params: DailyParams, **kwargs): if params.vad_enabled and not params.vad_analyzer: self._vad_analyzer = WebRTCVADAnalyzer( sample_rate=self._params.audio_in_sample_rate, - num_channels=self._params.audio_in_channels) + num_channels=self._params.audio_in_channels, + ) async def start(self, frame: StartFrame): # Parent start. 
@@ -654,11 +660,12 @@ async def _audio_in_task_handler(self): # def capture_participant_video( - self, - participant_id: str, - framerate: int = 30, - video_source: str = "camera", - color_format: str = "RGB"): + self, + participant_id: str, + framerate: int = 30, + video_source: str = "camera", + color_format: str = "RGB", + ): self._video_renderers[participant_id] = { "framerate": framerate, "timestamp": 0, @@ -666,11 +673,7 @@ def capture_participant_video( } self._client.capture_participant_video( - participant_id, - self._on_participant_video_frame, - framerate, - video_source, - color_format + participant_id, self._on_participant_video_frame, framerate, video_source, color_format ) def request_participant_image(self, participant_id: str): @@ -693,17 +696,14 @@ async def _on_participant_video_frame(self, participant_id: str, buffer, size, f if render_frame: frame = UserImageRawFrame( - user_id=participant_id, - image=buffer, - size=size, - format=format) + user_id=participant_id, image=buffer, size=size, format=format + ) await self.push_frame(frame) self._video_renderers[participant_id]["timestamp"] = curr_time class DailyOutputTransport(BaseOutputTransport): - def __init__(self, client: DailyTransportClient, params: DailyParams, **kwargs): super().__init__(params, **kwargs) @@ -754,10 +754,9 @@ async def send_metrics(self, frame: MetricsFrame): metrics["characters"] = [] metrics["characters"].append(d.model_dump(exclude_none=True)) - message = DailyTransportMessageFrame(message={ - "type": "pipecat-metrics", - "metrics": metrics - }) + message = DailyTransportMessageFrame( + message={"type": "pipecat-metrics", "metrics": metrics} + ) await self._client.send_message(message) async def write_raw_audio_frames(self, frames: bytes): @@ -768,16 +767,16 @@ async def write_frame_to_camera(self, frame: OutputImageRawFrame): class DailyTransport(BaseTransport): - def __init__( - self, - room_url: str, - token: str | None, - bot_name: str, - params: DailyParams = DailyParams(), - input_name: str | None = None, - output_name: str | None = None, - loop: asyncio.AbstractEventLoop | None = None): + self, + room_url: str, + token: str | None, + bot_name: str, + params: DailyParams = DailyParams(), + input_name: str | None = None, + output_name: str | None = None, + loop: asyncio.AbstractEventLoop | None = None, + ): super().__init__(input_name=input_name, output_name=output_name, loop=loop) callbacks = DailyCallbacks( @@ -800,7 +799,8 @@ def __init__( self._params = params self._client = DailyTransportClient( - room_url, token, bot_name, params, callbacks, self._loop) + room_url, token, bot_name, params, callbacks, self._loop + ) self._input: DailyInputTransport | None = None self._output: DailyOutputTransport | None = None @@ -871,19 +871,20 @@ def stop_recording(self, stream_id=None): def capture_participant_transcription(self, participant_id: str): self._client.capture_participant_transcription( - participant_id, - self._on_transcription_message + participant_id, self._on_transcription_message ) def capture_participant_video( - self, - participant_id: str, - framerate: int = 30, - video_source: str = "camera", - color_format: str = "RGB"): + self, + participant_id: str, + framerate: int = 30, + video_source: str = "camera", + color_format: str = "RGB", + ): if self._input: self._input.capture_participant_video( - participant_id, framerate, video_source, color_format) + participant_id, framerate, video_source, color_format + ) async def _on_joined(self, data): await 
self._call_event_handler("on_joined", data) @@ -911,12 +912,12 @@ async def _handle_dialin_ready(self, sip_endpoint: str): async with aiohttp.ClientSession() as session: headers = { "Authorization": f"Bearer {self._params.api_key}", - "Content-Type": "application/json" + "Content-Type": "application/json", } data = { "callId": self._params.dialin_settings.call_id, "callDomain": self._params.dialin_settings.call_domain, - "sipUri": sip_endpoint + "sipUri": sip_endpoint, } url = f"{self._params.api_url}/dialin/pinlessCallUpdate" @@ -926,7 +927,8 @@ async def _handle_dialin_ready(self, sip_endpoint: str): if r.status != 200: text = await r.text() logger.error( - f"Unable to handle dialin-ready event (status: {r.status}, error: {text})") + f"Unable to handle dialin-ready event (status: {r.status}, error: {text})" + ) return logger.debug("Event dialin-ready was handled successfully") diff --git a/src/pipecat/transports/services/helpers/daily_rest.py b/src/pipecat/transports/services/helpers/daily_rest.py index 40c314613..4f15fc28a 100644 --- a/src/pipecat/transports/services/helpers/daily_rest.py +++ b/src/pipecat/transports/services/helpers/daily_rest.py @@ -41,12 +41,12 @@ def sip_endpoint(self) -> str: if not self.sip_uri: return "" else: - return "sip:%s" % self.sip_uri['endpoint'] + return "sip:%s" % self.sip_uri["endpoint"] class DailyRoomParams(BaseModel): name: Optional[str] = None - privacy: Literal['private', 'public'] = "public" + privacy: Literal["private", "public"] = "public" properties: DailyRoomProperties = Field(default_factory=DailyRoomProperties) @@ -61,11 +61,13 @@ class DailyRoomObject(BaseModel): class DailyRESTHelper: - def __init__(self, - *, - daily_api_key: str, - daily_api_url: str = "https://api.daily.co/v1", - aiohttp_session: aiohttp.ClientSession): + def __init__( + self, + *, + daily_api_key: str, + daily_api_url: str = "https://api.daily.co/v1", + aiohttp_session: aiohttp.ClientSession, + ): self.daily_api_key = daily_api_key self.daily_api_url = daily_api_url self.aiohttp_session = aiohttp_session @@ -80,7 +82,9 @@ async def get_room_from_url(self, room_url: str) -> DailyRoomObject: async def create_room(self, params: DailyRoomParams) -> DailyRoomObject: headers = {"Authorization": f"Bearer {self.daily_api_key}"} json = {**params.model_dump(exclude_none=True)} - async with self.aiohttp_session.post(f"{self.daily_api_url}/rooms", headers=headers, json=json) as r: + async with self.aiohttp_session.post( + f"{self.daily_api_url}/rooms", headers=headers, json=json + ) as r: if r.status != 200: text = await r.text() raise Exception(f"Unable to create room (status: {r.status}): {text}") @@ -95,27 +99,22 @@ async def create_room(self, params: DailyRoomParams) -> DailyRoomObject: return room async def get_token( - self, - room_url: str, - expiry_time: float = 60 * 60, - owner: bool = True) -> str: + self, room_url: str, expiry_time: float = 60 * 60, owner: bool = True + ) -> str: if not room_url: raise Exception( - "No Daily room specified. You must specify a Daily room in order a token to be generated.") + "No Daily room specified. You must specify a Daily room in order a token to be generated." 
+ ) expiration: float = time.time() + expiry_time room_name = self.get_name_from_url(room_url) headers = {"Authorization": f"Bearer {self.daily_api_key}"} - json = { - "properties": { - "room_name": room_name, - "is_owner": owner, - "exp": expiration - } - } - async with self.aiohttp_session.post(f"{self.daily_api_url}/meeting-tokens", headers=headers, json=json) as r: + json = {"properties": {"room_name": room_name, "is_owner": owner, "exp": expiration}} + async with self.aiohttp_session.post( + f"{self.daily_api_url}/meeting-tokens", headers=headers, json=json + ) as r: if r.status != 200: text = await r.text() raise Exception(f"Failed to create meeting token (status: {r.status}): {text}") @@ -130,7 +129,9 @@ async def delete_room_by_url(self, room_url: str) -> bool: async def delete_room_by_name(self, room_name: str) -> bool: headers = {"Authorization": f"Bearer {self.daily_api_key}"} - async with self.aiohttp_session.delete(f"{self.daily_api_url}/rooms/{room_name}", headers=headers) as r: + async with self.aiohttp_session.delete( + f"{self.daily_api_url}/rooms/{room_name}", headers=headers + ) as r: if r.status != 200 and r.status != 404: text = await r.text() raise Exception(f"Failed to delete room [{room_name}] (status: {r.status}): {text}") @@ -139,7 +140,9 @@ async def delete_room_by_name(self, room_name: str) -> bool: async def _get_room_from_name(self, room_name: str) -> DailyRoomObject: headers = {"Authorization": f"Bearer {self.daily_api_key}"} - async with self.aiohttp_session.get(f"{self.daily_api_url}/rooms/{room_name}", headers=headers) as r: + async with self.aiohttp_session.get( + f"{self.daily_api_url}/rooms/{room_name}", headers=headers + ) as r: if r.status != 200: raise Exception(f"Room not found: {room_name}") diff --git a/src/pipecat/utils/test_frame_processor.py b/src/pipecat/utils/test_frame_processor.py index ed9429443..e46bae7ad 100644 --- a/src/pipecat/utils/test_frame_processor.py +++ b/src/pipecat/utils/test_frame_processor.py @@ -15,7 +15,9 @@ def __init__(self, test_frames): async def process_frame(self, frame, direction): await super().process_frame(frame, direction) - if not self.test_frames[0]: # then we've run out of required frames but the generator is still going? + if not self.test_frames[ + 0 + ]: # then we've run out of required frames but the generator is still going? 
raise TestException(f"Oops, got an extra frame, {frame}") if isinstance(self.test_frames[0], List): # We need to consume frames until we see the next frame type after this diff --git a/src/pipecat/vad/silero.py b/src/pipecat/vad/silero.py index c07ee2ae9..3e852b38b 100644 --- a/src/pipecat/vad/silero.py +++ b/src/pipecat/vad/silero.py @@ -8,7 +8,12 @@ import numpy as np -from pipecat.frames.frames import AudioRawFrame, Frame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame +from pipecat.frames.frames import ( + AudioRawFrame, + Frame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame, +) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams, VADState @@ -26,19 +31,20 @@ raise Exception(f"Missing module(s): {e}") -class SileroOnnxModel(): - +class SileroOnnxModel: def __init__(self, path, force_onnx_cpu=True): import numpy as np + global np opts = onnxruntime.SessionOptions() opts.inter_op_num_threads = 1 opts.intra_op_num_threads = 1 - if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers(): + if force_onnx_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers(): self.session = onnxruntime.InferenceSession( - path, providers=['CPUExecutionProvider'], sess_options=opts) + path, providers=["CPUExecutionProvider"], sess_options=opts + ) else: self.session = onnxruntime.InferenceSession(path, sess_options=opts) @@ -53,26 +59,27 @@ def _validate_input(self, x, sr: int): if sr not in self.sample_rates: raise ValueError( - f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)") + f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)" + ) if sr / np.shape(x)[1] > 31.25: raise ValueError("Input audio chunk is too short") return x, sr def reset_states(self, batch_size=1): - self._state = np.zeros((2, batch_size, 128), dtype='float32') - self._context = np.zeros((batch_size, 0), dtype='float32') + self._state = np.zeros((2, batch_size, 128), dtype="float32") + self._context = np.zeros((batch_size, 0), dtype="float32") self._last_sr = 0 self._last_batch_size = 0 def __call__(self, x, sr: int): - x, sr = self._validate_input(x, sr) num_samples = 512 if sr == 16000 else 256 if np.shape(x)[-1] != num_samples: raise ValueError( - f"Provided number of samples is {np.shape(x)[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)") + f"Provided number of samples is {np.shape(x)[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)" + ) batch_size = np.shape(x)[0] context_size = 64 if sr == 16000 else 32 @@ -85,12 +92,12 @@ def __call__(self, x, sr: int): self.reset_states(batch_size) if not np.shape(self._context)[1]: - self._context = np.zeros((batch_size, context_size), dtype='float32') + self._context = np.zeros((batch_size, context_size), dtype="float32") x = np.concatenate((self._context, x), axis=1) if sr in [8000, 16000]: - ort_inputs = {'input': x, 'state': self._state, 'sr': np.array(sr, dtype='int64')} + ort_inputs = {"input": x, "state": self._state, "sr": np.array(sr, dtype="int64")} ort_outs = self.session.run(None, ort_inputs) out, state = ort_outs self._state = state @@ -105,12 +112,7 @@ def __call__(self, x, sr: int): class SileroVADAnalyzer(VADAnalyzer): - - def __init__( - self, - *, - sample_rate: int = 16000, - params: VADParams = VADParams()): + def __init__(self, *, sample_rate: int = 16000, params: VADParams = VADParams()): super().__init__(sample_rate=sample_rate, num_channels=1, params=params) 
if sample_rate != 16000 and sample_rate != 8000: @@ -118,14 +120,16 @@ def __init__( logger.debug("Loading Silero VAD model...") - model_name = 'silero_vad.onnx' + model_name = "silero_vad.onnx" package_path = "pipecat.vad.data" try: import importlib_resources as impresources + model_file_path = str(impresources.files(package_path).joinpath(model_name)) except BaseException: from importlib import resources as impresources + try: with impresources.path(package_path, model_name) as f: model_file_path = f @@ -168,18 +172,16 @@ def voice_confidence(self, buffer) -> float: class SileroVAD(FrameProcessor): - def __init__( - self, - *, - sample_rate: int = 16000, - vad_params: VADParams = VADParams(), - audio_passthrough: bool = False): + self, + *, + sample_rate: int = 16000, + vad_params: VADParams = VADParams(), + audio_passthrough: bool = False, + ): super().__init__() - self._vad_analyzer = SileroVADAnalyzer( - sample_rate=sample_rate, - params=vad_params) + self._vad_analyzer = SileroVADAnalyzer(sample_rate=sample_rate, params=vad_params) self._audio_passthrough = audio_passthrough self._processor_vad_state: VADState = VADState.QUIET @@ -202,7 +204,11 @@ async def _analyze_audio(self, frame: AudioRawFrame): # Check VAD and push event if necessary. We just care about changes # from QUIET to SPEAKING and vice versa. new_vad_state = self._vad_analyzer.analyze_audio(frame.audio) - if new_vad_state != self._processor_vad_state and new_vad_state != VADState.STARTING and new_vad_state != VADState.STOPPING: + if ( + new_vad_state != self._processor_vad_state + and new_vad_state != VADState.STARTING + and new_vad_state != VADState.STOPPING + ): new_frame = None if new_vad_state == VADState.SPEAKING: diff --git a/src/pipecat/vad/vad_analyzer.py b/src/pipecat/vad/vad_analyzer.py index 3b7f9931d..198eb84ed 100644 --- a/src/pipecat/vad/vad_analyzer.py +++ b/src/pipecat/vad/vad_analyzer.py @@ -29,7 +29,6 @@ class VADParams(BaseModel): class VADAnalyzer: - def __init__(self, *, sample_rate: int, num_channels: int, params: VADParams): self._sample_rate = sample_rate self._num_channels = num_channels diff --git a/tests/integration/integration_azure_llm.py b/tests/integration/integration_azure_llm.py index b2e7a50cf..5a2b68c37 100644 --- a/tests/integration/integration_azure_llm.py +++ b/tests/integration/integration_azure_llm.py @@ -4,7 +4,7 @@ import os from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, - OpenAILLMContextFrame + OpenAILLMContextFrame, ) from pipecat.services.azure import AzureLLMService @@ -13,6 +13,7 @@ ) if __name__ == "__main__": + @unittest.skip("Skip azure integration test") async def test_chat(): llm = AzureLLMService( @@ -22,7 +23,8 @@ async def test_chat(): ) context = OpenAILLMContext() message: ChatCompletionSystemMessageParam = ChatCompletionSystemMessageParam( - content="Please tell the world hello.", name="system", role="system") + content="Please tell the world hello.", name="system", role="system" + ) context.add_message(message) frame = OpenAILLMContextFrame(context) async for s in llm.process_frame(frame): diff --git a/tests/integration/integration_ollama_llm.py b/tests/integration/integration_ollama_llm.py index cbafa6324..ced24ed68 100644 --- a/tests/integration/integration_ollama_llm.py +++ b/tests/integration/integration_ollama_llm.py @@ -3,7 +3,7 @@ import asyncio from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, - OpenAILLMContextFrame + OpenAILLMContextFrame, ) from openai.types.chat import ( @@ -12,12 
+12,14 @@ from pipecat.services.ollama import OLLamaLLMService if __name__ == "__main__": + @unittest.skip("Skip azure integration test") async def test_chat(): llm = OLLamaLLMService() context = OpenAILLMContext() message: ChatCompletionSystemMessageParam = ChatCompletionSystemMessageParam( - content="Please tell the world hello.", name="system", role="system") + content="Please tell the world hello.", name="system", role="system" + ) context.add_message(message) frame = OpenAILLMContextFrame(context) async for s in llm.process_frame(frame): diff --git a/tests/integration/integration_openai_llm.py b/tests/integration/integration_openai_llm.py index e5dd12057..164dcba8d 100644 --- a/tests/integration/integration_openai_llm.py +++ b/tests/integration/integration_openai_llm.py @@ -5,11 +5,7 @@ from pipecat.services.openai import OpenAILLMContextFrame, OpenAILLMContext from pipecat.processors.frame_processor import FrameDirection, FrameProcessor -from pipecat.frames.frames import ( - LLMFullResponseStartFrame, - LLMFullResponseEndFrame, - TextFrame -) +from pipecat.frames.frames import LLMFullResponseStartFrame, LLMFullResponseEndFrame, TextFrame from pipecat.utils.test_frame_processor import TestFrameProcessor from openai.types.chat import ( ChatCompletionSystemMessageParam, @@ -34,21 +30,19 @@ }, "format": { "type": "string", - "enum": [ - "celsius", - "fahrenheit"], + "enum": ["celsius", "fahrenheit"], "description": "The temperature unit to use. Infer this from the users location.", }, }, - "required": [ - "location", - "format"], + "required": ["location", "format"], }, - })] + }, + ) +] if __name__ == "__main__": - async def test_simple_functions(): + async def test_simple_functions(): async def get_weather_from_api(llm, args): return json.dumps({"conditions": "nice", "temperature": "75"}) @@ -60,11 +54,7 @@ async def get_weather_from_api(llm, args): ) llm.register_function("get_current_weather", get_weather_from_api) - t = TestFrameProcessor([ - LLMFullResponseStartFrame, - TextFrame, - LLMFullResponseEndFrame - ]) + t = TestFrameProcessor([LLMFullResponseStartFrame, TextFrame, LLMFullResponseEndFrame]) llm.link(t) context = OpenAILLMContext(tools=tools) @@ -82,9 +72,13 @@ async def get_weather_from_api(llm, args): await llm.process_frame(frame, FrameDirection.DOWNSTREAM) async def test_advanced_functions(): - async def get_weather_from_api(llm, args): - return [{"role": "system", "content": "The user has asked for live weather. Respond by telling them we don't currently support live weather for that area, but it's coming soon."}] + return [ + { + "role": "system", + "content": "The user has asked for live weather. 
Respond by telling them we don't currently support live weather for that area, but it's coming soon.", + } + ] api_key = os.getenv("OPENAI_API_KEY") @@ -94,11 +88,7 @@ async def get_weather_from_api(llm, args): ) llm.register_function("get_current_weather", get_weather_from_api) - t = TestFrameProcessor([ - LLMFullResponseStartFrame, - TextFrame, - LLMFullResponseEndFrame - ]) + t = TestFrameProcessor([LLMFullResponseStartFrame, TextFrame, LLMFullResponseEndFrame]) llm.link(t) context = OpenAILLMContext(tools=tools) @@ -117,11 +107,7 @@ async def get_weather_from_api(llm, args): async def test_chat(): api_key = os.getenv("OPENAI_API_KEY") - t = TestFrameProcessor([ - LLMFullResponseStartFrame, - TextFrame, - LLMFullResponseEndFrame - ]) + t = TestFrameProcessor([LLMFullResponseStartFrame, TextFrame, LLMFullResponseEndFrame]) llm = OpenAILLMService( api_key=api_key or "", model="gpt-4o", @@ -129,7 +115,8 @@ async def test_chat(): llm.link(t) context = OpenAILLMContext() message: ChatCompletionSystemMessageParam = ChatCompletionSystemMessageParam( - content="Please tell the world hello.", name="system", role="system") + content="Please tell the world hello.", name="system", role="system" + ) context.add_message(message) frame = OpenAILLMContextFrame(context) await llm.process_frame(frame, FrameDirection.DOWNSTREAM) diff --git a/tests/test_aggregators.py b/tests/test_aggregators.py index 2fc6d226c..76834183c 100644 --- a/tests/test_aggregators.py +++ b/tests/test_aggregators.py @@ -47,9 +47,10 @@ async def test_sentence_aggregator(self): @unittest.skip("FIXME: This test is failing") async def test_gated_accumulator(self): gated_aggregator = GatedAggregator( - gate_open_fn=lambda frame: isinstance( - frame, ImageRawFrame), gate_close_fn=lambda frame: isinstance( - frame, LLMFullResponseStartFrame), start_open=False, ) + gate_open_fn=lambda frame: isinstance(frame, ImageRawFrame), + gate_close_fn=lambda frame: isinstance(frame, LLMFullResponseStartFrame), + start_open=False, + ) frames = [ LLMFullResponseStartFrame(), @@ -77,15 +78,12 @@ async def test_gated_accumulator(self): @unittest.skip("FIXME: This test is failing") async def test_parallel_pipeline(self): - async def slow_add(sleep_time: float, name: str, x: str): await asyncio.sleep(sleep_time) return ":".join([x, name]) - pipe1_annotation = StatelessTextTransformer( - functools.partial(slow_add, 0.1, 'pipe1')) - pipe2_annotation = StatelessTextTransformer( - functools.partial(slow_add, 0.2, 'pipe2')) + pipe1_annotation = StatelessTextTransformer(functools.partial(slow_add, 0.1, "pipe1")) + pipe2_annotation = StatelessTextTransformer(functools.partial(slow_add, 0.2, "pipe2")) sentence_aggregator = SentenceAggregator() add_dots = StatelessTextTransformer(lambda x: x + ".") @@ -93,26 +91,20 @@ async def slow_add(sleep_time: float, name: str, x: str): sink = asyncio.Queue() pipeline = Pipeline( [ - ParallelPipeline( - [[pipe1_annotation], [sentence_aggregator, pipe2_annotation]] - ), + ParallelPipeline([[pipe1_annotation], [sentence_aggregator, pipe2_annotation]]), add_dots, ], source, sink, ) - frames = [ - TextFrame("Hello, "), - TextFrame("world."), - EndFrame() - ] + frames = [TextFrame("Hello, "), TextFrame("world."), EndFrame()] expected_output_frames: list[Frame] = [ - TextFrame(text='Hello, :pipe1.'), - TextFrame(text='world.:pipe1.'), - TextFrame(text='Hello, world.:pipe2.'), - EndFrame() + TextFrame(text="Hello, :pipe1."), + TextFrame(text="world.:pipe1."), + TextFrame(text="Hello, world.:pipe2."), + EndFrame(), ] for frame in 
frames: @@ -126,7 +118,8 @@ async def slow_add(sleep_time: float, name: str, x: str): def load_tests(loader, tests, ignore): - """ Run doctests on the aggregators module. """ + """Run doctests on the aggregators module.""" from pipecat.processors import aggregators + tests.addTests(doctest.DocTestSuite(aggregators)) return tests diff --git a/tests/test_ai_services.py b/tests/test_ai_services.py index fb00fc893..c52b0cb56 100644 --- a/tests/test_ai_services.py +++ b/tests/test_ai_services.py @@ -15,10 +15,7 @@ class TestBaseAIService(unittest.IsolatedAsyncioTestCase): async def test_simple_processing(self): service = SimpleAIService() - input_frames = [ - TextFrame("hello"), - EndFrame() - ] + input_frames = [TextFrame("hello"), EndFrame()] output_frames = [] for input_frame in input_frames: diff --git a/tests/test_daily_transport_service.py b/tests/test_daily_transport_service.py index db85742c5..8c2788c9e 100644 --- a/tests/test_daily_transport_service.py +++ b/tests/test_daily_transport_service.py @@ -2,7 +2,6 @@ class TestDailyTransport(unittest.IsolatedAsyncioTestCase): - @unittest.skip("FIXME: This test is failing") async def test_event_handler(self): from pipecat.transports.daily_transport import DailyTransport diff --git a/tests/test_langchain.py b/tests/test_langchain.py index 7b32b2a9a..fb222205b 100644 --- a/tests/test_langchain.py +++ b/tests/test_langchain.py @@ -6,16 +6,22 @@ import unittest -from pipecat.frames.frames import (LLMFullResponseEndFrame, - LLMFullResponseStartFrame, StopTaskFrame, - TextFrame, TranscriptionFrame, - UserStartedSpeakingFrame, - UserStoppedSpeakingFrame) +from pipecat.frames.frames import ( + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + StopTaskFrame, + TextFrame, + TranscriptionFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame, +) from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, LLMUserResponseAggregator) + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) from pipecat.processors.frame_processor import FrameProcessor from pipecat.processors.frameworks.langchain import LangchainProcessor @@ -24,7 +30,6 @@ class TestLangchain(unittest.IsolatedAsyncioTestCase): - class MockProcessor(FrameProcessor): def __init__(self, name): self.name = name @@ -53,7 +58,6 @@ def setUp(self): self.mock_proc = self.MockProcessor("token_collector") async def test_langchain(self): - messages = [("system", "Say hello to {name}"), ("human", "{input}")] prompt = ChatPromptTemplate.from_messages(messages).partial(name="Thomas") chain = prompt | self.fake_llm diff --git a/tests/test_openai_tts.py b/tests/test_openai_tts.py index 5bb97b87d..1dc3929a6 100644 --- a/tests/test_openai_tts.py +++ b/tests/test_openai_tts.py @@ -15,10 +15,7 @@ class TestWhisperOpenAIService(unittest.IsolatedAsyncioTestCase): @unittest.skip("FIXME: This test is failing") async def test_whisper_tts(self): pa = pyaudio.PyAudio() - stream = pa.open(format=pyaudio.paInt16, - channels=1, - rate=24_000, - output=True) + stream = pa.open(format=pyaudio.paInt16, channels=1, rate=24_000, output=True) tts = OpenAITTSService(voice="nova") @@ -26,7 +23,7 @@ async def test_whisper_tts(self): self.assertIsInstance(frame, AudioRawFrame) stream.write(frame.audio) - await asyncio.sleep(.5) + await asyncio.sleep(0.5) stream.stop_stream() pa.terminate() diff --git 
a/tests/test_pipeline.py b/tests/test_pipeline.py index 35974d2a0..ba82974bc 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -11,7 +11,6 @@ class TestDailyPipeline(unittest.IsolatedAsyncioTestCase): - @unittest.skip("FIXME: This test is failing") async def test_pipeline_simple(self): aggregator = SentenceAggregator() @@ -38,9 +37,7 @@ async def test_pipeline_multiple_stages(self): outgoing_queue = asyncio.Queue() incoming_queue = asyncio.Queue() pipeline = Pipeline( - [add_space, sentence_aggregator, to_upper], - incoming_queue, - outgoing_queue + [add_space, sentence_aggregator, to_upper], incoming_queue, outgoing_queue ) sentence = "Hello, world. It's me, a pipeline." @@ -50,9 +47,7 @@ async def test_pipeline_multiple_stages(self): await pipeline.run_pipeline() - self.assertEqual( - await outgoing_queue.get(), TextFrame("H E L L O , W O R L D .") - ) + self.assertEqual(await outgoing_queue.get(), TextFrame("H E L L O , W O R L D .")) self.assertEqual( await outgoing_queue.get(), TextFrame(" I T ' S M E , A P I P E L I N E ."), @@ -74,45 +69,49 @@ def __str__(self): return self.name def setUp(self): - self.processor1 = self.MockProcessor('processor1') - self.processor2 = self.MockProcessor('processor2') - self.pipeline = Pipeline( - processors=[self.processor1, self.processor2]) - self.pipeline._name = 'MyClass' + self.processor1 = self.MockProcessor("processor1") + self.processor2 = self.MockProcessor("processor2") + self.pipeline = Pipeline(processors=[self.processor1, self.processor2]) + self.pipeline._name = "MyClass" self.pipeline._logger = Mock() @unittest.skip("FIXME: This test is failing") def test_log_frame_from_source(self): - frame = Mock(__class__=Mock(__name__='MyFrame')) + frame = Mock(__class__=Mock(__name__="MyFrame")) self.pipeline._log_frame(frame, depth=1) self.pipeline._logger.debug.assert_called_once_with( - 'MyClass source -> MyFrame -> processor1') + "MyClass source -> MyFrame -> processor1" + ) @unittest.skip("FIXME: This test is failing") def test_log_frame_to_sink(self): - frame = Mock(__class__=Mock(__name__='MyFrame')) + frame = Mock(__class__=Mock(__name__="MyFrame")) self.pipeline._log_frame(frame, depth=3) self.pipeline._logger.debug.assert_called_once_with( - 'MyClass processor2 -> MyFrame -> sink') + "MyClass processor2 -> MyFrame -> sink" + ) @unittest.skip("FIXME: This test is failing") def test_log_frame_repeated_log(self): - frame = Mock(__class__=Mock(__name__='MyFrame')) + frame = Mock(__class__=Mock(__name__="MyFrame")) self.pipeline._log_frame(frame, depth=2) self.pipeline._logger.debug.assert_called_once_with( - 'MyClass processor1 -> MyFrame -> processor2') + "MyClass processor1 -> MyFrame -> processor2" + ) self.pipeline._log_frame(frame, depth=2) - self.pipeline._logger.debug.assert_called_with('MyClass ... repeated') + self.pipeline._logger.debug.assert_called_with("MyClass ... repeated") @unittest.skip("FIXME: This test is failing") def test_log_frame_reset_repeated_log(self): - frame1 = Mock(__class__=Mock(__name__='MyFrame1')) - frame2 = Mock(__class__=Mock(__name__='MyFrame2')) + frame1 = Mock(__class__=Mock(__name__="MyFrame1")) + frame2 = Mock(__class__=Mock(__name__="MyFrame2")) self.pipeline._log_frame(frame1, depth=2) self.pipeline._logger.debug.assert_called_once_with( - 'MyClass processor1 -> MyFrame1 -> processor2') + "MyClass processor1 -> MyFrame1 -> processor2" + ) self.pipeline._log_frame(frame1, depth=2) - self.pipeline._logger.debug.assert_called_with('MyClass ... 
repeated') + self.pipeline._logger.debug.assert_called_with("MyClass ... repeated") self.pipeline._log_frame(frame2, depth=2) self.pipeline._logger.debug.assert_called_with( - 'MyClass processor1 -> MyFrame2 -> processor2') + "MyClass processor1 -> MyFrame2 -> processor2" + ) diff --git a/tests/test_protobuf_serializer.py b/tests/test_protobuf_serializer.py index 2e74e88f4..7f9841622 100644 --- a/tests/test_protobuf_serializer.py +++ b/tests/test_protobuf_serializer.py @@ -10,20 +10,18 @@ def setUp(self): @unittest.skip("FIXME: This test is failing") async def test_roundtrip(self): - text_frame = TextFrame(text='hello world') - frame = self.serializer.deserialize( - self.serializer.serialize(text_frame)) - self.assertEqual(frame, TextFrame(text='hello world')) + text_frame = TextFrame(text="hello world") + frame = self.serializer.deserialize(self.serializer.serialize(text_frame)) + self.assertEqual(frame, TextFrame(text="hello world")) transcription_frame = TranscriptionFrame( - text="Hello there!", participantId="123", timestamp="2021-01-01") - frame = self.serializer.deserialize( - self.serializer.serialize(transcription_frame)) + text="Hello there!", participantId="123", timestamp="2021-01-01" + ) + frame = self.serializer.deserialize(self.serializer.serialize(transcription_frame)) self.assertEqual(frame, transcription_frame) - audio_frame = AudioRawFrame(data=b'1234567890') - frame = self.serializer.deserialize( - self.serializer.serialize(audio_frame)) + audio_frame = AudioRawFrame(data=b"1234567890") + frame = self.serializer.deserialize(self.serializer.serialize(audio_frame)) self.assertEqual(frame, audio_frame) From a4420dc88b105d14a105a9349d0c2c4576478823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 23 Sep 2024 09:29:02 -0700 Subject: [PATCH 2/4] README: add vscode and emacs ruff instructions --- README.md | 26 ++++++++++++-------------- dev-requirements.txt | 2 +- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 5dfc1ad95..faf0137dc 100644 --- a/README.md +++ b/README.md @@ -170,22 +170,24 @@ pytest --doctest-modules --ignore-glob="*to_be_updated*" --ignore-glob=*pipeline ## Setting up your editor -This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting. +This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting via [Ruff](https://github.com/astral-sh/ruff). ### Emacs -You can use [use-package](https://github.com/jwiegley/use-package) to install [py-autopep8](https://codeberg.org/ideasman42/emacs-py-autopep8) package and configure `autopep8` arguments: +You can use [use-package](https://github.com/jwiegley/use-package) to install [emacs-lazy-ruff](https://github.com/christophermadsen/emacs-lazy-ruff) package and configure `ruff` arguments: ```elisp -(use-package py-autopep8 +(use-package lazy-ruff :ensure t - :defer t - :hook ((python-mode . py-autopep8-mode)) + :hook ((python-mode . lazy-ruff-mode)) :config - (setq py-autopep8-options '("-a" "-a", "--max-line-length=100"))) + (setq lazy-ruff-format-command "ruff format --config line-length=100") + (setq lazy-ruff-only-format-block t) + (setq lazy-ruff-only-format-region t) + (setq lazy-ruff-only-format-buffer t)) ``` -`autopep8` was installed in the `venv` environment described before, so you should be able to use [pyvenv-auto](https://github.com/ryotaro612/pyvenv-auto) to automatically load that environment inside Emacs. 
+`ruff` was installed in the `venv` environment described before, so you should be able to use [pyvenv-auto](https://github.com/ryotaro612/pyvenv-auto) to automatically load that environment inside Emacs. ```elisp (use-package pyvenv-auto @@ -198,18 +200,14 @@ You can use [use-package](https://github.com/jwiegley/use-package) to install [p ### Visual Studio Code Install the -[autopep8](https://marketplace.visualstudio.com/items?itemName=ms-python.autopep8) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, enable formatting on save and configure `autopep8` arguments: +[Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, enable formatting on save and configure `ruff` arguments: ```json "[python]": { - "editor.defaultFormatter": "ms-python.autopep8", + "editor.defaultFormatter": "charliermarsh.ruff", "editor.formatOnSave": true }, -"autopep8.args": [ - "-a", - "-a", - "--max-line-length=100" -], +"ruff.format.args": ["--config", "line-length=100"] ``` ## Getting help diff --git a/dev-requirements.txt b/dev-requirements.txt index cce356b14..c706d8fe6 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -3,6 +3,6 @@ grpcio-tools~=1.62.2 pip-tools~=7.4.1 pyright~=1.1.376 pytest~=8.3.2 -ruff~=0.6.6 +ruff~=0.6.7 setuptools~=72.2.0 setuptools_scm~=8.1.0 From da81df52847aa5dc3c65b81dcd78adea21d1d4c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 23 Sep 2024 09:29:21 -0700 Subject: [PATCH 3/4] github: install dev-requirements when running tests --- .github/workflows/tests.yaml | 2 +- test-requirements.txt | 10 +--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 740848cee..ce3e13494 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -38,7 +38,7 @@ jobs: run: | source .venv/bin/activate python -m pip install --upgrade pip - pip install -r test-requirements.txt + pip install -r dev-requirements.txt -r test-requirements.txt - name: Test with pytest run: | source .venv/bin/activate diff --git a/test-requirements.txt b/test-requirements.txt index 7f52a49a1..78280b139 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1,15 +1,12 @@ aiohttp~=3.10.3 -anthropic -autopep8~=2.3.1 +anthropic~=0.30.0 azure-cognitiveservices-speech~=1.40.0 -build~=1.2.1 daily-python~=0.10.1 deepgram-sdk~=3.5.0 fal-client~=0.4.1 fastapi~=0.112.1 faster-whisper~=1.0.3 google-generativeai~=0.7.2 -grpcio-tools~=1.62.2 langchain~=0.2.14 livekit~=0.13.1 lmnt~=1.1.4 @@ -18,17 +15,12 @@ numpy~=1.26.4 openai~=1.37.2 openpipe~=4.24.0 Pillow~=10.4.0 -pip-tools~=7.4.1 pyaudio~=0.2.14 pydantic~=2.8.2 pyloudnorm~=0.1.1 pyht~=0.0.28 -pyright~=1.1.376 -pytest~=8.3.2 python-dotenv~=1.0.1 resampy~=0.4.3 -setuptools~=72.2.0 -setuptools_scm~=8.1.0 silero-vad~=5.1 together~=1.2.7 transformers~=4.44.0 From c7ff79a652f432bb466b8ae7620906ba16b5ea2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 23 Sep 2024 09:38:40 -0700 Subject: [PATCH 4/4] processors: fix formatting string --- src/pipecat/processors/frame_processor.py | 3 +-- src/pipecat/processors/metrics/frame_processor_metrics.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/pipecat/processors/frame_processor.py 
b/src/pipecat/processors/frame_processor.py index 9dd92599e..1bf42311d 100644 --- a/src/pipecat/processors/frame_processor.py +++ b/src/pipecat/processors/frame_processor.py @@ -193,8 +193,7 @@ async def __internal_push_frame(self, frame: Frame, direction: FrameDirection): logger.trace(f"Pushing {frame} from {self} to {self._next}") await self._next.process_frame(frame, direction) elif direction == FrameDirection.UPSTREAM and self._prev: - logger.trace(f"Pushing {frame} upstream from { - self} to {self._prev}") + logger.trace(f"Pushing {frame} upstream from {self} to {self._prev}") await self._prev.process_frame(frame, direction) except Exception as e: logger.exception(f"Uncaught exception in {self}: {e}") diff --git a/src/pipecat/processors/metrics/frame_processor_metrics.py b/src/pipecat/processors/metrics/frame_processor_metrics.py index 52ea7e34c..a22639239 100644 --- a/src/pipecat/processors/metrics/frame_processor_metrics.py +++ b/src/pipecat/processors/metrics/frame_processor_metrics.py @@ -76,6 +76,5 @@ async def start_tts_usage_metrics(self, text: str): characters = TTSUsageMetricsData( processor=self._processor_name(), model=self._model_name(), value=len(text) ) - logger.debug(f"{self._processor_name()} usage characters: { - characters.value}") + logger.debug(f"{self._processor_name()} usage characters: {characters.value}") return MetricsFrame(data=[characters])
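
Note on the last two hunks (illustrative, not part of the patch): they collapse f-strings whose replacement field was wrapped across a physical line break. A minimal sketch of why the one-line form is preferred, using placeholder variable names rather than Pipecat objects — my reading is that a line break inside `{...}` of a single-quoted f-string only parses on Python 3.12+ (PEP 701) and raises a SyntaxError on earlier versions, so keeping the field on one line is the portable spelling:

```python
# Minimal sketch (illustrative only, not Pipecat code).
# The removed form wrapped the replacement field across a line break:
#
#     logger.trace(f"Pushing {frame} upstream from {
#         self} to {self._prev}")
#
# That spelling is accepted only on Python 3.12+ (PEP 701); older versions
# reject a newline inside a single-quoted f-string. The one-line form kept
# by the patch parses on every supported Python version:

frame, source, target = "TextFrame", "processor1", "processor2"
print(f"Pushing {frame} upstream from {source} to {target}")
# -> Pushing TextFrame upstream from processor1 to processor2
```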