From 588137df0b0a9460315116c29329b0871ad3e23d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 24 Apr 2024 18:29:24 -0700 Subject: [PATCH] wip proposal: initial commit --- .github/workflows/publish.yaml | 4 +- .github/workflows/publish_test.yaml | 2 +- README.md | 28 +- dev-requirements.txt | 12 +- docs/README.md | 6 +- docs/architecture.md | 6 +- examples/foundational/01-say-one-thing.py | 46 +- examples/foundational/01a-local-audio.py | 53 ++ examples/foundational/01a-local-transport.py | 38 - examples/foundational/02-llm-say-one-thing.py | 45 +- examples/foundational/03-still-frame.py | 48 +- examples/foundational/03a-image-local.py | 58 -- .../foundational/03a-local-still-frame.py | 68 ++ .../foundational/04-utterance-and-speech.py | 57 +- .../foundational/05-sync-speech-and-image.py | 118 +-- .../05a-local-sync-speech-and-image.py | 164 ++++ .../05a-local-sync-speech-and-text.py | 146 ---- .../foundational/06-listen-and-respond.py | 82 +- examples/foundational/06a-image-sync.py | 91 ++- examples/foundational/07-interruptible.py | 14 +- examples/foundational/08-bots-arguing.py | 16 +- examples/foundational/09-mirror.py | 62 ++ examples/foundational/09a-local-mirror.py | 65 ++ examples/foundational/10-wake-word.py | 161 ++-- examples/foundational/11-sound-effects.py | 103 +-- examples/foundational/12-describe-video.py | 83 +- .../foundational/13-whisper-transcription.py | 70 +- examples/foundational/13a-whisper-local.py | 77 +- .../14-render-remote-participant.py | 52 -- .../14a-local-render-remote-participant.py | 71 -- .../websocket-server/frames.proto | 2 +- .../foundational/websocket-server/index.html | 2 +- .../foundational/websocket-server/sample.py | 14 +- examples/image-gen.py | 9 +- examples/internal/11a-dial-out.py | 10 +- examples/starter-apps/chatbot.py | 16 +- examples/starter-apps/patient-intake.py | 24 +- examples/starter-apps/storybot.py | 29 +- examples/starter-apps/translator.py | 18 +- linux-py3.10-requirements.txt | 88 +-- macos-py3.10-requirements.txt | 88 +-- pyproject.toml | 11 +- src/dailyai/pipeline/aggregators.py | 549 ------------- src/dailyai/pipeline/frame_processor.py | 34 - src/dailyai/pipeline/frames.py | 253 ------ src/dailyai/pipeline/openai_frames.py | 12 - src/dailyai/pipeline/pipeline.py | 149 ---- src/dailyai/services/ai_services.py | 165 ---- src/dailyai/services/anthropic_llm_service.py | 44 -- src/dailyai/services/deepgram_ai_service.py | 36 - src/dailyai/services/elevenlabs_ai_service.py | 46 -- src/dailyai/services/fireworks_ai_services.py | 18 - src/dailyai/services/local_stt_service.py | 74 -- src/dailyai/services/open_ai_services.py | 58 -- src/dailyai/services/openai_llm_context.py | 61 -- src/dailyai/transports/abstract_transport.py | 42 - src/dailyai/transports/daily_transport.py | 390 ---------- src/dailyai/transports/local_transport.py | 97 --- src/dailyai/transports/threaded_transport.py | 503 ------------ src/dailyai/transports/websocket_transport.py | 125 --- src/{dailyai => pipecat}/__init__.py | 0 .../pipeline => pipecat/frames}/__init__.py | 0 .../pipeline => pipecat/frames}/frames.proto | 8 +- src/pipecat/frames/frames.py | 467 +++++++++++ src/pipecat/frames/openai_frames.py | 15 + .../frames}/protobufs/frames_pb2.py | 2 +- .../services => pipecat/pipeline}/__init__.py | 0 .../pipeline/merge_pipeline.py | 4 +- src/pipecat/pipeline/parallel_pipeline.py | 137 ++++ src/pipecat/pipeline/pipeline.py | 76 ++ src/pipecat/pipeline/runner.py | 60 ++ src/pipecat/pipeline/task.py | 93 +++ 
.../processors}/__init__.py | 0 .../processors/aggregators}/__init__.py | 0 src/pipecat/processors/aggregators/gated.py | 72 ++ .../processors/aggregators/llm_context.py | 82 ++ .../processors/aggregators/llm_response.py | 190 +++++ .../aggregators/openai_llm_context.py} | 75 +- .../processors/aggregators/parallel_task.py | 104 +++ .../processors/aggregators/sentence.py | 50 ++ .../processors/aggregators/user_response.py | 139 ++++ .../aggregators/vision_image_frame.py | 42 + src/pipecat/processors/filter.py | 34 + src/pipecat/processors/frame_processor.py | 54 ++ src/pipecat/processors/logger.py | 22 + src/pipecat/processors/text_transformer.py | 36 + src/pipecat/processors/utils/audio.py | 21 + .../serializers/abstract_frame_serializer.py | 2 +- .../serializers/protobuf_serializer.py | 6 +- src/pipecat/services/__init__.py | 0 src/pipecat/services/ai_services.py | 169 ++++ src/pipecat/services/anthropic.py | 51 ++ .../services/azure.py} | 12 +- .../services/deepgram.py} | 25 +- src/pipecat/services/elevenlabs.py | 58 ++ .../services/fal.py} | 42 +- src/pipecat/services/fireworks.py | 24 + .../services/moondream.py} | 39 +- .../services/ollama.py} | 8 +- .../services/openai.py} | 117 ++- .../services/playht.py} | 52 +- .../services/to_be_updated/__init__.py | 0 .../to_be_updated/cloudflare_ai_service.py | 0 .../to_be_updated/google_ai_service.py | 0 .../to_be_updated/huggingface_ai_service.py | 0 .../services/to_be_updated/mock_ai_service.py | 0 .../services/whisper.py} | 50 +- src/pipecat/storage/__init__.py | 0 src/{dailyai => pipecat}/storage/search.py | 0 src/pipecat/transports/base_input.py | 138 ++++ src/pipecat/transports/base_output.py | 186 +++++ src/pipecat/transports/base_transport.py | 40 + src/pipecat/transports/local/__init__.py | 0 src/pipecat/transports/local/audio.py | 93 +++ src/pipecat/transports/local/tk.py | 130 ++++ src/pipecat/transports/services/daily.py | 728 ++++++++++++++++++ src/pipecat/utils/__init__.py | 0 src/pipecat/utils/utils.py | 31 + src/pipecat/vad/__init__.py | 0 src/pipecat/vad/silero.py | 103 +++ src/pipecat/vad/vad_analyzer.py | 104 +++ tests/integration/integration_azure_llm.py | 6 +- tests/integration/integration_ollama_llm.py | 6 +- tests/integration/integration_openai_llm.py | 6 +- tests/test_aggregators.py | 8 +- tests/test_ai_services.py | 4 +- tests/test_daily_transport_service.py | 10 +- tests/test_pipeline.py | 8 +- tests/test_protobuf_serializer.py | 4 +- tests/test_websocket_transport.py | 6 +- 130 files changed, 5058 insertions(+), 3804 deletions(-) create mode 100644 examples/foundational/01a-local-audio.py delete mode 100644 examples/foundational/01a-local-transport.py delete mode 100644 examples/foundational/03a-image-local.py create mode 100644 examples/foundational/03a-local-still-frame.py create mode 100644 examples/foundational/05a-local-sync-speech-and-image.py delete mode 100644 examples/foundational/05a-local-sync-speech-and-text.py create mode 100644 examples/foundational/09-mirror.py create mode 100644 examples/foundational/09a-local-mirror.py delete mode 100644 examples/foundational/14-render-remote-participant.py delete mode 100644 examples/foundational/14a-local-render-remote-participant.py delete mode 100644 src/dailyai/pipeline/aggregators.py delete mode 100644 src/dailyai/pipeline/frame_processor.py delete mode 100644 src/dailyai/pipeline/frames.py delete mode 100644 src/dailyai/pipeline/openai_frames.py delete mode 100644 src/dailyai/pipeline/pipeline.py delete mode 100644 src/dailyai/services/ai_services.py delete 
mode 100644 src/dailyai/services/anthropic_llm_service.py delete mode 100644 src/dailyai/services/deepgram_ai_service.py delete mode 100644 src/dailyai/services/elevenlabs_ai_service.py delete mode 100644 src/dailyai/services/fireworks_ai_services.py delete mode 100644 src/dailyai/services/local_stt_service.py delete mode 100644 src/dailyai/services/open_ai_services.py delete mode 100644 src/dailyai/services/openai_llm_context.py delete mode 100644 src/dailyai/transports/abstract_transport.py delete mode 100644 src/dailyai/transports/daily_transport.py delete mode 100644 src/dailyai/transports/local_transport.py delete mode 100644 src/dailyai/transports/threaded_transport.py delete mode 100644 src/dailyai/transports/websocket_transport.py rename src/{dailyai => pipecat}/__init__.py (100%) rename src/{dailyai/pipeline => pipecat/frames}/__init__.py (100%) rename src/{dailyai/pipeline => pipecat/frames}/frames.proto (76%) create mode 100644 src/pipecat/frames/frames.py create mode 100644 src/pipecat/frames/openai_frames.py rename src/{dailyai/pipeline => pipecat/frames}/protobufs/frames_pb2.py (85%) rename src/{dailyai/services => pipecat/pipeline}/__init__.py (100%) rename src/{dailyai => pipecat}/pipeline/merge_pipeline.py (87%) create mode 100644 src/pipecat/pipeline/parallel_pipeline.py create mode 100644 src/pipecat/pipeline/pipeline.py create mode 100644 src/pipecat/pipeline/runner.py create mode 100644 src/pipecat/pipeline/task.py rename src/{dailyai/services/to_be_updated => pipecat/processors}/__init__.py (100%) rename src/{dailyai/storage => pipecat/processors/aggregators}/__init__.py (100%) create mode 100644 src/pipecat/processors/aggregators/gated.py create mode 100644 src/pipecat/processors/aggregators/llm_context.py create mode 100644 src/pipecat/processors/aggregators/llm_response.py rename src/{dailyai/pipeline/opeanai_llm_aggregator.py => pipecat/processors/aggregators/openai_llm_context.py} (63%) create mode 100644 src/pipecat/processors/aggregators/parallel_task.py create mode 100644 src/pipecat/processors/aggregators/sentence.py create mode 100644 src/pipecat/processors/aggregators/user_response.py create mode 100644 src/pipecat/processors/aggregators/vision_image_frame.py create mode 100644 src/pipecat/processors/filter.py create mode 100644 src/pipecat/processors/frame_processor.py create mode 100644 src/pipecat/processors/logger.py create mode 100644 src/pipecat/processors/text_transformer.py create mode 100644 src/pipecat/processors/utils/audio.py rename src/{dailyai => pipecat}/serializers/abstract_frame_serializer.py (87%) rename src/{dailyai => pipecat}/serializers/protobuf_serializer.py (93%) create mode 100644 src/pipecat/services/__init__.py create mode 100644 src/pipecat/services/ai_services.py create mode 100644 src/pipecat/services/anthropic.py rename src/{dailyai/services/azure_ai_services.py => pipecat/services/azure.py} (93%) rename src/{dailyai/services/deepgram_ai_services.py => pipecat/services/deepgram.py} (61%) create mode 100644 src/pipecat/services/elevenlabs.py rename src/{dailyai/services/fal_ai_services.py => pipecat/services/fal.py} (55%) create mode 100644 src/pipecat/services/fireworks.py rename src/{dailyai/services/moondream_ai_service.py => pipecat/services/moondream.py} (56%) rename src/{dailyai/services/ollama_ai_services.py => pipecat/services/ollama.py} (59%) rename src/{dailyai/services/openai_api_llm_service.py => pipecat/services/openai.py} (58%) rename src/{dailyai/services/playht_ai_service.py => pipecat/services/playht.py} (56%) 
create mode 100644 src/pipecat/services/to_be_updated/__init__.py rename src/{dailyai => pipecat}/services/to_be_updated/cloudflare_ai_service.py (100%) rename src/{dailyai => pipecat}/services/to_be_updated/google_ai_service.py (100%) rename src/{dailyai => pipecat}/services/to_be_updated/huggingface_ai_service.py (100%) rename src/{dailyai => pipecat}/services/to_be_updated/mock_ai_service.py (100%) rename src/{dailyai/services/whisper_ai_services.py => pipecat/services/whisper.py} (59%) create mode 100644 src/pipecat/storage/__init__.py rename src/{dailyai => pipecat}/storage/search.py (100%) create mode 100644 src/pipecat/transports/base_input.py create mode 100644 src/pipecat/transports/base_output.py create mode 100644 src/pipecat/transports/base_transport.py create mode 100644 src/pipecat/transports/local/__init__.py create mode 100644 src/pipecat/transports/local/audio.py create mode 100644 src/pipecat/transports/local/tk.py create mode 100644 src/pipecat/transports/services/daily.py create mode 100644 src/pipecat/utils/__init__.py create mode 100644 src/pipecat/utils/utils.py create mode 100644 src/pipecat/vad/__init__.py create mode 100644 src/pipecat/vad/silero.py create mode 100644 src/pipecat/vad/vad_analyzer.py diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 497b20891..1a6faddad 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -46,7 +46,7 @@ jobs: needs: [ build ] environment: name: pypi - url: https://pypi.org/p/dailyai + url: https://pypi.org/p/pipecat permissions: id-token: write steps: @@ -67,7 +67,7 @@ jobs: needs: [ build ] environment: name: testpypi - url: https://pypi.org/p/dailyai + url: https://pypi.org/p/pipecat permissions: id-token: write steps: diff --git a/.github/workflows/publish_test.yaml b/.github/workflows/publish_test.yaml index eab9b63ea..deff68823 100644 --- a/.github/workflows/publish_test.yaml +++ b/.github/workflows/publish_test.yaml @@ -46,7 +46,7 @@ jobs: needs: [ build ] environment: name: testpypi - url: https://pypi.org/p/dailyai + url: https://pypi.org/p/pipecat permissions: id-token: write steps: diff --git a/README.md b/README.md index 40494bd1f..916151b19 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,14 @@ -[![PyPI](https://img.shields.io/pypi/v/dailyai)](https://pypi.org/project/dailyai) +[![PyPI](https://img.shields.io/pypi/v/pipecat)](https://pypi.org/project/pipecat) -> [!IMPORTANT] -> Hackathon attendees - getting started doc can be found [here](https://dailyco.notion.site/Daily-AI-ff356d3a799649e583fa91c1ccfe0d87) - - -# dailyai — an open source framework for real-time, multi-modal, conversational AI applications +# Pipecat — an open source framework for voice (and multimodal) assistants Build things like this: [![AI-powered voice patient intake for healthcare](https://img.youtube.com/vi/lDevgsp9vn0/0.jpg)](https://www.youtube.com/watch?v=lDevgsp9vn0) -[ [dailyai starter kits repository](https://github.com/daily-co/dailyai-examples) ] +[ [pipecat starter kits repository](https://github.com/daily-co/pipecat-examples) ] -**`dailyai` started as a toolkit for implementing generative AI voice bots.** Things like personal coaches, meeting assistants, story-telling toys for kids, customer support bots, and snarky social companions. +**`Pipecat` started as a toolkit for implementing generative AI voice bots.** Things like personal coaches, meeting assistants, story-telling toys for kids, customer support bots, and snarky social companions. 
In 2023 a _lot_ of us got excited about the possibility of having open-ended conversations with LLMs. It became clear pretty quickly that we were all solving the same [low-level problems](https://www.daily.co/blog/how-to-talk-to-an-llm-with-your-voice/): @@ -24,7 +20,7 @@ In 2023 a _lot_ of us got excited about the possibility of having open-ended con As our applications expanded to include additional things like image generation, function calling, and vision models, we started to think about what a complete framework for these kinds of apps could look like. -Today, `dailyai` is: +Today, `pipecat` is: 1. a set of code building blocks for interacting with generative AI services and creating low-latency, interruptible data pipelines that use multiple services 2. transport services that moves audio, video, and events across the Internet @@ -49,19 +45,19 @@ Currently implemented services: - ElevenLabs - Transport - Daily - - Local (in progress, intended as a quick start example service) + - Local - Vision - Moondream -If you'd like to [implement a service](<(https://github.com/daily-co/daily-ai-sdk/tree/main/src/dailyai/services)>), we welcome PRs! Our goal is to support lots of services in all of the above categories, plus new categories (like real-time video) as they emerge. +If you'd like to [implement a service](<(https://github.com/daily-co/pipecat/tree/main/src/pipecat/services)>), we welcome PRs! Our goal is to support lots of services in all of the above categories, plus new categories (like real-time video) as they emerge. ## Getting started -Today, the easiest way to get started with `dailyai` is to use [Daily](https://www.daily.co/) as your transport service. This toolkit started life as an internal SDK at Daily and millions of minutes of AI conversation have been served using it and its earlier prototype incarnations. (The [transport base class](https://github.com/daily-co/daily-ai-sdk/blob/main/src/dailyai/transports/abstract_transport.py) is easy to extend, though, so feel free to submit PRs if you'd like to implement another transport service.) +Today, the easiest way to get started with `pipecat` is to use [Daily](https://www.daily.co/) as your transport service. This toolkit started life as an internal SDK at Daily and millions of minutes of AI conversation have been served using it and its earlier prototype incarnations. ``` # install the module -pip install dailyai +pip install pipecat # set up an .env file with API keys cp dot-env.template .env @@ -71,7 +67,7 @@ By default, in order to minimize dependencies, only the basic framework function dependencies that you can install with: ``` -pip install "dailyai[option,...]" +pip install "pipecat[option,...]" ``` Your project may or may not need these, so they're made available as optional requirements. 
Here is a list: @@ -83,8 +79,8 @@ Your project may or may not need these, so they're made available as optional re There are two directories of examples: -- [foundational](https://github.com/daily-co/daily-ai-sdk/tree/main/examples/foundational) — demos that build on each other, introducing one or two concepts at a time -- [starter apps](https://github.com/daily-co/daily-ai-sdk/tree/main/examples/starter-apps) — complete applications that you can use as starting points for development +- [foundational](https://github.com/daily-co/pipecat/tree/main/examples/foundational) — examples that build on each other, introducing one or two concepts at a time +- [starter apps](https://github.com/daily-co/pipecat/tree/main/examples/starter-apps) — complete applications that you can use as starting points for development Before running the examples you need to install the dependencies (which will install all the dependencies to run all of the examples): diff --git a/dev-requirements.txt b/dev-requirements.txt index bbd666cf6..9e0d93cbe 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,6 @@ -autopep8==2.0.4 -build==1.0.3 -pip-tools==7.4.1 -pytest==8.1.1 -setuptools==69.2.0 -setuptools_scm==8.0.4 +autopep8~=2.1.0 +build~=1.2.1 +pip-tools~=7.4.1 +pytest~=8.2.0 +setuptools~=69.5.1 +setuptools_scm~=8.1.0 diff --git a/docs/README.md b/docs/README.md index e5303dbd8..3de18ea09 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,8 +1,8 @@ -# Daily AI SDK Docs +# Pipecat Docs ## [Architecture Overview](architecture.md) -Learn about the thinking behind the SDK's design. +Learn about the thinking behind the framework's design. ## [A Frame's Progress](frame-progress.md) @@ -10,7 +10,7 @@ See how a Frame is processed through a Transport, a Pipeline, and a series of Fr ## [Example Code](examples/) -The repo includes several example apps in the `examples` directory. The docs explain how they work. +The repository includes several example apps in the `examples` directory. The docs explain how they work. ## [API Reference](api/) diff --git a/docs/architecture.md b/docs/architecture.md index 1c5946c9c..a44cd2add 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,4 +1,4 @@ -# Daily AI SDK Architecture Guide +# Pipecat architecture guide ## Frames @@ -10,8 +10,8 @@ Frame processors operate on frames. Every frame processor implements a `process_ ## Pipelines -Pipelines are lists of frame processors that read from a source queue and send the processed frames to a sink queue. A very simple pipeline might chain an LLM frame processor to a text-to-speech frame processor, with a transport's send queue as its sync. Placing LLM message frames on the pipeline's source queue will cause the LLM's response to be spoken. See example #2 for an implementation of this. +Pipelines are lists of frame processors linked together. Frame processors can push frames upstream or downstream to their peers. A very simple pipeline might chain an LLM frame processor to a text-to-speech frame processor, with a transport as an output. ## Transports -Transports provide a receive queue, which is input from "the outside world", and a sink queue, which is data that will be sent "to the outside world". The `LocalTransportService` does this with the local camera, mic, display and speaker. The `DailyTransportService` does this with a WebRTC session joined to a Daily.co room. +Transports provide input and output frame processors to receive or send frames respectively. 
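As a rough sketch of how these pieces fit together — an LLM frame processor feeding a text-to-speech processor, with a transport's output processor as the last step — the snippet below condenses the updated examples later in this patch (`02-llm-say-one-thing.py` and friends). The service choices, model name, and environment variables are illustrative assumptions borrowed from those examples; the examples themselves obtain the room URL through their `runner.configure()` helper rather than an environment variable.

```python
import asyncio
import os

import aiohttp

from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport


async def main(room_url: str):
    async with aiohttp.ClientSession() as session:
        # Output-only transport: the pipeline ends at transport.output(),
        # which sends the generated audio "to the outside world".
        transport = DailyTransport(
            room_url, None, "Pipeline sketch",
            DailyParams(audio_out_enabled=True))

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4-turbo-preview")

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"))

        # Frames flow downstream in list order:
        # LLM messages -> text -> speech audio -> transport output.
        task = PipelineTask(Pipeline([llm, tts, transport.output()]))

        messages = [{"role": "system",
                     "content": "Say hello to the world."}]

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Speak once; the EndFrame then shuts the pipeline down cleanly.
            await task.queue_frames([LLMMessagesFrame(messages), EndFrame()])

        await PipelineRunner().run(task)


if __name__ == "__main__":
    # Assumed env var for illustration only.
    asyncio.run(main(os.getenv("DAILY_SAMPLE_ROOM_URL", "")))
```

The local transports added in this patch (for example `TkLocalTransport`) slot into the same last position via their own `output()` processors.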
For example, the `DailyTransport` does this with a WebRTC session joined to a Daily.co room. diff --git a/examples/foundational/01-say-one-thing.py b/examples/foundational/01-say-one-thing.py index aecda2963..970cc8145 100644 --- a/examples/foundational/01-say-one-thing.py +++ b/examples/foundational/01-say-one-thing.py @@ -1,31 +1,36 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio import aiohttp -import logging import os -from dailyai.pipeline.frames import EndFrame, TextFrame -from dailyai.pipeline.pipeline import Pipeline +import sys -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService +from pipecat.frames.frames import EndFrame, TextFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.task import PipelineTask +from pipecat.pipeline.runner import PipelineRunner +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.transports.services.daily import DailyParams, DailyTransport from runner import configure +from loguru import logger + from dotenv import load_dotenv load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") async def main(room_url): async with aiohttp.ClientSession() as session: transport = DailyTransport( - room_url, - None, - "Say One Thing", - mic_enabled=True, - ) + room_url, None, "Say One Thing", DailyParams(audio_out_enabled=True)) tts = ElevenLabsTTSService( aiohttp_session=session, @@ -33,21 +38,18 @@ async def main(room_url): voice_id=os.getenv("ELEVENLABS_VOICE_ID"), ) - pipeline = Pipeline([tts]) + runner = PipelineRunner() + + task = PipelineTask(Pipeline([tts, transport.output()])) # Register an event handler so we can play the audio when the # participant joins. 
@transport.event_handler("on_participant_joined") - async def on_participant_joined(transport, participant): - if participant["info"]["isLocal"]: - return - + async def on_new_participant_joined(transport, participant): participant_name = participant["info"]["userName"] or '' - await pipeline.queue_frames([TextFrame("Hello there, " + participant_name + "!"), EndFrame()]) - - await transport.run(pipeline) - del tts + await task.queue_frames([TextFrame(f"Hello there, {participant_name}!"), EndFrame()]) + await runner.run(task) if __name__ == "__main__": (url, token) = configure() diff --git a/examples/foundational/01a-local-audio.py b/examples/foundational/01a-local-audio.py new file mode 100644 index 000000000..2ed9a072b --- /dev/null +++ b/examples/foundational/01a-local-audio.py @@ -0,0 +1,53 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import aiohttp +import os +import sys + +from pipecat.frames.frames import EndFrame, TextFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.transports.base_transport import TransportParams +from pipecat.transports.local.audio import AudioLocalTransport + +from loguru import logger + +from dotenv import load_dotenv +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + transport = AudioLocalTransport(TransportParams(audio_out_enabled=True)) + + tts = ElevenLabsTTSService( + aiohttp_session=session, + api_key=os.getenv("ELEVENLABS_API_KEY"), + voice_id=os.getenv("ELEVENLABS_VOICE_ID"), + ) + + pipeline = Pipeline([tts, transport.output()]) + + task = PipelineTask(pipeline) + + async def say_something(): + await asyncio.sleep(1) + await task.queue_frames([TextFrame("Hello there!"), EndFrame()]) + + runner = PipelineRunner() + + await asyncio.gather(runner.run(task), say_something()) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/foundational/01a-local-transport.py b/examples/foundational/01a-local-transport.py deleted file mode 100644 index 617459590..000000000 --- a/examples/foundational/01a-local-transport.py +++ /dev/null @@ -1,38 +0,0 @@ -import asyncio -import aiohttp -import logging -import os - -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.transports.local_transport import LocalTransport - -from dotenv import load_dotenv -load_dotenv(override=True) - -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) - - -async def main(): - async with aiohttp.ClientSession() as session: - meeting_duration_minutes = 1 - transport = LocalTransport( - duration_minutes=meeting_duration_minutes, mic_enabled=True - ) - tts = ElevenLabsTTSService( - aiohttp_session=session, - api_key=os.getenv("ELEVENLABS_API_KEY"), - voice_id=os.getenv("ELEVENLABS_VOICE_ID"), - ) - - async def say_something(): - await asyncio.sleep(1) - await transport.say("Hello there.", tts) - await transport.stop_when_done() - - await asyncio.gather(transport.run(), say_something()) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/foundational/02-llm-say-one-thing.py b/examples/foundational/02-llm-say-one-thing.py index a98815f1d..7e12263dc 100644 --- 
a/examples/foundational/02-llm-say-one-thing.py +++ b/examples/foundational/02-llm-say-one-thing.py @@ -1,23 +1,31 @@ -import asyncio -import os -import logging +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# +import asyncio import aiohttp +import os +import sys -from dailyai.pipeline.frames import EndFrame, LLMMessagesFrame -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.services.open_ai_services import OpenAILLMService +from pipecat.frames.frames import EndFrame, LLMMessagesFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.services.openai import OpenAILLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport from runner import configure +from loguru import logger + from dotenv import load_dotenv load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") async def main(room_url): @@ -26,8 +34,7 @@ async def main(room_url): room_url, None, "Say One Thing From an LLM", - mic_enabled=True, - ) + DailyParams(audio_out_enabled=True)) tts = ElevenLabsTTSService( aiohttp_session=session, @@ -45,13 +52,15 @@ async def main(room_url): "content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world.", }] - pipeline = Pipeline([llm, tts]) + runner = PipelineRunner() + + task = PipelineTask(Pipeline([llm, tts, transport.output()])) - @transport.event_handler("on_first_other_participant_joined") - async def on_first_other_participant_joined(transport, participant): - await pipeline.queue_frames([LLMMessagesFrame(messages), EndFrame()]) + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + await task.queue_frames([LLMMessagesFrame(messages), EndFrame()]) - await transport.run(pipeline) + await runner.run(task) if __name__ == "__main__": diff --git a/examples/foundational/03-still-frame.py b/examples/foundational/03-still-frame.py index 51ef47de8..48c95a29b 100644 --- a/examples/foundational/03-still-frame.py +++ b/examples/foundational/03-still-frame.py @@ -1,21 +1,30 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio import aiohttp -import logging import os +import sys -from dailyai.pipeline.frames import TextFrame -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.fal_ai_services import FalImageGenService +from pipecat.frames.frames import TextFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.services.fal import FalImageGenService +from pipecat.transports.services.daily import DailyParams, DailyTransport from runner import configure +from loguru import logger + from dotenv import load_dotenv load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) 
+logger.add(sys.stderr, level="DEBUG") async def main(room_url): @@ -24,10 +33,11 @@ async def main(room_url): room_url, None, "Show a still frame image", - camera_enabled=True, - camera_width=1024, - camera_height=1024, - duration_minutes=1 + DailyParams( + camera_out_enabled=True, + camera_out_width=1024, + camera_out_height=1024 + ) ) imagegen = FalImageGenService( @@ -38,19 +48,19 @@ async def main(room_url): key=os.getenv("FAL_KEY"), ) - pipeline = Pipeline([imagegen]) + runner = PipelineRunner() - @transport.event_handler("on_first_other_participant_joined") - async def on_first_other_participant_joined(transport, participant): + task = PipelineTask(Pipeline([imagegen, transport.output()])) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): # Note that we do not put an EndFrame() item in the pipeline for this demo. # This means that the bot will stay in the channel until it times out. # An EndFrame() in the pipeline would cause the transport to shut # down. - await pipeline.queue_frames( - [TextFrame("a cat in the style of picasso")] - ) + await task.queue_frames([TextFrame("a cat in the style of picasso")]) - await transport.run(pipeline) + await runner.run(task) if __name__ == "__main__": diff --git a/examples/foundational/03a-image-local.py b/examples/foundational/03a-image-local.py deleted file mode 100644 index f213f505b..000000000 --- a/examples/foundational/03a-image-local.py +++ /dev/null @@ -1,58 +0,0 @@ -import asyncio -import aiohttp -import logging -import os - -import tkinter as tk - -from dailyai.pipeline.frames import TextFrame -from dailyai.pipeline.pipeline import Pipeline -from dailyai.services.fal_ai_services import FalImageGenService -from dailyai.transports.local_transport import LocalTransport - -from dotenv import load_dotenv -load_dotenv(override=True) - -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) - - -async def main(): - async with aiohttp.ClientSession() as session: - meeting_duration_minutes = 2 - - tk_root = tk.Tk() - tk_root.title("dailyai") - - transport = LocalTransport( - tk_root=tk_root, - mic_enabled=False, - camera_enabled=True, - camera_width=1024, - camera_height=1024, - duration_minutes=meeting_duration_minutes, - ) - - imagegen = FalImageGenService( - params=FalImageGenService.InputParams( - image_size="square_hd" - ), - aiohttp_session=session, - key=os.getenv("FAL_KEY"), - ) - - pipeline = Pipeline([imagegen]) - await pipeline.queue_frames([TextFrame("a cat in the style of picasso")]) - - async def run_tk(): - while not transport._stop_threads.is_set(): - tk_root.update() - tk_root.update_idletasks() - await asyncio.sleep(0.1) - - await asyncio.gather(transport.run(pipeline, override_pipeline_source_queue=False), run_tk()) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/foundational/03a-local-still-frame.py b/examples/foundational/03a-local-still-frame.py new file mode 100644 index 000000000..d645f8b95 --- /dev/null +++ b/examples/foundational/03a-local-still-frame.py @@ -0,0 +1,68 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import aiohttp +import os +import sys + +import tkinter as tk + +from pipecat.frames.frames import TextFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from 
pipecat.services.fal import FalImageGenService +from pipecat.transports.base_transport import TransportParams +from pipecat.transports.local.tk import TkLocalTransport + +from loguru import logger + +from dotenv import load_dotenv +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + tk_root = tk.Tk() + tk_root.title("Picasso Cat") + + transport = TkLocalTransport( + tk_root, + TransportParams( + camera_out_enabled=True, + camera_out_width=1024, + camera_out_height=1024)) + + imagegen = FalImageGenService( + params=FalImageGenService.InputParams( + image_size="square_hd" + ), + aiohttp_session=session, + key=os.getenv("FAL_KEY"), + ) + + pipeline = Pipeline([imagegen, transport.output()]) + + task = PipelineTask(pipeline) + await task.queue_frames([TextFrame("a cat in the style of picasso")]) + + runner = PipelineRunner() + + async def run_tk(): + while runner.is_active(): + tk_root.update() + tk_root.update_idletasks() + await asyncio.sleep(0.1) + + await asyncio.gather(runner.run(task), run_tk()) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/foundational/04-utterance-and-speech.py b/examples/foundational/04-utterance-and-speech.py index 908be03b4..acefed38d 100644 --- a/examples/foundational/04-utterance-and-speech.py +++ b/examples/foundational/04-utterance-and-speech.py @@ -1,37 +1,40 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import aiohttp import asyncio -import logging import os +import sys -import aiohttp -from dailyai.pipeline.merge_pipeline import SequentialMergePipeline -from dailyai.pipeline.pipeline import Pipeline +from pipecat.pipeline.merge_pipeline import SequentialMergePipeline +from pipecat.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService -from dailyai.services.deepgram_ai_services import DeepgramTTSService -from dailyai.pipeline.frames import EndPipeFrame, LLMMessagesFrame, TextFrame -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService +from pipecat.frames.frames import EndPipeFrame, LLMMessagesFrame, TextFrame +from pipecat.pipeline.task import PipelineTask +from pipecat.services.azure import AzureLLMService, AzureTTSService +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.services.transport_services import TransportServiceOutput +from pipecat.services.transports.daily_transport import DailyTransport from runner import configure +from loguru import logger + from dotenv import load_dotenv load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") async def main(room_url: str): async with aiohttp.ClientSession() as session: - transport = DailyTransport( - room_url, - None, - "Static And Dynamic Speech", - duration_minutes=1, - mic_enabled=True, - mic_sample_rate=16000, - ) + transport = DailyTransport(room_url, None, "Static And Dynamic Speech") + + meeting = TransportServiceOutput(transport, mic_enabled=True) llm = AzureLLMService( api_key=os.getenv("AZURE_CHATGPT_API_KEY"), @@ -43,10 +46,6 @@ async def main(room_url: str): region=os.getenv("AZURE_SPEECH_REGION"), ) - deepgram_tts = DeepgramTTSService( - aiohttp_session=session, - 
api_key=os.getenv("DEEPGRAM_API_KEY"), - ) elevenlabs_tts = ElevenLabsTTSService( aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), @@ -56,11 +55,13 @@ async def main(room_url: str): messages = [{"role": "system", "content": "tell the user a joke about llamas"}] - # Start a task to run the LLM to create a joke, and convert the LLM output to audio frames. This task - # will run in parallel with generating and speaking the audio for static text, so there's no delay to - # speak the LLM response. + # Start a task to run the LLM to create a joke, and convert the LLM + # output to audio frames. This task will run in parallel with generating + # and speaking the audio for static text, so there's no delay to speak + # the LLM response. llm_pipeline = Pipeline([llm, elevenlabs_tts]) - await llm_pipeline.queue_frames([LLMMessagesFrame(messages), EndPipeFrame()]) + llm_task = PipelineTask(llm_pipeline) + await llm_task.queue_frames([LLMMessagesFrame(messages), EndPipeFrame()]) simple_tts_pipeline = Pipeline([azure_tts]) await simple_tts_pipeline.queue_frames( diff --git a/examples/foundational/05-sync-speech-and-image.py b/examples/foundational/05-sync-speech-and-image.py index 377e8579b..257a6181e 100644 --- a/examples/foundational/05-sync-speech-and-image.py +++ b/examples/foundational/05-sync-speech-and-image.py @@ -1,64 +1,81 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio import aiohttp import os -import logging +import sys -from dataclasses import dataclass -from typing import AsyncGenerator +import daily -from dailyai.pipeline.aggregators import ( - GatedAggregator, - LLMFullResponseAggregator, - ParallelPipeline, - SentenceAggregator, -) -from dailyai.pipeline.frames import ( +from pipecat.frames.frames import ( + AppFrame, Frame, + ImageRawFrame, TextFrame, EndFrame, - ImageFrame, LLMMessagesFrame, LLMResponseStartFrame, ) -from dailyai.pipeline.frame_processor import FrameProcessor - -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.open_ai_services import OpenAILLMService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.services.fal_ai_services import FalImageGenService +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.processors.aggregators.gated import GatedAggregator +from pipecat.processors.aggregators.llm_response import LLMFullResponseAggregator +from pipecat.processors.aggregators.sentence import SentenceAggregator +from pipecat.processors.aggregators.parallel_task import ParallelTask +from pipecat.services.openai import OpenAILLMService +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.services.fal import FalImageGenService +from pipecat.transports.services.daily import DailyParams, DailyTransport from runner import configure +from loguru import logger + from dotenv import load_dotenv load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +class MonthFrame(AppFrame): + def __init__(self, month): + super().__init__() + self.metadata["month"] = month + + @ property + def month(self) -> str: + return 
self.metadata["month"] + def __str__(self): + return f"{self.name}(month: {self.month})" -@dataclass -class MonthFrame(Frame): month: str class MonthPrepender(FrameProcessor): def __init__(self): + super().__init__() self.most_recent_month = "Placeholder, month frame not yet received" self.prepend_to_next_text_frame = False - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: + async def process_frame(self, frame: Frame, direction: FrameDirection): if isinstance(frame, MonthFrame): self.most_recent_month = frame.month elif self.prepend_to_next_text_frame and isinstance(frame, TextFrame): - yield TextFrame(f"{self.most_recent_month}: {frame.text}") + await self.push_frame(TextFrame(f"{self.most_recent_month}: {frame.data}")) self.prepend_to_next_text_frame = False elif isinstance(frame, LLMResponseStartFrame): self.prepend_to_next_text_frame = True - yield frame + await self.push_frame(frame) else: - yield frame + await self.push_frame(frame, direction) async def main(room_url): @@ -67,11 +84,12 @@ async def main(room_url): room_url, None, "Month Narration Bot", - mic_enabled=True, - camera_enabled=True, - mic_sample_rate=16000, - camera_width=1024, - camera_height=1024, + DailyParams( + audio_out_enabled=True, + camera_out_enabled=True, + camera_out_width=1024, + camera_out_height=1024 + ) ) tts = ElevenLabsTTSService( @@ -93,24 +111,25 @@ async def main(room_url): ) gated_aggregator = GatedAggregator( - gate_open_fn=lambda frame: isinstance( - frame, ImageFrame), gate_close_fn=lambda frame: isinstance( - frame, LLMResponseStartFrame), start_open=False, ) + gate_open_fn=lambda frame: isinstance(frame, ImageRawFrame), + gate_close_fn=lambda frame: isinstance(frame, LLMResponseStartFrame), + start_open=False + ) sentence_aggregator = SentenceAggregator() month_prepender = MonthPrepender() llm_full_response_aggregator = LLMFullResponseAggregator() - pipeline = Pipeline( - processors=[ - llm, - sentence_aggregator, - ParallelPipeline( - [[month_prepender, tts], [llm_full_response_aggregator, imagegen]] - ), - gated_aggregator, - ], - ) + pipeline = Pipeline([ + llm, + sentence_aggregator, + ParallelTask( + [month_prepender, tts], + [llm_full_response_aggregator, imagegen] + ), + gated_aggregator, + transport.output() + ]) frames = [] for month in [ @@ -137,9 +156,14 @@ async def main(room_url): frames.append(LLMMessagesFrame(messages)) frames.append(EndFrame()) - await pipeline.queue_frames(frames) - await transport.run(pipeline, override_pipeline_source_queue=False) + runner = PipelineRunner() + + task = PipelineTask(pipeline) + + await task.queue_frames(frames) + + await runner.run(task) if __name__ == "__main__": diff --git a/examples/foundational/05a-local-sync-speech-and-image.py b/examples/foundational/05a-local-sync-speech-and-image.py new file mode 100644 index 000000000..529317a90 --- /dev/null +++ b/examples/foundational/05a-local-sync-speech-and-image.py @@ -0,0 +1,164 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import aiohttp +import asyncio +import os +import sys + +import tkinter as tk + +from pipecat.frames.frames import AudioRawFrame, Frame, URLImageRawFrame, LLMMessagesFrame, TextFrame +from pipecat.pipeline.parallel_pipeline import ParallelPipeline +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.processors.aggregators.llm_response import LLMFullResponseAggregator +from 
pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.services.openai import OpenAILLMService +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.services.fal import FalImageGenService +from pipecat.transports.base_transport import TransportParams +from pipecat.transports.local.tk import TkLocalTransport + +from loguru import logger + +from dotenv import load_dotenv + +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + tk_root = tk.Tk() + tk_root.title("Calendar") + + runner = PipelineRunner() + + async def get_month_data(month): + messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }] + + class ImageDescription(FrameProcessor): + def __init__(self): + super().__init__() + self.text = "" + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, TextFrame): + self.text = frame.text + await self.push_frame(frame, direction) + + class AudioGrabber(FrameProcessor): + def __init__(self): + super().__init__() + self.audio = bytearray() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, AudioRawFrame): + self.audio.extend(frame.data) + self.frame = AudioRawFrame( + bytes(self.audio), frame.sample_rate, frame.num_channels) + + class ImageGrabber(FrameProcessor): + def __init__(self): + super().__init__() + self.frame = None + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, URLImageRawFrame): + self.frame = frame + + llm = OpenAILLMService( + api_key=os.getenv("OPENAI_API_KEY"), + model="gpt-4-turbo-preview") + + tts = ElevenLabsTTSService( + aiohttp_session=session, + api_key=os.getenv("ELEVENLABS_API_KEY"), + voice_id=os.getenv("ELEVENLABS_VOICE_ID")) + + imagegen = FalImageGenService( + params=FalImageGenService.InputParams( + image_size="square_hd" + ), + aiohttp_session=session, + key=os.getenv("FAL_KEY")) + + aggregator = LLMFullResponseAggregator() + + description = ImageDescription() + + audio_grabber = AudioGrabber() + + image_grabber = ImageGrabber() + + pipeline = Pipeline([llm, aggregator, description, + ParallelPipeline([tts, audio_grabber], + [imagegen, image_grabber])]) + + task = PipelineTask(pipeline) + await task.queue_frame(LLMMessagesFrame(messages)) + await task.stop_when_done() + + await runner.run(task) + + return { + "month": month, + "text": description.text, + "image": image_grabber.frame, + "audio": audio_grabber.frame, + } + + transport = TkLocalTransport( + tk_root, + TransportParams( + audio_out_enabled=True, + camera_out_enabled=True, + camera_out_width=1024, + camera_out_height=1024)) + + pipeline = Pipeline([transport.output()]) + + task = PipelineTask(pipeline) + + # We only specify 5 months as we create tasks all at once and we might + # get rate limited otherwise. + months: list[str] = [ + "January", + "February", + # "March", + # "April", + # "May", + ] + + # We create one task per month. This will be executed concurrently. + month_tasks = [asyncio.create_task(get_month_data(month)) for month in months] + + # Now we wait for each month task in the order they're completed. 
The + # benefit is we'll have as little delay as possible before the first + # month, and likely no delay between months, but the months won't + # display in order. + async def show_images(month_tasks): + for month_data_task in asyncio.as_completed(month_tasks): + data = await month_data_task + await task.queue_frames([data["image"], data["audio"]]) + + await runner.stop_when_done() + + async def run_tk(): + while True: + tk_root.update() + tk_root.update_idletasks() + await asyncio.sleep(0.1) + + await asyncio.gather(runner.run(task), show_images(month_tasks), run_tk()) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/foundational/05a-local-sync-speech-and-text.py b/examples/foundational/05a-local-sync-speech-and-text.py deleted file mode 100644 index 7c4cf0186..000000000 --- a/examples/foundational/05a-local-sync-speech-and-text.py +++ /dev/null @@ -1,146 +0,0 @@ -import aiohttp -import asyncio -import logging -import tkinter as tk -import os -from dailyai.pipeline.aggregators import LLMFullResponseAggregator - -from dailyai.pipeline.frames import AudioFrame, URLImageFrame, LLMMessagesFrame, TextFrame -from dailyai.services.open_ai_services import OpenAILLMService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.services.fal_ai_services import FalImageGenService -from dailyai.transports.local_transport import LocalTransport - -from dotenv import load_dotenv -load_dotenv(override=True) - -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) - - -async def main(): - async with aiohttp.ClientSession() as session: - meeting_duration_minutes = 5 - tk_root = tk.Tk() - tk_root.title("dailyai") - - transport = LocalTransport( - mic_enabled=True, - camera_enabled=True, - camera_width=1024, - camera_height=1024, - duration_minutes=meeting_duration_minutes, - tk_root=tk_root, - ) - - tts = ElevenLabsTTSService( - aiohttp_session=session, - api_key=os.getenv("ELEVENLABS_API_KEY"), - voice_id=os.getenv("ELEVENLABS_VOICE_ID"), - ) - - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4-turbo-preview") - - imagegen = FalImageGenService( - params=FalImageGenService.InputParams( - image_size="1024x1024" - ), - aiohttp_session=session, - key=os.getenv("FAL_KEY"), - ) - - # Get a complete audio chunk from the given text. Splitting this into its own - # coroutine lets us ensure proper ordering of the audio chunks on the - # send queue. - async def get_all_audio(text): - all_audio = bytearray() - async for audio in tts.run_tts(text): - all_audio.extend(audio) - - return all_audio - - async def get_month_description(aggregator, frame): - async for frame in aggregator.process_frame(frame): - if isinstance(frame, TextFrame): - return frame.text - - async def get_month_data(month): - messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. 
Limit the description to one sentence, please.", }] - - messages_frame = LLMMessagesFrame(messages) - - llm_full_response_aggregator = LLMFullResponseAggregator() - - image_description = None - async for frame in llm.process_frame(messages_frame): - result = await get_month_description(llm_full_response_aggregator, frame) - if result: - image_description = result - break - - if not image_description: - return - - to_speak = f"{month}: {image_description}" - audio_task = asyncio.create_task(get_all_audio(to_speak)) - image_task = asyncio.create_task( - imagegen.run_image_gen(image_description)) - (audio, image_data) = await asyncio.gather(audio_task, image_task) - - return { - "month": month, - "text": image_description, - "image_url": image_data[0], - "image": image_data[1], - "image_size": image_data[2], - "audio": audio, - } - - # We only specify 5 months as we create tasks all at once and we might - # get rate limited otherwise. - months: list[str] = [ - "January", - "February", - "March", - "April", - "May", - ] - - async def show_images(): - # This will play the months in the order they're completed. The benefit - # is we'll have as little delay as possible before the first month, and - # likely no delay between months, but the months won't display in - # order. - for month_data_task in asyncio.as_completed(month_tasks): - data = await month_data_task - if data: - await transport.send_queue.put( - [ - URLImageFrame(data["image_url"], data["image"], data["image_size"]), - AudioFrame(data["audio"]), - ] - ) - - await asyncio.sleep(25) - - # wait for the output queue to be empty, then leave the meeting - await transport.stop_when_done() - - async def run_tk(): - while not transport._stop_threads.is_set(): - tk_root.update() - tk_root.update_idletasks() - await asyncio.sleep(0.1) - - month_tasks = [ - asyncio.create_task( - get_month_data(month)) for month in months] - - await asyncio.gather(transport.run(), show_images(), run_tk()) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/foundational/06-listen-and-respond.py b/examples/foundational/06-listen-and-respond.py index 0de16b270..4e5d0758f 100644 --- a/examples/foundational/06-listen-and-respond.py +++ b/examples/foundational/06-listen-and-respond.py @@ -1,26 +1,37 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio import aiohttp -import logging import os -from dailyai.pipeline.frames import LLMMessagesFrame -from dailyai.pipeline.pipeline import Pipeline - -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.services.open_ai_services import OpenAILLMService -from dailyai.services.ai_services import FrameLogger -from dailyai.pipeline.aggregators import ( +import sys + +from pipecat.frames.frames import LLMMessagesFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.processors.aggregators.llm_response import ( LLMAssistantResponseAggregator, LLMUserResponseAggregator, ) +from pipecat.processors.logger import FrameLogger +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.services.openai import OpenAILLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport +from pipecat.vad.silero import SileroVAD + from runner import configure +from loguru import logger + from dotenv import load_dotenv 
load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") async def main(room_url: str, token): @@ -29,14 +40,15 @@ async def main(room_url: str, token): room_url, token, "Respond bot", - duration_minutes=5, - start_transcription=True, - mic_enabled=True, - mic_sample_rate=16000, - camera_enabled=False, - vad_enabled=True, + DailyParams( + audio_in_enabled=True, # This is so Silero VAD can get audio data + audio_out_enabled=True, + transcription_enabled=True + ) ) + vad = SileroVAD() + tts = ElevenLabsTTSService( aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), @@ -46,37 +58,35 @@ async def main(room_url: str, token): llm = OpenAILLMService( api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4-turbo-preview") - fl = FrameLogger("Inner") - fl2 = FrameLogger("Outer") + + fl_in = FrameLogger("Inner") + fl_out = FrameLogger("Outer") + messages = [ { "role": "system", - "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not contain special characters. Respond to what the user said in a creative and helpful way.", }, ] - tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) - pipeline = Pipeline( - processors=[ - fl, - tma_in, - llm, - fl2, - tts, - tma_out, - ], - ) + pipeline = Pipeline([fl_in, transport.input(), vad, tma_in, llm, + fl_out, tts, tma_out, transport.output()]) - @transport.event_handler("on_first_other_participant_joined") - async def on_first_other_participant_joined(transport, participant): + task = PipelineTask(pipeline) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
messages.append( {"role": "system", "content": "Please introduce yourself to the user."}) - await pipeline.queue_frames([LLMMessagesFrame(messages)]) + await task.queue_frames([LLMMessagesFrame(messages)]) + + runner = PipelineRunner() - await transport.run(pipeline) + await runner.run(task) if __name__ == "__main__": diff --git a/examples/foundational/06a-image-sync.py b/examples/foundational/06a-image-sync.py index 912586ec4..6b7d219cd 100644 --- a/examples/foundational/06a-image-sync.py +++ b/examples/foundational/06a-image-sync.py @@ -1,43 +1,59 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio -import os -import logging -from typing import AsyncGenerator import aiohttp +import os +import sys + from PIL import Image -from dailyai.pipeline.frames import ImageFrame, Frame, TextFrame -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.ai_services import AIService -from dailyai.pipeline.aggregators import ( +from pipecat.frames.frames import ImageRawFrame, Frame, SystemFrame, TextFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.processors.aggregators.llm_context import ( LLMAssistantContextAggregator, LLMUserContextAggregator, ) -from dailyai.services.open_ai_services import OpenAILLMService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.services.openai import OpenAILLMService +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.transports.services.daily import DailyTransport +from pipecat.transports.services.daily import DailyParams from runner import configure +from loguru import logger + from dotenv import load_dotenv load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") -class ImageSyncAggregator(AIService): +class ImageSyncAggregator(FrameProcessor): def __init__(self, speaking_path: str, waiting_path: str): + super().__init__() self._speaking_image = Image.open(speaking_path) + self._speaking_image_format = self._speaking_image.format self._speaking_image_bytes = self._speaking_image.tobytes() self._waiting_image = Image.open(waiting_path) + self._waiting_image_format = self._waiting_image.format self._waiting_image_bytes = self._waiting_image.tobytes() - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - yield ImageFrame(self._speaking_image_bytes, (1024, 1024)) - yield frame - yield ImageFrame(self._waiting_image_bytes, (1024, 1024)) + async def process_frame(self, frame: Frame, direction: FrameDirection): + if not isinstance(frame, SystemFrame): + await self.push_frame(ImageRawFrame(self._speaking_image_bytes, (1024, 1024), self._speaking_image_format)) + await self.push_frame(frame) + await self.push_frame(ImageRawFrame(self._waiting_image_bytes, (1024, 1024), self._waiting_image_format)) + else: + await self.push_frame(frame) async def main(room_url: str, token): @@ -46,12 +62,12 @@ async def main(room_url: str, token): room_url, token, "Respond bot", - 5, - camera_enabled=True, - camera_width=1024, - camera_height=1024, - mic_enabled=True, - mic_sample_rate=16000, + DailyParams( + 
audio_out_enabled=True, + camera_out_width=1024, + camera_out_height=1024, + transcription_enabled=True + ) ) tts = ElevenLabsTTSService( @@ -67,27 +83,32 @@ async def main(room_url: str, token): messages = [ { "role": "system", - "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not include any special characters. Respond to what the user said in a creative and helpful way.", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not contain special characters. Respond to what the user said in a creative and helpful way.", }, ] - tma_in = LLMUserContextAggregator( - messages, transport._my_participant_id) - tma_out = LLMAssistantContextAggregator( - messages, transport._my_participant_id - ) + tma_in = LLMUserContextAggregator(messages) + tma_out = LLMAssistantContextAggregator(messages) + image_sync_aggregator = ImageSyncAggregator( os.path.join(os.path.dirname(__file__), "assets", "speaking.png"), os.path.join(os.path.dirname(__file__), "assets", "waiting.png"), ) - pipeline = Pipeline([image_sync_aggregator, tma_in, llm, tma_out, tts]) + pipeline = Pipeline([transport.input(), image_sync_aggregator, + tma_in, llm, tma_out, tts, transport.output()]) + + task = PipelineTask(pipeline) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + participant_name = participant["info"]["userName"] or '' + transport.capture_participant_transcription(participant["id"]) + await task.queue_frames([TextFrame(f"Hi, this is {participant_name}.")]) - @transport.event_handler("on_first_other_participant_joined") - async def on_first_other_participant_joined(transport, participant): - await pipeline.queue_frames([TextFrame("Hi, I'm listening!")]) + runner = PipelineRunner() - await transport.run(pipeline) + await runner.run(task) if __name__ == "__main__": diff --git a/examples/foundational/07-interruptible.py b/examples/foundational/07-interruptible.py index 3f35a3536..fd0c2f842 100644 --- a/examples/foundational/07-interruptible.py +++ b/examples/foundational/07-interruptible.py @@ -2,16 +2,16 @@ import aiohttp import logging import os -from dailyai.pipeline.aggregators import ( +from pipecat.pipeline.aggregators import ( LLMAssistantResponseAggregator, LLMUserResponseAggregator, ) -from dailyai.pipeline.pipeline import Pipeline -from dailyai.services.ai_services import FrameLogger -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.open_ai_services import OpenAILLMService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService +from pipecat.pipeline.pipeline import Pipeline +from pipecat.services.ai_services import FrameLogger +from pipecat.transports.daily_transport import DailyTransport +from pipecat.services.open_ai_services import OpenAILLMService +from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService from runner import configure @@ -19,7 +19,7 @@ load_dotenv(override=True) logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") +logger = logging.getLogger("pipecat") logger.setLevel(logging.DEBUG) diff --git a/examples/foundational/08-bots-arguing.py b/examples/foundational/08-bots-arguing.py index ea6208827..9e88b7f4c 100644 --- a/examples/foundational/08-bots-arguing.py +++ 
b/examples/foundational/08-bots-arguing.py @@ -3,14 +3,14 @@ import asyncio import logging import os -from dailyai.pipeline.aggregators import SentenceAggregator -from dailyai.pipeline.pipeline import Pipeline +from pipecat.pipeline.aggregators import SentenceAggregator +from pipecat.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.services.fal_ai_services import FalImageGenService -from dailyai.pipeline.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame +from pipecat.transports.daily_transport import DailyTransport +from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService +from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService +from pipecat.services.fal_ai_services import FalImageGenService +from pipecat.pipeline.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame from runner import configure @@ -18,7 +18,7 @@ load_dotenv(override=True) logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") +logger = logging.getLogger("pipecat") logger.setLevel(logging.DEBUG) diff --git a/examples/foundational/09-mirror.py b/examples/foundational/09-mirror.py new file mode 100644 index 000000000..5991fdf70 --- /dev/null +++ b/examples/foundational/09-mirror.py @@ -0,0 +1,62 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import sys + +from pipecat.frames.frames import AudioRawFrame, ImageRawFrame +from pipecat.processors.filter import Filter +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.pipeline.parallel_pipeline import ParallelPipeline +from pipecat.transports.services.daily import DailyTransport, DailyParams + +from runner import configure + +from loguru import logger + +from dotenv import load_dotenv +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(room_url, token): + transport = DailyTransport( + room_url, token, "Test", + DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + camera_out_enabled=True, + camera_out_width=1280, + camera_out_height=720 + ) + ) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + transport.capture_participant_video(participant["id"]) + + # The ParallelPipeline is not really necessary here but it shows how you + # would process audio and video concurrently in parallel pipelines. 
+ pipeline = Pipeline([transport.input(), + ParallelPipeline( + [Filter([AudioRawFrame])], + [Filter([ImageRawFrame])]), + transport.output()]) + + runner = PipelineRunner() + + task = PipelineTask(pipeline) + + await runner.run(task) + + +if __name__ == "__main__": + (url, token) = configure() + asyncio.run(main(url, token)) diff --git a/examples/foundational/09a-local-mirror.py b/examples/foundational/09a-local-mirror.py new file mode 100644 index 000000000..d8da14343 --- /dev/null +++ b/examples/foundational/09a-local-mirror.py @@ -0,0 +1,65 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import sys + +import tkinter as tk + +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.transports.base_transport import TransportParams +from pipecat.transports.local.tk import TkLocalTransport +from pipecat.transports.services.daily import DailyParams, DailyTransport + +from runner import configure + +from loguru import logger + +from dotenv import load_dotenv +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(room_url, token): + tk_root = tk.Tk() + tk_root.title("Local Mirror") + + daily_transport = DailyTransport(room_url, token, "Test", DailyParams(audio_in_enabled=True)) + + tk_transport = TkLocalTransport( + tk_root, + TransportParams( + audio_out_enabled=True, + camera_out_enabled=True, + camera_out_width=1280, + camera_out_height=720)) + + @daily_transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + transport.capture_participant_video(participant["id"]) + + pipeline = Pipeline([daily_transport.input(), tk_transport.output()]) + + runner = PipelineRunner() + + async def run_tk(): + while runner.is_active(): + tk_root.update() + tk_root.update_idletasks() + await asyncio.sleep(0.1) + + task = PipelineTask(pipeline) + + await asyncio.gather(runner.run(task), run_tk()) + + +if __name__ == "__main__": + (url, token) = configure() + asyncio.run(main(url, token)) diff --git a/examples/foundational/10-wake-word.py b/examples/foundational/10-wake-word.py index 4d997153a..7c7977707 100644 --- a/examples/foundational/10-wake-word.py +++ b/examples/foundational/10-wake-word.py @@ -1,36 +1,47 @@ -import aiohttp +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio -import logging +import aiohttp import os import random -from typing import AsyncGenerator +import sys + from PIL import Image -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.open_ai_services import OpenAILLMService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.pipeline.aggregators import ( - LLMUserContextAggregator, - LLMAssistantContextAggregator, -) -from dailyai.pipeline.frames import ( +from pipecat.frames.frames import ( Frame, + SystemFrame, TextFrame, - ImageFrame, + ImageRawFrame, SpriteFrame, TranscriptionFrame, ) -from dailyai.services.ai_services import AIService +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.processors.aggregators.llm_context import ( + LLMUserContextAggregator, + LLMAssistantContextAggregator, +) +from pipecat.processors.frame_processor 
import FrameDirection, FrameProcessor +from pipecat.services.openai import OpenAILLMService +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.transports.services.daily import DailyParams, DailyTransport from runner import configure +from loguru import logger + from dotenv import load_dotenv load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + sprites = {} image_files = [ @@ -52,14 +63,15 @@ filename = os.path.splitext(os.path.basename(full_path))[0] # Open the image and convert it to bytes with Image.open(full_path) as img: - sprites[file] = img.tobytes() + sprites[file] = ImageRawFrame(img.tobytes(), img.size, img.format) # When the bot isn't talking, show a static image of the cat listening -quiet_frame = ImageFrame(sprites["sc-listen-1.png"], (720, 1280)) +quiet_frame = sprites["sc-listen-1.png"] + # When the bot is talking, build an animation from two sprites talking_list = [sprites["sc-default.png"], sprites["sc-talk.png"]] talking = [random.choice(talking_list) for x in range(30)] -talking_frame = SpriteFrame(images=talking) +talking_frame = SpriteFrame(talking) # TODO: Support "thinking" as soon as we get a valid transcript, while LLM # is processing @@ -69,50 +81,44 @@ sprites["sc-think-3.png"], sprites["sc-think-4.png"], ] -thinking_frame = SpriteFrame(images=thinking_list) - +thinking_frame = SpriteFrame(thinking_list) -class TranscriptFilter(AIService): - def __init__(self, bot_participant_id=None): - self.bot_participant_id = bot_participant_id - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, TranscriptionFrame): - if frame.participantId != self.bot_participant_id: - yield frame - - -class NameCheckFilter(AIService): +class NameCheckFilter(FrameProcessor): def __init__(self, names: list[str]): - self.names = names - self.sentence = "" + super().__init__() + self._names = names + self._sentence = "" + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, SystemFrame): + await self.push_frame(frame, direction) + return - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: content: str = "" # TODO: split up transcription by participant - if isinstance(frame, TextFrame): - content = frame.text - - self.sentence += content - if self.sentence.endswith((".", "?", "!")): - if any(name in self.sentence for name in self.names): - out = self.sentence - self.sentence = "" - yield TextFrame(out) - else: - out = self.sentence - self.sentence = "" - - -class ImageSyncAggregator(AIService): + if isinstance(frame, TranscriptionFrame): + content = frame.text + self._sentence += content + if self._sentence.endswith((".", "?", "!")): + if any(name in self._sentence for name in self._names): + await self.push_frame(TextFrame(self._sentence)) + self._sentence = "" + else: + self._sentence = "" + else: + await self.push_frame(frame, direction) + + +class ImageSyncAggregator(FrameProcessor): + def __init__(self): - pass + super().__init__() - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - yield talking_frame - yield frame - yield quiet_frame + async def process_frame(self, frame: Frame, direction: FrameDirection): + await self.push_frame(talking_frame) + await self.push_frame(frame) + await self.push_frame(quiet_frame) async def main(room_url: str, token): @@
-121,13 +127,14 @@ async def main(room_url: str, token): room_url, token, "Santa Cat", - duration_minutes=3, - start_transcription=True, - mic_enabled=True, - mic_sample_rate=16000, - camera_enabled=True, - camera_width=720, - camera_height=1280, + DailyParams( + audio_out_enabled=True, + camera_out_enabled=True, + camera_out_width=720, + camera_out_height=1280, + camera_out_framerate=10, + transcription_enabled=True + ) ) llm = OpenAILLMService( @@ -148,27 +155,27 @@ async def main(room_url: str, token): }, ] - tma_in = LLMUserContextAggregator( - messages, transport._my_participant_id) - tma_out = LLMAssistantContextAggregator( - messages, transport._my_participant_id - ) - tf = TranscriptFilter(transport._my_participant_id) + tma_in = LLMUserContextAggregator(messages) + tma_out = LLMAssistantContextAggregator(messages) ncf = NameCheckFilter(["Santa Cat", "Santa"]) - pipeline = Pipeline([isa, tf, ncf, tma_in, llm, tma_out, tts]) + pipeline = Pipeline([transport.input(), isa, ncf, tma_in, + llm, tma_out, tts, transport.output()]) - @transport.event_handler("on_first_other_participant_joined") - async def on_first_other_participant_joined(transport, participant): - await transport.say( - "Hi! If you want to talk to me, just say 'hey Santa Cat'.", - tts, - ) + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + # Send some greeting at the beginning. + await tts.say("Hi! If you want to talk to me, just say 'hey Santa Cat'.") + transport.capture_participant_transcription(participant["id"]) async def starting_image(): - await transport.send_queue.put(quiet_frame) + await transport.send_image(quiet_frame) + + runner = PipelineRunner() + + task = PipelineTask(pipeline) - await asyncio.gather(transport.run(pipeline), starting_image()) + await asyncio.gather(runner.run(task), starting_image()) if __name__ == "__main__": diff --git a/examples/foundational/11-sound-effects.py b/examples/foundational/11-sound-effects.py index ee8a29ce3..837a2fb9d 100644 --- a/examples/foundational/11-sound-effects.py +++ b/examples/foundational/11-sound-effects.py @@ -1,34 +1,44 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import aiohttp import asyncio -import logging import os +import sys import wave -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.open_ai_services import OpenAILLMService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.pipeline.aggregators import ( - LLMUserContextAggregator, - LLMAssistantContextAggregator, -) -from dailyai.services.ai_services import AIService, FrameLogger -from dailyai.pipeline.frames import ( +from pipecat.frames.frames import ( Frame, - AudioFrame, + AudioRawFrame, LLMResponseEndFrame, LLMMessagesFrame, ) -from typing import AsyncGenerator +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.processors.aggregators.llm_context import ( + LLMUserContextAggregator, + LLMAssistantContextAggregator, +) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.processors.logger import FrameLogger +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.services.openai import OpenAILLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport from runner import 
configure +from loguru import logger + from dotenv import load_dotenv load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + sounds = {} sound_files = ["ding1.wav", "ding2.wav"] @@ -42,33 +52,34 @@ filename = os.path.splitext(os.path.basename(full_path))[0] # Open the image and convert it to bytes with wave.open(full_path) as audio_file: - sounds[file] = audio_file.readframes(-1) + sounds[file] = AudioRawFrame(audio_file.readframes(-1), + audio_file.getframerate(), audio_file.getnchannels()) -class OutboundSoundEffectWrapper(AIService): +class OutboundSoundEffectWrapper(FrameProcessor): def __init__(self): - pass + super().__init__() - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: + async def process_frame(self, frame: Frame, direction: FrameDirection): if isinstance(frame, LLMResponseEndFrame): - yield AudioFrame(sounds["ding1.wav"]) - # In case anything else up the stack needs it - yield frame + await self.push_frame(sounds["ding1.wav"]) + # In case anything else downstream needs it + await self.push_frame(frame, direction) else: - yield frame + await self.push_frame(frame, direction) -class InboundSoundEffectWrapper(AIService): +class InboundSoundEffectWrapper(FrameProcessor): def __init__(self): - pass + super().__init__() - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: + async def process_frame(self, frame: Frame, direction: FrameDirection): if isinstance(frame, LLMMessagesFrame): - yield AudioFrame(sounds["ding2.wav"]) - # In case anything else up the stack needs it - yield frame + await self.push_frame(sounds["ding2.wav"]) + # In case anything else downstream needs it + await self.push_frame(frame, direction) else: - yield frame + await self.push_frame(frame, direction) async def main(room_url: str, token): @@ -77,10 +88,7 @@ async def main(room_url: str, token): room_url, token, "Respond bot", - duration_minutes=5, - mic_enabled=True, - mic_sample_rate=16000, - camera_enabled=False, + DailyParams(audio_out_enabled=True, transcription_enabled=True) ) llm = OpenAILLMService( @@ -100,24 +108,27 @@ async def main(room_url: str, token): }, ] - tma_in = LLMUserContextAggregator( - messages, transport._my_participant_id) - tma_out = LLMAssistantContextAggregator( - messages, transport._my_participant_id - ) + tma_in = LLMUserContextAggregator(messages) + tma_out = LLMAssistantContextAggregator(messages) out_sound = OutboundSoundEffectWrapper() in_sound = InboundSoundEffectWrapper() fl = FrameLogger("LLM Out") fl2 = FrameLogger("Transcription In") - pipeline = Pipeline([tma_in, in_sound, fl2, llm, tma_out, fl, tts, out_sound]) + pipeline = Pipeline([transport.input(), tma_in, in_sound, fl2, llm, + tma_out, fl, tts, out_sound, transport.output()]) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + transport.capture_participant_transcription(participant["id"]) + await tts.say("Hi, I'm listening!") + await transport.send_audio(sounds["ding1.wav"]) + + runner = PipelineRunner() - @transport.event_handler("on_first_other_participant_joined") - async def on_first_other_participant_joined(transport, participant): - await transport.say("Hi, I'm listening!", tts) - await transport.send_queue.put(AudioFrame(sounds["ding1.wav"])) + task = PipelineTask(pipeline) - await 
asyncio.gather(transport.run(pipeline)) + await runner.run(task) if __name__ == "__main__": diff --git a/examples/foundational/12-describe-video.py b/examples/foundational/12-describe-video.py index 62e116020..feef343fc 100644 --- a/examples/foundational/12-describe-video.py +++ b/examples/foundational/12-describe-video.py @@ -1,38 +1,50 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio import aiohttp -import logging import os - -from typing import AsyncGenerator - -from dailyai.pipeline.aggregators import FrameProcessor, UserResponseAggregator, VisionImageFrameAggregator - -from dailyai.pipeline.frames import Frame, TextFrame, UserImageRequestFrame -from dailyai.pipeline.pipeline import Pipeline -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.services.moondream_ai_service import MoondreamService -from dailyai.transports.daily_transport import DailyTransport +import sys + +from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.processors.aggregators.user_response import UserResponseAggregator +from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.services.moondream import MoondreamService +from pipecat.transports.services.daily import DailyParams, DailyTransport +from pipecat.vad.silero import SileroVAD from runner import configure +from loguru import logger + from dotenv import load_dotenv load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") class UserImageRequester(FrameProcessor): - participant_id: str + + def __init__(self, participant_id: str | None = None): + super().__init__() + self._participant_id = participant_id def set_participant_id(self, participant_id: str): - self.participant_id = participant_id + self._participant_id = participant_id - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if self.participant_id and isinstance(frame, TextFrame): - yield UserImageRequestFrame(self.participant_id) - yield frame + async def process_frame(self, frame: Frame, direction: FrameDirection): + if self._participant_id and isinstance(frame, TextFrame): + await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM) + await self.push_frame(frame, direction) async def main(room_url: str, token): @@ -41,14 +53,15 @@ async def main(room_url: str, token): room_url, token, "Describe participant video", - duration_minutes=5, - mic_enabled=True, - mic_sample_rate=16000, - vad_enabled=True, - start_transcription=True, - video_rendering_enabled=True + DailyParams( + audio_in_enabled=True, # This is so Silero VAD can get audio data + audio_out_enabled=True, + transcription_enabled=True + ) ) + vad = SileroVAD() + tts = ElevenLabsTTSService( aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), @@ -70,15 +83,21 @@ async def main(room_url: str, token): voice_id=os.getenv("ELEVENLABS_VOICE_ID"), ) - @transport.event_handler("on_first_other_participant_joined") - async def 
on_first_other_participant_joined(transport, participant): - await transport.say("Hi there! Feel free to ask me what I see.", tts) - transport.render_participant_video(participant["id"], framerate=0) + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + await tts.say("Hi there! Feel free to ask me what I see.") + transport.capture_participant_video(participant["id"], framerate=0) + transport.capture_participant_transcription(participant["id"]) image_requester.set_participant_id(participant["id"]) - pipeline = Pipeline([user_response, image_requester, vision_aggregator, moondream, tts]) + pipeline = Pipeline([transport.input(), vad, user_response, image_requester, + vision_aggregator, moondream, tts, transport.output()]) + + task = PipelineTask(pipeline) + + runner = PipelineRunner() - await transport.run(pipeline) + await runner.run(task) if __name__ == "__main__": (url, token) = configure() diff --git a/examples/foundational/13-whisper-transcription.py b/examples/foundational/13-whisper-transcription.py index 054cf8450..2e0ec4c2c 100644 --- a/examples/foundational/13-whisper-transcription.py +++ b/examples/foundational/13-whisper-transcription.py @@ -1,56 +1,56 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio -import logging +import sys -from dailyai.pipeline.frames import EndFrame, TranscriptionFrame -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.whisper_ai_services import WhisperSTTService -from dailyai.pipeline.pipeline import Pipeline +from pipecat.frames.frames import Frame, TranscriptionFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.services.whisper import WhisperSTTService +from pipecat.transports.services.daily import DailyParams, DailyTransport from runner import configure +from loguru import logger + from dotenv import load_dotenv load_dotenv(override=True) -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +class TranscriptionLogger(FrameProcessor): + + def __init__(self): + super().__init__() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, TranscriptionFrame): + print(f"Transcription: {frame.text}") async def main(room_url: str): - transport = DailyTransport( - room_url, - None, - "Transcription bot", - start_transcription=False, - mic_enabled=False, - camera_enabled=False, - speaker_enabled=True, - ) + transport = DailyTransport(room_url, None, "Transcription bot", + DailyParams(audio_in_enabled=True)) stt = WhisperSTTService() - transcription_output_queue = asyncio.Queue() - transport_done = asyncio.Event() + tl = TranscriptionLogger() - pipeline = Pipeline([stt], source=transport.receive_queue, sink=transcription_output_queue) + pipeline = Pipeline([transport.input(), stt, tl]) - async def handle_transcription(): - print("`````````TRANSCRIPTION`````````") - while not transport_done.is_set(): - item = await transcription_output_queue.get() - print("got item from queue", item) - if isinstance(item, TranscriptionFrame): - print(item.text) - elif isinstance(item, EndFrame): - break - print("handle_transcription done") + 
task = PipelineTask(pipeline) - async def run_until_done(): - await transport.run() - transport_done.set() - print("run_until_done done") + runner = PipelineRunner() - await asyncio.gather(run_until_done(), pipeline.run_pipeline(), handle_transcription()) + await runner.run(task) if __name__ == "__main__": diff --git a/examples/foundational/13a-whisper-local.py b/examples/foundational/13a-whisper-local.py index 93ba93e4b..8bdffc1ea 100644 --- a/examples/foundational/13a-whisper-local.py +++ b/examples/foundational/13a-whisper-local.py @@ -1,51 +1,58 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio -import logging +import sys + +from pipecat.frames.frames import Frame, TranscriptionFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.services.whisper import WhisperSTTService +from pipecat.transports.base_transport import TransportParams +from pipecat.transports.local.audio import LocalAudioTransport + +from runner import configure + +from loguru import logger + +from dotenv import load_dotenv +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + -from dailyai.pipeline.frames import EndFrame, TranscriptionFrame -from dailyai.transports.local_transport import LocalTransport -from dailyai.services.whisper_ai_services import WhisperSTTService -from dailyai.pipeline.pipeline import Pipeline +class TranscriptionLogger(FrameProcessor): -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) + def __init__(self): + super().__init__() + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, TranscriptionFrame): + print(f"Transcription: {frame.text}") -async def main(): - meeting_duration_minutes = 1 - transport = LocalTransport( - mic_enabled=True, - camera_enabled=False, - speaker_enabled=True, - duration_minutes=meeting_duration_minutes, - ) +async def main(room_url: str): + transport = LocalAudioTransport(TransportParams(audio_in_enabled=True)) stt = WhisperSTTService() - transcription_output_queue = asyncio.Queue() - transport_done = asyncio.Event() + tl = TranscriptionLogger() - pipeline = Pipeline([stt], source=transport.receive_queue, sink=transcription_output_queue) + pipeline = Pipeline([transport.input(), stt, tl]) - async def handle_transcription(): - print("`````````TRANSCRIPTION`````````") - while not transport_done.is_set(): - item = await transcription_output_queue.get() - print("got item from queue", item) - if isinstance(item, TranscriptionFrame): - print(item.text) - elif isinstance(item, EndFrame): - break - print("handle_transcription done") + task = PipelineTask(pipeline) - async def run_until_done(): - await transport.run() - transport_done.set() - print("run_until_done done") + runner = PipelineRunner() - await asyncio.gather(run_until_done(), pipeline.run_pipeline(), handle_transcription()) + await runner.run(task) if __name__ == "__main__": - asyncio.run(main()) + (url, token) = configure() + asyncio.run(main(url)) diff --git a/examples/foundational/14-render-remote-participant.py b/examples/foundational/14-render-remote-participant.py deleted file mode 100644 index 7c2750754..000000000 --- a/examples/foundational/14-render-remote-participant.py +++ /dev/null @@ -1,52 +0,0 
@@ -import asyncio -import logging - -from typing import AsyncGenerator - -from dailyai.pipeline.aggregators import FrameProcessor - -from dailyai.pipeline.frames import ImageFrame, Frame, UserImageFrame -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport - -from runner import configure - -from dotenv import load_dotenv -load_dotenv(override=True) - -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) - - -class UserImageProcessor(FrameProcessor): - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, UserImageFrame): - yield ImageFrame(frame.image, frame.size) - else: - yield frame - - -async def main(room_url: str, token): - transport = DailyTransport( - room_url, - token, - "Render participant video", - camera_width=1280, - camera_height=720, - camera_enabled=True, - video_rendering_enabled=True - ) - - @ transport.event_handler("on_first_other_participant_joined") - async def on_first_other_participant_joined(transport, participant): - transport.render_participant_video(participant["id"]) - - pipeline = Pipeline([UserImageProcessor()]) - - await asyncio.gather(transport.run(pipeline)) - -if __name__ == "__main__": - (url, token) = configure() - asyncio.run(main(url, token)) diff --git a/examples/foundational/14a-local-render-remote-participant.py b/examples/foundational/14a-local-render-remote-participant.py deleted file mode 100644 index 7614ef21d..000000000 --- a/examples/foundational/14a-local-render-remote-participant.py +++ /dev/null @@ -1,71 +0,0 @@ -import asyncio -import logging -import tkinter as tk - -from typing import AsyncGenerator - -from dailyai.pipeline.aggregators import FrameProcessor - -from dailyai.pipeline.frames import ImageFrame, Frame, UserImageFrame -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport - -from dailyai.transports.local_transport import LocalTransport -from runner import configure - -from dotenv import load_dotenv -load_dotenv(override=True) - -logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") -logger.setLevel(logging.DEBUG) - - -class UserImageProcessor(FrameProcessor): - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, UserImageFrame): - yield ImageFrame(frame.image, frame.size) - else: - yield frame - - -async def main(room_url: str, token): - tk_root = tk.Tk() - tk_root.title("dailyai") - - local_transport = LocalTransport( - tk_root=tk_root, - camera_enabled=True, - camera_width=1280, - camera_height=720 - ) - - transport = DailyTransport( - room_url, - token, - "Render participant video", - video_rendering_enabled=True - ) - - @transport.event_handler("on_first_other_participant_joined") - async def on_first_other_participant_joined(transport, participant): - transport.render_participant_video(participant["id"]) - - async def run_tk(): - while not transport._stop_threads.is_set(): - tk_root.update() - tk_root.update_idletasks() - await asyncio.sleep(0.1) - - local_pipeline = Pipeline([UserImageProcessor()], source=transport.receive_queue) - - await asyncio.gather( - transport.run(), - local_transport.run(local_pipeline, override_pipeline_source_queue=False), - run_tk() - ) - -if __name__ == "__main__": - (url, token) = configure() - asyncio.run(main(url, token)) diff --git 
a/examples/foundational/websocket-server/frames.proto b/examples/foundational/websocket-server/frames.proto index 7ecea6d25..830e3062c 100644 --- a/examples/foundational/websocket-server/frames.proto +++ b/examples/foundational/websocket-server/frames.proto @@ -1,6 +1,6 @@ syntax = "proto3"; -package dailyai_proto; +package pipecat_proto; message TextFrame { string text = 1; diff --git a/examples/foundational/websocket-server/index.html b/examples/foundational/websocket-server/index.html index 77be13518..a38e1e78b 100644 --- a/examples/foundational/websocket-server/index.html +++ b/examples/foundational/websocket-server/index.html @@ -28,7 +28,7 @@

WebSocket Audio Stream

const proto = protobuf.load("frames.proto", (err, root) => { if (err) throw err; - frame = root.lookupType("dailyai_proto.Frame"); + frame = root.lookupType("pipecat_proto.Frame"); }); function initWebSocket() { diff --git a/examples/foundational/websocket-server/sample.py b/examples/foundational/websocket-server/sample.py index 22792270e..b3a4a731d 100644 --- a/examples/foundational/websocket-server/sample.py +++ b/examples/foundational/websocket-server/sample.py @@ -2,15 +2,15 @@ import aiohttp import logging import os -from dailyai.pipeline.frame_processor import FrameProcessor -from dailyai.pipeline.frames import TextFrame, TranscriptionFrame -from dailyai.pipeline.pipeline import Pipeline -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.transports.websocket_transport import WebsocketTransport -from dailyai.services.whisper_ai_services import WhisperSTTService +from pipecat.pipeline.frame_processor import FrameProcessor +from pipecat.pipeline.frames import TextFrame, TranscriptionFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService +from pipecat.transports.websocket_transport import WebsocketTransport +from pipecat.services.whisper_ai_services import WhisperSTTService logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") +logger = logging.getLogger("pipecat") logger.setLevel(logging.DEBUG) diff --git a/examples/image-gen.py b/examples/image-gen.py index 30d207447..d9b2cdc1e 100644 --- a/examples/image-gen.py +++ b/examples/image-gen.py @@ -5,11 +5,10 @@ import urllib.parse import random -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService -from dailyai.pipeline.frames import Frame, FrameType -from dailyai.services.fal_ai_services import FalImageGenService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService +from pipecat.transports.daily_transport import DailyTransport +from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService +from pipecat.pipeline.frames import Frame +from pipecat.services.fal_ai_services import FalImageGenService async def main(room_url: str, token): diff --git a/examples/internal/11a-dial-out.py b/examples/internal/11a-dial-out.py index 2f3c456fc..f5d013eb1 100644 --- a/examples/internal/11a-dial-out.py +++ b/examples/internal/11a-dial-out.py @@ -3,11 +3,11 @@ import os import wave -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService -from dailyai.pipeline.aggregators import LLMContextAggregator -from dailyai.services.ai_services import AIService, FrameLogger -from dailyai.pipeline.frames import Frame, AudioFrame, LLMResponseEndFrame, LLMMessagesFrame +from pipecat.transports.daily_transport import DailyTransport +from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService +from pipecat.pipeline.aggregators import LLMContextAggregator +from pipecat.services.ai_services import AIService, FrameLogger +from pipecat.pipeline.frames import Frame, AudioFrame, LLMResponseEndFrame, LLMMessagesFrame from typing import AsyncGenerator from runner import configure diff --git a/examples/starter-apps/chatbot.py b/examples/starter-apps/chatbot.py index a46f54c5a..c649d8e6a 100644 --- a/examples/starter-apps/chatbot.py +++ b/examples/starter-apps/chatbot.py @@ -5,11 +5,11 @@ from 
PIL import Image from typing import AsyncGenerator -from dailyai.pipeline.aggregators import ( +from pipecat.pipeline.aggregators import ( LLMAssistantResponseAggregator, LLMUserResponseAggregator, ) -from dailyai.pipeline.frames import ( +from pipecat.pipeline.frames import ( ImageFrame, SpriteFrame, Frame, @@ -18,11 +18,11 @@ AudioFrame, PipelineStartedFrame, ) -from dailyai.services.ai_services import AIService -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.open_ai_services import OpenAILLMService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService +from pipecat.services.ai_services import AIService +from pipecat.pipeline.pipeline import Pipeline +from pipecat.transports.daily_transport import DailyTransport +from pipecat.services.open_ai_services import OpenAILLMService +from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService from runner import configure @@ -30,7 +30,7 @@ load_dotenv(override=True) logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") +logger = logging.getLogger("pipecat") logger.setLevel(logging.DEBUG) sprites = [] diff --git a/examples/starter-apps/patient-intake.py b/examples/starter-apps/patient-intake.py index d8b11a93f..2f3823037 100644 --- a/examples/starter-apps/patient-intake.py +++ b/examples/starter-apps/patient-intake.py @@ -7,26 +7,26 @@ import re import wave from typing import AsyncGenerator, List -from dailyai.pipeline.opeanai_llm_aggregator import ( +from pipecat.pipeline.opeanai_llm_aggregator import ( OpenAIAssistantContextAggregator, OpenAIUserContextAggregator, ) -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.openai_llm_context import OpenAILLMContext -from dailyai.services.open_ai_services import OpenAILLMService -# from dailyai.services.deepgram_ai_services import DeepgramTTSService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.services.fireworks_ai_services import FireworksLLMService -from dailyai.pipeline.frames import ( +from pipecat.pipeline.pipeline import Pipeline +from pipecat.transports.daily_transport import DailyTransport +from pipecat.services.openai_llm_context import OpenAILLMContext +from pipecat.services.open_ai_services import OpenAILLMService +# from pipecat.services.deepgram_ai_services import DeepgramTTSService +from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService +from pipecat.services.fireworks_ai_services import FireworksLLMService +from pipecat.pipeline.frames import ( Frame, LLMFunctionCallFrame, LLMFunctionStartFrame, AudioFrame, ) -from dailyai.pipeline.openai_frames import OpenAILLMContextFrame -from dailyai.services.ai_services import FrameLogger, AIService +from pipecat.pipeline.openai_frames import OpenAILLMContextFrame +from pipecat.services.ai_services import FrameLogger, AIService from openai._types import NotGiven, NOT_GIVEN from openai.types.chat import ( @@ -39,7 +39,7 @@ load_dotenv(override=True) logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") +logger = logging.getLogger("pipecat") logger.setLevel(logging.DEBUG) sounds = {} diff --git a/examples/starter-apps/storybot.py b/examples/starter-apps/storybot.py index 69be94095..af23c7ec9 100644 --- a/examples/starter-apps/storybot.py +++ b/examples/starter-apps/storybot.py @@ -9,20 +9,21 @@ 
from typing import AsyncGenerator from PIL import Image -from dailyai.pipeline.pipeline import Pipeline -from dailyai.pipeline.frame_processor import FrameProcessor -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService -from dailyai.services.fal_ai_services import FalImageGenService -from dailyai.services.open_ai_services import OpenAILLMService -from dailyai.services.deepgram_ai_services import DeepgramTTSService -from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService -from dailyai.pipeline.aggregators import ( +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.frame_processor import FrameProcessor +from pipecat.services.live_stream import LiveStream +from pipecat.transports.daily_transport import DailyTransport +from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService +from pipecat.services.fal_ai_services import FalImageGenService +from pipecat.services.open_ai_services import OpenAILLMService +from pipecat.services.deepgram_ai_services import DeepgramTTSService +from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService +from pipecat.pipeline.aggregators import ( LLMAssistantContextAggregator, LLMAssistantResponseAggregator, LLMUserResponseAggregator, ) -from dailyai.pipeline.frames import ( +from pipecat.pipeline.frames import ( EndPipeFrame, LLMMessagesFrame, Frame, @@ -32,7 +33,7 @@ ImageFrame, UserStoppedSpeakingFrame, ) -from dailyai.services.ai_services import FrameLogger, AIService +from pipecat.services.ai_services import FrameLogger, AIService from runner import configure @@ -40,7 +41,7 @@ load_dotenv(override=True) logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") -logger = logging.getLogger("dailyai") +logger = logging.getLogger("pipecat") logger.setLevel(logging.DEBUG) sounds = {} @@ -261,6 +262,10 @@ async def storytime(): ) await local_pipeline.run_pipeline() + pipeline = Pipeline([llm, lca, tts, ls_sink]) + pipeline.queue_frames([...]) + pipeline.run() + fl = FrameLogger("### After Image Generation") pipeline = Pipeline( processors=[ diff --git a/examples/starter-apps/translator.py b/examples/starter-apps/translator.py index 382fc57f4..94a0fe1e9 100644 --- a/examples/starter-apps/translator.py +++ b/examples/starter-apps/translator.py @@ -4,21 +4,21 @@ import os from typing import AsyncGenerator -from dailyai.pipeline.aggregators import ( +from pipecat.pipeline.aggregators import ( SentenceAggregator, ) -from dailyai.pipeline.frames import ( +from pipecat.pipeline.frames import ( Frame, LLMMessagesFrame, TextFrame, SendAppMessageFrame, ) -from dailyai.pipeline.frame_processor import FrameProcessor -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.daily_transport import DailyTransport -from dailyai.services.azure_ai_services import AzureTTSService -from dailyai.services.open_ai_services import OpenAILLMService -from dailyai.pipeline.aggregators import LLMFullResponseAggregator +from pipecat.pipeline.frame_processor import FrameProcessor +from pipecat.pipeline.pipeline import Pipeline +from pipecat.transports.daily_transport import DailyTransport +from pipecat.services.azure_ai_services import AzureTTSService +from pipecat.services.open_ai_services import OpenAILLMService +from pipecat.pipeline.aggregators import LLMFullResponseAggregator from runner import configure @@ -28,7 +28,7 @@ load_dotenv(override=True) logging.basicConfig(format=f"%(levelno)s %(asctime)s 
%(message)s") -logger = logging.getLogger("dailyai") +logger = logging.getLogger("pipecat") logger.setLevel(logging.DEBUG) """ diff --git a/linux-py3.10-requirements.txt b/linux-py3.10-requirements.txt index cc37fce96..f2fa829e1 100644 --- a/linux-py3.10-requirements.txt +++ b/linux-py3.10-requirements.txt @@ -4,14 +4,14 @@ # # pip-compile --all-extras pyproject.toml # -aiohttp==3.9.4 - # via dailyai (pyproject.toml) +aiohttp==3.9.5 + # via pipecat (pyproject.toml) aiosignal==1.3.1 # via aiohttp annotated-types==0.6.0 # via pydantic anthropic==0.20.0 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) anyio==4.3.0 # via # anthropic @@ -21,11 +21,11 @@ async-timeout==4.0.3 # via aiohttp attrs==23.2.0 # via aiohttp -av==11.0.0 +av==12.0.0 # via faster-whisper azure-cognitiveservices-speech==1.36.0 - # via dailyai (pyproject.toml) -blinker==1.7.0 + # via pipecat (pyproject.toml) +blinker==1.8.2 # via flask certifi==2024.2.2 # via @@ -38,23 +38,23 @@ click==8.1.7 # via flask coloredlogs==15.0.1 # via onnxruntime -ctranslate2==4.1.0 +ctranslate2==4.2.1 # via faster-whisper -daily-python==0.7.3 - # via dailyai (pyproject.toml) +daily-python==0.7.4 + # via pipecat (pyproject.toml) distro==1.9.0 # via # anthropic # openai einops==0.7.0 - # via dailyai (pyproject.toml) -exceptiongroup==1.2.0 + # via pipecat (pyproject.toml) +exceptiongroup==1.2.1 # via anyio -fal-client==0.2.2 - # via dailyai (pyproject.toml) -faster-whisper==1.0.1 - # via dailyai (pyproject.toml) -filelock==3.13.4 +fal-client==0.3.0 + # via pipecat (pyproject.toml) +faster-whisper==1.0.2 + # via pipecat (pyproject.toml) +filelock==3.14.0 # via # huggingface-hub # pyht @@ -63,10 +63,10 @@ filelock==3.13.4 # triton flask==3.0.3 # via - # dailyai (pyproject.toml) + # pipecat (pyproject.toml) # flask-cors -flask-cors==4.0.0 - # via dailyai (pyproject.toml) +flask-cors==4.0.1 + # via pipecat (pyproject.toml) flatbuffers==24.3.25 # via onnxruntime frozenlist==1.4.1 @@ -77,7 +77,7 @@ fsspec==2024.3.1 # via # huggingface-hub # torch -grpcio==1.62.1 +grpcio==1.63.0 # via pyht h11==0.14.0 # via httpcore @@ -90,7 +90,7 @@ httpx==0.27.0 # openai httpx-sse==0.4.0 # via fal-client -huggingface-hub==0.22.2 +huggingface-hub==0.23.0 # via # faster-whisper # timm @@ -104,12 +104,14 @@ idna==3.7 # httpx # requests # yarl -itsdangerous==2.1.2 +itsdangerous==2.2.0 # via flask -jinja2==3.1.3 +jinja2==3.1.4 # via # flask # torch +loguru==0.7.2 + # via pipecat (pyproject.toml) markupsafe==2.1.5 # via # jinja2 @@ -125,7 +127,7 @@ networkx==3.3 numpy==1.26.4 # via # ctranslate2 - # dailyai (pyproject.toml) + # pipecat (pyproject.toml) # onnxruntime # torchvision # transformers @@ -160,10 +162,10 @@ nvidia-nvjitlink-cu12==12.4.127 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 # via torch -onnxruntime==1.17.1 +onnxruntime==1.17.3 # via faster-whisper openai==1.14.3 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) packaging==24.0 # via # huggingface-hub @@ -171,38 +173,38 @@ packaging==24.0 # transformers pillow==10.3.0 # via - # dailyai (pyproject.toml) + # pipecat (pyproject.toml) # torchvision protobuf==4.25.3 # via # onnxruntime # pyht pyaudio==0.2.14 - # via dailyai (pyproject.toml) -pydantic==2.7.0 + # via pipecat (pyproject.toml) +pydantic==2.7.1 # via # anthropic # openai -pydantic-core==2.18.1 +pydantic-core==2.18.2 # via pydantic -pyht==0.0.27 - # via dailyai (pyproject.toml) +pyht==0.0.28 + # via pipecat (pyproject.toml) python-dotenv==1.0.1 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) 
pyyaml==6.0.1 # via # ctranslate2 # huggingface-hub # timm # transformers -regex==2023.12.25 +regex==2024.4.28 # via transformers requests==2.31.0 # via # huggingface-hub # pyht # transformers -safetensors==0.4.2 +safetensors==0.4.3 # via # timm # transformers @@ -217,7 +219,7 @@ sympy==1.12 # onnxruntime # torch timm==0.9.16 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) tokenizers==0.15.2 # via # anthropic @@ -225,28 +227,28 @@ tokenizers==0.15.2 # transformers torch==2.2.2 # via - # dailyai (pyproject.toml) + # pipecat (pyproject.toml) # timm # torchaudio # torchvision torchaudio==2.2.2 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) torchvision==0.17.2 # via timm -tqdm==4.66.2 +tqdm==4.66.4 # via # huggingface-hub # openai # transformers transformers==4.39.3 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) triton==2.2.0 # via torch typing-extensions==4.10.0 # via # anthropic # anyio - # dailyai (pyproject.toml) + # pipecat (pyproject.toml) # huggingface-hub # openai # pydantic @@ -255,8 +257,8 @@ typing-extensions==4.10.0 urllib3==2.2.1 # via requests websockets==12.0 - # via dailyai (pyproject.toml) -werkzeug==3.0.2 + # via pipecat (pyproject.toml) +werkzeug==3.0.3 # via flask yarl==1.9.4 # via aiohttp diff --git a/macos-py3.10-requirements.txt b/macos-py3.10-requirements.txt index ffa772b17..e6fbf2f65 100644 --- a/macos-py3.10-requirements.txt +++ b/macos-py3.10-requirements.txt @@ -4,14 +4,14 @@ # # pip-compile --all-extras pyproject.toml # -aiohttp==3.9.4 - # via dailyai (pyproject.toml) +aiohttp==3.9.5 + # via pipecat (pyproject.toml) aiosignal==1.3.1 # via aiohttp annotated-types==0.6.0 # via pydantic anthropic==0.20.0 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) anyio==4.3.0 # via # anthropic @@ -21,11 +21,11 @@ async-timeout==4.0.3 # via aiohttp attrs==23.2.0 # via aiohttp -av==11.0.0 +av==12.0.0 # via faster-whisper azure-cognitiveservices-speech==1.36.0 - # via dailyai (pyproject.toml) -blinker==1.7.0 + # via pipecat (pyproject.toml) +blinker==1.8.2 # via flask certifi==2024.2.2 # via @@ -38,23 +38,23 @@ click==8.1.7 # via flask coloredlogs==15.0.1 # via onnxruntime -ctranslate2==4.2.0 +ctranslate2==4.2.1 # via faster-whisper -daily-python==0.7.3 - # via dailyai (pyproject.toml) +daily-python==0.7.4 + # via pipecat (pyproject.toml) distro==1.9.0 # via # anthropic # openai einops==0.7.0 - # via dailyai (pyproject.toml) -exceptiongroup==1.2.0 + # via pipecat (pyproject.toml) +exceptiongroup==1.2.1 # via anyio -fal-client==0.2.2 - # via dailyai (pyproject.toml) -faster-whisper==1.0.1 - # via dailyai (pyproject.toml) -filelock==3.13.4 +fal-client==0.3.0 + # via pipecat (pyproject.toml) +faster-whisper==1.0.2 + # via pipecat (pyproject.toml) +filelock==3.14.0 # via # huggingface-hub # pyht @@ -62,10 +62,10 @@ filelock==3.13.4 # transformers flask==3.0.3 # via - # dailyai (pyproject.toml) + # pipecat (pyproject.toml) # flask-cors -flask-cors==4.0.0 - # via dailyai (pyproject.toml) +flask-cors==4.0.1 + # via pipecat (pyproject.toml) flatbuffers==24.3.25 # via onnxruntime frozenlist==1.4.1 @@ -76,7 +76,7 @@ fsspec==2024.3.1 # via # huggingface-hub # torch -grpcio==1.62.1 +grpcio==1.63.0 # via pyht h11==0.14.0 # via httpcore @@ -89,7 +89,7 @@ httpx==0.27.0 # openai httpx-sse==0.4.0 # via fal-client -huggingface-hub==0.22.2 +huggingface-hub==0.23.0 # via # faster-whisper # timm @@ -103,12 +103,14 @@ idna==3.7 # httpx # requests # yarl -itsdangerous==2.1.2 +itsdangerous==2.2.0 # via flask -jinja2==3.1.3 
+jinja2==3.1.4 # via # flask # torch +loguru==0.7.2 + # via pipecat (pyproject.toml) markupsafe==2.1.5 # via # jinja2 @@ -124,14 +126,14 @@ networkx==3.3 numpy==1.26.4 # via # ctranslate2 - # dailyai (pyproject.toml) + # pipecat (pyproject.toml) # onnxruntime # torchvision # transformers -onnxruntime==1.17.1 +onnxruntime==1.17.3 # via faster-whisper openai==1.14.3 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) packaging==24.0 # via # huggingface-hub @@ -139,38 +141,38 @@ packaging==24.0 # transformers pillow==10.3.0 # via - # dailyai (pyproject.toml) + # pipecat (pyproject.toml) # torchvision protobuf==4.25.3 # via # onnxruntime # pyht pyaudio==0.2.14 - # via dailyai (pyproject.toml) -pydantic==2.7.0 + # via pipecat (pyproject.toml) +pydantic==2.7.1 # via # anthropic # openai -pydantic-core==2.18.1 +pydantic-core==2.18.2 # via pydantic -pyht==0.0.27 - # via dailyai (pyproject.toml) +pyht==0.0.28 + # via pipecat (pyproject.toml) python-dotenv==1.0.1 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) pyyaml==6.0.1 # via # ctranslate2 # huggingface-hub # timm # transformers -regex==2023.12.25 +regex==2024.4.28 # via transformers requests==2.31.0 # via # huggingface-hub # pyht # transformers -safetensors==0.4.2 +safetensors==0.4.3 # via # timm # transformers @@ -185,7 +187,7 @@ sympy==1.12 # onnxruntime # torch timm==0.9.16 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) tokenizers==0.15.2 # via # anthropic @@ -193,26 +195,26 @@ tokenizers==0.15.2 # transformers torch==2.2.2 # via - # dailyai (pyproject.toml) + # pipecat (pyproject.toml) # timm # torchaudio # torchvision torchaudio==2.2.2 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) torchvision==0.17.2 # via timm -tqdm==4.66.2 +tqdm==4.66.4 # via # huggingface-hub # openai # transformers transformers==4.39.3 - # via dailyai (pyproject.toml) + # via pipecat (pyproject.toml) typing-extensions==4.10.0 # via # anthropic # anyio - # dailyai (pyproject.toml) + # pipecat (pyproject.toml) # huggingface-hub # openai # pydantic @@ -221,8 +223,8 @@ typing-extensions==4.10.0 urllib3==2.2.1 # via requests websockets==12.0 - # via dailyai (pyproject.toml) -werkzeug==3.0.2 + # via pipecat (pyproject.toml) +werkzeug==3.0.3 # via flask yarl==1.9.4 # via aiohttp diff --git a/pyproject.toml b/pyproject.toml index c83aa74ba..a05ee97f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,9 +3,9 @@ requires = ["setuptools>=64", "setuptools_scm>=8"] build-backend = "setuptools.build_meta" [project] -name = "dailyai" +name = "pipecat" dynamic = ["version"] -description = "An open source framework for real-time, multi-modal, conversational AI applications" +description = "An open source framework for voice (and multimodal) assistants" license = { text = "BSD 2-Clause License" } readme = "README.md" requires-python = ">=3.7" @@ -22,22 +22,23 @@ classifiers = [ dependencies = [ "aiohttp~=3.9.0", "numpy~=1.26.0", + "loguru~=0.7.0", "Pillow~=10.3.0", "typing-extensions~=4.10.0", ] [project.urls] -Source = "https://github.com/daily-co/dailyai" +Source = "https://github.com/daily-co/pipecat" Website = "https://daily.co" [project.optional-dependencies] anthropic = [ "anthropic~=0.20.0" ] +audio = [ "pyaudio~=0.2.0" ] azure = [ "azure-cognitiveservices-speech~=1.36.0" ] daily = [ "daily-python~=0.7.0" ] examples = [ "python-dotenv~=1.0.0", "flask~=3.0.0", "flask_cors~=4.0.0" ] -fal = [ "fal-client~=0.2.0" ] +fal = [ "fal-client~=0.3.0" ] fireworks = [ "openai~=1.14.0" ] -local = [ "pyaudio~=0.2.0" ] 
moondream = [ "einops~=0.7.0", "timm~=0.9.0", "transformers~=4.39.0" ] openai = [ "openai~=1.14.0" ] playht = [ "pyht~=0.0.26" ] diff --git a/src/dailyai/pipeline/aggregators.py b/src/dailyai/pipeline/aggregators.py deleted file mode 100644 index 81ea5815c..000000000 --- a/src/dailyai/pipeline/aggregators.py +++ /dev/null @@ -1,549 +0,0 @@ -import asyncio -import re -import time - -from dailyai.pipeline.frame_processor import FrameProcessor - -from dailyai.pipeline.frames import ( - EndFrame, - EndPipeFrame, - Frame, - ImageFrame, - InterimTranscriptionFrame, - LLMMessagesFrame, - LLMResponseEndFrame, - LLMResponseStartFrame, - TextFrame, - TranscriptionFrame, - UserStartedSpeakingFrame, - UserStoppedSpeakingFrame, - VisionImageFrame, -) -from dailyai.pipeline.pipeline import Pipeline -from dailyai.services.ai_services import AIService - -from typing import AsyncGenerator, Coroutine, List - - -class ResponseAggregator(FrameProcessor): - """This frame processor aggregates frames between a start and an end frame - into complete text frame sentences. - - For example, frame input/output: - UserStartedSpeakingFrame() -> None - TranscriptionFrame("Hello,") -> None - TranscriptionFrame(" world.") -> None - UserStoppedSpeakingFrame() -> TextFrame("Hello world.") - - Doctest: - >>> async def print_frames(aggregator, frame): - ... async for frame in aggregator.process_frame(frame): - ... if isinstance(frame, TextFrame): - ... print(frame.text) - - >>> aggregator = ResponseAggregator(start_frame = UserStartedSpeakingFrame, - ... end_frame=UserStoppedSpeakingFrame, - ... accumulator_frame=TranscriptionFrame, - ... pass_through=False) - >>> asyncio.run(print_frames(aggregator, UserStartedSpeakingFrame())) - >>> asyncio.run(print_frames(aggregator, TranscriptionFrame("Hello,", 1, 1))) - >>> asyncio.run(print_frames(aggregator, TranscriptionFrame("world.", 1, 2))) - >>> asyncio.run(print_frames(aggregator, UserStoppedSpeakingFrame())) - Hello, world. - - """ - - def __init__( - self, - *, - start_frame, - end_frame, - accumulator_frame, - pass_through=True, - ): - self.aggregation = "" - self.aggregating = False - self._start_frame = start_frame - self._end_frame = end_frame - self._accumulator_frame = accumulator_frame - self._pass_through = pass_through - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, self._start_frame): - self.aggregating = True - elif isinstance(frame, self._end_frame): - self.aggregating = False - # Sometimes VAD triggers quickly on and off. 
If we don't get any transcription, - # it creates empty LLM message queue frames - if len(self.aggregation) > 0: - output = self.aggregation - self.aggregation = "" - yield self._end_frame() - yield TextFrame(output.strip()) - elif isinstance(frame, self._accumulator_frame) and self.aggregating: - self.aggregation += f" {frame.text}" - if self._pass_through: - yield frame - else: - yield frame - - -class UserResponseAggregator(ResponseAggregator): - def __init__(self): - super().__init__( - start_frame=UserStartedSpeakingFrame, - end_frame=UserStoppedSpeakingFrame, - accumulator_frame=TranscriptionFrame, - pass_through=False, - ) - - -class LLMResponseAggregator(FrameProcessor): - - def __init__( - self, - *, - messages: list[dict] | None, - role: str, - start_frame, - end_frame, - accumulator_frame, - interim_accumulator_frame=None, - pass_through=True, - ): - self.aggregation = "" - self.aggregating = False - self.messages = messages - self._role = role - self._start_frame = start_frame - self._end_frame = end_frame - self._accumulator_frame = accumulator_frame - self._interim_accumulator_frame = interim_accumulator_frame - self._pass_through = pass_through - self._seen_start_frame = False - self._seen_end_frame = False - self._seen_interim_results = False - - # Use cases implemented: - # - # S: Start, E: End, T: Transcription, I: Interim, X: Text - # - # S E -> None - # S T E -> X - # S I T E -> X - # S I E T -> X - # S I E I T -> X - # - # The following case would not be supported: - # - # S I E T1 I T2 -> X - # - # and T2 would be dropped. - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if not self.messages: - return - - send_aggregation = False - - if isinstance(frame, self._start_frame): - self._seen_start_frame = True - self.aggregating = True - elif isinstance(frame, self._end_frame): - self._seen_end_frame = True - - # We might have received the end frame but we might still be - # aggregating (i.e. we have seen interim results but not the final - # text). - self.aggregating = self._seen_interim_results - - # Send the aggregation if we are not aggregating anymore (i.e. no - # more interim results received). - send_aggregation = not self.aggregating - elif isinstance(frame, self._accumulator_frame): - if self.aggregating: - self.aggregation += f" {frame.text}" - # We have receied a complete sentence, so if we have seen the - # end frame and we were still aggregating, it means we should - # send the aggregation. - send_aggregation = self._seen_end_frame - - if self._pass_through: - yield frame - - # We just got our final result, so let's reset interim results. 
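The start/end/interim bookkeeping above is easier to follow with a concrete frame sequence. The sketch below drives the LLMResponseAggregator from the dailyai API being removed here through the "S I E T" case in the comment table; the system message and transcript text are invented for illustration, and the keyword arguments mirror what LLMUserResponseAggregator passes to the base class.

import asyncio

from dailyai.pipeline.aggregators import LLMResponseAggregator
from dailyai.pipeline.frames import (
    InterimTranscriptionFrame,
    TranscriptionFrame,
    UserStartedSpeakingFrame,
    UserStoppedSpeakingFrame,
)

messages = [{"role": "system", "content": "You are a helpful assistant."}]

aggregator = LLMResponseAggregator(
    messages=messages,
    role="user",
    start_frame=UserStartedSpeakingFrame,
    end_frame=UserStoppedSpeakingFrame,
    accumulator_frame=TranscriptionFrame,
    interim_accumulator_frame=InterimTranscriptionFrame,
    pass_through=False,
)


async def feed(frame):
    async for out in aggregator.process_frame(frame):
        print(out)


# S I E T: the final transcription arrives after the user stopped speaking.
asyncio.run(feed(UserStartedSpeakingFrame()))
asyncio.run(feed(InterimTranscriptionFrame("Hello", "user-1", "0")))
asyncio.run(feed(UserStoppedSpeakingFrame()))
asyncio.run(feed(TranscriptionFrame("Hello there.", "user-1", "1")))
# Only the last call yields anything: a UserStoppedSpeakingFrame followed by an
# LLMMessagesFrame whose messages now end with
# {"role": "user", "content": " Hello there."}.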
- self._seen_interim_results = False - elif self._interim_accumulator_frame and isinstance(frame, self._interim_accumulator_frame): - self._seen_interim_results = True - else: - yield frame - - if send_aggregation and len(self.aggregation) > 0: - self.messages.append({"role": self._role, "content": self.aggregation}) - yield self._end_frame() - yield LLMMessagesFrame(self.messages) - # Reset - self.aggregation = "" - self._seen_start_frame = False - self._seen_end_frame = False - self._seen_interim_results = False - - -class LLMAssistantResponseAggregator(LLMResponseAggregator): - def __init__(self, messages: list[dict]): - super().__init__( - messages=messages, - role="assistant", - start_frame=LLMResponseStartFrame, - end_frame=LLMResponseEndFrame, - accumulator_frame=TextFrame, - ) - - -class LLMUserResponseAggregator(LLMResponseAggregator): - def __init__(self, messages: list[dict]): - super().__init__( - messages=messages, - role="user", - start_frame=UserStartedSpeakingFrame, - end_frame=UserStoppedSpeakingFrame, - accumulator_frame=TranscriptionFrame, - interim_accumulator_frame=InterimTranscriptionFrame, - pass_through=False, - ) - - -class LLMContextAggregator(AIService): - def __init__( - self, - messages: list[dict], - role: str, - bot_participant_id=None, - complete_sentences=True, - pass_through=True, - ): - super().__init__() - self.messages = messages - self.bot_participant_id = bot_participant_id - self.role = role - self.sentence = "" - self.complete_sentences = complete_sentences - self.pass_through = pass_through - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - # We don't do anything with non-text frames, pass it along to next in - # the pipeline. - if not isinstance(frame, TextFrame): - yield frame - return - - # Ignore transcription frames from the bot - if isinstance(frame, TranscriptionFrame): - if frame.participantId == self.bot_participant_id: - return - - # The common case for "pass through" is receiving frames from the LLM that we'll - # use to update the "assistant" LLM messages, but also passing the text frames - # along to a TTS service to be spoken to the user. - if self.pass_through: - yield frame - - # TODO: split up transcription by participant - if self.complete_sentences: - # type: ignore -- the linter thinks this isn't a TextFrame, even - # though we check it above - self.sentence += frame.text - if self.sentence.endswith((".", "?", "!")): - self.messages.append( - {"role": self.role, "content": self.sentence}) - self.sentence = "" - yield LLMMessagesFrame(self.messages) - else: - # type: ignore -- the linter thinks this isn't a TextFrame, even - # though we check it above - self.messages.append({"role": self.role, "content": frame.text}) - yield LLMMessagesFrame(self.messages) - - -class LLMUserContextAggregator(LLMContextAggregator): - def __init__( - self, - messages: list[dict], - bot_participant_id=None, - complete_sentences=True): - super().__init__( - messages, - "user", - bot_participant_id, - complete_sentences, - pass_through=False) - - -class LLMAssistantContextAggregator(LLMContextAggregator): - def __init__( - self, - messages: list[dict], - bot_participant_id=None, - complete_sentences=True): - super().__init__( - messages, - "assistant", - bot_participant_id, - complete_sentences, - pass_through=True, - ) - - -class SentenceAggregator(FrameProcessor): - """This frame processor aggregates text frames into complete sentences. 
- - Frame input/output: - TextFrame("Hello,") -> None - TextFrame(" world.") -> TextFrame("Hello world.") - - Doctest: - >>> async def print_frames(aggregator, frame): - ... async for frame in aggregator.process_frame(frame): - ... print(frame.text) - - >>> aggregator = SentenceAggregator() - >>> asyncio.run(print_frames(aggregator, TextFrame("Hello,"))) - >>> asyncio.run(print_frames(aggregator, TextFrame(" world."))) - Hello, world. - """ - - def __init__(self): - self.aggregation = "" - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, TextFrame): - m = re.search("(.*[?.!])(.*)", frame.text) - if m: - yield TextFrame(self.aggregation + m.group(1)) - self.aggregation = m.group(2) - else: - self.aggregation += frame.text - elif isinstance(frame, EndFrame): - if self.aggregation: - yield TextFrame(self.aggregation) - yield frame - else: - yield frame - - -class LLMFullResponseAggregator(FrameProcessor): - """This class aggregates Text frames until it receives a - LLMResponseEndFrame, then emits the concatenated text as - a single text frame. - - given the following frames: - - TextFrame("Hello,") - TextFrame(" world.") - TextFrame(" I am") - TextFrame(" an LLM.") - LLMResponseEndFrame()] - - this processor will yield nothing for the first 4 frames, then - - TextFrame("Hello, world. I am an LLM.") - LLMResponseEndFrame() - - when passed the last frame. - - >>> async def print_frames(aggregator, frame): - ... async for frame in aggregator.process_frame(frame): - ... if isinstance(frame, TextFrame): - ... print(frame.text) - ... else: - ... print(frame.__class__.__name__) - - >>> aggregator = LLMFullResponseAggregator() - >>> asyncio.run(print_frames(aggregator, TextFrame("Hello,"))) - >>> asyncio.run(print_frames(aggregator, TextFrame(" world."))) - >>> asyncio.run(print_frames(aggregator, TextFrame(" I am"))) - >>> asyncio.run(print_frames(aggregator, TextFrame(" an LLM."))) - >>> asyncio.run(print_frames(aggregator, LLMResponseEndFrame())) - Hello, world. I am an LLM. - LLMResponseEndFrame - """ - - def __init__(self): - self.aggregation = "" - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, TextFrame): - self.aggregation += frame.text - elif isinstance(frame, LLMResponseEndFrame): - yield TextFrame(self.aggregation) - yield frame - self.aggregation = "" - else: - yield frame - - -class StatelessTextTransformer(FrameProcessor): - """This processor calls the given function on any text in a text frame. - - >>> async def print_frames(aggregator, frame): - ... async for frame in aggregator.process_frame(frame): - ... print(frame.text) - - >>> aggregator = StatelessTextTransformer(lambda x: x.upper()) - >>> asyncio.run(print_frames(aggregator, TextFrame("Hello"))) - HELLO - """ - - def __init__(self, transform_fn): - self.transform_fn = transform_fn - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, TextFrame): - result = self.transform_fn(frame.text) - if isinstance(result, Coroutine): - result = await result - - yield TextFrame(result) - else: - yield frame - - -class ParallelPipeline(FrameProcessor): - """Run multiple pipelines in parallel. - - This class takes frames from its source queue and sends them to each - sub-pipeline. Each sub-pipeline emits its frames into this class's - sink queue. 
No guarantees are made about the ordering of frames in - the sink queue (that is, no sub-pipeline has higher priority than - any other, frames are put on the sink in the order they're emitted - by the sub-pipelines). - - After each frame is taken from this class's source queue and placed - in each sub-pipeline's source queue, an EndPipeFrame is put on each - sub-pipeline's source queue. This indicates to the sub-pipe runner - that it should exit. - - Since frame handlers pass through unhandled frames by convention, this - class de-dupes frames in its sink before yielding them. - """ - - def __init__(self, pipeline_definitions: List[List[FrameProcessor]]): - self.sources = [asyncio.Queue() for _ in pipeline_definitions] - self.sink: asyncio.Queue[Frame] = asyncio.Queue() - self.pipelines: list[Pipeline] = [ - Pipeline( - pipeline_definition, - source, - self.sink, - ) - for source, pipeline_definition in zip(self.sources, pipeline_definitions) - ] - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - for source in self.sources: - await source.put(frame) - await source.put(EndPipeFrame()) - - await asyncio.gather(*[pipeline.run_pipeline() for pipeline in self.pipelines]) - - seen_ids = set() - while not self.sink.empty(): - frame = await self.sink.get() - - # de-dup frames. Because the convention is to yield a frame that isn't processed, - # each pipeline will likely yield the same frame, so we will end up with _n_ copies - # of unprocessed frames where _n_ is the number of parallel pipes that don't - # process that frame. - if id(frame) in seen_ids: - continue - seen_ids.add(id(frame)) - - # Skip passing along EndPipeFrame, because we use them - # for our own flow control. - if not isinstance(frame, EndPipeFrame): - yield frame - - -class GatedAggregator(FrameProcessor): - """Accumulate frames, with custom functions to start and stop accumulation. - Yields gate-opening frame before any accumulated frames, then ensuing frames - until and not including the gate-closed frame. - - >>> from dailyai.pipeline.frames import ImageFrame - - >>> async def print_frames(aggregator, frame): - ... async for frame in aggregator.process_frame(frame): - ... if isinstance(frame, TextFrame): - ... print(frame.text) - ... else: - ... print(frame.__class__.__name__) - - >>> aggregator = GatedAggregator( - ... gate_close_fn=lambda x: isinstance(x, LLMResponseStartFrame), - ... gate_open_fn=lambda x: isinstance(x, ImageFrame), - ... start_open=False) - >>> asyncio.run(print_frames(aggregator, TextFrame("Hello"))) - >>> asyncio.run(print_frames(aggregator, TextFrame("Hello again."))) - >>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0)))) - ImageFrame - Hello - Hello again. - >>> asyncio.run(print_frames(aggregator, TextFrame("Goodbye."))) - Goodbye. 
- """ - - def __init__(self, gate_open_fn, gate_close_fn, start_open): - self.gate_open_fn = gate_open_fn - self.gate_close_fn = gate_close_fn - self.gate_open = start_open - self.accumulator: List[Frame] = [] - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if self.gate_open: - if self.gate_close_fn(frame): - self.gate_open = False - else: - if self.gate_open_fn(frame): - self.gate_open = True - - if self.gate_open: - yield frame - if self.accumulator: - for frame in self.accumulator: - yield frame - self.accumulator = [] - else: - self.accumulator.append(frame) - - -class VisionImageFrameAggregator(FrameProcessor): - """This aggregator waits for a consecutive TextFrame and an - ImageFrame. After the ImageFrame arrives it will output a VisionImageFrame. - - >>> from dailyai.pipeline.frames import ImageFrame - - >>> async def print_frames(aggregator, frame): - ... async for frame in aggregator.process_frame(frame): - ... print(frame) - - >>> aggregator = VisionImageFrameAggregator() - >>> asyncio.run(print_frames(aggregator, TextFrame("What do you see?"))) - >>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0)))) - VisionImageFrame, text: What do you see?, image size: 0x0, buffer size: 0 B - - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._describe_text = None - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, TextFrame): - self._describe_text = frame.text - elif isinstance(frame, ImageFrame): - if self._describe_text: - yield VisionImageFrame(self._describe_text, frame.image, frame.size) - self._describe_text = None - else: - yield frame - else: - yield frame diff --git a/src/dailyai/pipeline/frame_processor.py b/src/dailyai/pipeline/frame_processor.py deleted file mode 100644 index e8c78e3e2..000000000 --- a/src/dailyai/pipeline/frame_processor.py +++ /dev/null @@ -1,34 +0,0 @@ -from abc import abstractmethod -from typing import AsyncGenerator - -from dailyai.pipeline.frames import ControlFrame, Frame - - -class FrameProcessor: - """This is the base class for all frame processors. Frame processors consume a frame - and yield 0 or more frames. Generally frame processors are used as part of a pipeline - where frames come from a source queue, are processed by a series of frame processors, - then placed on a sink queue. - - By convention, FrameProcessors should immediately yield any frames they don't process. - - Stateful FrameProcessors should watch for the EndFrame and finalize their - output, eg. yielding an unfinished sentence if they're aggregating LLM output to full - sentences. EndFrame is also a chance to clean up any services that need to - be closed, del'd, etc. 
- """ - - @abstractmethod - async def process_frame( - self, frame: Frame - ) -> AsyncGenerator[Frame, None]: - """Process a single frame and yield 0 or more frames.""" - yield frame - - @abstractmethod - async def interrupted(self) -> None: - """Handle any cleanup if the pipeline was interrupted.""" - pass - - def __str__(self): - return self.__class__.__name__ diff --git a/src/dailyai/pipeline/frames.py b/src/dailyai/pipeline/frames.py deleted file mode 100644 index 28a920dd8..000000000 --- a/src/dailyai/pipeline/frames.py +++ /dev/null @@ -1,253 +0,0 @@ -from dataclasses import dataclass -from typing import Any, List - - -class Frame: - def __str__(self): - return f"{self.__class__.__name__}" - - -class ControlFrame(Frame): - # Control frames should contain no instance data, so - # equality is based solely on the class. - def __eq__(self, other): - return isinstance(other, self.__class__) - - -class StartFrame(ControlFrame): - """Used (but not required) to start a pipeline, and is also used to - indicate that an interruption has ended and the transport should start - processing frames again.""" - pass - - -class EndFrame(ControlFrame): - """Indicates that a pipeline has ended and frame processors and pipelines - should be shut down. If the transport receives this frame, it will stop - sending frames to its output channel(s) and close all its threads.""" - pass - - -class EndPipeFrame(ControlFrame): - """Indicates that a pipeline has ended but that the transport should - continue processing. This frame is used in parallel pipelines and other - sub-pipelines.""" - pass - - -class PipelineStartedFrame(ControlFrame): - """ - Used by the transport to indicate that execution of a pipeline is starting - (or restarting). It should be the first frame your app receives when it - starts, or when an interruptible pipeline has been interrupted. - """ - - pass - - -class LLMResponseStartFrame(ControlFrame): - """Used to indicate the beginning of an LLM response. Following TextFrames - are part of the LLM response until an LLMResponseEndFrame""" - pass - - -class LLMResponseEndFrame(ControlFrame): - """Indicates the end of an LLM response.""" - pass - - -@dataclass() -class AudioFrame(Frame): - """A chunk of audio. Will be played by the transport if the transport's mic - has been enabled.""" - data: bytes - - def __str__(self): - return f"{self.__class__.__name__}, size: {len(self.data)} B" - - -@dataclass() -class ImageFrame(Frame): - """An image. Will be shown by the transport if the transport's camera is - enabled.""" - image: bytes - size: tuple[int, int] - - def __str__(self): - return f"{self.__class__.__name__}, image size: {self.size[0]}x{self.size[1]} buffer size: {len(self.image)} B" - - -@dataclass() -class URLImageFrame(ImageFrame): - """An image with an associated URL. Will be shown by the transport if the - transport's camera is enabled. - - """ - url: str | None - - def __init__(self, url, image, size): - super().__init__(image, size) - self.url = url - - def __str__(self): - return f"{self.__class__.__name__}, url: {self.url}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B" - - -@dataclass() -class VisionImageFrame(ImageFrame): - """An image with an associated text to ask for a description of it. Will be shown by the - transport if the transport's camera is enabled. 
- - """ - text: str | None - - def __init__(self, text, image, size): - super().__init__(image, size) - self.text = text - - def __str__(self): - return f"{self.__class__.__name__}, text: {self.text}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B" - - -@dataclass() -class UserImageFrame(ImageFrame): - """An image associated to a user. Will be shown by the transport if the transport's camera is - enabled.""" - user_id: str - - def __init__(self, user_id, image, size): - super().__init__(image, size) - self.user_id = user_id - - def __str__(self): - return f"{self.__class__.__name__}, user: {self.user_id}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B" - - -@dataclass() -class UserImageRequestFrame(Frame): - """A frame user to request an image from the given user.""" - user_id: str - - def __str__(self): - return f"{self.__class__.__name__}, user: {self.user_id}" - - -@dataclass() -class SpriteFrame(Frame): - """An animated sprite. Will be shown by the transport if the transport's - camera is enabled. Will play at the framerate specified in the transport's - `fps` constructor parameter.""" - images: list[bytes] - - def __str__(self): - return f"{self.__class__.__name__}, list size: {len(self.images)}" - - -@dataclass() -class TextFrame(Frame): - """A chunk of text. Emitted by LLM services, consumed by TTS services, can - be used to send text through pipelines.""" - text: str - - def __str__(self): - return f'{self.__class__.__name__}: "{self.text}"' - - -@dataclass() -class TranscriptionFrame(TextFrame): - """A text frame with transcription-specific data. Will be placed in the - transport's receive queue when a participant speaks.""" - participantId: str - timestamp: str - - def __str__(self): - return f"{self.__class__.__name__}, text: '{self.text}' participantId: {self.participantId}, timestamp: {self.timestamp}" - - -@dataclass() -class InterimTranscriptionFrame(TextFrame): - """A text frame with interim transcription-specific data. Will be placed in - the transport's receive queue when a participant speaks.""" - participantId: str - timestamp: str - - def __str__(self): - return f"{self.__class__.__name__}, text: '{self.text}' participantId: {self.participantId}, timestamp: {self.timestamp}" - - -class TTSStartFrame(ControlFrame): - """Used to indicate the beginning of a TTS response. Following AudioFrames - are part of the TTS response until an TTEndFrame. These frames can be used - for aggregating audio frames in a transport to optimize the size of frames - sent to the session, without needing to control this in the TTS service.""" - pass - - -class TTSEndFrame(ControlFrame): - """Indicates the end of a TTS response.""" - pass - - -@dataclass() -class LLMMessagesFrame(Frame): - """A frame containing a list of LLM messages. Used to signal that an LLM - service should run a chat completion and emit an LLMStartFrames, TextFrames - and an LLMEndFrame. 
- Note that the messages property on this class is mutable, and will be - be updated by various ResponseAggregator frame processors.""" - messages: List[dict] - - -@dataclass() -class ReceivedAppMessageFrame(Frame): - message: Any - sender: str - - def __str__(self): - return f"ReceivedAppMessageFrame: sender: {self.sender}, message: {self.message}" - - -@dataclass() -class SendAppMessageFrame(Frame): - message: Any - participant_id: str | None - - def __str__(self): - return f"SendAppMessageFrame: participant: {self.participant_id}, message: {self.message}" - - -class UserStartedSpeakingFrame(Frame): - """Emitted by VAD to indicate that a participant has started speaking. - This can be used for interruptions or other times when detecting that - someone is speaking is more important than knowing what they're saying - (as you will with a TranscriptionFrame)""" - pass - - -class UserStoppedSpeakingFrame(Frame): - """Emitted by the VAD to indicate that a user stopped speaking.""" - pass - - -class BotStartedSpeakingFrame(Frame): - pass - - -class BotStoppedSpeakingFrame(Frame): - pass - - -@dataclass() -class LLMFunctionStartFrame(Frame): - """Emitted when the LLM receives the beginning of a function call - completion. A frame processor can use this frame to indicate that it should - start preparing to make a function call, if it can do so in the absence of - any arguments.""" - function_name: str - - -@dataclass() -class LLMFunctionCallFrame(Frame): - """Emitted when the LLM has received an entire function call completion.""" - function_name: str - arguments: str diff --git a/src/dailyai/pipeline/openai_frames.py b/src/dailyai/pipeline/openai_frames.py deleted file mode 100644 index 2a14c670e..000000000 --- a/src/dailyai/pipeline/openai_frames.py +++ /dev/null @@ -1,12 +0,0 @@ -from dataclasses import dataclass - -from dailyai.pipeline.frames import Frame -from dailyai.services.openai_llm_context import OpenAILLMContext - - -@dataclass() -class OpenAILLMContextFrame(Frame): - """Like an LLMMessagesFrame, but with extra context specific to the - OpenAI API. The context in this message is also mutable, and will be - changed by the OpenAIContextAggregator frame processor.""" - context: OpenAILLMContext diff --git a/src/dailyai/pipeline/pipeline.py b/src/dailyai/pipeline/pipeline.py deleted file mode 100644 index e1a6a15f7..000000000 --- a/src/dailyai/pipeline/pipeline.py +++ /dev/null @@ -1,149 +0,0 @@ -import asyncio -import logging -from typing import AsyncGenerator, AsyncIterable, Iterable, List -from dailyai.pipeline.frame_processor import FrameProcessor - -from dailyai.pipeline.frames import AudioFrame, EndPipeFrame, EndFrame, Frame - - -class Pipeline: - """ - This class manages a pipe of FrameProcessors, and runs them in sequence. The "source" - and "sink" queues are managed by the caller. You can use this class stand-alone to - perform specialized processing, or you can use the Transport's run_pipeline method to - instantiate and run a pipeline with the Transport's sink and source queues. - """ - - def __init__( - self, - processors: List[FrameProcessor], - source: asyncio.Queue | None = None, - sink: asyncio.Queue[Frame] | None = None, - name: str | None = None, - ): - """Create a new pipeline. By default we create the sink and source queues - if they're not provided, but these can be overridden to point to other - queues. If this pipeline is run by a transport, its sink and source queues - will be overridden. 
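Stand-alone use, as described in the class docstring above, amounts to wiring a list of frame processors to the source and sink queues and letting run_pipeline drain them. A short sketch against the dailyai API being removed here, using the SentenceAggregator defined earlier in this patch; the frame text is illustrative.

import asyncio

from dailyai.pipeline.aggregators import SentenceAggregator
from dailyai.pipeline.frames import EndFrame, TextFrame
from dailyai.pipeline.pipeline import Pipeline


async def main():
    # The pipeline creates its own source and sink queues when none are given.
    pipeline = Pipeline([SentenceAggregator()])
    await pipeline.queue_frames(
        [TextFrame("Hello,"), TextFrame(" world."), EndFrame()]
    )
    await pipeline.run_pipeline()
    while not pipeline.sink.empty():
        # Prints TextFrame: "Hello, world." followed by EndFrame.
        print(await pipeline.sink.get())


asyncio.run(main())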
- """ - self._processors: List[FrameProcessor] = processors - - self.source: asyncio.Queue[Frame] = source or asyncio.Queue() - self.sink: asyncio.Queue[Frame] = sink or asyncio.Queue() - - self._logger = logging.getLogger("dailyai.pipeline") - self._last_log_line = "" - self._shown_repeated_log = False - self._name = name or str(id(self)) - - def set_source(self, source: asyncio.Queue[Frame]): - """Set the source queue for this pipeline. Frames from this queue - will be processed by each frame_processor in the pipeline, or order - from first to last.""" - self.source = source - - def set_sink(self, sink: asyncio.Queue[Frame]): - """Set the sink queue for this pipeline. After the last frame_processor - has processed a frame, its output will be placed on this queue.""" - self.sink = sink - - def add_processor(self, processor: FrameProcessor): - self._processors.append(processor) - - async def get_next_source_frame(self) -> AsyncGenerator[Frame, None]: - """Convenience function to get the next frame from the source queue. This - lets us consistently have an AsyncGenerator yield frames, from either the - source queue or a frame_processor.""" - - yield await self.source.get() - - async def queue_frames( - self, - frames: Iterable[Frame] | AsyncIterable[Frame], - ) -> None: - """Insert frames directly into a pipeline. This is typically used inside a transport - participant_joined callback to prompt a bot to start a conversation, for example.""" - - if isinstance(frames, AsyncIterable): - async for frame in frames: - await self.source.put(frame) - elif isinstance(frames, Iterable): - for frame in frames: - await self.source.put(frame) - else: - raise Exception("Frames must be an iterable or async iterable") - - async def run_pipeline(self): - """Run the pipeline. Take each frame from the source queue, pass it to - the first frame_processor, pass the output of that frame_processor to the - next in the list, etc. until the last frame_processor has processed the - resulting frames, then place those frames in the sink queue. - - The source and sink queues must be set before calling this method. - - This method will exit when an EndFrame is placed on the sink queue. - No more frames will be placed on the sink queue after an EndFrame, even - if it's not the last frame yielded by the last frame_processor in the pipeline.. - """ - - try: - while True: - initial_frame = await self.source.get() - async for frame in self._run_pipeline_recursively( - initial_frame, self._processors - ): - self._log_frame(frame, len(self._processors) + 1) - await self.sink.put(frame) - - if isinstance(initial_frame, EndFrame) or isinstance( - initial_frame, EndPipeFrame - ): - break - except asyncio.CancelledError: - # this means there's been an interruption, do any cleanup necessary - # here. - for processor in self._processors: - await processor.interrupted() - - async def _run_pipeline_recursively( - self, initial_frame: Frame, processors: List[FrameProcessor], depth=1 - ) -> AsyncGenerator[Frame, None]: - """Internal function to add frames to the pipeline as they're yielded - by each processor.""" - if processors: - self._log_frame(initial_frame, depth) - async for frame in processors[0].process_frame(initial_frame): - async for final_frame in self._run_pipeline_recursively( - frame, processors[1:], depth + 1 - ): - yield final_frame - else: - yield initial_frame - - def _log_frame(self, frame: Frame, depth: int): - """Log a frame as it moves through the pipeline. This is useful for debugging. 
- Note that this function inherits the logging level from the "dailyai" logger. - If you want debug output from dailyai in general but not this function (it is - noisy) you can silence this function by doing something like this: - - # enable debug logging for the dailyai package. - logger = logging.getLogger("dailyai") - logger.setLevel(logging.DEBUG) - - # silence the pipeline logging - logger = logging.getLogger("dailyai.pipeline") - logger.setLevel(logging.WARNING) - """ - source = str(self._processors[depth - 2]) if depth > 1 else "source" - dest = str(self._processors[depth - 1]) if depth < (len(self._processors) + 1) else "sink" - prefix = self._name + " " * depth - logline = prefix + " -> ".join([source, frame.__class__.__name__, dest]) - if logline == self._last_log_line: - if self._shown_repeated_log: - return - self._shown_repeated_log = True - self._logger.debug(prefix + "... repeated") - else: - self._shown_repeated_log = False - self._last_log_line = logline - self._logger.debug(logline) diff --git a/src/dailyai/services/ai_services.py b/src/dailyai/services/ai_services.py deleted file mode 100644 index 5ba732acd..000000000 --- a/src/dailyai/services/ai_services.py +++ /dev/null @@ -1,165 +0,0 @@ -import io -import logging -import time -import wave -from dailyai.pipeline.frame_processor import FrameProcessor - -from dailyai.pipeline.frames import ( - AudioFrame, - EndFrame, - EndPipeFrame, - ImageFrame, - Frame, - TTSEndFrame, - TTSStartFrame, - TextFrame, - TranscriptionFrame, - URLImageFrame, - VisionImageFrame, -) - -from abc import abstractmethod -from typing import AsyncGenerator, BinaryIO - - -class AIService(FrameProcessor): - def __init__(self): - self.logger = logging.getLogger("dailyai") - - -class LLMService(AIService): - """This class is a no-op but serves as a base class for LLM services.""" - - def __init__(self): - super().__init__() - - -class TTSService(AIService): - def __init__(self, aggregate_sentences=True): - super().__init__() - self.aggregate_sentences: bool = aggregate_sentences - self.current_sentence: str = "" - - # Some TTS services require a specific sample rate. We default to 16k - def get_mic_sample_rate(self): - return 16000 - - # Converts the text to audio. Yields a list of audio frames that can - # be sent to the microphone device - @abstractmethod - async def run_tts(self, text) -> AsyncGenerator[bytes, None]: - # yield empty bytes here, so linting can infer what this method does - yield bytes() - - async def wrap_tts(self, text) -> AsyncGenerator[Frame, None]: - yield TTSStartFrame() - async for audio_chunk in self.run_tts(text): - yield AudioFrame(audio_chunk) - yield TTSEndFrame() - yield TextFrame(text) - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, EndFrame) or isinstance(frame, EndPipeFrame): - if self.current_sentence: - async for cleanup_frame in self.wrap_tts(self.current_sentence): - yield cleanup_frame - - if not isinstance(frame, TextFrame): - yield frame - return - - text: str | None = None - if not self.aggregate_sentences: - text = frame.text - else: - self.current_sentence += frame.text - if self.current_sentence.strip().endswith((".", "?", "!")): - text = self.current_sentence - self.current_sentence = "" - - if text: - async for frame in self.wrap_tts(text): - yield frame - - -class ImageGenService(AIService): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - # Renders the image. Returns an Image object. 
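Subclassing the TTSService above only requires providing run_tts; the base class handles sentence aggregation and wraps the audio in TTSStartFrame / AudioFrame / TTSEndFrame. A hedged sketch against the dailyai API deleted in this patch; SilentTTSService and its fake audio payload are invented for illustration.

import asyncio
from typing import AsyncGenerator

from dailyai.pipeline.frames import EndFrame, TextFrame
from dailyai.services.ai_services import TTSService


class SilentTTSService(TTSService):
    """Emits 20 ms of silence per sentence instead of calling a real TTS API."""

    async def run_tts(self, text) -> AsyncGenerator[bytes, None]:
        yield b"\x00" * 640  # 20 ms of 16 kHz, 16-bit mono silence


async def main():
    tts = SilentTTSService()
    for frame in (TextFrame("Hello,"), TextFrame(" world."), EndFrame()):
        async for out in tts.process_frame(frame):
            # Nothing is emitted until the aggregated text ends a sentence; then
            # TTSStartFrame, AudioFrame, TTSEndFrame and the spoken TextFrame appear.
            print(out)


asyncio.run(main())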
- @abstractmethod - async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]: - pass - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if not isinstance(frame, TextFrame): - yield frame - return - - (url, image_data, image_size) = await self.run_image_gen(frame.text) - yield URLImageFrame(url, image_data, image_size) - - -class VisionService(AIService): - """VisionService is a base class for vision services.""" - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._describe_text = None - - @abstractmethod - async def run_vision(self, frame: VisionImageFrame) -> str: - pass - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, VisionImageFrame): - description = await self.run_vision(frame) - yield TextFrame(description) - else: - yield frame - - -class STTService(AIService): - """STTService is a base class for speech-to-text services.""" - - _frame_rate: int - - def __init__(self, frame_rate: int = 16000, **kwargs): - super().__init__(**kwargs) - self._frame_rate = frame_rate - - @abstractmethod - async def run_stt(self, audio: BinaryIO) -> str: - """Returns transcript as a string""" - pass - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - """Processes a frame of audio data, either buffering or transcribing it.""" - if not isinstance(frame, AudioFrame): - return - - data = frame.data - content = io.BufferedRandom(io.BytesIO()) - ww = wave.open(self._content, "wb") - ww.setnchannels(1) - ww.setsampwidth(2) - ww.setframerate(self._frame_rate) - ww.writeframesraw(data) - ww.close() - content.seek(0) - text = await self.run_stt(content) - yield TranscriptionFrame(text, "", str(time.time())) - - -class FrameLogger(AIService): - def __init__(self, prefix="Frame", **kwargs): - super().__init__(**kwargs) - self.prefix = prefix - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, (AudioFrame, ImageFrame)): - self.logger.info(f"{self.prefix}: {type(frame)}") - else: - print(f"{self.prefix}: {frame}") - - yield frame diff --git a/src/dailyai/services/anthropic_llm_service.py b/src/dailyai/services/anthropic_llm_service.py deleted file mode 100644 index 44c045992..000000000 --- a/src/dailyai/services/anthropic_llm_service.py +++ /dev/null @@ -1,44 +0,0 @@ -from typing import AsyncGenerator -from dailyai.pipeline.frames import Frame, LLMMessagesFrame, TextFrame - -from dailyai.services.ai_services import LLMService - -try: - from anthropic import AsyncAnthropic -except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use Anthropic, you need to `pip install dailyai[anthropic]`. 
Also, set `ANTHROPIC_API_KEY` environment variable.") - raise Exception(f"Missing module: {e}") - - -class AnthropicLLMService(LLMService): - - def __init__( - self, - api_key, - model="claude-3-opus-20240229", - max_tokens=1024): - super().__init__() - self.client = AsyncAnthropic(api_key=api_key) - self.model = model - self.max_tokens = max_tokens - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if not isinstance(frame, LLMMessagesFrame): - yield frame - - stream = await self.client.messages.create( - max_tokens=self.max_tokens, - messages=[ - { - "role": "user", - "content": "Hello, Claude", - } - ], - model=self.model, - stream=True, - ) - async for event in stream: - if event.type == "content_block_delta": - yield TextFrame(event.delta.text) diff --git a/src/dailyai/services/deepgram_ai_service.py b/src/dailyai/services/deepgram_ai_service.py deleted file mode 100644 index 4b552927e..000000000 --- a/src/dailyai/services/deepgram_ai_service.py +++ /dev/null @@ -1,36 +0,0 @@ -import aiohttp - -from dailyai.services.ai_services import TTSService - - -class DeepgramAIService(TTSService): - def __init__( - self, - *, - aiohttp_session: aiohttp.ClientSession, - api_key, - voice, - sample_rate=16000 - ): - super().__init__() - - self._api_key = api_key - self._voice = voice - self._sample_rate = sample_rate - self._aiohttp_session = aiohttp_session - - async def run_tts(self, sentence): - self.logger.info(f"Running deepgram tts for {sentence}") - base_url = "https://api.beta.deepgram.com/v1/speak" - request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate={self._sample_rate}" - headers = { - "authorization": f"token {self._api_key}", - "Content-Type": "application/json"} - data = {"text": sentence} - - async with self._aiohttp_session.post( - request_url, headers=headers, json=data - ) as r: - async for chunk in r.content: - if chunk: - yield chunk diff --git a/src/dailyai/services/elevenlabs_ai_service.py b/src/dailyai/services/elevenlabs_ai_service.py deleted file mode 100644 index c31d50bed..000000000 --- a/src/dailyai/services/elevenlabs_ai_service.py +++ /dev/null @@ -1,46 +0,0 @@ -import aiohttp - -from typing import AsyncGenerator - -from dailyai.services.ai_services import TTSService - - -class ElevenLabsTTSService(TTSService): - - def __init__( - self, - *, - aiohttp_session: aiohttp.ClientSession, - api_key, - voice_id, - model="eleven_turbo_v2", - ): - super().__init__() - - self._api_key = api_key - self._voice_id = voice_id - self._aiohttp_session = aiohttp_session - self._model = model - - async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]: - url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream" - payload = {"text": sentence, "model_id": self._model} - querystring = { - "output_format": "pcm_16000", - "optimize_streaming_latency": 2} - headers = { - "xi-api-key": self._api_key, - "Content-Type": "application/json", - } - async with self._aiohttp_session.post( - url, json=payload, headers=headers, params=querystring - ) as r: - if r.status != 200: - self.logger.error( - f"audio fetch status code: {r.status}, error: {r.text}" - ) - return - - async for chunk in r.content: - if chunk: - yield chunk diff --git a/src/dailyai/services/fireworks_ai_services.py b/src/dailyai/services/fireworks_ai_services.py deleted file mode 100644 index e5ccbc658..000000000 --- a/src/dailyai/services/fireworks_ai_services.py +++ /dev/null @@ -1,18 +0,0 @@ -import os - -from 
dailyai.services.openai_api_llm_service import BaseOpenAILLMService - - -try: - from openai import AsyncOpenAI -except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use Fireworks, you need to `pip install dailyai[fireworks]`. Also, set the `FIREWORKS_API_KEY` environment variable.") - raise Exception(f"Missing module: {e}") - - -class FireworksLLMService(BaseOpenAILLMService): - def __init__(self, model="accounts/fireworks/models/firefunction-v1", *args, **kwargs): - kwargs["base_url"] = "https://api.fireworks.ai/inference/v1" - super().__init__(model, *args, **kwargs) diff --git a/src/dailyai/services/local_stt_service.py b/src/dailyai/services/local_stt_service.py deleted file mode 100644 index 3d190dfb2..000000000 --- a/src/dailyai/services/local_stt_service.py +++ /dev/null @@ -1,74 +0,0 @@ -import array -import io -import math -import time -from typing import AsyncGenerator -import wave -from dailyai.pipeline.frames import AudioFrame, Frame, TranscriptionFrame -from dailyai.services.ai_services import STTService - - -class LocalSTTService(STTService): - _content: io.BufferedRandom - _wave: wave.Wave_write - _current_silence_frames: int - - # Configuration - _min_rms: int - _max_silence_frames: int - _frame_rate: int - - def __init__(self, - min_rms: int = 400, - max_silence_frames: int = 3, - frame_rate: int = 16000, - **kwargs): - super().__init__(frame_rate, **kwargs) - self._current_silence_frames = 0 - self._min_rms = min_rms - self._max_silence_frames = max_silence_frames - self._frame_rate = frame_rate - self._new_wave() - - def _new_wave(self): - """Creates a new wave object and content buffer.""" - self._content = io.BufferedRandom(io.BytesIO()) - ww = wave.open(self._content, "wb") - ww.setnchannels(1) - ww.setsampwidth(2) - ww.setframerate(self._frame_rate) - self._wave = ww - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - """Processes a frame of audio data, either buffering or transcribing it.""" - if not isinstance(frame, AudioFrame): - yield frame - return - - data = frame.data - # Try to filter out empty background noise - # (Very rudimentary approach, can be improved) - rms = self._get_volume(data) - if rms >= self._min_rms: - # If volume is high enough, write new data to wave file - self._wave.writeframesraw(data) - - # If buffer is not empty and we detect a 3-frame pause in speech, - # transcribe the audio gathered so far. 
- if self._content.tell() > 0 and self._current_silence_frames > self._max_silence_frames: - self._current_silence_frames = 0 - self._wave.close() - self._content.seek(0) - text = await self.run_stt(self._content) - self._new_wave() - yield TranscriptionFrame(text, '', str(time.time())) - # If we get this far, this is a frame of silence - self._current_silence_frames += 1 - - def _get_volume(self, audio: bytes) -> float: - # https://docs.python.org/3/library/array.html - audio_array = array.array('h', audio) - squares = [sample**2 for sample in audio_array] - mean = sum(squares) / len(audio_array) - rms = math.sqrt(mean) - return rms diff --git a/src/dailyai/services/open_ai_services.py b/src/dailyai/services/open_ai_services.py deleted file mode 100644 index 9eaec5218..000000000 --- a/src/dailyai/services/open_ai_services.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import Literal -import aiohttp -from PIL import Image -import io - -from dailyai.services.ai_services import ImageGenService -from dailyai.services.openai_api_llm_service import BaseOpenAILLMService - - -try: - from openai import AsyncOpenAI -except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use OpenAI, you need to `pip install dailyai[openai]`. Also, set `OPENAI_API_KEY` environment variable.") - raise Exception(f"Missing module: {e}") - - -class OpenAILLMService(BaseOpenAILLMService): - - def __init__(self, model="gpt-4", * args, **kwargs): - super().__init__(model, *args, **kwargs) - - -class OpenAIImageGenService(ImageGenService): - - def __init__( - self, - *, - image_size: Literal["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"], - aiohttp_session: aiohttp.ClientSession, - api_key, - model="dall-e-3", - ): - super().__init__() - self._model = model - self._image_size = image_size - self._client = AsyncOpenAI(api_key=api_key) - self._aiohttp_session = aiohttp_session - - async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]: - self.logger.info("Generating OpenAI image", prompt) - - image = await self._client.images.generate( - prompt=prompt, - model=self._model, - n=1, - size=self._image_size - ) - image_url = image.data[0].url - if not image_url: - raise Exception("No image provided in response", image) - - # Load the image from the url - async with self._aiohttp_session.get(image_url) as response: - image_stream = io.BytesIO(await response.content.read()) - image = Image.open(image_stream) - return (image_url, image.tobytes(), image.size) diff --git a/src/dailyai/services/openai_llm_context.py b/src/dailyai/services/openai_llm_context.py deleted file mode 100644 index 2d16c3cb6..000000000 --- a/src/dailyai/services/openai_llm_context.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import List - -try: - from openai._types import NOT_GIVEN, NotGiven - - from openai.types.chat import ( - ChatCompletionToolParam, - ChatCompletionToolChoiceOptionParam, - ChatCompletionMessageParam, - ) -except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use OpenAI, you need to `pip install dailyai[openai]`. 
Also, set `OPENAI_API_KEY` environment variable.") - raise Exception(f"Missing module: {e}") - - -class OpenAILLMContext: - - def __init__( - self, - messages: List[ChatCompletionMessageParam] | None = None, - tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN, - tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN - ): - self.messages: List[ChatCompletionMessageParam] = messages if messages else [ - ] - self.tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = tool_choice - self.tools: List[ChatCompletionToolParam] | NotGiven = tools - - @staticmethod - def from_messages(messages: List[dict]) -> "OpenAILLMContext": - context = OpenAILLMContext() - for message in messages: - context.add_message({ - "content": message["content"], - "role": message["role"], - "name": message["name"] if "name" in message else message["role"] - }) - return context - - # def __deepcopy__(self, memo): - - def add_message(self, message: ChatCompletionMessageParam): - self.messages.append(message) - - def get_messages(self) -> List[ChatCompletionMessageParam]: - return self.messages - - def set_tool_choice( - self, tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven - ): - self.tool_choice = tool_choice - - def set_tools( - self, - tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN): - if tools != NOT_GIVEN and len(tools) == 0: - tools = NOT_GIVEN - - self.tools = tools diff --git a/src/dailyai/transports/abstract_transport.py b/src/dailyai/transports/abstract_transport.py deleted file mode 100644 index 1a30c9063..000000000 --- a/src/dailyai/transports/abstract_transport.py +++ /dev/null @@ -1,42 +0,0 @@ -from abc import abstractmethod -import asyncio -import logging -import time - -from dailyai.pipeline.frame_processor import FrameProcessor -from dailyai.pipeline.pipeline import Pipeline - - -class AbstractTransport: - def __init__(self, **kwargs): - self.send_queue = asyncio.Queue() - self.receive_queue = asyncio.Queue() - self.completed_queue = asyncio.Queue() - - duration_minutes = kwargs.get("duration_minutes") or 10 - self._expiration = time.time() + duration_minutes * 60 - - self._mic_enabled = kwargs.get("mic_enabled") or False - self._mic_sample_rate = kwargs.get("mic_sample_rate") or 16000 - self._camera_enabled = kwargs.get("camera_enabled") or False - self._camera_width = kwargs.get("camera_width") or 1024 - self._camera_height = kwargs.get("camera_height") or 768 - self._camera_bitrate = kwargs.get("camera_bitrate") or 250000 - self._camera_framerate = kwargs.get("camera_framerate") or 10 - self._speaker_enabled = kwargs.get("speaker_enabled") or False - self._speaker_sample_rate = kwargs.get("speaker_sample_rate") or 16000 - - self._logger: logging.Logger = logging.getLogger("dailyai.transport") - - @abstractmethod - async def run(self, pipeline: Pipeline, override_pipeline_source_queue=True): - pass - - @abstractmethod - async def run_interruptible_pipeline( - self, - pipeline: Pipeline, - pre_processor: FrameProcessor | None = None, - post_processor: FrameProcessor | None = None, - ): - pass diff --git a/src/dailyai/transports/daily_transport.py b/src/dailyai/transports/daily_transport.py deleted file mode 100644 index 86c0ee1cf..000000000 --- a/src/dailyai/transports/daily_transport.py +++ /dev/null @@ -1,390 +0,0 @@ -import asyncio -import inspect -import logging -import signal -import time -import threading -import types - -from functools import partial -from typing import Any - -from dailyai.pipeline.frames import ( - 
InterimTranscriptionFrame, - ReceivedAppMessageFrame, - TranscriptionFrame, - UserImageFrame, -) - -from threading import Event - -try: - from daily import ( - EventHandler, - CallClient, - Daily, - VirtualCameraDevice, - VirtualMicrophoneDevice, - VirtualSpeakerDevice, - ) -except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use the Daily transport, you need to `pip install dailyai[daily]`.") - raise Exception(f"Missing module: {e}") - - -from dailyai.transports.threaded_transport import ThreadedTransport - -NUM_CHANNELS = 1 - -SPEECH_THRESHOLD = 0.90 -VAD_RESET_PERIOD_MS = 2000 - - -class DailyTransport(ThreadedTransport, EventHandler): - _daily_initialized = False - _lock = threading.Lock() - - _speaker_enabled: bool - _speaker_sample_rate: int - _vad_enabled: bool - - # This is necessary to override EventHandler's __new__ method. - def __new__(cls, *args, **kwargs): - return super().__new__(cls) - - def __init__( - self, - room_url: str, - token: str | None, - bot_name: str, - min_others_count: int = 1, - start_transcription: bool = False, - video_rendering_enabled: bool = False, - **kwargs, - ): - kwargs['has_webrtc_vad'] = True - # This will call ThreadedTransport.__init__ method, not EventHandler - super().__init__(**kwargs) - - self._room_url: str = room_url - self._bot_name: str = bot_name - self._token: str | None = token - self._min_others_count = min_others_count - self._start_transcription = start_transcription - self._video_rendering_enabled = video_rendering_enabled - - self._is_interrupted = Event() - self._stop_threads = Event() - - self._other_participant_has_joined = False - self._my_participant_id = None - - self._video_renderers = {} - - self.transcription_settings = { - "language": "en", - "tier": "nova", - "model": "2-conversationalai", - "profanity_filter": True, - "redact": False, - "endpointing": True, - "punctuate": True, - "includeRawResponse": True, - "extra": { - "interim_results": True, - }, - } - - self._logger: logging.Logger = logging.getLogger("dailyai") - - self._event_handlers = {} - - self.webrtc_vad = Daily.create_native_vad( - reset_period_ms=VAD_RESET_PERIOD_MS, - sample_rate=self._speaker_sample_rate, - channels=NUM_CHANNELS - ) - - def _patch_method(self, event_name, *args, **kwargs): - try: - for handler in self._event_handlers[event_name]: - if inspect.iscoroutinefunction(handler): - if self._loop: - future = asyncio.run_coroutine_threadsafe( - handler(*args, **kwargs), self._loop) - - # wait for the coroutine to finish. This will also - # raise any exceptions raised by the coroutine. - future.result() - else: - raise Exception( - "No event loop to run coroutine. In order to use async event handlers, you must run the DailyTransportService in an asyncio event loop.") - else: - handler(*args, **kwargs) - except Exception as e: - self._logger.error(f"Exception in event handler {event_name}: {e}") - raise e - - def _webrtc_vad_analyze(self): - buffer = self.read_audio_frames(int(self._vad_samples)) - if len(buffer) > 0: - confidence = self.webrtc_vad.analyze_frames(buffer) - # yeses = int(confidence * 20.0) - # nos = 20 - yeses - # out = "!" * yeses + "." * nos - # print(f"!!! 
confidence: {out} {confidence}") - talking = confidence > SPEECH_THRESHOLD - return talking - - def add_event_handler(self, event_name: str, handler): - if not event_name.startswith("on_"): - raise Exception( - f"Event handler {event_name} must start with 'on_'") - - methods = inspect.getmembers(self, predicate=inspect.ismethod) - if event_name not in [method[0] for method in methods]: - raise Exception(f"Event handler {event_name} not found") - - if event_name not in self._event_handlers: - self._event_handlers[event_name] = [ - getattr( - self, event_name), types.MethodType( - handler, self)] - setattr(self, event_name, partial(self._patch_method, event_name)) - else: - self._event_handlers[event_name].append( - types.MethodType(handler, self)) - - def event_handler(self, event_name: str): - def decorator(handler): - self.add_event_handler(event_name, handler) - return handler - - return decorator - - def write_frame_to_camera(self, frame: bytes): - if self._camera_enabled: - self.camera.write_frame(frame) - - def write_frame_to_mic(self, frame: bytes): - if self._mic_enabled: - self.mic.write_frames(frame) - - def request_participant_image(self, participant_id: str): - if participant_id in self._video_renderers: - self._video_renderers[participant_id]["render_next_frame"] = True - - def send_app_message(self, message: Any, participant_id: str | None): - self.client.send_app_message(message, participant_id) - - def read_audio_frames(self, desired_frame_count): - bytes = b"" - if self._speaker_enabled or self._vad_enabled: - bytes = self._speaker.read_frames(desired_frame_count) - return bytes - - def _prerun(self): - # Only initialize Daily once - if not DailyTransport._daily_initialized: - with DailyTransport._lock: - Daily.init() - DailyTransport._daily_initialized = True - self.client = CallClient(event_handler=self) - - if self._mic_enabled: - self.mic: VirtualMicrophoneDevice = Daily.create_microphone_device( - "mic", sample_rate=self._mic_sample_rate, channels=1 - ) - - if self._camera_enabled: - self.camera: VirtualCameraDevice = Daily.create_camera_device( - "camera", width=self._camera_width, height=self._camera_height, color_format="RGB") - - if self._speaker_enabled or self._vad_enabled: - self._speaker: VirtualSpeakerDevice = Daily.create_speaker_device( - "speaker", sample_rate=self._speaker_sample_rate, channels=1 - ) - Daily.select_speaker_device("speaker") - - self.client.set_user_name(self._bot_name) - self.client.join( - self._room_url, - self._token, - completion=self.call_joined, - client_settings={ - "inputs": { - "camera": { - "isEnabled": True, - "settings": { - "deviceId": "camera", - }, - }, - "microphone": { - "isEnabled": True, - "settings": { - "deviceId": "mic", - "customConstraints": { - "autoGainControl": {"exact": False}, - "echoCancellation": {"exact": False}, - "noiseSuppression": {"exact": False}, - }, - }, - }, - }, - "publishing": { - "camera": { - "sendSettings": { - "maxQuality": "low", - "encodings": { - "low": { - "maxBitrate": self._camera_bitrate, - "scaleResolutionDownBy": 1.333, - "maxFramerate": self._camera_framerate, - } - }, - } - } - }, - }, - ) - self._my_participant_id = self.client.participants()["local"]["id"] - - # For performance reasons, never subscribe to video streams (unless a - # video renderer is registered). 
- self.client.update_subscription_profiles({ - "base": { - "camera": "unsubscribed", - "screenVideo": "unsubscribed" - } - }) - - if self._token and self._start_transcription: - self.client.start_transcription(self.transcription_settings) - - self.original_sigint_handler = signal.getsignal(signal.SIGINT) - signal.signal(signal.SIGINT, self.process_interrupt_handler) - - def process_interrupt_handler(self, signum, frame): - self._post_run() - if callable(self.original_sigint_handler): - self.original_sigint_handler(signum, frame) - - def _post_run(self): - self.client.leave() - self.client.release() - - def on_first_other_participant_joined(self, participant): - pass - - def call_joined(self, join_data, client_error): - # self._logger.info(f"Call_joined: {join_data}, {client_error}") - pass - - def dialout(self, number): - self.client.start_dialout({"phoneNumber": number}) - - def start_recording(self): - self.client.start_recording() - - def render_participant_video(self, - participant_id, - framerate=10, - video_source="camera", - color_format="RGB") -> None: - if not self._video_rendering_enabled: - self._logger.warn("Video rendering is not enabled") - return - - # Only enable camera subscription on this participant - self.client.update_subscriptions(participant_settings={ - participant_id: { - "media": { - video_source: "subscribed" - } - } - }) - - self._video_renderers[participant_id] = { - "framerate": framerate, - "timestamp": 0, - "render_next_frame": False, - } - self.client.set_video_renderer( - participant_id, - self.on_participant_video_frame, - video_source=video_source, - color_format=color_format) - - def on_participant_video_frame(self, participant_id, video_frame): - if not self._loop: - return - - render_frame = False - - curr_time = time.time() - framerate = self._video_renderers[participant_id]["framerate"] - - if framerate > 0: - prev_time = self._video_renderers[participant_id]["timestamp"] - next_time = prev_time + 1 / framerate - render_frame = curr_time > next_time - elif self._video_renderers[participant_id]["render_next_frame"]: - self._video_renderers[participant_id]["render_next_frame"] = False - render_frame = True - - if render_frame: - frame = UserImageFrame(participant_id, video_frame.buffer, - (video_frame.width, video_frame.height)) - asyncio.run_coroutine_threadsafe(self.receive_queue.put(frame), self._loop) - - self._video_renderers[participant_id]["timestamp"] = curr_time - - def on_error(self, error): - self._logger.error(f"on_error: {error}") - - def on_call_state_updated(self, state): - pass - - def on_participant_joined(self, participant): - if not self._other_participant_has_joined and participant["id"] != self._my_participant_id: - self._other_participant_has_joined = True - self.on_first_other_participant_joined(participant) - - def on_participant_left(self, participant, reason): - if len(self.client.participants()) < self._min_others_count + 1: - self._stop_threads.set() - - def on_app_message(self, message: Any, sender: str): - if self._loop: - frame = ReceivedAppMessageFrame(message, sender) - asyncio.run_coroutine_threadsafe( - self.receive_queue.put(frame), self._loop - ) - - def on_transcription_message(self, message: dict): - if self._loop: - participantId = "" - if "participantId" in message: - participantId = message["participantId"] - elif "session_id" in message: - participantId = message["session_id"] - if self._my_participant_id and participantId != self._my_participant_id: - is_final = message["rawResponse"]["is_final"] - if 
is_final: - frame = TranscriptionFrame(message["text"], participantId, message["timestamp"]) - else: - frame = InterimTranscriptionFrame( - message["text"], participantId, message["timestamp"]) - asyncio.run_coroutine_threadsafe( - self.receive_queue.put(frame), self._loop) - - def on_transcription_error(self, message): - self._logger.error(f"Transcription error: {message}") - - def on_transcription_started(self, status): - pass - - def on_transcription_stopped(self, stopped_by, stopped_by_error): - pass diff --git a/src/dailyai/transports/local_transport.py b/src/dailyai/transports/local_transport.py deleted file mode 100644 index abfc8dc5e..000000000 --- a/src/dailyai/transports/local_transport.py +++ /dev/null @@ -1,97 +0,0 @@ -import asyncio -import numpy as np -import tkinter as tk - -from dailyai.transports.threaded_transport import ThreadedTransport - -try: - import pyaudio -except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use the local transport, you need to `pip install dailyai[local]`. On MacOS, you also need to `brew install portaudio`.") - raise Exception(f"Missing module: {e}") - - -class LocalTransport(ThreadedTransport): - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._sample_width = kwargs.get("sample_width") or 2 - self._n_channels = kwargs.get("n_channels") or 1 - self._tk_root = kwargs.get("tk_root") or None - self._pyaudio = None - - if self._camera_enabled and not self._tk_root: - raise ValueError( - "If camera is enabled, a tkinter root must be provided") - - if self._speaker_enabled: - self._speaker_buffer_pending = bytearray() - - async def _write_frame_to_tkinter(self, frame: bytes): - data = f"P6 {self._camera_width} {self._camera_height} 255 ".encode() + \ - frame - photo = tk.PhotoImage( - width=self._camera_width, - height=self._camera_height, - data=data, - format="PPM") - self._image_label.config(image=photo) - - # This holds a reference to the photo, preventing it from being garbage - # collected. - self._image_label.image = photo # type: ignore - - def write_frame_to_camera(self, frame: bytes): - if self._camera_enabled and self._loop: - asyncio.run_coroutine_threadsafe( - self._write_frame_to_tkinter(frame), self._loop - ) - - def write_frame_to_mic(self, frame: bytes): - if self._mic_enabled: - self._audio_stream.write(frame) - - def read_audio_frames(self, desired_frame_count): - bytes = b"" - if self._speaker_enabled: - bytes = self._speaker_stream.read( - desired_frame_count, - exception_on_overflow=False, - ) - return bytes - - def _prerun(self): - if self._mic_enabled: - if not self._pyaudio: - self._pyaudio = pyaudio.PyAudio() - self._audio_stream = self._pyaudio.open( - format=self._pyaudio.get_format_from_width(self._sample_width), - channels=self._n_channels, - rate=self._speaker_sample_rate, - output=True, - ) - - if self._camera_enabled: - # Start with a neutral gray background. 
- array = np.ones((1024, 1024, 3)) * 128 - data = f"P5 {1024} {1024} 255 ".encode( - ) + array.astype(np.uint8).tobytes() - photo = tk.PhotoImage( - width=1024, - height=1024, - data=data, - format="PPM") - self._image_label = tk.Label(self._tk_root, image=photo) - self._image_label.pack() - - if self._speaker_enabled: - if not self._pyaudio: - self._pyaudio = pyaudio.PyAudio() - self._speaker_stream = self._pyaudio.open( - format=self._pyaudio.get_format_from_width(self._sample_width), - channels=self._n_channels, - rate=self._speaker_sample_rate, - frames_per_buffer=self._speaker_sample_rate, - input=True - ) diff --git a/src/dailyai/transports/threaded_transport.py b/src/dailyai/transports/threaded_transport.py deleted file mode 100644 index 52736a008..000000000 --- a/src/dailyai/transports/threaded_transport.py +++ /dev/null @@ -1,503 +0,0 @@ -from abc import abstractmethod -import asyncio -import itertools -import numpy as np - -import queue -import threading -import time -from typing import Any, AsyncGenerator -from enum import Enum -from dailyai.pipeline.frame_processor import FrameProcessor - -from dailyai.pipeline.frames import ( - SendAppMessageFrame, - AudioFrame, - EndFrame, - ImageFrame, - Frame, - PipelineStartedFrame, - SpriteFrame, - StartFrame, - TextFrame, - UserImageRequestFrame, - UserStartedSpeakingFrame, - UserStoppedSpeakingFrame, -) -from dailyai.pipeline.pipeline import Pipeline -from dailyai.services.ai_services import TTSService -from dailyai.transports.abstract_transport import AbstractTransport - - -# Provided by Alexander Veysov - - -def int2float(sound): - try: - abs_max = np.abs(sound).max() - sound = sound.astype("float32") - if abs_max > 0: - sound *= 1 / 32768 - sound = sound.squeeze() # depends on the use case - return sound - except ValueError: - return sound - - -class VADState(Enum): - QUIET = 1 - STARTING = 2 - SPEAKING = 3 - STOPPING = 4 - - -class ThreadedTransport(AbstractTransport): - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self._vad_start_s = kwargs.get("vad_start_s") or 0.2 - self._vad_stop_s = kwargs.get("vad_stop_s") or 0.8 - self._context = kwargs.get("context") or [] - self._vad_enabled = kwargs.get("vad_enabled") or False - self._has_webrtc_vad = kwargs.get("has_webrtc_vad") or False - if self._vad_enabled and self._speaker_enabled: - raise Exception( - "Sorry, you can't use speaker_enabled and vad_enabled at the same time. Please set one to False." 
- ) - self._vad_samples = 1536 - - if self._vad_enabled: - try: - global torch, torchaudio - import torch - # We don't use torchaudio here, but we need to try importing it because - # Silero uses it - import torchaudio - torch.set_num_threads(1) - - (self.model, self.utils) = torch.hub.load( - repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False - ) - self._logger.debug("Loaded Silero VAD") - - except ModuleNotFoundError as e: - if self._has_webrtc_vad: - self._logger.debug( - f"Couldn't load torch; using webrtc VAD") - self._vad_samples = int(self._speaker_sample_rate / 100.0) - else: - self._logger.error(f"Exception: {e}") - self._logger.error( - "In order to use Silero VAD, you'll need to `pip install dailyai[silero].") - raise Exception(f"Missing module(s): {e}") - - vad_frame_s = self._vad_samples / self._speaker_sample_rate - self._vad_start_frames = round(self._vad_start_s / vad_frame_s) - self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s) - self._vad_starting_count = 0 - self._vad_stopping_count = 0 - self._vad_state = VADState.QUIET - self._user_is_speaking = False - - self._threadsafe_send_queue = queue.Queue() - - self._images = None - - try: - self._loop: asyncio.AbstractEventLoop | None = asyncio.get_running_loop() - except RuntimeError: - self._loop = None - - self._stop_threads = threading.Event() - self._is_interrupted = threading.Event() - - async def run(self, pipeline: Pipeline | None = None, override_pipeline_source_queue=True): - self._prerun() - - async_output_queue_marshal_task = asyncio.create_task( - self._marshal_frames()) - - self._camera_thread = threading.Thread( - target=self._run_camera, daemon=True) - self._camera_thread.start() - - self._frame_consumer_thread = threading.Thread( - target=self._frame_consumer, daemon=True - ) - self._frame_consumer_thread.start() - - if self._speaker_enabled: - self._receive_audio_thread = threading.Thread( - target=self._receive_audio, daemon=True - ) - self._receive_audio_thread.start() - - if self._vad_enabled: - self._vad_thread = threading.Thread(target=self._vad, daemon=True) - self._vad_thread.start() - - pipeline_task = None - if pipeline: - pipeline_task = asyncio.create_task( - self.run_pipeline(pipeline, override_pipeline_source_queue) - ) - - try: - while time.time() < self._expiration and not self._stop_threads.is_set(): - await asyncio.sleep(1) - except Exception as e: - self._logger.error(f"Exception {e}") - raise e - finally: - # Do anything that must be done to clean up - self._post_run() - - self._stop_threads.set() - - if pipeline_task: - pipeline_task.cancel() - - await self.send_queue.put(EndFrame()) - - await async_output_queue_marshal_task - self._frame_consumer_thread.join() - - if self._speaker_enabled: - self._receive_audio_thread.join() - - if self._vad_enabled: - self._vad_thread.join() - - async def run_pipeline(self, pipeline: Pipeline, override_pipeline_source_queue=True): - pipeline.set_sink(self.send_queue) - if override_pipeline_source_queue: - pipeline.set_source(self.receive_queue) - await pipeline.run_pipeline() - - async def run_interruptible_pipeline( - self, - pipeline: Pipeline, - pre_processor: FrameProcessor | None = None, - post_processor: FrameProcessor | None = None, - ): - pipeline.set_sink(self.send_queue) - source_queue = asyncio.Queue() - pipeline.set_source(source_queue) - pipeline_task = asyncio.create_task(pipeline.run_pipeline()) - - async def yield_frame(frame: Frame) -> AsyncGenerator[Frame, None]: - yield frame - - async def 
post_process(post_processor: FrameProcessor): - while True: - frame = await self.completed_queue.get() - - # We ignore the output of the post_processor's process frame; - # this is called to update the post-processor's state. - async for frame in post_processor.process_frame(frame): - pass - - if isinstance(frame, EndFrame): - break - - if post_processor: - post_process_task = asyncio.create_task( - post_process(post_processor)) - - started = False - - async for frame in self.get_receive_frames(): - if isinstance(frame, UserStartedSpeakingFrame): - pipeline_task.cancel() - self.interrupt() - pipeline_task = asyncio.create_task(pipeline.run_pipeline()) - started = False - - if not started: - await self.send_queue.put(StartFrame()) - - if pre_processor: - frame_generator = pre_processor.process_frame(frame) - else: - frame_generator = yield_frame(frame) - - async for frame in frame_generator: - await source_queue.put(frame) - - if isinstance(frame, EndFrame): - break - - await asyncio.gather(pipeline_task, post_process_task) - - async def say(self, text: str, tts: TTSService): - """Say a phrase. Use with caution; this bypasses any running pipelines.""" - async for frame in tts.process_frame(TextFrame(text)): - await self.send_queue.put(frame) - - def _post_run(self): - # Note that this function must be idempotent! It can be called multiple times - # if, for example, a keyboard interrupt occurs. - pass - - def stop(self): - self._stop_threads.set() - - async def stop_when_done(self): - await self._wait_for_send_queue_to_empty() - self.stop() - - async def _wait_for_send_queue_to_empty(self): - await self.send_queue.join() - self._threadsafe_send_queue.join() - - @abstractmethod - def write_frame_to_camera(self, frame: bytes): - pass - - @abstractmethod - def write_frame_to_mic(self, frame: bytes): - pass - - @abstractmethod - def read_audio_frames(self, desired_frame_count): - return bytes() - - @abstractmethod - def _prerun(self): - pass - - def _silero_vad_analyze(self): - try: - audio_chunk = self.read_audio_frames(self._vad_samples) - audio_int16 = np.frombuffer(audio_chunk, np.int16) - audio_float32 = int2float(audio_int16) - new_confidence = self.model( - torch.from_numpy(audio_float32), 16000).item() - # yeses = int(new_confidence * 20.0) - # nos = 20 - yeses - # out = "!" * yeses + "." * nos - # print(f"!!! 
confidence: {out}") - speaking = new_confidence > 0.5 - return speaking - except BaseException: - # This comes from an empty audio array - return False - - def _vad(self): - - while not self._stop_threads.is_set(): - if hasattr(self, 'model'): # we can use Silero - speaking = self._silero_vad_analyze() - elif self._has_webrtc_vad: - speaking = self._webrtc_vad_analyze() - else: - raise Exception("VAD is running with no VAD service available") - if speaking: - match self._vad_state: - case VADState.QUIET: - self._vad_state = VADState.STARTING - self._vad_starting_count = 1 - case VADState.STARTING: - self._vad_starting_count += 1 - case VADState.STOPPING: - self._vad_state = VADState.SPEAKING - self._vad_stopping_count = 0 - else: - match self._vad_state: - case VADState.STARTING: - self._vad_state = VADState.QUIET - self._vad_starting_count = 0 - case VADState.SPEAKING: - self._vad_state = VADState.STOPPING - self._vad_stopping_count = 1 - case VADState.STOPPING: - self._vad_stopping_count += 1 - - if ( - self._vad_state == VADState.STARTING - and self._vad_starting_count >= self._vad_start_frames - ): - if self._loop: - asyncio.run_coroutine_threadsafe( - self.receive_queue.put( - UserStartedSpeakingFrame()), self._loop) - # self.interrupt() - self._vad_state = VADState.SPEAKING - self._vad_starting_count = 0 - if ( - self._vad_state == VADState.STOPPING - and self._vad_stopping_count >= self._vad_stop_frames - ): - - if self._loop: - asyncio.run_coroutine_threadsafe( - self.receive_queue.put( - UserStoppedSpeakingFrame()), self._loop) - self._vad_state = VADState.QUIET - self._vad_stopping_count = 0 - - async def _marshal_frames(self): - while True: - frame: Frame | list = await self.send_queue.get() - self._threadsafe_send_queue.put(frame) - self.send_queue.task_done() - if isinstance(frame, EndFrame): - break - - def interrupt(self): - self._logger.debug("### Interrupting") - self._is_interrupted.set() - - async def get_receive_frames(self) -> AsyncGenerator[Frame, None]: - while True: - frame = await self.receive_queue.get() - yield frame - if isinstance(frame, EndFrame): - break - - def _receive_audio(self): - if not self._loop: - self._logger.error("No loop available for audio thread") - return - - seconds = 1 - desired_frame_count = self._speaker_sample_rate * seconds - while not self._stop_threads.is_set(): - buffer = self.read_audio_frames(desired_frame_count) - if len(buffer) > 0: - frame = AudioFrame(buffer) - asyncio.run_coroutine_threadsafe( - self.receive_queue.put(frame), self._loop - ) - - asyncio.run_coroutine_threadsafe( - self.receive_queue.put( - EndFrame()), self._loop) - - def _set_image(self, image: bytes): - self._images = itertools.cycle([image]) - - def _set_images(self, images: list[bytes], start_frame=0): - self._images = itertools.cycle(images) - - def request_participant_image(self, participant_id: str): - """ Child classes should override this to force an image from a user. """ - pass - - def send_app_message(self, message: Any, participant_id: str | None): - """ Child classes should override this to send a custom message to the room. 
""" - pass - - def _run_camera(self): - try: - while not self._stop_threads.is_set(): - if self._images: - this_frame = next(self._images) - self.write_frame_to_camera(this_frame) - - time.sleep(1.0 / self._camera_framerate) - except Exception as e: - self._logger.error(f"Exception {e} in camera thread.") - raise e - - def _frame_consumer(self): - self._logger.info("🎬 Starting frame consumer thread") - b = bytearray() - smallest_write_size = 3200 - largest_write_size = 8000 - while True: - try: - frames_or_frame: Frame | list[Frame] = self._threadsafe_send_queue.get( - ) - if ( - isinstance(frames_or_frame, AudioFrame) - and len(frames_or_frame.data) > largest_write_size - ): - # subdivide large audio frames to enable interruption - frames = [] - for i in range(0, len(frames_or_frame.data), - largest_write_size): - frames.append(AudioFrame( - frames_or_frame.data[i: i + largest_write_size])) - elif isinstance(frames_or_frame, Frame): - frames: list[Frame] = [frames_or_frame] - elif isinstance(frames_or_frame, list): - frames: list[Frame] = frames_or_frame - else: - raise Exception("Unknown type in output queue") - - for frame in frames: - if isinstance(frame, EndFrame): - self._logger.info("Stopping frame consumer thread") - self._stop_threads.set() - self._threadsafe_send_queue.task_done() - if self._loop: - asyncio.run_coroutine_threadsafe( - self.completed_queue.put(frame), self._loop - ) - # Also send the EndFrame to the pipeline so it can stop - asyncio.run_coroutine_threadsafe( - self.receive_queue.put(frame), self._loop - ) - return - - # if interrupted, we just pull frames off the queue and - # discard them - if not self._is_interrupted.is_set(): - if frame: - if isinstance(frame, AudioFrame): - chunk = frame.data - - b.extend(chunk) - truncated_length: int = len(b) - ( - len(b) % smallest_write_size - ) - if truncated_length: - self.write_frame_to_mic( - bytes(b[:truncated_length])) - b = b[truncated_length:] - elif isinstance(frame, ImageFrame): - self._set_image(frame.image) - elif isinstance(frame, SpriteFrame): - self._set_images(frame.images) - elif isinstance(frame, UserImageRequestFrame): - self.request_participant_image(frame.user_id) - elif isinstance(frame, SendAppMessageFrame): - self.send_app_message(frame.message, frame.participant_id) - elif len(b): - self.write_frame_to_mic(bytes(b)) - b = bytearray() - else: - # if there are leftover audio bytes, write them now; failing to do so - # can cause static in the audio stream. 
- if len(b): - truncated_length = len(b) - (len(b) % 160) - self.write_frame_to_mic( - bytes(b[:truncated_length])) - b = bytearray() - - if isinstance(frame, StartFrame): - self._is_interrupted.clear() - asyncio.run_coroutine_threadsafe( - self.receive_queue.put(PipelineStartedFrame()), - self._loop, - ) - - if self._loop: - asyncio.run_coroutine_threadsafe( - self.completed_queue.put(frame), self._loop - ) - - self._threadsafe_send_queue.task_done() - except queue.Empty: - if len(b): - self.write_frame_to_mic(bytes(b)) - - b = bytearray() - except Exception as e: - self._logger.error( - f"Exception in frame_consumer: {e}, {len(b)}") - raise e diff --git a/src/dailyai/transports/websocket_transport.py b/src/dailyai/transports/websocket_transport.py deleted file mode 100644 index f5009a02e..000000000 --- a/src/dailyai/transports/websocket_transport.py +++ /dev/null @@ -1,125 +0,0 @@ -import asyncio -import time -from typing import AsyncGenerator, List - -from dailyai.pipeline.frame_processor import FrameProcessor -from dailyai.pipeline.frames import AudioFrame, ControlFrame, EndFrame, Frame, TTSEndFrame, TTSStartFrame, TextFrame -from dailyai.pipeline.pipeline import Pipeline -from dailyai.serializers.protobuf_serializer import ProtobufFrameSerializer -from dailyai.transports.abstract_transport import AbstractTransport -from dailyai.transports.threaded_transport import ThreadedTransport - -try: - import websockets -except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use the websocket transport, you need to `pip install dailyai[websocket]`.") - raise Exception(f"Missing module: {e}") - - -class WebSocketFrameProcessor(FrameProcessor): - """This FrameProcessor filters and mutates frames before they're sent over the websocket. 
- This is necessary to aggregate audio frames into sizes that are cleanly playable by the client""" - - def __init__( - self, - audio_frame_size: int | None = None, - sendable_frames: List[Frame] | None = None): - super().__init__() - if not audio_frame_size: - raise ValueError("audio_frame_size must be provided") - - self._audio_frame_size = audio_frame_size - self._sendable_frames = sendable_frames or [TextFrame, AudioFrame] - self._audio_buffer = bytes() - self._in_tts_audio = False - - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, TTSStartFrame): - self._in_tts_audio = True - elif isinstance(frame, AudioFrame): - if self._in_tts_audio: - self._audio_buffer += frame.data - while len(self._audio_buffer) >= self._audio_frame_size: - yield AudioFrame(self._audio_buffer[:self._audio_frame_size]) - self._audio_buffer = self._audio_buffer[self._audio_frame_size:] - elif isinstance(frame, TTSEndFrame): - self._in_tts_audio = False - if self._audio_buffer: - yield AudioFrame(self._audio_buffer) - self._audio_buffer = bytes() - elif type(frame) in self._sendable_frames or isinstance(frame, ControlFrame): - yield frame - - -class WebsocketTransport(AbstractTransport): - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._sample_width = kwargs.get("sample_width", 2) - self._n_channels = kwargs.get("n_channels", 1) - self._port = kwargs.get("port", 8765) - self._host = kwargs.get("host", "localhost") - self._audio_frame_size = kwargs.get("audio_frame_size", 16000) - self._sendable_frames = kwargs.get( - "sendable_frames", [ - TextFrame, AudioFrame, TTSEndFrame, TTSStartFrame]) - self._serializer = kwargs.get("serializer", ProtobufFrameSerializer()) - - self._server: websockets.WebSocketServer | None = None - self._websocket: websockets.WebSocketServerProtocol | None = None - - self._connection_handlers = [] - - async def run(self, pipeline: Pipeline, override_pipeline_source_queue=True): - self._stop_server_event = asyncio.Event() - pipeline.set_sink(self.send_queue) - if override_pipeline_source_queue: - pipeline.set_source(self.receive_queue) - - pipeline.add_processor(WebSocketFrameProcessor( - audio_frame_size=self._audio_frame_size, - sendable_frames=self._sendable_frames)) - - async def timeout(): - sleep_time = self._expiration - time.time() - await asyncio.sleep(sleep_time) - self._stop_server_event.set() - - async def send_task(): - while not self._stop_server_event.is_set(): - frame = await self.send_queue.get() - if isinstance(frame, EndFrame): - self._stop_server_event.set() - break - if self._websocket and frame: - proto = self._serializer.serialize(frame) - await self._websocket.send(proto) - - async def start_server(): - async with websockets.serve(self._websocket_handler, self._host, self._port) as server: - self._logger.debug("Websocket server started.") - await self._stop_server_event.wait() - self._logger.debug("Websocket server stopped.") - await self.receive_queue.put(EndFrame()) - - timeout_task = asyncio.create_task(timeout()) - await asyncio.gather(start_server(), send_task(), pipeline.run_pipeline()) - timeout_task.cancel() - - def on_connection(self, handler): - self._connection_handlers.append(handler) - - async def _websocket_handler(self, websocket: websockets.WebSocketServerProtocol, path): - if self._websocket: - await self._websocket.close() - self._logger.warning( - "Got another websocket connection; closing first.") - - for handler in self._connection_handlers: - await handler() - - self._websocket 
= websocket - async for message in websocket: - frame = self._serializer.deserialize(message) - await self.receive_queue.put(frame) diff --git a/src/dailyai/__init__.py b/src/pipecat/__init__.py similarity index 100% rename from src/dailyai/__init__.py rename to src/pipecat/__init__.py diff --git a/src/dailyai/pipeline/__init__.py b/src/pipecat/frames/__init__.py similarity index 100% rename from src/dailyai/pipeline/__init__.py rename to src/pipecat/frames/__init__.py diff --git a/src/dailyai/pipeline/frames.proto b/src/pipecat/frames/frames.proto similarity index 76% rename from src/dailyai/pipeline/frames.proto rename to src/pipecat/frames/frames.proto index b19fbccbf..18e59e492 100644 --- a/src/dailyai/pipeline/frames.proto +++ b/src/pipecat/frames/frames.proto @@ -1,6 +1,12 @@ +// +// Copyright (c) 2024, Daily +// +// SPDX-License-Identifier: BSD 2-Clause License +// + syntax = "proto3"; -package dailyai_proto; +package pipecat_proto; message TextFrame { string text = 1; diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py new file mode 100644 index 000000000..223a35746 --- /dev/null +++ b/src/pipecat/frames/frames.py @@ -0,0 +1,467 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from typing import Any, List + +from pipecat.utils.utils import obj_count, obj_id + + +class Frame: + def __init__(self, data=None): + self.id: int = obj_id() + self.data: Any = data + self.metadata = {} + self.name: str = f"{self.__class__.__name__}#{obj_count(self)}" + + def __str__(self): + return self.name + + +class DataFrame(Frame): + def __init__(self, data): + super().__init__(data) + + +class AudioRawFrame(DataFrame): + def __init__(self, data, sample_rate: int, num_channels: int): + super().__init__(data) + self.metadata["sample_rate"] = sample_rate + self.metadata["num_channels"] = num_channels + self.metadata["num_frames"] = int(len(data) / (num_channels * 2)) + + @property + def num_frames(self) -> int: + return self.metadata["num_frames"] + + @property + def sample_rate(self) -> int: + return self.metadata["sample_rate"] + + @property + def num_channels(self) -> int: + return self.metadata["num_channels"] + + def __str__(self): + return f"{self.name}(frames: {self.num_frames}, sample_rate: {self.sample_rate}, channels: {self.num_channels})" + + +class ImageRawFrame(DataFrame): + def __init__(self, data, size: tuple[int, int], format: str): + super().__init__(data) + self.metadata["size"] = size + self.metadata["format"] = format + + @property + def image(self) -> bytes: + return self.data + + @property + def size(self) -> tuple[int, int]: + return self.metadata["size"] + + @property + def format(self) -> str: + return self.metadata["format"] + + def __str__(self): + return f"{self.name}(size: {self.size}, format: {self.format})" + + +class URLImageRawFrame(ImageRawFrame): + def __init__(self, url: str, data, size: tuple[int, int], format: str): + super().__init__(data, size, format) + self.metadata["url"] = url + + @property + def url(self) -> str: + return self.metadata["url"] + + def __str__(self): + return f"{self.name}(url: {self.url}, size: {self.size}, format: {self.format})" + + +class VisionImageRawFrame(ImageRawFrame): + def __init__(self, text: str, data, size: tuple[int, int], format: str): + super().__init__(data, size, format) + self.metadata["text"] = text + + @property + def text(self) -> str: + return self.metadata["text"] + + def __str__(self): + return f"{self.name}(text: {self.text}, size: {self.size}, format: 
{self.format})" + + +class UserImageRawFrame(ImageRawFrame): + def __init__(self, user_id: str, data, size: tuple[int, int], format: str): + super().__init__(data, size, format) + self.metadata["user_id"] = user_id + + @property + def user_id(self) -> str: + return self.metadata["user_id"] + + def __str__(self): + return f"{self.name}(user: {self.user_id}, size: {self.size}, format: {self.format})" + + +class SpriteFrame(Frame): + def __init__(self, data): + super().__init__(data) + + @property + def images(self) -> List[ImageRawFrame]: + return self.data + + def __str__(self): + return f"{self.name}(size: {len(self.images)})" + + +class TextFrame(DataFrame): + def __init__(self, data): + super().__init__(data) + + @property + def text(self) -> str: + return self.data + + +class TranscriptionFrame(TextFrame): + def __init__(self, data, user_id: str, timestamp: int): + super().__init__(data) + self.metadata["user_id"] = user_id + self.metadata["timestamp"] = timestamp + + @property + def user_id(self) -> str: + return self.metadata["user_id"] + + @property + def timestamp(self) -> str: + return self.metadata["timestamp"] + + def __str__(self): + return f"{self.name}(user: {self.user_id}, timestamp: {self.timestamp})" + + +class InterimTranscriptionFrame(TextFrame): + def __init__(self, data, user_id: str, timestamp: int): + super().__init__(data) + self.metadata["user_id"] = user_id + self.metadata["timestamp"] = timestamp + + @property + def user_id(self) -> str: + return self.metadata["user_id"] + + @property + def timestamp(self) -> str: + return self.metadata["timestamp"] + + def __str__(self): + return f"{self.name}(user: {self.user_id}, timestamp: {self.timestamp})" + + +class LLMMessagesFrame(DataFrame): + """A frame containing a list of LLM messages. Used to signal that an LLM + service should run a chat completion and emit an LLM started response event, + text frames and an LLM stopped response event. + """ + + def __init__(self, messages): + super().__init__(messages) + +# +# App frames. Application user-defined frames. +# + + +class AppFrame(Frame): + def __init__(self, data=None): + super().__init__(data) + + +# +# System frames +# + +class SystemFrame(Frame): + def __init__(self, data=None): + super().__init__(data) + + +class StartFrame(SystemFrame): + def __init__(self): + super().__init__() + + +class CancelFrame(SystemFrame): + def __init__(self): + super().__init__() + + +class ErrorFrame(SystemFrame): + def __init__(self, data): + super().__init__(data) + self.metadata["error"] = data + + @property + def error(self) -> str: + return self.metadata["error"] + + def __str__(self): + return f"{self.name}(error: {self.error})" + +# +# Control frames +# + + +class ControlFrame(Frame): + def __init__(self, data=None): + super().__init__(data) + + +class EndFrame(ControlFrame): + def __init__(self): + super().__init__() + + +class LLMResponseStartFrame(ControlFrame): + """Used to indicate the beginning of an LLM response. 
Following TextFrames + are part of the LLM response until an LLMResponseEndFrame""" + + def __init__(self): + super().__init__() + + +class LLMResponseEndFrame(ControlFrame): + """Indicates the end of an LLM response.""" + + def __init__(self): + super().__init__() + + +class UserStartedSpeakingFrame(ControlFrame): + def __init__(self): + super().__init__() + + +class UserStoppedSpeakingFrame(ControlFrame): + def __init__(self): + super().__init__() + + +class TTSStartedFrame(ControlFrame): + def __init__(self): + super().__init__() + + +class TTSStoppedFrame(ControlFrame): + def __init__(self): + super().__init__() + + +class UserImageRequestFrame(ControlFrame): + def __init__(self, user_id): + super().__init__() + self.metadata["user_id"] = user_id + + @property + def user_id(self) -> str: + return self.metadata["user_id"] + + def __str__(self): + return f"{self.name}, user: {self.user_id}" + + +# class StartFrame(ControlFrame): +# """Used (but not required) to start a pipeline, and is also used to +# indicate that an interruption has ended and the transport should start +# processing frames again.""" +# pass + + +# class EndFrame(ControlFrame): +# """Indicates that a pipeline has ended and frame processors and pipelines +# should be shut down. If the transport receives this frame, it will stop +# sending frames to its output channel(s) and close all its threads.""" +# pass + + +# class EndPipeFrame(ControlFrame): +# """Indicates that a pipeline has ended but that the transport should +# continue processing. This frame is used in parallel pipelines and other +# sub-pipelines.""" +# pass + + +# class PipelineStartedFrame(ControlFrame): +# """ +# Used by the transport to indicate that execution of a pipeline is starting +# (or restarting). It should be the first frame your app receives when it +# starts, or when an interruptible pipeline has been interrupted. +# """ + +# pass + + +# @dataclass() +# class URLImageFrame(ImageFrame): +# """An image with an associated URL. Will be shown by the transport if the +# transport's camera is enabled. + +# """ +# url: str | None + +# def __init__(self, url, image, size): +# super().__init__(image, size) +# self.url = url + +# def __str__(self): +# return f"{self.__class__.__name__}, url: {self.url}, image size: +# {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B" + + +# @dataclass() +# class VisionImageFrame(ImageFrame): +# """An image with an associated text to ask for a description of it. Will be shown by the +# transport if the transport's camera is enabled. + +# """ +# text: str | None + +# def __init__(self, text, image, size): +# super().__init__(image, size) +# self.text = text + +# def __str__(self): +# return f"{self.__class__.__name__}, text: {self.text}, image size: +# {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B" + + +# @dataclass() +# class UserImageFrame(ImageFrame): +# """An image associated to a user. 
Will be shown by the transport if the transport's camera is +# enabled.""" +# user_id: str + +# def __init__(self, user_id, image, size): +# super().__init__(image, size) +# self.user_id = user_id + +# def __str__(self): +# return f"{self.__class__.__name__}, user: {self.user_id}, image size: +# {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B" + + +# @dataclass() +# class UserImageRequestFrame(Frame): +# """A frame user to request an image from the given user.""" +# user_id: str + +# def __str__(self): +# return f"{self.__class__.__name__}, user: {self.user_id}" + + +# @dataclass() +# class SpriteFrame(Frame): +# """An animated sprite. Will be shown by the transport if the transport's +# camera is enabled. Will play at the framerate specified in the transport's +# `fps` constructor parameter.""" +# images: list[bytes] + +# def __str__(self): +# return f"{self.__class__.__name__}, list size: {len(self.images)}" + + +# @dataclass() +# class TextFrame(Frame): +# """A chunk of text. Emitted by LLM services, consumed by TTS services, can +# be used to send text through pipelines.""" +# text: str + +# def __str__(self): +# return f'{self.__class__.__name__}: "{self.text}"' + + +# class TTSStartFrame(ControlFrame): +# """Used to indicate the beginning of a TTS response. Following AudioFrames +# are part of the TTS response until an TTEndFrame. These frames can be used +# for aggregating audio frames in a transport to optimize the size of frames +# sent to the session, without needing to control this in the TTS service.""" +# pass + + +# class TTSEndFrame(ControlFrame): +# """Indicates the end of a TTS response.""" +# pass + + +# @dataclass() +# class LLMMessagesFrame(Frame): +# """A frame containing a list of LLM messages. Used to signal that an LLM +# service should run a chat completion and emit an LLMStartFrames, TextFrames +# and an LLMEndFrame. +# Note that the messages property on this class is mutable, and will be +# be updated by various ResponseAggregator frame processors.""" +# messages: List[dict] + + +# @dataclass() +# class ReceivedAppMessageFrame(Frame): +# message: Any +# sender: str + +# def __str__(self): +# return f"ReceivedAppMessageFrame: sender: {self.sender}, message: {self.message}" + + +# @dataclass() +# class SendAppMessageFrame(Frame): +# message: Any +# participant_id: str | None + +# def __str__(self): +# return f"SendAppMessageFrame: participant: {self.participant_id}, message: {self.message}" + + +# class UserStartedSpeakingFrame(Frame): +# """Emitted by VAD to indicate that a participant has started speaking. +# This can be used for interruptions or other times when detecting that +# someone is speaking is more important than knowing what they're saying +# (as you will with a TranscriptionFrame)""" +# pass + + +# class UserStoppedSpeakingFrame(Frame): +# """Emitted by the VAD to indicate that a user stopped speaking.""" +# pass + + +# class BotStartedSpeakingFrame(Frame): +# pass + + +# class BotStoppedSpeakingFrame(Frame): +# pass + + +# @dataclass() +# class LLMFunctionStartFrame(Frame): +# """Emitted when the LLM receives the beginning of a function call +# completion. 
A frame processor can use this frame to indicate that it should +# start preparing to make a function call, if it can do so in the absence of +# any arguments.""" +# function_name: str + + +# @dataclass() +# class LLMFunctionCallFrame(Frame): +# """Emitted when the LLM has received an entire function call completion.""" +# function_name: str +# arguments: str diff --git a/src/pipecat/frames/openai_frames.py b/src/pipecat/frames/openai_frames.py new file mode 100644 index 000000000..e5e47b222 --- /dev/null +++ b/src/pipecat/frames/openai_frames.py @@ -0,0 +1,15 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from pipecat.frames.frames import Frame + + +class OpenAILLMContextFrame(Frame): + """Like an LLMMessagesFrame, but with extra context specific to the + OpenAI API.""" + + def __init__(self, data): + super().__init__(data) diff --git a/src/dailyai/pipeline/protobufs/frames_pb2.py b/src/pipecat/frames/protobufs/frames_pb2.py similarity index 85% rename from src/dailyai/pipeline/protobufs/frames_pb2.py rename to src/pipecat/frames/protobufs/frames_pb2.py index ce71723d3..bdc34d385 100644 --- a/src/dailyai/pipeline/protobufs/frames_pb2.py +++ b/src/pipecat/frames/protobufs/frames_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x66rames.proto\x12\rdailyai_proto\"\x19\n\tTextFrame\x12\x0c\n\x04text\x18\x01 \x01(\t\"\x1a\n\nAudioFrame\x12\x0c\n\x04\x64\x61ta\x18\x01 \x01(\x0c\"L\n\x12TranscriptionFrame\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x15\n\rparticipantId\x18\x02 \x01(\t\x12\x11\n\ttimestamp\x18\x03 \x01(\t\"\xa2\x01\n\x05\x46rame\x12(\n\x04text\x18\x01 \x01(\x0b\x32\x18.dailyai_proto.TextFrameH\x00\x12*\n\x05\x61udio\x18\x02 \x01(\x0b\x32\x19.dailyai_proto.AudioFrameH\x00\x12:\n\rtranscription\x18\x03 \x01(\x0b\x32!.dailyai_proto.TranscriptionFrameH\x00\x42\x07\n\x05\x66rameb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x66rames.proto\x12\rpipecat_proto\"\x19\n\tTextFrame\x12\x0c\n\x04text\x18\x01 \x01(\t\"\x1a\n\nAudioFrame\x12\x0c\n\x04\x64\x61ta\x18\x01 \x01(\x0c\"L\n\x12TranscriptionFrame\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x15\n\rparticipantId\x18\x02 \x01(\t\x12\x11\n\ttimestamp\x18\x03 \x01(\t\"\xa2\x01\n\x05\x46rame\x12(\n\x04text\x18\x01 \x01(\x0b\x32\x18.pipecat_proto.TextFrameH\x00\x12*\n\x05\x61udio\x18\x02 \x01(\x0b\x32\x19.pipecat_proto.AudioFrameH\x00\x12:\n\rtranscription\x18\x03 \x01(\x0b\x32!.pipecat_proto.TranscriptionFrameH\x00\x42\x07\n\x05\x66rameb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) diff --git a/src/dailyai/services/__init__.py b/src/pipecat/pipeline/__init__.py similarity index 100% rename from src/dailyai/services/__init__.py rename to src/pipecat/pipeline/__init__.py diff --git a/src/dailyai/pipeline/merge_pipeline.py b/src/pipecat/pipeline/merge_pipeline.py similarity index 87% rename from src/dailyai/pipeline/merge_pipeline.py rename to src/pipecat/pipeline/merge_pipeline.py index 3d390d22d..019db55e1 100644 --- a/src/dailyai/pipeline/merge_pipeline.py +++ b/src/pipecat/pipeline/merge_pipeline.py @@ -1,6 +1,6 @@ from typing import List -from dailyai.pipeline.frames import EndFrame, EndPipeFrame -from dailyai.pipeline.pipeline import Pipeline +from pipecat.pipeline.frames import EndFrame, EndPipeFrame +from pipecat.pipeline.pipeline import Pipeline class SequentialMergePipeline(Pipeline): diff --git a/src/pipecat/pipeline/parallel_pipeline.py 
b/src/pipecat/pipeline/parallel_pipeline.py new file mode 100644 index 000000000..5dd840281 --- /dev/null +++ b/src/pipecat/pipeline/parallel_pipeline.py @@ -0,0 +1,137 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio + +from pipecat.pipeline.pipeline import Pipeline +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.frames.frames import CancelFrame, EndFrame, Frame, StartFrame + +from loguru import logger + + +class Source(FrameProcessor): + + def __init__(self, upstream_queue: asyncio.Queue): + super().__init__() + self._up_queue = upstream_queue + + async def process_frame(self, frame: Frame, direction: FrameDirection): + match direction: + case FrameDirection.UPSTREAM: + await self._up_queue.put(frame) + case FrameDirection.DOWNSTREAM: + await self.push_frame(frame, direction) + + +class Sink(FrameProcessor): + + def __init__(self, downstream_queue: asyncio.Queue): + super().__init__() + self._down_queue = downstream_queue + + async def process_frame(self, frame: Frame, direction: FrameDirection): + match direction: + case FrameDirection.UPSTREAM: + await self.push_frame(frame, direction) + case FrameDirection.DOWNSTREAM: + await self._down_queue.put(frame) + + +class ParallelPipeline(FrameProcessor): + def __init__(self, *args): + super().__init__() + + if len(args) == 0: + raise Exception(f"ParallelPipeline needs at least one argument") + + self._sources = [] + self._sinks = [] + + self._up_queue = asyncio.Queue() + self._down_queue = asyncio.Queue() + self._up_task: asyncio.Task | None = None + self._down_task: asyncio.Task | None = None + + self._pipelines = [] + + logger.debug(f"Creating {self} pipelines") + for processors in args: + if not isinstance(processors, list): + raise TypeError(f"ParallelPipeline argument {processors} is not a list") + + # We add a source at before the pipeline and a sink after. + source = Source(self._up_queue) + sink = Sink(self._down_queue) + self._sources.append(source) + self._sinks.append(sink) + + # Create pipeline + pipeline = Pipeline(processors) + source.link(pipeline) + pipeline.link(sink) + self._pipelines.append(pipeline) + + logger.debug(f"Finished creating {self} pipelines") + + # + # Frame processor + # + + async def cleanup(self): + await asyncio.gather(*[p.cleanup() for p in self._pipelines]) + + async def _start_tasks(self): + loop = self.get_event_loop() + self._up_task = loop.create_task(self._process_up_queue()) + self._down_task = loop.create_task(self._process_down_queue()) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, StartFrame): + await self._start_tasks() + + if direction == FrameDirection.UPSTREAM: + # If we get an upstream frame we process it in each sink. + await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sinks]) + elif direction == FrameDirection.DOWNSTREAM: + # If we get a downstream frame we process it in each source. + # TODO(aleix): We are creating task for each frame. For real-time + # video/audio this might be too slow. We should use an already + # created task instead. + await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sources]) + + # If we get an EndFrame we stop our queue processing tasks and wait on + # all the pipelines to finish. + if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame): + # Use None to indicate when queues should be done processing. 
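+                # The queue-processing loops below treat None as a sentinel:
+                # they keep forwarding frames while the dequeued item is not
+                # None, so putting None on both queues lets them drain and
+                # exit before we await their tasks. The seen_ids sets in those
+                # loops de-duplicate frames that reach the shared queues from
+                # more than one parallel branch.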
+ await self._up_queue.put(None) + await self._down_queue.put(None) + if self._up_task: + await self._up_task + if self._down_task: + await self._down_task + + async def _process_up_queue(self): + running = True + seen_ids = set() + while running: + frame = await self._up_queue.get() + if frame and frame.id not in seen_ids: + await self.push_frame(frame, FrameDirection.UPSTREAM) + seen_ids.add(frame.id) + running = frame is not None + self._up_queue.task_done() + + async def _process_down_queue(self): + running = True + seen_ids = set() + while running: + frame = await self._down_queue.get() + if frame and frame.id not in seen_ids: + await self.push_frame(frame, FrameDirection.DOWNSTREAM) + seen_ids.add(frame.id) + running = frame is not None + self._down_queue.task_done() diff --git a/src/pipecat/pipeline/pipeline.py b/src/pipecat/pipeline/pipeline.py new file mode 100644 index 000000000..5d1f9d3cf --- /dev/null +++ b/src/pipecat/pipeline/pipeline.py @@ -0,0 +1,76 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio + +from typing import Callable, Coroutine, List + +from pipecat.frames.frames import Frame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class PipelineSource(FrameProcessor): + + def __init__(self, upstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]): + super().__init__() + self._upstream_push_frame = upstream_push_frame + + async def process_frame(self, frame: Frame, direction: FrameDirection): + match direction: + case FrameDirection.UPSTREAM: + await self._upstream_push_frame(frame, direction) + case FrameDirection.DOWNSTREAM: + await self.push_frame(frame, direction) + + +class PipelineSink(FrameProcessor): + + def __init__(self, downstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]): + super().__init__() + self._downstream_push_frame = downstream_push_frame + + async def process_frame(self, frame: Frame, direction: FrameDirection): + match direction: + case FrameDirection.UPSTREAM: + await self.push_frame(frame, direction) + case FrameDirection.DOWNSTREAM: + await self._downstream_push_frame(frame, direction) + + +class Pipeline(FrameProcessor): + + def __init__(self, processors: List[FrameProcessor]): + super().__init__() + + # Add a source and a sink queue so we can forward frames upstream and + # downstream outside of the pipeline. 
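+        # PipelineSource hands upstream frames to our own push_frame() so they
+        # continue toward our parent, and PipelineSink does the same for
+        # downstream frames. Wrapping the given processors between the two
+        # lets a Pipeline be linked anywhere a single FrameProcessor can be,
+        # including inside a ParallelPipeline.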
+ self._source = PipelineSource(self.push_frame) + self._sink = PipelineSink(self.push_frame) + self._processors: List[FrameProcessor] = [self._source] + processors + [self._sink] + + self._link_processors() + + # + # Frame processor + # + + async def cleanup(self): + await self._cleanup_processors() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if direction == FrameDirection.DOWNSTREAM: + await self._source.process_frame(frame, FrameDirection.DOWNSTREAM) + elif direction == FrameDirection.UPSTREAM: + await self._sink.process_frame(frame, FrameDirection.UPSTREAM) + + async def _cleanup_processors(self): + await asyncio.gather(*[p.cleanup() for p in self._processors]) + + def _link_processors(self): + prev = self._processors[0] + for curr in self._processors[1:]: + prev.link(curr) + prev = curr diff --git a/src/pipecat/pipeline/runner.py b/src/pipecat/pipeline/runner.py new file mode 100644 index 000000000..bb10280b7 --- /dev/null +++ b/src/pipecat/pipeline/runner.py @@ -0,0 +1,60 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import signal + +from pipecat.pipeline.task import PipelineTask +from pipecat.utils.utils import obj_count, obj_id + +from loguru import logger + + +class PipelineRunner: + + def __init__(self, name: str | None = None, handle_sigint: bool = True): + self.id: int = obj_id() + self.name: str = name or f"{self.__class__.__name__}#{obj_count(self)}" + self._loop: asyncio.AbstractEventLoop = asyncio.get_running_loop() + + self._tasks = {} + self._running = True + + if handle_sigint: + self._setup_sigint() + + async def run(self, task: PipelineTask): + logger.debug(f"Runner {self} started running {task}") + self._running = True + self._tasks[task.name] = task + await task.run() + del self._tasks[task.name] + self._running = False + logger.debug(f"Runner {self} finished running {task}") + + async def stop_when_done(self): + logger.debug(f"Runner {self} scheduled to stop when all tasks are done") + await asyncio.gather(*[t.stop_when_done() for t in self._tasks.values()]) + + async def cancel(self): + logger.debug(f"Canceling runner {self}") + await asyncio.gather(*[t.cancel() for t in self._tasks.values()]) + + def is_active(self): + return self._running + + def _setup_sigint(self): + self._loop.add_signal_handler( + signal.SIGINT, + lambda *args: asyncio.create_task(self._sigint_handler()) + ) + + async def _sigint_handler(self): + logger.warning(f"Ctrl-C detected. 
Canceling runner {self}") + await self.cancel() + + def __str__(self): + return self.name diff --git a/src/pipecat/pipeline/task.py b/src/pipecat/pipeline/task.py new file mode 100644 index 000000000..f13f82620 --- /dev/null +++ b/src/pipecat/pipeline/task.py @@ -0,0 +1,93 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio + +from typing import AsyncIterable, Iterable + +from pipecat.frames.frames import CancelFrame, EndFrame, ErrorFrame, Frame, StartFrame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.utils.utils import obj_count, obj_id + +from loguru import logger + + +class Source(FrameProcessor): + + def __init__(self, up_queue: asyncio.Queue): + super().__init__() + self._up_queue = up_queue + + async def process_frame(self, frame: Frame, direction: FrameDirection): + match direction: + case FrameDirection.UPSTREAM: + await self._up_queue.put(frame) + case FrameDirection.DOWNSTREAM: + await self.push_frame(frame, direction) + + +class PipelineTask: + + def __init__(self, pipeline: FrameProcessor): + self.id: int = obj_id() + self.name: str = f"{self.__class__.__name__}#{obj_count(self)}" + + self._pipeline = pipeline + + self._task_queue = asyncio.Queue() + self._up_queue = asyncio.Queue() + + self._source = Source(self._up_queue) + self._source.link(pipeline) + + async def stop_when_done(self): + logger.debug(f"Task {self} scheduled to stop when done") + await self.queue_frame(EndFrame()) + + async def cancel(self): + logger.debug(f"Canceling pipeline task {self}") + await self.queue_frame(CancelFrame()) + + async def run(self): + await asyncio.gather(self._process_task_queue(), self._process_up_queue()) + + async def queue_frame(self, frame: Frame): + await self._task_queue.put(frame) + + async def queue_frames(self, frames: Iterable[Frame] | AsyncIterable[Frame]): + if isinstance(frames, AsyncIterable): + async for frame in frames: + await self.queue_frame(frame) + elif isinstance(frames, Iterable): + for frame in frames: + await self.queue_frame(frame) + else: + raise Exception("Frames must be an iterable or async iterable") + + async def _process_task_queue(self): + await self._source.process_frame(StartFrame(), FrameDirection.DOWNSTREAM) + running = True + while running: + frame = await self._task_queue.get() + await self._source.process_frame(frame, FrameDirection.DOWNSTREAM) + self._task_queue.task_done() + running = not (isinstance(frame, CancelFrame) or isinstance(frame, EndFrame)) + # We just enqueue None to terminate the task. 
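+        # _process_up_queue() below runs concurrently with this coroutine (see
+        # run()) and exits once it dequeues this None; until then it watches
+        # for ErrorFrames coming back upstream from the pipeline and converts
+        # them into a CancelFrame on the task queue.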
+ await self._up_queue.put(None) + + async def _process_up_queue(self): + running = True + while running: + frame = await self._up_queue.get() + if frame: + if isinstance(frame, ErrorFrame): + logger.error(f"Error running app: {frame.error}") + await self.queue_frame(CancelFrame()) + self._up_queue.task_done() + running = frame is not None + + def __str__(self): + return self.name diff --git a/src/dailyai/services/to_be_updated/__init__.py b/src/pipecat/processors/__init__.py similarity index 100% rename from src/dailyai/services/to_be_updated/__init__.py rename to src/pipecat/processors/__init__.py diff --git a/src/dailyai/storage/__init__.py b/src/pipecat/processors/aggregators/__init__.py similarity index 100% rename from src/dailyai/storage/__init__.py rename to src/pipecat/processors/aggregators/__init__.py diff --git a/src/pipecat/processors/aggregators/gated.py b/src/pipecat/processors/aggregators/gated.py new file mode 100644 index 000000000..3c80e4641 --- /dev/null +++ b/src/pipecat/processors/aggregators/gated.py @@ -0,0 +1,72 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from typing import List + +from pipecat.frames.frames import Frame, SystemFrame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + +from loguru import logger + + +class GatedAggregator(FrameProcessor): + """Accumulate frames, with custom functions to start and stop accumulation. + Yields gate-opening frame before any accumulated frames, then ensuing frames + until and not including the gate-closed frame. + + >>> from pipecat.pipeline.frames import ImageFrame + + >>> async def print_frames(aggregator, frame): + ... async for frame in aggregator.process_frame(frame): + ... if isinstance(frame, TextFrame): + ... print(frame.text) + ... else: + ... print(frame.__class__.__name__) + + >>> aggregator = GatedAggregator( + ... gate_close_fn=lambda x: isinstance(x, LLMResponseStartFrame), + ... gate_open_fn=lambda x: isinstance(x, ImageFrame), + ... start_open=False) + >>> asyncio.run(print_frames(aggregator, TextFrame("Hello"))) + >>> asyncio.run(print_frames(aggregator, TextFrame("Hello again."))) + >>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0)))) + ImageFrame + Hello + Hello again. + >>> asyncio.run(print_frames(aggregator, TextFrame("Goodbye."))) + Goodbye. + """ + + def __init__(self, gate_open_fn, gate_close_fn, start_open): + super().__init__() + self._gate_open_fn = gate_open_fn + self._gate_close_fn = gate_close_fn + self._gate_open = start_open + self._accumulator: List[Frame] = [] + + async def process_frame(self, frame: Frame, direction: FrameDirection): + # We must not block system frames. 
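+        # System frames (e.g. StartFrame, CancelFrame, ErrorFrame) control the
+        # pipeline itself, so they bypass the gate and are never held back in
+        # the accumulator.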
+ if isinstance(frame, SystemFrame): + await self.push_frame(frame, direction) + return + + old_state = self._gate_open + if self._gate_open: + self._gate_open = not self._gate_close_fn(frame) + else: + self._gate_open = self._gate_open_fn(frame) + + if old_state != self._gate_open: + state = "open" if self._gate_open else "closed" + logger.debug(f"Gate is now {state} because of {frame}") + + if self._gate_open: + await self.push_frame(frame, direction) + for frame in self._accumulator: + await self.push_frame(frame, direction) + self._accumulator = [] + else: + self._accumulator.append(frame) diff --git a/src/pipecat/processors/aggregators/llm_context.py b/src/pipecat/processors/aggregators/llm_context.py new file mode 100644 index 000000000..06e91b8c3 --- /dev/null +++ b/src/pipecat/processors/aggregators/llm_context.py @@ -0,0 +1,82 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from pipecat.frames.frames import Frame, InterimTranscriptionFrame, LLMMessagesFrame, TextFrame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class LLMContextAggregator(FrameProcessor): + def __init__( + self, + messages: list[dict], + role: str, + complete_sentences=True, + pass_through=True, + ): + super().__init__() + self._messages = messages + self._role = role + self._sentence = "" + self._complete_sentences = complete_sentences + self._pass_through = pass_through + + async def process_frame(self, frame: Frame, direction: FrameDirection): + # We don't do anything with non-text frames, pass it along to next in + # the pipeline. + if not isinstance(frame, TextFrame): + await self.push_frame(frame, direction) + return + + # If we get interim results, we ignore them. + if isinstance(frame, InterimTranscriptionFrame): + return + + # The common case for "pass through" is receiving frames from the LLM that we'll + # use to update the "assistant" LLM messages, but also passing the text frames + # along to a TTS service to be spoken to the user. 
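+        # Below, text is either buffered until a sentence-ending ".", "?" or
+        # "!" is seen (complete_sentences=True) or appended to the message
+        # list frame by frame; either way an updated LLMMessagesFrame is
+        # pushed downstream so an LLM service can run a new completion.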
+ if self._pass_through: + await self.push_frame(frame, direction) + + # TODO: split up transcription by participant + if self._complete_sentences: + # type: ignore -- the linter thinks this isn't a TextFrame, even + # though we check it above + self._sentence += frame.text + if self._sentence.endswith((".", "?", "!")): + self._messages.append( + {"role": self._role, "content": self._sentence}) + self._sentence = "" + await self.push_frame(LLMMessagesFrame(self._messages)) + else: + # type: ignore -- the linter thinks this isn't a TextFrame, even + # though we check it above + self._messages.append({"role": self._role, "content": frame.text}) + await self.push_frame(LLMMessagesFrame(self._messages)) + + +class LLMUserContextAggregator(LLMContextAggregator): + def __init__( + self, + messages: list[dict], + complete_sentences=True): + super().__init__( + messages, + "user", + complete_sentences, + pass_through=False) + + +class LLMAssistantContextAggregator(LLMContextAggregator): + def __init__( + self, + messages: list[dict], + complete_sentences=True): + super().__init__( + messages, + "assistant", + complete_sentences, + pass_through=True, + ) diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py new file mode 100644 index 000000000..9dc0e8862 --- /dev/null +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -0,0 +1,190 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.frames.frames import ( + Frame, + InterimTranscriptionFrame, + LLMMessagesFrame, + LLMResponseStartFrame, + TextFrame, + LLMResponseEndFrame, + TranscriptionFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame) + + +class LLMResponseAggregator(FrameProcessor): + + def __init__( + self, + *, + messages: list[dict] | None, + role: str, + start_frame, + end_frame, + accumulator_frame, + interim_accumulator_frame=None + ): + super().__init__() + + self._messages = messages + self._role = role + self._start_frame = start_frame + self._end_frame = end_frame + self._accumulator_frame = accumulator_frame + self._interim_accumulator_frame = interim_accumulator_frame + self._seen_start_frame = False + self._seen_end_frame = False + self._seen_interim_results = False + + self._aggregation = "" + self._aggregating = False + + # + # Frame processor + # + + # Use cases implemented: + # + # S: Start, E: End, T: Transcription, I: Interim, X: Text + # + # S E -> None + # S T E -> X + # S I T E -> X + # S I E T -> X + # S I E I T -> X + # + # The following case would not be supported: + # + # S I E T1 I T2 -> X + # + # and T2 would be dropped. + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if not self._messages: + return + + send_aggregation = False + + if isinstance(frame, self._start_frame): + self._seen_start_frame = True + self._aggregating = True + elif isinstance(frame, self._end_frame): + self._seen_end_frame = True + + # We might have received the end frame but we might still be + # aggregating (i.e. we have seen interim results but not the final + # text). + self._aggregating = self._seen_interim_results + + # Send the aggregation if we are not aggregating anymore (i.e. no + # more interim results received). 
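+            # When interim results are still pending (the "S I E T" and
+            # "S I E I T" cases in the table above) _aggregating stays True
+            # here, and the aggregation is flushed later, once the final
+            # accumulator frame arrives.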
+ send_aggregation = not self._aggregating + elif isinstance(frame, self._accumulator_frame): + if self._aggregating: + self._aggregation += f" {frame.data}" + # We have received a complete sentence, so if we have seen the + # end frame and we were still aggregating, it means we should + # send the aggregation. + send_aggregation = self._seen_end_frame + + # We just got our final result, so let's reset interim results. + self._seen_interim_results = False + elif self._interim_accumulator_frame and isinstance(frame, self._interim_accumulator_frame): + self._seen_interim_results = True + else: + await self.push_frame(frame, direction) + + if send_aggregation: + await self._push_aggregation() + + async def _push_aggregation(self): + if len(self._aggregation) > 0: + self._messages.append({"role": self._role, "content": self._aggregation}) + frame = LLMMessagesFrame(self._messages) + await self.push_frame(frame) + + # Reset + self._aggregation = "" + self._seen_start_frame = False + self._seen_end_frame = False + self._seen_interim_results = False + + +class LLMAssistantResponseAggregator(LLMResponseAggregator): + def __init__(self, messages: list[dict]): + super().__init__( + messages=messages, + role="assistant", + start_frame=LLMResponseStartFrame, + end_frame=LLMResponseEndFrame, + accumulator_frame=TextFrame + ) + + +class LLMUserResponseAggregator(LLMResponseAggregator): + def __init__(self, messages: list[dict]): + super().__init__( + messages=messages, + role="user", + start_frame=UserStartedSpeakingFrame, + end_frame=UserStoppedSpeakingFrame, + accumulator_frame=TranscriptionFrame, + interim_accumulator_frame=InterimTranscriptionFrame + ) + + +class LLMFullResponseAggregator(FrameProcessor): + """This class aggregates Text frames until it receives a + LLMResponseEndFrame, then emits the concatenated text as + a single text frame. + + Given the following frames: + + TextFrame("Hello,") + TextFrame(" world.") + TextFrame(" I am") + TextFrame(" an LLM.") + LLMResponseEndFrame() + + this processor will emit nothing for the first 4 frames, then + + TextFrame("Hello, world. I am an LLM.") + LLMResponseEndFrame() + + when passed the last frame. + + >>> async def print_frames(aggregator, frame): + ... async for frame in aggregator.process_frame(frame): + ... if isinstance(frame, TextFrame): + ... print(frame.text) + ... else: + ... print(frame.__class__.__name__) + + >>> aggregator = LLMFullResponseAggregator() + >>> asyncio.run(print_frames(aggregator, TextFrame("Hello,"))) + >>> asyncio.run(print_frames(aggregator, TextFrame(" world."))) + >>> asyncio.run(print_frames(aggregator, TextFrame(" I am"))) + >>> asyncio.run(print_frames(aggregator, TextFrame(" an LLM."))) + >>> asyncio.run(print_frames(aggregator, LLMResponseEndFrame())) + Hello, world. I am an LLM.
+ LLMResponseEndFrame + """ + + def __init__(self): + super().__init__() + self._aggregation = "" + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, TextFrame): + self._aggregation += frame.data + elif isinstance(frame, LLMResponseEndFrame): + await self.push_frame(TextFrame(self._aggregation)) + await self.push_frame(frame) + self._aggregation = "" + else: + await self.push_frame(frame, direction) diff --git a/src/dailyai/pipeline/opeanai_llm_aggregator.py b/src/pipecat/processors/aggregators/openai_llm_context.py similarity index 63% rename from src/dailyai/pipeline/opeanai_llm_aggregator.py rename to src/pipecat/processors/aggregators/openai_llm_context.py index b4b254087..4e706de18 100644 --- a/src/dailyai/pipeline/opeanai_llm_aggregator.py +++ b/src/pipecat/processors/aggregators/openai_llm_context.py @@ -1,6 +1,12 @@ -from typing import AsyncGenerator, Callable -from dailyai.pipeline.frame_processor import FrameProcessor -from dailyai.pipeline.frames import ( +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from typing import AsyncGenerator, Callable, List + +from pipecat.frames.frames import ( Frame, LLMResponseEndFrame, LLMResponseStartFrame, @@ -9,16 +15,59 @@ UserStartedSpeakingFrame, UserStoppedSpeakingFrame, ) -from dailyai.pipeline.openai_frames import OpenAILLMContextFrame -from dailyai.services.openai_llm_context import OpenAILLMContext - -try: - from openai.types.chat import ChatCompletionRole -except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use OpenAI, you need to `pip install dailyai[openai]`. Also, set `OPENAI_API_KEY` environment variable.") - raise Exception(f"Missing module: {e}") +from pipecat.frames.openai_frames import OpenAILLMContextFrame +from pipecat.processors.frame_processor import FrameProcessor + +from openai._types import NOT_GIVEN, NotGiven + +from openai.types.chat import ( + ChatCompletionRole, + ChatCompletionToolParam, + ChatCompletionToolChoiceOptionParam, + ChatCompletionMessageParam +) + + +class OpenAILLMContext: + + def __init__( + self, + messages: List[ChatCompletionMessageParam] | None = None, + tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN, + tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN + ): + self.messages: List[ChatCompletionMessageParam] = messages if messages else [ + ] + self.tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = tool_choice + self.tools: List[ChatCompletionToolParam] | NotGiven = tools + + @ staticmethod + def from_messages(messages: List[dict]) -> "OpenAILLMContext": + context = OpenAILLMContext() + for message in messages: + context.add_message({ + "content": message["content"], + "role": message["role"], + "name": message["name"] if "name" in message else message["role"] + }) + return context + + def add_message(self, message: ChatCompletionMessageParam): + self.messages.append(message) + + def get_messages(self) -> List[ChatCompletionMessageParam]: + return self.messages + + def set_tool_choice( + self, tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven + ): + self.tool_choice = tool_choice + + def set_tools(self, tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN): + if tools != NOT_GIVEN and len(tools) == 0: + tools = NOT_GIVEN + + self.tools = tools class OpenAIContextAggregator(FrameProcessor): diff --git a/src/pipecat/processors/aggregators/parallel_task.py b/src/pipecat/processors/aggregators/parallel_task.py new file mode 
100644 index 000000000..d2142f829 --- /dev/null +++ b/src/pipecat/processors/aggregators/parallel_task.py @@ -0,0 +1,104 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio + +from typing import List + +from pipecat.pipeline.pipeline import Pipeline +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.frames.frames import Frame + +from loguru import logger + + +class Source(FrameProcessor): + + def __init__(self, upstream_queue: asyncio.Queue): + super().__init__() + self._up_queue = upstream_queue + + async def process_frame(self, frame: Frame, direction: FrameDirection): + match direction: + case FrameDirection.UPSTREAM: + await self._up_queue.put(frame) + case FrameDirection.DOWNSTREAM: + await self.push_frame(frame, direction) + + +class Sink(FrameProcessor): + + def __init__(self, downstream_queue: asyncio.Queue): + super().__init__() + self._down_queue = downstream_queue + + async def process_frame(self, frame: Frame, direction: FrameDirection): + match direction: + case FrameDirection.UPSTREAM: + await self.push_frame(frame, direction) + case FrameDirection.DOWNSTREAM: + await self._down_queue.put(frame) + + +class ParallelTask(FrameProcessor): + def __init__(self, *args): + super().__init__() + + if len(args) == 0: + raise Exception(f"ParallelTask needs at least one argument") + + self._sinks = [] + self._pipelines = [] + + self._up_queue = asyncio.Queue() + self._down_queue = asyncio.Queue() + + logger.debug(f"Creating {self} pipelines") + for processors in args: + if not isinstance(processors, list): + raise TypeError(f"ParallelTask argument {processors} is not a list") + + # We add a source at the beginning of the pipeline and a sink at the end. + source = Source(self._up_queue) + sink = Sink(self._down_queue) + processors: List[FrameProcessor] = [source] + processors + processors.append(sink) + + # Keep track of sinks. We access the source through the pipeline. + self._sinks.append(sink) + + # Create pipeline + pipeline = Pipeline(processors) + self._pipelines.append(pipeline) + logger.debug(f"Finished creating {self} pipelines") + + # + # Frame processor + # + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if direction == FrameDirection.UPSTREAM: + # If we get an upstream frame we process it in each sink. + await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sinks]) + elif direction == FrameDirection.DOWNSTREAM: + # If we get a downstream frame we process it in each source (using the pipeline). 
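Illustrative sketch, not part of the diff above: OpenAILLMContext (added in openai_llm_context.py) is a plain container for messages, tools and tool choice, which BaseOpenAILLMService later reads back through OpenAILLMContextFrame. The weather tool below is made up and simply follows OpenAI's chat-completions tool schema; the OpenAILLMContextFrame constructor is assumed to wrap the context as the frame's single data field.

from pipecat.frames.openai_frames import OpenAILLMContextFrame
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext

context = OpenAILLMContext.from_messages([
    {"role": "system", "content": "You are a weather bot."},
    {"role": "user", "content": "What is the weather in Barcelona?"},
])

# set_tools() turns an empty list back into NOT_GIVEN; a non-empty list is kept.
context.set_tools([{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}])

# Assumed constructor: the LLM service unwraps this frame via frame.data and
# runs the streamed chat completion against context.get_messages() and tools.
frame = OpenAILLMContextFrame(context)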
+ await asyncio.gather(*[p.process_frame(frame, direction) for p in self._pipelines]) + + seen_ids = set() + while not self._up_queue.empty(): + frame = await self._up_queue.get() + if frame and frame.id not in seen_ids: + await self.push_frame(frame, FrameDirection.UPSTREAM) + seen_ids.add(frame.id) + self._up_queue.task_done() + + seen_ids = set() + while not self._down_queue.empty(): + frame = await self._down_queue.get() + if frame and frame.id not in seen_ids: + await self.push_frame(frame, FrameDirection.DOWNSTREAM) + seen_ids.add(frame.id) + self._down_queue.task_done() diff --git a/src/pipecat/processors/aggregators/sentence.py b/src/pipecat/processors/aggregators/sentence.py new file mode 100644 index 000000000..cf5efdb5e --- /dev/null +++ b/src/pipecat/processors/aggregators/sentence.py @@ -0,0 +1,50 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import re + +from typing import List + +from pipecat.frames.frames import EndFrame, Frame, TextFrame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class SentenceAggregator(FrameProcessor): + """This frame processor aggregates text frames into complete sentences. + + Frame input/output: + TextFrame("Hello,") -> None + TextFrame(" world.") -> TextFrame("Hello world.") + + Doctest: + >>> async def print_frames(aggregator, frame): + ... async for frame in aggregator.process_frame(frame): + ... print(frame.text) + + >>> aggregator = SentenceAggregator() + >>> asyncio.run(print_frames(aggregator, TextFrame("Hello,"))) + >>> asyncio.run(print_frames(aggregator, TextFrame(" world."))) + Hello, world. + """ + + def __init__(self): + super().__init__() + self._aggregation = "" + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, TextFrame): + m = re.search("(.*[?.!])(.*)", frame.data) + if m: + await self.push_frame(TextFrame(self._aggregation + m.group(1))) + self._aggregation = m.group(2) + else: + self._aggregation += frame.data + elif isinstance(frame, EndFrame): + if self._aggregation: + await self.push_frame(TextFrame(self._aggregation)) + await self.push_frame(frame) + else: + await self.push_frame(frame, direction) diff --git a/src/pipecat/processors/aggregators/user_response.py b/src/pipecat/processors/aggregators/user_response.py new file mode 100644 index 000000000..8804d5b8d --- /dev/null +++ b/src/pipecat/processors/aggregators/user_response.py @@ -0,0 +1,139 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.frames.frames import ( + Frame, + InterimTranscriptionFrame, + TextFrame, + TranscriptionFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame) + + +class ResponseAggregator(FrameProcessor): + """This frame processor aggregates frames between a start and an end frame + into complete text frame sentences. + + For example, frame input/output: + UserStartedSpeakingFrame() -> None + TranscriptionFrame("Hello,") -> None + TranscriptionFrame(" world.") -> None + UserStoppedSpeakingFrame() -> TextFrame("Hello world.") + + Doctest: + >>> async def print_frames(aggregator, frame): + ... async for frame in aggregator.process_frame(frame): + ... if isinstance(frame, TextFrame): + ... print(frame.text) + + >>> aggregator = ResponseAggregator(start_frame = UserStartedSpeakingFrame, + ... end_frame=UserStoppedSpeakingFrame, + ... accumulator_frame=TranscriptionFrame, + ... 
) + >>> asyncio.run(print_frames(aggregator, UserStartedSpeakingFrame())) + >>> asyncio.run(print_frames(aggregator, TranscriptionFrame("Hello,", 1, 1))) + >>> asyncio.run(print_frames(aggregator, TranscriptionFrame("world.", 1, 2))) + >>> asyncio.run(print_frames(aggregator, UserStoppedSpeakingFrame())) + Hello, world. + + """ + + def __init__( + self, + *, + start_frame, + end_frame, + accumulator_frame, + interim_accumulator_frame=None + ): + super().__init__() + + self._start_frame = start_frame + self._end_frame = end_frame + self._accumulator_frame = accumulator_frame + self._interim_accumulator_frame = interim_accumulator_frame + self._seen_start_frame = False + self._seen_end_frame = False + self._seen_interim_results = False + + self._aggregation = "" + self._aggregating = False + + # + # Frame processor + # + + # Use cases implemented: + # + # S: Start, E: End, T: Transcription, I: Interim, X: Text + # + # S E -> None + # S T E -> X + # S I T E -> X + # S I E T -> X + # S I E I T -> X + # + # The following case would not be supported: + # + # S I E T1 I T2 -> X + # + # and T2 would be dropped. + + async def process_frame(self, frame: Frame, direction: FrameDirection): + send_aggregation = False + + if isinstance(frame, self._start_frame): + self._seen_start_frame = True + self._aggregating = True + elif isinstance(frame, self._end_frame): + self._seen_end_frame = True + + # We might have received the end frame but we might still be + # aggregating (i.e. we have seen interim results but not the final + # text). + self._aggregating = self._seen_interim_results + + # Send the aggregation if we are not aggregating anymore (i.e. no + # more interim results received). + send_aggregation = not self._aggregating + elif isinstance(frame, self._accumulator_frame): + if self._aggregating: + self._aggregation += f" {frame.data}" + # We have received a complete sentence, so if we have seen the + # end frame and we were still aggregating, it means we should + # send the aggregation. + send_aggregation = self._seen_end_frame + + # We just got our final result, so let's reset interim results.
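Illustrative sketch, not part of the diff above: ParallelTask (added in parallel_task.py) fans each downstream frame out to several sub-pipelines and merges whatever comes back, de-duplicating by frame id. The two FrameLogger branches below are stand-ins for real processing branches; any lists of FrameProcessors work.

import asyncio

from pipecat.processors.aggregators.parallel_task import ParallelTask
from pipecat.processors.logger import FrameLogger


async def main():
    # Each positional argument must be a list; every list becomes an internal
    # Pipeline wrapped with a shared Source/Sink pair.
    parallel = ParallelTask(
        [FrameLogger("audio branch")],
        [FrameLogger("video branch")],
    )


asyncio.run(main())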
+ self._seen_interim_results = False + elif self._interim_accumulator_frame and isinstance(frame, self._interim_accumulator_frame): + self._seen_interim_results = True + else: + await self.push_frame(frame, direction) + + if send_aggregation: + await self._push_aggregation() + + async def _push_aggregation(self): + if len(self._aggregation) > 0: + await self.push_frame(TextFrame(self._aggregation.strip())) + + # Reset + self._aggregation = "" + self._seen_start_frame = False + self._seen_end_frame = False + self._seen_interim_results = False + + +class UserResponseAggregator(ResponseAggregator): + def __init__(self): + super().__init__( + start_frame=UserStartedSpeakingFrame, + end_frame=UserStoppedSpeakingFrame, + accumulator_frame=TranscriptionFrame, + interim_accumulator_frame=InterimTranscriptionFrame, + ) diff --git a/src/pipecat/processors/aggregators/vision_image_frame.py b/src/pipecat/processors/aggregators/vision_image_frame.py new file mode 100644 index 000000000..c5350665f --- /dev/null +++ b/src/pipecat/processors/aggregators/vision_image_frame.py @@ -0,0 +1,42 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from pipecat.frames.frames import Frame, ImageRawFrame, TextFrame, VisionImageRawFrame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class VisionImageFrameAggregator(FrameProcessor): + """This aggregator waits for a consecutive TextFrame and an + ImageFrame. After the ImageFrame arrives it will output a VisionImageFrame. + + >>> from pipecat.pipeline.frames import ImageFrame + + >>> async def print_frames(aggregator, frame): + ... async for frame in aggregator.process_frame(frame): + ... print(frame) + + >>> aggregator = VisionImageFrameAggregator() + >>> asyncio.run(print_frames(aggregator, TextFrame("What do you see?"))) + >>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0)))) + VisionImageFrame, text: What do you see?, image size: 0x0, buffer size: 0 B + + """ + + def __init__(self): + super().__init__() + self._describe_text = None + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, TextFrame): + self._describe_text = frame.text + elif isinstance(frame, ImageRawFrame): + if self._describe_text: + frame = VisionImageRawFrame( + self._describe_text, frame.image, frame.size, frame.format) + await self.push_frame(frame) + self._describe_text = None + else: + await self.push_frame(frame, direction) diff --git a/src/pipecat/processors/filter.py b/src/pipecat/processors/filter.py new file mode 100644 index 000000000..0f7026b4a --- /dev/null +++ b/src/pipecat/processors/filter.py @@ -0,0 +1,34 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from typing import List + +from pipecat.frames.frames import AppFrame, ControlFrame, Frame, SystemFrame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class Filter(FrameProcessor): + + def __init__(self, types: List[type]): + super().__init__() + self._types = types + + # + # Frame processor + # + + def _should_passthrough_frame(self, frame): + for t in self._types: + if isinstance(frame, t): + return True + + return (isinstance(frame, AppFrame) + or isinstance(frame, ControlFrame) + or isinstance(frame, SystemFrame)) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if self._should_passthrough_frame(frame): + await self.push_frame(frame, direction) diff --git 
a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py new file mode 100644 index 000000000..02ca62ea8 --- /dev/null +++ b/src/pipecat/processors/frame_processor.py @@ -0,0 +1,54 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +from asyncio import AbstractEventLoop +from enum import Enum + +from pipecat.frames.frames import Frame +from pipecat.utils.utils import obj_count, obj_id + +from loguru import logger + + +class FrameDirection(Enum): + DOWNSTREAM = 1 + UPSTREAM = 2 + + +class FrameProcessor: + + def __init__(self): + self.id: int = obj_id() + self.name = f"{self.__class__.__name__}#{obj_count(self)}" + self._prev: "FrameProcessor" | None = None + self._next: "FrameProcessor" | None = None + self._loop: AbstractEventLoop = asyncio.get_running_loop() + + async def cleanup(self): + pass + + def link(self, processor: 'FrameProcessor'): + self._next = processor + processor._prev = self + logger.debug(f"Linking {self} -> {self._next}") + + def get_event_loop(self) -> AbstractEventLoop: + return self._loop + + async def process_frame(self, frame: Frame, direction: FrameDirection): + pass + + async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): + if direction == FrameDirection.DOWNSTREAM and self._next: + logger.trace(f"Pushing {frame} from {self} to {self._next}") + await self._next.process_frame(frame, direction) + elif direction == FrameDirection.UPSTREAM and self._prev: + logger.trace(f"Pushing {frame} upstream from {self} to {self._prev}") + await self._prev.process_frame(frame, direction) + + def __str__(self): + return self.name diff --git a/src/pipecat/processors/logger.py b/src/pipecat/processors/logger.py new file mode 100644 index 000000000..c8b2f10dc --- /dev/null +++ b/src/pipecat/processors/logger.py @@ -0,0 +1,22 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from pipecat.frames.frames import Frame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class FrameLogger(FrameProcessor): + def __init__(self, prefix="Frame"): + super().__init__() + self._prefix = prefix + + async def process_frame(self, frame: Frame, direction: FrameDirection): + match direction: + case FrameDirection.UPSTREAM: + print(f"< {self._prefix}: {frame}") + case FrameDirection.DOWNSTREAM: + print(f"> {self._prefix}: {frame}") + await self.push_frame(frame, direction) diff --git a/src/pipecat/processors/text_transformer.py b/src/pipecat/processors/text_transformer.py new file mode 100644 index 000000000..a71bb2f16 --- /dev/null +++ b/src/pipecat/processors/text_transformer.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from typing import Coroutine + +from pipecat.frames.frames import Frame, TextFrame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class StatelessTextTransformer(FrameProcessor): + """This processor calls the given function on any text in a text frame. + + >>> async def print_frames(aggregator, frame): + ... async for frame in aggregator.process_frame(frame): + ... 
print(frame.text) + + >>> aggregator = StatelessTextTransformer(lambda x: x.upper()) + >>> asyncio.run(print_frames(aggregator, TextFrame("Hello"))) + HELLO + """ + + def __init__(self, transform_fn): + super().__init__() + self._transform_fn = transform_fn + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, TextFrame): + result = self._transform_fn(frame.data) + if isinstance(result, Coroutine): + result = await result + await self.push_frame(result) + else: + await self.push_frame(frame, direction) diff --git a/src/pipecat/processors/utils/audio.py b/src/pipecat/processors/utils/audio.py new file mode 100644 index 000000000..0e657efd1 --- /dev/null +++ b/src/pipecat/processors/utils/audio.py @@ -0,0 +1,21 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from typing import List + +from pipecat.frames.frames import AudioRawFrame + + +def maybe_split_audio_frame(frame: AudioRawFrame, largest_write_size: int) -> List[AudioRawFrame]: + """Subdivide large audio frames to enable interruption.""" + frames: List[AudioRawFrame] = [] + if len(frame.data) > largest_write_size: + for i in range(0, len(frame.data), largest_write_size): + chunk = frame.data[i: i + largest_write_size] + frames.append(AudioRawFrame(chunk, frame.sample_rate, frame.num_channels)) + else: + frames.append(frame) + return frames diff --git a/src/dailyai/serializers/abstract_frame_serializer.py b/src/pipecat/serializers/abstract_frame_serializer.py similarity index 87% rename from src/dailyai/serializers/abstract_frame_serializer.py rename to src/pipecat/serializers/abstract_frame_serializer.py index cf0831bcb..8b33a6b8b 100644 --- a/src/dailyai/serializers/abstract_frame_serializer.py +++ b/src/pipecat/serializers/abstract_frame_serializer.py @@ -1,6 +1,6 @@ from abc import abstractmethod -from dailyai.pipeline.frames import Frame +from pipecat.pipeline.frames import Frame class FrameSerializer: diff --git a/src/dailyai/serializers/protobuf_serializer.py b/src/pipecat/serializers/protobuf_serializer.py similarity index 93% rename from src/dailyai/serializers/protobuf_serializer.py rename to src/pipecat/serializers/protobuf_serializer.py index 594d97a9b..04b348b86 100644 --- a/src/dailyai/serializers/protobuf_serializer.py +++ b/src/pipecat/serializers/protobuf_serializer.py @@ -1,8 +1,8 @@ import dataclasses from typing import Text -from dailyai.pipeline.frames import AudioFrame, Frame, TextFrame, TranscriptionFrame -import dailyai.pipeline.protobufs.frames_pb2 as frame_protos -from dailyai.serializers.abstract_frame_serializer import FrameSerializer +from pipecat.pipeline.frames import AudioFrame, Frame, TextFrame, TranscriptionFrame +import pipecat.pipeline.protobufs.frames_pb2 as frame_protos +from pipecat.serializers.abstract_frame_serializer import FrameSerializer class ProtobufFrameSerializer(FrameSerializer): diff --git a/src/pipecat/services/__init__.py b/src/pipecat/services/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py new file mode 100644 index 000000000..f9946ced9 --- /dev/null +++ b/src/pipecat/services/ai_services.py @@ -0,0 +1,169 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import array +import io +import math +import wave + +from abc import abstractmethod +from typing import BinaryIO + +from pipecat.frames.frames import ( + AudioRawFrame, + EndFrame, + Frame, + TextFrame, + 
VisionImageRawFrame, +) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class AIService(FrameProcessor): + def __init__(self): + super().__init__() + + +class LLMService(AIService): + """This class is a no-op but serves as a base class for LLM services.""" + + def __init__(self): + super().__init__() + + +class TTSService(AIService): + def __init__(self, aggregate_sentences: bool = True): + super().__init__() + self._aggregate_sentences: bool = aggregate_sentences + self._current_sentence: str = "" + + # Converts the text to audio. + @abstractmethod + async def run_tts(self, text: str): + pass + + async def say(self, text: str): + await self.process_frame(TextFrame(text), FrameDirection.DOWNSTREAM) + + async def _process_text_frame(self, frame: TextFrame): + text: str | None = None + if not self._aggregate_sentences: + text = frame.data + else: + self._current_sentence += frame.data + if self._current_sentence.strip().endswith((".", "?", "!")): + text = self._current_sentence + self._current_sentence = "" + + if text: + await self.run_tts(text) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, TextFrame): + await self._process_text_frame(frame) + elif isinstance(frame, EndFrame): + if self._current_sentence: + await self.run_tts(self._current_sentence) + await self.push_frame(frame) + else: + await self.push_frame(frame, direction) + + +class STTService(AIService): + """STTService is a base class for speech-to-text services.""" + + def __init__(self, + min_rms: int = 400, + max_silence_frames: int = 3, + sample_rate: int = 16000): + super().__init__() + self._min_rms = min_rms + self._max_silence_frames = max_silence_frames + self._sample_rate = sample_rate + self._current_silence_frames = 0 + (self._content, self._wave) = self._new_wave() + + @abstractmethod + async def run_stt(self, audio: BinaryIO): + """Returns transcript as a string""" + pass + + def _new_wave(self): + content = io.BufferedRandom(io.BytesIO()) + ww = wave.open(content, "wb") + ww.setnchannels(1) + ww.setsampwidth(2) + ww.setframerate(self._sample_rate) + return (content, ww) + + def _get_volume(self, audio: bytes) -> float: + # https://docs.python.org/3/library/array.html + audio_array = array.array('h', audio) + squares = [sample**2 for sample in audio_array] + mean = sum(squares) / len(audio_array) + rms = math.sqrt(mean) + return rms + + async def process_frame(self, frame: Frame, direction: FrameDirection): + """Processes a frame of audio data, either buffering or transcribing it.""" + if not isinstance(frame, AudioRawFrame): + await self.push_frame(frame, direction) + return + + data = frame.data + + # Try to filter out empty background noise + # (Very rudimentary approach, can be improved) + rms = self._get_volume(data) + if rms >= self._min_rms: + # If volume is high enough, write new data to wave file + self._wave.writeframesraw(data) + + # If buffer is not empty and we detect a 3-frame pause in speech, + # transcribe the audio gathered so far. + if self._content.tell() > 0 and self._current_silence_frames > self._max_silence_frames: + self._current_silence_frames = 0 + self._wave.close() + self._content.seek(0) + await self.run_stt(self._content) + (self._content, self._wave) = self._new_wave() + # If we get this far, this is a frame of silence + self._current_silence_frames += 1 + + +class ImageGenService(AIService): + + def __init__(self): + super().__init__() + + # Renders the image. Returns an Image object. 
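Illustrative sketch, not part of the diff above: TTSService aggregates text into sentences (unless aggregate_sentences is False) and hands each sentence to run_tts(), which is expected to push AudioRawFrames downstream. SilenceTTSService below is invented purely to show the contract; it emits silence instead of real speech.

import asyncio

from pipecat.frames.frames import AudioRawFrame
from pipecat.services.ai_services import TTSService


class SilenceTTSService(TTSService):
    """Emits 10ms of 16-bit mono silence per character, for illustration only."""

    def __init__(self, sample_rate: int = 16000):
        super().__init__()
        self._sample_rate = sample_rate

    async def run_tts(self, text: str):
        samples_per_char = self._sample_rate // 100  # 10ms of audio per character
        for _ in text:
            chunk = b"\x00\x00" * samples_per_char
            await self.push_frame(AudioRawFrame(chunk, self._sample_rate, 1))


async def main():
    tts = SilenceTTSService()
    # say() wraps the text in a TextFrame and runs it through process_frame().
    await tts.say("Hello there.")


asyncio.run(main())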
+ @abstractmethod + async def run_image_gen(self, prompt: str): + pass + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, TextFrame): + await self.run_image_gen(frame.data) + else: + await self.push_frame(frame, direction) + + +class VisionService(AIService): + """VisionService is a base class for vision services.""" + + def __init__(self): + super().__init__() + self._describe_text = None + + @abstractmethod + async def run_vision(self, frame: VisionImageRawFrame): + pass + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, VisionImageRawFrame): + await self.run_vision(frame) + else: + await self.push_frame(frame, direction) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py new file mode 100644 index 000000000..8632fdaf1 --- /dev/null +++ b/src/pipecat/services/anthropic.py @@ -0,0 +1,51 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from pipecat.frames.frames import Frame, LLMMessagesFrame, TextFrame +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import LLMService + +from loguru import logger + +try: + from anthropic import AsyncAnthropic +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error( + "In order to use Anthropic, you need to `pip install pipecat[anthropic]`. Also, set `ANTHROPIC_API_KEY` environment variable.") + raise Exception(f"Missing module: {e}") + + +class AnthropicLLMService(LLMService): + + def __init__( + self, + api_key, + model="claude-3-opus-20240229", + max_tokens=1024): + super().__init__() + self.client = AsyncAnthropic(api_key=api_key) + self.model = model + self.max_tokens = max_tokens + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, LLMMessagesFrame): + stream = await self.client.messages.create( + max_tokens=self.max_tokens, + messages=[ + { + "role": "user", + "content": "Hello, Claude", + } + ], + model=self.model, + stream=True, + ) + async for event in stream: + if event.type == "content_block_delta": + await self.push_frame(TextFrame(event.delta.text)) + else: + await self.push_frame(frame, direction) diff --git a/src/dailyai/services/azure_ai_services.py b/src/pipecat/services/azure.py similarity index 93% rename from src/dailyai/services/azure_ai_services.py rename to src/pipecat/services/azure.py index 151e78d62..d56058821 100644 --- a/src/dailyai/services/azure_ai_services.py +++ b/src/pipecat/services/azure.py @@ -5,9 +5,11 @@ from collections.abc import AsyncGenerator -from dailyai.services.ai_services import TTSService, ImageGenService +from pipecat.services.ai_services import TTSService, ImageGenService from PIL import Image +from loguru import logger + # See .env.example for Azure configuration needed try: from azure.cognitiveservices.speech import ( @@ -17,12 +19,12 @@ CancellationReason, ) except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.") + logger.error(f"Exception: {e}") + logger.error( + "In order to use Azure TTS, you need to `pip install pipecat[azure]`. 
Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.") raise Exception(f"Missing module: {e}") -from dailyai.services.openai_api_llm_service import BaseOpenAILLMService +from pipecat.services.openai_api_llm_service import BaseOpenAILLMService class AzureTTSService(TTSService): diff --git a/src/dailyai/services/deepgram_ai_services.py b/src/pipecat/services/deepgram.py similarity index 61% rename from src/dailyai/services/deepgram_ai_services.py rename to src/pipecat/services/deepgram.py index c6aaa55d1..42de548a1 100644 --- a/src/dailyai/services/deepgram_ai_services.py +++ b/src/pipecat/services/deepgram.py @@ -1,8 +1,17 @@ -from collections.abc import AsyncGenerator -from dailyai.services.ai_services import TTSService +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from pipecat.frames.frames import AudioRawFrame +from pipecat.services.ai_services import TTSService + +from loguru import logger class DeepgramTTSService(TTSService): + def __init__( self, *, @@ -15,15 +24,13 @@ def __init__( self._api_key = api_key self._aiohttp_session = aiohttp_session - def get_mic_sample_rate(self): - return 24000 - - async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]: - self.logger.info(f"Running deepgram tts for {sentence}") + async def run_tts(self, text: str): + logger.info(f"Running Deepgram TTS for {text}") base_url = "https://api.beta.deepgram.com/v1/speak" request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000" headers = {"authorization": f"token {self._api_key}"} - body = {"text": sentence} + body = {"text": text} async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r: async for data in r.content: - yield data + frame = AudioRawFrame(data, 16000, 1) + await self.push_frame(frame) diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py new file mode 100644 index 000000000..15cdb66c9 --- /dev/null +++ b/src/pipecat/services/elevenlabs.py @@ -0,0 +1,58 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import aiohttp + +from pipecat.frames.frames import AudioRawFrame, TTSStartedFrame, TTSStoppedFrame +from pipecat.services.ai_services import TTSService + +from loguru import logger + + +class ElevenLabsTTSService(TTSService): + + def __init__( + self, + *, + aiohttp_session: aiohttp.ClientSession, + api_key: str, + voice_id: str, + model: str = "eleven_turbo_v2", + ): + super().__init__() + + self._api_key = api_key + self._voice_id = voice_id + self._aiohttp_session = aiohttp_session + self._model = model + + async def run_tts(self, text: str): + logger.debug(f"Transcribing text: {text}") + + url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream" + + payload = {"text": text, "model_id": self._model} + + querystring = { + "output_format": "pcm_16000", + "optimize_streaming_latency": 2} + + headers = { + "xi-api-key": self._api_key, + "Content-Type": "application/json", + } + + async with self._aiohttp_session.post(url, json=payload, headers=headers, params=querystring) as r: + if r.status != 200: + logger.error(f"Audio fetch status code: {r.status}, error: {r.text}") + return + + await self.push_frame(TTSStartedFrame()) + async for chunk in r.content: + if len(chunk) > 0: + frame = AudioRawFrame(chunk, 16000, 1) + await self.push_frame(frame) + await self.push_frame(TTSStoppedFrame()) diff --git a/src/dailyai/services/fal_ai_services.py 
b/src/pipecat/services/fal.py similarity index 55% rename from src/dailyai/services/fal_ai_services.py rename to src/pipecat/services/fal.py index a924607d2..1049b4428 100644 --- a/src/dailyai/services/fal_ai_services.py +++ b/src/pipecat/services/fal.py @@ -1,27 +1,36 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import aiohttp -import asyncio import io import os + from PIL import Image +from numpy import result_type from pydantic import BaseModel from typing import Optional, Union, Dict +from pipecat.frames.frames import URLImageRawFrame +from pipecat.services.ai_services import ImageGenService -from dailyai.services.ai_services import ImageGenService +from loguru import logger try: import fal_client except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use Fal, you need to `pip install dailyai[fal]`. Also, set `FAL_KEY` environment variable.") + logger.error(f"Exception: {e}") + logger.error( + "In order to use Fal, you need to `pip install pipecat[fal]`. Also, set `FAL_KEY` environment variable.") raise Exception(f"Missing module: {e}") class FalImageGenService(ImageGenService): class InputParams(BaseModel): seed: Optional[int] = None - num_inference_steps: int = 4 + num_inference_steps: int = 8 num_images: int = 1 image_size: Union[str, Dict[str, int]] = "square_hd" expand_prompt: bool = False @@ -33,8 +42,8 @@ def __init__( *, aiohttp_session: aiohttp.ClientSession, params: InputParams, - model="fal-ai/fast-sdxl", - key=None, + model: str = "fal-ai/fast-sdxl", + key: str | None = None, ): super().__init__() self._model = model @@ -43,19 +52,28 @@ def __init__( if key: os.environ["FAL_KEY"] = key - async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]: + async def run_image_gen(self, prompt: str): + logger.debug(f"Generating image from prompt: {prompt}") + response = await fal_client.run_async( self._model, - arguments={"prompt": prompt, **self._params.dict()} + arguments={"prompt": prompt, **self._params.model_dump()} ) image_url = response["images"][0]["url"] if response else None if not image_url: - raise Exception("Image generation failed") + logger.error("Image generation failed") + return + + logger.debug(f"Image generated at: {image_url}") # Load the image from the url + logger.debug(f"Downloading image {image_url} ...") async with self._aiohttp_session.get(image_url) as response: + logger.debug(f"Downloaded image {image_url}") image_stream = io.BytesIO(await response.content.read()) image = Image.open(image_stream) - return (image_url, image.tobytes(), image.size) + + frame = URLImageRawFrame(image_url, image.tobytes(), image.size, image.format) + await self.push_frame(frame) diff --git a/src/pipecat/services/fireworks.py b/src/pipecat/services/fireworks.py new file mode 100644 index 000000000..402384d0d --- /dev/null +++ b/src/pipecat/services/fireworks.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from pipecat.services.openai import BaseOpenAILLMService + +from loguru import logger + +try: + from openai import AsyncOpenAI +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error( + "In order to use Fireworks, you need to `pip install pipecat[fireworks]`. 
Also, set the `FIREWORKS_API_KEY` environment variable.") + raise Exception(f"Missing module: {e}") + + +class FireworksLLMService(BaseOpenAILLMService): + def __init__(self, + model="accounts/fireworks/models/firefunction-v1", + base_url="https://api.fireworks.ai/inference/v1"): + super().__init__(model, base_url) diff --git a/src/dailyai/services/moondream_ai_service.py b/src/pipecat/services/moondream.py similarity index 56% rename from src/dailyai/services/moondream_ai_service.py rename to src/pipecat/services/moondream.py index 704d4c51b..f74ba828b 100644 --- a/src/dailyai/services/moondream_ai_service.py +++ b/src/pipecat/services/moondream.py @@ -1,13 +1,26 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio -from dailyai.pipeline.frames import ImageFrame, VisionImageFrame -from dailyai.services.ai_services import VisionService +from pipecat.frames.frames import TextFrame, VisionImageRawFrame +from pipecat.services.ai_services import VisionService from PIL import Image -from transformers import AutoModelForCausalLM, AutoTokenizer +from loguru import logger + +try: + import torch -import torch + from transformers import AutoModelForCausalLM, AutoTokenizer +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error("In order to use Moondream, you need to `pip install pipecat[moondream]`.") + raise Exception(f"Missing module(s): {e}") def detect_device(): @@ -39,14 +52,24 @@ def __init__( self._tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision) + logger.debug("Loading Moondream model...") + self._model = AutoModelForCausalLM.from_pretrained( model_id, trust_remote_code=True, revision=revision ).to(device=device, dtype=dtype) self._model.eval() - async def run_vision(self, frame: VisionImageFrame) -> str: - def get_image_description(frame: VisionImageFrame): - image = Image.frombytes("RGB", (frame.size[0], frame.size[1]), frame.image) + logger.debug("Loaded Moondream model") + + async def run_vision(self, frame: VisionImageRawFrame): + if not self._model: + logger.error("Moondream model not available") + return + + logger.debug(f"Analyzing image: {frame}") + + def get_image_description(frame: VisionImageRawFrame): + image = Image.frombytes(frame.format, (frame.size[0], frame.size[1]), frame.data) image_embeds = self._model.encode_image(image) description = self._model.answer_question( image_embeds=image_embeds, @@ -56,4 +79,4 @@ def get_image_description(frame: VisionImageFrame): description = await asyncio.to_thread(get_image_description, frame) - return description + await self.push_frame(TextFrame(description)) diff --git a/src/dailyai/services/ollama_ai_services.py b/src/pipecat/services/ollama.py similarity index 59% rename from src/dailyai/services/ollama_ai_services.py rename to src/pipecat/services/ollama.py index adb69c7d6..781d00260 100644 --- a/src/dailyai/services/ollama_ai_services.py +++ b/src/pipecat/services/ollama.py @@ -1,4 +1,10 @@ -from dailyai.services.openai_api_llm_service import BaseOpenAILLMService +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from pipecat.services.openai import BaseOpenAILLMService class OLLamaLLMService(BaseOpenAILLMService): diff --git a/src/dailyai/services/openai_api_llm_service.py b/src/pipecat/services/openai.py similarity index 58% rename from src/dailyai/services/openai_api_llm_service.py rename to src/pipecat/services/openai.py index 2ddfc7796..b15d7950b 100644 --- 
a/src/dailyai/services/openai_api_llm_service.py +++ b/src/pipecat/services/openai.py @@ -1,18 +1,25 @@ +import io import json import time -from typing import AsyncGenerator, List -from dailyai.pipeline.frames import ( +import aiohttp +from PIL import Image + +from typing import List, Literal + +from pipecat.frames.frames import ( Frame, - LLMFunctionCallFrame, - LLMFunctionStartFrame, LLMMessagesFrame, LLMResponseEndFrame, LLMResponseStartFrame, TextFrame, + URLImageRawFrame ) -from dailyai.services.ai_services import LLMService -from dailyai.pipeline.openai_frames import OpenAILLMContextFrame -from dailyai.services.openai_llm_context import OpenAILLMContext +from pipecat.frames.openai_frames import OpenAILLMContextFrame +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import LLMService, ImageGenService + +from loguru import logger try: from openai import AsyncOpenAI, AsyncStream @@ -23,9 +30,9 @@ ChatCompletionMessageParam, ) except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use OpenAI, you need to `pip install dailyai[openai]`. Also, set `OPENAI_API_KEY` environment variable.") + logger.error(f"Exception: {e}") + logger.error( + "In order to use OpenAI, you need to `pip install pipecat[openai]`. Also, set `OPENAI_API_KEY` environment variable.") raise Exception(f"Missing module: {e}") @@ -52,7 +59,7 @@ async def _stream_chat_completions( ) -> AsyncStream[ChatCompletionChunk]: messages: List[ChatCompletionMessageParam] = context.get_messages() messages_for_log = json.dumps(messages) - self.logger.debug(f"Generating chat via openai: {messages_for_log}") + logger.debug(f"Generating chat: {messages_for_log}") start_time = time.time() chunks: AsyncStream[ChatCompletionChunk] = ( @@ -64,12 +71,15 @@ async def _stream_chat_completions( tool_choice=context.tool_choice, ) ) - self.logger.info(f"=== OpenAI LLM TTFB: {time.time() - start_time}") + + logger.debug(f"OpenAI LLM TTFB: {time.time() - start_time}") + return chunks async def _chat_completions(self, messages) -> str | None: messages_for_log = json.dumps(messages) - self.logger.debug(f"Generating chat via openai: {messages_for_log}") + + logger.debug(f"Generating chat: {messages_for_log}") response: ChatCompletion = await self._client.chat.completions.create( model=self._model, stream=False, messages=messages @@ -79,22 +89,16 @@ async def _chat_completions(self, messages) -> str | None: else: return None - async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, OpenAILLMContextFrame): - context: OpenAILLMContext = frame.context - elif isinstance(frame, LLMMessagesFrame): - context = OpenAILLMContext.from_messages(frame.messages) - else: - yield frame - return - + async def _process_context(self, context: OpenAILLMContext): function_name = "" arguments = "" - yield LLMResponseStartFrame() + await self.push_frame(LLMResponseStartFrame()) + chunk_stream: AsyncStream[ChatCompletionChunk] = ( await self._stream_chat_completions(context) ) + async for chunk in chunk_stream: if len(chunk.choices) == 0: continue @@ -114,18 +118,75 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: tool_call = chunk.choices[0].delta.tool_calls[0] if tool_call.function and tool_call.function.name: function_name += tool_call.function.name - yield LLMFunctionStartFrame(function_name=tool_call.function.name) + # yield 
LLMFunctionStartFrame(function_name=tool_call.function.name) if tool_call.function and tool_call.function.arguments: # Keep iterating through the response to collect all the argument fragments and # yield a complete LLMFunctionCallFrame after run_llm_async # completes arguments += tool_call.function.arguments elif chunk.choices[0].delta.content: - yield TextFrame(chunk.choices[0].delta.content) + await self.push_frame(TextFrame(chunk.choices[0].delta.content)) # if we got a function name and arguments, yield the frame with all the info so # frame consumers can take action based on the function call. - if function_name and arguments: - yield LLMFunctionCallFrame(function_name=function_name, arguments=arguments) + # if function_name and arguments: + # yield LLMFunctionCallFrame(function_name=function_name, arguments=arguments) + + await self.push_frame(LLMResponseEndFrame()) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + context = None + if isinstance(frame, OpenAILLMContextFrame): + context: OpenAILLMContext = frame.data + elif isinstance(frame, LLMMessagesFrame): + context = OpenAILLMContext.from_messages(frame.data) + else: + await self.push_frame(frame, direction) + + if context: + await self._process_context(context) + + +class OpenAILLMService(BaseOpenAILLMService): + + def __init__(self, model="gpt-4", **kwargs): + super().__init__(model, **kwargs) + + +class OpenAIImageGenService(ImageGenService): + + def __init__( + self, + *, + image_size: Literal["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"], + aiohttp_session: aiohttp.ClientSession, + api_key: str, + model: str = "dall-e-3", + ): + super().__init__() + self._model = model + self._image_size = image_size + self._client = AsyncOpenAI(api_key=api_key) + self._aiohttp_session = aiohttp_session + + async def run_image_gen(self, prompt: str): + logger.debug(f"Generating image from prompt: {prompt}") + + image = await self._client.images.generate( + prompt=prompt, + model=self._model, + n=1, + size=self._image_size + ) + + image_url = image.data[0].url + + if not image_url: + logger.error(f"no image provided in response: {image}") - yield LLMResponseEndFrame() + # Load the image from the url + async with self._aiohttp_session.get(image_url) as response: + image_stream = io.BytesIO(await response.content.read()) + image = Image.open(image_stream) + frame = URLImageRawFrame(image_url, image.tobytes(), image.size, image.format) + await self.push_frame(frame) diff --git a/src/dailyai/services/playht_ai_service.py b/src/pipecat/services/playht.py similarity index 56% rename from src/dailyai/services/playht_ai_service.py rename to src/pipecat/services/playht.py index 291855264..69c7bac9d 100644 --- a/src/dailyai/services/playht_ai_service.py +++ b/src/pipecat/services/playht.py @@ -1,50 +1,53 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import io import struct -from dailyai.services.ai_services import TTSService +from pipecat.frames.frames import AudioRawFrame +from pipecat.services.ai_services import TTSService + +from loguru import logger try: from pyht import Client from pyht.client import TTSOptions from pyht.protos.api_pb2 import Format except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use PlayHT, you need to `pip install dailyai[playht]`. 
Also, set `PLAY_HT_USER_ID` and `PLAY_HT_API_KEY` environment variables.") + logger.error(f"Exception: {e}") + logger.error( + "In order to use PlayHT, you need to `pip install pipecat[playht]`. Also, set `PLAY_HT_USER_ID` and `PLAY_HT_API_KEY` environment variables.") raise Exception(f"Missing module: {e}") class PlayHTAIService(TTSService): - def __init__( - self, - *, - api_key, - user_id, - voice_url - ): + def __init__(self, *, api_key, user_id, voice_url): super().__init__() - self.speech_key = api_key - self.user_id = user_id + self._user_id = user_id + self._speech_key = api_key - self.client = Client( - user_id=self.user_id, - api_key=self.speech_key, + self._client = Client( + user_id=self._user_id, + api_key=self._speech_key, ) - self.options = TTSOptions( + self._options = TTSOptions( voice=voice_url, sample_rate=16000, quality="higher", format=Format.FORMAT_WAV) def __del__(self): - self.client.close() + self._client.close() - async def run_tts(self, sentence): + async def run_tts(self, text: str): b = bytearray() in_header = True - for chunk in self.client.tts(sentence, self.options): + for chunk in self._client.tts(text, self._options): # skip the RIFF header. if in_header: b.extend(chunk) @@ -54,15 +57,16 @@ async def run_tts(self, sentence): fh = io.BytesIO(b) fh.seek(36) (data, size) = struct.unpack('<4sI', fh.read(8)) - self.logger.info( + logger.debug( f"first attempt: data: {data}, size: {hex(size)}, position: {fh.tell()}") while data != b'data': fh.read(size) (data, size) = struct.unpack('<4sI', fh.read(8)) - self.logger.info( + logger.debug( f"subsequent data: {data}, size: {hex(size)}, position: {fh.tell()}, data != data: {data != b'data'}") - self.logger.info("position: ", fh.tell()) + logger.debug("position: ", fh.tell()) in_header = False else: if len(chunk): - yield chunk + frame = AudioRawFrame(chunk, 16000, 1) + await self.push_frame(frame) diff --git a/src/pipecat/services/to_be_updated/__init__.py b/src/pipecat/services/to_be_updated/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/dailyai/services/to_be_updated/cloudflare_ai_service.py b/src/pipecat/services/to_be_updated/cloudflare_ai_service.py similarity index 100% rename from src/dailyai/services/to_be_updated/cloudflare_ai_service.py rename to src/pipecat/services/to_be_updated/cloudflare_ai_service.py diff --git a/src/dailyai/services/to_be_updated/google_ai_service.py b/src/pipecat/services/to_be_updated/google_ai_service.py similarity index 100% rename from src/dailyai/services/to_be_updated/google_ai_service.py rename to src/pipecat/services/to_be_updated/google_ai_service.py diff --git a/src/dailyai/services/to_be_updated/huggingface_ai_service.py b/src/pipecat/services/to_be_updated/huggingface_ai_service.py similarity index 100% rename from src/dailyai/services/to_be_updated/huggingface_ai_service.py rename to src/pipecat/services/to_be_updated/huggingface_ai_service.py diff --git a/src/dailyai/services/to_be_updated/mock_ai_service.py b/src/pipecat/services/to_be_updated/mock_ai_service.py similarity index 100% rename from src/dailyai/services/to_be_updated/mock_ai_service.py rename to src/pipecat/services/to_be_updated/mock_ai_service.py diff --git a/src/dailyai/services/whisper_ai_services.py b/src/pipecat/services/whisper.py similarity index 59% rename from src/dailyai/services/whisper_ai_services.py rename to src/pipecat/services/whisper.py index ddddb4f98..768e689c8 100644 --- a/src/dailyai/services/whisper_ai_services.py +++ b/src/pipecat/services/whisper.py @@ 
-1,17 +1,28 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + """This module implements Whisper transcription with a locally-downloaded model.""" + import asyncio +import time + from enum import Enum -import logging from typing import BinaryIO -from dailyai.services.local_stt_service import LocalSTTService +from pipecat.frames.frames import TranscriptionFrame +from pipecat.services.ai_services import STTService + +from loguru import logger try: from faster_whisper import WhisperModel except ModuleNotFoundError as e: - print(f"Exception: {e}") - print( - "In order to use Whisper, you need to `pip install dailyai[whisper]`.") + logger.error(f"Exception: {e}") + logger.error( + "In order to use Whisper, you need to `pip install pipecat[whisper]`.") raise Exception(f"Missing module: {e}") @@ -25,39 +36,40 @@ class Model(Enum): DISTIL_MEDIUM_EN = "Systran/faster-distil-whisper-medium.en" -class WhisperSTTService(LocalSTTService): +class WhisperSTTService(STTService): """Class to transcribe audio with a locally-downloaded Whisper model""" - _model: WhisperModel - - # Model configuration - _model_name: Model - _device: str - _compute_type: str def __init__(self, model_name: Model = Model.DISTIL_MEDIUM_EN, device: str = "auto", compute_type: str = "default"): super().__init__() - self.logger: logging.Logger = logging.getLogger("dailyai") - self._model_name = model_name - self._device = device + self._device: str = device self._compute_type = compute_type + self._model_name: Model = model_name + self._model: WhisperModel | None = None self._load() def _load(self): """Loads the Whisper model. Note that if this is the first time this model is being run, it will take time to download.""" + logger.debug("Loading Whisper model...") model = WhisperModel( self._model_name.value, device=self._device, compute_type=self._compute_type) self._model = model + logger.debug("Loaded Whisper model") - async def run_stt(self, audio: BinaryIO) -> str: + async def run_stt(self, audio: BinaryIO): """Transcribes given audio using Whisper""" + if not self._model: + logger.error("Whisper model not available") + return + segments, _ = await asyncio.to_thread(self._model.transcribe, audio) - res: str = "" + text: str = "" for segment in segments: - res += f"{segment.text} " - return res + text += f"{segment.text} " + + await self.push_frame(TranscriptionFrame(text, "", int(time.time_ns() / 1000000))) diff --git a/src/pipecat/storage/__init__.py b/src/pipecat/storage/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/dailyai/storage/search.py b/src/pipecat/storage/search.py similarity index 100% rename from src/dailyai/storage/search.py rename to src/pipecat/storage/search.py diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py new file mode 100644 index 000000000..0ae2e7508 --- /dev/null +++ b/src/pipecat/transports/base_input.py @@ -0,0 +1,138 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import queue +import threading + +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.frames.frames import ( + AudioRawFrame, + CancelFrame, + StartFrame, + EndFrame, + Frame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame) +from pipecat.transports.base_transport import TransportParams +from pipecat.vad.vad_analyzer import VADState + +from loguru import logger + + +class BaseInputTransport(FrameProcessor): + + def 
__init__(self, params: TransportParams): + super().__init__() + + self._params = params + + self._running = True + + # Start media threads. + if self._params.audio_in_enabled: + self._audio_in_queue = queue.Queue() + self._audio_in_thread = threading.Thread(target=self._audio_in_thread_handler) + self._audio_out_thread = threading.Thread(target=self._audio_out_thread_handler) + + self._stopped_event = asyncio.Event() + + async def start(self): + if self._params.audio_in_enabled: + self._audio_in_thread.start() + self._audio_out_thread.start() + + async def stop(self): + # This will exit all threads. + self._running = False + + self._stopped_event.set() + + def vad_analyze(self, audio_frames: bytes) -> VADState: + pass + + def read_raw_audio_frames(self, frame_count: int) -> bytes: + pass + + # + # Frame processor + # + + async def cleanup(self): + if self._params.audio_in_enabled: + self._audio_in_thread.join() + self._audio_out_thread.join() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, StartFrame): + await self.push_frame(frame, direction) + await self.start() + elif isinstance(frame, CancelFrame) or isinstance(frame, EndFrame): + await self.push_frame(frame, direction) + await self.stop() + else: + await self.push_frame(frame, direction) + + # If we are finishing, wait here until we have stopped, otherwise we + # might close things too early upstream. + if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame): + await self._stopped_event.wait() + + # + # Audio input + # + + def _handle_vad(self, audio_frames: bytes, vad_state: VADState): + new_vad_state = self.vad_analyze(audio_frames) + if new_vad_state != vad_state and new_vad_state != VADState.STARTING and new_vad_state != VADState.STOPPING: + frame = None + if new_vad_state == VADState.SPEAKING: + frame = UserStartedSpeakingFrame() + elif new_vad_state == VADState.QUIET: + frame = UserStoppedSpeakingFrame() + if frame: + future = asyncio.run_coroutine_threadsafe( + self.push_frame(frame), self.get_event_loop()) + future.result() + vad_state = new_vad_state + return vad_state + + def _audio_in_thread_handler(self): + sample_rate = self._params.audio_in_sample_rate + num_channels = self._params.audio_in_channels + num_frames = int(sample_rate / 100) # 10ms of audio + while self._running: + try: + audio_frames = self.read_raw_audio_frames(num_frames) + if len(audio_frames) > 0: + frame = AudioRawFrame(audio_frames, sample_rate, num_channels) + self._audio_in_queue.put(frame) + except BaseException as e: + logger.error(f"Error reading audio frames: {e}") + + def _audio_out_thread_handler(self): + vad_state: VADState = VADState.QUIET + while self._running: + try: + frame = self._audio_in_queue.get(timeout=1) + + audio_passthrough = True + + # Check VAD and push event if necessary. We just care about changes + # from QUIET to SPEAKING and vice versa. + if self._params.vad_enabled: + vad_state = self._handle_vad(frame.data, vad_state) + audio_passthrough = self._params.vad_audio_passthrough + + # Push audio downstream if passthrough. 
+ if audio_passthrough: + future = asyncio.run_coroutine_threadsafe( + self.push_frame(frame), self.get_event_loop()) + future.result() + except queue.Empty: + pass + except BaseException as e: + logger.error(f"Error pushing audio frames: {e}") diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py new file mode 100644 index 000000000..f104e36af --- /dev/null +++ b/src/pipecat/transports/base_output.py @@ -0,0 +1,186 @@ +# +# Copyright (c) 2024, Daily + +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import itertools +import queue +import threading +import time + +from typing import List + +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.frames.frames import ( + AudioRawFrame, + CancelFrame, + SpriteFrame, + StartFrame, + EndFrame, + Frame, + ImageRawFrame) +from pipecat.transports.base_transport import TransportParams + +from loguru import logger + + +class BaseOutputTransport(FrameProcessor): + + def __init__(self, params: TransportParams): + super().__init__() + + self._params = params + + self._running = True + + # These are the images that we should send to the camera at our desired + # framerate. + self._camera_images = None + + # Start media threads. + if self._params.camera_out_enabled: + self._camera_out_queue = queue.Queue() + self._camera_out_thread = threading.Thread(target=self._camera_out_thread_handler) + self._camera_out_thread.start() + + self._sink_queue = queue.Queue() + self._sink_thread = threading.Thread(target=self._sink_thread_handler) + + self._stopped_event = asyncio.Event() + + async def start(self): + self._sink_thread.start() + + async def stop(self): + # This will exit all threads. + self._running = False + + self._stopped_event.set() + + def write_frame_to_camera(self, frame: ImageRawFrame): + pass + + def write_raw_audio_frames(self, frames: bytes): + pass + + # + # Frame processor + # + + async def cleanup(self): + if self._params.camera_out_enabled: + self._camera_out_thread.join() + + self._sink_thread.join() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, StartFrame): + await self.push_frame(frame, direction) + await self.start() + # EndFrame is managed in the queue handler. + elif isinstance(frame, CancelFrame): + await self.push_frame(frame, direction) + await self.stop() + elif self._frame_managed_by_sink(frame): + self._sink_queue.put(frame) + else: + await self.push_frame(frame, direction) + + # If we are finishing, wait here until we have stopped, otherwise we might + # close things too early upstream. + if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame): + await self._stopped_event.wait() + + def _frame_managed_by_sink(self, frame: Frame): + return (isinstance(frame, AudioRawFrame) + or isinstance(frame, ImageRawFrame) + or isinstance(frame, SpriteFrame) + or isinstance(frame, CancelFrame) + or isinstance(frame, EndFrame)) + + def _sink_thread_handler(self): + buffer = bytearray() + bytes_size_10ms = int(self._params.audio_out_sample_rate / 100) * \ + self._params.audio_out_channels * 2 + while self._running: + try: + frame = self._sink_queue.get(timeout=1) + if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame): + # Send all remaining audio before stopping (multiple of 10ms of audio). 
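+                    # (_send_audio_truncated() only writes whole multiples of
+                    # bytes_size_10ms; any shorter tail still in the buffer is
+                    # dropped at shutdown.)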
+ self._send_audio_truncated(buffer, bytes_size_10ms) + future = asyncio.run_coroutine_threadsafe(self.stop(), self.get_event_loop()) + future.result() + elif isinstance(frame, AudioRawFrame): + if self._params.audio_out_enabled: + buffer.extend(frame.data) + buffer = self._send_audio_truncated(buffer, bytes_size_10ms) + elif isinstance(frame, ImageRawFrame) and self._params.camera_out_enabled: + self._set_camera_image(frame) + elif isinstance(frame, SpriteFrame) and self._params.camera_out_enabled: + self._set_camera_images(frame.images) + except queue.Empty: + pass + except BaseException as e: + logger.error(f"Error processing sink queue: {e}") + + # + # Camera out + # + + async def send_image(self, frame: ImageRawFrame | SpriteFrame): + await self.process_frame(frame, FrameDirection.DOWNSTREAM) + + def _draw_image(self, image: ImageRawFrame): + desired_size = (self._params.camera_out_width, self._params.camera_out_height) + + if image.size != desired_size: + logger.warning( + f"{image} does not have the expected size {desired_size}, ignoring") + return + + self.write_frame_to_camera(image) + + def _set_camera_image(self, image: ImageRawFrame): + if self._params.camera_out_is_live: + self._camera_out_queue.put(image) + else: + self._camera_images = itertools.cycle([image]) + + def _set_camera_images(self, images: List[ImageRawFrame]): + self._camera_images = itertools.cycle(images) + + def _camera_out_thread_handler(self): + while self._running: + try: + if self._params.camera_out_is_live: + image = self._camera_out_queue.get(timeout=1) + self._draw_image(image) + elif self._camera_images: + image = next(self._camera_images) + self._draw_image(image) + time.sleep(1.0 / self._params.camera_out_framerate) + except queue.Empty: + pass + except Exception as e: + logger.error(f"Error writing to camera: {e}") + + # + # Audio out + # + + async def send_audio(self, frame: AudioRawFrame): + await self.process_frame(frame, FrameDirection.DOWNSTREAM) + + def _send_audio_truncated(self, buffer: bytearray, smallest_write_size: int) -> bytearray: + try: + truncated_length: int = len(buffer) - (len(buffer) % smallest_write_size) + if truncated_length: + self.write_raw_audio_frames(bytes(buffer[:truncated_length])) + buffer = buffer[truncated_length:] + return buffer + except BaseException as e: + logger.error(f"Error writing audio frames: {e}") + return buffer diff --git a/src/pipecat/transports/base_transport.py b/src/pipecat/transports/base_transport.py new file mode 100644 index 000000000..e597ddace --- /dev/null +++ b/src/pipecat/transports/base_transport.py @@ -0,0 +1,40 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from abc import ABC, abstractmethod + +from pydantic.main import BaseModel + +from pipecat.processors.frame_processor import FrameProcessor + + +class TransportParams(BaseModel): + camera_out_enabled: bool = False + camera_out_is_live: bool = False + camera_out_width: int = 1024 + camera_out_height: int = 768 + camera_out_bitrate: int = 800000 + camera_out_framerate: int = 30 + camera_out_color_format: str = "RGB" + audio_out_enabled: bool = False + audio_out_sample_rate: int = 16000 + audio_out_channels: int = 1 + audio_in_enabled: bool = False + audio_in_sample_rate: int = 16000 + audio_in_channels: int = 1 + vad_enabled: bool = False + vad_audio_passthrough: bool = False + + +class BaseTransport(ABC): + + @abstractmethod + def input(self) -> FrameProcessor: + pass + + @abstractmethod + def output(self) -> FrameProcessor: + pass diff 
--git a/src/pipecat/transports/local/__init__.py b/src/pipecat/transports/local/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/pipecat/transports/local/audio.py b/src/pipecat/transports/local/audio.py new file mode 100644 index 000000000..b4a380259 --- /dev/null +++ b/src/pipecat/transports/local/audio.py @@ -0,0 +1,93 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio + +from pipecat.processors.frame_processor import FrameProcessor +from pipecat.transports.base_input import BaseInputTransport +from pipecat.transports.base_output import BaseOutputTransport +from pipecat.transports.base_transport import BaseTransport, TransportParams + +from loguru import logger + +try: + import pyaudio +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error( + "In order to use local audio, you need to `pip install pipecat[audio]`. On MacOS, you also need to `brew install portaudio`.") + raise Exception(f"Missing module: {e}") + + +class AudioInputTransport(BaseInputTransport): + + def __init__(self, py_audio: pyaudio.PyAudio, params: TransportParams): + super().__init__(params) + + self._in_stream = py_audio.open( + format=py_audio.get_format_from_width(2), + channels=params.audio_in_channels, + rate=params.audio_in_sample_rate, + frames_per_buffer=params.audio_in_sample_rate, + input=True) + + def read_raw_audio_frames(self, frame_count: int) -> bytes: + return self._in_stream.read(frame_count, exception_on_overflow=False) + + async def cleanup(self): + # This is not very pretty (taken from PyAudio docs). + while self._in_stream.is_active(): + await asyncio.sleep(0.1) + self._in_stream.close() + + await super().cleanup() + + +class AudioOutputTransport(BaseOutputTransport): + + def __init__(self, py_audio: pyaudio.PyAudio, params: TransportParams): + super().__init__(params) + + self._out_stream = py_audio.open( + format=py_audio.get_format_from_width(2), + channels=params.audio_out_channels, + rate=params.audio_out_sample_rate, + output=True) + + def write_raw_audio_frames(self, frames: bytes): + self._out_stream.write(frames) + + async def cleanup(self): + # This is not very pretty (taken from PyAudio docs). 
+ while self._out_stream.is_active(): + await asyncio.sleep(0.1) + self._out_stream.close() + + await super().cleanup() + + +class LocalAudioTransport(BaseTransport): + + def __init__(self, params: TransportParams): + self._params = params + self._pyaudio = pyaudio.PyAudio() + + self._input: AudioInputTransport | None = None + self._output: AudioOutputTransport | None = None + + # + # BaseTransport + # + + def input(self) -> FrameProcessor: + if not self._input: + self._input = AudioInputTransport(self._pyaudio, self._params) + return self._input + + def output(self) -> FrameProcessor: + if not self._output: + self._output = AudioOutputTransport(self._pyaudio, self._params) + return self._output diff --git a/src/pipecat/transports/local/tk.py b/src/pipecat/transports/local/tk.py new file mode 100644 index 000000000..6a05c9a63 --- /dev/null +++ b/src/pipecat/transports/local/tk.py @@ -0,0 +1,130 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio + +import numpy as np +import tkinter as tk + +from pipecat.frames.frames import ImageRawFrame +from pipecat.processors.frame_processor import FrameProcessor +from pipecat.transports.base_input import BaseInputTransport +from pipecat.transports.base_output import BaseOutputTransport +from pipecat.transports.base_transport import BaseTransport, TransportParams + +from loguru import logger + +try: + import pyaudio +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error( + "In order to use local audio, you need to `pip install pipecat[audio]`. On MacOS, you also need to `brew install portaudio`.") + raise Exception(f"Missing module: {e}") + +try: + import tkinter as tk +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error("tkinter missing. Try `apt install python3-tk` or `brew install python-tk@3.10`.") + raise Exception(f"Missing module: {e}") + + +class TkInputTransport(BaseInputTransport): + + def __init__(self, py_audio: pyaudio.PyAudio, params: TransportParams): + super().__init__(params) + + self._in_stream = py_audio.open( + format=py_audio.get_format_from_width(2), + channels=params.audio_in_channels, + rate=params.audio_in_sample_rate, + frames_per_buffer=params.audio_in_sample_rate, + input=True) + + def read_raw_audio_frames(self, frame_count: int) -> bytes: + return self._in_stream.read(frame_count, exception_on_overflow=False) + + async def cleanup(self): + # This is not very pretty (taken from PyAudio docs). + while self._in_stream.is_active(): + await asyncio.sleep(0.1) + self._in_stream.close() + + await super().cleanup() + + +class TkOutputTransport(BaseOutputTransport): + + def __init__(self, tk_root: tk.Tk, py_audio: pyaudio.PyAudio, params: TransportParams): + super().__init__(params) + + self._out_stream = py_audio.open( + format=py_audio.get_format_from_width(2), + channels=params.audio_out_channels, + rate=params.audio_out_sample_rate, + output=True) + + # Start with a neutral gray background. 
+ array = np.ones((1024, 1024, 3)) * 128 + data = f"P5 {1024} {1024} 255 ".encode() + array.astype(np.uint8).tobytes() + photo = tk.PhotoImage(width=1024, height=1024, data=data, format="PPM") + self._image_label = tk.Label(tk_root, image=photo) + self._image_label.pack() + + def write_raw_audio_frames(self, frames: bytes): + self._out_stream.write(frames) + + def write_frame_to_camera(self, frame: ImageRawFrame): + asyncio.run_coroutine_threadsafe(self._write_frame_to_tk(frame), self.get_event_loop()) + + async def cleanup(self): + # This is not very pretty (taken from PyAudio docs). + while self._out_stream.is_active(): + await asyncio.sleep(0.1) + self._out_stream.close() + + await super().cleanup() + + async def _write_frame_to_tk(self, frame: ImageRawFrame): + width = frame.size[0] + height = frame.size[1] + data = f"P6 {width} {height} 255 ".encode() + frame.data + photo = tk.PhotoImage( + width=width, + height=height, + data=data, + format="PPM") + self._image_label.config(image=photo) + + # This holds a reference to the photo, preventing it from being garbage + # collected. + self._image_label.image = photo + + +class TkLocalTransport(BaseTransport): + + def __init__(self, tk_root: tk.Tk, params: TransportParams): + self._tk_root = tk_root + self._params = params + self._pyaudio = pyaudio.PyAudio() + + self._input: TkInputTransport | None = None + self._output: TkOutputTransport | None = None + + # + # BaseTransport + # + + def input(self) -> FrameProcessor: + if not self._input: + self._input = TkInputTransport(self._pyaudio, self._params) + return self._input + + def output(self) -> FrameProcessor: + if not self._output: + self._output = TkOutputTransport(self._tk_root, self._pyaudio, self._params) + return self._output diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py new file mode 100644 index 000000000..84d690569 --- /dev/null +++ b/src/pipecat/transports/services/daily.py @@ -0,0 +1,728 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import inspect +import queue +import threading +import time +import types + +from functools import partial +from typing import Any, Callable, Mapping + +from daily import ( + CallClient, + Daily, + EventHandler, + VirtualCameraDevice, + VirtualMicrophoneDevice, + VirtualSpeakerDevice) +from pydantic.main import BaseModel + +from pipecat.frames.frames import ( + AudioRawFrame, + Frame, + ImageRawFrame, + InterimTranscriptionFrame, + SpriteFrame, + TranscriptionFrame, + UserImageRawFrame, + UserImageRequestFrame) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.transports.base_input import BaseInputTransport +from pipecat.transports.base_output import BaseOutputTransport +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.vad.vad_analyzer import VADAnalyzer, VADState + +from loguru import logger + +try: + from daily import (EventHandler, CallClient, Daily) +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error("In order to use the Daily transport, you need to `pip install pipecat[daily]`.") + raise Exception(f"Missing module: {e}") + +VAD_RESET_PERIOD_MS = 2000 + + +class WebRTCVADAnalyzer(VADAnalyzer): + + def __init__(self, sample_rate=16000, num_channels=1): + super().__init__(sample_rate, num_channels) + + self._webrtc_vad = Daily.create_native_vad( + reset_period_ms=VAD_RESET_PERIOD_MS, + sample_rate=sample_rate, + 
channels=num_channels + ) + logger.debug("Loaded native WebRTC VAD") + + def num_frames_required(self) -> int: + return int(self.sample_rate / 100.0) + + def voice_confidence(self, buffer) -> float: + confidence = 0 + if len(buffer) > 0: + confidence = self._webrtc_vad.analyze_frames(buffer) + return confidence + + +class DailyParams(TransportParams): + transcription_enabled: bool = False + transcription_settings: Mapping[str, Any] = { + "language": "en", + "tier": "nova", + "model": "2-conversationalai", + "profanity_filter": True, + "redact": False, + "endpointing": True, + "punctuate": True, + "includeRawResponse": True, + "extra": { + "interim_results": True, + } + } + + +class DailyCallbacks(BaseModel): + on_joined: Callable[[Mapping[str, Any]], None] + on_left: Callable[[], None] + on_participant_joined: Callable[[Mapping[str, Any]], None] + on_first_participant_joined: Callable[[Mapping[str, Any]], None] + on_error: Callable[[str], None] + + +class DailySession(EventHandler): + + _daily_initialized: bool = False + + # This is necessary to override EventHandler's __new__ method. + def __new__(cls, *args, **kwargs): + return super().__new__(cls) + + def __init__( + self, + room_url: str, + token: str | None, + bot_name: str, + params: DailyParams, + callbacks: DailyCallbacks): + super().__init__() + + if not self._daily_initialized: + self._daily_initialized = True + Daily.init() + + self._room_url: str = room_url + self._token: str | None = token + self._bot_name: str = bot_name + self._params: DailyParams = params + self._callbacks = callbacks + + self._participant_id: str = "" + self._video_renderers = {} + self._transcription_renderers = {} + self._other_participant_has_joined = False + + self._joined = False + self._joining = False + self._leaving = False + self._sync_response = {k: queue.Queue() for k in ["join", "leave"]} + + self._client: CallClient = CallClient(event_handler=self) + + self._camera: VirtualCameraDevice = Daily.create_camera_device( + "camera", + width=self._params.camera_out_width, + height=self._params.camera_out_height, + color_format=self._params.camera_out_color_format) + + self._mic: VirtualMicrophoneDevice = Daily.create_microphone_device( + "mic", sample_rate=self._params.audio_out_sample_rate, channels=self._params.audio_out_channels) + + self._speaker: VirtualSpeakerDevice = Daily.create_speaker_device( + "speaker", sample_rate=self._params.audio_in_sample_rate, channels=self._params.audio_in_channels) + Daily.select_speaker_device("speaker") + + self._vad_analyzer = None + if self._params.vad_enabled: + self._vad_analyzer = WebRTCVADAnalyzer( + sample_rate=self._params.audio_in_sample_rate, + num_channels=self._params.audio_in_channels) + + @ property + def participant_id(self) -> str: + return self._participant_id + + def set_callbacks(self, callbacks: DailyCallbacks): + self._callbacks = callbacks + + def vad_analyze(self, audio_frames: bytes) -> VADState: + state = VADState.QUIET + if self._vad_analyzer: + state = self._vad_analyzer.analyze_audio(audio_frames) + return state + + def read_raw_audio_frames(self, frame_count: int) -> bytes: + return self._speaker.read_frames(frame_count) + + def write_raw_audio_frames(self, frames: bytes): + self._mic.write_frames(frames) + + def write_frame_to_camera(self, frame: ImageRawFrame): + self._camera.write_frame(frame.data) + + async def join(self): + # Transport already joined, ignore. 
+ if self._joined or self._joining: + return + + self._joining = True + + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, self._join) + + def _join(self): + logger.info(f"Joining {self._room_url}") + + # For performance reasons, never subscribe to video streams (unless a + # video renderer is registered). + self._client.update_subscription_profiles({ + "base": { + "camera": "unsubscribed", + "screenVideo": "unsubscribed" + } + }) + + self._client.set_user_name(self._bot_name) + + self._client.join( + self._room_url, + self._token, + completion=self._call_joined, + client_settings={ + "inputs": { + "camera": { + "isEnabled": True, + "settings": { + "deviceId": "camera", + }, + }, + "microphone": { + "isEnabled": True, + "settings": { + "deviceId": "mic", + "customConstraints": { + "autoGainControl": {"exact": False}, + "echoCancellation": {"exact": False}, + "noiseSuppression": {"exact": False}, + }, + }, + }, + }, + "publishing": { + "camera": { + "sendSettings": { + "maxQuality": "low", + "encodings": { + "low": { + "maxBitrate": self._params.camera_out_bitrate, + "maxFramerate": self._params.camera_out_framerate, + } + }, + } + } + }, + }) + + self._handle_join_response() + + def _handle_join_response(self): + try: + (data, error) = self._sync_response["join"].get(timeout=10) + if not error: + self._joined = True + self._joining = False + + logger.info(f"Joined {self._room_url}") + + if self._token and self._params.transcription_enabled: + logger.info( + f"Enabling transcription with settings {self._params.transcription_settings}") + self._client.start_transcription(self._params.transcription_settings) + + self._callbacks.on_joined(data["participants"]["local"]) + else: + error_msg = f"Error joining {self._room_url}: {error}" + logger.error(error_msg) + self._callbacks.on_error(error_msg) + except queue.Empty: + error_msg = f"Time out joining {self._room_url}" + logger.error(error_msg) + self._callbacks.on_error(error_msg) + + async def leave(self): + # Transport not joined, ignore. 
+ if not self._joined or self._leaving: + return + + self._joined = False + self._leaving = True + + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, self._leave) + + def _leave(self): + logger.info(f"Leaving {self._room_url}") + + if self._params.transcription_enabled: + self._client.stop_transcription() + + self._client.leave(completion=self._call_left) + + self._handle_leave_response() + + def _handle_leave_response(self): + try: + error = self._sync_response["leave"].get(timeout=10) + if not error: + self._leaving = False + logger.info(f"Left {self._room_url}") + self._callbacks.on_left() + else: + error_msg = f"Error leaving {self._room_url}: {error}" + logger.error(error_msg) + self._callbacks.on_error(error_msg) + except queue.Empty: + error_msg = f"Time out leaving {self._room_url}" + logger.error(error_msg) + self._callbacks.on_error(error_msg) + + async def cleanup(self): + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, self._cleanup) + + def _cleanup(self): + if self._client: + self._client.release() + self._client = None + + def capture_participant_transcription(self, participant_id: str, callback: Callable): + if not self._params.transcription_enabled: + return + + self._transcription_renderers[participant_id] = callback + + def capture_participant_video( + self, + participant_id: str, + callback: Callable, + framerate: int = 30, + video_source: str = "camera", + color_format: str = "RGB"): + # Only enable camera subscription on this participant + self._client.update_subscriptions(participant_settings={ + participant_id: { + "media": "subscribed" + } + }) + + self._video_renderers[participant_id] = callback + + self._client.set_video_renderer( + participant_id, + self._video_frame_received, + video_source=video_source, + color_format=color_format) + + # + # + # Daily (EventHandler) + # + + def on_participant_joined(self, participant): + id = participant["id"] + logger.info(f"Participant joined {id}") + + if not self._other_participant_has_joined: + self._other_participant_has_joined = True + self._callbacks.on_first_participant_joined(participant) + + self._callbacks.on_participant_joined(participant) + + def on_transcription_message(self, message: Mapping[str, Any]): + participant_id = "" + if "participantId" in message: + participant_id = message["participantId"] + + if participant_id in self._transcription_renderers: + callback = self._transcription_renderers[participant_id] + callback(participant_id, message) + + def on_transcription_error(self, message): + logger.error(f"Transcription error: {message}") + + def on_transcription_started(self, status): + logger.debug(f"Transcription started: {status}") + + def on_transcription_stopped(self, stopped_by, stopped_by_error): + logger.debug("Transcription stopped") + + # Daily (CallClient callbacks) + # + + def _call_joined(self, data, error): + self._sync_response["join"].put((data, error)) + + def _call_left(self, error): + self._sync_response["leave"].put(error) + + def _video_frame_received(self, participant_id, video_frame): + callback = self._video_renderers[participant_id] + callback(participant_id, + video_frame.buffer, + (video_frame.width, video_frame.height), + video_frame.color_format) + + +class DailyInputTransport(BaseInputTransport): + + def __init__(self, session: DailySession, params: DailyParams): + super().__init__(params) + + self._session = session + + self._video_renderers = {} + self._camera_in_queue = queue.Queue() + self._camera_in_thread = 
threading.Thread(target=self._camera_in_thread_handler) + self._camera_in_thread.start() + + async def start(self): + await self._session.join() + await super().start() + + async def stop(self): + await self._session.leave() + await super().stop() + + async def cleanup(self): + self._camera_in_thread.join() + + await self._session.cleanup() + + await super().cleanup() + + def vad_analyze(self, audio_frames: bytes) -> VADState: + return self._session.vad_analyze(audio_frames) + + def read_raw_audio_frames(self, frame_count: int) -> bytes: + return self._session.read_raw_audio_frames(frame_count) + + # + # FrameProcessor + # + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, UserImageRequestFrame): + self.request_participant_image(frame.user_id) + + await super().process_frame(frame, direction) + + # + # Transcription + # + + def capture_participant_transcription(self, participant_id: str): + self._session.capture_participant_transcription( + participant_id, + self._on_transcription_message + ) + + def _on_transcription_message(self, participant_id, message): + text = message["text"] + timestamp = message["timestamp"] + is_final = message["rawResponse"]["is_final"] + if is_final: + frame = TranscriptionFrame(text, participant_id, timestamp) + else: + frame = InterimTranscriptionFrame(text, participant_id, timestamp) + future = asyncio.run_coroutine_threadsafe(self.push_frame(frame), self.get_event_loop()) + future.result() + + # + # Camera in + # + + def capture_participant_video( + self, + participant_id: str, + framerate: int = 30, + video_source: str = "camera", + color_format: str = "RGB"): + self._video_renderers[participant_id] = { + "framerate": framerate, + "timestamp": 0, + "render_next_frame": False, + } + + self._session.capture_participant_video( + participant_id, + self._on_participant_video_frame, + framerate, + video_source, + color_format + ) + + def request_participant_image(self, participant_id: str): + if participant_id in self._video_renderers: + self._video_renderers[participant_id]["render_next_frame"] = True + + def _on_participant_video_frame(self, participant_id: str, buffer, size, format): + render_frame = False + + curr_time = time.time() + prev_time = self._video_renderers[participant_id]["timestamp"] or curr_time + framerate = self._video_renderers[participant_id]["framerate"] + + if framerate > 0: + next_time = prev_time + 1 / framerate + render_frame = (curr_time - next_time) < 0.1 + elif self._video_renderers[participant_id]["render_next_frame"]: + self._video_renderers[participant_id]["render_next_frame"] = False + render_frame = True + + if render_frame: + frame = UserImageRawFrame(participant_id, buffer, size, format) + self._camera_in_queue.put(frame) + + self._video_renderers[participant_id]["timestamp"] = curr_time + + def _camera_in_thread_handler(self): + while self._running: + try: + frame = self._camera_in_queue.get(timeout=1) + future = asyncio.run_coroutine_threadsafe( + self.push_frame(frame), self.get_event_loop()) + future.result() + except queue.Empty: + pass + except BaseException as e: + logger.error(f"Error capturing video: {e}") + + +class DailyOutputTransport(BaseOutputTransport): + + def __init__(self, session: DailySession, params: DailyParams): + super().__init__(params) + + self._session = session + + async def start(self): + await self._session.join() + await super().start() + + async def stop(self): + await self._session.leave() + await super().stop() + + async def cleanup(self): + 
await self._session.cleanup() + await super().cleanup() + + def write_raw_audio_frames(self, frames: bytes): + self._session.write_raw_audio_frames(frames) + + def write_frame_to_camera(self, frame: ImageRawFrame): + self._session.write_frame_to_camera(frame) + + +class DailyTransport(BaseTransport): + + def __init__(self, room_url: str, token: str | None, bot_name: str, params: DailyParams): + callbacks = DailyCallbacks( + on_joined=self._on_joined, + on_left=self._on_left, + on_first_participant_joined=self._on_first_participant_joined, + on_participant_joined=self._on_participant_joined, + on_error=self._on_error, + ) + self._params = params + + self._session = DailySession(room_url, token, bot_name, params, callbacks) + self._input: DailyInputTransport | None = None + self._output: DailyOutputTransport | None = None + self._loop = asyncio.get_running_loop() + + self._event_handlers: dict = {} + + # Register supported handlers. The user will only be able to register + # these handlers. + self._register_event_handler("on_joined") + self._register_event_handler("on_left") + self._register_event_handler("on_participant_joined") + self._register_event_handler("on_first_participant_joined") + + # + # BaseTransport + # + + def input(self) -> FrameProcessor: + if not self._input: + self._input = DailyInputTransport(self._session, self._params) + return self._input + + def output(self) -> FrameProcessor: + if not self._output: + self._output = DailyOutputTransport(self._session, self._params) + return self._output + + # + # DailyTransport + # + + @property + def participant_id(self) -> str: + return self._session.participant_id + + async def send_image(self, frame: ImageRawFrame | SpriteFrame): + if self._output: + await self._output.process_frame(frame, FrameDirection.DOWNSTREAM) + + async def send_audio(self, frame: AudioRawFrame): + if self._output: + await self._output.process_frame(frame, FrameDirection.DOWNSTREAM) + + def capture_participant_transcription(self, participant_id: str): + if self._input: + self._input.capture_participant_transcription(participant_id) + + def capture_participant_video( + self, + participant_id: str, + framerate: int = 30, + video_source: str = "camera", + color_format: str = "RGB"): + if self._input: + self._input.capture_participant_video( + participant_id, framerate, video_source, color_format) + + def _on_joined(self, participant): + self.on_joined(participant) + + def _on_left(self): + self.on_left() + + def _on_error(self, error): + # TODO(aleix): Report error to input/output transports. The one managing + # the session should report the error. 
+ pass + + def _on_participant_joined(self, participant): + self.on_participant_joined(participant) + + def _on_first_participant_joined(self, participant): + self.on_first_participant_joined(participant) + + # + # Decorators (event handlers) + # + + def on_joined(self, participant): + pass + + def on_left(self): + pass + + def on_participant_joined(self, participant): + pass + + def on_first_participant_joined(self, participant): + pass + + def event_handler(self, event_name: str): + def decorator(handler): + self._add_event_handler(event_name, handler) + return handler + return decorator + + def _register_event_handler(self, event_name: str): + methods = inspect.getmembers(self, predicate=inspect.ismethod) + if event_name not in [method[0] for method in methods]: + raise Exception(f"Event handler {event_name} not found") + + self._event_handlers[event_name] = [getattr(self, event_name)] + + patch_method = types.MethodType(partial(self._patch_method, event_name), self) + setattr(self, event_name, patch_method) + + def _add_event_handler(self, event_name: str, handler): + if event_name not in self._event_handlers: + raise Exception(f"Event handler {event_name} not registered") + self._event_handlers[event_name].append(types.MethodType(handler, self)) + + def _patch_method(self, event_name, *args, **kwargs): + try: + for handler in self._event_handlers[event_name]: + if inspect.iscoroutinefunction(handler): + # Beware, if handler() calls another event handler it + # will deadlock. You shouldn't do that anyways. + future = asyncio.run_coroutine_threadsafe( + handler(*args[1:], **kwargs), self._loop) + + # wait for the coroutine to finish. This will also + # raise any exceptions raised by the coroutine. + future.result() + else: + handler(*args[1:], **kwargs) + except Exception as e: + logger.error(f"Exception in event handler {event_name}: {e}") + raise e + + # def send_app_message(self, message: Any, participant_id: str | None): + # self.client.send_app_message(message, participant_id) + + # def process_interrupt_handler(self, signum, frame): + # self._post_run() + # if callable(self.original_sigint_handler): + # self.original_sigint_handler(signum, frame) + + # def _post_run(self): + # self.client.leave() + # self.client.release() + + # def on_first_other_participant_joined(self, participant): + # pass + + # def call_joined(self, join_data, client_error): + # # self._logger.info(f"Call_joined: {join_data}, {client_error}") + # pass + + # def dialout(self, number): + # self.client.start_dialout({"phoneNumber": number}) + + # def start_recording(self): + # self.client.start_recording() + + # def on_error(self, error): + # self._logger.error(f"on_error: {error}") + + # def on_participant_joined(self, participant): + # if not self._other_participant_has_joined and participant["id"] != self._my_participant_id: + # self._other_participant_has_joined = True + # self.on_first_other_participant_joined(participant) + + # def on_participant_left(self, participant, reason): + # if len(self.client.participants()) < self._min_others_count + 1: + # self._stop_threads.set() + + # def on_app_message(self, message: Any, sender: str): + # if self._loop: + # frame = ReceivedAppMessageFrame(message, sender) + # asyncio.run_coroutine_threadsafe( + # self.receive_queue.put(frame), self._loop + # ) diff --git a/src/pipecat/utils/__init__.py b/src/pipecat/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/pipecat/utils/utils.py b/src/pipecat/utils/utils.py new file mode 100644 index 
000000000..a72f7234e --- /dev/null +++ b/src/pipecat/utils/utils.py @@ -0,0 +1,31 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from threading import Lock + +_COUNTS = {} +_COUNTS_MUTEX = Lock() + +_ID = 0 +_ID_MUTEX = Lock() + + +def obj_id() -> int: + global _ID, _ID_MUTEX + with _ID_MUTEX: + _ID += 1 + return _ID + + +def obj_count(obj) -> int: + global _COUNTS, COUNTS_MUTEX + name = obj.__class__.__name__ + with _COUNTS_MUTEX: + if name not in _COUNTS: + _COUNTS[name] = 0 + else: + _COUNTS[name] += 1 + return _COUNTS[name] diff --git a/src/pipecat/vad/__init__.py b/src/pipecat/vad/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/pipecat/vad/silero.py b/src/pipecat/vad/silero.py new file mode 100644 index 000000000..a9e5aa0ed --- /dev/null +++ b/src/pipecat/vad/silero.py @@ -0,0 +1,103 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import numpy as np + +from pipecat.frames.frames import AudioRawFrame, Frame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.vad.vad_analyzer import VADAnalyzer, VADState + +from loguru import logger + +try: + import torch + # We don't use torchaudio here, but we need to try importing it because + # Silero uses it. + import torchaudio + + torch.set_num_threads(1) + +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error("In order to use Silero VAD, you need to `pip install pipecat[silero]`.") + raise Exception(f"Missing module(s): {e}") + + +# Provided by Alexander Veysov +def int2float(sound): + try: + abs_max = np.abs(sound).max() + sound = sound.astype("float32") + if abs_max > 0: + sound *= 1 / 32768 + sound = sound.squeeze() # depends on the use case + return sound + except ValueError: + return sound + + +class SileroVAD(FrameProcessor, VADAnalyzer): + + def __init__(self, sample_rate=16000, audio_passthrough=False): + FrameProcessor.__init__(self) + VADAnalyzer.__init__(self, sample_rate=sample_rate, num_channels=1) + + logger.debug("Loading Silero VAD model...") + + (self._model, self._utils) = torch.hub.load( + repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False + ) + + self._processor_vad_state: VADState = VADState.QUIET + self._audio_passthrough = audio_passthrough + + logger.debug("Loaded Silero VAD") + + # + # VADAnalyzer + # + + def num_frames_required(self) -> int: + return int(self.sample_rate / 100) * 4 # 40ms + + def voice_confidence(self, buffer) -> float: + try: + audio_int16 = np.frombuffer(buffer, np.int16) + audio_float32 = int2float(audio_int16) + new_confidence = self._model(torch.from_numpy(audio_float32), self.sample_rate).item() + return new_confidence + except BaseException as e: + # This comes from an empty audio array + logger.error(f"Error analyzing audio with Silero VAD: {e}") + return 0 + + # + # FrameProcessor + # + + async def process_frame(self, frame: Frame, direction: FrameDirection): + if isinstance(frame, AudioRawFrame): + await self._analyze_audio(frame) + if self._audio_passthrough: + await self.push_frame(frame, direction) + else: + await self.push_frame(frame, direction) + + async def _analyze_audio(self, frame: AudioRawFrame): + # Check VAD and push event if necessary. We just care about changes + # from QUIET to SPEAKING and vice versa. 
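+        # (STARTING and STOPPING are transitional states, so no frame is pushed
+        # until the analyzer settles on SPEAKING or QUIET.)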
+ new_vad_state = self.analyze_audio(frame.data) + if new_vad_state != self._processor_vad_state and new_vad_state != VADState.STARTING and new_vad_state != VADState.STOPPING: + new_frame = None + + if new_vad_state == VADState.SPEAKING: + new_frame = UserStartedSpeakingFrame() + elif new_vad_state == VADState.QUIET: + new_frame = UserStoppedSpeakingFrame() + + if new_frame: + await self.push_frame(new_frame) + self._processor_vad_state = new_vad_state diff --git a/src/pipecat/vad/vad_analyzer.py b/src/pipecat/vad/vad_analyzer.py new file mode 100644 index 000000000..a506292e2 --- /dev/null +++ b/src/pipecat/vad/vad_analyzer.py @@ -0,0 +1,104 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from abc import abstractmethod +from enum import Enum + + +class VADState(Enum): + QUIET = 1 + STARTING = 2 + SPEAKING = 3 + STOPPING = 4 + + +class VADAnalyzer: + + def __init__( + self, + sample_rate, + num_channels, + vad_confidence=0.5, + vad_start_s=0.2, + vad_stop_s=0.8): + self._sample_rate = sample_rate + self._vad_confidence = vad_confidence + self._vad_start_s = vad_start_s + self._vad_stop_s = vad_stop_s + self._vad_frames = self.num_frames_required() + self._vad_frames_num_bytes = self._vad_frames * num_channels * 2 + + vad_frame_s = self._vad_frames / self._sample_rate + + self._vad_start_frames = round(self._vad_start_s / vad_frame_s) + self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s) + self._vad_starting_count = 0 + self._vad_stopping_count = 0 + self._vad_state: VADState = VADState.QUIET + + self._vad_buffer = b"" + + @property + def sample_rate(self): + return self._sample_rate + + @abstractmethod + def num_frames_required(self) -> int: + pass + + @abstractmethod + def voice_confidence(self, buffer) -> float: + pass + + def analyze_audio(self, buffer) -> VADState: + self._vad_buffer += buffer + + num_required_bytes = self._vad_frames_num_bytes + if len(self._vad_buffer) < num_required_bytes: + return self._vad_state + + audio_frames = self._vad_buffer[:num_required_bytes] + self._vad_buffer = self._vad_buffer[num_required_bytes:] + + confidence = self.voice_confidence(audio_frames) + speaking = confidence >= self._vad_confidence + + if speaking: + match self._vad_state: + case VADState.QUIET: + self._vad_state = VADState.STARTING + self._vad_starting_count = 1 + case VADState.STARTING: + self._vad_starting_count += 1 + case VADState.STOPPING: + self._vad_state = VADState.SPEAKING + self._vad_stopping_count = 0 + else: + match self._vad_state: + case VADState.STARTING: + self._vad_state = VADState.QUIET + self._vad_starting_count = 0 + case VADState.SPEAKING: + self._vad_state = VADState.STOPPING + self._vad_stopping_count = 1 + case VADState.STOPPING: + self._vad_stopping_count += 1 + + if ( + self._vad_state == VADState.STARTING + and self._vad_starting_count >= self._vad_start_frames + ): + self._vad_state = VADState.SPEAKING + self._vad_starting_count = 0 + + if ( + self._vad_state == VADState.STOPPING + and self._vad_stopping_count >= self._vad_stop_frames + ): + self._vad_state = VADState.QUIET + self._vad_stopping_count = 0 + + return self._vad_state diff --git a/tests/integration/integration_azure_llm.py b/tests/integration/integration_azure_llm.py index d4744bd8b..62527baa2 100644 --- a/tests/integration/integration_azure_llm.py +++ b/tests/integration/integration_azure_llm.py @@ -1,8 +1,8 @@ import asyncio import os -from dailyai.pipeline.openai_frames import OpenAILLMContextFrame -from dailyai.services.azure_ai_services 
import AzureLLMService -from dailyai.services.openai_llm_context import OpenAILLMContext +from pipecat.pipeline.openai_frames import OpenAILLMContextFrame +from pipecat.services.azure_ai_services import AzureLLMService +from pipecat.services.openai_llm_context import OpenAILLMContext from openai.types.chat import ( ChatCompletionSystemMessageParam, diff --git a/tests/integration/integration_ollama_llm.py b/tests/integration/integration_ollama_llm.py index 2ac90ce65..e85425f8e 100644 --- a/tests/integration/integration_ollama_llm.py +++ b/tests/integration/integration_ollama_llm.py @@ -1,11 +1,11 @@ import asyncio -from dailyai.pipeline.openai_frames import OpenAILLMContextFrame -from dailyai.services.openai_llm_context import OpenAILLMContext +from pipecat.pipeline.openai_frames import OpenAILLMContextFrame +from pipecat.services.openai_llm_context import OpenAILLMContext from openai.types.chat import ( ChatCompletionSystemMessageParam, ) -from dailyai.services.ollama_ai_services import OLLamaLLMService +from pipecat.services.ollama_ai_services import OLLamaLLMService if __name__ == "__main__": async def test_chat(): diff --git a/tests/integration/integration_openai_llm.py b/tests/integration/integration_openai_llm.py index fa4a449ec..6f87b7fec 100644 --- a/tests/integration/integration_openai_llm.py +++ b/tests/integration/integration_openai_llm.py @@ -1,7 +1,7 @@ import asyncio import os -from dailyai.pipeline.openai_frames import OpenAILLMContextFrame -from dailyai.services.openai_llm_context import OpenAILLMContext +from pipecat.pipeline.openai_frames import OpenAILLMContextFrame +from pipecat.services.openai_llm_context import OpenAILLMContext from openai.types.chat import ( ChatCompletionSystemMessageParam, @@ -9,7 +9,7 @@ ChatCompletionUserMessageParam, ) -from dailyai.services.openai_api_llm_service import BaseOpenAILLMService +from pipecat.services.openai_api_llm_service import BaseOpenAILLMService if __name__ == "__main__": async def test_functions(): diff --git a/tests/test_aggregators.py b/tests/test_aggregators.py index 5c522f787..47f65c90a 100644 --- a/tests/test_aggregators.py +++ b/tests/test_aggregators.py @@ -3,13 +3,13 @@ import functools import unittest -from dailyai.pipeline.aggregators import ( +from pipecat.pipeline.aggregators import ( GatedAggregator, ParallelPipeline, SentenceAggregator, StatelessTextTransformer, ) -from dailyai.pipeline.frames import ( +from pipecat.pipeline.frames import ( AudioFrame, EndFrame, ImageFrame, @@ -19,7 +19,7 @@ TextFrame, ) -from dailyai.pipeline.pipeline import Pipeline +from pipecat.pipeline.pipeline import Pipeline class TestDailyFrameAggregators(unittest.IsolatedAsyncioTestCase): @@ -124,6 +124,6 @@ async def slow_add(sleep_time: float, name: str, x: str): def load_tests(loader, tests, ignore): """ Run doctests on the aggregators module. 
""" - from dailyai.pipeline import aggregators + from pipecat.pipeline import aggregators tests.addTests(doctest.DocTestSuite(aggregators)) return tests diff --git a/tests/test_ai_services.py b/tests/test_ai_services.py index 0bc9231d8..ec44d5625 100644 --- a/tests/test_ai_services.py +++ b/tests/test_ai_services.py @@ -2,8 +2,8 @@ from typing import AsyncGenerator -from dailyai.services.ai_services import AIService -from dailyai.pipeline.frames import EndFrame, Frame, TextFrame +from pipecat.services.ai_services import AIService +from pipecat.pipeline.frames import EndFrame, Frame, TextFrame class SimpleAIService(AIService): diff --git a/tests/test_daily_transport_service.py b/tests/test_daily_transport_service.py index 9d02cd14b..b654f98d3 100644 --- a/tests/test_daily_transport_service.py +++ b/tests/test_daily_transport_service.py @@ -4,7 +4,7 @@ class TestDailyTransport(unittest.IsolatedAsyncioTestCase): async def test_event_handler(self): - from dailyai.transports.daily_transport import DailyTransport + from pipecat.transports.daily_transport import DailyTransport transport = DailyTransport("mock.daily.co/mock", "token", "bot") @@ -22,7 +22,7 @@ def test_event_handler(transport, participant): """ TODO: fix this test, it broke when I added the `.result` call in the patch. async def test_event_handler_async(self): - from dailyai.services.daily_transport_service import DailyTransportService + from pipecat.services.daily_transport_service import DailyTransportService transport = DailyTransportService("mock.daily.co/mock", "token", "bot") @@ -46,10 +46,10 @@ async def test_event_handler(transport, participant): """ """ - @patch("dailyai.services.daily_transport_service.CallClient") - @patch("dailyai.services.daily_transport_service.Daily") + @patch("pipecat.services.daily_transport_service.CallClient") + @patch("pipecat.services.daily_transport_service.Daily") async def test_run_with_camera_and_mic(self, daily_mock, callclient_mock): - from dailyai.services.daily_transport_service import DailyTransportService + from pipecat.services.daily_transport_service import DailyTransportService transport = DailyTransportService( "https://mock.daily.co/mock", "token", diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 27cb947e2..c116b2c8f 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -2,11 +2,11 @@ import unittest from unittest.mock import Mock -from dailyai.pipeline.aggregators import SentenceAggregator, StatelessTextTransformer -from dailyai.pipeline.frame_processor import FrameProcessor -from dailyai.pipeline.frames import EndFrame, TextFrame +from pipecat.pipeline.aggregators import SentenceAggregator, StatelessTextTransformer +from pipecat.pipeline.frame_processor import FrameProcessor +from pipecat.pipeline.frames import EndFrame, TextFrame -from dailyai.pipeline.pipeline import Pipeline +from pipecat.pipeline.pipeline import Pipeline class TestDailyPipeline(unittest.IsolatedAsyncioTestCase): diff --git a/tests/test_protobuf_serializer.py b/tests/test_protobuf_serializer.py index 302236df6..7109d7284 100644 --- a/tests/test_protobuf_serializer.py +++ b/tests/test_protobuf_serializer.py @@ -1,7 +1,7 @@ import unittest -from dailyai.pipeline.frames import AudioFrame, TextFrame, TranscriptionFrame -from dailyai.serializers.protobuf_serializer import ProtobufFrameSerializer +from pipecat.pipeline.frames import AudioFrame, TextFrame, TranscriptionFrame +from pipecat.serializers.protobuf_serializer import ProtobufFrameSerializer class 
TestProtobufFrameSerializer(unittest.IsolatedAsyncioTestCase): diff --git a/tests/test_websocket_transport.py b/tests/test_websocket_transport.py index ebcf94ea6..601ba21ae 100644 --- a/tests/test_websocket_transport.py +++ b/tests/test_websocket_transport.py @@ -2,9 +2,9 @@ import unittest from unittest.mock import AsyncMock, patch, Mock -from dailyai.pipeline.frames import AudioFrame, EndFrame, TextFrame, TTSEndFrame, TTSStartFrame -from dailyai.pipeline.pipeline import Pipeline -from dailyai.transports.websocket_transport import WebSocketFrameProcessor, WebsocketTransport +from pipecat.pipeline.frames import AudioFrame, EndFrame, TextFrame, TTSEndFrame, TTSStartFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.transports.websocket_transport import WebSocketFrameProcessor, WebsocketTransport class TestWebSocketTransportService(unittest.IsolatedAsyncioTestCase):
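Taken together, the new transports, VAD and STT services are plain frame processors meant to be chained. The sketch below shows one way the pieces added above could be wired up; it is a minimal illustration only, and the Pipeline, PipelineTask and PipelineRunner signatures, as well as the pipecat.services.whisper module path, are assumptions rather than something shown in this hunk. The transport, VAD and STT constructors match the code above.

import asyncio

from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.whisper import WhisperSTTService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.audio import LocalAudioTransport
from pipecat.vad.silero import SileroVAD


async def main():
    # Local microphone transport; the TransportParams defaults above are
    # 16 kHz mono audio in and out.
    transport = LocalAudioTransport(TransportParams(audio_in_enabled=True))

    # SileroVAD emits UserStartedSpeakingFrame / UserStoppedSpeakingFrame and,
    # with audio_passthrough=True, forwards the raw audio downstream.
    vad = SileroVAD(sample_rate=16000, audio_passthrough=True)

    # WhisperSTTService now pushes TranscriptionFrame downstream instead of
    # returning the transcribed text.
    stt = WhisperSTTService()

    # Assumed composition: a Pipeline links the processors in order and a
    # runner drives the task (these signatures are not shown in this patch).
    pipeline = Pipeline([transport.input(), vad, stt])
    await PipelineRunner().run(PipelineTask(pipeline))


if __name__ == "__main__":
    asyncio.run(main())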