Commit 3b3f1e5
Merge remote-tracking branch 'upstream/main' into sentry-impl
cyrilS-dev committed Sep 21, 2024
2 parents 3c5483c + 26a64af
Showing 78 changed files with 1,061 additions and 537 deletions.
11 changes: 7 additions & 4 deletions .github/workflows/tests.yaml
@@ -20,23 +20,26 @@ jobs:
name: "Unit and Integration Tests"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Checkout repo
uses: actions/checkout@v4
- name: Set up Python
id: setup_python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install system packages
run: sudo apt-get install -y portaudio19-dev
id: install_system_packages
run: |
sudo apt-get install -y portaudio19-dev
- name: Setup virtual environment
run: |
python -m venv .venv
- name: Install basic Python dependencies
run: |
source .venv/bin/activate
python -m pip install --upgrade pip
pip install -r dev-requirements.txt
pip install -r test-requirements.txt
- name: Test with pytest
run: |
source .venv/bin/activate
pytest --doctest-modules --ignore-glob="*to_be_updated*" src tests
pytest --ignore-glob="*to_be_updated*" --ignore-glob=*pipeline_source* src tests
23 changes: 19 additions & 4 deletions CHANGELOG.md
@@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added configurable LLM parameters (e.g., temperature, top_p, max_tokens, seed)
for OpenAI, Anthropic, and Together AI services along with corresponding
setter functions.

- Added `sample_rate` as a constructor parameter for TTS services.

- Pipecat has a pipeline-based architecture. The pipeline consists of frame
processors linked to each other. The elements traveling across the pipeline
are called frames.
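The configurable-parameters entry above can be pictured with a short sketch. The `InputParams` name mirrors what this commit's new `07l-interruptible-together.py` example passes to `TogetherLLMService`; the classes below are illustrative stand-ins under that assumption, not pipecat's actual implementation.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class InputParams:
    # Hypothetical stand-in for a service's tunable LLM settings.
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_tokens: Optional[int] = None
    seed: Optional[int] = None


class LLMService:
    # Sketch of the pattern: params arrive via the constructor,
    # and setter functions allow adjusting them afterwards.
    def __init__(self, params: Optional[InputParams] = None):
        self._params = params or InputParams()

    def set_temperature(self, temperature: float):
        self._params.temperature = temperature


svc = LLMService(InputParams(temperature=1.0, top_p=0.9))
svc.set_temperature(0.7)
```

The same shape applies to the OpenAI, Anthropic, and Together AI services named in the entry, each exposing its own supported subset of parameters.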
@@ -63,6 +69,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- We now distinguish between input and output audio and image frames. We
introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame`
and `OutputImageRawFrame` (and other subclasses of those). The input frames
usually come from an input transport and are meant to be processed inside the
pipeline to generate new frames. However, the input frames will not be sent
through an output transport. The output frames can also be processed by any
frame processor in the pipeline and they are allowed to be sent by the output
transport.
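The input/output frame split described above is the same conversion this commit's updated `09-mirror.py` performs in its `MirrorProcessor`: consume an input frame, emit a new output frame. A minimal sketch, using stand-in classes rather than pipecat's real ones in `pipecat.frames.frames`:

```python
from dataclasses import dataclass


# Illustrative stand-ins for pipecat's frame classes.
@dataclass
class InputAudioRawFrame:
    audio: bytes
    sample_rate: int
    num_channels: int


@dataclass
class OutputAudioRawFrame:
    audio: bytes
    sample_rate: int
    num_channels: int


def mirror(frame: InputAudioRawFrame) -> OutputAudioRawFrame:
    # Input frames are never sent by an output transport directly;
    # a processor must construct a new output frame from them.
    return OutputAudioRawFrame(frame.audio, frame.sample_rate, frame.num_channels)


out = mirror(InputAudioRawFrame(b"\x00\x01", 16000, 1))
```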

- `ParallelTask` has been renamed to `SyncParallelPipeline`. A
`SyncParallelPipeline` is a frame processor that contains a list of different
pipelines to be executed concurrently. The difference between a
@@ -334,7 +349,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- It is now possible to specify a Silero VAD version when using `SileroVADAnalyzer`
or `SileroVAD`.

- Added `AysncFrameProcessor` and `AsyncAIService`. Some services like
- Added `AysncFrameProcessor` and `AsyncAIService`. Some services like
`DeepgramSTTService` need to process things asynchronously. For example, audio
is sent to Deepgram but transcriptions are not returned immediately. In these
cases we still require all frames (except system frames) to be pushed
@@ -351,7 +366,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- `WhisperSTTService` model can now also be a string.

- Added missing * keyword separators in services.
- Added missing \* keyword separators in services.

### Fixed

@@ -428,7 +443,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added new `TwilioFrameSerializer`. This is a new serializer that knows how to
serialize and deserialize audio frames from Twilio.

- Added Daily transport event: `on_dialout_answered`. See
- Added Daily transport event: `on_dialout_answered`. See
https://reference-python.daily.co/api_reference.html#daily.EventHandler

- Added new `AzureSTTService`. This allows you to use Azure Speech-To-Text.
@@ -668,7 +683,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added Daily transport support for dial-in use cases.

- Added Daily transport events: `on_dialout_connected`, `on_dialout_stopped`,
`on_dialout_error` and `on_dialout_warning`. See
`on_dialout_error` and `on_dialout_warning`. See
https://reference-python.daily.co/api_reference.html#daily.EventHandler

## [0.0.21] - 2024-05-22
2 changes: 1 addition & 1 deletion README.md
@@ -165,7 +165,7 @@ pip install "path_to_this_repo[option,...]"
From the root directory, run:

```shell
pytest --doctest-modules --ignore-glob="*to_be_updated*" src tests
pytest --doctest-modules --ignore-glob="*to_be_updated*" --ignore-glob=*pipeline_source* src tests
```

## Setting up your editor
2 changes: 1 addition & 1 deletion examples/dialin-chatbot/requirements.txt
@@ -1,4 +1,4 @@
pipecat-ai[daily,openai,silero]
pipecat-ai[daily,elevenlabs,openai,silero]
fastapi
uvicorn
python-dotenv
4 changes: 4 additions & 0 deletions examples/foundational/04-utterance-and-speech.py
@@ -4,6 +4,10 @@
# SPDX-License-Identifier: BSD 2-Clause License
#

#
# This example is broken on latest pipecat and needs updating.
#

import aiohttp
import asyncio
import os
12 changes: 9 additions & 3 deletions examples/foundational/05a-local-sync-speech-and-image.py
@@ -11,7 +11,13 @@

import tkinter as tk

from pipecat.frames.frames import AudioRawFrame, Frame, URLImageRawFrame, LLMMessagesFrame, TextFrame
from pipecat.frames.frames import (
Frame,
OutputAudioRawFrame,
TTSAudioRawFrame,
URLImageRawFrame,
LLMMessagesFrame,
TextFrame)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
@@ -65,9 +71,9 @@ def __init__(self):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, AudioRawFrame):
if isinstance(frame, TTSAudioRawFrame):
self.audio.extend(frame.audio)
self.frame = AudioRawFrame(
self.frame = OutputAudioRawFrame(
bytes(self.audio), frame.sample_rate, frame.num_channels)

class ImageGrabber(FrameProcessor):
16 changes: 14 additions & 2 deletions examples/foundational/06-listen-and-respond.py
@@ -10,6 +10,7 @@
import sys

from pipecat.frames.frames import Frame, LLMMessagesFrame, MetricsFrame
from pipecat.metrics.metrics import TTFBMetricsData, ProcessingMetricsData, LLMUsageMetricsData, TTSUsageMetricsData
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -37,8 +38,19 @@
class MetricsLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
if isinstance(frame, MetricsFrame):
print(
f"!!! MetricsFrame: {frame}, ttfb: {frame.ttfb}, processing: {frame.processing}, tokens: {frame.tokens}, characters: {frame.characters}")
for d in frame.data:
if isinstance(d, TTFBMetricsData):
print(f"!!! MetricsFrame: {frame}, ttfb: {d.value}")
elif isinstance(d, ProcessingMetricsData):
print(f"!!! MetricsFrame: {frame}, processing: {d.value}")
elif isinstance(d, LLMUsageMetricsData):
tokens = d.value
print(
f"!!! MetricsFrame: {frame}, tokens: {
tokens.prompt_tokens}, characters: {
tokens.completion_tokens}")
elif isinstance(d, TTSUsageMetricsData):
print(f"!!! MetricsFrame: {frame}, characters: {d.value}")
await self.push_frame(frame, direction)


13 changes: 10 additions & 3 deletions examples/foundational/06a-image-sync.py
@@ -11,7 +11,7 @@

from PIL import Image

from pipecat.frames.frames import ImageRawFrame, Frame, SystemFrame, TextFrame
from pipecat.frames.frames import Frame, OutputImageRawFrame, SystemFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
@@ -52,9 +52,16 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if not isinstance(frame, SystemFrame) and direction == FrameDirection.DOWNSTREAM:
await self.push_frame(ImageRawFrame(image=self._speaking_image_bytes, size=(1024, 1024), format=self._speaking_image_format))
await self.push_frame(OutputImageRawFrame(
image=self._speaking_image_bytes,
size=(1024, 1024),
format=self._speaking_image_format)
)
await self.push_frame(frame)
await self.push_frame(ImageRawFrame(image=self._waiting_image_bytes, size=(1024, 1024), format=self._waiting_image_format))
await self.push_frame(OutputImageRawFrame(
image=self._waiting_image_bytes,
size=(1024, 1024),
format=self._waiting_image_format))
else:
await self.push_frame(frame)

100 changes: 100 additions & 0 deletions examples/foundational/07l-interruptible-together.py
@@ -0,0 +1,100 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import aiohttp
import os
import sys

from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.together import TogetherLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer

from runner import configure

from loguru import logger

from dotenv import load_dotenv
load_dotenv(override=True)

logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)

transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
)

tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)

llm = TogetherLLMService(
api_key=os.getenv("TOGETHER_API_KEY"),
model=os.getenv("TOGETHER_MODEL"),
params=TogetherLLMService.InputParams(
temperature=1.0,
frequency_penalty=2.0,
presence_penalty=0.0,
top_p=0.9,
top_k=40
)
)

messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]

tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)

pipeline = Pipeline([
transport.input(), # Transport user input
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])

task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))

@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
await task.queue_frames([LLMMessagesFrame(messages)])

runner = PipelineRunner()

await runner.run(task)


if __name__ == "__main__":
asyncio.run(main())
12 changes: 6 additions & 6 deletions examples/foundational/08-bots-arguing.py
@@ -3,14 +3,14 @@
import asyncio
import logging
import os
from pipecat.pipeline.aggregators import SentenceAggregator
from pipecat.processors.aggregators import SentenceAggregator
from pipecat.pipeline.pipeline import Pipeline

from pipecat.transports.daily_transport import DailyTransport
from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
from pipecat.services.fal_ai_services import FalImageGenService
from pipecat.pipeline.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame
from pipecat.transports.services.daily import DailyTransport
from pipecat.services.azure import AzureLLMService, AzureTTSService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.fal import FalImageGenService
from pipecat.frames.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame

from runner import configure

25 changes: 24 additions & 1 deletion examples/foundational/09-mirror.py
@@ -8,9 +8,11 @@
import asyncio
import sys

from pipecat.frames.frames import Frame, InputAudioRawFrame, InputImageRawFrame, OutputAudioRawFrame, OutputImageRawFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.services.daily import DailyTransport, DailyParams

from runner import configure
@@ -24,6 +26,27 @@
logger.add(sys.stderr, level="DEBUG")


class MirrorProcessor(FrameProcessor):

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, InputAudioRawFrame):
await self.push_frame(OutputAudioRawFrame(
audio=frame.audio,
sample_rate=frame.sample_rate,
num_channels=frame.num_channels)
)
elif isinstance(frame, InputImageRawFrame):
await self.push_frame(OutputImageRawFrame(
image=frame.image,
size=frame.size,
format=frame.format)
)
else:
await self.push_frame(frame, direction)


async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
@@ -44,7 +67,7 @@ async def main():
async def on_first_participant_joined(transport, participant):
transport.capture_participant_video(participant["id"])

pipeline = Pipeline([transport.input(), transport.output()])
pipeline = Pipeline([transport.input(), MirrorProcessor(), transport.output()])

runner = PipelineRunner()

