Merge branch 'pipecat-ai:main' into main
joachimchauvet authored Oct 3, 2024
2 parents b98204e + 65eeb0f commit 2e8104b
Showing 34 changed files with 2,477 additions and 5,847 deletions.
38 changes: 30 additions & 8 deletions CHANGELOG.md
@@ -1,20 +1,29 @@
# Changelog

All notable changes to **pipecat** will be documented in this file.
All notable changes to **Pipecat** will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [0.0.42] - 2024-10-02

### Added

- Added Google TTS service and corresponding foundational example `07n-interruptible-google.py`
- `SentryMetrics` has been added to report frame processor metrics to
Sentry. This is now possible because `FrameProcessorMetrics` can now be passed
to `FrameProcessor`.

- Added Google TTS service and corresponding foundational example
`07n-interruptible-google.py`

- Added AWS Polly TTS support and `07m-interruptible-aws.py` as an example.

- Added InputParams to Azure TTS service.

- Added `LivekitTransport` (audio-only for now).

- RTVI 0.2.0 is now supported.

- All `FrameProcessors` can now register event handlers.

```
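# Illustrative sketch (the original snippet is collapsed in this view):
# any FrameProcessor subclass can register handlers; SomeTTSService is a
# placeholder name.
tts = SomeTTSService(...)

@tts.event_handler("on_connected")
async def on_connected(processor):
    ...
```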
@@ -86,8 +95,12 @@ async def on_connected(processor):

### Changed

- Updated individual update settings frame classes into a single UpdateSettingsFrame
class for STT, LLM, and TTS.
- Context frames are now pushed downstream from assistant context aggregators.

- Removed Silero VAD torch dependency.

- Updated individual update settings frame classes into a single
`ServiceUpdateSettingsFrame` class.

- We now distinguish between input and output audio and image frames. We
introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame`
and `OutputImageRawFrame`.
@@ -107,9 +120,9 @@ async def on_connected(processor):
pipelines is synchronous (e.g. an HTTP-based service that waits for the
response).

- `StartFrame` is back a system frame so we make sure it's processed immediately
by all processors. `EndFrame` stays a control frame since it needs to be
ordered allowing the frames in the pipeline to be processed.
- `StartFrame` is back to being a system frame so that it's processed
immediately by all processors. `EndFrame` stays a control frame since it needs
to be ordered, allowing the frames in the pipeline to be processed.

- Updated `MoondreamService` revision to `2024-08-26`.

@@ -133,6 +146,11 @@ async def on_connected(processor):

### Fixed

- Fixed OpenAI multiple function calls.

- Fixed a Cartesia TTS issue that would cause audio to be truncated in some
cases.

- Fixed a `BaseOutputTransport` issue that would stop audio and video rendering
tasks (after receiving an `EndFrame`) before the internal queue was emptied,
causing the pipeline to finish prematurely.
@@ -146,6 +164,10 @@ async def on_connected(processor):
- `obj_id()` and `obj_count()` now use `itertools.count`, avoiding the need for
a `threading.Lock` (see the sketch below).
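
A minimal sketch of the lock-free counter pattern this refers to (illustrative only; the real pipecat helpers may differ):

```
import itertools

# next() on an itertools.count is a single C-level call, so under CPython's
# GIL it is effectively atomic and needs no threading.Lock.
_id_counter = itertools.count(1)


def obj_id() -> int:
    """Return a process-wide unique, monotonically increasing id."""
    return next(_id_counter)
```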

### Other

- Pipecat now uses Ruff as its formatter (https://github.com/astral-sh/ruff).

## [0.0.41] - 2024-08-22

### Added
14 changes: 6 additions & 8 deletions examples/foundational/07e-interruptible-playht.py
@@ -4,11 +4,15 @@
# SPDX-License-Identifier: BSD 2-Clause License
#

import aiohttp
import asyncio
import os
import sys

import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure

from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -17,17 +21,11 @@
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.services.playht import PlayHTTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.playht import PlayHTTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer

from runner import configure

from loguru import logger

from dotenv import load_dotenv

load_dotenv(override=True)

logger.remove(0)
15 changes: 6 additions & 9 deletions examples/foundational/07g-interruptible-openai-tts.py
@@ -4,11 +4,15 @@
# SPDX-License-Identifier: BSD 2-Clause License
#

import aiohttp
import asyncio
import os
import sys

import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure

from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -17,17 +21,10 @@
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.services.openai import OpenAITTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.openai import OpenAILLMService, OpenAITTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer

from runner import configure

from loguru import logger

from dotenv import load_dotenv

load_dotenv(override=True)

logger.remove(0)
9 changes: 7 additions & 2 deletions examples/foundational/07l-interruptible-together.py
@@ -67,7 +67,7 @@ async def main():
        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond in plain language. Respond to what the user said in a creative and helpful way.",
            },
        ]

@@ -87,7 +87,12 @@ async def main():
            ]
        )

        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
        task = PipelineTask(
            pipeline,
            PipelineParams(
                allow_interruptions=True, enable_metrics=True, enable_usage_metrics=True
            ),
        )

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
1 change: 0 additions & 1 deletion examples/foundational/07n-interruptible-google.py
@@ -53,7 +53,6 @@ async def main():
        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts = GoogleTTSService(
            credentials=os.getenv("GOOGLE_CREDENTIALS"),
            voice_id="en-US-Neural2-J",
            params=GoogleTTSService.InputParams(language="en-US", rate="1.05"),
        )
7 changes: 0 additions & 7 deletions examples/foundational/14-function-calling.py
@@ -9,11 +9,9 @@
import os
import sys

from pipecat.frames.frames import TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.logger import FrameLogger
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMContext, OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
@@ -72,9 +70,6 @@ async def main():
        # sent to the same callback with an additional function_name parameter.
        llm.register_function(None, fetch_weather_from_api, start_callback=start_fetch_weather)

        fl_in = FrameLogger("Inner")
        fl_out = FrameLogger("Outer")

        tools = [
            ChatCompletionToolParam(
                type="function",
@@ -111,11 +106,9 @@

        pipeline = Pipeline(
            [
                # fl_in,
                transport.input(),
                context_aggregator.user(),
                llm,
                # fl_out,
                tts,
                transport.output(),
                context_aggregator.assistant(),
136 changes: 136 additions & 0 deletions examples/foundational/14c-function-calling-together.py
@@ -0,0 +1,136 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import aiohttp
import os
import sys

from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMContext
from pipecat.services.together import TogetherLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer

from openai.types.chat import ChatCompletionToolParam

from runner import configure

from loguru import logger

from dotenv import load_dotenv

load_dotenv(override=True)

logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


async def start_fetch_weather(function_name, llm, context):
    # Note: we can't push a frame to the LLM here; the bot could interrupt
    # itself and/or cause overlapping audio glitches. Open question (for Aleix
    # and Chad): what's the right way to trigger speech now, with the new
    # queues/async/sync refactors?
    # await llm.push_frame(TextFrame("Let me check on that."))
    logger.debug(f"Starting fetch_weather_from_api with function_name: {function_name}")


async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
    await result_callback({"conditions": "nice", "temperature": "75"})


async def main():
    async with aiohttp.ClientSession() as session:
        (room_url, token) = await configure(session)

        transport = DailyTransport(
            room_url,
            token,
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )

        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
        )

        llm = TogetherLLMService(
            api_key=os.getenv("TOGETHER_API_KEY"),
            model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        )
        # Register a function_name of None to get all functions
        # sent to the same callback with an additional function_name parameter.
        llm.register_function(None, fetch_weather_from_api, start_callback=start_fetch_weather)

        tools = [
            ChatCompletionToolParam(
                type="function",
                function={
                    "name": "get_current_weather",
                    "description": "Get the current weather",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g. San Francisco, CA",
                            },
                            "format": {
                                "type": "string",
                                "enum": ["celsius", "fahrenheit"],
                                "description": "The temperature unit to use. Infer this from the user's location.",
                            },
                        },
                        "required": ["location", "format"],
                    },
                },
            )
        ]
        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
            },
        ]

        context = OpenAILLMContext(messages, tools)
        context_aggregator = llm.create_context_aggregator(context)

        pipeline = Pipeline(
            [
                transport.input(),
                context_aggregator.user(),
                llm,
                tts,
                transport.output(),
                context_aggregator.assistant(),
            ]
        )

        task = PipelineTask(pipeline)

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            transport.capture_participant_transcription(participant["id"])
            # Kick off the conversation.
            # await tts.say("Hi! Ask me about the weather in San Francisco.")

        runner = PipelineRunner()

        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())