Merge pull request #189 from pipecat-ai/khk-cartesia-etc

Cartesia TTS
pipecat-ai · May 30, 2024 · a157980 · a157980
2 parents a5d42a5 + d5f106a
commit a157980
Show file tree

Hide file tree

Showing 7 changed files with 212 additions and 22 deletions.
diff --git a/examples/foundational/07d-interruptible-cartesia.py b/examples/foundational/07d-interruptible-cartesia.py
@@ -0,0 +1,93 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import aiohttp
+import os
+import sys
+
+from pipecat.frames.frames import LLMMessagesFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantResponseAggregator, LLMUserResponseAggregator)
+from pipecat.services.cartesia import CartesiaTTSService
+from pipecat.services.openai import OpenAILLMService
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+from pipecat.vad.silero import SileroVADAnalyzer
+
+from runner import configure
+
+from loguru import logger
+
+from dotenv import load_dotenv
+# Read settings from a .env file before anything calls os.getenv();
+# override=True presumably lets .env values win over variables already set
+# in the process environment (see python-dotenv docs).
+load_dotenv(override=True)
+
+# Drop handler id 0 (expected to be loguru's pre-installed default sink) and
+# log to stderr at DEBUG level instead.
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+
+async def main(room_url: str, token):
+    """Run an interruptible voice bot in a Daily room using Cartesia TTS.
+
+    Builds the pipeline transport input -> user aggregator -> OpenAI LLM ->
+    Cartesia TTS -> transport output -> assistant aggregator, asks the LLM to
+    introduce itself when the first participant joins, and then runs the
+    pipeline task until the runner returns.
+
+    Args:
+        room_url: URL of the Daily room to join.
+        token: Token handed to ``DailyTransport`` together with the room URL.
+    """
+    async with aiohttp.ClientSession() as session:
+        daily_params = DailyParams(
+            audio_out_enabled=True,
+            transcription_enabled=True,
+            vad_enabled=True,
+            vad_analyzer=SileroVADAnalyzer()
+        )
+        transport = DailyTransport(room_url, token, "Respond bot", daily_params)
+
+        tts = CartesiaTTSService(
+            api_key=os.getenv("CARTESIA_API_KEY"),
+            voice_name="Barbershop Man"
+        )
+
+        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
+
+        # Shared conversation history: both aggregators and the join handler
+        # below work on this same list object.
+        system_prompt = {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+        }
+        messages = [system_prompt]
+
+        user_aggregator = LLMUserResponseAggregator(messages)
+        assistant_aggregator = LLMAssistantResponseAggregator(messages)
+
+        stages = [
+            transport.input(),     # Transport user input
+            user_aggregator,       # User responses
+            llm,                   # LLM
+            tts,                   # TTS
+            transport.output(),    # Transport bot output
+            assistant_aggregator,  # Assistant spoken responses
+        ]
+        task = PipelineTask(
+            Pipeline(stages),
+            PipelineParams(allow_interruptions=True))
+
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            transport.capture_participant_transcription(participant["id"])
+            # Kick off the conversation.
+            introduction = {"role": "system", "content": "Please introduce yourself to the user."}
+            messages.append(introduction)
+            await task.queue_frames([LLMMessagesFrame(messages)])
+
+        await PipelineRunner().run(task)
+
+
+if __name__ == "__main__":
+    # configure() supplies the Daily room URL and token for this run.
+    room_url, room_token = configure()
+    asyncio.run(main(room_url, room_token))
diff --git a/macos-py3.10-requirements.txt b/macos-py3.10-requirements.txt
@@ -5,7 +5,9 @@
 #    pip-compile --all-extras pyproject.toml
 #
 aiohttp==3.9.5
-    # via pipecat-ai (pyproject.toml)
+    # via
+    #   cartesia
+    #   pipecat-ai (pyproject.toml)
 aiosignal==1.3.1
     # via aiohttp
 annotated-types==0.7.0
@@ -21,19 +23,23 @@ async-timeout==4.0.3
     # via aiohttp
 attrs==23.2.0
     # via aiohttp
-av==12.0.0
+av==12.1.0
     # via faster-whisper
 azure-cognitiveservices-speech==1.37.0
     # via pipecat-ai (pyproject.toml)
 blinker==1.8.2
     # via flask
 cachetools==5.3.3
     # via google-auth
+cartesia==0.1.0
+    # via pipecat-ai (pyproject.toml)
 certifi==2024.2.2
     # via
     #   httpcore
     #   httpx
     #   requests
+cffi==1.16.0
+    # via sounddevice
 charset-normalizer==3.3.2
     # via requests
 click==8.1.7
@@ -42,7 +48,7 @@ coloredlogs==15.0.1
     # via onnxruntime
 ctranslate2==4.2.1
     # via faster-whisper
-daily-python==0.9.0
+daily-python==0.9.1
     # via pipecat-ai (pyproject.toml)
 distro==1.9.0
     # via
@@ -51,7 +57,9 @@ distro==1.9.0
 einops==0.8.0
     # via pipecat-ai (pyproject.toml)
 exceptiongroup==1.2.1
-    # via anyio
+    # via
+    #   anyio
+    #   pytest
 fal-client==0.4.0
     # via pipecat-ai (pyproject.toml)
 faster-whisper==1.0.2
@@ -122,6 +130,7 @@ httplib2==0.22.0
 httpx==0.27.0
     # via
     #   anthropic
+    #   cartesia
     #   fal-client
     #   openai
 httpx-sse==0.4.0
@@ -140,6 +149,8 @@ idna==3.7
     #   httpx
     #   requests
     #   yarl
+iniconfig==2.0.0
+    # via pytest
 itsdangerous==2.2.0
     # via flask
 jinja2==3.1.4
@@ -177,11 +188,14 @@ packaging==24.0
     # via
     #   huggingface-hub
     #   onnxruntime
+    #   pytest
     #   transformers
 pillow==10.3.0
     # via
     #   pipecat-ai (pyproject.toml)
     #   torchvision
+pluggy==1.5.0
+    # via pytest
 proto-plus==1.23.0
     # via
     #   google-ai-generativelanguage
@@ -204,6 +218,8 @@ pyasn1-modules==0.4.0
     # via google-auth
 pyaudio==0.2.14
     # via pipecat-ai (pyproject.toml)
+pycparser==2.22
+    # via cffi
 pydantic==2.7.2
     # via
     #   anthropic
@@ -217,6 +233,10 @@ pyloudnorm==0.1.1
     # via pipecat-ai (pyproject.toml)
 pyparsing==3.1.2
     # via httplib2
+pytest==8.2.1
+    # via pytest-asyncio
+pytest-asyncio==0.23.7
+    # via cartesia
 python-dotenv==1.0.1
     # via pipecat-ai (pyproject.toml)
 pyyaml==6.0.1
@@ -227,8 +247,9 @@ pyyaml==6.0.1
     #   transformers
 regex==2024.5.15
     # via transformers
-requests==2.32.2
+requests==2.32.3
     # via
+    #   cartesia
     #   google-api-core
     #   huggingface-hub
     #   pyht
@@ -247,7 +268,9 @@ sniffio==1.3.1
     #   anyio
     #   httpx
     #   openai
-sympy==1.12
+sounddevice==0.4.7
+    # via pipecat-ai (pyproject.toml)
+sympy==1.12.1
     # via
     #   onnxruntime
     #   torch
@@ -258,6 +281,8 @@ tokenizers==0.19.1
     #   anthropic
     #   faster-whisper
     #   transformers
+tomli==2.0.1
+    # via pytest
 torch==2.3.0
     # via
     #   pipecat-ai (pyproject.toml)
@@ -292,7 +317,9 @@ uritemplate==4.1.1
 urllib3==2.2.1
     # via requests
 websockets==12.0
-    # via pipecat-ai (pyproject.toml)
+    # via
+    #   cartesia
+    #   pipecat-ai (pyproject.toml)
 werkzeug==3.0.3
     # via flask
 yarl==1.9.4

diff --git a/pyproject.toml b/pyproject.toml
@@ -35,6 +35,7 @@ Website = "https://pipecat.ai"
 [project.optional-dependencies]
 anthropic = [ "anthropic~=0.25.7" ]
 azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
+cartesia = [ "numpy~=1.26.0", "sounddevice", "cartesia" ]
 daily = [ "daily-python~=0.9.0" ]
 examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
 fal = [ "fal-client~=0.4.0" ]

diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
@@ -0,0 +1,56 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+from cartesia.tts import AsyncCartesiaTTS
+
+import time
+from typing import AsyncGenerator
+
+from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
+from pipecat.services.ai_services import TTSService
+
+from loguru import logger
+
+
+class CartesiaTTSService(TTSService):
+    """Text-to-speech service that streams audio from Cartesia.
+
+    The Cartesia client and the voice embedding are created lazily on the
+    first call to ``run_tts`` and reused by later calls.
+
+    Args:
+        api_key: Cartesia API key passed to ``AsyncCartesiaTTS``.
+        voice_name: Key used to look the voice up in the mapping returned by
+            the client's ``get_voices()``.
+        **kwargs: Forwarded unchanged to ``TTSService``.
+    """
+
+    def __init__(
+            self,
+            *,
+            api_key: str,
+            voice_name: str,
+            **kwargs):
+        super().__init__(**kwargs)
+
+        self._api_key = api_key
+        self._voice_name = voice_name
+
+        # All three are filled in together by _connect() on first use.
+        self._client = None
+        self._voice_id = None
+        self._voice = None
+
+    def _connect(self):
+        """Create the client and resolve the configured voice.
+
+        Results are stored on ``self`` only after every step has succeeded, so
+        a failed lookup (for example an unknown voice name) is retried on the
+        next call instead of leaving a client without a voice behind.
+        """
+        client = AsyncCartesiaTTS(api_key=self._api_key)
+        # NOTE(review): both lookups are called without ``await``; confirm they
+        # are synchronous and cheap enough to run on the event loop.
+        voices = client.get_voices()
+        voice_id = voices[self._voice_name]["id"]
+        voice = client.get_voice_embedding(voice_id=voice_id)
+
+        self._voice_id = voice_id
+        self._voice = voice
+        self._client = client
+
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Synthesize ``text`` and yield the streamed audio.
+
+        Args:
+            text: The text to convert to speech.
+
+        Yields:
+            One ``AudioRawFrame`` (16000 Hz, 1 channel) per chunk produced by
+            the Cartesia stream.
+
+        Any exception raised while connecting or streaming is logged with its
+        traceback and ends the generator; it is not re-raised.
+        """
+        logger.debug(f"Generating TTS: [{text}]")
+
+        try:
+            if self._client is None:
+                self._connect()
+
+            # NOTE(review): output_format 'pcm_16000' presumably has to match the
+            # 16000 Hz sample rate of the frames built below; also confirm that
+            # data_rtype='array' yields a payload AudioRawFrame accepts.
+            chunk_generator = await self._client.generate(
+                transcript=text, voice=self._voice, stream=True,
+                model_id="upbeat-moon", data_rtype='array', output_format='pcm_16000',
+                # a chunk_time of 0.1 seems to be the default. there are small audio pops/gaps
+                # which we still need to debug
+                chunk_time=0.1
+            )
+
+            async for chunk in chunk_generator:
+                yield AudioRawFrame(audio=chunk['audio'], sample_rate=16000, num_channels=1)
+        except Exception as e:
+            # Best-effort: a failed utterance is reported but must not tear
+            # down the caller.
+            logger.exception(f"Cartesia TTS exception: {e}")
diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py
@@ -32,17 +32,21 @@ def __init__(
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Request speech for ``text`` from Deepgram and yield it as audio.
+
+        Yields one ``AudioRawFrame`` (16000 Hz, 1 channel) per chunk read from
+        the response body, or a single ``ErrorFrame`` when the response status
+        is not 200. Exceptions raised while talking to the API are logged and
+        end the generator; they are not re-raised.
+        """
         logger.info(f"Running Deepgram TTS for {text}")
         base_url = "https://api.deepgram.com/v1/speak"
+        # Keep the query string on one line and free of spaces: splitting the
+        # replacement field across lines is a SyntaxError before Python 3.12,
+        # and padding around "=" / "&" would end up in the request URL.
         request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000"
         headers = {"authorization": f"token {self._api_key}"}
         body = {"text": text}
 
-        async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
-            if r.status != 200:
-                text = await r.text()
-                logger.error(f"Error getting audio (status: {r.status}, error: {text})")
-                yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {text})")
-                return
-
-            async for data in r.content:
-                frame = AudioRawFrame(audio=data, sample_rate=16000, num_channels=1)
-                yield frame
+        try:
+            async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
+                if r.status != 200:
+                    text = await r.text()
+                    logger.error(f"Error getting audio (status: {r.status}, error: {text})")
+                    yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {text})")
+                    return
+
+                async for data in r.content:
+                    frame = AudioRawFrame(audio=data, sample_rate=16000, num_channels=1)
+                    yield frame
+        except Exception as e:
+            logger.error(f"Exception {e}")
diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py
@@ -8,7 +8,7 @@
 
 from typing import AsyncGenerator
 
-from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame, TTSStartedFrame, TTSStoppedFrame, TextFrame
+from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
 from pipecat.services.ai_services import TTSService
 
 from loguru import logger

diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py
@@ -86,9 +86,18 @@ async def _process_context(self, context: OpenAILLMContext):
             logger.debug(f"Google LLM TTFB: {time.time() - start_time}")
 
             async for chunk in self._async_generator_wrapper(response):
-                await self.push_frame(LLMResponseStartFrame())
-                await self.push_frame(TextFrame(chunk.text))
-                await self.push_frame(LLMResponseEndFrame())
+                try:
+                    text = chunk.text
+                    await self.push_frame(LLMResponseStartFrame())
+                    await self.push_frame(TextFrame(text))
+                    await self.push_frame(LLMResponseEndFrame())
+                except Exception as e:
+                    # Google LLMs seem to flag safety issues a lot!
+                    if chunk.candidates[0].finish_reason == 3:
+                        logger.debug(
+                            f"LLM refused to generate content for safety reasons - {messages}.")
+                    else:
+                        logger.error(f"Error {e}")
 
         except Exception as e:
             logger.error(f"Exception: {e}")