
Commit

Merge pull request #239 from pipecat-ai/aleix/azure-stt
azure stt support
aconchillo authored Jun 14, 2024
2 parents 6cdccaf + ff51fc9 commit 55a9de7
Showing 8 changed files with 370 additions and 289 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,17 @@ All notable changes to **pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added

- Added new `AzureSTTService`, which lets you use Azure Speech-To-Text (see the usage sketch below).

### Other

- Updated `07f-interruptible-azure.py` to use `AzureLLMService`,
`AzureSTTService` and `AzureTTSService`.

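As an illustration of the new service (an editorial sketch, not part of the changelog or this commit), here is a minimal way `AzureSTTService` might be constructed. The constructor arguments and environment variable names are assumptions modeled on the other Azure services in pipecat, so check `pipecat.services.azure` for the actual signature:

```python
import os

from pipecat.services.azure import AzureSTTService

# Assumed arguments: an Azure Speech key and region, mirroring how the other
# example services read credentials from environment variables.
stt = AzureSTTService(
    api_key=os.getenv("AZURE_SPEECH_API_KEY"),
    region=os.getenv("AZURE_SPEECH_REGION"),
)

# In a pipeline, the STT stage would sit between the transport input and the
# user response aggregator, e.g.:
#   Pipeline([transport.input(), stt, tma_in, llm, tts, transport.output(), tma_out])
```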
## [0.0.31] - 2024-06-13

### Performance
2 changes: 1 addition & 1 deletion README.md
@@ -39,7 +39,7 @@ pip install "pipecat-ai[option,...]"

Your project may or may not need these, so they're made available as optional requirements. Here is a list:

- - **AI services**: `anthropic`, `azure`, `deepgram`, `google`, `fal`, `moondream`, `openai`, `playht`, `silero`, `whisper`
+ - **AI services**: `anthropic`, `azure`, `deepgram`, `google`, `fal`, `moondream`, `openai`, `openpipe`, `playht`, `silero`, `whisper`
- **Transports**: `local`, `websocket`, `daily`
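For example, a bot that uses the new Azure STT service over a Daily transport with Silero VAD might install `pip install "pipecat-ai[azure,daily,silero]"`; the exact set of extras depends on which services and transports you actually use.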

## Code examples
110 changes: 54 additions & 56 deletions examples/foundational/07d-interruptible-cartesia.py
@@ -5,7 +5,6 @@
#

import asyncio
-import aiohttp
import os
import sys

@@ -33,62 +32,61 @@


async def main(room_url: str, token):
-    async with aiohttp.ClientSession() as session:
-        transport = DailyTransport(
-            room_url,
-            token,
-            "Respond bot",
-            DailyParams(
-                audio_out_enabled=True,
-                audio_out_sample_rate=44100,
-                transcription_enabled=True,
-                vad_enabled=True,
-                vad_analyzer=SileroVADAnalyzer()
-            )
-        )
-
-        tts = CartesiaTTSService(
-            api_key=os.getenv("CARTESIA_API_KEY"),
-            voice_name="British Lady",
-            output_format="pcm_44100"
-        )
-
-        llm = OpenAILLMService(
-            api_key=os.getenv("OPENAI_API_KEY"),
-            model="gpt-4o")
-
-        messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
-            },
-        ]
-
-        tma_in = LLMUserResponseAggregator(messages)
-        tma_out = LLMAssistantResponseAggregator(messages)
-
-        pipeline = Pipeline([
-            transport.input(),   # Transport user input
-            tma_in,              # User responses
-            llm,                 # LLM
-            tts,                 # TTS
-            transport.output(),  # Transport bot output
-            tma_out              # Assistant spoken responses
-        ])
-
-        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
-
-        @transport.event_handler("on_first_participant_joined")
-        async def on_first_participant_joined(transport, participant):
-            transport.capture_participant_transcription(participant["id"])
-            # Kick off the conversation.
-            messages.append(
-                {"role": "system", "content": "Please introduce yourself to the user."})
-            await task.queue_frames([LLMMessagesFrame(messages)])
-
-        runner = PipelineRunner()
-
-        await runner.run(task)
+    transport = DailyTransport(
+        room_url,
+        token,
+        "Respond bot",
+        DailyParams(
+            audio_out_enabled=True,
+            audio_out_sample_rate=44100,
+            transcription_enabled=True,
+            vad_enabled=True,
+            vad_analyzer=SileroVADAnalyzer()
+        )
+    )
+
+    tts = CartesiaTTSService(
+        api_key=os.getenv("CARTESIA_API_KEY"),
+        voice_name="British Lady",
+        output_format="pcm_44100"
+    )
+
+    llm = OpenAILLMService(
+        api_key=os.getenv("OPENAI_API_KEY"),
+        model="gpt-4o")
+
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+        },
+    ]
+
+    tma_in = LLMUserResponseAggregator(messages)
+    tma_out = LLMAssistantResponseAggregator(messages)
+
+    pipeline = Pipeline([
+        transport.input(),   # Transport user input
+        tma_in,              # User responses
+        llm,                 # LLM
+        tts,                 # TTS
+        transport.output(),  # Transport bot output
+        tma_out              # Assistant spoken responses
+    ])
+
+    task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
+
+    @transport.event_handler("on_first_participant_joined")
+    async def on_first_participant_joined(transport, participant):
+        transport.capture_participant_transcription(participant["id"])
+        # Kick off the conversation.
+        messages.append(
+            {"role": "system", "content": "Please introduce yourself to the user."})
+        await task.queue_frames([LLMMessagesFrame(messages)])
+
+    runner = PipelineRunner()
+
+    await runner.run(task)


if __name__ == "__main__":
111 changes: 54 additions & 57 deletions examples/foundational/07e-interruptible-playht.py
@@ -5,7 +5,6 @@
#

import asyncio
-import aiohttp
import os
import sys

@@ -19,7 +18,6 @@
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
-from pipecat.processors.logger import FrameLogger

from runner import configure

@@ -33,62 +31,61 @@


async def main(room_url: str, token):
-    async with aiohttp.ClientSession() as session:
-        transport = DailyTransport(
-            room_url,
-            token,
-            "Respond bot",
-            DailyParams(
-                audio_out_enabled=True,
-                audio_out_sample_rate=16000,
-                transcription_enabled=True,
-                vad_enabled=True,
-                vad_analyzer=SileroVADAnalyzer()
-            )
-        )
-
-        tts = PlayHTTTSService(
-            user_id=os.getenv("PLAYHT_USER_ID"),
-            api_key=os.getenv("PLAYHT_API_KEY"),
-            voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
-        )
-
-        llm = OpenAILLMService(
-            api_key=os.getenv("OPENAI_API_KEY"),
-            model="gpt-4o")
-
-        messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
-            },
-        ]
-
-        tma_in = LLMUserResponseAggregator(messages)
-        tma_out = LLMAssistantResponseAggregator(messages)
-
-        pipeline = Pipeline([
-            transport.input(),   # Transport user input
-            tma_in,              # User responses
-            llm,                 # LLM
-            tts,                 # TTS
-            transport.output(),  # Transport bot output
-            tma_out              # Assistant spoken responses
-        ])
-
-        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
-
-        @transport.event_handler("on_first_participant_joined")
-        async def on_first_participant_joined(transport, participant):
-            transport.capture_participant_transcription(participant["id"])
-            # Kick off the conversation.
-            messages.append(
-                {"role": "system", "content": "Please introduce yourself to the user."})
-            await task.queue_frames([LLMMessagesFrame(messages)])
-
-        runner = PipelineRunner()
-
-        await runner.run(task)
+    transport = DailyTransport(
+        room_url,
+        token,
+        "Respond bot",
+        DailyParams(
+            audio_out_enabled=True,
+            audio_out_sample_rate=16000,
+            transcription_enabled=True,
+            vad_enabled=True,
+            vad_analyzer=SileroVADAnalyzer()
+        )
+    )
+
+    tts = PlayHTTTSService(
+        user_id=os.getenv("PLAYHT_USER_ID"),
+        api_key=os.getenv("PLAYHT_API_KEY"),
+        voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
+    )
+
+    llm = OpenAILLMService(
+        api_key=os.getenv("OPENAI_API_KEY"),
+        model="gpt-4o")
+
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+        },
+    ]
+
+    tma_in = LLMUserResponseAggregator(messages)
+    tma_out = LLMAssistantResponseAggregator(messages)
+
+    pipeline = Pipeline([
+        transport.input(),   # Transport user input
+        tma_in,              # User responses
+        llm,                 # LLM
+        tts,                 # TTS
+        transport.output(),  # Transport bot output
+        tma_out              # Assistant spoken responses
+    ])
+
+    task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
+
+    @transport.event_handler("on_first_participant_joined")
+    async def on_first_participant_joined(transport, participant):
+        transport.capture_participant_transcription(participant["id"])
+        # Kick off the conversation.
+        messages.append(
+            {"role": "system", "content": "Please introduce yourself to the user."})
+        await task.queue_frames([LLMMessagesFrame(messages)])
+
+    runner = PipelineRunner()
+
+    await runner.run(task)


if __name__ == "__main__":
95 changes: 0 additions & 95 deletions examples/foundational/07f-interruptible-azure-tts.py

This file was deleted.
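The replacement example, `07f-interruptible-azure.py`, is not shown in this view. As a rough illustration only — the constructor parameters and environment variable names below are assumptions, not taken from this commit — it might construct the three Azure services named in the CHANGELOG like this:

```python
import os

from pipecat.services.azure import AzureLLMService, AzureSTTService, AzureTTSService

# Assumed environment variables for an Azure OpenAI deployment and an Azure
# Speech resource; the real example may use different names.
llm = AzureLLMService(
    api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
    endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
    model=os.getenv("AZURE_CHATGPT_MODEL"),
)

stt = AzureSTTService(
    api_key=os.getenv("AZURE_SPEECH_API_KEY"),
    region=os.getenv("AZURE_SPEECH_REGION"),
)

tts = AzureTTSService(
    api_key=os.getenv("AZURE_SPEECH_API_KEY"),
    region=os.getenv("AZURE_SPEECH_REGION"),
)

# These would slot into the same pipeline shape used by the 07d/07e examples
# above, with the STT stage handling transcription instead of the transport:
#   Pipeline([transport.input(), stt, tma_in, llm, tts, transport.output(), tma_out])
```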

The remaining changed files in this commit are not shown in this view.
