Skip to content

Commit

Permalink
Merge pull request #921 from pipecat-ai/mb/playht-http
Browse files Browse the repository at this point in the history
PlayHTHttpTTSService fixes
  • Loading branch information
markbackman authored Jan 10, 2025
2 parents 5cd9dab + 86516d2 commit a8ae798
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 4 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added a new foundational example `07e-interruptible-playht-http.py` for easy
testing of `PlayHTHttpTTSService`.

- Added support for Google TTS Journey voices in `GoogleTTSService`.

- Added `29-livekit-audio-chat.py`, as a new foundational example for
Expand All @@ -27,12 +30,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- Changed the default model for `PlayHTHttpTTSService` to `Play3.0-mini-http`.

- `api_key`, `aws_access_key_id` and `region` are no longer required parameters for the `PollyTTSService` (`AWSTTSService`).

- Added `session_timeout` example in `examples/websocket-server/bot.py` to handle the session timeout event.

- Changed `InputParams` in `src/pipecat/services/gemini_multimodal_live/gemini.py` to support different modalities.

### Fixed

- Fixed an import issue for `PlayHTHttpTTSService`.

- Fixed an issue where languages couldn't be used with the `PlayHTHttpTTSService`.

- Fixed an issue where `OpenAIRealtimeBetaLLMService` audio chunks were hitting
an error when truncating audio content.

Expand Down
101 changes: 101 additions & 0 deletions examples/foundational/07e-interruptible-playht-http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import os
import sys

import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure

from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai import OpenAILLMService
from pipecat.services.playht import PlayHTHttpTTSService
from pipecat.transcriptions.language import Language
from pipecat.transports.services.daily import DailyParams, DailyTransport

# Load settings from a .env file; override=True lets values from the file
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Drop the handler registered under id 0 (loguru's pre-configured default
# sink) and log to stderr at DEBUG level instead.
# NOTE(review): logger.remove(0) assumes handler 0 still exists at import time.
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


async def main():
    """Run the interruptible PlayHT HTTP TTS example bot.

    Opens an aiohttp session, obtains a room URL and token from
    ``configure``, and wires a Daily transport, an OpenAI LLM and the
    PlayHT HTTP TTS service into a single pipeline. When the first
    participant joins, their transcription is captured and the LLM is
    asked to introduce itself. The coroutine returns once the pipeline
    runner finishes running the task.
    """
    async with aiohttp.ClientSession() as session:
        room_url, token = await configure(session)

        # Transport: audio out, transcription and Silero-based VAD enabled.
        daily_params = DailyParams(
            audio_out_enabled=True,
            transcription_enabled=True,
            vad_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
        )
        transport = DailyTransport(room_url, token, "Respond bot", daily_params)

        # Services: credentials are read from the environment.
        tts = PlayHTHttpTTSService(
            user_id=os.getenv("PLAYHT_USER_ID"),
            api_key=os.getenv("PLAYHT_API_KEY"),
            voice_url="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
        )
        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

        # The same list object is handed to the context and later appended to
        # by the join handler, so it must not be copied.
        system_prompt = {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        }
        messages = [system_prompt]

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

        processors = [
            transport.input(),  # Transport user input
            context_aggregator.user(),  # User responses
            llm,  # LLM
            tts,  # TTS
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses
        ]
        pipeline = Pipeline(processors)

        pipeline_params = PipelineParams(
            allow_interruptions=True,
            enable_metrics=True,
            enable_usage_metrics=True,
            report_only_initial_ttfb=True,
        )
        task = PipelineTask(pipeline, pipeline_params)

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await transport.capture_participant_transcription(participant["id"])
            # Kick off the conversation by asking the LLM for an introduction.
            intro_request = {"role": "system", "content": "Please introduce yourself to the user."}
            messages.append(intro_request)
            await task.queue_frames([LLMMessagesFrame(messages)])

        runner = PipelineRunner()
        await runner.run(task)


if __name__ == "__main__":
    # Start the bot only when this file is executed as a script, so that
    # importing the module does not run the pipeline.
    asyncio.run(main())
18 changes: 14 additions & 4 deletions src/pipecat/services/playht.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@

try:
from pyht.async_client import AsyncClient
from pyht.client import TTSOptions
from pyht.protos.api_pb2 import Format
from pyht.client import Format, TTSOptions
from pyht.client import Language as PlayHTLanguage
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
Expand Down Expand Up @@ -363,7 +363,7 @@ def __init__(
api_key: str,
user_id: str,
voice_url: str,
voice_engine: str = "Play3.0-mini",
voice_engine: str = "Play3.0-mini-http", # Options: Play3.0-mini-http, Play3.0-mini-ws
sample_rate: int = 24000,
params: InputParams = InputParams(),
**kwargs,
Expand All @@ -389,9 +389,19 @@ def __init__(
}
self.set_model_name(voice_engine)
self.set_voice(voice_url)

language_str = self._settings["language"]
playht_language = None
if language_str:
# Convert string to PlayHT Language enum
for lang in PlayHTLanguage:
if lang.value == language_str:
playht_language = lang
break

self._options = TTSOptions(
voice=self._voice_id,
language=self._settings["language"],
language=playht_language,
sample_rate=self._settings["sample_rate"],
format=self._settings["format"],
speed=self._settings["speed"],
Expand Down

0 comments on commit a8ae798

Please sign in to comment.