
Commit

Merge pull request #860 from pipecat-ai/mb/transcription
Add a TranscriptProcessor and new frames
markbackman authored Dec 19, 2024
2 parents 1ccc84d + 8a7a619 commit 6e0d3ae
Showing 10 changed files with 892 additions and 5 deletions.
17 changes: 16 additions & 1 deletion CHANGELOG.md
@@ -21,7 +21,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
`audioop-lts` (https://github.com/AbstractUmbra/audioop) to provide the same
functionality.

- Added support for more languages to ElevenLabs (Arabic, Croatian, Filipino,
- Added timestamped conversation transcript support:

- New `TranscriptProcessor` factory provides access to user and assistant
transcript processors.
- `UserTranscriptProcessor` processes user speech with timestamps from
transcription.
- `AssistantTranscriptProcessor` processes assistant responses with LLM
context timestamps.
- Messages emitted with ISO 8601 timestamps indicating when they were spoken.
- Supports all LLM formats (OpenAI, Anthropic, Google) via standard message
format.
- New examples: `28a-transcription-processor-openai.py`,
`28b-transcription-processor-anthropic.py`, and
`28c-transcription-processor-gemini.py`.
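
  A minimal sketch of what a consumer receives, assuming the fields the
  examples below read (`role`, `content`, and an optional ISO 8601
  `timestamp`); the exact dataclass signature may differ:

  ```python
  from pipecat.frames.frames import TranscriptionMessage

  # Hypothetical message, for illustration only: field names are taken from
  # how the examples below read them (msg.role, msg.content, msg.timestamp).
  msg = TranscriptionMessage(
      role="user",
      content="What can you do?",
      timestamp="2024-12-19T18:42:03+00:00",  # ISO 8601: when the words were spoken
  )
  print(f"[{msg.timestamp}] {msg.role}: {msg.content}")
  ```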

- Add support for more languages to ElevenLabs (Arabic, Croatian, Filipino,
Tamil) and PlayHT (Afrikaans, Albanian, Amharic, Arabic, Bengali, Croatian,
Galician, Hebrew, Mandarin, Serbian, Tagalog, Urdu, Xhosa).

137 changes: 137 additions & 0 deletions examples/foundational/28a-transcription-processor-openai.py
@@ -0,0 +1,137 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import os
import sys
from typing import List

import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure

from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import TranscriptionMessage, TranscriptionUpdateFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.transcript_processor import TranscriptProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport

load_dotenv(override=True)

logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


class TranscriptHandler:
    """Simple handler to demonstrate transcript processing.

    Maintains a list of conversation messages and logs them with timestamps.
    """

    def __init__(self):
        self.messages: List[TranscriptionMessage] = []

    async def on_transcript_update(
        self, processor: TranscriptProcessor, frame: TranscriptionUpdateFrame
    ):
        """Handle new transcript messages.

        Args:
            processor: The TranscriptProcessor that emitted the update
            frame: TranscriptionUpdateFrame containing new messages
        """
        self.messages.extend(frame.messages)

        # Log the new messages
        logger.info("New transcript messages:")
        for msg in frame.messages:
            timestamp = f"[{msg.timestamp}] " if msg.timestamp else ""
            logger.info(f"{timestamp}{msg.role}: {msg.content}")


async def main():
    async with aiohttp.ClientSession() as session:
        (room_url, token) = await configure(session)

        transport = DailyTransport(
            room_url,
            None,
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
                vad_audio_passthrough=True,
            ),
        )

        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
        )

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4o",
        )

        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative, helpful, and brief way. Say hello.",
            },
        ]

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

        # Create transcript processor and handler
        transcript = TranscriptProcessor()
        transcript_handler = TranscriptHandler()

        # Register event handler for transcript updates
        @transcript.event_handler("on_transcript_update")
        async def on_transcript_update(processor, frame):
            await transcript_handler.on_transcript_update(processor, frame)

        pipeline = Pipeline(
            [
                transport.input(),  # Transport user input
                stt,  # STT
                transcript.user(),  # User transcripts
                context_aggregator.user(),  # User responses
                llm,  # LLM
                tts,  # TTS
                transport.output(),  # Transport bot output
                context_aggregator.assistant(),  # Assistant spoken responses
                transcript.assistant(),  # Assistant transcripts
            ]
        )
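
        # Placement note (as wired above, per the changelog): transcript.user()
        # sits downstream of the STT service so it receives user transcriptions,
        # while transcript.assistant() sits after context_aggregator.assistant()
        # so it captures assistant responses from the LLM context.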

        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await transport.capture_participant_transcription(participant["id"])
            # Kick off the conversation.
            await task.queue_frames([context_aggregator.user().get_context_frame()])

        runner = PipelineRunner()

        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())
137 changes: 137 additions & 0 deletions examples/foundational/28b-transcription-processor-anthropic.py
@@ -0,0 +1,137 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import os
import sys
from typing import List

import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure

from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import TranscriptionMessage, TranscriptionUpdateFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.transcript_processor import TranscriptProcessor
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.transports.services.daily import DailyParams, DailyTransport

load_dotenv(override=True)

logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


class TranscriptHandler:
    """Simple handler to demonstrate transcript processing.

    Maintains a list of conversation messages and logs them with timestamps.
    """

    def __init__(self):
        self.messages: List[TranscriptionMessage] = []

    async def on_transcript_update(
        self, processor: TranscriptProcessor, frame: TranscriptionUpdateFrame
    ):
        """Handle new transcript messages.

        Args:
            processor: The TranscriptProcessor that emitted the update
            frame: TranscriptionUpdateFrame containing new messages
        """
        self.messages.extend(frame.messages)

        # Log the new messages
        logger.info("New transcript messages:")
        for msg in frame.messages:
            timestamp = f"[{msg.timestamp}] " if msg.timestamp else ""
            logger.info(f"{timestamp}{msg.role}: {msg.content}")


async def main():
    async with aiohttp.ClientSession() as session:
        (room_url, token) = await configure(session)

        transport = DailyTransport(
            room_url,
            None,
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
                vad_audio_passthrough=True,
            ),
        )

        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
        )

        llm = AnthropicLLMService(
            api_key=os.getenv("ANTHROPIC_API_KEY"), model="claude-3-5-sonnet-20241022"
        )

        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative, helpful, and brief way.",
            },
            {"role": "user", "content": "Say hello."},
        ]

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

        # Create transcript processor and handler
        transcript = TranscriptProcessor()
        transcript_handler = TranscriptHandler()

        pipeline = Pipeline(
            [
                transport.input(),  # Transport user input
                stt,  # STT
                transcript.user(),  # User transcripts
                context_aggregator.user(),  # User responses
                llm,  # LLM
                tts,  # TTS
                transport.output(),  # Transport bot output
                context_aggregator.assistant(),  # Assistant spoken responses
                transcript.assistant(),  # Assistant transcripts
            ]
        )

        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await transport.capture_participant_transcription(participant["id"])
            # Kick off the conversation.
            await task.queue_frames([context_aggregator.user().get_context_frame()])

        # Register event handler for transcript updates
        @transcript.event_handler("on_transcript_update")
        async def on_transcript_update(processor, frame):
            await transcript_handler.on_transcript_update(processor, frame)

        runner = PipelineRunner()

        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())
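
Both examples read their credentials from the environment via `load_dotenv`: `DEEPGRAM_API_KEY` and `CARTESIA_API_KEY` in each case, plus `OPENAI_API_KEY` for 28a and `ANTHROPIC_API_KEY` for 28b. The Daily room URL and token come from the shared `runner.configure` helper.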
