diff --git a/examples/foundational/06-listen-and-respond.py b/examples/foundational/06-listen-and-respond.py
index a76812806..6a10f927c 100644
--- a/examples/foundational/06-listen-and-respond.py
+++ b/examples/foundational/06-listen-and-respond.py
@@ -44,10 +44,11 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
                 elif isinstance(d, ProcessingMetricsData):
                     print(f"!!! MetricsFrame: {frame}, processing: {d.value}")
                 elif isinstance(d, LLMUsageMetricsData):
+                    tokens = d.value
                     print(
                         f"!!! MetricsFrame: {frame}, tokens: {
-                            d.prompt_tokens}, characters: {
-                            d.completion_tokens}")
+                            tokens.prompt_tokens}, characters: {
+                            tokens.completion_tokens}")
                 elif isinstance(d, TTSUsageMetricsData):
                     print(f"!!! MetricsFrame: {frame}, characters: {d.value}")
         await self.push_frame(frame, direction)
diff --git a/src/pipecat/metrics/metrics.py b/src/pipecat/metrics/metrics.py
index 9baffcbcb..053708998 100644
--- a/src/pipecat/metrics/metrics.py
+++ b/src/pipecat/metrics/metrics.py
@@ -4,31 +4,28 @@
 
 class MetricsData(BaseModel):
     processor: str
+    model: Optional[str] = None
 
 
 class TTFBMetricsData(MetricsData):
     value: float
-    model: Optional[str] = None
 
 
 class ProcessingMetricsData(MetricsData):
     value: float
-    model: Optional[str] = None
 
 
-class LLMUsageMetricsData(MetricsData):
-    model: str
+class LLMTokenUsage(BaseModel):
     prompt_tokens: int
     completion_tokens: int
     total_tokens: int
+    cache_read_input_tokens: Optional[int] = None
+    cache_creation_input_tokens: Optional[int] = None
 
 
-class CacheUsageMetricsData(LLMUsageMetricsData):
-    cache_read_input_tokens: int
-    cache_creation_input_tokens: int
+class LLMUsageMetricsData(MetricsData):
+    value: LLMTokenUsage
 
 
 class TTSUsageMetricsData(MetricsData):
-    processor: str
-    model: Optional[str] = None
     value: int
diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py
index 051542486..b97129e96 100644
--- a/src/pipecat/processors/frame_processor.py
+++ b/src/pipecat/processors/frame_processor.py
@@ -19,7 +19,13 @@
     StartInterruptionFrame,
     StopInterruptionFrame,
     SystemFrame)
-from pipecat.metrics.metrics import LLMUsageMetricsData, ProcessingMetricsData, TTFBMetricsData, TTSUsageMetricsData
+from pipecat.metrics.metrics import (
+    LLMTokenUsage,
+    LLMUsageMetricsData,
+    MetricsData,
+    ProcessingMetricsData,
+    TTFBMetricsData,
+    TTSUsageMetricsData)
 from pipecat.utils.utils import obj_count, obj_id
 
 from loguru import logger
@@ -32,47 +38,68 @@ class FrameDirection(Enum):
 
 class FrameProcessorMetrics:
     def __init__(self, name: str):
-        self._name = name
+        self._core_metrics_data = MetricsData(processor=name)
         self._start_ttfb_time = 0
         self._start_processing_time = 0
         self._should_report_ttfb = True
 
+    def _processor_name(self):
+        return self._core_metrics_data.processor
+
+    def _model_name(self):
+        return self._core_metrics_data.model
+
+    def set_core_metrics_data(self, data: MetricsData):
+        self._core_metrics_data = data
+
     async def start_ttfb_metrics(self, report_only_initial_ttfb):
         if self._should_report_ttfb:
             self._start_ttfb_time = time.time()
             self._should_report_ttfb = not report_only_initial_ttfb
 
-    async def stop_ttfb_metrics(self, model: str | None = None):
+    async def stop_ttfb_metrics(self):
         if self._start_ttfb_time == 0:
             return None
 
         value = time.time() - self._start_ttfb_time
-        logger.debug(f"{self._name} TTFB: {value}")
-        ttfb = TTFBMetricsData(processor=self._name, value=value, model=model)
+        logger.debug(f"{self._processor_name()} TTFB: {value}")
+        ttfb = TTFBMetricsData(
+            processor=self._processor_name(),
+            value=value,
+            model=self._model_name())
        self._start_ttfb_time = 0
        return MetricsFrame(data=[ttfb])
 
     async def start_processing_metrics(self):
         self._start_processing_time = time.time()
 
-    async def stop_processing_metrics(self, model: str | None = None):
+    async def stop_processing_metrics(self):
         if self._start_processing_time == 0:
             return None
 
         value = time.time() - self._start_processing_time
-        logger.debug(f"{self._name} processing time: {value}")
-        processing = ProcessingMetricsData(processor=self._name, value=value, model=model)
+        logger.debug(f"{self._processor_name()} processing time: {value}")
+        processing = ProcessingMetricsData(
+            processor=self._processor_name(), value=value, model=self._model_name())
         self._start_processing_time = 0
         return MetricsFrame(data=[processing])
 
-    async def start_llm_usage_metrics(self, usage_params: LLMUsageMetricsData):
-        logger.debug(f"{self._name} prompt tokens: {
-            usage_params.prompt_tokens}, completion tokens: {usage_params.completion_tokens}")
-        return MetricsFrame(data=[usage_params])
+    async def start_llm_usage_metrics(self, tokens: LLMTokenUsage):
+        logger.debug(f"{self._processor_name()} prompt tokens: {
+            tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}")
+        value = LLMUsageMetricsData(
+            processor=self._processor_name(),
+            model=self._model_name(),
+            value=tokens)
+        return MetricsFrame(data=[value])
 
-    async def start_tts_usage_metrics(self, usage_params: TTSUsageMetricsData):
-        logger.debug(f"{self._name} usage characters: {usage_params.value}")
-        return MetricsFrame(data=[usage_params])
+    async def start_tts_usage_metrics(self, text: str):
+        characters = TTSUsageMetricsData(
+            processor=self._processor_name(),
+            model=self._model_name(),
+            value=len(text))
+        logger.debug(f"{self._processor_name()} usage characters: {characters.value}")
+        return MetricsFrame(data=[characters])
 
 
 class FrameProcessor:
@@ -131,13 +158,16 @@ def report_only_initial_ttfb(self):
     def can_generate_metrics(self) -> bool:
         return False
 
+    def set_core_metrics_data(self, data: MetricsData):
+        self._metrics.set_core_metrics_data(data)
+
     async def start_ttfb_metrics(self):
         if self.can_generate_metrics() and self.metrics_enabled:
             await self._metrics.start_ttfb_metrics(self._report_only_initial_ttfb)
 
-    async def stop_ttfb_metrics(self, model: str | None = None):
+    async def stop_ttfb_metrics(self):
         if self.can_generate_metrics() and self.metrics_enabled:
-            frame = await self._metrics.stop_ttfb_metrics(model)
+            frame = await self._metrics.stop_ttfb_metrics()
             if frame:
                 await self.push_frame(frame)
 
@@ -145,27 +175,27 @@ async def start_processing_metrics(self):
         if self.can_generate_metrics() and self.metrics_enabled:
             await self._metrics.start_processing_metrics()
 
-    async def stop_processing_metrics(self, model: str | None = None):
+    async def stop_processing_metrics(self):
         if self.can_generate_metrics() and self.metrics_enabled:
-            frame = await self._metrics.stop_processing_metrics(model)
+            frame = await self._metrics.stop_processing_metrics()
             if frame:
                 await self.push_frame(frame)
 
-    async def start_llm_usage_metrics(self, usage_params: LLMUsageMetricsData):
+    async def start_llm_usage_metrics(self, tokens: LLMTokenUsage):
         if self.can_generate_metrics() and self.usage_metrics_enabled:
-            frame = await self._metrics.start_llm_usage_metrics(usage_params)
+            frame = await self._metrics.start_llm_usage_metrics(tokens)
             if frame:
                 await self.push_frame(frame)
 
-    async def start_tts_usage_metrics(self, usage_params: TTSUsageMetricsData):
+    async def start_tts_usage_metrics(self, text: str):
         if self.can_generate_metrics() and self.usage_metrics_enabled:
-            frame = await self._metrics.start_tts_usage_metrics(usage_params)
+            frame = await self._metrics.start_tts_usage_metrics(text)
             if frame:
                 await self.push_frame(frame)
 
-    async def stop_all_metrics(self, model: str | None = None):
-        await self.stop_ttfb_metrics(model)
-        await self.stop_processing_metrics(model)
+    async def stop_all_metrics(self):
+        await self.stop_ttfb_metrics()
+        await self.stop_processing_metrics()
 
     async def cleanup(self):
         pass
diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py
index 1887d15a1..0254dbc8c 100644
--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -11,7 +11,6 @@
 from abc import abstractmethod
 from typing import AsyncGenerator, List, Optional, Tuple
 
-from attr import has
 from pipecat.frames.frames import (
     AudioRawFrame,
     CancelFrame,
@@ -33,6 +32,7 @@
     UserImageRequestFrame,
     VisionImageRawFrame
 )
+from pipecat.metrics.metrics import MetricsData
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.transcriptions.language import Language
 from pipecat.utils.audio import calculate_audio_volume
@@ -47,6 +47,11 @@ class AIService(FrameProcessor):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self.model_name: str = ""
+
+    def set_model_name(self, model: str):
+        self.model_name = model
+        self.set_core_metrics_data(MetricsData(processor=self.name, model=self.model_name))
 
     async def start(self, frame: StartFrame):
         pass
 
@@ -159,7 +164,7 @@ def sample_rate(self) -> int:
 
     @abstractmethod
     async def set_model(self, model: str):
-        pass
+        self.set_model_name(model)
 
     @abstractmethod
     async def set_voice(self, voice: str):
@@ -368,7 +373,7 @@ def __init__(self, **kwargs):
 
     @abstractmethod
     async def set_model(self, model: str):
-        pass
+        self.set_model_name(model)
 
     @abstractmethod
     async def set_language(self, language: Language):
@@ -483,7 +488,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             await self.push_frame(frame, direction)
             await self.start_processing_metrics()
             await self.process_generator(self.run_image_gen(frame.text))
-            await self.stop_processing_metrics(self._model if hasattr(self, "_model") else None)
+            await self.stop_processing_metrics()
         else:
             await self.push_frame(frame, direction)
 
@@ -505,6 +510,6 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
         if isinstance(frame, VisionImageRawFrame):
             await self.start_processing_metrics()
             await self.process_generator(self.run_vision(frame))
-            await self.stop_processing_metrics(self._model if hasattr(self, "_model") else None)
+            await self.stop_processing_metrics()
         else:
             await self.push_frame(frame, direction)
diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py
index 754c1d7e2..7935691ce 100644
--- a/src/pipecat/services/anthropic.py
+++ b/src/pipecat/services/anthropic.py
@@ -29,7 +29,7 @@
     FunctionCallInProgressFrame,
     StartInterruptionFrame
 )
-from pipecat.metrics.metrics import CacheUsageMetricsData
+from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import LLMService
 from pipecat.processors.aggregators.openai_llm_context import (
@@ -85,7 +85,7 @@ def __init__(
             **kwargs):
         super().__init__(**kwargs)
         self._client = AsyncAnthropic(api_key=api_key)
-        self._model = model
+        self.set_model_name(model)
         self._max_tokens = max_tokens
         self._enable_prompt_caching_beta = enable_prompt_caching_beta
 
@@ -138,11 +138,11 @@ async def _process_context(self, context: OpenAILLMContext):
                 tools=context.tools or [],
                 system=context.system,
                 messages=messages,
-                model=self._model,
+                model=self.model_name,
                 max_tokens=self._max_tokens,
                 stream=True)
 
-            await self.stop_ttfb_metrics(self._model)
+            await self.stop_ttfb_metrics()
 
             # Function calling
             tool_use_block = None
@@ -206,7 +206,7 @@ async def _process_context(self, context: OpenAILLMContext):
         except Exception as e:
             logger.exception(f"{self} exception: {e}")
         finally:
-            await self.stop_processing_metrics(self._model)
+            await self.stop_processing_metrics()
             await self.push_frame(LLMFullResponseEndFrame())
             comp_tokens = completion_tokens if not use_completion_tokens_estimate else completion_tokens_estimate
             await self._report_usage_metrics(
@@ -232,7 +232,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             context = AnthropicLLMContext.from_image_frame(frame)
         elif isinstance(frame, LLMModelUpdateFrame):
             logger.debug(f"Switching LLM model to: [{frame.model}]")
-            self._model = frame.model
+            self.set_model_name(frame.model)
         elif isinstance(frame, LLMEnablePromptCachingFrame):
             logger.debug(f"Setting enable prompt caching to: [{frame.enable}]")
             self._enable_prompt_caching_beta = frame.enable
@@ -252,9 +252,7 @@ async def _report_usage_metrics(
             cache_creation_input_tokens: int,
             cache_read_input_tokens: int):
         if prompt_tokens or completion_tokens or cache_creation_input_tokens or cache_read_input_tokens:
-            tokens = CacheUsageMetricsData(
-                processor=self.name,
-                model=self._model,
+            tokens = LLMTokenUsage(
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 cache_creation_input_tokens=cache_creation_input_tokens,
diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py
index 07a75af2b..daac57119 100644
--- a/src/pipecat/services/azure.py
+++ b/src/pipecat/services/azure.py
@@ -113,7 +113,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         result = await asyncio.to_thread(self._speech_synthesizer.speak_ssml, (ssml))
 
         if result.reason == ResultReason.SynthesizingAudioCompleted:
-            await self.start_tts_usage_metrics(TTSUsageMetricsData(processor=self.name, value=len(text)))
+            await self.start_tts_usage_metrics(text)
             await self.stop_ttfb_metrics()
             await self.push_frame(TTSStartedFrame())
             # Azure always sends a 44-byte header. Strip it off.
@@ -192,7 +192,7 @@ def __init__(
         self._api_key = api_key
         self._azure_endpoint = endpoint
         self._api_version = api_version
-        self._model = model
+        self.set_model_name(model)
         self._image_size = image_size
         self._aiohttp_session = aiohttp_session
 
diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index 97aac634f..ab6026052 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -23,7 +23,6 @@
     TTSStoppedFrame,
     LLMFullResponseEndFrame
 )
-from pipecat.metrics.metrics import TTSUsageMetricsData
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.transcriptions.language import Language
 from pipecat.services.ai_services import AsyncWordTTSService, TTSService
@@ -90,7 +89,7 @@ def __init__(
         self._cartesia_version = cartesia_version
         self._url = url
         self._voice_id = voice_id
-        self._model_id = model_id
+        self.set_model_name(model_id)
         self._output_format = {
             "container": "raw",
             "encoding": encoding,
@@ -106,8 +105,8 @@ def can_generate_metrics(self) -> bool:
         return True
 
     async def set_model(self, model: str):
+        await super().set_model(model)
         logger.debug(f"Switching TTS model to: [{model}]")
-        self._model_id = model
 
     async def set_voice(self, voice: str):
         logger.debug(f"Switching TTS voice to: [{voice}]")
@@ -141,7 +140,7 @@ async def _connect(self):
 
     async def _disconnect(self):
         try:
-            await self.stop_all_metrics(self._model_id)
+            await self.stop_all_metrics()
 
             if self._websocket:
                 await self._websocket.close()
@@ -156,9 +155,14 @@ async def _disconnect(self):
         except Exception as e:
             logger.error(f"{self} error closing websocket: {e}")
 
+    def _get_websocket(self):
+        if self._websocket:
+            return self._websocket
+        raise Exception("Websocket not connected")
+
     async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
         await super()._handle_interruption(frame, direction)
-        await self.stop_all_metrics(self._model_id)
+        await self.stop_all_metrics()
         await self.push_frame(LLMFullResponseEndFrame())
         self._context_id = None
 
@@ -170,7 +174,7 @@ async def flush_audio(self):
             "transcript": "",
             "continue": False,
             "context_id": self._context_id,
-            "model_id": self._model_id,
+            "model_id": self.model_name,
             "voice": {
                 "mode": "id",
                 "id": self._voice_id
@@ -183,12 +187,12 @@ async def _receive_task_handler(self):
         try:
-            async for message in self._websocket:
+            async for message in self._get_websocket():
                 msg = json.loads(message)
                 if not msg or msg["context_id"] != self._context_id:
                     continue
                 if msg["type"] == "done":
-                    await self.stop_ttfb_metrics(self._model_id)
+                    await self.stop_ttfb_metrics()
                     await self.push_frame(TTSStoppedFrame())
                     # Unset _context_id but not the _context_id_start_timestamp
                     # because we are likely still playing out audio and need the
@@ -200,7 +204,7 @@
                         list(zip(msg["word_timestamps"]["words"], msg["word_timestamps"]["start"]))
                     )
                 elif msg["type"] == "chunk":
-                    await self.stop_ttfb_metrics(self._model_id)
+                    await self.stop_ttfb_metrics()
                     self.start_word_timestamps()
                     frame = AudioRawFrame(
                         audio=base64.b64decode(msg["data"]),
@@ -211,7 +215,7 @@
                 elif msg["type"] == "error":
                     logger.error(f"{self} error: {msg}")
                     await self.push_frame(TTSStoppedFrame())
-                    await self.stop_all_metrics(self._model_id)
+                    await self.stop_all_metrics()
                     await self.push_error(ErrorFrame(f'{self} error: {msg["error"]}'))
                 else:
                     logger.error(f"Cartesia error, unknown message type: {msg}")
@@ -236,7 +240,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
             "transcript": text + " ",
             "continue": True,
             "context_id": self._context_id,
-            "model_id": self._model_id,
+            "model_id": self.model_name,
             "voice": {
                 "mode": "id",
                 "id": self._voice_id
@@ -246,8 +250,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
             "add_timestamps": True,
         }
         try:
-            await self._websocket.send(json.dumps(msg))
-            await self.start_tts_usage_metrics(TTSUsageMetricsData(processor=self.name, model=self._model_id, value=len(text)))
+            await self._get_websocket().send(json.dumps(msg))
+            await self.start_tts_usage_metrics(text)
         except Exception as e:
             logger.error(f"{self} error sending message: {e}")
             await self.push_frame(TTSStoppedFrame())
diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py
index 6dfc9d5cb..7e5e16d3e 100644
--- a/src/pipecat/services/deepgram.py
+++ b/src/pipecat/services/deepgram.py
@@ -19,7 +19,6 @@
     TTSStartedFrame,
     TTSStoppedFrame,
     TranscriptionFrame)
-from pipecat.metrics.metrics import TTSUsageMetricsData
 from pipecat.services.ai_services import STTService, TTSService
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
@@ -100,7 +99,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {response_text})")
                 return
 
-            await self.start_tts_usage_metrics(TTSUsageMetricsData(processor=self.name, value=len(text)))
+            await self.start_tts_usage_metrics(text)
 
             await self.push_frame(TTSStartedFrame())
             async for data in r.content:
@@ -139,6 +138,7 @@ def __init__(self,
         self._connection.on(LiveTranscriptionEvents.Transcript, self._on_message)
 
     async def set_model(self, model: str):
+        await super().set_model(model)
         logger.debug(f"Switching STT model to: [{model}]")
         self._live_options.model = model
         await self._disconnect()
@@ -165,7 +165,7 @@ async def cancel(self, frame: CancelFrame):
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
         await self.start_processing_metrics()
         await self._connection.send(audio)
-        await self.stop_processing_metrics(self._live_options.model)
+        await self.stop_processing_metrics()
         yield None
 
     async def _connect(self):
diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py
index 35af8b5e3..10015e791 100644
--- a/src/pipecat/services/elevenlabs.py
+++ b/src/pipecat/services/elevenlabs.py
@@ -20,7 +20,6 @@
     StartInterruptionFrame,
     TTSStartedFrame,
     TTSStoppedFrame)
-from pipecat.metrics.metrics import TTSUsageMetricsData
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import AsyncWordTTSService
 
@@ -108,7 +107,7 @@ def __init__(
 
         self._api_key = api_key
         self._voice_id = voice_id
-        self._model = model
+        self.set_model_name(model)
         self._url = url
         self._params = params
 
@@ -123,8 +122,8 @@ def can_generate_metrics(self) -> bool:
         return True
 
     async def set_model(self, model: str):
+        await super().set_model(model)
         logger.debug(f"Switching TTS model to: [{model}]")
-        self._model = model
         await self._disconnect()
         await self._connect()
 
@@ -161,7 +160,7 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect
     async def _connect(self):
         try:
             voice_id = self._voice_id
-            model = self._model
+            model = self.model_name
             output_format = self._params.output_format
             url = f"{
                 self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}"
@@ -181,7 +180,7 @@ async def _connect(self):
 
     async def _disconnect(self):
         try:
-            await self.stop_all_metrics(self._model)
+            await self.stop_all_metrics()
 
             if self._websocket:
                 await self._websocket.send(json.dumps({"text": ""}))
@@ -207,7 +206,7 @@ async def _receive_task_handler(self):
         async for message in self._websocket:
             msg = json.loads(message)
             if msg.get("audio"):
-                await self.stop_ttfb_metrics(self._model)
+                await self.stop_ttfb_metrics()
                 self.start_word_timestamps()
 
                 audio = base64.b64decode(msg["audio"])
@@ -253,7 +252,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 self._cumulative_time = 0
 
             await self._send_text(text)
-            await self.start_tts_usage_metrics(TTSUsageMetricsData(processor=self.name, model=self._model, value=len(text)))
+            await self.start_tts_usage_metrics(text)
         except Exception as e:
             logger.error(f"{self} error sending message: {e}")
             await self.push_frame(TTSStoppedFrame())
diff --git a/src/pipecat/services/fal.py b/src/pipecat/services/fal.py
index 672135d02..58768180f 100644
--- a/src/pipecat/services/fal.py
+++ b/src/pipecat/services/fal.py
@@ -46,7 +46,7 @@ def __init__(
             **kwargs
     ):
         super().__init__(**kwargs)
-        self._model = model
+        self.set_model_name(model)
         self._params = params
         self._aiohttp_session = aiohttp_session
         if key:
@@ -56,7 +56,7 @@ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"Generating image from prompt: {prompt}")
 
         response = await fal_client.run_async(
-            self._model,
+            self.model_name,
             arguments={"prompt": prompt, **self._params.model_dump(exclude_none=True)}
         )
 
diff --git a/src/pipecat/services/fireworks.py b/src/pipecat/services/fireworks.py
index 7fa4d64e8..87fddd838 100644
--- a/src/pipecat/services/fireworks.py
+++ b/src/pipecat/services/fireworks.py
@@ -22,4 +22,4 @@ def __init__(self,
                  *,
                  model: str = "accounts/fireworks/models/firefunction-v1",
                  base_url: str = "https://api.fireworks.ai/inference/v1"):
-        super().__init__(model, base_url)
+        super().__init__(model=model, base_url=base_url)
diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py
index b542f247d..b72169b70 100644
--- a/src/pipecat/services/google.py
+++ b/src/pipecat/services/google.py
@@ -45,12 +45,12 @@ def __init__(self, *, api_key: str, model: str = "gemini-1.5-flash-latest", **kw
         super().__init__(**kwargs)
         gai.configure(api_key=api_key)
         self._create_client(model)
-        self._model = model
 
     def can_generate_metrics(self) -> bool:
         return True
 
     def _create_client(self, model: str):
+        self.set_model_name(model)
         self._client = gai.GenerativeModel(model)
 
     def _get_messages_from_openai_context(
@@ -93,7 +93,7 @@ async def _process_context(self, context: OpenAILLMContext):
 
             response = self._client.generate_content(messages, stream=True)
 
-            await self.stop_ttfb_metrics(self._model)
+            await self.stop_ttfb_metrics()
 
             async for chunk in self._async_generator_wrapper(response):
                 try:
diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py
index 2fec2d21f..60f0cb7df 100644
--- a/src/pipecat/services/lmnt.py
+++ b/src/pipecat/services/lmnt.py
@@ -20,7 +20,6 @@
     TTSStartedFrame,
     TTSStoppedFrame,
 )
-from pipecat.metrics.metrics import TTSUsageMetricsData
 from pipecat.services.ai_services import AsyncTTSService
 
 from loguru import logger
@@ -155,7 +154,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         try:
             await self._connection.append_text(text)
             await self._connection.flush()
-            await self.start_tts_usage_metrics(TTSUsageMetricsData(processor=self.name, value=len(text)))
+            await self.start_tts_usage_metrics(text)
         except Exception as e:
             logger.error(f"{self} error sending message: {e}")
             await self.push_frame(TTSStoppedFrame())
diff --git a/src/pipecat/services/moondream.py b/src/pipecat/services/moondream.py
index 3441aeeb9..b6391cc93 100644
--- a/src/pipecat/services/moondream.py
+++ b/src/pipecat/services/moondream.py
@@ -54,6 +54,8 @@ def __init__(
     ):
         super().__init__(**kwargs)
 
+        self.set_model_name(model)
+
         if not use_cpu:
             device, dtype = detect_device()
         else:
@@ -73,7 +75,7 @@ def __init__(
 
     async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
         if not self._model:
-            logger.error(f"{self} error: Moondream model not available")
+            logger.error(f"{self} error: Moondream model not available ({self.model_name})")
             yield ErrorFrame("Moondream model not available")
             return
 
diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py
index 7ccec379f..6fe710b5e 100644
--- a/src/pipecat/services/openai.py
+++ b/src/pipecat/services/openai.py
@@ -33,7 +33,7 @@
     FunctionCallInProgressFrame,
     StartInterruptionFrame
 )
-from pipecat.metrics.metrics import LLMUsageMetricsData, TTSUsageMetricsData
+from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.llm_response import LLMUserContextAggregator, LLMAssistantContextAggregator
 
 from pipecat.processors.aggregators.openai_llm_context import (
@@ -84,7 +84,7 @@ class BaseOpenAILLMService(LLMService):
 
     def __init__(self, *, model: str, api_key=None, base_url=None, **kwargs):
         super().__init__(**kwargs)
-        self._model: str = model
+        self.set_model_name(model)
         self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs)
 
     def create_client(self, api_key=None, base_url=None, **kwargs):
@@ -105,7 +105,7 @@ async def get_chat_completions(
             context: OpenAILLMContext,
             messages: List[ChatCompletionMessageParam]) -> AsyncStream[ChatCompletionChunk]:
         chunks = await self._client.chat.completions.create(
-            model=self._model,
+            model=self.model_name,
             stream=True,
             messages=messages,
             tools=context.tools,
@@ -149,9 +149,7 @@ async def _process_context(self, context: OpenAILLMContext):
 
         async for chunk in chunk_stream:
             if chunk.usage:
-                tokens = LLMUsageMetricsData(
-                    processor=self.name,
-                    model=self._model,
+                tokens = LLMTokenUsage(
                     prompt_tokens=chunk.usage.prompt_tokens,
                     completion_tokens=chunk.usage.completion_tokens,
                     total_tokens=chunk.usage.total_tokens
@@ -161,7 +159,7 @@ async def _process_context(self, context: OpenAILLMContext):
             if len(chunk.choices) == 0:
                 continue
 
-            await self.stop_ttfb_metrics(self._model)
+            await self.stop_ttfb_metrics()
 
             if chunk.choices[0].delta.tool_calls:
                 # We're streaming the LLM response to enable the fastest response times.
@@ -224,7 +222,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             context = OpenAILLMContext.from_image_frame(frame)
         elif isinstance(frame, LLMModelUpdateFrame):
             logger.debug(f"Switching LLM model to: [{frame.model}]")
-            self._model = frame.model
+            self.set_model_name(frame.model)
         else:
             await self.push_frame(frame, direction)
 
@@ -232,7 +230,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             await self.push_frame(LLMFullResponseStartFrame())
             await self.start_processing_metrics()
             await self._process_context(context)
-            await self.stop_processing_metrics(self._model)
+            await self.stop_processing_metrics()
             await self.push_frame(LLMFullResponseEndFrame())
 
 
@@ -274,7 +272,7 @@ def __init__(
         model: str = "dall-e-3",
     ):
         super().__init__()
-        self._model = model
+        self.set_model_name(model)
         self._image_size = image_size
         self._client = AsyncOpenAI(api_key=api_key)
         self._aiohttp_session = aiohttp_session
@@ -284,7 +282,7 @@ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
 
         image = await self._client.images.generate(
             prompt=prompt,
-            model=self._model,
+            model=self.model_name,
             n=1,
             size=self._image_size
         )
@@ -326,7 +324,7 @@ def __init__(
         super().__init__(sample_rate=sample_rate, **kwargs)
 
         self._voice: ValidVoice = VALID_VOICES.get(voice, "alloy")
-        self._model = model
+        self.set_model_name(model)
         self._sample_rate = sample_rate
         self._client = AsyncOpenAI(api_key=api_key)
 
@@ -349,7 +347,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
 
             async with self._client.audio.speech.with_streaming_response.create(
                 input=text,
-                model=self._model,
+                model=self.model_name,
                 voice=self._voice,
                 response_format="pcm",
             ) as r:
@@ -360,12 +358,12 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                     yield ErrorFrame(f"Error getting audio (status: {r.status_code}, error: {error})")
                     return
 
-                await self.start_tts_usage_metrics(TTSUsageMetricsData(processor=self.name, model=self._model, value=len(text)))
+                await self.start_tts_usage_metrics(text)
 
                 await self.push_frame(TTSStartedFrame())
                 async for chunk in r.iter_bytes(8192):
                     if len(chunk) > 0:
-                        await self.stop_ttfb_metrics(self._model)
+                        await self.stop_ttfb_metrics()
                         frame = AudioRawFrame(chunk, self.sample_rate, 1)
                         yield frame
                 await self.push_frame(TTSStoppedFrame())
diff --git a/src/pipecat/services/openpipe.py b/src/pipecat/services/openpipe.py
index ada7824fb..e4e14dc15 100644
--- a/src/pipecat/services/openpipe.py
+++ b/src/pipecat/services/openpipe.py
@@ -60,7 +60,7 @@ async def get_chat_completions(
             context: OpenAILLMContext,
             messages: List[ChatCompletionMessageParam]) -> AsyncStream[ChatCompletionChunk]:
         chunks = await self._client.chat.completions.create(
-            model=self._model,
+            model=self.model_name,
             stream=True,
             messages=messages,
             openpipe={
diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py
index 736d995c5..c3200fee9 100644
--- a/src/pipecat/services/playht.py
+++ b/src/pipecat/services/playht.py
@@ -10,7 +10,6 @@
 from typing import AsyncGenerator
 
 from pipecat.frames.frames import AudioRawFrame, Frame, TTSStartedFrame, TTSStoppedFrame
-from pipecat.metrics.metrics import TTSUsageMetricsData
 from pipecat.services.ai_services import TTSService
 
 from loguru import logger
@@ -72,7 +71,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 voice_engine="PlayHT2.0-turbo",
                 options=self._options)
 
-            await self.start_tts_usage_metrics(TTSUsageMetricsData(processor=self.name, value=len(text)))
+            await self.start_tts_usage_metrics(text)
 
             await self.push_frame(TTSStartedFrame())
             async for chunk in playht_gen:
diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py
index 9bdad950b..004236ac8 100644
--- a/src/pipecat/services/together.py
+++ b/src/pipecat/services/together.py
@@ -26,7 +26,7 @@
     FunctionCallInProgressFrame,
     StartInterruptionFrame
 )
-from pipecat.metrics.metrics import LLMUsageMetricsData
+from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import LLMService
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
@@ -68,7 +68,7 @@ def __init__(
             **kwargs):
         super().__init__(**kwargs)
         self._client = AsyncTogether(api_key=api_key)
-        self._model = model
+        self.set_model_name(model)
         self._max_tokens = max_tokens
 
     def can_generate_metrics(self) -> bool:
@@ -94,7 +94,7 @@ async def _process_context(self, context: OpenAILLMContext):
 
             stream = await self._client.chat.completions.create(
                 messages=context.messages,
-                model=self._model,
+                model=self.model_name,
                 max_tokens=self._max_tokens,
                 stream=True,
             )
@@ -107,9 +107,7 @@ async def _process_context(self, context: OpenAILLMContext):
             async for chunk in stream:
                 # logger.debug(f"Together LLM event: {chunk}")
                 if chunk.usage:
-                    tokens = LLMUsageMetricsData(
-                        processor=self.name,
-                        model=self._model,
+                    tokens = LLMTokenUsage(
                         prompt_tokens=chunk.usage.prompt_tokens,
                         completion_tokens=chunk.usage.completion_tokens,
                         total_tokens=chunk.usage.total_tokens
@@ -120,7 +118,7 @@ async def _process_context(self, context: OpenAILLMContext):
                     continue
 
                 if not got_first_chunk:
-                    await self.stop_ttfb_metrics(self._model)
+                    await self.stop_ttfb_metrics()
                     if chunk.choices[0].delta.content:
                         got_first_chunk = True
                         if chunk.choices[0].delta.content[0] == "<":
@@ -142,7 +140,7 @@ async def _process_context(self, context: OpenAILLMContext):
         except Exception as e:
             logger.exception(f"{self} exception: {e}")
         finally:
-            await self.stop_processing_metrics(self._model)
+            await self.stop_processing_metrics()
             await self.push_frame(LLMFullResponseEndFrame())
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -155,7 +153,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             context = TogetherLLMContext.from_messages(frame.messages)
         elif isinstance(frame, LLMModelUpdateFrame):
             logger.debug(f"Switching LLM model to: [{frame.model}]")
-            self._model = frame.model
+            self.set_model_name(frame.model)
         else:
             await self.push_frame(frame, direction)
 
diff --git a/src/pipecat/services/whisper.py b/src/pipecat/services/whisper.py
index 952c30416..9f54f9ca0 100644
--- a/src/pipecat/services/whisper.py
+++ b/src/pipecat/services/whisper.py
@@ -52,7 +52,7 @@ def __init__(self,
         super().__init__(**kwargs)
         self._device: str = device
         self._compute_type = compute_type
-        self._model_name: str | Model = model
+        self.set_model_name(model if isinstance(model, str) else model.value)
         self._no_speech_prob = no_speech_prob
         self._model: WhisperModel | None = None
         self._load()
@@ -60,15 +60,12 @@ def __init__(self,
     def can_generate_metrics(self) -> bool:
         return True
 
-    def _get_model_name_as_str(self) -> str:
-        return self._model_name.value if isinstance(self._model_name, Enum) else self._model_name
-
     def _load(self):
         """Loads the Whisper model. Note that if this is the first time
        this model is being run, it will take time to download."""
         logger.debug("Loading Whisper model...")
         self._model = WhisperModel(
-            self._get_model_name_as_str(),
+            self.model_name,
             device=self._device,
             compute_type=self._compute_type)
         logger.debug("Loaded Whisper model")
@@ -92,8 +89,8 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
             if segment.no_speech_prob < self._no_speech_prob:
                 text += f"{segment.text} "
 
-        await self.stop_ttfb_metrics(self._get_model_name_as_str())
-        await self.stop_processing_metrics(self._get_model_name_as_str())
+        await self.stop_ttfb_metrics()
+        await self.stop_processing_metrics()
 
         if text:
             logger.debug(f"Transcription: [{text}]")
diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py
index 47c398c6a..38f0f9a64 100644
--- a/src/pipecat/services/xtts.py
+++ b/src/pipecat/services/xtts.py
@@ -15,7 +15,6 @@
     StartFrame,
     TTSStartedFrame,
     TTSStoppedFrame)
-from pipecat.metrics.metrics import TTSUsageMetricsData
 from pipecat.services.ai_services import TTSService
 
 from loguru import logger
@@ -104,7 +103,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {text})")
                 return
 
-            await self.start_tts_usage_metrics(TTSUsageMetricsData(processor=self.name, value=len(text)))
+            await self.start_tts_usage_metrics(text)
 
             await self.push_frame(TTSStartedFrame())
 
diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py
index 5a5baef31..e28fe6083 100644
--- a/src/pipecat/transports/services/daily.py
+++ b/src/pipecat/transports/services/daily.py
@@ -736,20 +736,19 @@ async def send_metrics(self, frame: MetricsFrame):
             if isinstance(d, TTFBMetricsData):
                 if "ttfb" not in metrics:
                     metrics["ttfb"] = []
-                metrics["ttfb"].append(d.model_dump_json())
+                metrics["ttfb"].append(d.model_dump())
             elif isinstance(d, ProcessingMetricsData):
                 if "processing" not in metrics:
                     metrics["processing"] = []
-                metrics["processing"].append(d.model_dump_json())
+                metrics["processing"].append(d.model_dump())
             elif isinstance(d, LLMUsageMetricsData):
                 if "tokens" not in metrics:
                     metrics["tokens"] = []
-                metrics["tokens"].append(d.model_dump_json())
+                metrics["tokens"].append(d.value.model_dump(exclude_none=True))
             elif isinstance(d, TTSUsageMetricsData):
                 if "characters" not in metrics:
                     metrics["characters"] = []
-                metrics["characters"].append(d.model_dump_json())
-        print(f"Sending metrics: {metrics}")
+                metrics["characters"].append(d.model_dump())
 
         message = DailyTransportMessageFrame(message={
             "type": "pipecat-metrics",