From 5485897ff23c89d13087fc270ecf810a1638b303 Mon Sep 17 00:00:00 2001
From: Bohan Qu
Date: Tue, 8 Oct 2024 17:48:23 +0800
Subject: [PATCH 1/3] feat: Add TTFT support in OpenAI generators

---
 haystack/components/generators/chat/openai.py        | 12 +++++++++++-
 releasenotes/notes/openai-ttft-42b1ad551b542930.yaml |  6 ++++++
 test/components/generators/chat/test_openai.py       | 11 ++++++++++-
 3 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 releasenotes/notes/openai-ttft-42b1ad551b542930.yaml

diff --git a/haystack/components/generators/chat/openai.py b/haystack/components/generators/chat/openai.py
index 8f54b7ad36..2de4e5b1ba 100644
--- a/haystack/components/generators/chat/openai.py
+++ b/haystack/components/generators/chat/openai.py
@@ -5,6 +5,7 @@
 import copy
 import json
 import os
+from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Union
 
 from openai import OpenAI, Stream
@@ -222,11 +223,15 @@ def run(
             raise ValueError("Cannot stream multiple responses, please set n=1.")
         chunks: List[StreamingChunk] = []
         chunk = None
+        _first_token = True
 
         # pylint: disable=not-an-iterable
         for chunk in chat_completion:
             if chunk.choices and streaming_callback:
                 chunk_delta: StreamingChunk = self._build_chunk(chunk)
+                if _first_token:
+                    _first_token = False
+                    chunk_delta.meta["completion_start_time"] = datetime.now().isoformat()
                 chunks.append(chunk_delta)
                 streaming_callback(chunk_delta)  # invoke callback with the chunk_delta
         completions = [self._connect_chunks(chunk, chunks)]
@@ -280,7 +285,12 @@ def _connect_chunks(self, chunk: Any, chunks: List[StreamingChunk]) -> ChatMessa
                 payload["function"]["arguments"] += delta.arguments or ""
             complete_response = ChatMessage.from_assistant(json.dumps(payloads))
         else:
-            complete_response = ChatMessage.from_assistant("".join([chunk.content for chunk in chunks]))
+            total_content = ""
+            total_meta = {}
+            for streaming_chunk in chunks:
+                total_content += streaming_chunk.content
+                total_meta.update(streaming_chunk.meta)
+            complete_response = ChatMessage.from_assistant(total_content, meta=total_meta)
         complete_response.meta.update(
             {
                 "model": chunk.model,
diff --git a/releasenotes/notes/openai-ttft-42b1ad551b542930.yaml b/releasenotes/notes/openai-ttft-42b1ad551b542930.yaml
new file mode 100644
index 0000000000..bc39b96f7f
--- /dev/null
+++ b/releasenotes/notes/openai-ttft-42b1ad551b542930.yaml
@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    Add TTFT (Time-to-First-Token) support for OpenAI generators. This
+    captures the time taken to generate the first token from the model and
+    can be used to analyze the latency of the application.
diff --git a/test/components/generators/chat/test_openai.py b/test/components/generators/chat/test_openai.py
index c3332aae97..9d5122d069 100644
--- a/test/components/generators/chat/test_openai.py
+++ b/test/components/generators/chat/test_openai.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import logging
 import os
+from unittest.mock import patch
 
 import pytest
 from openai import OpenAIError
@@ -219,7 +220,8 @@ def streaming_callback(chunk: StreamingChunk) -> None:
         assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
         assert "Hello" in response["replies"][0].content  # see mock_chat_completion_chunk
 
-    def test_run_with_streaming_callback_in_run_method(self, chat_messages, mock_chat_completion_chunk):
+    @patch("haystack.components.generators.chat.openai.datetime")
+    def test_run_with_streaming_callback_in_run_method(self, mock_datetime, chat_messages, mock_chat_completion_chunk):
         streaming_callback_called = False
 
         def streaming_callback(chunk: StreamingChunk) -> None:
@@ -240,6 +242,13 @@ def streaming_callback(chunk: StreamingChunk) -> None:
         assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
         assert "Hello" in response["replies"][0].content  # see mock_chat_completion_chunk
 
+        assert hasattr(response["replies"][0], "meta")
+        assert isinstance(response["replies"][0].meta, dict)
+        assert (
+            response["replies"][0].meta["completion_start_time"]
+            == mock_datetime.now.return_value.isoformat.return_value
+        )
+
     def test_check_abnormal_completions(self, caplog):
         caplog.set_level(logging.INFO)
         component = OpenAIChatGenerator(api_key=Secret.from_token("test-api-key"))

From 4fd2723185c9adec975f3e40fcbacc9ffde787ae Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 31 Oct 2024 15:51:23 +0100
Subject: [PATCH 2/3] pylint fixes

---
 haystack/components/generators/chat/openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haystack/components/generators/chat/openai.py b/haystack/components/generators/chat/openai.py
index 2de4e5b1ba..42664d857f 100644
--- a/haystack/components/generators/chat/openai.py
+++ b/haystack/components/generators/chat/openai.py
@@ -64,7 +64,7 @@ class OpenAIChatGenerator:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-arguments
         self,
         api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
         model: str = "gpt-4o-mini",

From 7b9bdf95e773117de0bdf10180bbf8a2c3463bcf Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 31 Oct 2024 16:02:57 +0100
Subject: [PATCH 3/3] correct disable

---
 haystack/components/generators/chat/openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haystack/components/generators/chat/openai.py b/haystack/components/generators/chat/openai.py
index 42664d857f..6013880e74 100644
--- a/haystack/components/generators/chat/openai.py
+++ b/haystack/components/generators/chat/openai.py
@@ -64,7 +64,7 @@ class OpenAIChatGenerator:
     ```
     """
 
-    def __init__(  # pylint: disable=too-many-arguments
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
         model: str = "gpt-4o-mini",
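
For reviewers, a minimal usage sketch of the feature this series adds (not part of any patch above): it shows how a caller could read the new `completion_start_time` meta field to compute time-to-first-token. It assumes a valid `OPENAI_API_KEY` in the environment; the prompt, the model name, and the `request_start` variable are illustrative bookkeeping introduced here, not something the generator provides.

```python
from datetime import datetime

from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.dataclasses import ChatMessage, StreamingChunk


def streaming_callback(chunk: StreamingChunk) -> None:
    # Per the patch, only the first streamed chunk carries "completion_start_time".
    if "completion_start_time" in chunk.meta:
        print("first token received at:", chunk.meta["completion_start_time"])


generator = OpenAIChatGenerator(model="gpt-4o-mini", streaming_callback=streaming_callback)

request_start = datetime.now()  # our own start-of-request timestamp
result = generator.run(messages=[ChatMessage.from_user("Say hello.")])
reply = result["replies"][0]

# Because _connect_chunks now merges chunk meta into the final reply, the
# timestamp is also available after the run completes.
first_token_time = datetime.fromisoformat(reply.meta["completion_start_time"])
print("TTFT:", (first_token_time - request_start).total_seconds(), "seconds")
```

Note that `completion_start_time` is an ISO-8601 timestamp of the first streamed chunk, not a duration, so callers subtract their own request start time to obtain the TTFT latency.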