From a40aa56bde37f2df21fa400f934bc7195db04067 Mon Sep 17 00:00:00 2001
From: Beibin Li
Date: Sun, 28 Jul 2024 16:11:51 -0700
Subject: [PATCH 1/2] Catch token count issue while streaming with customized models

If llama, llava, phi, or some other models are used for streaming (with
stream=True), the current design would crash after fetching the response.
A warning is enough in this case, just like the non-streaming use cases.
---
 autogen/oai/client.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/autogen/oai/client.py b/autogen/oai/client.py
index 4cc7c697f73..63a0923afe6 100644
--- a/autogen/oai/client.py
+++ b/autogen/oai/client.py
@@ -272,7 +272,12 @@ def create(self, params: Dict[str, Any]) -> ChatCompletion:
 
             # Prepare the final ChatCompletion object based on the accumulated data
             model = chunk.model.replace("gpt-35", "gpt-3.5")  # hack for Azure API
-            prompt_tokens = count_token(params["messages"], model)
+            try:
+                prompt_tokens = count_token(params["messages"], model)
+            except Exception as e:
+                # Catch token calculation error if streaming with customized models.
+                logger.warning(str(e))
+                prompt_tokens = 0
             response = ChatCompletion(
                 id=chunk.id,
                 model=chunk.model,

From c27f0a926b8363bdd70c84bb7c1b341ac8f00cba Mon Sep 17 00:00:00 2001
From: Beibin Li
Date: Mon, 29 Jul 2024 11:26:09 -0700
Subject: [PATCH 2/2] Only catch not implemented error

---
 autogen/oai/client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autogen/oai/client.py b/autogen/oai/client.py
index 63a0923afe6..38af18eb0af 100644
--- a/autogen/oai/client.py
+++ b/autogen/oai/client.py
@@ -274,7 +274,7 @@ def create(self, params: Dict[str, Any]) -> ChatCompletion:
             model = chunk.model.replace("gpt-35", "gpt-3.5")  # hack for Azure API
             try:
                 prompt_tokens = count_token(params["messages"], model)
-            except Exception as e:
+            except NotImplementedError as e:
                 # Catch token calculation error if streaming with customized models.
                 logger.warning(str(e))
                 prompt_tokens = 0
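
For reviewers who want to see the failure mode in isolation, the guarded call behaves roughly like the sketch below. This is a minimal, self-contained illustration: the `count_token` stub and the `llava` model name are stand-ins, not the real `autogen.token_count_utils` implementation, which is assumed here to raise NotImplementedError for model names it does not recognize.

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


def count_token(messages, model):
    # Stand-in for autogen.token_count_utils.count_token; the assumption is that
    # it raises NotImplementedError for model names it does not recognize.
    if not model.startswith(("gpt-3.5", "gpt-4")):
        raise NotImplementedError(f"Token counting is not implemented for model {model!r}.")
    return sum(len(m["content"]) // 4 for m in messages)  # crude placeholder estimate


params = {"messages": [{"role": "user", "content": "Describe this image."}], "stream": True}
model = "llava"  # a customized, non-OpenAI model behind an OpenAI-compatible endpoint

try:
    prompt_tokens = count_token(params["messages"], model)
except NotImplementedError as e:
    # Same shape as the patched code path: log a warning instead of crashing
    # after the streamed response has already been received.
    logger.warning(str(e))
    prompt_tokens = 0

print(prompt_tokens)  # 0 for the customized model; a real count for recognized models
```

With the second commit, only NotImplementedError is downgraded to a warning; any other exception raised by count_token still propagates as before.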