From a40aa56bde37f2df21fa400f934bc7195db04067 Mon Sep 17 00:00:00 2001
From: Beibin Li
Date: Sun, 28 Jul 2024 16:11:51 -0700
Subject: [PATCH 1/2] Catch token count issue while streaming with customized models

If llama, llava, phi, or some other models are used for streaming (with
stream=True), the current design would crash after fetching the response.
A warning is enough in this case, just like the non-streaming use cases.
---
 autogen/oai/client.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/autogen/oai/client.py b/autogen/oai/client.py
index 4cc7c697f73..63a0923afe6 100644
--- a/autogen/oai/client.py
+++ b/autogen/oai/client.py
@@ -272,7 +272,12 @@ def create(self, params: Dict[str, Any]) -> ChatCompletion:
 
             # Prepare the final ChatCompletion object based on the accumulated data
             model = chunk.model.replace("gpt-35", "gpt-3.5")  # hack for Azure API
-            prompt_tokens = count_token(params["messages"], model)
+            try:
+                prompt_tokens = count_token(params["messages"], model)
+            except Exception as e:
+                # Catch token calculation error if streaming with customized models.
+                logger.warning(str(e))
+                prompt_tokens = 0
             response = ChatCompletion(
                 id=chunk.id,
                 model=chunk.model,

From c27f0a926b8363bdd70c84bb7c1b341ac8f00cba Mon Sep 17 00:00:00 2001
From: Beibin Li
Date: Mon, 29 Jul 2024 11:26:09 -0700
Subject: [PATCH 2/2] Only catch not implemented error

---
 autogen/oai/client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autogen/oai/client.py b/autogen/oai/client.py
index 63a0923afe6..38af18eb0af 100644
--- a/autogen/oai/client.py
+++ b/autogen/oai/client.py
@@ -274,7 +274,7 @@ def create(self, params: Dict[str, Any]) -> ChatCompletion:
             model = chunk.model.replace("gpt-35", "gpt-3.5")  # hack for Azure API
             try:
                 prompt_tokens = count_token(params["messages"], model)
-            except Exception as e:
+            except NotImplementedError as e:
                 # Catch token calculation error if streaming with customized models.
                 logger.warning(str(e))
                 prompt_tokens = 0
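
For reviewers who want to see the failure mode in isolation, the guarded call behaves roughly like the sketch below. This is a minimal, self-contained illustration: the `count_token` stub and the `llava` model name are stand-ins, not the real `autogen.token_count_utils` implementation, which is assumed here to raise NotImplementedError for model names it does not recognize.

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


def count_token(messages, model):
    # Stand-in for autogen.token_count_utils.count_token; the assumption is that
    # it raises NotImplementedError for model names it does not recognize.
    if not model.startswith(("gpt-3.5", "gpt-4")):
        raise NotImplementedError(f"Token counting is not implemented for model {model!r}.")
    return sum(len(m["content"]) // 4 for m in messages)  # crude placeholder estimate


params = {"messages": [{"role": "user", "content": "Describe this image."}], "stream": True}
model = "llava"  # a customized, non-OpenAI model behind an OpenAI-compatible endpoint

try:
    prompt_tokens = count_token(params["messages"], model)
except NotImplementedError as e:
    # Same shape as the patched code path: log a warning instead of crashing
    # after the streamed response has already been received.
    logger.warning(str(e))
    prompt_tokens = 0

print(prompt_tokens)  # 0 for the customized model; a real count for recognized models
```

With the second commit, only NotImplementedError is downgraded to a warning; any other exception raised by count_token still propagates as before.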