From ece69249e41eb1604fda7b675fef496307ca6995 Mon Sep 17 00:00:00 2001
From: Beibin Li
Date: Wed, 25 Sep 2024 08:14:20 -0700
Subject: [PATCH] Catch token count issue while streaming with customized
 models (#3241)

* Catch token count issue while streaming with customized models

If llama, llava, phi, or some other models are used for streaming (with
stream=True), the current design would crash after fetching the response.
A warning is enough in this case, just like the non-streaming use cases.

* Only catch not implemented error

---------

Co-authored-by: Chi Wang
Co-authored-by: Jack Gerrits
---
 autogen/oai/client.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/autogen/oai/client.py b/autogen/oai/client.py
index 3ae37257b21..8f6e3f185b6 100644
--- a/autogen/oai/client.py
+++ b/autogen/oai/client.py
@@ -279,7 +279,12 @@ def create(self, params: Dict[str, Any]) -> ChatCompletion:
 
             # Prepare the final ChatCompletion object based on the accumulated data
             model = chunk.model.replace("gpt-35", "gpt-3.5")  # hack for Azure API
-            prompt_tokens = count_token(params["messages"], model)
+            try:
+                prompt_tokens = count_token(params["messages"], model)
+            except NotImplementedError as e:
+                # Catch token calculation error if streaming with customized models.
+                logger.warning(str(e))
+                prompt_tokens = 0
             response = ChatCompletion(
                 id=chunk.id,
                 model=chunk.model,
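
For readers who want to try the fallback pattern outside of autogen, below is a minimal, self-contained sketch. The `count_token` here is a stand-in invented for illustration (it is not autogen's real token-counting helper): it raises `NotImplementedError` for models it does not recognize, which is the failure mode the patch guards against. `safe_prompt_tokens` mirrors the patched behavior: log a warning and fall back to 0 instead of crashing mid-stream.

```python
# Minimal sketch of the patch's fallback pattern.
# NOTE: count_token below is a hypothetical stand-in, not autogen's API.
import logging

logger = logging.getLogger(__name__)


def count_token(messages, model: str) -> int:
    # Stand-in tokenizer lookup: customized models (e.g. llama, llava, phi)
    # have no registered tokenizer here, mirroring the NotImplementedError case.
    known_models = {"gpt-3.5-turbo", "gpt-4"}
    if model not in known_models:
        raise NotImplementedError(f"Token counting is not implemented for model {model!r}")
    # Rough character-based estimate, good enough for the illustration.
    return sum(len(str(m.get("content", ""))) // 4 for m in messages)


def safe_prompt_tokens(messages, model: str) -> int:
    # Patched behavior: warn and fall back to 0 rather than crash after
    # the streamed response has already been fetched.
    try:
        return count_token(messages, model)
    except NotImplementedError as e:
        logger.warning(str(e))
        return 0


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    msgs = [{"role": "user", "content": "Hello, world!"}]
    print(safe_prompt_tokens(msgs, "gpt-4"))       # normal path: returns an estimate
    print(safe_prompt_tokens(msgs, "llama-3-8b"))  # custom model: warns, returns 0
```

Falling back to 0 keeps the usage accounting on the final ChatCompletion well-formed, while the warning makes it visible in logs that the prompt-token figure is a placeholder rather than a real count.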