From ece69249e41eb1604fda7b675fef496307ca6995 Mon Sep 17 00:00:00 2001
From: Beibin Li
Date: Wed, 25 Sep 2024 08:14:20 -0700
Subject: [PATCH] Catch token count issue while streaming with customized
 models (#3241)

* Catch token count issue while streaming with customized models

If llama, llava, phi, or some other models are used for streaming (with
stream=True), the current design would crash after fetching the response.
A warning is enough in this case, just like the non-streaming use cases.

* Only catch not implemented error

---------

Co-authored-by: Chi Wang
Co-authored-by: Jack Gerrits
---
 autogen/oai/client.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/autogen/oai/client.py b/autogen/oai/client.py
index 3ae37257b21..8f6e3f185b6 100644
--- a/autogen/oai/client.py
+++ b/autogen/oai/client.py
@@ -279,7 +279,12 @@ def create(self, params: Dict[str, Any]) -> ChatCompletion:
 
             # Prepare the final ChatCompletion object based on the accumulated data
             model = chunk.model.replace("gpt-35", "gpt-3.5")  # hack for Azure API
-            prompt_tokens = count_token(params["messages"], model)
+            try:
+                prompt_tokens = count_token(params["messages"], model)
+            except NotImplementedError as e:
+                # Catch token calculation error if streaming with customized models.
+                logger.warning(str(e))
+                prompt_tokens = 0
             response = ChatCompletion(
                 id=chunk.id,
                 model=chunk.model,
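
For readers who want to try the fallback pattern outside of autogen, below is a minimal, self-contained sketch. The `count_token` here is a stand-in invented for illustration (it is not autogen's real token-counting helper): it raises `NotImplementedError` for models it does not recognize, which is the failure mode the patch guards against. `safe_prompt_tokens` mirrors the patched behavior: log a warning and fall back to 0 instead of crashing mid-stream.

```python
# Minimal sketch of the patch's fallback pattern.
# NOTE: count_token below is a hypothetical stand-in, not autogen's API.
import logging

logger = logging.getLogger(__name__)


def count_token(messages, model: str) -> int:
    # Stand-in tokenizer lookup: customized models (e.g. llama, llava, phi)
    # have no registered tokenizer here, mirroring the NotImplementedError case.
    known_models = {"gpt-3.5-turbo", "gpt-4"}
    if model not in known_models:
        raise NotImplementedError(f"Token counting is not implemented for model {model!r}")
    # Rough character-based estimate, good enough for the illustration.
    return sum(len(str(m.get("content", ""))) // 4 for m in messages)


def safe_prompt_tokens(messages, model: str) -> int:
    # Patched behavior: warn and fall back to 0 rather than crash after
    # the streamed response has already been fetched.
    try:
        return count_token(messages, model)
    except NotImplementedError as e:
        logger.warning(str(e))
        return 0


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    msgs = [{"role": "user", "content": "Hello, world!"}]
    print(safe_prompt_tokens(msgs, "gpt-4"))       # normal path: returns an estimate
    print(safe_prompt_tokens(msgs, "llama-3-8b"))  # custom model: warns, returns 0
```

Falling back to 0 keeps the usage accounting on the final ChatCompletion well-formed, while the warning makes it visible in logs that the prompt-token figure is a placeholder rather than a real count.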