From ddf708a4335ae03d52fe2b41c0ee0e3c42993238 Mon Sep 17 00:00:00 2001 From: zyzhang1130 <36942574+zyzhang1130@users.noreply.github.com> Date: Thu, 8 Aug 2024 22:17:44 +0800 Subject: [PATCH 1/3] Update litellm_model.py --- src/agentscope/models/litellm_model.py | 78 ++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/src/agentscope/models/litellm_model.py b/src/agentscope/models/litellm_model.py index 948481ae2..dfc37e101 100644 --- a/src/agentscope/models/litellm_model.py +++ b/src/agentscope/models/litellm_model.py @@ -4,6 +4,7 @@ from typing import Union, Any, List, Sequence, Optional, Generator from loguru import logger +import requests from ._model_utils import _verify_text_content_in_openai_delta_response from .model import ModelWrapperBase, ModelResponse @@ -359,3 +360,80 @@ def format( """ return ModelWrapperBase.format_for_common_chat_models(*args) + +from abc import ABC, abstractmethod + +# Step 1: Define the interface +class ImageHandler(ABC): + @abstractmethod + def send_image(self, image_data, additional_args=None): + pass + +# Step 2: Implement specific strategies +class Base64ImageHandler(ImageHandler): + def send_image(self, image_path, additional_args=None): + encoded_image = self.encode_image(image_path) + data = { + "inputs": { + "prompt": encoded_image, + "model": self.model_name, + "api_key": self.api_key, + }, + **(additional_args or {}) + } + return requests.post(self.api_url, json=data).json() + +class URLImageHandler(ImageHandler): + def send_image(self, image_url, additional_args=None): + data = { + "inputs": { + "prompt": image_url, + "model": self.model_name, + "api_key": self.api_key, + }, + **(additional_args or {}) + } + return requests.post(self.api_url, json=data).json() + + +class LiteLLMVisionWrapper(LiteLLMChatWrapper): + model_type: str = "litellm_chat_v" + def __init__(self, config_name, model_name=None, **kwargs): + super().__init__(config_name, model_name, **kwargs) + + def format(self, *args: Union[Msg, Sequence[Msg]]) -> List: + input_msgs = [] + for item in args: + if item is None: + continue + if isinstance(item, Msg): + input_msgs.append(item) + elif isinstance(item, list) and all(isinstance(subitem, Msg) for subitem in item): + input_msgs.extend(item) + else: + raise TypeError(f"The input should be a Msg object or a list of Msg objects, got {type(item)}.") + + messages = [] + + for msg in input_msgs: + formatted_content = [] + if msg.content: # Handle text content + formatted_content.append({ + "type": "text", + "text": msg.content + }) + + if msg.url: # Handle image URL content + formatted_content.append({ + "type": "image_url", + "image_url": { + "url": msg.url + } + }) + + messages.append({ + "role": msg.role, + "content": formatted_content + }) + + return messages From 2a225cfc2567fb52ae32590e44f5b9c0058b9972 Mon Sep 17 00:00:00 2001 From: zyzhang1130 <36942574+zyzhang1130@users.noreply.github.com> Date: Fri, 16 Aug 2024 15:33:26 +0800 Subject: [PATCH 2/3] Update litellm_model.py --- src/agentscope/models/litellm_model.py | 156 ++++++++++++++++--------- 1 file changed, 100 insertions(+), 56 deletions(-) diff --git a/src/agentscope/models/litellm_model.py b/src/agentscope/models/litellm_model.py index dfc37e101..9237d868e 100644 --- a/src/agentscope/models/litellm_model.py +++ b/src/agentscope/models/litellm_model.py @@ -4,7 +4,6 @@ from typing import Union, Any, List, Sequence, Optional, Generator from loguru import logger -import requests from ._model_utils import _verify_text_content_in_openai_delta_response from .model import ModelWrapperBase, ModelResponse @@ -361,79 +360,124 @@ def format( return ModelWrapperBase.format_for_common_chat_models(*args) -from abc import ABC, abstractmethod - -# Step 1: Define the interface -class ImageHandler(ABC): - @abstractmethod - def send_image(self, image_data, additional_args=None): - pass - -# Step 2: Implement specific strategies -class Base64ImageHandler(ImageHandler): - def send_image(self, image_path, additional_args=None): - encoded_image = self.encode_image(image_path) - data = { - "inputs": { - "prompt": encoded_image, - "model": self.model_name, - "api_key": self.api_key, - }, - **(additional_args or {}) - } - return requests.post(self.api_url, json=data).json() - -class URLImageHandler(ImageHandler): - def send_image(self, image_url, additional_args=None): - data = { - "inputs": { - "prompt": image_url, - "model": self.model_name, - "api_key": self.api_key, - }, - **(additional_args or {}) - } - return requests.post(self.api_url, json=data).json() - class LiteLLMVisionWrapper(LiteLLMChatWrapper): + """The model wrapper based on litellm chat API with vision capabilities. + + This class extends the LiteLLMChatWrapper to support multimodal inputs, + including both text and images. It is designed to work with vision-language + models that can process and respond to both textual and visual information. + + Note: + - The model used must support vision capabilities (e.g., GPT-4o). + + Example: + To use this wrapper with a vision-capable model: + 1. specify "model_type" as "litellm_chat_v". + 2. give the url of the image in message in the following way: + ```python + Msg( + name="Alice", + content="what is the image about", + role="user", + url="https://xxx.jpg", + ) + ``` + + + Response: + The response format is the same as LiteLLMChatWrapper, + but the model can now process and respond to both + text and image inputs. + """ + model_type: str = "litellm_chat_v" - def __init__(self, config_name, model_name=None, **kwargs): + + def __init__( + self, + config_name: str, + model_name: str = None, + **kwargs: Any, + ) -> None: + if model_name is None: + model_name = config_name + logger.warning("model_name is not set, use config_name instead.") + super().__init__(config_name, model_name, **kwargs) def format(self, *args: Union[Msg, Sequence[Msg]]) -> List: + """Format the input messages for vision-language models. + + This method processes a sequence of Msg objects, handling + both text and image content, and formats them into a + structure suitable for vision-language models. + + Args: + *args (Union[Msg, Sequence[Msg]]): A sequence of Msg objects + or lists of Msg objects. + + Returns: + List: A list of formatted messages ready for the + vision-language model. + + Raises: + TypeError: If the input is not a Msg object or a list + of Msg objects. + + Note: + - For 'system' role messages, only text content is allowed. + - For other roles, both text and image content can be included. + - Image content is expected to be provided as a URL in the + Msg object's 'url' field. + """ input_msgs = [] for item in args: if item is None: continue if isinstance(item, Msg): input_msgs.append(item) - elif isinstance(item, list) and all(isinstance(subitem, Msg) for subitem in item): + elif isinstance(item, list) and all( + isinstance(subitem, Msg) for subitem in item + ): input_msgs.extend(item) else: - raise TypeError(f"The input should be a Msg object or a list of Msg objects, got {type(item)}.") + raise TypeError( + "The input should be a Msg object or " + f"a list of Msg objects, got {type(item)}.", + ) messages = [] for msg in input_msgs: - formatted_content = [] - if msg.content: # Handle text content - formatted_content.append({ - "type": "text", - "text": msg.content - }) - - if msg.url: # Handle image URL content - formatted_content.append({ - "type": "image_url", - "image_url": { - "url": msg.url - } - }) + if msg.role == "system": + # For 'system' role, set 'content' directly to msg.content + content = msg.content + else: + formatted_content = [] + if msg.content: # Handle text content + formatted_content.append( + { + "type": "text", + "text": msg.content, + }, + ) + + if msg.url: # Handle image URL content + formatted_content.append( + { + "type": "image_url", + "image_url": { + "url": msg.url, + }, + }, + ) + content = formatted_content - messages.append({ - "role": msg.role, - "content": formatted_content - }) + messages.append( + { + "role": msg.role, + "content": content, + }, + ) return messages From c1281666f613afb2fbdfa16e54e01e14d885d976 Mon Sep 17 00:00:00 2001 From: zyzhang1130 <36942574+zyzhang1130@users.noreply.github.com> Date: Mon, 19 Aug 2024 15:14:17 +0800 Subject: [PATCH 3/3] Update litellm_model.py added reference url --- src/agentscope/models/litellm_model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/agentscope/models/litellm_model.py b/src/agentscope/models/litellm_model.py index 9237d868e..3eb9517a9 100644 --- a/src/agentscope/models/litellm_model.py +++ b/src/agentscope/models/litellm_model.py @@ -368,6 +368,9 @@ class LiteLLMVisionWrapper(LiteLLMChatWrapper): including both text and images. It is designed to work with vision-language models that can process and respond to both textual and visual information. + reference: + https://docs.litellm.ai/docs/completion/vision#checking-if-a-model-supports-vision + Note: - The model used must support vision capabilities (e.g., GPT-4o).