From ddf708a4335ae03d52fe2b41c0ee0e3c42993238 Mon Sep 17 00:00:00 2001
From: zyzhang1130 <36942574+zyzhang1130@users.noreply.github.com>
Date: Thu, 8 Aug 2024 22:17:44 +0800
Subject: [PATCH 1/3] Update litellm_model.py

---
 src/agentscope/models/litellm_model.py | 78 ++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/src/agentscope/models/litellm_model.py b/src/agentscope/models/litellm_model.py
index 948481ae2..dfc37e101 100644
--- a/src/agentscope/models/litellm_model.py
+++ b/src/agentscope/models/litellm_model.py
@@ -4,6 +4,7 @@
 from typing import Union, Any, List, Sequence, Optional, Generator
 
 from loguru import logger
+import requests
 
 from ._model_utils import _verify_text_content_in_openai_delta_response
 from .model import ModelWrapperBase, ModelResponse
@@ -359,3 +360,80 @@ def format(
         """
 
         return ModelWrapperBase.format_for_common_chat_models(*args)
+
+from abc import ABC, abstractmethod
+
+# Step 1: Define the interface
+class ImageHandler(ABC):
+    @abstractmethod
+    def send_image(self, image_data, additional_args=None):
+        pass
+
+# Step 2: Implement specific strategies
+class Base64ImageHandler(ImageHandler):
+    def send_image(self, image_path, additional_args=None):
+        encoded_image = self.encode_image(image_path)
+        data = {
+            "inputs": {
+                "prompt": encoded_image,
+                "model": self.model_name,
+                "api_key": self.api_key,
+            },
+            **(additional_args or {})
+        }
+        return requests.post(self.api_url, json=data).json()
+
+class URLImageHandler(ImageHandler):
+    def send_image(self, image_url, additional_args=None):
+        data = {
+            "inputs": {
+                "prompt": image_url,
+                "model": self.model_name,
+                "api_key": self.api_key,
+            },
+            **(additional_args or {})
+        }
+        return requests.post(self.api_url, json=data).json()
+    
+
+class LiteLLMVisionWrapper(LiteLLMChatWrapper):
+    model_type: str = "litellm_chat_v"
+    def __init__(self, config_name, model_name=None, **kwargs):
+        super().__init__(config_name, model_name, **kwargs)
+
+    def format(self, *args: Union[Msg, Sequence[Msg]]) -> List:
+        input_msgs = []
+        for item in args:
+            if item is None:
+                continue
+            if isinstance(item, Msg):
+                input_msgs.append(item)
+            elif isinstance(item, list) and all(isinstance(subitem, Msg) for subitem in item):
+                input_msgs.extend(item)
+            else:
+                raise TypeError(f"The input should be a Msg object or a list of Msg objects, got {type(item)}.")
+
+        messages = []
+
+        for msg in input_msgs:
+            formatted_content = []
+            if msg.content:  # Handle text content
+                formatted_content.append({
+                    "type": "text",
+                    "text": msg.content
+                })
+
+            if msg.url:  # Handle image URL content
+                formatted_content.append({
+                    "type": "image_url",
+                    "image_url": {
+                        "url": msg.url
+                    }
+                })
+
+            messages.append({
+                "role": msg.role,
+                "content": formatted_content
+            })
+
+        return messages

From 2a225cfc2567fb52ae32590e44f5b9c0058b9972 Mon Sep 17 00:00:00 2001
From: zyzhang1130 <36942574+zyzhang1130@users.noreply.github.com>
Date: Fri, 16 Aug 2024 15:33:26 +0800
Subject: [PATCH 2/3] Update litellm_model.py

---
 src/agentscope/models/litellm_model.py | 156 ++++++++++++++++---------
 1 file changed, 100 insertions(+), 56 deletions(-)

diff --git a/src/agentscope/models/litellm_model.py b/src/agentscope/models/litellm_model.py
index dfc37e101..9237d868e 100644
--- a/src/agentscope/models/litellm_model.py
+++ b/src/agentscope/models/litellm_model.py
@@ -4,7 +4,6 @@
 from typing import Union, Any, List, Sequence, Optional, Generator
 
 from loguru import logger
-import requests
 
 from ._model_utils import _verify_text_content_in_openai_delta_response
 from .model import ModelWrapperBase, ModelResponse
@@ -361,79 +360,124 @@ def format(
 
         return ModelWrapperBase.format_for_common_chat_models(*args)
 
-from abc import ABC, abstractmethod
-
-# Step 1: Define the interface
-class ImageHandler(ABC):
-    @abstractmethod
-    def send_image(self, image_data, additional_args=None):
-        pass
-
-# Step 2: Implement specific strategies
-class Base64ImageHandler(ImageHandler):
-    def send_image(self, image_path, additional_args=None):
-        encoded_image = self.encode_image(image_path)
-        data = {
-            "inputs": {
-                "prompt": encoded_image,
-                "model": self.model_name,
-                "api_key": self.api_key,
-            },
-            **(additional_args or {})
-        }
-        return requests.post(self.api_url, json=data).json()
-
-class URLImageHandler(ImageHandler):
-    def send_image(self, image_url, additional_args=None):
-        data = {
-            "inputs": {
-                "prompt": image_url,
-                "model": self.model_name,
-                "api_key": self.api_key,
-            },
-            **(additional_args or {})
-        }
-        return requests.post(self.api_url, json=data).json()
-    
 
 class LiteLLMVisionWrapper(LiteLLMChatWrapper):
+    """The model wrapper based on litellm chat API with vision capabilities.
+
+    This class extends the LiteLLMChatWrapper to support multimodal inputs,
+    including both text and images. It is designed to work with vision-language
+    models that can process and respond to both textual and visual information.
+
+    Note:
+        - The model used must support vision capabilities (e.g., GPT-4o).
+
+    Example:
+        To use this wrapper with a vision-capable model:
+        1. Specify "model_type" as "litellm_chat_v".
+        2. Provide the image URL in the message as follows:
+        ```python
+        Msg(
+            name="Alice",
+            content="what is the image about",
+            role="user",
+            url="https://xxx.jpg",
+        )
+        ```
+
+
+    Response:
+        The response format is the same as LiteLLMChatWrapper,
+        but the model can now process and respond to both
+        text and image inputs.
+    """
+
     model_type: str = "litellm_chat_v"
-    def __init__(self, config_name, model_name=None, **kwargs):
+
+    def __init__(
+        self,
+        config_name: str,
+        model_name: str = None,
+        **kwargs: Any,
+    ) -> None:
+        if model_name is None:
+            model_name = config_name
+            logger.warning("model_name is not set, use config_name instead.")
+
         super().__init__(config_name, model_name, **kwargs)
 
     def format(self, *args: Union[Msg, Sequence[Msg]]) -> List:
+        """Format the input messages for vision-language models.
+
+        This method processes a sequence of Msg objects, handling
+        both text and image content, and formats them into a
+        structure suitable for vision-language models.
+
+        Args:
+            *args (Union[Msg, Sequence[Msg]]): A sequence of Msg objects
+                                               or lists of Msg objects.
+
+        Returns:
+            List: A list of formatted messages ready for the
+                  vision-language model.
+
+        Raises:
+            TypeError: If the input is not a Msg object or a list
+                       of Msg objects.
+
+        Note:
+            - For 'system' role messages, only text content is allowed.
+            - For other roles, both text and image content can be included.
+            - Image content is expected to be provided as a URL in the
+              Msg object's 'url' field.
+        """
         input_msgs = []
         for item in args:
             if item is None:
                 continue
             if isinstance(item, Msg):
                 input_msgs.append(item)
-            elif isinstance(item, list) and all(isinstance(subitem, Msg) for subitem in item):
+            elif isinstance(item, list) and all(
+                isinstance(subitem, Msg) for subitem in item
+            ):
                 input_msgs.extend(item)
             else:
-                raise TypeError(f"The input should be a Msg object or a list of Msg objects, got {type(item)}.")
+                raise TypeError(
+                    "The input should be a Msg object or "
+                    f"a list of Msg objects, got {type(item)}.",
+                )
 
         messages = []
 
         for msg in input_msgs:
-            formatted_content = []
-            if msg.content:  # Handle text content
-                formatted_content.append({
-                    "type": "text",
-                    "text": msg.content
-                })
-
-            if msg.url:  # Handle image URL content
-                formatted_content.append({
-                    "type": "image_url",
-                    "image_url": {
-                        "url": msg.url
-                    }
-                })
+            if msg.role == "system":
+                # For 'system' role, set 'content' directly to msg.content
+                content = msg.content
+            else:
+                formatted_content = []
+                if msg.content:  # Handle text content
+                    formatted_content.append(
+                        {
+                            "type": "text",
+                            "text": msg.content,
+                        },
+                    )
+
+                if msg.url:  # Handle image URL content
+                    formatted_content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": msg.url,
+                            },
+                        },
+                    )
+                content = formatted_content
 
-            messages.append({
-                "role": msg.role,
-                "content": formatted_content
-            })
+            messages.append(
+                {
+                    "role": msg.role,
+                    "content": content,
+                },
+            )
 
         return messages

From c1281666f613afb2fbdfa16e54e01e14d885d976 Mon Sep 17 00:00:00 2001
From: zyzhang1130 <36942574+zyzhang1130@users.noreply.github.com>
Date: Mon, 19 Aug 2024 15:14:17 +0800
Subject: [PATCH 3/3] Update litellm_model.py

Add a reference URL for LiteLLM vision support to the class docstring.
---
 src/agentscope/models/litellm_model.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/agentscope/models/litellm_model.py b/src/agentscope/models/litellm_model.py
index 9237d868e..3eb9517a9 100644
--- a/src/agentscope/models/litellm_model.py
+++ b/src/agentscope/models/litellm_model.py
@@ -368,6 +368,9 @@ class LiteLLMVisionWrapper(LiteLLMChatWrapper):
     including both text and images. It is designed to work with vision-language
     models that can process and respond to both textual and visual information.
 
+    Reference:
+    https://docs.litellm.ai/docs/completion/vision#checking-if-a-model-supports-vision
+
     Note:
         - The model used must support vision capabilities (e.g., GPT-4o).