From 8b5574d10ae6bd9ba06a8132a0453830934420f7 Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Tue, 17 Sep 2024 20:22:57 +0300 Subject: [PATCH 01/17] Refactor openai model --- crab/agents/backend_models/openai_model.py | 115 +++++++++++---------- crab/core/models/__init__.py | 3 +- crab/core/models/agent_interface.py | 3 + 3 files changed, 65 insertions(+), 56 deletions(-) diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index e959d5e..c270460 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -14,11 +14,11 @@ import json from typing import Any -from crab import Action, ActionOutput, BackendModel, BackendOutput, MessageType +from crab import Action, ActionOutput, BackendModel, BackendOutput, Message, MessageType try: import openai - from openai.types.chat import ChatCompletion + from openai.types.chat import ChatCompletionMessage openai_model_enable = True except ImportError: @@ -52,32 +52,15 @@ def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.token_usage = 0 self.chat_history = [] - def chat(self, message: list[tuple[str, MessageType]]) -> BackendOutput: - # Initialize chat history - request = [self.openai_system_message] - if self.history_messages_len > 0 and len(self.chat_history) > 0: - for history_message in self.chat_history[-self.history_messages_len :]: - request = request + history_message - - if not isinstance(message, list): + def chat(self, message: list[Message] | Message) -> BackendOutput: + if isinstance(message, tuple): message = [message] - - new_message = { - "role": "user", - "content": [self._convert_message(part) for part in message], - } + request = self.fetch_from_memory() + new_message = self.construct_new_message(message) request.append(new_message) - - response = self.call_api(request) - response_message = response.choices[0].message + response_message = self.call_api(request) self.record_message(new_message, response_message) - - return BackendOutput( - message=response_message.content, - action_list=self._convert_tool_calls_to_action_list( - response_message.tool_calls - ), - ) + return self.generate_backend_output(response_message) def get_token_usage(self): return self.token_usage @@ -98,7 +81,7 @@ def record_message(self, new_message: dict, response_message: dict) -> None: } ) # extend conversation with function response - def call_api(self, request_messages: list) -> ChatCompletion: + def call_api(self, request_messages: list) -> ChatCompletionMessage: if self.action_schema is not None: response = self.client.chat.completions.create( messages=request_messages, # type: ignore @@ -115,24 +98,58 @@ ) self.token_usage += response.usage.total_tokens - return response + return response.choices[0].message - @staticmethod - def _convert_message(message: tuple[str, MessageType]): - match message[1]: - case MessageType.TEXT: - return { - "type": "text", - "text": message[0], - } - case MessageType.IMAGE_JPG_BASE64: - return { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{message[0]}", - "detail": "high", - }, - } + def fetch_from_memory(self) -> list[dict]: + request = [self.openai_system_message] + if self.history_messages_len > 0: + fetch_history_len = min(self.history_messages_len, len(self.chat_history)) + for history_message in self.chat_history[-fetch_history_len:]: + request = request + history_message + return request + + def 
construct_new_message( + self, message: list[tuple[str, MessageType]] + ) -> list[dict]: + new_message_content = [] + for content, msg_type in message: + match msg_type: + case MessageType.TEXT: + new_message_content.append( + { + "type": "text", + "text": content, + } + ) + case MessageType.IMAGE_JPG_BASE64: + new_message_content.append( + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{message[0]}", + "detail": "high", + }, + } + ) + + return {"role": "user", "content": new_message_content} + + def generate_backend_output( + self, response_message: ChatCompletionMessage + ) -> BackendOutput: + if response_message.tool_calls is None: + return BackendOutput(message=response_message.content, action_list=None) + action_list = [ + ActionOutput( + name=call.function.name, + arguments=json.loads(call.function.arguments), + ) + for call in response_message.tool_calls + ] + return BackendOutput( + message=response_message.content, + action_list=action_list, + ) @staticmethod def _convert_action_to_schema(action_space): @@ -143,15 +160,3 @@ def _convert_action_to_schema(action_space): new_action = action.to_openai_json_schema() actions.append({"type": "function", "function": new_action}) return actions - - @staticmethod - def _convert_tool_calls_to_action_list(tool_calls) -> list[ActionOutput]: - if tool_calls is None: - return tool_calls - return [ - ActionOutput( - name=call.function.name, - arguments=json.loads(call.function.arguments), - ) - for call in tool_calls - ] diff --git a/crab/core/models/__init__.py b/crab/core/models/__init__.py index 4a10164..5906c98 100644 --- a/crab/core/models/__init__.py +++ b/crab/core/models/__init__.py @@ -13,7 +13,7 @@ # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== # ruff: noqa: F401 from .action import Action, ClosedAction -from .agent_interface import ActionOutput, BackendOutput, MessageType +from .agent_interface import ActionOutput, BackendOutput, Message, MessageType from .benchmark_interface import StepResult from .config import BenchmarkConfig, EnvironmentConfig, VMEnvironmentConfig from .evaluator import Evaluator @@ -23,6 +23,7 @@ "Action", "ClosedAction", "MessageType", + "Message", "ActionOutput", "BackendOutput", "StepResult", diff --git a/crab/core/models/agent_interface.py b/crab/core/models/agent_interface.py index 6ba2214..4639f7e 100644 --- a/crab/core/models/agent_interface.py +++ b/crab/core/models/agent_interface.py @@ -24,6 +24,9 @@ class MessageType(IntEnum): IMAGE_JPG_BASE64 = 1 +Message = tuple[str, MessageType] + + class ActionOutput(BaseModel): name: str arguments: dict[str, Any] From 4147819a7df2cfd2511f0a7563ac826239c46cd5 Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Tue, 17 Sep 2024 20:26:03 +0300 Subject: [PATCH 02/17] Tiny type fix --- crab/agents/backend_models/openai_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index c270460..f9d1650 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -108,9 +108,7 @@ def fetch_from_memory(self) -> list[dict]: request = request + history_message return request - def construct_new_message( - self, message: list[tuple[str, MessageType]] - ) -> list[dict]: + def construct_new_message(self, message: list[Message]) -> list[dict]: new_message_content = [] for content, msg_type in message: match msg_type: From f0d5f6b26902f1e2f85ad4b9f1da8f6ef6016a75 Mon Sep 17 
00:00:00 2001 From: Tianqi Xu Date: Tue, 17 Sep 2024 20:41:32 +0300 Subject: [PATCH 03/17] Tiny fix --- crab/agents/backend_models/openai_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index f9d1650..ed875e8 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -124,7 +124,7 @@ def construct_new_message(self, message: list[Message]) -> list[dict]: { "type": "image_url", "image_url": { - "url": f"data:image/jpeg;base64,{message[0]}", + "url": f"data:image/jpeg;base64,{content}", "detail": "high", }, } From 138c1d8abf0d8f1568fa4d0e0f7f2368db0ba6a8 Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Wed, 18 Sep 2024 14:13:37 +0300 Subject: [PATCH 04/17] Fix mypy type checking --- crab/agents/backend_models/openai_model.py | 23 +++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index ed875e8..5e95535 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -50,7 +50,7 @@ def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.action_space = action_space self.action_schema = self._convert_action_to_schema(self.action_space) self.token_usage = 0 - self.chat_history = [] + self.chat_history: list[list[ChatCompletionMessage | dict]] = [] def chat(self, message: list[Message] | Message) -> BackendOutput: if isinstance(message, tuple): @@ -65,13 +65,14 @@ def get_token_usage(self): return self.token_usage - def record_message(self, new_message: dict, response_message: dict) -> None: + def record_message( + self, new_message: dict, response_message: ChatCompletionMessage + ) -> None: self.chat_history.append([new_message]) self.chat_history[-1].append(response_message) - if self.action_schema: - tool_calls = response_message.tool_calls - for tool_call in tool_calls: + if self.action_schema and response_message.tool_calls is not None: + for tool_call in response_message.tool_calls: self.chat_history[-1].append( { "tool_call_id": tool_call.id, @@ -84,7 +85,7 @@ def record_message(self, new_message: dict, response_message: dict) -> None: def call_api(self, request_messages: list) -> ChatCompletionMessage: if self.action_schema is not None: response = self.client.chat.completions.create( - messages=request_messages, # type: ignore + messages=request_messages, model=self.model, tools=self.action_schema, tool_choice="required", @@ -92,7 +93,7 @@ ) else: response = self.client.chat.completions.create( - messages=request_messages, # type: ignore + messages=request_messages, model=self.model, **self.parameters, ) @@ -100,16 +101,16 @@ self.token_usage += response.usage.total_tokens return response.choices[0].message - def fetch_from_memory(self) -> list[dict]: - request = [self.openai_system_message] + def fetch_from_memory(self) -> list[ChatCompletionMessage | dict]: + request: list[ChatCompletionMessage | dict] = [self.openai_system_message] if self.history_messages_len > 0: fetch_history_len = min(self.history_messages_len, len(self.chat_history)) for history_message in self.chat_history[-fetch_history_len:]: request = request + 
history_message return request - def construct_new_message(self, message: list[Message]) -> list[dict]: - new_message_content = [] + def construct_new_message(self, message: list[Message]) -> dict[str, Any]: + new_message_content: list[dict[str, Any]] = [] for content, msg_type in message: match msg_type: case MessageType.TEXT: From aff284e9f85334f2e5b0815f4e0f8f3ba63b79cc Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Wed, 18 Sep 2024 19:08:41 +0300 Subject: [PATCH 05/17] Fix all agent tests and add create_backend_model function --- crab/agents/backend_models/__init__.py | 40 +++++++++++++ crab/agents/backend_models/claude_model.py | 9 ++- crab/agents/backend_models/gemini_model.py | 18 +++--- crab/agents/backend_models/openai_model.py | 15 ++++- crab/agents/policies/multi_agent_by_env.py | 23 ++++---- crab/agents/policies/multi_agent_by_func.py | 29 +++++----- crab/agents/policies/single_agent.py | 19 ++++--- crab/agents/utils.py | 56 +++++++++++++++++++ crab/core/agent_policy.py | 53 +----------------- .../backend_models/test_claude_model.py | 21 ++++--- .../backend_models/test_gemini_model.py | 14 +++-- .../backend_models/test_openai_model.py | 18 +++--- .../policies/test_multi_agent_by_func.py | 11 ++-- .../policies/test_mutli_agent_by_env.py | 11 ++-- test/agents/policies/test_single_agent.py | 7 ++- 15 files changed, 218 insertions(+), 126 deletions(-) create mode 100644 crab/agents/utils.py diff --git a/crab/agents/backend_models/__init__.py b/crab/agents/backend_models/__init__.py index 5f36882..c087ca0 100644 --- a/crab/agents/backend_models/__init__.py +++ b/crab/agents/backend_models/__init__.py @@ -12,7 +12,47 @@ # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== # ruff: noqa: F401 +from typing import Any, Literal + +from pydantic import BaseModel + +from crab.core.backend_model import BackendModel + from .camel_model import CamelModel from .claude_model import ClaudeModel from .gemini_model import GeminiModel from .openai_model import OpenAIModel + + +class BackendModelConfig(BaseModel): + model_class: Literal["openai", "claude", "gemini", "camel"] + model_name: str + history_messages_len: int = 0 + parameters: dict[str, Any] = {} + tool_call_required: bool = False + + +def create_backend_model(model_config: BackendModelConfig) -> BackendModel: + match model_config.model_class: + case "claude": + return ClaudeModel( + model=model_config.model_name, + parameters=model_config.parameters, + history_messages_len=model_config.history_messages_len, + ) + case "gemini": + return GeminiModel( + model=model_config.model_name, + parameters=model_config.parameters, + history_messages_len=model_config.history_messages_len, + ) + case "openai": + return OpenAIModel( + model=model_config.model_name, + parameters=model_config.parameters, + history_messages_len=model_config.history_messages_len, + ) + case "camel": + raise NotImplementedError("Camel model is not supported yet.") + case _: + raise ValueError(f"Unsupported model class: {model_config.model_class}") diff --git a/crab/agents/backend_models/claude_model.py b/crab/agents/backend_models/claude_model.py index 7ffc4c2..cf03e55 100644 --- a/crab/agents/backend_models/claude_model.py +++ b/crab/agents/backend_models/claude_model.py @@ -32,6 +32,7 @@ def __init__( model: str, parameters: dict[str, Any] = dict(), history_messages_len: int = 0, + tool_call_required: bool = False, ) -> None: if anthropic_model_enable is False: raise ImportError("Please install anthropic to use 
ClaudeModel") @@ -41,6 +42,7 @@ history_messages_len, ) self.client = anthropic.Anthropic() + self.tool_call_required = tool_call_required def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.system_message = system_message @@ -93,6 +95,7 @@ def record_message(self, new_message: dict, response_message: dict) -> None: "content": "success", } for call in tool_calls + if isinstance(call, ToolUseBlock) ], } ) @@ -101,12 +104,14 @@ def call_api(self, request_messages: list): while True: try: if self.action_schema is not None: - response = self.client.beta.tools.messages.create( + response = self.client.messages.create( system=self.system_message, # <-- system prompt messages=request_messages, # type: ignore model=self.model, tools=self.action_schema, - tool_choice={"type": "any"}, + tool_choice={ + "type": "any" if self.tool_call_required else "auto" + }, **self.parameters, ) else: diff --git a/crab/agents/backend_models/gemini_model.py b/crab/agents/backend_models/gemini_model.py index 663aba2..26123b6 100644 --- a/crab/agents/backend_models/gemini_model.py +++ b/crab/agents/backend_models/gemini_model.py @@ -35,6 +35,7 @@ def __init__( model: str, parameters: dict[str, Any] = dict(), history_messages_len: int = 0, + tool_call_required: bool = False, ) -> None: if gemini_model_enable is False: raise ImportError("Please install google.generativeai to use GeminiModel") @@ -45,6 +46,7 @@ ) genai.configure(api_key=os.environ["GEMINI_API_KEY"]) self.client = genai + self.tool_call_required = tool_call_required def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.system_message = system_message @@ -98,7 +100,11 @@ def call_api(self, request_messages: list): try: if self.action_schema is not None: tool_config = content_types.to_tool_config( - {"function_calling_config": {"mode": "ANY"}} + { + "function_calling_config": { + "mode": "ANY" if self.tool_call_required else "AUTO" + } + } ) response = self.client.GenerativeModel( self.model, system_instruction=self.system_message @@ -141,9 +147,7 @@ def _convert_action_to_schema(cls, action_space): return None actions = [] for action in action_space: - actions.append( - Tool(function_declarations=[cls._action_to_funcdec_policy(action)]) - ) + actions.append(Tool(function_declarations=[cls._action_to_funcdec(action)])) return actions @staticmethod @@ -171,14 +175,14 @@ def _clear_schema(cls, schema_dict: dict): cls._clear_schema(schema_dict["items"]) @classmethod - def _action_to_funcdec(cls, action: Action, env: str): + def _action_to_funcdec(cls, action: Action) -> FunctionDeclaration: "Converts crab Action to google FunctionDeclaration" p_schema = action.parameters.model_json_schema() if "$defs" in p_schema: p_schema = json_expand_refs(p_schema) cls._clear_schema(p_schema) return FunctionDeclaration( - name=action.name + "__in__" + env, - description="In {} environment, {}".format(env, action.description), + name=action.name, + description=action.description, parameters=p_schema, ) diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index 5e95535..c7ba157 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -31,6 +31,7 @@ def __init__( model: str, parameters: dict[str, Any] = dict(), history_messages_len: int = 0, + tool_call_required: bool = False, ) -> None: if not openai_model_enable: raise ImportError("Please install openai to use OpenAIModel") @@ -40,6 +41,16 @@ 
def __init__( history_messages_len, ) self.client = openai.OpenAI() + self.tool_call_required = tool_call_required + self.system_message = "You are a helpful assistant." + self.openai_system_message = { + "role": "system", + "content": self.system_message, + } + self.action_space = None + self.action_schema = None + self.token_usage = 0 + self.chat_history: list[list[ChatCompletionMessage | dict]] = [] def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.system_message = system_message @@ -88,12 +99,12 @@ def call_api(self, request_messages: list) -> ChatCompletionMessage: messages=request_messages, model=self.model, tools=self.action_schema, - tool_choice="required", + tool_choice="required" if self.tool_call_required else "auto", **self.parameters, ) else: response = self.client.chat.completions.create( - messages=request_messages, + messages=request_messages, # type: ignore model=self.model, **self.parameters, ) diff --git a/crab/agents/policies/multi_agent_by_env.py b/crab/agents/policies/multi_agent_by_env.py index d2cfc2c..b72a535 100644 --- a/crab/agents/policies/multi_agent_by_env.py +++ b/crab/agents/policies/multi_agent_by_env.py @@ -11,9 +11,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== -from copy import copy - from crab import Action, ActionOutput +from crab.agents.backend_models import BackendModelConfig, create_backend_model +from crab.agents.utils import generate_action_prompt from crab.core.agent_policy import AgentPolicy from crab.core.backend_model import ( BackendModel, @@ -57,12 +57,12 @@ class MultiAgentByEnvPolicy(AgentPolicy): def __init__( self, - main_agent_model_backend: BackendModel, - env_agent_model_backend: BackendModel, + main_agent_model_backend: BackendModelConfig, + env_agent_model_backend: BackendModelConfig, ): - self.main_agent_model_backend = copy(main_agent_model_backend) - self.env_agent_model_backend = env_agent_model_backend - self.reset(task_description="", action_spaces=None, env_descriptions={}) + self.main_agent_model_backend = create_backend_model(main_agent_model_backend) + self.env_agent_model_backend_config = env_agent_model_backend + self.reset(task_description="", action_spaces={}, env_descriptions={}) def reset( self, @@ -82,15 +82,16 @@ def reset( ) self.env_agent_model_backends: dict[str, BackendModel] = {} for env in action_spaces: - backend = copy(self.env_agent_model_backend) + backend = create_backend_model(self.env_agent_model_backend_config) if env == "root": backend.reset(root_agent_system_message, action_spaces[env]) else: + backend.tool_call_required = True env_agent_system_message = self._env_agent_prompt.format( task_description=task_description, environment=env, env_description=env_descriptions[env], - action_descriptions=self.generate_action_prompt(action_spaces[env]), + action_descriptions=generate_action_prompt(action_spaces[env]), ) backend.reset(env_agent_system_message, action_spaces[env]) self.env_agent_model_backends[env] = backend @@ -140,5 +141,7 @@ def chat( ) else: output = backend.chat((main_agent_message, MessageType.TEXT)) + for action in output.action_list: + action.env = env tool_calls.extend(output.action_list) - return self.decode_combined_action(tool_calls) + return tool_calls diff --git a/crab/agents/policies/multi_agent_by_func.py b/crab/agents/policies/multi_agent_by_func.py index 8f95b72..eec0159 100644 --- 
a/crab/agents/policies/multi_agent_by_func.py +++ b/crab/agents/policies/multi_agent_by_func.py @@ -11,14 +11,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== -from copy import copy - -from crab import Action, ActionOutput -from crab.core.agent_policy import AgentPolicy -from crab.core.backend_model import ( - BackendModel, - MessageType, +from crab.agents.backend_models import BackendModelConfig, create_backend_model +from crab.agents.utils import ( + combine_multi_env_action_space, + decode_combined_action, + generate_action_prompt, ) +from crab.core import Action, ActionOutput +from crab.core.agent_policy import AgentPolicy +from crab.core.backend_model import MessageType class MultiAgentByFuncPolicy(AgentPolicy): @@ -40,11 +41,11 @@ class MultiAgentByFuncPolicy(AgentPolicy): def __init__( self, - main_agent_model_backend: BackendModel, - tool_agent_model_backend: BackendModel, + main_agent_model_backend: BackendModelConfig, + tool_agent_model_backend: BackendModelConfig, ): - self.main_agent_model_backend = copy(main_agent_model_backend) - self.tool_agent_model_backend = copy(tool_agent_model_backend) + self.main_agent_model_backend = create_backend_model(main_agent_model_backend) + self.tool_agent_model_backend = create_backend_model(tool_agent_model_backend) self.reset(task_description="", action_spaces=None, env_descriptions={}) def reset( @@ -54,11 +55,11 @@ def reset( env_descriptions: dict[str, str], ) -> list[ActionOutput]: self.task_description = task_description - self.action_space = self.combine_multi_env_action_space(action_spaces) + self.action_space = combine_multi_env_action_space(action_spaces) main_agent_system_message = self._system_prompt.format( task_description=task_description, - action_descriptions=self.generate_action_prompt(self.action_space), + action_descriptions=generate_action_prompt(self.action_space), env_description=str(env_descriptions), ) self.main_agent_model_backend.reset(main_agent_system_message, None) @@ -95,4 +96,4 @@ def chat( tool_output = self.tool_agent_model_backend.chat( (output.message, MessageType.TEXT) ) - return self.decode_combined_action(tool_output.action_list) + return decode_combined_action(tool_output.action_list) diff --git a/crab/agents/policies/single_agent.py b/crab/agents/policies/single_agent.py index 7003c11..7746c53 100644 --- a/crab/agents/policies/single_agent.py +++ b/crab/agents/policies/single_agent.py @@ -11,12 +11,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. 
=========== -from copy import copy - from crab import Action, ActionOutput +from crab.agents.backend_models import BackendModelConfig, create_backend_model +from crab.agents.utils import ( + combine_multi_env_action_space, + decode_combined_action, + generate_action_prompt, +) from crab.core.agent_policy import AgentPolicy from crab.core.backend_model import ( - BackendModel, MessageType, ) from crab.utils.measure import timed @@ -46,9 +49,9 @@ class SingleAgentPolicy(AgentPolicy): def __init__( self, - model_backend: BackendModel, + model_backend: BackendModelConfig, ): - self.model_backend = copy(model_backend) + self.model_backend = create_backend_model(model_backend) self.reset(task_description="", action_spaces=None, env_descriptions={}) def reset( @@ -58,10 +61,10 @@ def reset( env_descriptions: dict[str, str], ) -> list: self.task_description = task_description - self.action_space = self.combine_multi_env_action_space(action_spaces) + self.action_space = combine_multi_env_action_space(action_spaces) system_message = self._system_prompt.format( task_description=task_description, - action_descriptions=self.generate_action_prompt(self.action_space), + action_descriptions=generate_action_prompt(self.action_space), env_description=str(env_descriptions), ) self.model_backend.reset(system_message, self.action_space) @@ -87,4 +90,4 @@ def chat( ) ) output = self.model_backend.chat(prompt) - return self.decode_combined_action(output.action_list) + return decode_combined_action(output.action_list) diff --git a/crab/agents/utils.py b/crab/agents/utils.py new file mode 100644 index 0000000..e3a18c7 --- /dev/null +++ b/crab/agents/utils.py @@ -0,0 +1,56 @@ +# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== +from crab.core import Action, ActionOutput + + +def combine_multi_env_action_space( + action_space: dict[str, list[Action]] | None, +) -> list[Action]: + """Combine multi-env action space together to fit in a single agent.""" + result = [] + if action_space is None: + return result + for env in action_space: + for action in action_space[env]: + new_action = action.model_copy() + new_action.name = new_action.name + "__in__" + env + new_action.description = f"In {env} environment, " + new_action.description + result.append(new_action) + return result + + +def decode_combined_action( + output_actions: list[ActionOutput], +) -> list[ActionOutput]: + """Decode combined action output to action output with the corresponding + environment. + """ + result = [] + for output in output_actions: + name_env = output.name.split("__in__") + if len(name_env) != 2: + raise RuntimeError( + 'The decoded action name should contain the splitter "__in__".' 
+ ) + new_output = output.model_copy() + new_output.name = name_env[0] + new_output.env = name_env[1] + result.append(new_output) + return result + + +def generate_action_prompt(action_space: list[Action]) -> str: + return "".join( + [f"[{action.name}: {action.description}]\n" for action in action_space] + ) diff --git a/crab/core/agent_policy.py b/crab/core/agent_policy.py index 7f460ff..baea1ad 100644 --- a/crab/core/agent_policy.py +++ b/crab/core/agent_policy.py @@ -13,14 +13,14 @@ # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== from abc import ABC, abstractmethod -from .models import Action, ActionOutput, MessageType +from .models import Action, ActionOutput, Message class AgentPolicy(ABC): @abstractmethod def chat( self, - observation: dict[str, list[tuple[str, MessageType]]], + observation: dict[str, list[Message]], ) -> list[ActionOutput]: ... @abstractmethod @@ -32,54 +32,7 @@ def reset( ) -> None: ... @abstractmethod - def get_token_usage(self): ... + def get_token_usage(self) -> int: ... @abstractmethod def get_backend_model_name(self) -> str: ... - - @staticmethod - def combine_multi_env_action_space( - action_space: dict[str, list[Action]] | None, - ) -> list[Action]: - """Combine multi-env action space together to fit in a single agent.""" - result = [] - if action_space is None: - return result - for env in action_space: - for action in action_space[env]: - new_action = action.model_copy() - new_action.name = new_action.name + "__in__" + env - new_action.description = ( - f"In {env} environment, " + new_action.description - ) - result.append(new_action) - return result - - @staticmethod - def decode_combined_action( - output_actions: list[ActionOutput], - ) -> list[ActionOutput]: - """Decode combined action output to action output with the corresponding - environment. - """ - result = [] - for output in output_actions: - name_env = output.name.split("__in__") - if len(name_env) != 2: - raise RuntimeError( - 'The decoded action name should contain the splitter "__in__".' 
- ) - new_output = output.model_copy() - new_output.name = name_env[0] - new_output.env = name_env[1] - result.append(new_output) - return result - - @staticmethod - def generate_action_prompt(actions: list[Action] | None): - if actions is None: - return None - result = "" - for action in actions: - result += f"[{action.name}: {action.description}]\n" - return result diff --git a/test/agents/backend_models/test_claude_model.py b/test/agents/backend_models/test_claude_model.py index ace602a..be3ddb8 100644 --- a/test/agents/backend_models/test_claude_model.py +++ b/test/agents/backend_models/test_claude_model.py @@ -14,17 +14,20 @@ import pytest from crab import MessageType, action -from crab.agents.backend_models.claude_model import ClaudeModel +from crab.agents.backend_models import BackendModelConfig, create_backend_model # TODO: Add mock data @pytest.fixture def claude_model_text(): - return ClaudeModel( - model="claude-3-opus-20240229", - parameters={"max_tokens": 3000}, - history_messages_len=1, + return create_backend_model( + BackendModelConfig( + model_class="claude", + model_name="claude-3-opus-20240229", + parameters={"max_tokens": 3000}, + history_messages_len=1, + ) ) @@ -39,7 +42,7 @@ def add(a: int, b: int): return a + b -@pytest.mark.skip(reason="Mock data to be added") +# @pytest.mark.skip(reason="Mock data to be added") def test_text_chat(claude_model_text): message = ("Hello!", MessageType.TEXT) output = claude_model_text.chat(message) @@ -60,7 +63,7 @@ def test_text_chat(claude_model_text): assert len(claude_model_text.chat_history) == 3 -@pytest.mark.skip(reason="Mock data to be added") +# @pytest.mark.skip(reason="Mock data to be added") def test_action_chat(claude_model_text): claude_model_text.reset("You are a helpful assistant.", [add]) message = ( @@ -71,8 +74,8 @@ def test_action_chat(claude_model_text): 0, ) output = claude_model_text.chat(message) - assert output.message is None assert len(output.action_list) == 1 - assert output.action_list[0].arguments == {"a": 10, "b": 15} + args = output.action_list[0].arguments + assert args["a"] + args["b"] == 25 assert output.action_list[0].name == "add" assert claude_model_text.token_usage > 0 diff --git a/test/agents/backend_models/test_gemini_model.py b/test/agents/backend_models/test_gemini_model.py index 86ece01..1ab7877 100644 --- a/test/agents/backend_models/test_gemini_model.py +++ b/test/agents/backend_models/test_gemini_model.py @@ -14,17 +14,21 @@ import pytest from crab import MessageType, action -from crab.agents.backend_models.gemini_model import GeminiModel +from crab.agents.backend_models import BackendModelConfig, create_backend_model # TODO: Add mock data @pytest.fixture def gemini_model_text(): - return GeminiModel( - model="gemini-1.5-pro-latest", - parameters={"max_tokens": 3000}, - history_messages_len=1, + return create_backend_model( + BackendModelConfig( + model_class="gemini", + model_name="gemini-1.5-pro-latest", + parameters={"max_tokens": 3000}, + history_messages_len=1, + tool_call_required=False, + ) ) diff --git a/test/agents/backend_models/test_openai_model.py b/test/agents/backend_models/test_openai_model.py index 51e56ab..57c9b72 100644 --- a/test/agents/backend_models/test_openai_model.py +++ b/test/agents/backend_models/test_openai_model.py @@ -18,10 +18,8 @@ from openai.types.chat.chat_completion_message_tool_call import Function from crab import action -from crab.agents.backend_models.openai_model import ( - MessageType, - OpenAIModel, -) +from crab.agents.backend_models import 
BackendModelConfig, create_backend_model +from crab.agents.backend_models.openai_model import MessageType # Mock data for the OpenAI API response openai_mock_response = MagicMock( @@ -91,10 +89,14 @@ @pytest.fixture def openai_model_text(): os.environ["OPENAI_API_KEY"] = "MOCK" - return OpenAIModel( - model="gpt-4o", - parameters={"max_tokens": 3000}, - history_messages_len=1, + return create_backend_model( + BackendModelConfig( + model_class="openai", + model_name="gpt-4o", + parameters={"max_tokens": 3000}, + history_messages_len=1, + tool_call_required=False, + ) ) diff --git a/test/agents/policies/test_multi_agent_by_func.py b/test/agents/policies/test_multi_agent_by_func.py index d319488..b7e31af 100644 --- a/test/agents/policies/test_multi_agent_by_func.py +++ b/test/agents/policies/test_multi_agent_by_func.py @@ -14,15 +14,16 @@ import pytest from crab import create_benchmark -from crab.agents.backend_models.openai_model import OpenAIModel +from crab.agents.backend_models import BackendModelConfig from crab.agents.policies.multi_agent_by_func import MultiAgentByFuncPolicy from crab.benchmarks.template import multienv_template_benchmark_config @pytest.fixture def policy_fixture(): - model = OpenAIModel( - model="gpt-4o", + model = BackendModelConfig( + model_class="openai", + model_name="gpt-4o", parameters={"max_tokens": 3000}, history_messages_len=1, ) @@ -30,9 +31,11 @@ def policy_fixture(): benchmark = create_benchmark(benchmark_config) task, action_spaces = benchmark.start_task("0") policy = MultiAgentByFuncPolicy( - task_description=task.description, main_agent_model_backend=model, tool_agent_model_backend=model, + ) + policy.reset( + task_description=task.description, action_spaces=action_spaces, env_descriptions=benchmark.get_env_descriptions(), ) diff --git a/test/agents/policies/test_mutli_agent_by_env.py b/test/agents/policies/test_mutli_agent_by_env.py index 1f1e791..318e677 100644 --- a/test/agents/policies/test_mutli_agent_by_env.py +++ b/test/agents/policies/test_mutli_agent_by_env.py @@ -14,15 +14,16 @@ import pytest from crab import create_benchmark -from crab.agents.backend_models.openai_model import OpenAIModel +from crab.agents.backend_models import BackendModelConfig from crab.agents.policies.multi_agent_by_env import MultiAgentByEnvPolicy from crab.benchmarks.template import multienv_template_benchmark_config @pytest.fixture def policy_fixture(): - model = OpenAIModel( - model="gpt-4o", + model = BackendModelConfig( + model_class="openai", + model_name="gpt-4o", parameters={"max_tokens": 3000}, history_messages_len=1, ) @@ -30,9 +31,11 @@ def policy_fixture(): benchmark = create_benchmark(benchmark_config) task, action_spaces = benchmark.start_task("0") policy = MultiAgentByEnvPolicy( - task_description=task.description, main_agent_model_backend=model, env_agent_model_backend=model, + ) + policy.reset( + task_description=task.description, action_spaces=action_spaces, env_descriptions=benchmark.get_env_descriptions(), ) diff --git a/test/agents/policies/test_single_agent.py b/test/agents/policies/test_single_agent.py index 56f0bfa..440893c 100644 --- a/test/agents/policies/test_single_agent.py +++ b/test/agents/policies/test_single_agent.py @@ -26,7 +26,7 @@ ) from crab import create_benchmark -from crab.agents.backend_models.openai_model import OpenAIModel +from crab.agents.backend_models import BackendModelConfig from crab.agents.policies.single_agent import SingleAgentPolicy from crab.benchmarks.template import multienv_template_benchmark_config @@ -75,8 +75,9 
@@ @pytest.fixture def policy_fixture(): os.environ["OPENAI_API_KEY"] = "MOCK" - model = OpenAIModel( - model="gpt-4o", + model = BackendModelConfig( + model_class="openai", + model_name="gpt-4o", parameters={"max_tokens": 3000}, history_messages_len=1, ) From 06f79ee16bc18a63106fc94055943082c9f005f3 Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Wed, 18 Sep 2024 19:42:18 +0300 Subject: [PATCH 06/17] Refactor gemini model and pass mypy --- crab/agents/backend_models/gemini_model.py | 171 +++++++++++---------- pyproject.toml | 2 +- 2 files changed, 91 insertions(+), 82 deletions(-) diff --git a/crab/agents/backend_models/gemini_model.py b/crab/agents/backend_models/gemini_model.py index 26123b6..24d3ea9 100644 --- a/crab/agents/backend_models/gemini_model.py +++ b/crab/agents/backend_models/gemini_model.py @@ -15,12 +15,19 @@ from time import sleep from typing import Any -from crab import Action, ActionOutput, BackendModel, BackendOutput, MessageType +from PIL.Image import Image + +from crab import Action, ActionOutput, BackendModel, BackendOutput, Message, MessageType from crab.utils.common import base64_to_image, json_expand_refs try: import google.generativeai as genai - from google.ai.generativelanguage_v1beta import FunctionDeclaration, Part, Tool + from google.ai.generativelanguage_v1beta import ( + Content, + FunctionDeclaration, + Part, + Tool, + ) from google.api_core.exceptions import ResourceExhausted from google.generativeai.types import content_types @@ -51,51 +58,70 @@ def __init__( def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.system_message = system_message self.action_space = action_space - self.action_schema = self._convert_action_to_schema(self.action_space) + self.action_schema = _convert_action_to_schema(self.action_space) self.token_usage = 0 - self.chat_history = [] - - def chat(self, message: list[tuple[str, MessageType]]) -> BackendOutput: - # Initialize chat history - request = [] - if self.history_messages_len > 0 and len(self.chat_history) > 0: - for history_message in self.chat_history[-self.history_messages_len :]: - request = request + history_message + self.chat_history: list[list[dict]] = [] - if not isinstance(message, list): + def chat(self, message: list[Message] | Message) -> BackendOutput: + if isinstance(message, tuple): message = [message] - - new_message = { - "role": "user", - "parts": [self._convert_message(part) for part in message], - } + request = self.fetch_from_memory() + new_message = self.construct_new_message(message) request.append(new_message) - - response = self.call_api(request) - response_message = response.candidates[0].content + response_message = self.call_api(request) self.record_message(new_message, response_message) + return self.generate_backend_output(response_message) + + def construct_new_message(self, message: list[Message]) -> dict[str, Any]: + parts: list[str | Image] = [] + for content, msg_type in message: + match msg_type: + case MessageType.TEXT: + parts.append(content) + case MessageType.IMAGE_JPG_BASE64: + parts.append(base64_to_image(content)) + return { + "role": "user", + "parts": parts, + } - tool_calls = [ - Part.to_dict(part)["function_call"] - for part in response.parts - if "function_call" in Part.to_dict(part) - ] + def generate_backend_output(self, response_message: Content) -> BackendOutput: + tool_calls: list[ActionOutput] = [] + for part in response_message.parts: + if "function_call" in Part.to_dict(part): + call = Part.to_dict(part)["function_call"] + 
tool_calls.append( + ActionOutput( + name=call["name"], + arguments=call["args"], + ) + ) return BackendOutput( message=response_message.parts[0].text or None, - action_list=self._convert_tool_calls_to_action_list(tool_calls), + action_list=tool_calls or None, ) + def fetch_from_memory(self) -> list[dict]: + request: list[dict] = [] + if self.history_messages_len > 0: + fetch_history_len = min(self.history_messages_len, len(self.chat_history)) + for history_message in self.chat_history[-fetch_history_len:]: + request = request + history_message + return request + def get_token_usage(self): return self.token_usage - def record_message(self, new_message: dict, response_message: dict) -> None: + def record_message( + self, new_message: dict[str, Any], response_message: Content + ) -> None: self.chat_history.append([new_message]) self.chat_history[-1].append( {"role": response_message.role, "parts": response_message.parts} ) - def call_api(self, request_messages: list): + def call_api(self, request_messages: list) -> Content: while True: try: if self.action_schema is not None: @@ -131,58 +157,41 @@ def call_api(self, request_messages: list): break self.token_usage += response.candidates[0].token_count - return response - - @staticmethod - def _convert_message(message: tuple[str, MessageType]): - match message[1]: - case MessageType.TEXT: - return message[0] - case MessageType.IMAGE_JPG_BASE64: - return base64_to_image(message[0]) - - @classmethod - def _convert_action_to_schema(cls, action_space): - if action_space is None: - return None - actions = [] - for action in action_space: - actions.append(Tool(function_declarations=[cls._action_to_funcdec(action)])) - return actions - - @staticmethod - def _convert_tool_calls_to_action_list(tool_calls) -> list[ActionOutput]: - if tool_calls: - return [ - ActionOutput( - name=call["name"], - arguments=call["args"], - ) - for call in tool_calls + return response.candidates[0].content + + +def _convert_action_to_schema(action_space: list[Action] | None) -> list[Tool] | None: + if action_space is None: + return None + actions = [ + Tool( + function_declarations=[ + _action_to_funcdec(action) for action in action_space ] - else: - return None - - @classmethod - def _clear_schema(cls, schema_dict: dict): - schema_dict.pop("title", None) - p_type = schema_dict.pop("type", None) - for prop in schema_dict.get("properties", {}).values(): - cls._clear_schema(prop) - if p_type is not None: - schema_dict["type_"] = p_type.upper() - if "items" in schema_dict: - cls._clear_schema(schema_dict["items"]) - - @classmethod - def _action_to_funcdec(cls, action: Action) -> FunctionDeclaration: - "Converts crab Action to google FunctionDeclaration" - p_schema = action.parameters.model_json_schema() - if "$defs" in p_schema: - p_schema = 
json_expand_refs(p_schema) + _clear_schema(p_schema) + return FunctionDeclaration( + name=action.name, + description=action.description, + parameters=p_schema, + ) diff --git a/pyproject.toml b/pyproject.toml index f22670d..0019855 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -118,5 +118,5 @@ lint.ignore = ["E731"] exclude = ["docs/"] [[tool.mypy.overrides]] -module = ["dill", "easyocr"] +module = ["dill", "easyocr", "google.generativeai.*"] ignore_missing_imports = true From 943011b88d985b4572482bd4acc52a423a01a28b Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Wed, 18 Sep 2024 19:50:19 +0300 Subject: [PATCH 07/17] Replace try with tenacity.retry --- crab/agents/backend_models/gemini_model.py | 60 ++-- poetry.lock | 332 ++++++++++----------- pyproject.toml | 1 + 3 files changed, 186 insertions(+), 207 deletions(-) diff --git a/crab/agents/backend_models/gemini_model.py b/crab/agents/backend_models/gemini_model.py index 24d3ea9..e33d16b 100644 --- a/crab/agents/backend_models/gemini_model.py +++ b/crab/agents/backend_models/gemini_model.py @@ -12,10 +12,10 @@ # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== import os -from time import sleep from typing import Any from PIL.Image import Image +from tenacity import retry, stop_after_attempt, wait_fixed from crab import Action, ActionOutput, BackendModel, BackendOutput, Message, MessageType from crab.utils.common import base64_to_image, json_expand_refs @@ -28,7 +28,6 @@ Part, Tool, ) - from google.api_core.exceptions import ResourceExhausted from google.generativeai.types import content_types gemini_model_enable = True @@ -121,40 +120,31 @@ def record_message( {"role": response_message.role, "parts": response_message.parts} ) + @retry(wait=wait_fixed(10), stop=stop_after_attempt(7)) def call_api(self, request_messages: list) -> Content: - while True: - try: - if self.action_schema is not None: - tool_config = content_types.to_tool_config( - { - "function_calling_config": { - "mode": "ANY" if self.tool_call_required else "AUTO" - } - } - ) - response = self.client.GenerativeModel( - self.model, system_instruction=self.system_message - ).generate_content( - contents=request_messages, - tools=self.action_schema, - tool_config=tool_config, - # **self.parameters, - ) - else: - response = self.client.GenerativeModel( - self.model, system_instruction=self.system_message - ).generate_content( - contents=request_messages, - # **self.parameters, - ) - except ResourceExhausted: - print( - "ResourceExhausted: 429 Resource has been exhausted.", - " Please waiting...", - ) - sleep(10) - else: - break + if self.action_schema is not None: + tool_config = content_types.to_tool_config( + { + "function_calling_config": { + "mode": "ANY" if self.tool_call_required else "AUTO" + } + } + ) + response = self.client.GenerativeModel( + self.model, system_instruction=self.system_message + ).generate_content( + contents=request_messages, + tools=self.action_schema, + tool_config=tool_config, + # **self.parameters, # TODO(Tianqi): Fix this line in the future + ) + else: + response = self.client.GenerativeModel( + self.model, system_instruction=self.system_message + ).generate_content( + contents=request_messages, + # **self.parameters, # TODO(Tianqi): Fix this line in the future + ) self.token_usage += response.candidates[0].token_count return response.candidates[0].content diff --git a/poetry.lock b/poetry.lock index e0b737b..fb54a24 100644 --- a/poetry.lock +++ b/poetry.lock @@ -326,19 +326,6 @@ 
files = [ {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] -[[package]] -name = "asyncio" -version = "3.4.3" -description = "reference implementation of PEP 3156" -optional = true -python-versions = "*" -files = [ - {file = "asyncio-3.4.3-cp33-none-win32.whl", hash = "sha256:b62c9157d36187eca799c378e572c969f0da87cd5fc42ca372d92cdb06e7e1de"}, - {file = "asyncio-3.4.3-cp33-none-win_amd64.whl", hash = "sha256:c46a87b48213d7464f22d9a497b9eef8c1928b68320a2fa94240f969f6fec08c"}, - {file = "asyncio-3.4.3-py3-none-any.whl", hash = "sha256:c4d18b22701821de07bd6aea8b53d21449ec0ec5680645e5317062ea21817d2d"}, - {file = "asyncio-3.4.3.tar.gz", hash = "sha256:83360ff8bc97980e4ff25c964c7bd3923d333d177aa4f7fb736b019f26c7cb41"}, -] - [[package]] name = "attrs" version = "24.2.0" @@ -431,23 +418,23 @@ aio = ["aiohttp (>=3.0)"] [[package]] name = "azure-storage-blob" -version = "12.22.0" +version = "12.23.0" description = "Microsoft Azure Blob Storage Client Library for Python" optional = true python-versions = ">=3.8" files = [ - {file = "azure-storage-blob-12.22.0.tar.gz", hash = "sha256:b3804bb4fe8ab1c32771fa464053da772a682c2737b19da438a3f4e5e3b3736e"}, - {file = "azure_storage_blob-12.22.0-py3-none-any.whl", hash = "sha256:bb7d2d824ce3f11f14a27ee7d9281289f7e072ac8311c52e3652672455b7d5e8"}, + {file = "azure_storage_blob-12.23.0-py3-none-any.whl", hash = "sha256:8ac4b34624ed075eda1e38f0c6dadb601e1b199e27a09aa63edc429bf4a23329"}, + {file = "azure_storage_blob-12.23.0.tar.gz", hash = "sha256:2fadbceda1d99c4a72dfd32e0122d7bca8b5e8d2563f5c624d634aeaff49c9df"}, ] [package.dependencies] -azure-core = ">=1.28.0" +azure-core = ">=1.30.0" cryptography = ">=2.1.4" isodate = ">=0.6.1" typing-extensions = ">=4.6.0" [package.extras] -aio = ["azure-core[aio] (>=1.28.0)"] +aio = ["azure-core[aio] (>=1.30.0)"] [[package]] name = "babel" @@ -512,13 +499,13 @@ lxml = ["lxml"] [[package]] name = "botocore" -version = "1.35.19" +version = "1.35.21" description = "Low-level, data-driven core of boto 3." optional = true python-versions = ">=3.8" files = [ - {file = "botocore-1.35.19-py3-none-any.whl", hash = "sha256:c83f7f0cacfe7c19b109b363ebfa8736e570d24922f16ed371681f58ebab44a9"}, - {file = "botocore-1.35.19.tar.gz", hash = "sha256:42d6d8db7250cbd7899f786f9861e02cab17dc238f64d6acb976098ed9809625"}, + {file = "botocore-1.35.21-py3-none-any.whl", hash = "sha256:3db9ddfe521edc0753fc8c68caef71c7806e1d2d21ce8cbabc2065b7d79192f2"}, + {file = "botocore-1.35.21.tar.gz", hash = "sha256:db917e7d7b3a2eed1310c6496784bc813c91f020a021c2ab5f9df7d28cdb4f1d"}, ] [package.dependencies] @@ -1230,13 +1217,13 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] [[package]] name = "diffusers" -version = "0.30.2" +version = "0.30.3" description = "State-of-the-art diffusion in PyTorch and JAX." 
optional = true python-versions = ">=3.8.0" files = [ - {file = "diffusers-0.30.2-py3-none-any.whl", hash = "sha256:739826043147c2b59560944591dfdea5d24cd4fb15e751abbe20679a289bece8"}, - {file = "diffusers-0.30.2.tar.gz", hash = "sha256:641875f78f36bdfa4b9af752b124d1fd6d431eadd5547fe0a3f354ae0af2636c"}, + {file = "diffusers-0.30.3-py3-none-any.whl", hash = "sha256:1b70209e4d2c61223b96a7e13bc4d70869c8b0b68f54a35ce3a67fcf813edeee"}, + {file = "diffusers-0.30.3.tar.gz", hash = "sha256:67c5eb25d5b50bf0742624ef43fe0f6d1e1604f64aad3e8558469cbe89ecf72f"}, ] [package.dependencies] @@ -1663,18 +1650,18 @@ sgmllib3k = "*" [[package]] name = "filelock" -version = "3.16.0" +version = "3.16.1" description = "A platform independent file lock." optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.16.0-py3-none-any.whl", hash = "sha256:f6ed4c963184f4c84dd5557ce8fece759a3724b37b80c6c4f20a2f63a4dc6609"}, - {file = "filelock-3.16.0.tar.gz", hash = "sha256:81de9eb8453c769b63369f87f11131a7ab04e367f8d97ad39dc230daa07e3bec"}, + {file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"}, + {file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"}, ] [package.extras] -docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.1.1)", "pytest (>=8.3.2)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.3)"] +docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4.1)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"] typing = ["typing-extensions (>=4.12.2)"] [[package]] @@ -1690,17 +1677,16 @@ files = [ [[package]] name = "firecrawl-py" -version = "1.2.3" +version = "1.2.4" description = "Python SDK for Firecrawl API" optional = true python-versions = ">=3.8" files = [ - {file = "firecrawl_py-1.2.3-py3-none-any.whl", hash = "sha256:1ce6a7a4c885f6969a4be7e2da1756aebe824486daf96b7f0b0f4d78326110a0"}, - {file = "firecrawl_py-1.2.3.tar.gz", hash = "sha256:0e454552bbd3c97f52dfca2d278cdd3af0d1841fdca8daa116db6827f35e5343"}, + {file = "firecrawl_py-1.2.4-py3-none-any.whl", hash = "sha256:0464992f354f4f7830dc29433dacad127a9cd73e331601c719f811df70bace58"}, + {file = "firecrawl_py-1.2.4.tar.gz", hash = "sha256:bff3cfbce725739f6d7d7f8975b43be392f17c844f601485f19c2ddcf2b4f8de"}, ] [package.dependencies] -asyncio = "*" nest-asyncio = "*" python-dotenv = "*" requests = "*" @@ -1971,13 +1957,13 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] [[package]] name = "google-api-python-client" -version = "2.145.0" +version = "2.146.0" description = "Google API Client Library for Python" optional = true python-versions = ">=3.7" files = [ - {file = "google_api_python_client-2.145.0-py2.py3-none-any.whl", hash = "sha256:d74da1358f3f2d63daf3c6f26bd96d89652051183bc87cf10a56ceb2a70beb50"}, - {file = "google_api_python_client-2.145.0.tar.gz", hash = "sha256:8b84dde11aaccadc127e4846f5cd932331d804ea324e353131595e3f25376e97"}, + {file = "google_api_python_client-2.146.0-py2.py3-none-any.whl", hash = "sha256:b1e62c9889c5ef6022f11d30d7ef23dc55100300f0e8aaf8aa09e8e92540acad"}, + {file = "google_api_python_client-2.146.0.tar.gz", hash = 
"sha256:41f671be10fa077ee5143ee9f0903c14006d39dc644564f4e044ae96b380bf68"}, ] [package.dependencies] @@ -2490,13 +2476,13 @@ files = [ [[package]] name = "huggingface-hub" -version = "0.24.7" +version = "0.25.0" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = true python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.24.7-py3-none-any.whl", hash = "sha256:a212c555324c8a7b1ffdd07266bb7e7d69ca71aa238d27b7842d65e9a26ac3e5"}, - {file = "huggingface_hub-0.24.7.tar.gz", hash = "sha256:0ad8fb756e2831da0ac0491175b960f341fe06ebcf80ed6f8728313f95fc0207"}, + {file = "huggingface_hub-0.25.0-py3-none-any.whl", hash = "sha256:e2f357b35d72d5012cfd127108c4e14abcd61ba4ebc90a5a374dc2456cb34e12"}, + {file = "huggingface_hub-0.25.0.tar.gz", hash = "sha256:fb5fbe6c12fcd99d187ec7db95db9110fb1a20505f23040a5449a717c1a0db4d"}, ] [package.dependencies] @@ -2985,13 +2971,13 @@ referencing = ">=0.31.0" [[package]] name = "jupyter-client" -version = "8.6.2" +version = "8.6.3" description = "Jupyter protocol implementation and client libraries" optional = false python-versions = ">=3.8" files = [ - {file = "jupyter_client-8.6.2-py3-none-any.whl", hash = "sha256:50cbc5c66fd1b8f65ecb66bc490ab73217993632809b6e505687de18e9dea39f"}, - {file = "jupyter_client-8.6.2.tar.gz", hash = "sha256:2bda14d55ee5ba58552a8c53ae43d215ad9868853489213f37da060ced54d8df"}, + {file = "jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f"}, + {file = "jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419"}, ] [package.dependencies] @@ -3263,13 +3249,13 @@ files = [ [[package]] name = "litellm" -version = "1.46.0" +version = "1.46.4" description = "Library to easily interface with LLM API providers" optional = true python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8" files = [ - {file = "litellm-1.46.0-py3-none-any.whl", hash = "sha256:40209dc6368677d03b21b2c9d9cb91937c9648f741d42bb5a8f992a1cd31fb42"}, - {file = "litellm-1.46.0.tar.gz", hash = "sha256:6707eb4b17a2eca714f81261c3b6f33297cd25470c4843b8297e345ebdff0560"}, + {file = "litellm-1.46.4-py3-none-any.whl", hash = "sha256:6c1410b50aa7e4deff05965aa270bbe3207d5d1d59979b13c62dc7ba6e24f329"}, + {file = "litellm-1.46.4.tar.gz", hash = "sha256:b5a2d5b1425cd0246fd3e3932ea54dbb82433d8f9bc2f75f5e9e2fb6f3e10c1e"}, ] [package.dependencies] @@ -3706,20 +3692,21 @@ tqdm = "*" [[package]] name = "mistralai" -version = "1.0.3" +version = "1.1.0" description = "Python Client SDK for the Mistral AI API." 
optional = true python-versions = "<4.0,>=3.8" files = [ - {file = "mistralai-1.0.3-py3-none-any.whl", hash = "sha256:64af7c9192e64dc66b2da6d1c4d54a1324a881c21665a2f93d6b35d9de9f87c8"}, - {file = "mistralai-1.0.3.tar.gz", hash = "sha256:84f1a217666c76fec9d477ae266399b813c3ac32a4a348d2ecd5fe1c039b0667"}, + {file = "mistralai-1.1.0-py3-none-any.whl", hash = "sha256:eea0938975195f331d0ded12d14e3c982f09f1b68210200ed4ff0c6b9b22d0fb"}, + {file = "mistralai-1.1.0.tar.gz", hash = "sha256:9d1fe778e0e8c6ddab714e6a64c6096bd39cfe119ff38ceb5019d8e089df08ba"}, ] [package.dependencies] +eval-type-backport = ">=0.2.0,<0.3.0" httpx = ">=0.27.0,<0.28.0" jsonpath-python = ">=1.0.6,<2.0.0" -pydantic = ">=2.8.2,<2.9.0" -python-dateutil = ">=2.9.0.post0,<3.0.0" +pydantic = ">=2.9.0,<3.0.0" +python-dateutil = "2.8.2" typing-inspect = ">=0.9.0,<0.10.0" [package.extras] @@ -4495,13 +4482,13 @@ sympy = "*" [[package]] name = "openai" -version = "1.45.1" +version = "1.46.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-1.45.1-py3-none-any.whl", hash = "sha256:4a6cce402aec803ae57ae7eff4b5b94bf6c0e1703a8d85541c27243c2adeadf8"}, - {file = "openai-1.45.1.tar.gz", hash = "sha256:f79e384916b219ab2f028bbf9c778e81291c61eb0645ccfa1828a4b18b55d534"}, + {file = "openai-1.46.0-py3-none-any.whl", hash = "sha256:8e423690b121d0268c7bb83b552e14f339b0ba250e1d0f70d145c194e79c4e1b"}, + {file = "openai-1.46.0.tar.gz", hash = "sha256:0c5a783530d7cd90e2370dbd52d9239d2d53dc7a0badf9ee1e2e23d3f148969b"}, ] [package.dependencies] @@ -4973,13 +4960,13 @@ xmp = ["defusedxml"] [[package]] name = "platformdirs" -version = "4.3.3" +version = "4.3.6" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.8" files = [ - {file = "platformdirs-4.3.3-py3-none-any.whl", hash = "sha256:50a5450e2e84f44539718293cbb1da0a0885c9d14adf21b77bae4e66fc99d9b5"}, - {file = "platformdirs-4.3.3.tar.gz", hash = "sha256:d4e0b7d8ec176b341fb03cb11ca12d0276faa8c485f9cd218f613840463fc2c0"}, + {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, + {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, ] [package.extras] @@ -5481,119 +5468,120 @@ files = [ [[package]] name = "pydantic" -version = "2.8.2" +version = "2.9.2" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.8.2-py3-none-any.whl", hash = "sha256:73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8"}, - {file = "pydantic-2.8.2.tar.gz", hash = "sha256:6f62c13d067b0755ad1c21a34bdd06c0c12625a22b0fc09c6b149816604f7c2a"}, + {file = "pydantic-2.9.2-py3-none-any.whl", hash = "sha256:f048cec7b26778210e28a0459867920654d48e5e62db0958433636cde4254f12"}, + {file = "pydantic-2.9.2.tar.gz", hash = "sha256:d155cef71265d1e9807ed1c32b4c8deec042a44a50a4188b25ac67ecd81a9c0f"}, ] [package.dependencies] -annotated-types = ">=0.4.0" -pydantic-core = "2.20.1" +annotated-types = ">=0.6.0" +pydantic-core = "2.23.4" typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""} [package.extras] email = ["email-validator (>=2.0.0)"] +timezone = ["tzdata"] [[package]] name = "pydantic-core" -version = "2.20.1" +version = "2.23.4" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3acae97ffd19bf091c72df4d726d552c473f3576409b2a7ca36b2f535ffff4a3"}, - {file = "pydantic_core-2.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:41f4c96227a67a013e7de5ff8f20fb496ce573893b7f4f2707d065907bffdbd6"}, - {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f239eb799a2081495ea659d8d4a43a8f42cd1fe9ff2e7e436295c38a10c286a"}, - {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53e431da3fc53360db73eedf6f7124d1076e1b4ee4276b36fb25514544ceb4a3"}, - {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1f62b2413c3a0e846c3b838b2ecd6c7a19ec6793b2a522745b0869e37ab5bc1"}, - {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d41e6daee2813ecceea8eda38062d69e280b39df793f5a942fa515b8ed67953"}, - {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d482efec8b7dc6bfaedc0f166b2ce349df0011f5d2f1f25537ced4cfc34fd98"}, - {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e93e1a4b4b33daed65d781a57a522ff153dcf748dee70b40c7258c5861e1768a"}, - {file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e7c4ea22b6739b162c9ecaaa41d718dfad48a244909fe7ef4b54c0b530effc5a"}, - {file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4f2790949cf385d985a31984907fecb3896999329103df4e4983a4a41e13e840"}, - {file = "pydantic_core-2.20.1-cp310-none-win32.whl", hash = 
"sha256:5e999ba8dd90e93d57410c5e67ebb67ffcaadcea0ad973240fdfd3a135506250"}, - {file = "pydantic_core-2.20.1-cp310-none-win_amd64.whl", hash = "sha256:512ecfbefef6dac7bc5eaaf46177b2de58cdf7acac8793fe033b24ece0b9566c"}, - {file = "pydantic_core-2.20.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d2a8fa9d6d6f891f3deec72f5cc668e6f66b188ab14bb1ab52422fe8e644f312"}, - {file = "pydantic_core-2.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:175873691124f3d0da55aeea1d90660a6ea7a3cfea137c38afa0a5ffabe37b88"}, - {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37eee5b638f0e0dcd18d21f59b679686bbd18917b87db0193ae36f9c23c355fc"}, - {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25e9185e2d06c16ee438ed39bf62935ec436474a6ac4f9358524220f1b236e43"}, - {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:150906b40ff188a3260cbee25380e7494ee85048584998c1e66df0c7a11c17a6"}, - {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ad4aeb3e9a97286573c03df758fc7627aecdd02f1da04516a86dc159bf70121"}, - {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3f3ed29cd9f978c604708511a1f9c2fdcb6c38b9aae36a51905b8811ee5cbf1"}, - {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b0dae11d8f5ded51699c74d9548dcc5938e0804cc8298ec0aa0da95c21fff57b"}, - {file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:faa6b09ee09433b87992fb5a2859efd1c264ddc37280d2dd5db502126d0e7f27"}, - {file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9dc1b507c12eb0481d071f3c1808f0529ad41dc415d0ca11f7ebfc666e66a18b"}, - {file = "pydantic_core-2.20.1-cp311-none-win32.whl", hash = "sha256:fa2fddcb7107e0d1808086ca306dcade7df60a13a6c347a7acf1ec139aa6789a"}, - {file = "pydantic_core-2.20.1-cp311-none-win_amd64.whl", hash = "sha256:40a783fb7ee353c50bd3853e626f15677ea527ae556429453685ae32280c19c2"}, - {file = "pydantic_core-2.20.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:595ba5be69b35777474fa07f80fc260ea71255656191adb22a8c53aba4479231"}, - {file = "pydantic_core-2.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a4f55095ad087474999ee28d3398bae183a66be4823f753cd7d67dd0153427c9"}, - {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9aa05d09ecf4c75157197f27cdc9cfaeb7c5f15021c6373932bf3e124af029f"}, - {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e97fdf088d4b31ff4ba35db26d9cc472ac7ef4a2ff2badeabf8d727b3377fc52"}, - {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc633a9fe1eb87e250b5c57d389cf28998e4292336926b0b6cdaee353f89a237"}, - {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d573faf8eb7e6b1cbbcb4f5b247c60ca8be39fe2c674495df0eb4318303137fe"}, - {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26dc97754b57d2fd00ac2b24dfa341abffc380b823211994c4efac7f13b9e90e"}, - {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:33499e85e739a4b60c9dac710c20a08dc73cb3240c9a0e22325e671b27b70d24"}, - {file = 
"pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bebb4d6715c814597f85297c332297c6ce81e29436125ca59d1159b07f423eb1"}, - {file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:516d9227919612425c8ef1c9b869bbbee249bc91912c8aaffb66116c0b447ebd"}, - {file = "pydantic_core-2.20.1-cp312-none-win32.whl", hash = "sha256:469f29f9093c9d834432034d33f5fe45699e664f12a13bf38c04967ce233d688"}, - {file = "pydantic_core-2.20.1-cp312-none-win_amd64.whl", hash = "sha256:035ede2e16da7281041f0e626459bcae33ed998cca6a0a007a5ebb73414ac72d"}, - {file = "pydantic_core-2.20.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:0827505a5c87e8aa285dc31e9ec7f4a17c81a813d45f70b1d9164e03a813a686"}, - {file = "pydantic_core-2.20.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:19c0fa39fa154e7e0b7f82f88ef85faa2a4c23cc65aae2f5aea625e3c13c735a"}, - {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa223cd1e36b642092c326d694d8bf59b71ddddc94cdb752bbbb1c5c91d833b"}, - {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c336a6d235522a62fef872c6295a42ecb0c4e1d0f1a3e500fe949415761b8a19"}, - {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7eb6a0587eded33aeefea9f916899d42b1799b7b14b8f8ff2753c0ac1741edac"}, - {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70c8daf4faca8da5a6d655f9af86faf6ec2e1768f4b8b9d0226c02f3d6209703"}, - {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9fa4c9bf273ca41f940bceb86922a7667cd5bf90e95dbb157cbb8441008482c"}, - {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:11b71d67b4725e7e2a9f6e9c0ac1239bbc0c48cce3dc59f98635efc57d6dac83"}, - {file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:270755f15174fb983890c49881e93f8f1b80f0b5e3a3cc1394a255706cabd203"}, - {file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c81131869240e3e568916ef4c307f8b99583efaa60a8112ef27a366eefba8ef0"}, - {file = "pydantic_core-2.20.1-cp313-none-win32.whl", hash = "sha256:b91ced227c41aa29c672814f50dbb05ec93536abf8f43cd14ec9521ea09afe4e"}, - {file = "pydantic_core-2.20.1-cp313-none-win_amd64.whl", hash = "sha256:65db0f2eefcaad1a3950f498aabb4875c8890438bc80b19362cf633b87a8ab20"}, - {file = "pydantic_core-2.20.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4745f4ac52cc6686390c40eaa01d48b18997cb130833154801a442323cc78f91"}, - {file = "pydantic_core-2.20.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a8ad4c766d3f33ba8fd692f9aa297c9058970530a32c728a2c4bfd2616d3358b"}, - {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41e81317dd6a0127cabce83c0c9c3fbecceae981c8391e6f1dec88a77c8a569a"}, - {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04024d270cf63f586ad41fff13fde4311c4fc13ea74676962c876d9577bcc78f"}, - {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eaad4ff2de1c3823fddf82f41121bdf453d922e9a238642b1dedb33c4e4f98ad"}, - {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26ab812fa0c845df815e506be30337e2df27e88399b985d0bb4e3ecfe72df31c"}, - {file = 
"pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c5ebac750d9d5f2706654c638c041635c385596caf68f81342011ddfa1e5598"}, - {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2aafc5a503855ea5885559eae883978c9b6d8c8993d67766ee73d82e841300dd"}, - {file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4868f6bd7c9d98904b748a2653031fc9c2f85b6237009d475b1008bfaeb0a5aa"}, - {file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aa2f457b4af386254372dfa78a2eda2563680d982422641a85f271c859df1987"}, - {file = "pydantic_core-2.20.1-cp38-none-win32.whl", hash = "sha256:225b67a1f6d602de0ce7f6c1c3ae89a4aa25d3de9be857999e9124f15dab486a"}, - {file = "pydantic_core-2.20.1-cp38-none-win_amd64.whl", hash = "sha256:6b507132dcfc0dea440cce23ee2182c0ce7aba7054576efc65634f080dbe9434"}, - {file = "pydantic_core-2.20.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:b03f7941783b4c4a26051846dea594628b38f6940a2fdc0df00b221aed39314c"}, - {file = "pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1eedfeb6089ed3fad42e81a67755846ad4dcc14d73698c120a82e4ccf0f1f9f6"}, - {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:635fee4e041ab9c479e31edda27fcf966ea9614fff1317e280d99eb3e5ab6fe2"}, - {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:77bf3ac639c1ff567ae3b47f8d4cc3dc20f9966a2a6dd2311dcc055d3d04fb8a"}, - {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ed1b0132f24beeec5a78b67d9388656d03e6a7c837394f99257e2d55b461611"}, - {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6514f963b023aeee506678a1cf821fe31159b925c4b76fe2afa94cc70b3222b"}, - {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10d4204d8ca33146e761c79f83cc861df20e7ae9f6487ca290a97702daf56006"}, - {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2d036c7187b9422ae5b262badb87a20a49eb6c5238b2004e96d4da1231badef1"}, - {file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9ebfef07dbe1d93efb94b4700f2d278494e9162565a54f124c404a5656d7ff09"}, - {file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6b9d9bb600328a1ce523ab4f454859e9d439150abb0906c5a1983c146580ebab"}, - {file = "pydantic_core-2.20.1-cp39-none-win32.whl", hash = "sha256:784c1214cb6dd1e3b15dd8b91b9a53852aed16671cc3fbe4786f4f1db07089e2"}, - {file = "pydantic_core-2.20.1-cp39-none-win_amd64.whl", hash = "sha256:d2fe69c5434391727efa54b47a1e7986bb0186e72a41b203df8f5b0a19a4f669"}, - {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a45f84b09ac9c3d35dfcf6a27fd0634d30d183205230a0ebe8373a0e8cfa0906"}, - {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d02a72df14dfdbaf228424573a07af10637bd490f0901cee872c4f434a735b94"}, - {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2b27e6af28f07e2f195552b37d7d66b150adbaa39a6d327766ffd695799780f"}, - {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:084659fac3c83fd674596612aeff6041a18402f1e1bc19ca39e417d554468482"}, - {file = 
"pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:242b8feb3c493ab78be289c034a1f659e8826e2233786e36f2893a950a719bb6"}, - {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:38cf1c40a921d05c5edc61a785c0ddb4bed67827069f535d794ce6bcded919fc"}, - {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e0bbdd76ce9aa5d4209d65f2b27fc6e5ef1312ae6c5333c26db3f5ade53a1e99"}, - {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:254ec27fdb5b1ee60684f91683be95e5133c994cc54e86a0b0963afa25c8f8a6"}, - {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:407653af5617f0757261ae249d3fba09504d7a71ab36ac057c938572d1bc9331"}, - {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:c693e916709c2465b02ca0ad7b387c4f8423d1db7b4649c551f27a529181c5ad"}, - {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b5ff4911aea936a47d9376fd3ab17e970cc543d1b68921886e7f64bd28308d1"}, - {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:177f55a886d74f1808763976ac4efd29b7ed15c69f4d838bbd74d9d09cf6fa86"}, - {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:964faa8a861d2664f0c7ab0c181af0bea66098b1919439815ca8803ef136fc4e"}, - {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4dd484681c15e6b9a977c785a345d3e378d72678fd5f1f3c0509608da24f2ac0"}, - {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f6d6cff3538391e8486a431569b77921adfcdef14eb18fbf19b7c0a5294d4e6a"}, - {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a6d511cc297ff0883bc3708b465ff82d7560193169a8b93260f74ecb0a5e08a7"}, - {file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"}, + {file = "pydantic_core-2.23.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b10bd51f823d891193d4717448fab065733958bdb6a6b351967bd349d48d5c9b"}, + {file = "pydantic_core-2.23.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4fc714bdbfb534f94034efaa6eadd74e5b93c8fa6315565a222f7b6f42ca1166"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63e46b3169866bd62849936de036f901a9356e36376079b05efa83caeaa02ceb"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed1a53de42fbe34853ba90513cea21673481cd81ed1be739f7f2efb931b24916"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cfdd16ab5e59fc31b5e906d1a3f666571abc367598e3e02c83403acabc092e07"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255a8ef062cbf6674450e668482456abac99a5583bbafb73f9ad469540a3a232"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a7cd62e831afe623fbb7aabbb4fe583212115b3ef38a9f6b71869ba644624a2"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f09e2ff1f17c2b51f2bc76d1cc33da96298f0a036a137f5440ab3ec5360b624f"}, + {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:e38e63e6f3d1cec5a27e0afe90a085af8b6806ee208b33030e65b6516353f1a3"}, + {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0dbd8dbed2085ed23b5c04afa29d8fd2771674223135dc9bc937f3c09284d071"}, + {file = "pydantic_core-2.23.4-cp310-none-win32.whl", hash = "sha256:6531b7ca5f951d663c339002e91aaebda765ec7d61b7d1e3991051906ddde119"}, + {file = "pydantic_core-2.23.4-cp310-none-win_amd64.whl", hash = "sha256:7c9129eb40958b3d4500fa2467e6a83356b3b61bfff1b414c7361d9220f9ae8f"}, + {file = "pydantic_core-2.23.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:77733e3892bb0a7fa797826361ce8a9184d25c8dffaec60b7ffe928153680ba8"}, + {file = "pydantic_core-2.23.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b84d168f6c48fabd1f2027a3d1bdfe62f92cade1fb273a5d68e621da0e44e6d"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df49e7a0861a8c36d089c1ed57d308623d60416dab2647a4a17fe050ba85de0e"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ff02b6d461a6de369f07ec15e465a88895f3223eb75073ffea56b84d9331f607"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:996a38a83508c54c78a5f41456b0103c30508fed9abcad0a59b876d7398f25fd"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d97683ddee4723ae8c95d1eddac7c192e8c552da0c73a925a89fa8649bf13eea"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:216f9b2d7713eb98cb83c80b9c794de1f6b7e3145eef40400c62e86cee5f4e1e"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6f783e0ec4803c787bcea93e13e9932edab72068f68ecffdf86a99fd5918878b"}, + {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d0776dea117cf5272382634bd2a5c1b6eb16767c223c6a5317cd3e2a757c61a0"}, + {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d5f7a395a8cf1621939692dba2a6b6a830efa6b3cee787d82c7de1ad2930de64"}, + {file = "pydantic_core-2.23.4-cp311-none-win32.whl", hash = "sha256:74b9127ffea03643e998e0c5ad9bd3811d3dac8c676e47db17b0ee7c3c3bf35f"}, + {file = "pydantic_core-2.23.4-cp311-none-win_amd64.whl", hash = "sha256:98d134c954828488b153d88ba1f34e14259284f256180ce659e8d83e9c05eaa3"}, + {file = "pydantic_core-2.23.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f3e0da4ebaef65158d4dfd7d3678aad692f7666877df0002b8a522cdf088f231"}, + {file = "pydantic_core-2.23.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f69a8e0b033b747bb3e36a44e7732f0c99f7edd5cea723d45bc0d6e95377ffee"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:723314c1d51722ab28bfcd5240d858512ffd3116449c557a1336cbe3919beb87"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bb2802e667b7051a1bebbfe93684841cc9351004e2badbd6411bf357ab8d5ac8"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d18ca8148bebe1b0a382a27a8ee60350091a6ddaf475fa05ef50dc35b5df6327"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33e3d65a85a2a4a0dc3b092b938a4062b1a05f3a9abde65ea93b233bca0e03f2"}, + {file = 
"pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:128585782e5bfa515c590ccee4b727fb76925dd04a98864182b22e89a4e6ed36"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:68665f4c17edcceecc112dfed5dbe6f92261fb9d6054b47d01bf6371a6196126"}, + {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:20152074317d9bed6b7a95ade3b7d6054845d70584216160860425f4fbd5ee9e"}, + {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9261d3ce84fa1d38ed649c3638feefeae23d32ba9182963e465d58d62203bd24"}, + {file = "pydantic_core-2.23.4-cp312-none-win32.whl", hash = "sha256:4ba762ed58e8d68657fc1281e9bb72e1c3e79cc5d464be146e260c541ec12d84"}, + {file = "pydantic_core-2.23.4-cp312-none-win_amd64.whl", hash = "sha256:97df63000f4fea395b2824da80e169731088656d1818a11b95f3b173747b6cd9"}, + {file = "pydantic_core-2.23.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7530e201d10d7d14abce4fb54cfe5b94a0aefc87da539d0346a484ead376c3cc"}, + {file = "pydantic_core-2.23.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df933278128ea1cd77772673c73954e53a1c95a4fdf41eef97c2b779271bd0bd"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cb3da3fd1b6a5d0279a01877713dbda118a2a4fc6f0d821a57da2e464793f05"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c6dcb030aefb668a2b7009c85b27f90e51e6a3b4d5c9bc4c57631292015b0d"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:696dd8d674d6ce621ab9d45b205df149399e4bb9aa34102c970b721554828510"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2971bb5ffe72cc0f555c13e19b23c85b654dd2a8f7ab493c262071377bfce9f6"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8394d940e5d400d04cad4f75c0598665cbb81aecefaca82ca85bd28264af7f9b"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0dff76e0602ca7d4cdaacc1ac4c005e0ce0dcfe095d5b5259163a80d3a10d327"}, + {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7d32706badfe136888bdea71c0def994644e09fff0bfe47441deaed8e96fdbc6"}, + {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed541d70698978a20eb63d8c5d72f2cc6d7079d9d90f6b50bad07826f1320f5f"}, + {file = "pydantic_core-2.23.4-cp313-none-win32.whl", hash = "sha256:3d5639516376dce1940ea36edf408c554475369f5da2abd45d44621cb616f769"}, + {file = "pydantic_core-2.23.4-cp313-none-win_amd64.whl", hash = "sha256:5a1504ad17ba4210df3a045132a7baeeba5a200e930f57512ee02909fc5c4cb5"}, + {file = "pydantic_core-2.23.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d4488a93b071c04dc20f5cecc3631fc78b9789dd72483ba15d423b5b3689b555"}, + {file = "pydantic_core-2.23.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:81965a16b675b35e1d09dd14df53f190f9129c0202356ed44ab2728b1c905658"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ffa2ebd4c8530079140dd2d7f794a9d9a73cbb8e9d59ffe24c63436efa8f271"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:61817945f2fe7d166e75fbfb28004034b48e44878177fc54d81688e7b85a3665"}, + {file = 
"pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29d2c342c4bc01b88402d60189f3df065fb0dda3654744d5a165a5288a657368"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5e11661ce0fd30a6790e8bcdf263b9ec5988e95e63cf901972107efc49218b13"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d18368b137c6295db49ce7218b1a9ba15c5bc254c96d7c9f9e924a9bc7825ad"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ec4e55f79b1c4ffb2eecd8a0cfba9955a2588497d96851f4c8f99aa4a1d39b12"}, + {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:374a5e5049eda9e0a44c696c7ade3ff355f06b1fe0bb945ea3cac2bc336478a2"}, + {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5c364564d17da23db1106787675fc7af45f2f7b58b4173bfdd105564e132e6fb"}, + {file = "pydantic_core-2.23.4-cp38-none-win32.whl", hash = "sha256:d7a80d21d613eec45e3d41eb22f8f94ddc758a6c4720842dc74c0581f54993d6"}, + {file = "pydantic_core-2.23.4-cp38-none-win_amd64.whl", hash = "sha256:5f5ff8d839f4566a474a969508fe1c5e59c31c80d9e140566f9a37bba7b8d556"}, + {file = "pydantic_core-2.23.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a4fa4fc04dff799089689f4fd502ce7d59de529fc2f40a2c8836886c03e0175a"}, + {file = "pydantic_core-2.23.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7df63886be5e270da67e0966cf4afbae86069501d35c8c1b3b6c168f42cb36"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcedcd19a557e182628afa1d553c3895a9f825b936415d0dbd3cd0bbcfd29b4b"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f54b118ce5de9ac21c363d9b3caa6c800341e8c47a508787e5868c6b79c9323"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d2f57d3e1379a9525c5ab067b27dbb8a0642fb5d454e17a9ac434f9ce523e3"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de6d1d1b9e5101508cb37ab0d972357cac5235f5c6533d1071964c47139257df"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1278e0d324f6908e872730c9102b0112477a7f7cf88b308e4fc36ce1bdb6d58c"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a6b5099eeec78827553827f4c6b8615978bb4b6a88e5d9b93eddf8bb6790f55"}, + {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e55541f756f9b3ee346b840103f32779c695a19826a4c442b7954550a0972040"}, + {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a5c7ba8ffb6d6f8f2ab08743be203654bb1aaa8c9dcb09f82ddd34eadb695605"}, + {file = "pydantic_core-2.23.4-cp39-none-win32.whl", hash = "sha256:37b0fe330e4a58d3c58b24d91d1eb102aeec675a3db4c292ec3928ecd892a9a6"}, + {file = "pydantic_core-2.23.4-cp39-none-win_amd64.whl", hash = "sha256:1498bec4c05c9c787bde9125cfdcc63a41004ff167f495063191b863399b1a29"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f455ee30a9d61d3e1a15abd5068827773d6e4dc513e795f380cdd59932c782d5"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1e90d2e3bd2c3863d48525d297cd143fe541be8bbf6f579504b9712cb6b643ec"}, + {file = 
"pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e203fdf807ac7e12ab59ca2bfcabb38c7cf0b33c41efeb00f8e5da1d86af480"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e08277a400de01bc72436a0ccd02bdf596631411f592ad985dcee21445bd0068"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f220b0eea5965dec25480b6333c788fb72ce5f9129e8759ef876a1d805d00801"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d06b0c8da4f16d1d1e352134427cb194a0a6e19ad5db9161bf32b2113409e728"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ba1a0996f6c2773bd83e63f18914c1de3c9dd26d55f4ac302a7efe93fb8e7433"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9a5bce9d23aac8f0cf0836ecfc033896aa8443b501c58d0602dbfd5bd5b37753"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:78ddaaa81421a29574a682b3179d4cf9e6d405a09b99d93ddcf7e5239c742e21"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:883a91b5dd7d26492ff2f04f40fbb652de40fcc0afe07e8129e8ae779c2110eb"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88ad334a15b32a791ea935af224b9de1bf99bcd62fabf745d5f3442199d86d59"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:233710f069d251feb12a56da21e14cca67994eab08362207785cf8c598e74577"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:19442362866a753485ba5e4be408964644dd6a09123d9416c54cd49171f50744"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:624e278a7d29b6445e4e813af92af37820fafb6dcc55c012c834f9e26f9aaaef"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f5ef8f42bec47f21d07668a043f077d507e5bf4e668d5c6dfe6aaba89de1a5b8"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:aea443fffa9fbe3af1a9ba721a87f926fe548d32cab71d188a6ede77d0ff244e"}, + {file = "pydantic_core-2.23.4.tar.gz", hash = "sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863"}, ] [package.dependencies] @@ -6042,17 +6030,17 @@ files = [ [[package]] name = "pyreadline3" -version = "3.5.2" +version = "3.5.3" description = "A python implementation of GNU readline." 
optional = true python-versions = ">=3.8" files = [ - {file = "pyreadline3-3.5.2-py3-none-any.whl", hash = "sha256:a87d56791e2965b2b187e2ea33dcf664600842c997c0623c95cf8ef07db83de9"}, - {file = "pyreadline3-3.5.2.tar.gz", hash = "sha256:ba82292e52c5a3bb256b291af0c40b457c1e8699cac9a873abbcaac8aef3a1bb"}, + {file = "pyreadline3-3.5.3-py3-none-any.whl", hash = "sha256:ddede153a92e5aad9c1fe63d692efd6a3e478f686adcd4938a051ffb63ec4f52"}, + {file = "pyreadline3-3.5.3.tar.gz", hash = "sha256:9234684ca75a00a702fda42b17cc26ca665bc9d7c2da06af450468253099ff61"}, ] [package.extras] -dev = ["build", "flake8", "pytest", "twine"] +dev = ["build", "flake8", "mypy", "pytest", "twine"] [[package]] name = "pyrect" @@ -6284,13 +6272,13 @@ dev = ["pytest"] [[package]] name = "python-dateutil" -version = "2.9.0.post0" +version = "2.8.2" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ - {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, - {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] [package.dependencies] @@ -6633,13 +6621,13 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""} [[package]] name = "qdrant-client" -version = "1.11.1" +version = "1.11.2" description = "Client library for the Qdrant vector search engine" optional = true python-versions = ">=3.8" files = [ - {file = "qdrant_client-1.11.1-py3-none-any.whl", hash = "sha256:1375fad77c825c957181ff53775fb900c4383e817f864ea30b2605314da92f07"}, - {file = "qdrant_client-1.11.1.tar.gz", hash = "sha256:bfc23239b027073352ad92152209ec50281519686b7da3041612faece0fcdfbd"}, + {file = "qdrant_client-1.11.2-py3-none-any.whl", hash = "sha256:3151e3da61588ad138dfcd6760c2f13e57251c8b0c62001bfd0e03bb7bcd6c8e"}, + {file = "qdrant_client-1.11.2.tar.gz", hash = "sha256:0d5aa3f778077762963a754459c9c7144ba48e13dea62e559323924126a1b4a4"}, ] [package.dependencies] @@ -7738,13 +7726,13 @@ files = [ [[package]] name = "slack-sdk" -version = "3.32.0" +version = "3.33.0" description = "The Slack API Platform SDK for Python" optional = true python-versions = ">=3.6" files = [ - {file = "slack_sdk-3.32.0-py2.py3-none-any.whl", hash = "sha256:f35e85f2847e6c25cf7c2d1df206ca0ad75556263fb592457bf03cca68ef64bb"}, - {file = "slack_sdk-3.32.0.tar.gz", hash = "sha256:af8fc4ef1d1cbcecd28d01acf6955a3bb5b13d56f0a43a1b1c7e3b212cc5ec5b"}, + {file = "slack_sdk-3.33.0-py2.py3-none-any.whl", hash = "sha256:853bb55154115d080cae342c4099f2ccb559a78ae8d0f5109b49842401a920fa"}, + {file = "slack_sdk-3.33.0.tar.gz", hash = "sha256:070eb1fb355c149a5f80fa0be6eeb5f5588e4ddff4dd76acf060454435cb037e"}, ] [package.extras] @@ -8026,7 +8014,7 @@ widechars = ["wcwidth"] name = "tenacity" version = "9.0.0" description = "Retry code until it succeeds" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"}, @@ -8571,13 +8559,13 @@ tutorials = ["matplotlib", "pandas", "tabulate"] [[package]] name = "types-networkx" -version = 
"3.2.1.20240907" +version = "3.2.1.20240918" description = "Typing stubs for networkx" optional = false python-versions = ">=3.8" files = [ - {file = "types-networkx-3.2.1.20240907.tar.gz", hash = "sha256:fca541eb0b1964159b909cd7620572c9a0b6431ca24a7aee8706ba2957a6b857"}, - {file = "types_networkx-3.2.1.20240907-py3-none-any.whl", hash = "sha256:50cede6735c969d9394a4c7b042979f8e0c66921c25cfc9b4b3fe8625c872799"}, + {file = "types-networkx-3.2.1.20240918.tar.gz", hash = "sha256:251d256a4d2fe17ca596ee5b40869813a942341a5b876a7975f032e2a47785e1"}, + {file = "types_networkx-3.2.1.20240918-py3-none-any.whl", hash = "sha256:666b6dfdfc89855dbe73ecee2e6f3f6735c3e51f3622be44118177012ba05218"}, ] [package.dependencies] @@ -8996,13 +8984,13 @@ test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0)", "aiohttp (>=3.8.1)" [[package]] name = "virtualenv" -version = "20.26.4" +version = "20.26.5" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.26.4-py3-none-any.whl", hash = "sha256:48f2695d9809277003f30776d155615ffc11328e6a0a8c1f0ec80188d7874a55"}, - {file = "virtualenv-20.26.4.tar.gz", hash = "sha256:c17f4e0f3e6036e9f26700446f85c76ab11df65ff6d8a9cbfad9f71aabfcf23c"}, + {file = "virtualenv-20.26.5-py3-none-any.whl", hash = "sha256:4f3ac17b81fba3ce3bd6f4ead2749a72da5929c01774948e243db9ba41df4ff6"}, + {file = "virtualenv-20.26.5.tar.gz", hash = "sha256:ce489cac131aa58f4b25e321d6d186171f78e6cb13fafbf32a840cee67733ff4"}, ] [package.dependencies] @@ -9648,4 +9636,4 @@ server = ["fastapi", "pydantic-settings", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.10, <3.12" -content-hash = "16ba24706b3b4263bf15b895a33d33a716d64cd95003eb76f8a016734e4c7b35" +content-hash = "3e9ebca70d7ffd424d86f8bbbefe6d9fe6993f8bd9067fd4cb4e996e7bb64ac6" diff --git a/pyproject.toml b/pyproject.toml index 0019855..706e8bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ lxml = "^5.2.2" openai = "^1.12.0" cryptography = "^43.0.0" setuptools = "^73.0.1" +tenacity = "^9.0.0" # desktop actions pillow = "^10.2.0" From fb1af781e8e0999ee78d5c0803c372598b0e3f27 Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Wed, 18 Sep 2024 20:23:57 +0300 Subject: [PATCH 08/17] Refactor claude model --- crab/agents/backend_models/claude_model.py | 257 +++++++++--------- crab/agents/backend_models/gemini_model.py | 9 +- .../backend_models/test_claude_model.py | 4 +- 3 files changed, 139 insertions(+), 131 deletions(-) diff --git a/crab/agents/backend_models/claude_model.py b/crab/agents/backend_models/claude_model.py index cf03e55..27a499f 100644 --- a/crab/agents/backend_models/claude_model.py +++ b/crab/agents/backend_models/claude_model.py @@ -12,10 +12,11 @@ # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. 
=========== from copy import deepcopy -from time import sleep from typing import Any -from crab import Action, ActionOutput, BackendModel, BackendOutput, MessageType +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed + +from crab import Action, ActionOutput, BackendModel, BackendOutput, Message, MessageType try: import anthropic @@ -47,37 +48,61 @@ def __init__( def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.system_message = system_message self.action_space = action_space - self.action_schema = self._convert_action_to_schema(self.action_space) + self.action_schema = _convert_action_to_schema(self.action_space) self.token_usage = 0 - self.chat_history = [] - - def chat(self, message: list[tuple[str, MessageType]]) -> BackendOutput: - # Initialize chat history - request = [] - if self.history_messages_len > 0 and len(self.chat_history) > 0: - for history_message in self.chat_history[-self.history_messages_len :]: - request = request + history_message + self.chat_history: list[list[dict]] = [] - if not isinstance(message, list): + def chat(self, message: list[Message] | Message) -> BackendOutput: + if isinstance(message, tuple): message = [message] - - new_message = { - "role": "user", - "content": [self._convert_message(part) for part in message], - } + request = self.fetch_from_memory() + new_message = self.construct_new_message(message) request.append(new_message) - request = self._merge_request(request) - - response = self.call_api(request) - response_message = response + response_message = self.call_api(request) self.record_message(new_message, response_message) + return self.generate_backend_output(response_message) + + def construct_new_message(self, message: list[Message]) -> dict[str, Any]: + parts: list[dict] = [] + for content, msg_type in message: + match msg_type: + case MessageType.TEXT: + parts.append( + { + "type": "text", + "text": content, + } + ) + case MessageType.IMAGE_JPG_BASE64: + parts.append( + { + "type": "image", + "source": { + "data": content, + "type": "base64", + "media_type": "image/png", + }, + } + ) + return { + "role": "user", + "content": parts, + } - return self._format_response(response_message.content) + def fetch_from_memory(self) -> list[dict]: + request: list[dict] = [] + if self.history_messages_len > 0: + fetch_hisotry_len = min(self.history_messages_len, len(self.chat_history)) + for history_message in self.chat_history[-fetch_hisotry_len:]: + request = request + history_message + return request def get_token_usage(self): return self.token_usage - def record_message(self, new_message: dict, response_message: dict) -> None: + def record_message( + self, new_message: dict, response_message: anthropic.types.Message + ) -> None: self.chat_history.append([new_message]) self.chat_history[-1].append( {"role": response_message.role, "content": response_message.content} @@ -85,128 +110,106 @@ def record_message(self, new_message: dict, response_message: dict) -> None: if self.action_schema: tool_calls = response_message.content - self.chat_history[-1].append( - { - "role": "user", - "content": [ + tool_content = [] + for call in tool_calls: + if isinstance(call, ToolUseBlock): + tool_content.append( { "type": "tool_result", "tool_use_id": call.id, "content": "success", } - for call in tool_calls - if call is ToolUseBlock - ], + ) + self.chat_history[-1].append( + { + "role": "user", + "content": tool_content, } ) - def call_api(self, request_messages: list): - while True: - try: - if 
self.action_schema is not None: - response = self.client.messages.create( - system=self.system_message, # <-- system prompt - messages=request_messages, # type: ignore - model=self.model, - tools=self.action_schema, - tool_choice={ - "type": "any" if self.tool_call_required else "auto" - }, - **self.parameters, - ) - else: - response = self.client.messages.create( - system=self.system_message, # <-- system prompt - messages=request_messages, # type: ignore - model=self.model, - **self.parameters, - ) - except anthropic.RateLimitError: - print("Rate Limit Error: Please waiting...") - sleep(10) - except anthropic.APIStatusError: - print(len(request_messages)) - raise - else: - break + @retry( + wait=wait_fixed(10), + stop=stop_after_attempt(7), + retry=retry_if_exception_type( + ( + anthropic.APITimeoutError, + anthropic.APIConnectionError, + anthropic.InternalServerError, + ) + ), + ) + def call_api(self, request_messages: list[dict]) -> anthropic.types.Message: + request_messages = _merge_request(request_messages) + if self.action_schema is not None: + response = self.client.messages.create( + system=self.system_message, # <-- system prompt + messages=request_messages, # type: ignore + model=self.model, + tools=self.action_schema, + tool_choice={"type": "any" if self.tool_call_required else "auto"}, + **self.parameters, + ) + else: + response = self.client.messages.create( + system=self.system_message, # <-- system prompt + messages=request_messages, # type: ignore + model=self.model, + **self.parameters, + ) self.token_usage += response.usage.input_tokens + response.usage.output_tokens return response - @staticmethod - def _convert_message(message: tuple[str, MessageType]): - match message[1]: - case MessageType.TEXT: - return { - "type": "text", - "text": message[0], - } - case MessageType.IMAGE_JPG_BASE64: - return { - "type": "image", - "source": { - "data": message[0], - "type": "base64", - "media_type": "image/png", - }, - } - - @staticmethod - def _convert_action_to_schema(action_space): - if action_space is None: - return None - actions = [] - for action in action_space: - new_action = action.to_openai_json_schema() - new_action["input_schema"] = new_action.pop("parameters") - if "returns" in new_action: - new_action.pop("returns") - if "title" in new_action: - new_action.pop("title") - if "type" in new_action: - new_action["input_schema"]["type"] = new_action.pop("type") - if "required" in new_action: - new_action["input_schema"]["required"] = new_action.pop("required") - - actions.append(new_action) - return actions - - @staticmethod - def _convert_tool_calls_to_action_list(tool_calls) -> list[ActionOutput]: - if tool_calls is None: - return tool_calls - return [ - ActionOutput( - name=call.name, - arguments=call.input, - ) - for call in tool_calls - ] - - @staticmethod - def _merge_request(request: list[dict]): - merge_request = [deepcopy(request[0])] - for idx in range(1, len(request)): - if request[idx]["role"] == merge_request[-1]["role"]: - merge_request[-1]["content"].extend(request[idx]["content"]) - else: - merge_request.append(deepcopy(request[idx])) - - return merge_request - - @classmethod - def _format_response(cls, content: list): - message = None + def generate_backend_output( + cls, response_message: anthropic.types.Message + ) -> BackendOutput: + message = "" action_list = [] - for block in content: + for block in response_message.content: if isinstance(block, TextBlock): - message = block.text + message += block.text elif isinstance(block, ToolUseBlock): - 
action_list.append(block) + action_list.append( + ActionOutput( + name=block.name, + arguments=block.input, # type: ignore + ) + ) if not action_list: return BackendOutput(message=message, action_list=None) else: return BackendOutput( message=message, - action_list=cls._convert_tool_calls_to_action_list(action_list), + action_list=action_list, ) + + +def _merge_request(request: list[dict]) -> list[dict]: + merge_request = [deepcopy(request[0])] + for idx in range(1, len(request)): + if request[idx]["role"] == merge_request[-1]["role"]: + merge_request[-1]["content"].extend(request[idx]["content"]) + else: + merge_request.append(deepcopy(request[idx])) + + return merge_request + + +def _convert_action_to_schema(action_space): + if action_space is None: + return None + actions = [] + for action in action_space: + new_action = action.to_openai_json_schema() + new_action["input_schema"] = new_action.pop("parameters") + if "returns" in new_action: + new_action.pop("returns") + if "title" in new_action: + new_action.pop("title") + if "type" in new_action: + new_action["input_schema"]["type"] = new_action.pop("type") + if "required" in new_action: + new_action["input_schema"]["required"] = new_action.pop("required") + + actions.append(new_action) + return actions diff --git a/crab/agents/backend_models/gemini_model.py b/crab/agents/backend_models/gemini_model.py index e33d16b..4a25b56 100644 --- a/crab/agents/backend_models/gemini_model.py +++ b/crab/agents/backend_models/gemini_model.py @@ -15,7 +15,7 @@ from typing import Any from PIL.Image import Image -from tenacity import retry, stop_after_attempt, wait_fixed +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed from crab import Action, ActionOutput, BackendModel, BackendOutput, Message, MessageType from crab.utils.common import base64_to_image, json_expand_refs @@ -28,6 +28,7 @@ Part, Tool, ) + from google.api_core.exceptions import ResourceExhausted from google.generativeai.types import content_types gemini_model_enable = True @@ -120,7 +121,11 @@ def record_message( {"role": response_message.role, "parts": response_message.parts} ) - @retry(wait=wait_fixed(10), stop=stop_after_attempt(7)) + @retry( + wait=wait_fixed(10), + stop=stop_after_attempt(7), + retry=retry_if_exception_type(ResourceExhausted), + ) def call_api(self, request_messages: list) -> Content: if self.action_schema is not None: tool_config = content_types.to_tool_config( diff --git a/test/agents/backend_models/test_claude_model.py b/test/agents/backend_models/test_claude_model.py index be3ddb8..f8e361a 100644 --- a/test/agents/backend_models/test_claude_model.py +++ b/test/agents/backend_models/test_claude_model.py @@ -42,7 +42,7 @@ def add(a: int, b: int): return a + b -# @pytest.mark.skip(reason="Mock data to be added") +@pytest.mark.skip(reason="Mock data to be added") def test_text_chat(claude_model_text): message = ("Hello!", MessageType.TEXT) output = claude_model_text.chat(message) @@ -63,7 +63,7 @@ def test_text_chat(claude_model_text): assert len(claude_model_text.chat_history) == 3 -# @pytest.mark.skip(reason="Mock data to be added") +@pytest.mark.skip(reason="Mock data to be added") def test_action_chat(claude_model_text): claude_model_text.reset("You are a helpful assistant.", [add]) message = ( From f61213cd3f6ae9dfd7f41e0894db51ba3062ebe5 Mon Sep 17 00:00:00 2001 From: Tianqi Xu <40522713+dandansamax@users.noreply.github.com> Date: Wed, 18 Sep 2024 23:22:07 +0300 Subject: [PATCH 09/17] Update 
crab/agents/backend_models/gemini_model.py Co-authored-by: Isaac Jin --- crab/agents/backend_models/gemini_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crab/agents/backend_models/gemini_model.py b/crab/agents/backend_models/gemini_model.py index 4a25b56..c297410 100644 --- a/crab/agents/backend_models/gemini_model.py +++ b/crab/agents/backend_models/gemini_model.py @@ -168,7 +168,7 @@ def _convert_action_to_schema(action_space: list[Action] | None) -> list[Tool] | return actions -def _clear_schema(schema_dict: dict): +def _clear_schema(schema_dict: dict) -> None: schema_dict.pop("title", None) p_type = schema_dict.pop("type", None) for prop in schema_dict.get("properties", {}).values(): From 9fece560e6b93118eb4e4ce74f6857bf13181d0e Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Thu, 19 Sep 2024 13:42:06 +0300 Subject: [PATCH 10/17] Fix issues --- crab/agents/backend_models/claude_model.py | 26 +++++---- crab/agents/backend_models/gemini_model.py | 27 +++++---- crab/agents/backend_models/openai_model.py | 65 +++++++++++----------- crab/core/backend_model.py | 15 ----- 4 files changed, 66 insertions(+), 67 deletions(-) diff --git a/crab/agents/backend_models/claude_model.py b/crab/agents/backend_models/claude_model.py index 27a499f..ecc39a0 100644 --- a/crab/agents/backend_models/claude_model.py +++ b/crab/agents/backend_models/claude_model.py @@ -31,26 +31,32 @@ class ClaudeModel(BackendModel): def __init__( self, model: str, - parameters: dict[str, Any] = dict(), + parameters: dict[str, Any] | None = None, history_messages_len: int = 0, tool_call_required: bool = False, ) -> None: if anthropic_model_enable is False: raise ImportError("Please install anthropic to use ClaudeModel") - super().__init__( - model, - parameters, - history_messages_len, - ) + self.model = model + self.parameters = parameters if parameters is not None else {} + self.history_messages_len = history_messages_len + + assert self.history_messages_len >= 0 + self.client = anthropic.Anthropic() - self.tool_call_required = tool_call_required + self.tool_call_required: bool = tool_call_required + self.system_message: str = "You are a helpful assistant." 
+ self.action_space: list[Action] | None = None + self.action_schema: list[dict] | None = None + self.token_usage: int = 0 + self.chat_history: list[list[dict]] = [] def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.system_message = system_message self.action_space = action_space self.action_schema = _convert_action_to_schema(self.action_space) self.token_usage = 0 - self.chat_history: list[list[dict]] = [] + self.chat_history = [] def chat(self, message: list[Message] | Message) -> BackendOutput: if isinstance(message, tuple): @@ -92,8 +98,8 @@ def construct_new_message(self, message: list[Message]) -> dict[str, Any]: def fetch_from_memory(self) -> list[dict]: request: list[dict] = [] if self.history_messages_len > 0: - fetch_hisotry_len = min(self.history_messages_len, len(self.chat_history)) - for history_message in self.chat_history[-fetch_hisotry_len:]: + fetch_history_len = min(self.history_messages_len, len(self.chat_history)) + for history_message in self.chat_history[-fetch_history_len:]: request = request + history_message return request diff --git a/crab/agents/backend_models/gemini_model.py b/crab/agents/backend_models/gemini_model.py index c297410..213b7e3 100644 --- a/crab/agents/backend_models/gemini_model.py +++ b/crab/agents/backend_models/gemini_model.py @@ -40,27 +40,32 @@ class GeminiModel(BackendModel): def __init__( self, model: str, - parameters: dict[str, Any] = dict(), + parameters: dict[str, Any] | None = None, history_messages_len: int = 0, tool_call_required: bool = False, ) -> None: if gemini_model_enable is False: raise ImportError("Please install google.generativeai to use GeminiModel") - super().__init__( - model, - parameters, - history_messages_len, - ) + + self.model = model + self.parameters = parameters if parameters is not None else {} + self.history_messages_len = history_messages_len + assert self.history_messages_len >= 0 genai.configure(api_key=os.environ["GEMINI_API_KEY"]) self.client = genai self.tool_call_required = tool_call_required + self.system_message: str = "You are a helpful assistant." 
+ self.action_space: list[Action] | None = None + self.action_schema: list[Tool] | None = None + self.token_usage: int = 0 + self.chat_history: list[list[dict]] = [] def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.system_message = system_message self.action_space = action_space self.action_schema = _convert_action_to_schema(self.action_space) self.token_usage = 0 - self.chat_history: list[list[dict]] = [] + self.chat_history = [] def chat(self, message: list[Message] | Message) -> BackendOutput: if isinstance(message, tuple): @@ -105,8 +110,8 @@ def generate_backend_output(self, response_message: Content) -> BackendOutput: def fetch_from_memory(self) -> list[dict]: request: list[dict] = [] if self.history_messages_len > 0: - fetch_hisotry_len = min(self.history_messages_len, len(self.chat_history)) - for history_message in self.chat_history[-fetch_hisotry_len:]: + fetch_history_len = min(self.history_messages_len, len(self.chat_history)) + for history_message in self.chat_history[-fetch_history_len:]: request = request + history_message return request @@ -161,7 +166,7 @@ def _convert_action_to_schema(action_space: list[Action] | None) -> list[Tool] | actions = [ Tool( function_declarations=[ - _action_to_funcdec(action) for action in action_space + _action_to_func_dec(action) for action in action_space ] ) ] @@ -179,7 +184,7 @@ def _clear_schema(schema_dict: dict) -> None: _clear_schema(schema_dict["items"]) -def _action_to_funcdec(action: Action) -> FunctionDeclaration: +def _action_to_func_dec(action: Action) -> FunctionDeclaration: "Converts crab Action to google FunctionDeclaration" p_schema = action.parameters.model_json_schema() if "$defs" in p_schema: diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index c7ba157..4c9827e 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -29,27 +29,26 @@ class OpenAIModel(BackendModel): def __init__( self, model: str, - parameters: dict[str, Any] = dict(), + parameters: dict[str, Any] | None = None, history_messages_len: int = 0, tool_call_required: bool = False, + base_url: str | None = None, ) -> None: if not openai_model_enable: raise ImportError("Please install openai to use OpenAIModel") - super().__init__( - model, - parameters, - history_messages_len, - ) - self.client = openai.OpenAI() - self.tool_call_required = tool_call_required - self.system_message = "You are a helpful assistant." - self.openai_system_message = { - "role": "system", - "content": self.system_message, - } - self.action_space = None - self.action_schema = None - self.token_usage = 0 + + self.model = model + self.parameters = parameters if parameters is not None else {} + self.history_messages_len = history_messages_len + + assert self.history_messages_len >= 0 + + self.client = openai.OpenAI(base_url=base_url) + self.tool_call_required: bool = tool_call_required + self.system_message: str = "You are a helpful assistant." 
+ self.action_space: list[Action] | None = None + self.action_schema: list[dict] | None = None + self.token_usage: int = 0 self.chat_history: list[list[ChatCompletionMessage | dict]] = [] def reset(self, system_message: str, action_space: list[Action] | None) -> None: @@ -59,9 +58,9 @@ def reset(self, system_message: str, action_space: list[Action] | None) -> None: "content": system_message, } self.action_space = action_space - self.action_schema = self._convert_action_to_schema(self.action_space) + self.action_schema = _convert_action_to_schema(self.action_space) self.token_usage = 0 - self.chat_history: list[list[ChatCompletionMessage | dict]] = [] + self.chat_history = [] def chat(self, message: list[Message] | Message) -> BackendOutput: if isinstance(message, tuple): @@ -93,10 +92,12 @@ def record_message( } ) # extend conversation with function response - def call_api(self, request_messages: list) -> ChatCompletionMessage: + def call_api( + self, request_messages: list[ChatCompletionMessage | dict] + ) -> ChatCompletionMessage: if self.action_schema is not None: response = self.client.chat.completions.create( - messages=request_messages, + messages=request_messages, # type: ignore model=self.model, tools=self.action_schema, tool_choice="required" if self.tool_call_required else "auto", @@ -115,8 +116,8 @@ def call_api(self, request_messages: list) -> ChatCompletionMessage: def fetch_from_memory(self) -> list[ChatCompletionMessage | dict]: request: list[ChatCompletionMessage | dict] = [self.openai_system_message] if self.history_messages_len > 0: - fetch_hisotry_len = min(self.history_messages_len, len(self.chat_history)) - for history_message in self.chat_history[-fetch_hisotry_len:]: + fetch_history_len = min(self.history_messages_len, len(self.chat_history)) + for history_message in self.chat_history[-fetch_history_len:]: request = request + history_message return request @@ -161,12 +162,14 @@ def generate_backend_output( action_list=action_list, ) - @staticmethod - def _convert_action_to_schema(action_space): - if action_space is None: - return None - actions = [] - for action in action_space: - new_action = action.to_openai_json_schema() - actions.append({"type": "function", "function": new_action}) - return actions + +def _convert_action_to_schema( + action_space: list[Action] | None, +) -> list[dict] | None: + if action_space is None: + return None + actions = [] + for action in action_space: + new_action = action.to_openai_json_schema() + actions.append({"type": "function", "function": new_action}) + return actions diff --git a/crab/core/backend_model.py b/crab/core/backend_model.py index 28e11b3..682e8f0 100644 --- a/crab/core/backend_model.py +++ b/crab/core/backend_model.py @@ -12,26 +12,11 @@ # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== from abc import ABC, abstractmethod -from typing import Any from .models import Action, BackendOutput, MessageType class BackendModel(ABC): - def __init__( - self, - model: str, - parameters: dict[str, Any] = dict(), - history_messages_len: int = 0, - ) -> None: - self.model = model - self.parameters = parameters - self.history_messages_len = history_messages_len - - assert self.history_messages_len >= 0 - - self.reset("You are a helpful assistant.", None) - @abstractmethod def chat(self, contents: list[tuple[str, MessageType]]) -> BackendOutput: ... 
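For reference, a minimal usage sketch of the backend interface as refactored by the patches above: construction no longer triggers reset(), so the caller installs the system prompt and optional action space explicitly before chatting. This sketch is illustrative rather than part of any commit; it assumes the openai extra is installed and OPENAI_API_KEY is set in the environment, and the model name and history length are example values borrowed from the benchmark script:

    from crab import MessageType
    from crab.agents.backend_models import OpenAIModel

    # reset() installs the system message and the (optional) action space;
    # token usage and chat history start from zero after each reset.
    model = OpenAIModel(model="gpt-4o", history_messages_len=2)
    model.reset("You are a helpful assistant.", None)

    # chat() accepts a single Message tuple or a list of them and returns
    # a BackendOutput carrying the text reply and any parsed tool calls.
    output = model.chat(("Hello!", MessageType.TEXT))
    print(output.message, model.get_token_usage())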
From 1c755699fd4a7b2b7ecef4a1878a7495ecf0070c Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Thu, 19 Sep 2024 13:44:09 +0300 Subject: [PATCH 11/17] Tiny fix --- crab/agents/backend_models/claude_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crab/agents/backend_models/claude_model.py b/crab/agents/backend_models/claude_model.py index ecc39a0..d2c8252 100644 --- a/crab/agents/backend_models/claude_model.py +++ b/crab/agents/backend_models/claude_model.py @@ -167,7 +167,7 @@ def call_api(self, request_messages: list[dict]) -> anthropic.types.Message: return response def generate_backend_output( - cls, response_message: anthropic.types.Message + self, response_message: anthropic.types.Message ) -> BackendOutput: message = "" action_list = [] From 0830a38f9df84028430e7c75c402cbb0cd9c7c92 Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Thu, 19 Sep 2024 15:09:41 +0300 Subject: [PATCH 12/17] Fix test --- crab/agents/backend_models/openai_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index 4c9827e..2d1eb23 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -46,6 +46,10 @@ def __init__( self.client = openai.OpenAI(base_url=base_url) self.tool_call_required: bool = tool_call_required self.system_message: str = "You are a helpful assistant." + self.openai_system_message = { + "role": "system", + "content": self.system_message, + } self.action_space: list[Action] | None = None self.action_schema: list[dict] | None = None self.token_usage: int = 0 From f58281781f84f0a49b4fa99219edfccfed5836ce Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Mon, 23 Sep 2024 16:02:43 +0300 Subject: [PATCH 13/17] Add VLLM models --- crab-benchmark-v0/main.py | 64 ++++++++++++++--- crab/agents/backend_models/__init__.py | 23 ++++++- crab/agents/backend_models/claude_model.py | 1 + crab/agents/backend_models/gemini_model.py | 1 + crab/agents/backend_models/openai_model.py | 6 +- crab/agents/backend_models/vllm_model.py | 80 ++++++++++++++++++++++ crab/agents/policies/single_agent.py | 45 ++++++++++-- crab/agents/utils.py | 61 +++++++++++++++-- 8 files changed, 257 insertions(+), 24 deletions(-) create mode 100644 crab/agents/backend_models/vllm_model.py diff --git a/crab-benchmark-v0/main.py b/crab-benchmark-v0/main.py index 231d4d0..07c4ba0 100644 --- a/crab-benchmark-v0/main.py +++ b/crab-benchmark-v0/main.py @@ -29,7 +29,7 @@ get_elements_prompt, groundingdino_easyocr, ) -from crab.agents.backend_models import ClaudeModel, GeminiModel, OpenAIModel +from crab.agents.backend_models import BackendModelConfig from crab.agents.policies import ( MultiAgentByEnvPolicy, MultiAgentByFuncPolicy, @@ -158,7 +158,7 @@ def get_benchmark(env: str, ubuntu_url: str): default="single", ) parser.add_argument( - "--remote-url", + "--ubuntu-url", type=str, help="remote url of Ubunutu environment", default="http://127.0.0.1:8000", @@ -170,6 +170,18 @@ def get_benchmark(env: str, ubuntu_url: str): default="cross", ) parser.add_argument("--task-id", type=str, help="task id") + parser.add_argument( + "--model-base-url", + type=str, + help="URL of the model API", + default="http://127.0.0.1:8000/v1", + ) + parser.add_argument( + "--model-api-key", + type=str, + help="API key of the model API", + default="EMPTY", + ) parser.add_argument( "--loglevel", type=str, @@ -183,16 +195,48 @@ def get_benchmark(env: str, ubuntu_url: str): raise ValueError("Invalid log 
level: %s" % loglevel) logging.basicConfig(level=numeric_level) - benchmark = get_benchmark(args.env, args.remote_url) + benchmark = get_benchmark(args.env, args.ubuntu_url) if args.model == "gpt4o": - model = OpenAIModel(model="gpt-4o") - elif args.policy == "gpt4turbo": - model = OpenAIModel(model="gpt-4-turbo") - elif args.policy == "gemini": - model = GeminiModel(model="gemini-1.5-pro-latest") - elif args.policy == "claude": - model = ClaudeModel(model="claude-3-opus-20240229") + model = BackendModelConfig( + model_class="openai", + model_name="gpt-4o", + history_messages_len=2, + ) + elif args.model == "gpt4turbo": + model = BackendModelConfig( + model_class="openai", + model_name="gpt-4-turbo", + history_messages_len=2, + ) + elif args.model == "gemini": + model = BackendModelConfig( + model_class="gemini", + model_name="gemini-1.5-pro-latest", + history_messages_len=2, + ) + elif args.model == "claude": + model = BackendModelConfig( + model_class="claude", + model_name="claude-3-opus-20240229", + history_messages_len=2, + ) + elif args.model == "llava-1.6": + model = BackendModelConfig( + model_class="vllm", + model_name="llava-hf/llava-v1.6-34b-hf", + history_messages_len=2, + base_url=args.model_base_url, + api_key=args.model_api_key, + ) + elif args.model == "pixtral": + model = BackendModelConfig( + model_class="vllm", + model_name="mistralai/Pixtral-12B-2409", + history_messages_len=1, + base_url=args.model_base_url, + api_key=args.model_api_key, + ) else: print("Unsupported model: ", args.model) exit() diff --git a/crab/agents/backend_models/__init__.py b/crab/agents/backend_models/__init__.py index c087ca0..172b6a1 100644 --- a/crab/agents/backend_models/__init__.py +++ b/crab/agents/backend_models/__init__.py @@ -22,25 +22,36 @@ from .claude_model import ClaudeModel from .gemini_model import GeminiModel from .openai_model import OpenAIModel +from .vllm_model import VLLMModel class BackendModelConfig(BaseModel): - model_class: Literal["openai", "claude", "gemini", "camel"] + model_class: Literal["openai", "claude", "gemini", "camel", "vllm"] model_name: str history_messages_len: int = 0 parameters: dict[str, Any] = {} tool_call_required: bool = False + base_url: str | None = None # Only used in OpenAIModel and VLLMModel currently + api_key: str | None = None # Only used in OpenAIModel and VLLMModel currently def create_backend_model(model_config: BackendModelConfig) -> BackendModel: match model_config.model_class: case "claude": + if model_config.base_url is not None or model_config.api_key is not None: + raise Warning( + "base_url and api_key are not supported for ClaudeModel currently." + ) return ClaudeModel( model=model_config.model_name, parameters=model_config.parameters, history_messages_len=model_config.history_messages_len, ) case "gemini": + if model_config.base_url is not None or model_config.api_key is not None: + raise Warning( + "base_url and api_key are not supported for GeminiModel currently." 
+ ) return GeminiModel( model=model_config.model_name, parameters=model_config.parameters, @@ -51,6 +62,16 @@ def create_backend_model(model_config: BackendModelConfig) -> BackendModel: model=model_config.model_name, parameters=model_config.parameters, history_messages_len=model_config.history_messages_len, + base_url=model_config.base_url, + api_key=model_config.api_key, + ) + case "vllm": + return VLLMModel( + model=model_config.model_name, + parameters=model_config.parameters, + history_messages_len=model_config.history_messages_len, + base_url=model_config.base_url, + api_key=model_config.api_key, ) case "camel": raise NotImplementedError("Cannot support camel model currently.") diff --git a/crab/agents/backend_models/claude_model.py b/crab/agents/backend_models/claude_model.py index d2c8252..ed37f47 100644 --- a/crab/agents/backend_models/claude_model.py +++ b/crab/agents/backend_models/claude_model.py @@ -50,6 +50,7 @@ def __init__( self.action_schema: list[dict] | None = None self.token_usage: int = 0 self.chat_history: list[list[dict]] = [] + self.support_tool_call = True def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.system_message = system_message diff --git a/crab/agents/backend_models/gemini_model.py b/crab/agents/backend_models/gemini_model.py index 213b7e3..3032d94 100644 --- a/crab/agents/backend_models/gemini_model.py +++ b/crab/agents/backend_models/gemini_model.py @@ -59,6 +59,7 @@ def __init__( self.action_schema: list[Tool] | None = None self.token_usage: int = 0 self.chat_history: list[list[dict]] = [] + self.support_tool_call = True def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.system_message = system_message diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index 2d1eb23..e8a11eb 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -33,6 +33,7 @@ def __init__( history_messages_len: int = 0, tool_call_required: bool = False, base_url: str | None = None, + api_key: str | None = None, ) -> None: if not openai_model_enable: raise ImportError("Please install openai to use OpenAIModel") @@ -43,7 +44,7 @@ def __init__( assert self.history_messages_len >= 0 - self.client = openai.OpenAI(base_url=base_url) + self.client = openai.OpenAI(api_key=api_key, base_url=base_url) self.tool_call_required: bool = tool_call_required self.system_message: str = "You are a helpful assistant." self.openai_system_message = { @@ -54,6 +55,7 @@ def __init__( self.action_schema: list[dict] | None = None self.token_usage: int = 0 self.chat_history: list[list[ChatCompletionMessage | dict]] = [] + self.support_tool_call = True def reset(self, system_message: str, action_space: list[Action] | None) -> None: self.system_message = system_message @@ -92,7 +94,7 @@ def record_message( "tool_call_id": tool_call.id, "role": "tool", "name": tool_call.function.name, - "content": "", + "content": "success", } ) # extend conversation with function response diff --git a/crab/agents/backend_models/vllm_model.py b/crab/agents/backend_models/vllm_model.py new file mode 100644 index 0000000..18ed12c --- /dev/null +++ b/crab/agents/backend_models/vllm_model.py @@ -0,0 +1,80 @@ +# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== +import json +from typing import Any + +from openai.types.chat import ChatCompletionMessage + +from crab import Action, ActionOutput, BackendOutput +from crab.agents.backend_models.openai_model import OpenAIModel +from crab.agents.utils import extract_text_and_code_prompts + + +class VLLMModel(OpenAIModel): + def __init__( + self, + model: str, + parameters: dict[str, Any] = dict(), + history_messages_len: int = 0, + base_url: str | None = None, + api_key: str | None = None, + ) -> None: + if base_url is None: + raise ValueError("base_url is required for VLLMModel") + super().__init__( + model, + parameters, + history_messages_len, + False, + base_url, + api_key, + ) + self.support_tool_call = False + + def reset(self, system_message: str, action_space: list[Action] | None) -> None: + super().reset(system_message, action_space) + self.action_schema = None + + def record_message( + self, new_message: dict, response_message: ChatCompletionMessage + ) -> None: + self.chat_history.append([new_message]) + self.chat_history[-1].append( + {"role": "assistant", "content": response_message.content} + ) + + def generate_backend_output( + self, response_message: ChatCompletionMessage + ) -> BackendOutput: + content = response_message.content + text_list, code_list = extract_text_and_code_prompts(content) + + action_list = [] + try: + for code_block in code_list: + action_object = json.loads(code_block) + action_list.append( + ActionOutput( + name=action_object["name"], arguments=action_object["arguments"] + ) + ) + except json.JSONDecodeError as e: + raise RuntimeError(f"Failed to parse code block: {code_block}") from e + except KeyError as e: + raise RuntimeError(f"Received invalid action format: {code_block}") from e + + return BackendOutput( + message="".join(text_list), + action_list=action_list, + ) diff --git a/crab/agents/policies/single_agent.py b/crab/agents/policies/single_agent.py index 7746c53..74a6cd6 100644 --- a/crab/agents/policies/single_agent.py +++ b/crab/agents/policies/single_agent.py @@ -26,8 +26,8 @@ class SingleAgentPolicy(AgentPolicy): - _system_prompt = """You are a helpful assistant. Now you have to do a task as - described below: + _system_prompt_with_function_call = """\ + You are a helpful assistant. Now you have to do a task as described below: **"{task_description}."** @@ -47,11 +47,45 @@ class SingleAgentPolicy(AgentPolicy): you. Always do them by yourself using function calls. """ + _system_prompt_no_function_call = """\ + You are a helpful assistant. Now you have to do a task as described below: + + **"{task_description}."** + + You should never forget this task and always perform actions to achieve this task. + And this is the description of each given environment: {env_description}. You will + receive screenshots of the environments. The interactive UI elements on the + screenshot are labeled with numeric tags starting from 1. + + A unit operation you can perform is called Action. You have a limited action space + as function calls: {action_descriptions}. 
You should generate JSON code blocks to + execute the actions. Each code block MUST contain only one JSON object, i.e. one + action. You can output multiple code blocks to execute multiple actions in a single + step. You must follow the JSON format below to output the action. + ```json + {{"name": "action_name", "arguments": {{"arg1": "value1", "arg2": "value2"}}}} + ``` + or if no arguments are needed: + ```json + {{"name": "action_name", "arguments": {{}}}} + ``` + + In each step, you MUST explain what you see from the current observation and the + plan of the next action, then use a provided action in each step to achieve the + task. You should state what action to take and what the parameters should be. Your + answer MUST contain at least one code block. You SHOULD NEVER ask me to do anything + for you. Always do them by yourself. + """ + def __init__( self, model_backend: BackendModelConfig, ): self.model_backend = create_backend_model(model_backend) + if self.model_backend.support_tool_call: + self.system_prompt = self._system_prompt_with_function_call + else: + self.system_prompt = self._system_prompt_no_function_call self.reset(task_description="", action_spaces=None, env_descriptions={}) def reset( @@ -62,9 +96,12 @@ def reset( ) -> list: self.task_description = task_description self.action_space = combine_multi_env_action_space(action_spaces) - system_message = self._system_prompt.format( + system_message = self.system_prompt.format( task_description=task_description, - action_descriptions=generate_action_prompt(self.action_space), + action_descriptions=generate_action_prompt( + self.action_space, + expand=not self.model_backend.support_tool_call, + ), env_description=str(env_descriptions), ) self.model_backend.reset(system_message, self.action_space) diff --git a/crab/agents/utils.py b/crab/agents/utils.py index e3a18c7..b174b92 100644 --- a/crab/agents/utils.py +++ b/crab/agents/utils.py @@ -24,7 +24,7 @@ def combine_multi_env_action_space( for env in action_space: for action in action_space[env]: new_action = action.model_copy() - new_action.name = new_action.name + "__in__" + env + new_action.name = new_action.name + "_in_" + env new_action.description = f"In {env} environment, " + new_action.description result.append(new_action) return result @@ -38,10 +38,10 @@ def decode_combined_action( """ result = [] for output in output_actions: - name_env = output.name.split("__in__") + name_env = output.name.split("_in_") if len(name_env) != 2: raise RuntimeError( - 'The decoded action name should contain the splitter "__in__".' + 'The decoded action name should contain the splitter "_in_".' ) new_output = output.model_copy() new_output.name = name_env[0] @@ -50,7 +50,54 @@ def decode_combined_action( return result -def generate_action_prompt(action_space: list[Action]) -> str: - return "".join( - [f"[{action.name}: {action.description}]\n" for action in action_space] - ) +def generate_action_prompt(action_space: list[Action], expand: bool = False) -> str: + if expand: + return "".join( + [ + f"[**{action.name}**:\n" + f"action description: {action.description}\n" + f"action arguments json schema: {action.to_openai_json_schema()}\n" + "]\n" + for action in action_space + ] + ) + else: + return "".join( + [f"[{action.name}: {action.description}]\n" for action in action_space] + ) + + +def extract_text_and_code_prompts(content: str) -> tuple[list[str], list[str]]: + r"""Extract text and code prompts from the message content.
+ + Returns: + A tuple (text_list, code_list) where, text_list is a list of text and code_list + is a list of extracted codes both from the content. + """ + text_prompts: list[str] = [] + code_prompts: list[str] = [] + + lines = content.split("\n") + idx = 0 + start_idx = 0 + while idx < len(lines): + while idx < len(lines) and (not lines[idx].lstrip().startswith("```")): + idx += 1 + text = "\n".join(lines[start_idx:idx]).strip() + text_prompts.append(text) + + if idx >= len(lines): + break + + # code_type = lines[idx].strip()[3:].strip() + idx += 1 + start_idx = idx + while not lines[idx].lstrip().startswith("```"): + idx += 1 + code = "\n".join(lines[start_idx:idx]).strip() + code_prompts.append(code) + + idx += 1 + start_idx = idx + + return text_prompts, code_prompts From f71a97582ed5dc01de2d8e32722738c5314d54a9 Mon Sep 17 00:00:00 2001 From: Tianqi Xu Date: Tue, 8 Oct 2024 17:43:14 +0300 Subject: [PATCH 14/17] Add new tasks and agents --- crab-benchmark-v0/README.md | 4 + crab-benchmark-v0/android_env.py | 3 +- .../1005c437-50d1-465a-b3fc-833098b22bfc.json | 23 ++ .../12333aa0-e76d-4a5c-8657-9f897f62f62d.json | 22 ++ .../2ade6a13-c7a6-4df7-8c62-77382687369e.json | 22 ++ .../4190c90c-b28c-4bb3-ab5c-af3c4fde0a3d.json | 15 ++ .../483fbf9c-dc78-4ac2-9264-53c4f617f6cc.json | 21 ++ .../4893a9b0-6477-495d-a73c-32503326e24a.json | 14 ++ .../53010c40-dce4-4d72-a856-842c21059e2b.json | 22 ++ .../71ef7fd2-0ae3-49c8-8238-06b7aa985d25.json | 23 ++ .../7891ceab-7965-4ddb-a0fc-15740c9a4e44.json | 22 ++ .../8bd51440-f959-4edc-baa5-cd03d32a5b0f.json | 22 ++ .../94b1836b-3111-40ad-8d07-b8a57efe7438.json | 22 ++ .../a225f7f8-6d03-4619-b57d-7a08610030d8.json | 22 ++ .../b3965b07-4683-4445-9de1-a1dedf6c73ad.json | 22 ++ .../cf4c496b-fbbd-4701-91ea-4590fe6a66e1.json | 22 ++ .../d0811e47-d75f-40ce-b34b-e1ee3c8bed3f.json | 22 ++ .../d7489d00-0046-4fb1-af5b-1fde7d87312c.json | 21 ++ .../d92f6c33-e0a7-4101-957d-e7dd218d2565.json | 21 ++ .../e55d7a39-7b6b-4852-8711-844cebc88cb8.json | 15 ++ .../e9268070-91b7-4e8c-9976-1cf8126ba13b.json | 21 ++ .../fbe6a1b1-63bb-4d4e-8a53-ff4f7839ef61.json | 22 ++ crab-benchmark-v0/dataset/android_subtasks.py | 2 + .../05a7633d-b966-471c-8848-e18e69ad265f.json | 2 +- crab-benchmark-v0/dataset/handmade_tasks.py | 224 ++++++++++++++++-- .../0deafe05-8db5-445f-9031-f6e884569d03.json | 25 ++ .../15a150a8-899c-4753-8dc5-05248ccc3640.json | 22 ++ .../299db8f2-81eb-455f-9302-5c8cb30be691.json | 23 ++ .../29f099b2-b3a5-463f-b10a-15363bf7e845.json | 22 ++ .../51c91051-3efb-4e92-a967-739b18520714.json | 19 ++ .../57b7e8a7-8c17-4cc4-9bb5-4385afde3ad8.json | 25 ++ .../5ba74c6a-4513-448b-8b68-ff145ece0652.json | 22 ++ .../6428f803-62de-40d2-a345-64e6cf955c9d.json | 24 ++ .../696ca9bb-89ea-4cd5-b693-f2d749d964b1.json | 22 ++ .../6c3105a2-328c-4190-823d-03d759be0b57.json | 21 ++ .../6c560516-ca14-4f97-b51d-16ad81fc29e4.json | 22 ++ .../730172f5-894a-4d46-9102-ac7d985a479d.json | 23 ++ .../73038efb-ca0f-4d90-a947-fcfd097dd91b.json | 19 ++ .../73da97c9-f084-4cab-8697-1151737387ff.json | 22 ++ .../78502f1c-879b-4932-a5fd-d85f7f6b0f81.json | 22 ++ .../7dda7e46-78be-4663-b882-6132dbbff335.json | 22 ++ .../82c49e12-3b2f-432e-9069-4b67bafebbf7.json | 22 ++ .../87910f23-ab23-4ccc-b115-d71cff6f0162.json | 21 ++ .../8cb5ab6d-a56e-43b9-aa83-00a46331e20f.json | 23 ++ .../a70ab903-835f-48b7-8356-2321b8b869d8.json | 19 ++ .../abb16512-27ae-49c0-b12b-7fbf0e95056b.json | 21 ++ .../b2ca21dc-dde9-49f5-bec7-321fbf769315.json | 24 ++ .../ccf31785-ec13-4981-93c5-ca6c242ac0c3.json | 24 ++ 
.../d3478489-70f2-4a82-b7d2-0a47b75986eb.json | 24 ++ .../d3c917ff-406f-447a-87f5-b8d835cba750.json | 23 ++ .../d6e460e4-c295-40ad-883c-11300d7832f0.json | 19 ++ .../e31d4e3b-b753-4deb-b9ad-a0add5d4790e.json | 21 ++ .../f67a26e4-58dd-4dc6-8859-affbf1d62f94.json | 22 ++ crab-benchmark-v0/main.py | 57 +++-- crab-benchmark-v0/ubuntu_env.py | 2 + crab/actions/crab_actions.py | 10 + crab/actions/desktop_actions.py | 30 ++- crab/agents/backend_models/__init__.py | 22 +- crab/agents/backend_models/claude_model.py | 4 +- crab/agents/backend_models/gemini_model.py | 7 +- crab/agents/backend_models/openai_model.py | 91 ++++++- crab/agents/backend_models/vllm_model.py | 80 ------- crab/agents/policies/multi_agent_by_env.py | 5 +- crab/agents/policies/multi_agent_by_func.py | 5 +- crab/agents/policies/single_agent.py | 29 ++- crab/agents/utils.py | 4 +- crab/core/benchmark.py | 17 +- crab/core/environment.py | 2 +- crab/core/experiment.py | 11 + crab/core/task_generator.py | 3 + 70 files changed, 1501 insertions(+), 147 deletions(-) create mode 100644 crab-benchmark-v0/dataset/android/1005c437-50d1-465a-b3fc-833098b22bfc.json create mode 100644 crab-benchmark-v0/dataset/android/12333aa0-e76d-4a5c-8657-9f897f62f62d.json create mode 100644 crab-benchmark-v0/dataset/android/2ade6a13-c7a6-4df7-8c62-77382687369e.json create mode 100644 crab-benchmark-v0/dataset/android/4190c90c-b28c-4bb3-ab5c-af3c4fde0a3d.json create mode 100644 crab-benchmark-v0/dataset/android/483fbf9c-dc78-4ac2-9264-53c4f617f6cc.json create mode 100644 crab-benchmark-v0/dataset/android/4893a9b0-6477-495d-a73c-32503326e24a.json create mode 100644 crab-benchmark-v0/dataset/android/53010c40-dce4-4d72-a856-842c21059e2b.json create mode 100644 crab-benchmark-v0/dataset/android/71ef7fd2-0ae3-49c8-8238-06b7aa985d25.json create mode 100644 crab-benchmark-v0/dataset/android/7891ceab-7965-4ddb-a0fc-15740c9a4e44.json create mode 100644 crab-benchmark-v0/dataset/android/8bd51440-f959-4edc-baa5-cd03d32a5b0f.json create mode 100644 crab-benchmark-v0/dataset/android/94b1836b-3111-40ad-8d07-b8a57efe7438.json create mode 100644 crab-benchmark-v0/dataset/android/a225f7f8-6d03-4619-b57d-7a08610030d8.json create mode 100644 crab-benchmark-v0/dataset/android/b3965b07-4683-4445-9de1-a1dedf6c73ad.json create mode 100644 crab-benchmark-v0/dataset/android/cf4c496b-fbbd-4701-91ea-4590fe6a66e1.json create mode 100644 crab-benchmark-v0/dataset/android/d0811e47-d75f-40ce-b34b-e1ee3c8bed3f.json create mode 100644 crab-benchmark-v0/dataset/android/d7489d00-0046-4fb1-af5b-1fde7d87312c.json create mode 100644 crab-benchmark-v0/dataset/android/d92f6c33-e0a7-4101-957d-e7dd218d2565.json create mode 100644 crab-benchmark-v0/dataset/android/e55d7a39-7b6b-4852-8711-844cebc88cb8.json create mode 100644 crab-benchmark-v0/dataset/android/e9268070-91b7-4e8c-9976-1cf8126ba13b.json create mode 100644 crab-benchmark-v0/dataset/android/fbe6a1b1-63bb-4d4e-8a53-ff4f7839ef61.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/0deafe05-8db5-445f-9031-f6e884569d03.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/15a150a8-899c-4753-8dc5-05248ccc3640.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/299db8f2-81eb-455f-9302-5c8cb30be691.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/29f099b2-b3a5-463f-b10a-15363bf7e845.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/51c91051-3efb-4e92-a967-739b18520714.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/57b7e8a7-8c17-4cc4-9bb5-4385afde3ad8.json create mode 100644 
crab-benchmark-v0/dataset/ubuntu/5ba74c6a-4513-448b-8b68-ff145ece0652.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/6428f803-62de-40d2-a345-64e6cf955c9d.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/696ca9bb-89ea-4cd5-b693-f2d749d964b1.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/6c3105a2-328c-4190-823d-03d759be0b57.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/6c560516-ca14-4f97-b51d-16ad81fc29e4.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/730172f5-894a-4d46-9102-ac7d985a479d.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/73038efb-ca0f-4d90-a947-fcfd097dd91b.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/73da97c9-f084-4cab-8697-1151737387ff.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/78502f1c-879b-4932-a5fd-d85f7f6b0f81.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/7dda7e46-78be-4663-b882-6132dbbff335.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/82c49e12-3b2f-432e-9069-4b67bafebbf7.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/87910f23-ab23-4ccc-b115-d71cff6f0162.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/8cb5ab6d-a56e-43b9-aa83-00a46331e20f.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/a70ab903-835f-48b7-8356-2321b8b869d8.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/abb16512-27ae-49c0-b12b-7fbf0e95056b.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/b2ca21dc-dde9-49f5-bec7-321fbf769315.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/ccf31785-ec13-4981-93c5-ca6c242ac0c3.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/d3478489-70f2-4a82-b7d2-0a47b75986eb.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/d3c917ff-406f-447a-87f5-b8d835cba750.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/d6e460e4-c295-40ad-883c-11300d7832f0.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/e31d4e3b-b753-4deb-b9ad-a0add5d4790e.json create mode 100644 crab-benchmark-v0/dataset/ubuntu/f67a26e4-58dd-4dc6-8859-affbf1d62f94.json delete mode 100644 crab/agents/backend_models/vllm_model.py diff --git a/crab-benchmark-v0/README.md b/crab-benchmark-v0/README.md index 29c8db8..6965338 100644 --- a/crab-benchmark-v0/README.md +++ b/crab-benchmark-v0/README.md @@ -29,3 +29,7 @@ After setting up the environment, you can start the experiment. A brief overview 2. Start the CRAB server in the Ubuntu environment and get its IP address and port. Let's say they are `192.168.122.72` and `8000`. 3. Choose a task. As an example, we take the task with ID `a3476778-e512-40ca-b1c0-d7aab0c7f18b` from [handmade_tasks](./dataset/handmade_tasks.py). The task is: "Open the 'Tasks' app on Android, check the first incomplete task, then perform the task according to its description." 4. Run [main.py](./main.py) with the command `poetry run python -m crab-benchmark-v0.main --model gpt4o --policy single --remote-url http://192.168.122.72:8000 --task-id a3476778-e512-40ca-b1c0-d7aab0c7f18b`. In this command, `--model gpt4o` and `--policy single` determine the agent system, `--remote-url` specifies the Ubuntu environment interface, and `--task-id` indicates the task to be performed. 
+ +#### Model + +For open-source models, we use [vLLM](https://github.com/vllm-project/vllm) to host the Pixtral model (see [here](https://docs.vllm.ai/en/latest/models/vlm.html#online-inference) for the setup commands) and [SGLang](https://github.com/sgl-project/sglang) to host the LLaVA-OneVision model (see [here](https://github.com/sgl-project/sglang?tab=readme-ov-file#supported-models) for the setup commands). \ No newline at end of file diff --git a/crab-benchmark-v0/android_env.py b/crab-benchmark-v0/android_env.py index ec43644..d4d3c91 100644 --- a/crab-benchmark-v0/android_env.py +++ b/crab-benchmark-v0/android_env.py @@ -14,6 +14,7 @@ from crab import EnvironmentConfig from crab.actions.android_actions import ( key_press, + long_tap, open_app_drawer, screenshot, setup, @@ -24,7 +25,7 @@ ANDROID_ENV = EnvironmentConfig( name="android", - action_space=[tap, key_press, write_text, swipe, open_app_drawer], + action_space=[tap, key_press, long_tap, write_text, swipe, open_app_drawer], observation_space=[screenshot], description="""A Google Pixel smartphone runs on the Android operating system. \ The interface displays a current screenshot at each step and primarily \ diff --git a/crab-benchmark-v0/dataset/android/1005c437-50d1-465a-b3fc-833098b22bfc.json b/crab-benchmark-v0/dataset/android/1005c437-50d1-465a-b3fc-833098b22bfc.json new file mode 100644 index 0000000..b1a808e --- /dev/null +++ b/crab-benchmark-v0/dataset/android/1005c437-50d1-465a-b3fc-833098b22bfc.json @@ -0,0 +1,23 @@ +{ + "description": "In the Android operating system, use the \"Google Map\" app to find the city name corresponding to the postal code \"63002\" in South Korea, then use the \"Calendar\" app to add a new all-day event for 1 January 2025 with the text of the found city name.", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "number": "63002", + "country": "South Korea" + }, + "output": "Jeju" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ac", + "attribute": { + "content": "Jeju", + "date": "1 January 2025" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "1005c437-50d1-465a-b3fc-833098b22bfc" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/12333aa0-e76d-4a5c-8657-9f897f62f62d.json b/crab-benchmark-v0/dataset/android/12333aa0-e76d-4a5c-8657-9f897f62f62d.json new file mode 100644 index 0000000..69aebbc --- /dev/null +++ b/crab-benchmark-v0/dataset/android/12333aa0-e76d-4a5c-8657-9f897f62f62d.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, use the \"Google Map\" app to find the city name for the postal code \"2770885\" in Japan, and then, using the \"Keep Notes\" app, create a new note without a title to record the city name you found.", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "number": "2770885", + "country": "Japan" + }, + "output": "Chiba" + }, + { + "task": "eb92a1e6-4c86-4d56-baac-95fc8397732e", + "attribute": { + "content": "Chiba" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "12333aa0-e76d-4a5c-8657-9f897f62f62d" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/2ade6a13-c7a6-4df7-8c62-77382687369e.json b/crab-benchmark-v0/dataset/android/2ade6a13-c7a6-4df7-8c62-77382687369e.json new file mode 100644 index 0000000..2275b01 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/2ade6a13-c7a6-4df7-8c62-77382687369e.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, using the \"Contacts\" app, find the email of the contact
named John Lauphin, then using the "Gmail" app, send an email to that contact with the subject "Hello John."", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ap", + "attribute": { + "name": "John Lauphin" + }, + "output": "crabbb@gmail.com" + }, + { + "task": "0090f116-e02b-4562-a20d-b5df38be963a", + "attribute": { + "content": "Hello John", + "mail": "crabbb@gmail.com" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "2ade6a13-c7a6-4df7-8c62-77382687369e" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/4190c90c-b28c-4bb3-ab5c-af3c4fde0a3d.json b/crab-benchmark-v0/dataset/android/4190c90c-b28c-4bb3-ab5c-af3c4fde0a3d.json new file mode 100644 index 0000000..5ff2b91 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/4190c90c-b28c-4bb3-ab5c-af3c4fde0a3d.json @@ -0,0 +1,15 @@ +{ + "description": "In Android, using the Google Map app, find the city name of the corresponding post code \"1010021\" in the country \"Japan\".", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "country": "Japan", + "number": "101-0021" + }, + "output": "Tokyo" + } + ], + "adjlist": "0", + "id": "4190c90c-b28c-4bb3-ab5c-af3c4fde0a3d" +} diff --git a/crab-benchmark-v0/dataset/android/483fbf9c-dc78-4ac2-9264-53c4f617f6cc.json b/crab-benchmark-v0/dataset/android/483fbf9c-dc78-4ac2-9264-53c4f617f6cc.json new file mode 100644 index 0000000..95962b7 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/483fbf9c-dc78-4ac2-9264-53c4f617f6cc.json @@ -0,0 +1,21 @@ +{ + "description": "Open the calendar app in the Android system and find the title of an event on the date \"17 August 2024,\" then using the \"Google Drive\" app on the same Android device, create a new folder with the found name.", + "tasks": [ + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "17 August 2024" + }, + "output": "Travel to Paris" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ar", + "attribute": { + "content": "Travel to Paris" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "483fbf9c-dc78-4ac2-9264-53c4f617f6cc" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/4893a9b0-6477-495d-a73c-32503326e24a.json b/crab-benchmark-v0/dataset/android/4893a9b0-6477-495d-a73c-32503326e24a.json new file mode 100644 index 0000000..5df87bb --- /dev/null +++ b/crab-benchmark-v0/dataset/android/4893a9b0-6477-495d-a73c-32503326e24a.json @@ -0,0 +1,14 @@ +{ + "description": "In the Android system, use the calendar app to find the title of an event on the date \"16 July 2024\".", + "tasks": [ + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "16 July 2024" + }, + "output": "Japan" + } + ], + "adjlist": "0", + "id": "4893a9b0-6477-495d-a73c-32503326e24a" +} diff --git a/crab-benchmark-v0/dataset/android/53010c40-dce4-4d72-a856-842c21059e2b.json b/crab-benchmark-v0/dataset/android/53010c40-dce4-4d72-a856-842c21059e2b.json new file mode 100644 index 0000000..05c7451 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/53010c40-dce4-4d72-a856-842c21059e2b.json @@ -0,0 +1,22 @@ +{ + "description": "In the Android system, use the calendar app to find the title of an event on the date \"16 July 2024,\" then, using the Google Map app, find the city name of the corresponding post code \"113-8654\" in the country with the same name as the title.", + "tasks": [ + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "16 July 2024" + }, + "output": "Japan" + },
{ + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "number": "113-8654", + "country": "Japan" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "53010c40-dce4-4d72-a856-842c21059e2b" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/71ef7fd2-0ae3-49c8-8238-06b7aa985d25.json b/crab-benchmark-v0/dataset/android/71ef7fd2-0ae3-49c8-8238-06b7aa985d25.json new file mode 100644 index 0000000..f13b507 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/71ef7fd2-0ae3-49c8-8238-06b7aa985d25.json @@ -0,0 +1,23 @@ +{ + "description": "Using the \"Google Map\" app on Android, find the distance of the shortest route from \"National University of Singapore\" to \"Nanyang Technology University,\" then using the \"Calendar\" app, add a new event with the text representing the found distance on the date 21 June 2024 as an all-day event.", + "tasks": [ + { + "task": "1a1b72d7-78c9-4027-8278-86083ae01045", + "attribute": { + "place_name_1": "National University of Singapore", + "place_name_2": "Nanyang Technology University" + }, + "output": "13km" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ac", + "attribute": { + "content": "13km", + "date": "21 June 2024" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "71ef7fd2-0ae3-49c8-8238-06b7aa985d25" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/7891ceab-7965-4ddb-a0fc-15740c9a4e44.json b/crab-benchmark-v0/dataset/android/7891ceab-7965-4ddb-a0fc-15740c9a4e44.json new file mode 100644 index 0000000..41c48a4 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/7891ceab-7965-4ddb-a0fc-15740c9a4e44.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, Using \"Google Map\" app, find the city name of corresponding post code \"560049\" in the country \"India\". 
Create a folder with the city name in the \"Google Drive\" app", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "country": "India", + "number": "560049" + }, + "output": "Bengaluru" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ar", + "attribute": { + "content": "Bengaluru" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "7891ceab-7965-4ddb-a0fc-15740c9a4e44" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/8bd51440-f959-4edc-baa5-cd03d32a5b0f.json b/crab-benchmark-v0/dataset/android/8bd51440-f959-4edc-baa5-cd03d32a5b0f.json new file mode 100644 index 0000000..c8f6ed8 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/8bd51440-f959-4edc-baa5-cd03d32a5b0f.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, use the \"Google Map\" app to find the address of the University of Sydney, then using the \"Gmail\" app, send a message to crabbb@gmail.com with the found address.", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548aw", + "attribute": { + "content": "The University of Sydney" + }, + "output": "Camperdown NSW 2050 Australia" + }, + { + "task": "0090f116-e02b-4562-a20d-b5df38be963a", + "attribute": { + "content": "Camperdown NSW 2050 Australia", + "mail": "crabbb@gmail.com" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "8bd51440-f959-4edc-baa5-cd03d32a5b0f" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/94b1836b-3111-40ad-8d07-b8a57efe7438.json b/crab-benchmark-v0/dataset/android/94b1836b-3111-40ad-8d07-b8a57efe7438.json new file mode 100644 index 0000000..137e20f --- /dev/null +++ b/crab-benchmark-v0/dataset/android/94b1836b-3111-40ad-8d07-b8a57efe7438.json @@ -0,0 +1,22 @@ +{ + "description": "In an Android system, use the calendar app to find the title of an event on the date \"9 August 2024\", and then, using the Gmail app, send an email to crabbb@gmail.com with the event title as the message.", + "tasks": [ + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "9 August 2024" + }, + "output": "National Day of Singapore would be a public holiday" + }, + { + "task": "0090f116-e02b-4562-a20d-b5df38be963a", + "attribute": { + "content": "National Day of Singapore would be a public holiday", + "mail": "crabbb@gmail.com" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "94b1836b-3111-40ad-8d07-b8a57efe7438" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/a225f7f8-6d03-4619-b57d-7a08610030d8.json b/crab-benchmark-v0/dataset/android/a225f7f8-6d03-4619-b57d-7a08610030d8.json new file mode 100644 index 0000000..e7ee4b8 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/a225f7f8-6d03-4619-b57d-7a08610030d8.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, using the \"Google Map\" app, find the address of \"University of Oxford\" and send \"98801234\" the address using the \"message\" App.
", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548aw", + "attribute": { + "content": "University of Oxford" + }, + "output": "Wellington Square, Oxford OX1 2JD, United Kingdom" + }, + { + "task": "caa29623-1811-402d-963a-19f7eecc63d8", + "attribute": { + "content": "Wellington Square, Oxford OX1 2JD, United Kingdom", + "number": "98801234" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "a225f7f8-6d03-4619-b57d-7a08610030d8" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/b3965b07-4683-4445-9de1-a1dedf6c73ad.json b/crab-benchmark-v0/dataset/android/b3965b07-4683-4445-9de1-a1dedf6c73ad.json new file mode 100644 index 0000000..aabd243 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/b3965b07-4683-4445-9de1-a1dedf6c73ad.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, using the \"Google Map\" app, find the address of \"University of Oxford\" and send \"abcdcly@qq.com\" the address using the \"Gmail\" App. ", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548aw", + "attribute": { + "content": "University of Oxford" + }, + "output": "Wellington Square, Oxford OX1 2JD, United Kingdom" + }, + { + "task": "0090f116-e02b-4562-a20d-b5df38be963a", + "attribute": { + "content": "Wellington Square, Oxford OX1 2JD, United Kingdom", + "mail": "abcdcly@qq.com" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "b3965b07-4683-4445-9de1-a1dedf6c73ad" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/cf4c496b-fbbd-4701-91ea-4590fe6a66e1.json b/crab-benchmark-v0/dataset/android/cf4c496b-fbbd-4701-91ea-4590fe6a66e1.json new file mode 100644 index 0000000..2d2c72f --- /dev/null +++ b/crab-benchmark-v0/dataset/android/cf4c496b-fbbd-4701-91ea-4590fe6a66e1.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, use the \"Google Map\" app to find the city name corresponding to the postcode \"110151\" in Colombia, then use the \"Clock\" app to set the time of that city in the clock and check the time gap between that city and your current city.", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "number": "110151", + "country": "Columbia" + }, + "output": "Bogota" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ah", + "attribute": { + "place_name": "Bogota" + }, + "output": "-5h" + } + ], + "adjlist": "0 1\n1", + "id": "cf4c496b-fbbd-4701-91ea-4590fe6a66e1" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/d0811e47-d75f-40ce-b34b-e1ee3c8bed3f.json b/crab-benchmark-v0/dataset/android/d0811e47-d75f-40ce-b34b-e1ee3c8bed3f.json new file mode 100644 index 0000000..8372ca5 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/d0811e47-d75f-40ce-b34b-e1ee3c8bed3f.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, first use the \"Files\" app to find the creation date of the file /Movies/movie_list.txt, then use the \"Calendar\" app to add a new event titled \"Public Talking\" scheduled for all day on the found day.", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ak", + "attribute": { + "file_path": "/Movies/movie_list.txt" + }, + "output": "4 June 2024" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ac", + "attribute": { + "content": "Public Talking", + "date": "4 June 2024" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "d0811e47-d75f-40ce-b34b-e1ee3c8bed3f" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/d7489d00-0046-4fb1-af5b-1fde7d87312c.json
b/crab-benchmark-v0/dataset/android/d7489d00-0046-4fb1-af5b-1fde7d87312c.json new file mode 100644 index 0000000..9050070 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/d7489d00-0046-4fb1-af5b-1fde7d87312c.json @@ -0,0 +1,21 @@ +{ + "description": "In Android, open the \"Contacts\" app to find the email address of the contact named Karoon Wei, then use the \"Tasks\" app to add a new task with the email address.", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ap", + "attribute": { + "name": "Karoon Wei" + }, + "output": "karroonw@gmail.com" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548af", + "attribute": { + "content": "karroonw@gmail.com" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "d7489d00-0046-4fb1-af5b-1fde7d87312c" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/d92f6c33-e0a7-4101-957d-e7dd218d2565.json b/crab-benchmark-v0/dataset/android/d92f6c33-e0a7-4101-957d-e7dd218d2565.json new file mode 100644 index 0000000..9e4fe8b --- /dev/null +++ b/crab-benchmark-v0/dataset/android/d92f6c33-e0a7-4101-957d-e7dd218d2565.json @@ -0,0 +1,21 @@ +{ + "description": "Using the \"Files\" app on an Android device, locate the file /Movies/movie_list.txt and determine its creation date, then use the Task app in the same Android system to find the title of an event scheduled for the days.", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ak", + "attribute": { + "file_path": "/Movies/movie_list.txt" + }, + "output": "4 June 2024" + }, + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "4 June 2024" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "d92f6c33-e0a7-4101-957d-e7dd218d2565" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/e55d7a39-7b6b-4852-8711-844cebc88cb8.json b/crab-benchmark-v0/dataset/android/e55d7a39-7b6b-4852-8711-844cebc88cb8.json new file mode 100644 index 0000000..450ac2c --- /dev/null +++ b/crab-benchmark-v0/dataset/android/e55d7a39-7b6b-4852-8711-844cebc88cb8.json @@ -0,0 +1,15 @@ +{ + "description": "In Android, use the \"Google Map\" app to find the city name corresponding to the postcode \"110151\" in Colombia.", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "number": "110151", + "country": "Columbia" + }, + "output": "Bogota" + } + ], + "adjlist": "0", + "id": "e55d7a39-7b6b-4852-8711-844cebc88cb8" +} diff --git a/crab-benchmark-v0/dataset/android/e9268070-91b7-4e8c-9976-1cf8126ba13b.json b/crab-benchmark-v0/dataset/android/e9268070-91b7-4e8c-9976-1cf8126ba13b.json new file mode 100644 index 0000000..334ef3d --- /dev/null +++ b/crab-benchmark-v0/dataset/android/e9268070-91b7-4e8c-9976-1cf8126ba13b.json @@ -0,0 +1,21 @@ +{ + "description": "In the Android system, use the task app to find the title of an event on the date \"15 June 2024\", then using the \"Google Drive\" app, create a new folder named as the title we found.", + "tasks": [ + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "15 June 2024" + }, + "output": "EMNLP24 DDL" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ar", + "attribute": { + "content": "EMNLP24 DDL" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "e9268070-91b7-4e8c-9976-1cf8126ba13b" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/fbe6a1b1-63bb-4d4e-8a53-ff4f7839ef61.json b/crab-benchmark-v0/dataset/android/fbe6a1b1-63bb-4d4e-8a53-ff4f7839ef61.json new file mode 
100644 index 0000000..2acd5b4 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/fbe6a1b1-63bb-4d4e-8a53-ff4f7839ef61.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, open the \"Contacts\" app to find the email address of a contact named Luis Martin, then use the \"Messages\" app to send the found email address to the phone number \"04055891132\".", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ap", + "attribute": { + "name": "Luis Martin" + }, + "output": "lmartin0431@gmail.com" + }, + { + "task": "caa29623-1811-402d-963a-19f7eecc63d8", + "attribute": { + "content": "lmartin0431@gmail.com", + "number": "04055891132" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "fbe6a1b1-63bb-4d4e-8a53-ff4f7839ef61" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android_subtasks.py b/crab-benchmark-v0/dataset/android_subtasks.py index 02c7960..80be4be 100644 --- a/crab-benchmark-v0/dataset/android_subtasks.py +++ b/crab-benchmark-v0/dataset/android_subtasks.py @@ -361,6 +361,8 @@ def check_event(date: str, env) -> bool: event_nodes = root.xpath('//node[@class="android.support.v7.widget.RecyclerView"]') if event_nodes is None: return False + if not event_nodes: + return False for node in event_nodes[0]: text = node.get("content-desc") if date in text: diff --git a/crab-benchmark-v0/dataset/cross/05a7633d-b966-471c-8848-e18e69ad265f.json b/crab-benchmark-v0/dataset/cross/05a7633d-b966-471c-8848-e18e69ad265f.json index 2e6c9f0..5837689 100644 --- a/crab-benchmark-v0/dataset/cross/05a7633d-b966-471c-8848-e18e69ad265f.json +++ b/crab-benchmark-v0/dataset/cross/05a7633d-b966-471c-8848-e18e69ad265f.json @@ -1,5 +1,5 @@ { - "description": "In Android, use the \"Google Map\" app to find the city name corresponding to the postal code \"1010021\" in Japan, then paste the name into LibreOffice Writer on an Ubuntu system and save it as an ODT file at \"/home/crab/Desktop\".", + "description": "In Android, use the \"Google Map\" app to find the city name corresponding to the postal code \"1010021\" in Japan, then paste the name into LibreOffice Writer on an Ubuntu system and save it as an ODT file at \"/home/crab/Desktop/target.opt\".", "tasks": [ { "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", diff --git a/crab-benchmark-v0/dataset/handmade_tasks.py b/crab-benchmark-v0/dataset/handmade_tasks.py index ef20a70..f3a15f4 100644 --- a/crab-benchmark-v0/dataset/handmade_tasks.py +++ b/crab-benchmark-v0/dataset/handmade_tasks.py @@ -24,8 +24,10 @@ from .android_subtasks import ( check_current_package_name, + check_google_tasks_name, check_message_text_box_contain, check_message_text_box_empty, + check_note_content, get_xml_etree, ) from .ubuntu_subtasks import * # noqa: F403 @@ -148,6 +150,44 @@ def check_keep_notes_content(text: str, env) -> bool: return text_nodes[0].get("text") == text +@evaluator(env_name="android") +def check_keep_notes_contain_fd(env) -> bool: + global RESULT_fd0576be + text = RESULT_fd0576be + root = get_xml_etree(env) + if root is None or text is None: + return False + edit_node = root.xpath( + '//node[@resource-id="com.google.android.keep:id/editor_bottom_bar"]' + ) + if len(edit_node) != 1: + return False + content_node = root.xpath( + '//node[@resource-id="com.google.android.keep:id/browse_note_interior_content"]' + ) + for node in content_node: + text_nodes = node.getchildren() + if len(text_nodes) != 1: + continue + if text in text_nodes[0].get("text"): + return True + return False + + +@evaluator(env_name="android") +def 
check_alarm_contains(time: str, env) -> bool: + root = get_xml_etree(env) + if root is None or time is None: + return False + clock_node = root.xpath( + '//node[@resource-id="com.google.android.deskclock:id/digital_clock"]' + ) + for node in clock_node: + if time == node.get("text"): + return True + return False + + @evaluator(env_name="android", local=True) def check_tap_text(text: str, env) -> bool: if env.trajectory: @@ -361,6 +401,112 @@ def evaluator_ca79febf(): return result +def evaluator_dfabf84c(): + result = nx.DiGraph() + keyword = "kaust" + a = check_text_in_current_window_name("Mozilla Firefox") + b = check_contain_input_text(keyword) + c = is_img_url_in_clipboard() + d = download_from_clipboard_and_verify_file("/home/crab/Desktop/download.jpg") + e = check_current_package_name("com.google.android.keep") + f = check_contain_input_text(keyword) + g = check_note_content(keyword) + result.add_edges_from([(a, b), (b, c), (c, d), (d, g)]) + result.add_edges_from([(b, e), (e, f), (f, g)]) + return result + + +def evaluator_aab5555e(): + result = nx.DiGraph() + a = check_current_window_process("gnome-terminal-server") + b = check_contain_input_text("uname -a") + d = check_current_package_name("com.google.android.apps.messaging") + e = check_message_text_box_contain("ubuntu") + f = check_message_text_box_contain("x86") + g = check_message_text_box_contain("linux") + h = check_message_text_box_contain("crab") + sink = check_message_text_box_empty() + result.add_edges_from( + [ + (a, b), + (b, sink), + (d, e), + (d, f), + (d, g), + (d, h), + (e, sink), + (f, sink), + (g, sink), + (h, sink), + ] + ) + return result + + +RESULT_fd0576be = None + + +@action(env_name="ubuntu") +def get_root_usage() -> str: + try: + output = subprocess.check_output(["df", "/"], text=True) + return output.split("\n")[1].split()[4][:-1] + except Exception: + return None + + +@evaluator(env_name="ubuntu", local=True) +def check_contain_input_text_and_get_df_result(text: str, env) -> bool: + global RESULT_fd0576be + RESULT_fd0576be = env._action_endpoint(get_root_usage, parameters={}) + if env.trajectory: + inputs = [ + params["text"].lower() + for action_name, params, _ in env.trajectory + if action_name == "write_text" + ] + return any(text.lower() in input_text for input_text in inputs) + + return False + + +def evaluator_fd0576be(): + result = nx.DiGraph() + a = check_current_window_process("gnome-terminal-server") + b = check_contain_input_text_and_get_df_result("df") + c = check_current_package_name("com.google.android.keep") + d = check_keep_notes_contain_fd() + result.add_edges_from([(a, b), (b, d), (c, d)]) + return result + + +def evaluator_7e08f7d4(): + result = nx.DiGraph() + a = check_text_in_current_window_name("Mozilla Firefox") + b = check_contain_input_text( + "https://farm9.staticflickr.com/8293/7591378270_76059bc1cf_z.jpg" + ) + c = check_current_package_name("com.android.deskclock.DeskClock") + d = check_alarm_contains("7:00\u200aAM") + result.add_edges_from([(a, b), (b, d), (c, d)]) + return result + + +def evaluator_4957e964(): + result = nx.DiGraph() + a = check_current_window_process("gnome-terminal-server") + b = check_contain_input_text("wget") + c = check_contain_input_text( + "https://farm8.staticflickr.com/7451/10001676353_fd762e02f0_z.jpg" + ) + d = check_file_exist("/home/crab/Desktop/download.jpg") + e = check_text_in_current_window_name("Image Viewer") + f = check_current_package_name("com.google.android.apps.tasks") + g = check_google_tasks_name("tennis") + 
result.add_edges_from([(a, b), (b, c), (c, d), (d, e), (e, g), (f, g)]) + return result + + # Hand-made environment setup guide: # Ubuntu # * Make sure the Ubuntu slack login, and the default channel has at least two messages @@ -370,7 +516,40 @@ def evaluator_ca79febf(): # * Make sure the init page of "Calendar" app is "Day" view. There should be at least one element today. -handmade_tasks = [ +ubuntu_handmade_tasks = [ + Task( + id="82efbd82-c941-4be9-9ac0-a495dc629e02", + description='Download an image file from a given URL "https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg" to "/home/crab/Downloads/raw.jpg", then use GIMP (GNU Image Manipulation Program) to adjust the brightness of the image from "/home/crab/Downloads/raw.jpg" to be brighter and save the edited file to "/home/crab/Pictures/edited.jpg", and set the adjusted image "/home/crab/Pictures/edited.jpg" as the screen background of the system.', + evaluator=evaluator_82efbd82(), + ), + Task( + id="515a5467-b7ce-4cad-874d-da894361c1a3", + description='Download two image files from given URLs "https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg" and "https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Flag_of_Ethiopia.svg/250px-Flag_of_Ethiopia.svg.png" to "/home/crab/Downloads/img_1.jpg" and "/home/crab/Downloads/img_2.jpg", combine the first image ("/home/crab/Downloads/img_1.jpg") with the second image ("/home/crab/Downloads/img_2.jpg") using GIMP (GNU Image Manipulation Program) by placing the first image on the right side of the second image, and save the resulting combined image to "/home/crab/Downloads/combined_editing.jpg". Then, create a new directory "/home/crab/jpg" and copy all files with the specified "jpg" extension from "/home/crab/Downloads" to the newly created directory "/home/crab/jpg".', + evaluator=evaluator_515a5467(), + ), + Task( + id="5a1eba49-ed2d-4955-a684-32472090a45b", + description='Use Firefox to search for an image using the keyword "GPU", copy the URL of the found image to the clipboard, download the image file from the URL stored in the clipboard to "/home/crab/Pictures/GPU.png", and create a new directory "/home/crab/Pictures/png_files" to copy all files with the specified "png" extension from "/home/crab/Pictures" to the newly created directory "/home/crab/Pictures/png_files".', + evaluator=evaluator_5a1eba49(), + ), + Task( + id="c347f78a-4643-43c8-b41e-e437b70a2c5e", + description='Open a file at "/home/crab/assets/content.txt" using vim in a terminal, write the specified "An air quality health advisory is in effect Tuesday for New York City and the lower Hudson Valley, as well as western Connecticut and northern New Jersey, meaning it may not be safe for people with some conditions to be outside long." to it, then save and exit vim. Print the content of the file by printing it to the command line interface through a terminal, and finally, submit the printed content.', + evaluator=evaluator_c347f78a(), + ), + Task( + id="bf83c176-fa15-4057-996f-f75be4338c05", + description='Use Firefox to search for an image using the keyword "Waymo" first, copy the URL of the image to the clipboard, and download the image to "/home/crab/Desktop/waymo.jpg". Then, search for another image using the keyword "Tesla", copy the URL of the image to the clipboard, and download the image to "/home/crab/Desktop/tesla.png". 
Finally, combine the two images using LibreOffice Impress, placing Image 1 from "/home/crab/Desktop/waymo.jpg" on the left side of Image 2 "/home/crab/Desktop/tesla.png", and save the resulting file in PDF format to "/home/crab/Documents/self_driving.pdf".', + evaluator=evaluator_bf83c176(), + ), + Task( + id="74bb11dd-89ca-43d0-8edf-fe7b5201ecf7", + description='Use Firefox to search for information about the country "France" on Wikipedia. Extract the capital city and population, and save this information in an ODS file at "/home/crab/Documents/FR.ods" using LibreOffice Calc. Then, search for information about the country "Mexico" on Wikipedia, extract the capital city and population, and save this information in a separate ODS file at "/home/crab/Documents/MX.ods" using LibreOffice Calc. The format of the file are, first column for the country name, the second for the capital city name, and the third for the population without any header. Finally, create a new directory "/home/crab/Desktop/country_info" and copy all files with the specified "ods" extension from "/home/crab/Documents" to the newly created directory "/home/crab/Desktop/country_info".', + evaluator=evaluator_74bb11dd(), + ), +] + +corss_environment_tasks = [ Task( id="79832e15-5fd3-43b8-b3e3-66249edfe1db", description='Open slack in Ubuntu desktop, summarize the last two messages in current channel, then use "Messages" app in android phone to send the summary to the first contact in the list.', @@ -401,38 +580,35 @@ def evaluator_ca79febf(): evaluator=evaluator_97e6f333(), ), Task( - id="82efbd82-c941-4be9-9ac0-a495dc629e02", - description='Download an image file from a given URL "https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg" to "/home/crab/Downloads/raw.jpg", then use GIMP (GNU Image Manipulation Program) to adjust the brightness of the image from "/home/crab/Downloads/raw.jpg" to be brighter and save the edited file to "/home/crab/Pictures/edited.jpg", and set the adjusted image "/home/crab/Pictures/edited.jpg" as the screen background of the system.', - evaluator=evaluator_82efbd82(), + id="ca79febf-cae7-4669-8812-d3ec85ee2868", + description="Open the first note in the Keep Notes app on Android, copy its contents, and paste them into a new document in Google docs. Then, open the newly created document in Firefox on Ubuntu.", + evaluator=evaluator_ca79febf(), ), Task( - id="515a5467-b7ce-4cad-874d-da894361c1a3", - description='Download two image files from given URLs "https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg" and "https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Flag_of_Ethiopia.svg/250px-Flag_of_Ethiopia.svg.png" to "/home/crab/Downloads/img_1.jpg" and "/home/crab/Downloads/img_2.jpg", combine the first image ("/home/crab/Downloads/img_1.jpg") with the second image ("/home/crab/Downloads/img_2.jpg") using GIMP (GNU Image Manipulation Program) by placing the first image on the right side of the second image, and save the resulting combined image to "/home/crab/Downloads/combined_editing.jpg". 
Then, create a new directory "/home/crab/jpg" and copy all files with the specified "jpg" extension from "/home/crab/Downloads" to the newly created directory "/home/crab/jpg".',
-        evaluator=evaluator_515a5467(),
+        id="dfabf84c-d05f-4e25-9f21-ba0f08107bd5",
+        description='Use Firefox to search for an image using the keyword "kaust" and copy the URL of the image to the clipboard. Download a file from the URL stored in the clipboard to "/home/crab/Desktop/download.jpg". Then describe this image and save it in the Android Keep Notes app.',
+        evaluator=evaluator_dfabf84c(),
     ),
     Task(
-        id="5a1eba49-ed2d-4955-a684-32472090a45b",
-        description='Use Firefox to search for an image using the keyword "GPU", copy the URL of the found image to the clipboard, download the image file from the URL stored in the clipboard to "/home/crab/Pictures/GPU.png", and create a new directory "/home/crab/Pictures/png_files" to copy all files with the specified "png" extension from "/home/crab/Pictures" to the newly created directory "/home/crab/Pictures/png_files".',
-        evaluator=evaluator_5a1eba49(),
+        id="aab5555e-4b72-4ebf-816a-59c1da2cec86",
+        description="Check all the uname information of the system in Ubuntu, then explain the information to the first contact in the list of the Messages app in Android.",
+        evaluator=evaluator_aab5555e(),
     ),
     Task(
-        id="c347f78a-4643-43c8-b41e-e437b70a2c5e",
-        description='Open a file at "/home/crab/assets/content.txt" using vim in a terminal, write the specified "An air quality health advisory is in effect Tuesday for New York City and the lower Hudson Valley, as well as western Connecticut and northern New Jersey, meaning it may not be safe for people with some conditions to be outside long." to it, then save and exit vim. Print the content of the file by printing it to the command line interface through a terminal, and finally, submit the printed content.',
-        evaluator=evaluator_c347f78a(),
+        id="fd0576be-8b2c-45ce-b4a2-78659740879b",
+        description="Check the current disk usage through the command line in Ubuntu, check the root directory usage as a percentage, and save the information to a note in the Keep Notes app in Android.",
+        evaluator=evaluator_fd0576be(),
     ),
     Task(
-        id="bf83c176-fa15-4057-996f-f75be4338c05",
-        description='Use Firefox to search for an image using the keyword "Waymo" first, copy the URL of the image to the clipboard, and download the image to "/home/crab/Desktop/waymo.jpg". Then, search for another image using the keyword "Tesla", copy the URL of the image to the clipboard, and download the image to "/home/crab/Desktop/tesla.png". Finally, combine the two images using LibreOffice Impress, placing Image 1 from "/home/crab/Desktop/waymo.jpg" on the left side of Image 2 "/home/crab/Desktop/tesla.png", and save the resulting file in PDF format to "/home/crab/Documents/self_driving.pdf".',
-        evaluator=evaluator_bf83c176(),
+        id="7e08f7d4-9b11-4aec-9b42-6cbde083fb4c",
+        description='Use Firefox on Ubuntu to open up the image "https://farm9.staticflickr.com/8293/7591378270_76059bc1cf_z.jpg", check the time of the clock in the image, then open the clock app in Android and set an alarm to the same time as the image.',
+        evaluator=evaluator_7e08f7d4(),
     ),
     Task(
-        id="74bb11dd-89ca-43d0-8edf-fe7b5201ecf7",
-        description='Use Firefox to search for information about the country "France" on Wikipedia. Extract the capital city and population, and save this information in an ODS file at "/home/crab/Documents/FR.ods" using LibreOffice Calc. 
Then, search for information about the country "Mexico" on Wikipedia, extract the capital city and population, and save this information in a separate ODS file at "/home/crab/Documents/MX.ods" using LibreOffice Calc. The format of the file are, first column for the country name, the second for the capital city name, and the third for the population without any header. Finally, create a new directory "/home/crab/Desktop/country_info" and copy all files with the specified "ods" extension from "/home/crab/Documents" to the newly created directory "/home/crab/Desktop/country_info".',
-        evaluator=evaluator_74bb11dd(),
-    ),
-    Task(
-        id="ca79febf-cae7-4669-8812-d3ec85ee2868",
-        description="Open the first note in the Keep Notes app on Android, copy its contents, and paste them into a new document in Google docs. Then, open the newly created document in Firefox on Ubuntu.",
-        evaluator=evaluator_ca79febf(),
+        id="4957e964-5dd5-42f6-9d5d-f6a53a9a5d94",
+        description='Use wget to download the image "https://farm8.staticflickr.com/7451/10001676353_fd762e02f0_z.jpg" to /home/crab/Desktop/download.jpg. What do the people in the image do? Create a task in the Tasks app in Android to remind you to do the same thing.',
+        evaluator=evaluator_4957e964(),
     ),
 ]
+
+handmade_tasks = ubuntu_handmade_tasks + cross_environment_tasks
diff --git a/crab-benchmark-v0/dataset/ubuntu/0deafe05-8db5-445f-9031-f6e884569d03.json b/crab-benchmark-v0/dataset/ubuntu/0deafe05-8db5-445f-9031-f6e884569d03.json
new file mode 100644
index 0000000..391e321
--- /dev/null
+++ b/crab-benchmark-v0/dataset/ubuntu/0deafe05-8db5-445f-9031-f6e884569d03.json
@@ -0,0 +1,25 @@
+{
+    "description": "Create a new directory \"/home/crab/jpg_folder\", copy all files with the \"jpg\" extension from \"/home/crab/Pictures\" to this newly created directory, then open LibreOffice Impress to combine the two images located at \"/home/crab/jpg_folder/dog.jpg\" (Image 1) and \"/home/crab/jpg_folder/Interstellar.jpg\" (Image 2), placing Image 1 on the right side of Image 2, and save the combined image in PDF format to \"/home/crab/Documents/combination.pdf\".",
+    "tasks": [
+        {
+            "task": "217ababc-ccc7-4b9f-af07-c239d92848fe",
+            "attribute": {
+                "file_extension": "jpg",
+                "source_dir": "/home/crab/Pictures",
+                "target_dir": "/home/crab/jpg_folder"
+            },
+            "output": "/home/crab/jpg_folder"
+        },
+        {
+            "task": "467f17a6-c42f-4eda-996f-a53385eb3efd",
+            "attribute": {
+                "image_path_1": "/home/crab/jpg_folder/dog.jpg",
+                "image_path_2": "/home/crab/jpg_folder/Interstellar.jpg",
+                "output_path": "/home/crab/Documents/combination.pdf"
+            },
+            "output": null
+        }
+    ],
+    "adjlist": "0 1\n1",
+    "id": "0deafe05-8db5-445f-9031-f6e884569d03"
+}
\ No newline at end of file
diff --git a/crab-benchmark-v0/dataset/ubuntu/15a150a8-899c-4753-8dc5-05248ccc3640.json b/crab-benchmark-v0/dataset/ubuntu/15a150a8-899c-4753-8dc5-05248ccc3640.json
new file mode 100644
index 0000000..8f88774
--- /dev/null
+++ b/crab-benchmark-v0/dataset/ubuntu/15a150a8-899c-4753-8dc5-05248ccc3640.json
@@ -0,0 +1,22 @@
+{
+    "description": "Download the file from \"https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg\" to the location \"/home/crab/Downloads/fiji.png\", and then set \"/home/crab/Downloads/fiji.png\" as the desktop background on the system.",
+    "tasks": [
+        {
+            "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1",
+            "attribute": {
+                "url": 
"https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg", + "file_path": "/home/crab/Downloads/fiji.png" + }, + "output": "/home/crab/Downloads/fiji.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Downloads/fiji.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "15a150a8-899c-4753-8dc5-05248ccc3640" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/299db8f2-81eb-455f-9302-5c8cb30be691.json b/crab-benchmark-v0/dataset/ubuntu/299db8f2-81eb-455f-9302-5c8cb30be691.json new file mode 100644 index 0000000..aff7c6e --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/299db8f2-81eb-455f-9302-5c8cb30be691.json @@ -0,0 +1,23 @@ +{ + "description": "Combine two images, Image 1 \"/home/crab/Pictures/Interstellar.jpg\" and Image 2 \"/home/crab/Pictures/cat.png\", using GIMP (GNU Image Manipulation Program) with Image 1 placed on the left side of Image 2, and save the resulting image to \"/home/crab/Pictures/edited_background.png\". Then, set \"/home/crab/Pictures/edited_background.png\" as the desktop background on the system.", + "tasks": [ + { + "task": "4cf246ea-0a7f-43da-84b6-61d74a2699af", + "attribute": { + "image_path_1": "/home/crab/Pictures/Interstellar.jpg", + "image_path_2": "/home/crab/Pictures/cat.png", + "output_path": "/home/crab/Pictures/edited_background.png" + }, + "output": "/home/crab/Pictures/edited_background.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Pictures/edited_background.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "299db8f2-81eb-455f-9302-5c8cb30be691" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/29f099b2-b3a5-463f-b10a-15363bf7e845.json b/crab-benchmark-v0/dataset/ubuntu/29f099b2-b3a5-463f-b10a-15363bf7e845.json new file mode 100644 index 0000000..0b9ee8d --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/29f099b2-b3a5-463f-b10a-15363bf7e845.json @@ -0,0 +1,22 @@ +{ + "description": "Use Firefox to search for a \"garden\" around \"ETH Zurich\" on Google Maps, copy the sharing URL of that \"garden\" to the clipboard, then paste the content into Visual Studio Code (VS Code) and save the file at \"/home/crab/eth_garden.txt\".", + "tasks": [ + { + "task": "2b189dc2-c77f-4fa3-8432-ba4355cc294c", + "attribute": { + "place_type": "garden", + "place_name": "ETH Zurich" + }, + "output": null + }, + { + "task": "8491e674-596b-452b-9e0e-58a44d90f947", + "attribute": { + "file_path": "/home/crab/eth_garden.txt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "29f099b2-b3a5-463f-b10a-15363bf7e845" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/51c91051-3efb-4e92-a967-739b18520714.json b/crab-benchmark-v0/dataset/ubuntu/51c91051-3efb-4e92-a967-739b18520714.json new file mode 100644 index 0000000..6c7ce88 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/51c91051-3efb-4e92-a967-739b18520714.json @@ -0,0 +1,19 @@ +{ + "description": "Open Firefox and search for the torch.matmul example provided by the official PyTorch version 1.13 documentation, copy all the lines of code from the example, open Visual Studio Code (VS Code), paste the clipboard content into a new file, and save it as \"/home/crab/example.py\".", + "tasks": [ + { + "task": "49b614c5-c4bb-4c20-aab8-ab9dcc7de1b5", + "attribute": {}, + "output": null + }, + { + "task": 
"8491e674-596b-452b-9e0e-58a44d90f947", + "attribute": { + "file_path": "/home/crab/example.py" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "51c91051-3efb-4e92-a967-739b18520714" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/57b7e8a7-8c17-4cc4-9bb5-4385afde3ad8.json b/crab-benchmark-v0/dataset/ubuntu/57b7e8a7-8c17-4cc4-9bb5-4385afde3ad8.json new file mode 100644 index 0000000..8c52c37 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/57b7e8a7-8c17-4cc4-9bb5-4385afde3ad8.json @@ -0,0 +1,25 @@ +{ + "description": "Create a new directory \"/home/crab/assets_for_edit\" and copy all files with the \"png\" extension from \"/home/crab/assets\" to this new directory. Then, combining Image 1 \"/home/crab/assets_for_edit/background.png\" and Image 2 \"/home/crab/assets_for_edit/campus.png\" with LibreOffice Writer, place Image 1 above Image 2, and save the file in the ODT format to \"/home/crab/assets_for_edit/back_n_campus.odt\".", + "tasks": [ + { + "task": "217ababc-ccc7-4b9f-af07-c239d92848fe", + "attribute": { + "file_extension": "png", + "source_dir": "/home/crab/assets", + "target_dir": "/home/crab/assets_for_edit" + }, + "output": "/home/crab/assets_for_edit" + }, + { + "task": "0111384f-38ca-41a2-9504-cb1c55002b3c", + "attribute": { + "image_path_1": "/home/crab/assets_for_edit/background.png", + "image_path_2": "/home/crab/assets_for_edit/campus.png", + "output_path": "/home/crab/assets_for_edit/back_n_campus.odt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "57b7e8a7-8c17-4cc4-9bb5-4385afde3ad8" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/5ba74c6a-4513-448b-8b68-ff145ece0652.json b/crab-benchmark-v0/dataset/ubuntu/5ba74c6a-4513-448b-8b68-ff145ece0652.json new file mode 100644 index 0000000..0a16cf3 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/5ba74c6a-4513-448b-8b68-ff145ece0652.json @@ -0,0 +1,22 @@ +{ + "description": "Download the file from \"https://raw.githubusercontent.com/camel-ai/camel/master/README.md\" to \"/home/crab/Documents/README.md\", and then print the content of \"/home/crab/Documents/README.md\" to the command line interface through a terminal.", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://raw.githubusercontent.com/camel-ai/camel/master/README.md", + "file_path": "/home/crab/Documents/README.md" + }, + "output": "/home/crab/Documents/README.md" + }, + { + "task": "5b527839-0e58-426d-bab6-7160200b0d24", + "attribute": { + "file_path": "/home/crab/Documents/README.md" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "5ba74c6a-4513-448b-8b68-ff145ece0652" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/6428f803-62de-40d2-a345-64e6cf955c9d.json b/crab-benchmark-v0/dataset/ubuntu/6428f803-62de-40d2-a345-64e6cf955c9d.json new file mode 100644 index 0000000..04f5684 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/6428f803-62de-40d2-a345-64e6cf955c9d.json @@ -0,0 +1,24 @@ +{ + "description": "First, use LibreOffice Impress to adjust the brightness of the image located at \"/home/crab/Pictures/cat.png\" to make it darker, and save the edited image as \"/home/crab/Pictures/cat_edited.png\". 
Then, using GIMP (GNU Image Manipulation Program), combine the image \"/home/crab/Pictures/dog.png\" with \"/home/crab/Pictures/cat_edited.png\" by placing the dog image on the left side of the cat image, and save the merged image to \"/home/crab/Pictures/dog_cat.png\".", + "tasks": [ + { + "task": "434402f3-647a-4a9a-9d8f-10f5bb6c7cf0", + "attribute": { + "image_path_before_edit": "/home/crab/Pictures/cat.png", + "image_path_after_edit": "/home/crab/Pictures/cat_edited.png" + }, + "output": "/home/crab/Pictures/cat_edited.png" + }, + { + "task": "4cf246ea-0a7f-43da-84b6-61d74a2699af", + "attribute": { + "image_path_1": "/home/crab/Pictures/dog.png", + "image_path_2": "/home/crab/Pictures/cat_edited.png", + "output_path": "/home/crab/Pictures/dog_cat.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "6428f803-62de-40d2-a345-64e6cf955c9d" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/696ca9bb-89ea-4cd5-b693-f2d749d964b1.json b/crab-benchmark-v0/dataset/ubuntu/696ca9bb-89ea-4cd5-b693-f2d749d964b1.json new file mode 100644 index 0000000..77d2049 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/696ca9bb-89ea-4cd5-b693-f2d749d964b1.json @@ -0,0 +1,22 @@ +{ + "description": "Adjust the brightness of the image located at \"/home/crab/assets/campus.png\" using GIMP (GNU Image Manipulation Program) to make it brighter, save the adjusted image to \"/home/crab/Pictures/campus_brighter.png\", and then set this enhanced image as the desktop background on an Ubuntu system.", + "tasks": [ + { + "task": "cc1adae7-bef9-4c8a-865d-00d44486dd69", + "attribute": { + "image_path_before_edit": "/home/crab/assets/campus.png", + "image_path_after_edit": "/home/crab/Pictures/campus_brighter.png" + }, + "output": "/home/crab/Pictures/campus_brighter.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Pictures/campus_brighter.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "696ca9bb-89ea-4cd5-b693-f2d749d964b1" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/6c3105a2-328c-4190-823d-03d759be0b57.json b/crab-benchmark-v0/dataset/ubuntu/6c3105a2-328c-4190-823d-03d759be0b57.json new file mode 100644 index 0000000..6eea98c --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/6c3105a2-328c-4190-823d-03d759be0b57.json @@ -0,0 +1,21 @@ +{ + "description": "Use Firefox to search for an image with the keyword \"reinforcement learning,\" copy the URL of the chosen image to the clipboard, and download the image from the URL in the clipboard to \"/home/crab/Downloads/RL.png\" on an Ubuntu system.", + "tasks": [ + { + "task": "017102b6-d2c3-466b-96f7-37c8bcddc41a", + "attribute": { + "keyword": "reinforcement learning" + }, + "output": null + }, + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19acsd", + "attribute": { + "file_path": "/home/crab/Downloads/RL.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "6c3105a2-328c-4190-823d-03d759be0b57" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/6c560516-ca14-4f97-b51d-16ad81fc29e4.json b/crab-benchmark-v0/dataset/ubuntu/6c560516-ca14-4f97-b51d-16ad81fc29e4.json new file mode 100644 index 0000000..9b52848 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/6c560516-ca14-4f97-b51d-16ad81fc29e4.json @@ -0,0 +1,22 @@ +{ + "description": "Open \"/home/crab/assets/a.txt\" using vim in a terminal, write \"The most recent COMPUTEX was held from 30 May to 2 June 2023 with sessions about such 
topics as high-performance computing, artificial intelligence, next-gen connectivity and sustainability.\", then save and exit vim, and print the content of \"/home/crab/assets/a.txt\" to the command line interface.", + "tasks": [ + { + "task": "0f589bf9-9b26-4581-8b78-2961b115ab49", + "attribute": { + "file_path": "/home/crab/assets/a.txt", + "content": "The most recent COMPUTEX was held from 30 May to 2 June 2023 with sessions about such topics as high-performance computing, artificial intelligence, next-gen connectivity and sustainability." + }, + "output": "/home/crab/assets/a.txt" + }, + { + "task": "5b527839-0e58-426d-bab6-7160200b0d24", + "attribute": { + "file_path": "/home/crab/assets/a.txt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "6c560516-ca14-4f97-b51d-16ad81fc29e4" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/730172f5-894a-4d46-9102-ac7d985a479d.json b/crab-benchmark-v0/dataset/ubuntu/730172f5-894a-4d46-9102-ac7d985a479d.json new file mode 100644 index 0000000..de96602 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/730172f5-894a-4d46-9102-ac7d985a479d.json @@ -0,0 +1,23 @@ +{ + "description": "Download the image of Jupiter from \"https://upload.wikimedia.org/wikipedia/commons/thumb/2/2b/Jupiter_and_its_shrunken_Great_Red_Spot.jpg/640px-Jupiter_and_its_shrunken_Great_Red_Spot.jpg\" to \"/home/crab/Pictures/jupiter.jpg\", then use LibreOffice Impress to adjust the brightness of this image to make it darker and save the edited version as \"/home/crab/Pictures/jupiter_edited.jpg\".", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/2/2b/Jupiter_and_its_shrunken_Great_Red_Spot.jpg/640px-Jupiter_and_its_shrunken_Great_Red_Spot.jpg", + "file_path": "/home/crab/Pictures/jupiter.jpg" + }, + "output": "/home/crab/Pictures/jupiter.jpg" + }, + { + "task": "434402f3-647a-4a9a-9d8f-10f5bb6c7cf0", + "attribute": { + "image_path_before_edit": "/home/crab/Pictures/jupiter.jpg", + "image_path_after_edit": "/home/crab/Pictures/jupiter_edited.jpg" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "730172f5-894a-4d46-9102-ac7d985a479d" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/73038efb-ca0f-4d90-a947-fcfd097dd91b.json b/crab-benchmark-v0/dataset/ubuntu/73038efb-ca0f-4d90-a947-fcfd097dd91b.json new file mode 100644 index 0000000..4478eda --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/73038efb-ca0f-4d90-a947-fcfd097dd91b.json @@ -0,0 +1,19 @@ +{ + "description": "Open Firefox and navigate to the official PyTorch version 1.13 documentation to find an example of `torch.matmul`. Copy all the lines of code in the example to the clipboard. 
Then, paste the clipboard content into Visual Studio Code (VS Code) and save it as a file at \"/home/crab/example_code.txt\".", + "tasks": [ + { + "task": "49b614c5-c4bb-4c20-aab8-ab9dcc7de1b5", + "attribute": {}, + "output": null + }, + { + "task": "8491e674-596b-452b-9e0e-58a44d90f947", + "attribute": { + "file_path": "/home/crab/example_code.txt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "73038efb-ca0f-4d90-a947-fcfd097dd91b" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/73da97c9-f084-4cab-8697-1151737387ff.json b/crab-benchmark-v0/dataset/ubuntu/73da97c9-f084-4cab-8697-1151737387ff.json new file mode 100644 index 0000000..00d368c --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/73da97c9-f084-4cab-8697-1151737387ff.json @@ -0,0 +1,22 @@ +{ + "description": "Download the file from \"https://images.top1market.com/images/cms/uploads/20230928/4950e1db0038feb506fdcfa0c936fd8e.png\" to \"/home/crab/Desktop/meta.png\", then set this image, \"/home/crab/Desktop/meta.png\", as the desktop background on the system.", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://images.top1market.com/images/cms/uploads/20230928/4950e1db0038feb506fdcfa0c936fd8e.png", + "file_path": "/home/crab/Desktop/meta.png" + }, + "output": "/home/crab/Desktop/meta.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Desktop/meta.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "73da97c9-f084-4cab-8697-1151737387ff" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/78502f1c-879b-4932-a5fd-d85f7f6b0f81.json b/crab-benchmark-v0/dataset/ubuntu/78502f1c-879b-4932-a5fd-d85f7f6b0f81.json new file mode 100644 index 0000000..37acf3d --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/78502f1c-879b-4932-a5fd-d85f7f6b0f81.json @@ -0,0 +1,22 @@ +{ + "description": "Download the file from \"https://cemse.kaust.edu.sa/sites/default/files/styles/large/public/2023-04/Web%20banner.jpg?itok=d1TvGUKY\" to \"/home/crab/Pictures/KAUST_AI.png\" and then set this image as the desktop background on the system.", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://cemse.kaust.edu.sa/sites/default/files/styles/large/public/2023-04/Web%20banner.jpg?itok=d1TvGUKY", + "file_path": "/home/crab/Pictures/KAUST_AI.png" + }, + "output": "/home/crab/Pictures/KAUST_AI.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Pictures/KAUST_AI.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "78502f1c-879b-4932-a5fd-d85f7f6b0f81" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/7dda7e46-78be-4663-b882-6132dbbff335.json b/crab-benchmark-v0/dataset/ubuntu/7dda7e46-78be-4663-b882-6132dbbff335.json new file mode 100644 index 0000000..eb2ee8f --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/7dda7e46-78be-4663-b882-6132dbbff335.json @@ -0,0 +1,22 @@ +{ + "description": "Adjust the brightness of the image located at \"/home/crab/Pictures/Interstellar.jpg\" to a higher value using GIMP (GNU Image Manipulation Program), save the edited image as \"/home/crab/edited_background.png\", and then set this edited image as the desktop background on the system.", + "tasks": [ + { + "task": "cc1adae7-bef9-4c8a-865d-00d44486dd69", + "attribute": { + "image_path_before_edit": "/home/crab/Pictures/Interstellar.jpg", + 
"image_path_after_edit": "/home/crab/edited_background.png" + }, + "output": "/home/crab/edited_background.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/edited_background.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "7dda7e46-78be-4663-b882-6132dbbff335" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/82c49e12-3b2f-432e-9069-4b67bafebbf7.json b/crab-benchmark-v0/dataset/ubuntu/82c49e12-3b2f-432e-9069-4b67bafebbf7.json new file mode 100644 index 0000000..ab94179 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/82c49e12-3b2f-432e-9069-4b67bafebbf7.json @@ -0,0 +1,22 @@ +{ + "description": "Open Firefox to find a coffee shop around the hungarian parliament on Google Maps, copy the sharing URL of the coffee shop to the clipboard, then paste the clipboard content into Visual Studio Code (VS Code), and save the content as a file at \"/home/crab/Downloads/coffee\".", + "tasks": [ + { + "task": "2b189dc2-c77f-4fa3-8432-ba4355cc294c", + "attribute": { + "place_type": "coffee shop", + "place_name": "hungarian parliament" + }, + "output": null + }, + { + "task": "8491e674-596b-452b-9e0e-58a44d90f947", + "attribute": { + "file_path": "/home/crab/Downloads/coffee" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "82c49e12-3b2f-432e-9069-4b67bafebbf7" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/87910f23-ab23-4ccc-b115-d71cff6f0162.json b/crab-benchmark-v0/dataset/ubuntu/87910f23-ab23-4ccc-b115-d71cff6f0162.json new file mode 100644 index 0000000..d83c6b7 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/87910f23-ab23-4ccc-b115-d71cff6f0162.json @@ -0,0 +1,21 @@ +{ + "description": "Use Firefox to search for an image with the keyword \"patagonia,\" copy the URL of the chosen image to the clipboard, and download the file from that URL to \"/home/crab/Desktop/brand.jpg\".", + "tasks": [ + { + "task": "017102b6-d2c3-466b-96f7-37c8bcddc41a", + "attribute": { + "keyword": "patagonia" + }, + "output": null + }, + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19acsd", + "attribute": { + "file_path": "/home/crab/Desktop/brand.jpg" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "87910f23-ab23-4ccc-b115-d71cff6f0162" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/8cb5ab6d-a56e-43b9-aa83-00a46331e20f.json b/crab-benchmark-v0/dataset/ubuntu/8cb5ab6d-a56e-43b9-aa83-00a46331e20f.json new file mode 100644 index 0000000..b7be94c --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/8cb5ab6d-a56e-43b9-aa83-00a46331e20f.json @@ -0,0 +1,23 @@ +{ + "description": "Download the image from \"https://res.cloudinary.com/simpleview/image/upload/v1648755098/clients/austin/Austin_Skyline_Credit_Christopher_Sherman_lifetime__4f60343d-9f69-450c-8ad3-fa636761786d.jpg\" to \"/home/crab/Downloads/Austin.jpg\", then use GIMP (GNU Image Manipulation Program) to adjust its brightness to a higher value and save the modified image as \"/home/crab/Downloads/brighter_austin.jpg\".", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://res.cloudinary.com/simpleview/image/upload/v1648755098/clients/austin/Austin_Skyline_Credit_Christopher_Sherman_lifetime__4f60343d-9f69-450c-8ad3-fa636761786d.jpg", + "file_path": "/home/crab/Downloads/Austin.jpg" + }, + "output": "/home/crab/Downloads/Austin.jpg" + }, + { + "task": "cc1adae7-bef9-4c8a-865d-00d44486dd69", + "attribute": { + 
"image_path_before_edit": "/home/crab/Downloads/Austin.jpg", + "image_path_after_edit": "/home/crab/Downloads/brighter_austin.jpg" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "8cb5ab6d-a56e-43b9-aa83-00a46331e20f" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/a70ab903-835f-48b7-8356-2321b8b869d8.json b/crab-benchmark-v0/dataset/ubuntu/a70ab903-835f-48b7-8356-2321b8b869d8.json new file mode 100644 index 0000000..a2e4ba1 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/a70ab903-835f-48b7-8356-2321b8b869d8.json @@ -0,0 +1,19 @@ +{ + "description": "Using Firefox, find the example of torch.matmul provided by the official PyTorch version 1.13 documentation and copy all the lines of code in the example to the clipboard, then paste the clipboard content into LibreOffice Writer and save it as an ODT file at \"/home/crab/Desktop/doc_torch.odt\".", + "tasks": [ + { + "task": "49b614c5-c4bb-4c20-aab8-ab9dcc7de1b5", + "attribute": {}, + "output": null + }, + { + "task": "76de4bdb-c980-4b3a-9bd3-c87db467dffe", + "attribute": { + "file_path": "/home/crab/Desktop/doc_torch.odt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "a70ab903-835f-48b7-8356-2321b8b869d8" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/abb16512-27ae-49c0-b12b-7fbf0e95056b.json b/crab-benchmark-v0/dataset/ubuntu/abb16512-27ae-49c0-b12b-7fbf0e95056b.json new file mode 100644 index 0000000..190ddb9 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/abb16512-27ae-49c0-b12b-7fbf0e95056b.json @@ -0,0 +1,21 @@ +{ + "description": "Paste the clipboard content into Visual Studio Code (VS Code) and save the file as \"/home/crab/Desktop/content.txt\", then open a terminal and print the content of \"/home/crab/Desktop/content.txt\" to the command line interface.", + "tasks": [ + { + "task": "8491e674-596b-452b-9e0e-58a44d90f947", + "attribute": { + "file_path": "/home/crab/Desktop/content.txt" + }, + "output": "/home/crab/Desktop/content.txt" + }, + { + "task": "5b527839-0e58-426d-bab6-7160200b0d24", + "attribute": { + "file_path": "/home/crab/Desktop/content.txt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "abb16512-27ae-49c0-b12b-7fbf0e95056b" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/b2ca21dc-dde9-49f5-bec7-321fbf769315.json b/crab-benchmark-v0/dataset/ubuntu/b2ca21dc-dde9-49f5-bec7-321fbf769315.json new file mode 100644 index 0000000..969ddff --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/b2ca21dc-dde9-49f5-bec7-321fbf769315.json @@ -0,0 +1,24 @@ +{ + "description": "Adjust the brightness of the image located at \"/home/crab/assets/desert.jpg\" to a darker value using LibreOffice Impress and save it as \"/home/crab/assets/darker_desert.jpg\", then use GIMP (GNU Image Manipulation Program) to combine this adjusted image with the original image at \"/home/crab/assets/desert.jpg\", placing the darker image on the left side and the original on the right, finally save the resulting comparison image to \"/home/crab/assets/desert_comparison.jpg\".", + "tasks": [ + { + "task": "434402f3-647a-4a9a-9d8f-10f5bb6c7cf0", + "attribute": { + "image_path_before_edit": "/home/crab/assets/desert.jpg", + "image_path_after_edit": "/home/crab/assets/darker_desert.jpg" + }, + "output": "/home/crab/assets/darker_desert.jpg" + }, + { + "task": "4cf246ea-0a7f-43da-84b6-61d74a2699af", + "attribute": { + "image_path_1": "/home/crab/assets/darker_desert.jpg", + "image_path_2": "/home/crab/assets/desert.jpg", + 
"output_path": "/home/crab/assets/desert_comparison.jpg" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "b2ca21dc-dde9-49f5-bec7-321fbf769315" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/ccf31785-ec13-4981-93c5-ca6c242ac0c3.json b/crab-benchmark-v0/dataset/ubuntu/ccf31785-ec13-4981-93c5-ca6c242ac0c3.json new file mode 100644 index 0000000..6d4a06d --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/ccf31785-ec13-4981-93c5-ca6c242ac0c3.json @@ -0,0 +1,24 @@ +{ + "description": "Download the flag of Ethiopia image from \"https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Flag_of_Ethiopia.svg/250px-Flag_of_Ethiopia.svg.png\" to \"/home/crab/Pictures/flag.png\", create a new directory named \"/home/crab/Pictures/png_\", and copy all PNG files from \"/home/crab/Pictures\" to the newly created directory \"/home/crab/Pictures/png_\".", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Flag_of_Ethiopia.svg/250px-Flag_of_Ethiopia.svg.png", + "file_path": "/home/crab/Pictures/flag.png" + }, + "output": "/home/crab/Pictures/flag.png" + }, + { + "task": "217ababc-ccc7-4b9f-af07-c239d92848fe", + "attribute": { + "file_extension": "png", + "source_dir": "/home/crab/Pictures", + "target_dir": "/home/crab/Pictures/png_" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "ccf31785-ec13-4981-93c5-ca6c242ac0c3" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/d3478489-70f2-4a82-b7d2-0a47b75986eb.json b/crab-benchmark-v0/dataset/ubuntu/d3478489-70f2-4a82-b7d2-0a47b75986eb.json new file mode 100644 index 0000000..b4745c2 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/d3478489-70f2-4a82-b7d2-0a47b75986eb.json @@ -0,0 +1,24 @@ +{ + "description": "Use Firefox to search for the country \"Ethiopia\" on Wikipedia, extract the capital city and population, save this information in an ODS file at \"/home/crab/Documents/africa.ods\" with LibreOffice Calc with the first column for the country name, the second for the capital city name, and the third for the population without any header, then create a new directory \"/home/crab/sheet\" and copy all ODS files from \"/home/crab/Documents\" to \"/home/crab/sheet\".", + "tasks": [ + { + "task": "1cd6519a-9ee0-442b-ba5a-9238aeb00ff6", + "attribute": { + "country": "Ethiopia", + "file_path": "/home/crab/Documents/africa.ods" + }, + "output": "/home/crab/Documents/africa.ods" + }, + { + "task": "217ababc-ccc7-4b9f-af07-c239d92848fe", + "attribute": { + "file_extension": "ods", + "source_dir": "/home/crab/Documents", + "target_dir": "/home/crab/sheet" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "d3478489-70f2-4a82-b7d2-0a47b75986eb" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/d3c917ff-406f-447a-87f5-b8d835cba750.json b/crab-benchmark-v0/dataset/ubuntu/d3c917ff-406f-447a-87f5-b8d835cba750.json new file mode 100644 index 0000000..485f6e6 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/d3c917ff-406f-447a-87f5-b8d835cba750.json @@ -0,0 +1,23 @@ +{ + "description": "Combine Image 1 \"/home/crab/Pictures/cat.png\" and Image 2 \"/home/crab/assets/campus.png\" using GIMP (GNU Image Manipulation Program), placing Image 1 on the left side of Image 2, and save the combined image to \"/home/crab/Desktop/background.png\". 
Then, set this combined image as the screen background of the system.",
+    "tasks": [
+        {
+            "task": "4cf246ea-0a7f-43da-84b6-61d74a2699af",
+            "attribute": {
+                "image_path_1": "/home/crab/Pictures/cat.png",
+                "image_path_2": "/home/crab/assets/campus.png",
+                "output_path": "/home/crab/Desktop/background.png"
+            },
+            "output": "/home/crab/Desktop/background.png"
+        },
+        {
+            "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba",
+            "attribute": {
+                "photo_path": "/home/crab/Desktop/background.png"
+            },
+            "output": null
+        }
+    ],
+    "adjlist": "0 1\n1",
+    "id": "d3c917ff-406f-447a-87f5-b8d835cba750"
+}
\ No newline at end of file
diff --git a/crab-benchmark-v0/dataset/ubuntu/d6e460e4-c295-40ad-883c-11300d7832f0.json b/crab-benchmark-v0/dataset/ubuntu/d6e460e4-c295-40ad-883c-11300d7832f0.json
new file mode 100644
index 0000000..c0332e8
--- /dev/null
+++ b/crab-benchmark-v0/dataset/ubuntu/d6e460e4-c295-40ad-883c-11300d7832f0.json
@@ -0,0 +1,19 @@
+{
+    "description": "Using Firefox, locate the example of torch.matmul provided by the official PyTorch version 1.13 documentation and copy all the lines of code to the clipboard, then open LibreOffice Writer, paste the content from the clipboard, and save the document as an ODT file at \"/home/crab/Documents/torch_matmul.odt\".",
+    "tasks": [
+        {
+            "task": "49b614c5-c4bb-4c20-aab8-ab9dcc7de1b5",
+            "attribute": {},
+            "output": null
+        },
+        {
+            "task": "76de4bdb-c980-4b3a-9bd3-c87db467dffe",
+            "attribute": {
+                "file_path": "/home/crab/Documents/torch_matmul.odt"
+            },
+            "output": null
+        }
+    ],
+    "adjlist": "0 1\n1",
+    "id": "d6e460e4-c295-40ad-883c-11300d7832f0"
+}
\ No newline at end of file
diff --git a/crab-benchmark-v0/dataset/ubuntu/e31d4e3b-b753-4deb-b9ad-a0add5d4790e.json b/crab-benchmark-v0/dataset/ubuntu/e31d4e3b-b753-4deb-b9ad-a0add5d4790e.json
new file mode 100644
index 0000000..815ed5a
--- /dev/null
+++ b/crab-benchmark-v0/dataset/ubuntu/e31d4e3b-b753-4deb-b9ad-a0add5d4790e.json
@@ -0,0 +1,21 @@
+{
+    "description": "Use Firefox to search for an image with the keyword \"Mission: Impossible\", copy the image's URL to the clipboard, and then download the file from the clipboard's URL to \"/home/crab/Pictures/movie.jpg\".",
+    "tasks": [
+        {
+            "task": "017102b6-d2c3-466b-96f7-37c8bcddc41a",
+            "attribute": {
+                "keyword": "Mission: Impossible"
+            },
+            "output": ""
+        },
+        {
+            "task": "a313ea4d-e501-4971-b4fe-db2aad19acsd",
+            "attribute": {
+                "file_path": "/home/crab/Pictures/movie.jpg"
+            },
+            "output": null
+        }
+    ],
+    "adjlist": "0 1\n1",
+    "id": "e31d4e3b-b753-4deb-b9ad-a0add5d4790e"
+}
\ No newline at end of file
diff --git a/crab-benchmark-v0/dataset/ubuntu/f67a26e4-58dd-4dc6-8859-affbf1d62f94.json b/crab-benchmark-v0/dataset/ubuntu/f67a26e4-58dd-4dc6-8859-affbf1d62f94.json
new file mode 100644
index 0000000..b24bc25
--- /dev/null
+++ b/crab-benchmark-v0/dataset/ubuntu/f67a26e4-58dd-4dc6-8859-affbf1d62f94.json
@@ -0,0 +1,22 @@
+{
+    "description": "Open \"/home/crab/poem\" using vim in a terminal, write \"Two roads diverged in a yellow wood, and sorry I could not travel both and be one traveler, long I stood and looked down one as far as I could to where it bent in the undergrowth.\", save and exit vim, and then print the content of \"/home/crab/poem\" to the command line interface through the terminal.",
+    "tasks": [
+        {
+            "task": "0f589bf9-9b26-4581-8b78-2961b115ab49",
+            "attribute": {
+                "file_path": "/home/crab/poem",
+                "content": "Two roads diverged in a yellow wood, and sorry I could not travel both and be one traveler, long I stood and looked down one as far 
as I could to where it bent in the undergrowth."
+            },
+            "output": "/home/crab/poem"
+        },
+        {
+            "task": "5b527839-0e58-426d-bab6-7160200b0d24",
+            "attribute": {
+                "file_path": "/home/crab/poem"
+            },
+            "output": null
+        }
+    ],
+    "adjlist": "0 1\n1",
+    "id": "f67a26e4-58dd-4dc6-8859-affbf1d62f94"
+}
\ No newline at end of file
diff --git a/crab-benchmark-v0/main.py b/crab-benchmark-v0/main.py
index 07c4ba0..79e4afa 100644
--- a/crab-benchmark-v0/main.py
+++ b/crab-benchmark-v0/main.py
@@ -24,7 +24,7 @@
     TaskGenerator,
     create_benchmark,
 )
-from crab.actions.crab_actions import complete
+from crab.actions.crab_actions import complete, wait
 from crab.actions.visual_prompt_actions import (
     get_elements_prompt,
     groundingdino_easyocr,
@@ -96,7 +96,7 @@ def get_benchmark(env: str, ubuntu_url: str):
             tasks=[],
             environments=[ubuntu_env],
             prompting_tools=prompting_tools,
-            root_action_space=[complete],
+            root_action_space=[complete, wait],
             multienv=True,
         )
     elif env == "android":
@@ -106,7 +106,7 @@
             tasks=[],
             environments=[ANDROID_ENV],
             prompting_tools=prompting_tools,
-            root_action_space=[complete],
+            root_action_space=[complete, wait],
             multienv=True,
         )
     elif env == "cross":
@@ -119,7 +119,7 @@
             tasks=[],
             environments=[ubuntu_env, ANDROID_ENV],
             prompting_tools=prompting_tools,
-            root_action_space=[complete],
+            root_action_space=[complete, wait],
             multienv=True,
         )
     else:
@@ -137,7 +137,7 @@ def get_benchmark(env: str, ubuntu_url: str):
     # Load from handmade tasks
     benchmark_config.tasks.extend(handmade_tasks)
 
-    benchmark_config.step_limit = 15
+    benchmark_config.step_limit = 20
     return create_benchmark(benchmark_config)
 
@@ -188,6 +188,12 @@
         help="logger level, debug, info, warning, or error",
         default="warning",
     )
+    parser.add_argument(
+        "--history-messages-len",
+        type=int,
+        help="The number of rounds of chat history to provide to the model",
+        default=2,
+    )
     args = parser.parse_args()
     loglevel = args.loglevel
     numeric_level = getattr(logging, loglevel.upper(), None)
@@ -197,43 +203,58 @@
 
     benchmark = get_benchmark(args.env, args.ubuntu_url)
 
+    if args.model == "human":
+        experiment = CrabBenchmarkV0(
+            benchmark=benchmark,
+            task_id=args.task_id,
+            agent_policy="human",
+        )
+        experiment.start_benchmark()
+        exit()
+
     if args.model == "gpt4o":
         model = BackendModelConfig(
             model_class="openai",
             model_name="gpt-4o",
-            history_messages_len=2,
+            history_messages_len=args.history_messages_len,
         )
     elif args.model == "gpt4turbo":
         model = BackendModelConfig(
            model_class="openai",
             model_name="gpt-4-turbo",
-            history_messages_len=2,
+            history_messages_len=args.history_messages_len,
        )
     elif args.model == "gemini":
         model = BackendModelConfig(
             model_class="gemini",
             model_name="gemini-1.5-pro-latest",
-            history_messages_len=2,
+            history_messages_len=args.history_messages_len,
         )
     elif args.model == "claude":
         model = BackendModelConfig(
             model_class="claude",
             model_name="claude-3-opus-20240229",
-            history_messages_len=2,
+            history_messages_len=args.history_messages_len,
         )
-    elif args.model == "llava-1.6":
+    elif args.model == "pixtral":
         model = BackendModelConfig(
-            model_class="vllm",
-            model_name="llava-hf/llava-v1.6-34b-hf",
-            history_messages_len=2,
+            model_class="openai-json",
+            model_name="mistralai/Pixtral-12B-2409",
+            history_messages_len=args.history_messages_len,
             base_url=args.model_base_url,
             api_key=args.model_api_key,
         )
-    elif args.model == 
"pixtral": + elif args.model == "gpt4o-wofc": model = BackendModelConfig( - model_class="vllm", - model_name="mistralai/Pixtral-12B-2409", - history_messages_len=1, + model_class="openai-json", + model_name="gpt-4o", + history_messages_len=args.history_messages_len, + ) + elif args.model == "llava-ov72b": + model = BackendModelConfig( + model_class="sglang-openai-json", + model_name="lmms-lab/llava-onevision-qwen2-72b-ov-chat", + history_messages_len=args.history_messages_len, base_url=args.model_base_url, api_key=args.model_api_key, ) @@ -255,7 +276,7 @@ def get_benchmark(env: str, ubuntu_url: str): print("Unsupported policy: ", args.policy) exit() - log_dir = (Path(__file__).parent / "logs").resolve() + log_dir = (Path(__file__).parent / "tianqi_logs").resolve() expeirment = CrabBenchmarkV0( benchmark=benchmark, task_id=args.task_id, diff --git a/crab-benchmark-v0/ubuntu_env.py b/crab-benchmark-v0/ubuntu_env.py index 2ecec7e..2fd5be2 100644 --- a/crab-benchmark-v0/ubuntu_env.py +++ b/crab-benchmark-v0/ubuntu_env.py @@ -13,6 +13,7 @@ # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== from crab.actions.desktop_actions import ( click, + double_click, key_press, press_hotkey, right_click, @@ -31,6 +32,7 @@ press_hotkey, search_application, right_click, + double_click, ], observation_space=[screenshot], description="""An Ubuntu 22.04 Linux desktop operating system. The interface \ diff --git a/crab/actions/crab_actions.py b/crab/actions/crab_actions.py index d757cf2..8c41d5a 100644 --- a/crab/actions/crab_actions.py +++ b/crab/actions/crab_actions.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== +from time import sleep + from crab import action, evaluator @@ -42,6 +44,14 @@ def complete() -> bool: pass +@action(env_name="root") +def wait() -> bool: + """If the environment is still processing your action and you have nothing to do in + this step, you can use wait(). + """ + sleep(5) + + def get_element_position(element_id, env): """Get element position provided by function `zs_object_detection`""" box = env.element_position_map[element_id] diff --git a/crab/actions/desktop_actions.py b/crab/actions/desktop_actions.py index cf47ddc..861ecdd 100644 --- a/crab/actions/desktop_actions.py +++ b/crab/actions/desktop_actions.py @@ -69,7 +69,7 @@ def right_click(element: int, env) -> None: """ Right-click an UI element shown on the desktop screen using the mouse, which is usually used for opening the menu of the element. A simple use case can be - rght_click(5), which right-clicks the UI element labeled with the number 5 to open + right_click(5), which right-clicks the UI element labeled with the number 5 to open up menu on it. Args: @@ -80,6 +80,34 @@ def right_click(element: int, env) -> None: time.sleep(DELAY) +@action +def double_click_position(x: int, y: int) -> None: + """ + Double-click on the current desktop screen. + + Args: + x: The X coordinate, as a floating-point number in the range [0.0, 1.0]. + y: The Y coordinate, as a floating-point number in the range [0.0, 1.0]. + """ + pyautogui.click(x, y, duration=DURATION, clicks=2, interval=0.2) + + +@action(local=True) +def double_click(element: int, env) -> None: + """ + Double-click an UI element shown on the desktop screen using the mouse, which is + usually used for opening a folder or a file. 
A simple use case can be + double_click(5), which double-clicks the UI element labeled with the number 5 to + open it. + + Args: + element: A numeric tag assigned to an UI element shown on the screenshot. + """ + x, y = get_element_position(element, env) + env._action_endpoint(double_click_position, {"x": x, "y": y}) + time.sleep(DELAY) + + @action def mouse_scroll(click: int = 1) -> None: """ diff --git a/crab/agents/backend_models/__init__.py b/crab/agents/backend_models/__init__.py index 172b6a1..6c6bdab 100644 --- a/crab/agents/backend_models/__init__.py +++ b/crab/agents/backend_models/__init__.py @@ -21,16 +21,15 @@ from .camel_model import CamelModel from .claude_model import ClaudeModel from .gemini_model import GeminiModel -from .openai_model import OpenAIModel -from .vllm_model import VLLMModel +from .openai_model import OpenAIModel, OpenAIModelJSON, SGlangOpenAIModelJSON class BackendModelConfig(BaseModel): - model_class: Literal["openai", "claude", "gemini", "camel", "vllm"] + model_class: Literal["openai", "claude", "gemini", "camel", "vllm", "sglang"] model_name: str history_messages_len: int = 0 parameters: dict[str, Any] = {} - tool_call_required: bool = False + tool_call_required: bool = True base_url: str | None = None # Only used in OpenAIModel and VLLMModel currently api_key: str | None = None # Only used in OpenAIModel and VLLMModel currently @@ -46,6 +45,7 @@ def create_backend_model(model_config: BackendModelConfig) -> BackendModel: model=model_config.model_name, parameters=model_config.parameters, history_messages_len=model_config.history_messages_len, + tool_call_required=model_config.tool_call_required, ) case "gemini": if model_config.base_url is not None or model_config.api_key is not None: @@ -56,6 +56,7 @@ def create_backend_model(model_config: BackendModelConfig) -> BackendModel: model=model_config.model_name, parameters=model_config.parameters, history_messages_len=model_config.history_messages_len, + tool_call_required=model_config.tool_call_required, ) case "openai": return OpenAIModel( @@ -64,9 +65,18 @@ def create_backend_model(model_config: BackendModelConfig) -> BackendModel: history_messages_len=model_config.history_messages_len, base_url=model_config.base_url, api_key=model_config.api_key, + tool_call_required=model_config.tool_call_required, ) - case "vllm": - return VLLMModel( + case "openai-json": + return OpenAIModelJSON( + model=model_config.model_name, + parameters=model_config.parameters, + history_messages_len=model_config.history_messages_len, + base_url=model_config.base_url, + api_key=model_config.api_key, + ) + case "sglang-openai-json": + return SGlangOpenAIModelJSON( model=model_config.model_name, parameters=model_config.parameters, history_messages_len=model_config.history_messages_len, diff --git a/crab/agents/backend_models/claude_model.py b/crab/agents/backend_models/claude_model.py index ed37f47..92641aa 100644 --- a/crab/agents/backend_models/claude_model.py +++ b/crab/agents/backend_models/claude_model.py @@ -33,7 +33,7 @@ def __init__( model: str, parameters: dict[str, Any] | None = None, history_messages_len: int = 0, - tool_call_required: bool = False, + tool_call_required: bool = True, ) -> None: if anthropic_model_enable is False: raise ImportError("Please install anthropic to use ClaudeModel") @@ -152,6 +152,7 @@ def call_api(self, request_messages: list[dict]) -> anthropic.types.Message: system=self.system_message, # <-- system prompt messages=request_messages, # type: ignore model=self.model, + max_tokens=4096, 
tools=self.action_schema, tool_choice={"type": "any" if self.tool_call_required else "auto"}, **self.parameters, @@ -161,6 +162,7 @@ def call_api(self, request_messages: list[dict]) -> anthropic.types.Message: system=self.system_message, # <-- system prompt messages=request_messages, # type: ignore model=self.model, + max_tokens=4096, **self.parameters, ) diff --git a/crab/agents/backend_models/gemini_model.py b/crab/agents/backend_models/gemini_model.py index 3032d94..efa5dbe 100644 --- a/crab/agents/backend_models/gemini_model.py +++ b/crab/agents/backend_models/gemini_model.py @@ -42,7 +42,7 @@ def __init__( model: str, parameters: dict[str, Any] | None = None, history_messages_len: int = 0, - tool_call_required: bool = False, + tool_call_required: bool = True, ) -> None: if gemini_model_enable is False: raise ImportError("Please install google.generativeai to use GeminiModel") @@ -191,6 +191,11 @@ def _action_to_func_dec(action: Action) -> FunctionDeclaration: if "$defs" in p_schema: p_schema = json_expand_refs(p_schema) _clear_schema(p_schema) + if not p_schema["properties"]: + return FunctionDeclaration( + name=action.name, + description=action.description, + ) return FunctionDeclaration( name=action.name, description=action.description, diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index e8a11eb..714b3f1 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -15,6 +15,7 @@ from typing import Any from crab import Action, ActionOutput, BackendModel, BackendOutput, Message, MessageType +from crab.agents.utils import extract_text_and_code_prompts try: import openai @@ -31,7 +32,7 @@ def __init__( model: str, parameters: dict[str, Any] | None = None, history_messages_len: int = 0, - tool_call_required: bool = False, + tool_call_required: bool = True, base_url: str | None = None, api_key: str | None = None, ) -> None: @@ -179,3 +180,91 @@ def _convert_action_to_schema( new_action = action.to_openai_json_schema() actions.append({"type": "function", "function": new_action}) return actions + + +class OpenAIModelJSON(OpenAIModel): + def __init__( + self, + model: str, + parameters: dict[str, Any] = dict(), + history_messages_len: int = 0, + base_url: str | None = None, + api_key: str | None = None, + ) -> None: + super().__init__( + model, + parameters, + history_messages_len, + False, + base_url, + api_key, + ) + self.support_tool_call = False + + def reset(self, system_message: str, action_space: list[Action] | None) -> None: + super().reset(system_message, action_space) + self.action_schema = None + + def record_message( + self, new_message: dict, response_message: ChatCompletionMessage + ) -> None: + self.chat_history.append([new_message]) + self.chat_history[-1].append( + {"role": "assistant", "content": response_message.content} + ) + + def generate_backend_output( + self, response_message: ChatCompletionMessage + ) -> BackendOutput: + content = response_message.content + text_list, code_list = extract_text_and_code_prompts(content) + + action_list = [] + try: + for code_block in code_list: + action_object = json.loads(code_block) + action_list.append( + ActionOutput( + name=action_object["name"], arguments=action_object["arguments"] + ) + ) + except json.JSONDecodeError as e: + raise RuntimeError(f"Failed to parse code block: {code_block}") from e + except KeyError as e: + raise RuntimeError(f"Received invalid action format: {code_block}") from e + + return BackendOutput( + 
message="".join(text_list), + action_list=action_list, + ) + + +class SGlangOpenAIModelJSON(OpenAIModelJSON): + def construct_new_message(self, message: list[Message]) -> dict[str, Any]: + new_message_content: list[dict[str, Any]] = [] + image_count = 0 + for _, msg_type in message: + if msg_type == MessageType.IMAGE_JPG_BASE64: + image_count += 1 + for content, msg_type in message: + match msg_type: + case MessageType.TEXT: + new_message_content.append( + { + "type": "text", + "text": content, + } + ) + case MessageType.IMAGE_JPG_BASE64: + image_content = { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{content}", + "detail": "high", + }, + } + if image_count > 1: + image_content["modalities"] = "multi-images" + new_message_content.append(image_content) + + return {"role": "user", "content": new_message_content} diff --git a/crab/agents/backend_models/vllm_model.py b/crab/agents/backend_models/vllm_model.py deleted file mode 100644 index 18ed12c..0000000 --- a/crab/agents/backend_models/vllm_model.py +++ /dev/null @@ -1,80 +0,0 @@ -# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== -import json -from typing import Any - -from openai.types.chat import ChatCompletionMessage - -from crab import Action, ActionOutput, BackendOutput -from crab.agents.backend_models.openai_model import OpenAIModel -from crab.agents.utils import extract_text_and_code_prompts - - -class VLLMModel(OpenAIModel): - def __init__( - self, - model: str, - parameters: dict[str, Any] = dict(), - history_messages_len: int = 0, - base_url: str | None = None, - api_key: str | None = None, - ) -> None: - if base_url is None: - raise ValueError("base_url is required for VLLMModel") - super().__init__( - model, - parameters, - history_messages_len, - False, - base_url, - api_key, - ) - self.support_tool_call = False - - def reset(self, system_message: str, action_space: list[Action] | None) -> None: - super().reset(system_message, action_space) - self.action_schema = None - - def record_message( - self, new_message: dict, response_message: ChatCompletionMessage - ) -> None: - self.chat_history.append([new_message]) - self.chat_history[-1].append( - {"role": "assistant", "content": response_message.content} - ) - - def generate_backend_output( - self, response_message: ChatCompletionMessage - ) -> BackendOutput: - content = response_message.content - text_list, code_list = extract_text_and_code_prompts(content) - - action_list = [] - try: - for code_block in code_list: - action_object = json.loads(code_block) - action_list.append( - ActionOutput( - name=action_object["name"], arguments=action_object["arguments"] - ) - ) - except json.JSONDecodeError as e: - raise RuntimeError(f"Failed to parse code block: {code_block}") from e - except KeyError as e: - raise RuntimeError(f"Received invalid action format: {code_block}") from e - - return BackendOutput( - 
message="".join(text_list), - action_list=action_list, - ) diff --git a/crab/agents/policies/multi_agent_by_env.py b/crab/agents/policies/multi_agent_by_env.py index b72a535..57afc76 100644 --- a/crab/agents/policies/multi_agent_by_env.py +++ b/crab/agents/policies/multi_agent_by_env.py @@ -106,9 +106,8 @@ def get_token_usage(self): def get_backend_model_name(self): return ( self.main_agent_model_backend.__class__.__name__ - + "(sub: " - + self.env_agent_model_backend.__class__.__name__ - + ")" + + "_" + + self.main_agent_model_backend.model ) def chat( diff --git a/crab/agents/policies/multi_agent_by_func.py b/crab/agents/policies/multi_agent_by_func.py index eec0159..8d4df64 100644 --- a/crab/agents/policies/multi_agent_by_func.py +++ b/crab/agents/policies/multi_agent_by_func.py @@ -74,9 +74,8 @@ def get_token_usage(self): def get_backend_model_name(self): return ( self.main_agent_model_backend.__class__.__name__ - + "(sub: " - + self.tool_agent_model_backend.__class__.__name__ - + ")" + + "_" + + self.main_agent_model_backend.model ) def chat( diff --git a/crab/agents/policies/single_agent.py b/crab/agents/policies/single_agent.py index 74a6cd6..fa4b846 100644 --- a/crab/agents/policies/single_agent.py +++ b/crab/agents/policies/single_agent.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== +import logging + from crab import Action, ActionOutput from crab.agents.backend_models import BackendModelConfig, create_backend_model from crab.agents.utils import ( @@ -24,6 +26,8 @@ ) from crab.utils.measure import timed +logger = logging.getLogger(__name__) + class SingleAgentPolicy(AgentPolicy): _system_prompt_with_function_call = """\ @@ -69,6 +73,8 @@ class SingleAgentPolicy(AgentPolicy): ```json {{"name": "action_name", "arguments": {{}}}} ``` + You MUST use exactly the same "action_name" as I gave to you in the action space. + You SHOULDN'T add any comments in the code blocks. In each step, You MUST explain what do you see from the current observation and the plan of the next action, then use a provided action in each step to achieve the @@ -80,9 +86,19 @@ class SingleAgentPolicy(AgentPolicy): def __init__( self, model_backend: BackendModelConfig, + function_call: bool = True, ): self.model_backend = create_backend_model(model_backend) - if self.model_backend.support_tool_call: + self.function_call = function_call + if not self.model_backend.support_tool_call and self.function_call: + logger.warning( + "The backend model does not support tool call: {}".format( + model_backend.model_name + ) + + "\nFallback to no function call mode." 
+ )
+ self.function_call = False
+ if self.function_call:
self.system_prompt = self._system_prompt_with_function_call
else:
self.system_prompt = self._system_prompt_no_function_call
@@ -100,17 +116,20 @@ def reset(
task_description=task_description,
action_descriptions=generate_action_prompt(
self.action_space,
- expand=not self.model_backend.support_tool_call,
+ expand=not self.function_call,
),
env_description=str(env_descriptions),
)
- self.model_backend.reset(system_message, self.action_space)
+ if self.function_call:
+ self.model_backend.reset(system_message, self.action_space)
+ else:
+ self.model_backend.reset(system_message, None)

def get_token_usage(self):
return self.model_backend.get_token_usage()

def get_backend_model_name(self):
- return self.model_backend.__class__.__name__
+ return self.model_backend.__class__.__name__ + "_" + self.model_backend.model

@timed
def chat(
@@ -127,4 +146,6 @@ def chat(
)
)
output = self.model_backend.chat(prompt)
+ # print("Agent Message: " + output.message, flush=True)
+ # print("Agent Action: " + str(output.action_list), flush=True)
return decode_combined_action(output.action_list)
diff --git a/crab/agents/utils.py b/crab/agents/utils.py
index b174b92..e284406 100644
--- a/crab/agents/utils.py
+++ b/crab/agents/utils.py
@@ -92,8 +92,10 @@ def extract_text_and_code_prompts(content: str) -> tuple[list[str], list[str]]:
# code_type = lines[idx].strip()[3:].strip()
idx += 1
start_idx = idx
- while not lines[idx].lstrip().startswith("```"):
+ while idx < len(lines) and not lines[idx].lstrip().startswith("```"):
idx += 1
+ if idx >= len(lines):
+ break
code = "\n".join(lines[start_idx:idx]).strip()
code_prompts.append(code)

diff --git a/crab/core/benchmark.py b/crab/core/benchmark.py
index 87a0611..1c23b60 100644
--- a/crab/core/benchmark.py
+++ b/crab/core/benchmark.py
@@ -239,7 +239,22 @@ def step(
info=info,
)

- environment = self._get_env(env_name=env_name, action_name=action)
+ try:
+ environment = self._get_env(env_name=env_name, action_name=action)
+ except Exception:
+ print(traceback.format_exc())
+ terminated = True
+ info["terminate_reason"] = "action_format_error"
+ info["exception_detail"] = traceback.format_exc()
+ environment.reset()
+ self.close_task()
+ return StepResult(
+ truncated=False,
+ terminated=True,
+ action_returns=None,
+ evaluation_results=self.current_evaluator.stat(),
+ info=info,
+ )
try:
action_returns = environment.step(action, parameters)
except Exception:
diff --git a/crab/core/environment.py b/crab/core/environment.py
index e045353..938c1da 100644
--- a/crab/core/environment.py
+++ b/crab/core/environment.py
@@ -89,7 +89,7 @@ def __init__(
self._client: Client | None = None

if remote_url is not None:
- self._client = Client(base_url=remote_url)
+ self._client = Client(base_url=remote_url, timeout=60)

for key, value in extra_attributes.items():
setattr(self, key, value)
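The operand order in the rewritten loop guard matters: if `lines[idx]` is evaluated before the bounds check, an unterminated fence at the end of a model reply still raises IndexError. A self-contained sketch of the bounds-checked scan (`find_code_blocks` is a hypothetical name for illustration, not the crab API; unlike the patch, which discards a truncated block, this sketch keeps it):

```python
def find_code_blocks(content: str) -> list[str]:
    """Collect the bodies of ```-fenced blocks, tolerating a missing closing fence."""
    lines = content.split("\n")
    blocks: list[str] = []
    idx = 0
    while idx < len(lines):
        if lines[idx].lstrip().startswith("```"):
            idx += 1
            start = idx
            # Check the index before reading the line, otherwise an unterminated
            # fence at the end of the output raises IndexError.
            while idx < len(lines) and not lines[idx].lstrip().startswith("```"):
                idx += 1
            blocks.append("\n".join(lines[start:idx]).strip())
        idx += 1
    return blocks

print(find_code_blocks('text\n```json\n{"name": "click"}\n```\nmore'))
# ['{"name": "click"}']
print(find_code_blocks('```json\n{"name": "open_app"}'))  # no closing fence
# ['{"name": "open_app"}']
```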
diff --git a/crab/core/experiment.py b/crab/core/experiment.py
index 9e14c9e..59721d2 100644
--- a/crab/core/experiment.py
+++ b/crab/core/experiment.py
@@ -138,6 +138,10 @@ def execute_action(self, response: list[ActionOutput]) -> bool:
print("\033[92m" f"Task finished, result: {self.metrics}" "\033[0m")
self.write_current_log_row(action)
self.write_main_csv_row(benchmark_result.info["terminate_reason"])
+ if "exception_detail" in benchmark_result.info:
+ self.write_exception_detail(
+ benchmark_result.info["exception_detail"]
+ )
return True
print(
"\033[92m"
@@ -171,6 +175,7 @@ def step(self, it) -> bool:
except Exception:
print(traceback.format_exc())
self.write_main_csv_row("agent_exception")
+ self.write_exception_detail(traceback.format_exc())
return True
# content = response["content"]
# self.write_message(str(content), it)
@@ -214,6 +219,12 @@ def start_benchmark(self):
sleep(2)
# input("Press enter to do next step:")

+ def write_exception_detail(self, exception_info: str):
+ if self.log_dir is None:
+ return
+ with open(self.current_experiment_dir / "exception_detail.txt", "w") as file:
+ file.write(exception_info)
+
def write_current_log_row(self, action):
if self.log_dir is None:
return
diff --git a/crab/core/task_generator.py b/crab/core/task_generator.py
index 2f373eb..682f875 100644
--- a/crab/core/task_generator.py
+++ b/crab/core/task_generator.py
@@ -16,6 +16,7 @@
import importlib
import itertools
import json
+import os
import random
from pathlib import Path

@@ -121,6 +122,8 @@ def __init__(
self.attribute_pool = attribute_pool
self.graph_generation(subtasks)
self.task_mapping = {task.id: task for task in subtasks}
+ if not os.getenv("OPENAI_API_KEY"):
+ os.environ["OPENAI_API_KEY"] = "EMPTY"
self.client = OpenAI()

@classmethod

From 6eb3145432522b969ccc0e8d212fa79e04c3f2ce Mon Sep 17 00:00:00 2001
From: Tianqi Xu
Date: Tue, 15 Oct 2024 13:39:26 +0300
Subject: [PATCH 15/17] Fix camel model in the new backend model ABC

---
crab-benchmark-v0/main.py | 9 +-
crab/agents/backend_models/__init__.py | 83 ++++++++++++-----
crab/agents/backend_models/camel_model.py | 16 ++--
.../agents/backend_models/test_camel_model.py | 17 ++--
4 files changed, 88 insertions(+), 37 deletions(-)

diff --git a/crab-benchmark-v0/main.py b/crab-benchmark-v0/main.py
index 79e4afa..f1751ed 100644
--- a/crab-benchmark-v0/main.py
+++ b/crab-benchmark-v0/main.py
@@ -238,22 +238,25 @@ def get_benchmark(env: str, ubuntu_url: str):
)
elif args.model == "pixtral":
model = BackendModelConfig(
- model_class="openai-json",
+ model_class="openai",
model_name="mistralai/Pixtral-12B-2409",
+ json_structre_output=True,
history_messages_len=args.history_messages_len,
base_url=args.model_base_url,
api_key=args.model_api_key,
)
elif args.model == "gpt4o-wofc":
model = BackendModelConfig(
- model_class="openai-json",
+ model_class="openai",
model_name="gpt-4o",
+ json_structre_output=True,
history_messages_len=args.history_messages_len,
)
elif args.model == "llava-ov72b":
model = BackendModelConfig(
- model_class="sglang-openai-json",
+ model_class="sglang",
model_name="lmms-lab/llava-onevision-qwen2-72b-ov-chat",
+ json_structre_output=True,
history_messages_len=args.history_messages_len,
base_url=args.model_base_url,
api_key=args.model_api_key,
)
diff --git a/crab/agents/backend_models/__init__.py b/crab/agents/backend_models/__init__.py
index 6c6bdab..32b21cc 100644
--- a/crab/agents/backend_models/__init__.py
+++ b/crab/agents/backend_models/__init__.py
@@ -25,13 +25,43 @@

class BackendModelConfig(BaseModel):
- model_class: Literal["openai", "claude", "gemini", "camel", "vllm", "sglang"]
+ model_class: Literal["openai", "claude", "gemini", "camel", "sglang"]
+ """Specify the model class to be used. Different model classes use different
+ APIs.
+ """
+
model_name: str
+ """Specify the model name to be used. This value is directly passed to the API;
+ check the model provider's API documentation for more details.
+ """
+
+ model_platform: str | None = None
+ """Required for CamelModel. Otherwise, it is ignored. Please check CAMEL
+ documentation for more details.
+ """ + history_messages_len: int = 0 + """Number of rounds of previous messages to be used in the model input. 0 means no + history. + """ + parameters: dict[str, Any] = {} + """Additional parameters to be passed to the model.""" + + json_structre_output: bool = False + """If True, the model generate action through JSON without using "tool call" or + "function call". SGLang model only supports JSON output. OpenAI model supports both. + Other models do not support JSON output. + """ + tool_call_required: bool = True - base_url: str | None = None # Only used in OpenAIModel and VLLMModel currently - api_key: str | None = None # Only used in OpenAIModel and VLLMModel currently + """Specify if the model enforce each round to generate tool/function calls.""" + + base_url: str | None = None + """Specify the base URL of the API. Only used in OpenAI and SGLang currently.""" + + api_key: str | None = None + """Specify the API key to be used. Only used in OpenAI and SGLang currently.""" def create_backend_model(model_config: BackendModelConfig) -> BackendModel: @@ -41,6 +71,10 @@ def create_backend_model(model_config: BackendModelConfig) -> BackendModel: raise Warning( "base_url and api_key are not supported for ClaudeModel currently." ) + if model_config.json_structre_output: + raise Warning( + "json_structre_output is not supported for ClaudeModel currently." + ) return ClaudeModel( model=model_config.model_name, parameters=model_config.parameters, @@ -52,6 +86,10 @@ def create_backend_model(model_config: BackendModelConfig) -> BackendModel: raise Warning( "base_url and api_key are not supported for GeminiModel currently." ) + if model_config.json_structre_output: + raise Warning( + "json_structre_output is not supported for GeminiModel currently." + ) return GeminiModel( model=model_config.model_name, parameters=model_config.parameters, @@ -59,31 +97,38 @@ def create_backend_model(model_config: BackendModelConfig) -> BackendModel: tool_call_required=model_config.tool_call_required, ) case "openai": - return OpenAIModel( - model=model_config.model_name, - parameters=model_config.parameters, - history_messages_len=model_config.history_messages_len, - base_url=model_config.base_url, - api_key=model_config.api_key, - tool_call_required=model_config.tool_call_required, - ) - case "openai-json": - return OpenAIModelJSON( + if not model_config.json_structre_output: + return OpenAIModel( + model=model_config.model_name, + parameters=model_config.parameters, + history_messages_len=model_config.history_messages_len, + base_url=model_config.base_url, + api_key=model_config.api_key, + tool_call_required=model_config.tool_call_required, + ) + else: + return OpenAIModelJSON( + model=model_config.model_name, + parameters=model_config.parameters, + history_messages_len=model_config.history_messages_len, + base_url=model_config.base_url, + api_key=model_config.api_key, + ) + case "sglang": + return SGlangOpenAIModelJSON( model=model_config.model_name, parameters=model_config.parameters, history_messages_len=model_config.history_messages_len, base_url=model_config.base_url, api_key=model_config.api_key, ) - case "sglang-openai-json": - return SGlangOpenAIModelJSON( + case "camel": + return CamelModel( model=model_config.model_name, + model_platform=model_config.model_platform, parameters=model_config.parameters, history_messages_len=model_config.history_messages_len, - base_url=model_config.base_url, - api_key=model_config.api_key, + tool_call_required=model_config.tool_call_required, ) - case "camel": - raise 
diff --git a/crab/agents/backend_models/camel_model.py b/crab/agents/backend_models/camel_model.py
index 6631c4c..636006b 100644
--- a/crab/agents/backend_models/camel_model.py
+++ b/crab/agents/backend_models/camel_model.py
@@ -84,20 +84,19 @@ def __init__(
model_platform: str,
parameters: dict[str, Any] | None = None,
history_messages_len: int = 0,
+ tool_call_required: bool = True,
) -> None:
if not CAMEL_ENABLED:
raise ImportError("Please install camel-ai to use CamelModel")
- self.parameters = parameters or {}
+ self.model = model
+ self.parameters = parameters if parameters is not None else {}
+ self.history_messages_len = history_messages_len
+
self.model_type = _get_model_type(model)
self.model_platform_type = _get_model_platform_type(model_platform)
self.client: ChatAgent | None = None
self.token_usage = 0
-
- super().__init__(
- model,
- parameters,
- history_messages_len,
- )
+ self.tool_call_required = tool_call_required

def get_token_usage(self) -> int:
return self.token_usage
@@ -106,7 +106,7 @@ def reset(self, system_message: str, action_space: list[Action] | None) -> None:
action_schema = _convert_action_to_schema(action_space)
config = self.parameters.copy()
if action_schema is not None:
- config["tool_choice"] = "required"
+ config["tool_choice"] = "required" if self.tool_call_required else "auto"
config["tools"] = [
schema.get_openai_tool_schema() for schema in action_schema
]
diff --git a/test/agents/backend_models/test_camel_model.py b/test/agents/backend_models/test_camel_model.py
index f1239ba..8694900 100644
--- a/test/agents/backend_models/test_camel_model.py
+++ b/test/agents/backend_models/test_camel_model.py
@@ -14,16 +14,19 @@
import pytest

from crab import action
-from crab.agents.backend_models import CamelModel
+from crab.agents.backend_models import BackendModelConfig, create_backend_model


@pytest.fixture
def camel_model():
- return CamelModel(
- model_platform="openai",
- model="gpt-4o",
- parameters={"max_tokens": 3000},
- history_messages_len=1,
+ return create_backend_model(
+ BackendModelConfig(
+ model_class="camel",
+ model_name="gpt-4o",
+ model_platform="openai",
+ parameters={"max_tokens": 3000},
+ history_messages_len=1,
+ )
)


@@ -38,7 +41,7 @@ def add(a: int, b: int):
return a + b


-@pytest.mark.skip(reason="Mock data to be added")
+# @pytest.mark.skip(reason="Mock data to be added")
def test_action_chat(camel_model):
camel_model.reset("You are a helpful assistant.", [add])
message = (

From 23d4ad6bb1c24b2c3a4771a0eb5fdae0071c0740 Mon Sep 17 00:00:00 2001
From: Tianqi Xu
Date: Tue, 15 Oct 2024 13:56:07 +0300
Subject: [PATCH 16/17] Fix test

---
test/agents/backend_models/test_camel_model.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/agents/backend_models/test_camel_model.py b/test/agents/backend_models/test_camel_model.py
index 8694900..6ecf1cc 100644
--- a/test/agents/backend_models/test_camel_model.py
+++ b/test/agents/backend_models/test_camel_model.py
@@ -41,7 +41,7 @@ def add(a: int, b: int):
return a + b


-# @pytest.mark.skip(reason="Mock data to be added")
+@pytest.mark.skip(reason="Mock data to be added")
def test_action_chat(camel_model):
camel_model.reset("You are a helpful assistant.", [add])
message = (
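One behavioral detail from the `CamelModel` change above: `tool_call_required` now toggles the OpenAI-style `tool_choice` field between forcing a tool call and letting the model decide. A standalone sketch of that branch, with a hypothetical helper name and placeholder schema dicts:

```python
def build_generation_config(
    parameters: dict, action_schemas: list | None, tool_call_required: bool
) -> dict:
    # Mirrors the branch in CamelModel.reset: tool fields are only attached
    # when an action space is present.
    config = parameters.copy()
    if action_schemas is not None:
        config["tool_choice"] = "required" if tool_call_required else "auto"
        config["tools"] = action_schemas  # OpenAI-style tool schema dicts
    return config

print(build_generation_config({"max_tokens": 3000}, [{"type": "function"}], False))
# {'max_tokens': 3000, 'tool_choice': 'auto', 'tools': [{'type': 'function'}]}
```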
From 2d8b87ee9099f20d7a8ae74153e1236017b92f3c Mon Sep 17 00:00:00 2001
From: Tianqi Xu
Date: Tue, 29 Oct 2024 13:27:34 +0300
Subject: [PATCH 17/17] Make inside-module functions private

---
crab/agents/backend_models/claude_model.py | 20 ++++++++---------
crab/agents/backend_models/gemini_model.py | 20 ++++++++---------
crab/agents/backend_models/openai_model.py | 26 +++++++++++-----------
3 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/crab/agents/backend_models/claude_model.py b/crab/agents/backend_models/claude_model.py
index 92641aa..fcb7dff 100644
--- a/crab/agents/backend_models/claude_model.py
+++ b/crab/agents/backend_models/claude_model.py
@@ -62,14 +62,14 @@ def reset(self, system_message: str, action_space: list[Action] | None) -> None:
def chat(self, message: list[Message] | Message) -> BackendOutput:
if isinstance(message, tuple):
message = [message]
- request = self.fetch_from_memory()
- new_message = self.construct_new_message(message)
+ request = self._fetch_from_memory()
+ new_message = self._construct_new_message(message)
request.append(new_message)
- response_message = self.call_api(request)
- self.record_message(new_message, response_message)
- return self.generate_backend_output(response_message)
+ response_message = self._call_api(request)
+ self._record_message(new_message, response_message)
+ return self._generate_backend_output(response_message)

- def construct_new_message(self, message: list[Message]) -> dict[str, Any]:
+ def _construct_new_message(self, message: list[Message]) -> dict[str, Any]:
parts: list[dict] = []
for content, msg_type in message:
match msg_type:
@@ -96,7 +96,7 @@ def construct_new_message(self, message: list[Message]) -> dict[str, Any]:
"content": parts,
}

- def fetch_from_memory(self) -> list[dict]:
+ def _fetch_from_memory(self) -> list[dict]:
request: list[dict] = []
if self.history_messages_len > 0:
fetch_history_len = min(self.history_messages_len, len(self.chat_history))
@@ -107,7 +107,7 @@ def fetch_from_memory(self) -> list[dict]:
def get_token_usage(self):
return self.token_usage

- def record_message(
+ def _record_message(
self, new_message: dict, response_message: anthropic.types.Message
) -> None:
self.chat_history.append([new_message])
@@ -145,7 +145,7 @@ def record_message(
)
),
)
- def call_api(self, request_messages: list[dict]) -> anthropic.types.Message:
+ def _call_api(self, request_messages: list[dict]) -> anthropic.types.Message:
request_messages = _merge_request(request_messages)
if self.action_schema is not None:
response = self.client.messages.create(
@@ -169,7 +169,7 @@ def call_api(self, request_messages: list[dict]) -> anthropic.types.Message:
self.token_usage += response.usage.input_tokens + response.usage.output_tokens
return response

- def generate_backend_output(
+ def _generate_backend_output(
self, response_message: anthropic.types.Message
) -> BackendOutput:
message = ""
diff --git a/crab/agents/backend_models/gemini_model.py b/crab/agents/backend_models/gemini_model.py
index efa5dbe..66b49c0 100644
--- a/crab/agents/backend_models/gemini_model.py
+++ b/crab/agents/backend_models/gemini_model.py
@@ -71,14 +71,14 @@ def reset(self, system_message: str, action_space: list[Action] | None) -> None:
def chat(self, message: list[Message] | Message) -> BackendOutput:
if isinstance(message, tuple):
message = [message]
- request = self.fetch_from_memory()
- new_message = self.construct_new_message(message)
+ request = self._fetch_from_memory()
+ new_message = self._construct_new_message(message)
request.append(new_message)
- response_message = self.call_api(request)
- self.record_message(new_message, response_message)
- return self.generate_backend_output(response_message) + response_message = self._call_api(request) + self._record_message(new_message, response_message) + return self._generate_backend_output(response_message) - def construct_new_message(self, message: list[Message]) -> dict[str, Any]: + def _construct_new_message(self, message: list[Message]) -> dict[str, Any]: parts: list[str | Image] = [] for content, msg_type in message: match msg_type: @@ -91,7 +91,7 @@ def construct_new_message(self, message: list[Message]) -> dict[str, Any]: "parts": parts, } - def generate_backend_output(self, response_message: Content) -> BackendOutput: + def _generate_backend_output(self, response_message: Content) -> BackendOutput: tool_calls: list[ActionOutput] = [] for part in response_message.parts: if "function_call" in Part.to_dict(part): @@ -108,7 +108,7 @@ def generate_backend_output(self, response_message: Content) -> BackendOutput: action_list=tool_calls or None, ) - def fetch_from_memory(self) -> list[dict]: + def _fetch_from_memory(self) -> list[dict]: request: list[dict] = [] if self.history_messages_len > 0: fetch_history_len = min(self.history_messages_len, len(self.chat_history)) @@ -119,7 +119,7 @@ def fetch_from_memory(self) -> list[dict]: def get_token_usage(self): return self.token_usage - def record_message( + def _record_message( self, new_message: dict[str, Any], response_message: Content ) -> None: self.chat_history.append([new_message]) @@ -132,7 +132,7 @@ def record_message( stop=stop_after_attempt(7), retry=retry_if_exception_type(ResourceExhausted), ) - def call_api(self, request_messages: list) -> Content: + def _call_api(self, request_messages: list) -> Content: if self.action_schema is not None: tool_config = content_types.to_tool_config( { diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index 714b3f1..f60d076 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -72,17 +72,17 @@ def reset(self, system_message: str, action_space: list[Action] | None) -> None: def chat(self, message: list[Message] | Message) -> BackendOutput: if isinstance(message, tuple): message = [message] - request = self.fetch_from_memory() - new_message = self.construct_new_message(message) + request = self._fetch_from_memory() + new_message = self._construct_new_message(message) request.append(new_message) - response_message = self.call_api(request) - self.record_message(new_message, response_message) - return self.generate_backend_output(response_message) + response_message = self._call_api(request) + self._record_message(new_message, response_message) + return self._generate_backend_output(response_message) def get_token_usage(self): return self.token_usage - def record_message( + def _record_message( self, new_message: dict, response_message: ChatCompletionMessage ) -> None: self.chat_history.append([new_message]) @@ -99,7 +99,7 @@ def record_message( } ) # extend conversation with function response - def call_api( + def _call_api( self, request_messages: list[ChatCompletionMessage | dict] ) -> ChatCompletionMessage: if self.action_schema is not None: @@ -120,7 +120,7 @@ def call_api( self.token_usage += response.usage.total_tokens return response.choices[0].message - def fetch_from_memory(self) -> list[ChatCompletionMessage | dict]: + def _fetch_from_memory(self) -> list[ChatCompletionMessage | dict]: request: list[ChatCompletionMessage | dict] = [self.openai_system_message] if 
self.history_messages_len > 0: fetch_history_len = min(self.history_messages_len, len(self.chat_history)) @@ -128,7 +128,7 @@ def fetch_from_memory(self) -> list[ChatCompletionMessage | dict]: request = request + history_message return request - def construct_new_message(self, message: list[Message]) -> dict[str, Any]: + def _construct_new_message(self, message: list[Message]) -> dict[str, Any]: new_message_content: list[dict[str, Any]] = [] for content, msg_type in message: match msg_type: @@ -152,7 +152,7 @@ def construct_new_message(self, message: list[Message]) -> dict[str, Any]: return {"role": "user", "content": new_message_content} - def generate_backend_output( + def _generate_backend_output( self, response_message: ChatCompletionMessage ) -> BackendOutput: if response_message.tool_calls is None: @@ -205,7 +205,7 @@ def reset(self, system_message: str, action_space: list[Action] | None) -> None: super().reset(system_message, action_space) self.action_schema = None - def record_message( + def _record_message( self, new_message: dict, response_message: ChatCompletionMessage ) -> None: self.chat_history.append([new_message]) @@ -213,7 +213,7 @@ def record_message( {"role": "assistant", "content": response_message.content} ) - def generate_backend_output( + def _generate_backend_output( self, response_message: ChatCompletionMessage ) -> BackendOutput: content = response_message.content @@ -240,7 +240,7 @@ def generate_backend_output( class SGlangOpenAIModelJSON(OpenAIModelJSON): - def construct_new_message(self, message: list[Message]) -> dict[str, Any]: + def _construct_new_message(self, message: list[Message]) -> dict[str, Any]: new_message_content: list[dict[str, Any]] = [] image_count = 0 for _, msg_type in message:
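Taken together, the renames in this patch leave `chat()` as the only public entry point while the per-backend steps become underscore-private hooks that subclasses override selectively (`OpenAIModelJSON`, for instance, overrides `_record_message` and `_generate_backend_output`). A minimal, illustrative skeleton of that shape; the class and stub bodies below are a sketch, not repository code:

```python
class BackendSketch:
    """Illustrative template-method skeleton mirroring the renames above."""

    def chat(self, message: list) -> str:
        # Public pipeline: the five private hooks below do backend-specific work.
        request = self._fetch_from_memory()
        new_message = self._construct_new_message(message)
        request.append(new_message)
        response = self._call_api(request)
        self._record_message(new_message, response)
        return self._generate_backend_output(response)

    # Stub hooks; each real backend (OpenAI, Claude, Gemini) overrides these.
    def _fetch_from_memory(self) -> list:
        return []

    def _construct_new_message(self, message: list) -> dict:
        return {"role": "user", "content": message}

    def _call_api(self, request: list) -> str:
        return "stub response"

    def _record_message(self, new_message: dict, response: str) -> None:
        pass

    def _generate_backend_output(self, response: str) -> str:
        return response
```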