Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Multimodal Abilities to Crew #1805

Merged
merged 20 commits into from
Dec 27, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/crewai/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ class Agent(BaseAgent):
default=2,
description="Maximum number of retries for an agent to execute a task when an error occurs.",
)
multimodal: bool = Field(
default=False,
description="Whether the agent is multimodal.",
)
code_execution_mode: Literal["safe", "unsafe"] = Field(
default="safe",
description="Mode for code execution: 'safe' (using Docker) or 'unsafe' (direct execution).",
Expand Down Expand Up @@ -406,6 +410,10 @@ def get_delegation_tools(self, agents: List[BaseAgent]):
tools = agent_tools.tools()
return tools

def get_multimodal_tools(self):
    """Return the extra tools used by a multimodal agent.

    Returns:
        A one-element list holding a freshly constructed ``AddImageTool``.
    """
    # Function-scope import, kept exactly where the original placed it.
    from crewai.tools.agent_tools.add_image_tool import AddImageTool

    image_tool = AddImageTool()
    return [image_tool]

def get_code_execution_tools(self):
try:
from crewai_tools import CodeInterpreterTool
Expand Down
16 changes: 13 additions & 3 deletions src/crewai/agents/crew_agent_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,20 @@ def _invoke_loop(self, formatted_answer=None):
tool_result = self._execute_tool_and_check_finality(
formatted_answer
)
if self.step_callback:
self.step_callback(tool_result)

formatted_answer.text += f"\nObservation: {tool_result.result}"
# Directly append the result to the messages if the
# tool is "Add image to content" in case of multimodal
# agents
if formatted_answer.tool == "Add image to content":
self.messages.append(tool_result.result)
continue

else:
if self.step_callback:
self.step_callback(tool_result)

formatted_answer.text += f"\nObservation: {tool_result.result}"

formatted_answer.result = tool_result.result
if tool_result.result_as_answer:
return AgentFinish(
Expand Down
113 changes: 70 additions & 43 deletions src/crewai/crew.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from crewai.tasks.task_output import TaskOutput
from crewai.telemetry import Telemetry
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.tools.base_tool import Tool
from crewai.types.usage_metrics import UsageMetrics
from crewai.utilities import I18N, FileHandler, Logger, RPMController
from crewai.utilities.constants import TRAINING_DATA_FILE
Expand Down Expand Up @@ -533,9 +534,6 @@ def kickoff(
if not agent.function_calling_llm: # type: ignore # "BaseAgent" has no attribute "function_calling_llm"
agent.function_calling_llm = self.function_calling_llm # type: ignore # "BaseAgent" has no attribute "function_calling_llm"

if agent.allow_code_execution: # type: ignore # BaseAgent" has no attribute "allow_code_execution"
agent.tools += agent.get_code_execution_tools() # type: ignore # "BaseAgent" has no attribute "get_code_execution_tools"; maybe "get_delegation_tools"?

if not agent.step_callback: # type: ignore # "BaseAgent" has no attribute "step_callback"
agent.step_callback = self.step_callback # type: ignore # "BaseAgent" has no attribute "step_callback"

Expand Down Expand Up @@ -672,7 +670,6 @@ def _create_manager_agent(self):
)
manager.tools = []
raise Exception("Manager agent should not have tools")
manager.tools = self.manager_agent.get_delegation_tools(self.agents)
else:
self.manager_llm = (
getattr(self.manager_llm, "model_name", None)
Expand All @@ -684,6 +681,7 @@ def _create_manager_agent(self):
goal=i18n.retrieve("hierarchical_manager_agent", "goal"),
backstory=i18n.retrieve("hierarchical_manager_agent", "backstory"),
tools=AgentTools(agents=self.agents).tools(),
allow_delegation=True,
llm=self.manager_llm,
verbose=self.verbose,
)
Expand Down Expand Up @@ -726,7 +724,14 @@ def _execute_tasks(
f"No agent available for task: {task.description}. Ensure that either the task has an assigned agent or a manager agent is provided."
)

self._prepare_agent_tools(task)
# Determine which tools to use - task tools take precedence over agent tools
tools_for_task = task.tools if task.tools else agent_to_use.tools or []
tools_for_task = self._prepare_tools(
agent_to_use,
task,
tools_for_task
)

self._log_task_start(task, agent_to_use.role)

if isinstance(task, ConditionalTask):
Expand All @@ -743,7 +748,7 @@ def _execute_tasks(
future = task.execute_async(
agent=agent_to_use,
context=context,
tools=agent_to_use.tools,
tools=tools_for_task,
)
futures.append((task, future, task_index))
else:
Expand All @@ -755,7 +760,7 @@ def _execute_tasks(
task_output = task.execute_sync(
agent=agent_to_use,
context=context,
tools=agent_to_use.tools,
tools=tools_for_task,
)
task_outputs = [task_output]
self._process_task_result(task, task_output)
Expand Down Expand Up @@ -792,60 +797,82 @@ def _handle_conditional_task(
return skipped_task_output
return None

def _prepare_agent_tools(self, task: Task):
if self.process == Process.hierarchical:
if self.manager_agent:
self._update_manager_tools(task)
else:
raise ValueError("Manager agent is required for hierarchical process.")
elif task.agent and task.agent.allow_delegation:
self._add_delegation_tools(task)
def _prepare_tools(self, agent: BaseAgent, task: Task, tools: List[Tool]):
# Add delegation tools if agent allows delegation
if agent.allow_delegation:
if self.process == Process.hierarchical:
if self.manager_agent:
tools = self._update_manager_tools(task, tools)
else:
raise ValueError("Manager agent is required for hierarchical process.")

elif agent and agent.allow_delegation:
tools = self._add_delegation_tools(task, tools)

# Add code execution tools if agent allows code execution
if agent.allow_code_execution:
tools = self._add_code_execution_tools(agent, tools)

if agent and agent.multimodal:
tools = self._add_multimodal_tools(agent, tools)

return tools

def _get_agent_to_use(self, task: Task) -> Optional[BaseAgent]:
    """Pick the agent that should run ``task``.

    Returns:
        ``self.manager_agent`` when the crew process is hierarchical,
        otherwise the agent assigned to the task (``task.agent``).
    """
    runs_through_manager = self.process == Process.hierarchical
    return self.manager_agent if runs_through_manager else task.agent

def _add_delegation_tools(self, task: Task):
def _merge_tools(self, existing_tools: List[Tool], new_tools: List[Tool]) -> List[Tool]:
"""Merge new tools into existing tools list, avoiding duplicates by tool name."""
if not new_tools:
return existing_tools

# Create mapping of tool names to new tools
new_tool_map = {tool.name: tool for tool in new_tools}

# Remove any existing tools that will be replaced
tools = [tool for tool in existing_tools if tool.name not in new_tool_map]

# Add all new tools
tools.extend(new_tools)

return tools

def _inject_delegation_tools(self, tools: List[Tool], task_agent: BaseAgent, agents: List[BaseAgent]):
delegation_tools = task_agent.get_delegation_tools(agents)
return self._merge_tools(tools, delegation_tools)

def _add_multimodal_tools(self, agent: BaseAgent, tools: List[Tool]):
multimodal_tools = agent.get_multimodal_tools()
return self._merge_tools(tools, multimodal_tools)

def _add_code_execution_tools(self, agent: BaseAgent, tools: List[Tool]):
code_tools = agent.get_code_execution_tools()
return self._merge_tools(tools, code_tools)

def _add_delegation_tools(self, task: Task, tools: List[Tool]):
agents_for_delegation = [agent for agent in self.agents if agent != task.agent]
if len(self.agents) > 1 and len(agents_for_delegation) > 0 and task.agent:
delegation_tools = task.agent.get_delegation_tools(agents_for_delegation)

# Add tools if they are not already in task.tools
for new_tool in delegation_tools:
# Find the index of the tool with the same name
existing_tool_index = next(
(
index
for index, tool in enumerate(task.tools or [])
if tool.name == new_tool.name
),
None,
)
if not task.tools:
task.tools = []

if existing_tool_index is not None:
# Replace the existing tool
task.tools[existing_tool_index] = new_tool
else:
# Add the new tool
task.tools.append(new_tool)
if not tools:
tools = []
tools = self._inject_delegation_tools(tools, task.agent, agents_for_delegation)
return tools

def _log_task_start(self, task: Task, role: str = "None"):
if self.output_log_file:
self._file_handler.log(
task_name=task.name, task=task.description, agent=role, status="started"
)

def _update_manager_tools(self, task: Task):
def _update_manager_tools(self, task: Task, tools: List[Tool]):
if self.manager_agent:
if task.agent:
self.manager_agent.tools = task.agent.get_delegation_tools([task.agent])
tools = self._inject_delegation_tools(tools, task.agent, [task.agent])
else:
self.manager_agent.tools = self.manager_agent.get_delegation_tools(
self.agents
)
tools = self._inject_delegation_tools(tools, self.manager_agent, self.agents)
# self.manager_agent.tools = tools
return tools

def _get_context(self, task: Task, task_outputs: List[TaskOutput]):
context = (
Expand Down
2 changes: 2 additions & 0 deletions src/crewai/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def flush(self):
"llama3-70b-8192": 8192,
"llama3-8b-8192": 8192,
"mixtral-8x7b-32768": 32768,
"llama-3.3-70b-versatile": 128000,
"llama-3.3-70b-instruct": 128000,
}

DEFAULT_CONTEXT_WINDOW_SIZE = 8192
Expand Down
3 changes: 1 addition & 2 deletions src/crewai/memory/storage/mem0_storage.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import os
from typing import Any, Dict, List

from mem0 import MemoryClient

from crewai.memory.storage.interface import Storage
from mem0 import MemoryClient


class Mem0Storage(Storage):
Expand Down
1 change: 0 additions & 1 deletion src/crewai/memory/storage/rag_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from typing import Any, Dict, List, Optional

from chromadb.api import ClientAPI

from crewai.memory.storage.base_rag_storage import BaseRAGStorage
from crewai.utilities import EmbeddingConfigurator
from crewai.utilities.constants import MAX_FILE_NAME_LENGTH
Expand Down
40 changes: 40 additions & 0 deletions src/crewai/tools/agent_tools/add_image_tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from crewai.tools.base_tool import BaseTool
from pydantic import BaseModel, Field


class AddImageToolSchema(BaseModel):
    """Argument schema for the image tool: an image location plus an optional prompt."""

    # Required: where the image lives (URL or path).
    image_url: str = Field(
        ...,
        description="The URL or path of the image to add",
    )
    # Optional: what to ask about the image; defaults to a request for a
    # detailed description.
    action: str = Field(
        description="Optional context or question about the image",
        default=(
            "Please provide a detailed description of this image, "
            "including all visual elements, context, and any notable "
            "details you can observe."
        ),
    )


class AddImageTool(BaseTool):
"""Tool for adding images to the content"""

name: str = "Add image to content"
description: str = "See image to understand it's content, you can optionally ask a question about the image"
args_schema: type[BaseModel] = AddImageToolSchema

def _run(
self,
image_url: str,
action: str = None,
**kwargs,
) -> dict:
action = action or "Please provide a detailed description of this image, including all visual elements, context, and any notable details you can observe."
content = [
{"type": "text", "text": action},
{
"type": "image_url",
"image_url": {
"url": image_url,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How will this work for local images? I'm asking because the OpenAI Vision API has a base64 strategy to handle such cases.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It depends on the model: most of them require either an already-uploaded image or base64-encoded data, but that is outside our control.

},
}
]

return {
"role": "user",
"content": content
}
3 changes: 1 addition & 2 deletions src/crewai/tools/agent_tools/ask_question_tool.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from typing import Optional

from pydantic import BaseModel, Field

from crewai.tools.agent_tools.base_agent_tools import BaseAgentTool
from pydantic import BaseModel, Field


class AskQuestionToolSchema(BaseModel):
Expand Down
3 changes: 1 addition & 2 deletions src/crewai/tools/agent_tools/base_agent_tools.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from typing import Optional, Union

from pydantic import Field

from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.task import Task
from crewai.tools.base_tool import BaseTool
from crewai.utilities import I18N
from pydantic import Field


class BaseAgentTool(BaseTool):
Expand Down
3 changes: 1 addition & 2 deletions src/crewai/tools/agent_tools/delegate_work_tool.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from typing import Optional

from pydantic import BaseModel, Field

from crewai.tools.agent_tools.base_agent_tools import BaseAgentTool
from pydantic import BaseModel, Field


class DelegateWorkToolSchema(BaseModel):
Expand Down
14 changes: 14 additions & 0 deletions src/crewai/tools/tool_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from crewai.task import Task
from crewai.telemetry import Telemetry
from crewai.tools import BaseTool
from crewai.tools.structured_tool import CrewStructuredTool
from crewai.tools.tool_calling import InstructorToolCalling, ToolCalling
from crewai.tools.tool_usage_events import ToolUsageError, ToolUsageFinished
from crewai.utilities import I18N, Converter, ConverterError, Printer
Expand Down Expand Up @@ -103,6 +104,19 @@ def use(
if self.agent.verbose:
self._printer.print(content=f"\n\n{error}\n", color="red")
return error

if isinstance(tool, CrewStructuredTool) and tool.name == 'Add image to content':
try:
result = self._use(tool_string=tool_string, tool=tool, calling=calling)
return result

except Exception as e:
error = getattr(e, "message", str(e))
self.task.increment_tools_errors()
if self.agent.verbose:
self._printer.print(content=f"\n\n{error}\n", color="red")
return error

return f"{self._use(tool_string=tool_string, tool=tool, calling=calling)}" # type: ignore # BUG?: "_use" of "ToolUsage" does not return a value (it only ever returns None)

def _use(
Expand Down
9 changes: 4 additions & 5 deletions src/crewai/utilities/evaluators/crew_evaluator_handler.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
from collections import defaultdict

from pydantic import BaseModel, Field
from rich.box import HEAVY_EDGE
from rich.console import Console
from rich.table import Table

from crewai.agent import Agent
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput
from crewai.telemetry import Telemetry
from pydantic import BaseModel, Field
from rich.box import HEAVY_EDGE
from rich.console import Console
from rich.table import Table


class TaskEvaluationPydanticOutput(BaseModel):
Expand Down
3 changes: 1 addition & 2 deletions src/crewai/utilities/evaluators/task_evaluator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from typing import List

from pydantic import BaseModel, Field

from crewai.utilities import Converter
from crewai.utilities.pydantic_schema_parser import PydanticSchemaParser
from pydantic import BaseModel, Field

agentops = None
try:
Expand Down
Loading
Loading