together, standard-tests: specify tool_choice in standard tests #25548
@@ -1,6 +1,6 @@
 import base64
 import json
-from typing import List, Optional
+from typing import List, Optional, Union

 import httpx
 import pytest
@@ -170,7 +170,14 @@ def test_stop_sequence(self, model: BaseChatModel) -> None:
     def test_tool_calling(self, model: BaseChatModel) -> None:
         if not self.has_tool_calling:
             pytest.skip("Test requires tool calling.")
-        model_with_tools = model.bind_tools([magic_function])
+        if self.tool_choice_value == "dict":
+            tool_choice: Union[dict, str, None] = {
+                "type": "function",
+                "function": {"name": "magic_function"},
+            }
+        else:
+            tool_choice = self.tool_choice_value
+        model_with_tools = model.bind_tools([magic_function], tool_choice=tool_choice)

         # Test invoke
         query = "What is the value of magic_function(3)? Use the tool."

Review comment: This format is OpenAI-specific (e.g., I don't think Anthropic supports it). I actually think more models support just passing in the tool name as a string to bind_tools, and I think that's what we'd want to standardize on from a devx perspective anyway.

Reply: Thanks, updated this to the tool name.
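To illustrate the distinction raised in the review thread above, here is a minimal sketch of the two tool_choice forms, using ChatOpenAI as a stand-in provider. The model id and the magic_function body are assumptions for illustration, not taken from this PR:

# Sketch only: the OpenAI-specific dict form vs. the plain tool-name string.
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI

@tool
def magic_function(input: int) -> int:
    """Applies a magic function to an input."""
    return input + 2

model = ChatOpenAI(model="gpt-4o-mini")  # assumed model id

# OpenAI-specific dict form (what the "dict" branch in the test builds):
bound_dict = model.bind_tools(
    [magic_function],
    tool_choice={"type": "function", "function": {"name": "magic_function"}},
)

# Plain tool-name string, the form the reviewer suggests standardizing on:
bound_name = model.bind_tools([magic_function], tool_choice="magic_function")

result = bound_name.invoke("What is the value of magic_function(3)? Use the tool.")
print(result.tool_calls)

Per the reply above, the tests were later updated to use the tool-name form, since it is the least provider-specific spelling.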
@@ -188,7 +195,16 @@ def test_tool_calling_with_no_arguments(self, model: BaseChatModel) -> None:
         if not self.has_tool_calling:
             pytest.skip("Test requires tool calling.")

-        model_with_tools = model.bind_tools([magic_function_no_args])
+        if self.tool_choice_value == "dict":
+            tool_choice: Union[dict, str, None] = {
+                "type": "function",
+                "function": {"name": "magic_function_no_args"},
+            }
+        else:
+            tool_choice = self.tool_choice_value
+        model_with_tools = model.bind_tools(
+            [magic_function_no_args], tool_choice=tool_choice
+        )
         query = "What is the value of magic_function()? Use the tool."
         result = model_with_tools.invoke(query)
         _validate_tool_call_message_no_args(result)
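For context, here is a rough sketch of how a provider's standard-test subclass might plug into tool_choice_value so the tests above force a tool call. The base-class import path, the property names other than tool_choice_value, and the Together model id are assumptions based on the standard-tests layout, not copied from this PR:

from typing import Optional, Type

from langchain_core.language_models import BaseChatModel
from langchain_standard_tests.integration_tests import ChatModelIntegrationTests
from langchain_together import ChatTogether

class TestTogetherStandard(ChatModelIntegrationTests):
    @property
    def chat_model_class(self) -> Type[BaseChatModel]:
        return ChatTogether

    @property
    def chat_model_params(self) -> dict:
        # Assumed model id; the thread at the end of this page discusses 8b vs. 70b here.
        return {"model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}

    @property
    def tool_choice_value(self) -> Optional[str]:
        # Returning "dict" makes the tests above send the OpenAI-style
        # {"type": "function", ...} payload; other providers might return
        # a different string or None.
        return "dict"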
@@ -212,7 +228,14 @@ def test_bind_runnables_as_tools(self, model: BaseChatModel) -> None:
             name="greeting_generator",
             description="Generate a greeting in a particular style of speaking.",
         )
-        model_with_tools = model.bind_tools([tool_])
+        if self.tool_choice_value == "dict":
+            tool_choice: Union[dict, str, None] = {
+                "type": "function",
+                "function": {"name": "greeting_generator"},
+            }
+        else:
+            tool_choice = self.tool_choice_value
+        model_with_tools = model.bind_tools([tool_], tool_choice=tool_choice)
         query = "Using the tool, generate a Pirate greeting."
         result = model_with_tools.invoke(query)
         assert isinstance(result, AIMessage)
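The runnable-as-tool test above can be reproduced roughly as follows. The prompt/parser chain and the model choice are assumptions rather than the test's exact setup, and Runnable.as_tool requires a reasonably recent langchain-core:

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI  # any tool-calling chat model should do

model = ChatOpenAI(model="gpt-4o-mini")  # assumed model id

prompt = ChatPromptTemplate.from_messages(
    [("human", "Say hello in the style of {answer_style}.")]
)
chain = prompt | model | StrOutputParser()

# Expose the chain as a tool, matching the name/description in the diff above.
tool_ = chain.as_tool(
    name="greeting_generator",
    description="Generate a greeting in a particular style of speaking.",
)

model_with_tools = model.bind_tools([tool_], tool_choice="greeting_generator")
result = model_with_tools.invoke("Using the tool, generate a Pirate greeting.")
print(result.tool_calls)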
Review comment: Out of curiosity, why go smaller instead of bigger?

Reply: We can do 70b (I changed it from 8b to 70b this morning). My thought is that if they support the same features, we should prefer smaller models for tests, in the spirit of testing functionality vs. benchmarking.