python[patch]: pass Runnable to evaluate #1204

Merged: 6 commits, Nov 13, 2024
Changes from 1 commit
8 changes: 6 additions & 2 deletions python/langsmith/evaluation/_arunner.py
@@ -1,4 +1,4 @@
"""V2 Evaluation Interface."""

Check notice on line 1 in python/langsmith/evaluation/_arunner.py (GitHub Actions / benchmark): Benchmark results

create_5_000_run_trees: Mean +- std dev: 620 ms +- 47 ms
create_10_000_run_trees: Mean +- std dev: 1.19 sec +- 0.06 sec
create_20_000_run_trees: Mean +- std dev: 1.20 sec +- 0.06 sec
dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 706 us +- 14 us
dumps_class_nested_py_leaf_50x100: Mean +- std dev: 25.7 ms +- 0.5 ms
dumps_class_nested_py_leaf_100x200: Mean +- std dev: 104 ms +- 2 ms
dumps_dataclass_nested_50x100: Mean +- std dev: 25.8 ms +- 0.3 ms
dumps_pydantic_nested_50x100: Mean +- std dev: 68.5 ms +- 17.0 ms (WARNING: result may be unstable; std dev is 25% of the mean)
dumps_pydanticv1_nested_50x100: Mean +- std dev: 224 ms +- 32 ms (WARNING: result may be unstable; std dev is 14% of the mean)

Check notice on line 1 in python/langsmith/evaluation/_arunner.py (GitHub Actions / benchmark): Comparison against main

+------------------------------------+----------+------------------------+
| Benchmark                          | main     | changes                |
+====================================+==========+========================+
| dumps_class_nested_py_leaf_100x200 | 105 ms   | 104 ms: 1.01x faster   |
+------------------------------------+----------+------------------------+
| create_20_000_run_trees            | 1.18 sec | 1.20 sec: 1.02x slower |
+------------------------------------+----------+------------------------+
| dumps_class_nested_py_leaf_50x100  | 25.2 ms  | 25.7 ms: 1.02x slower  |
+------------------------------------+----------+------------------------+
| Geometric mean                     | (ref)    | 1.01x slower           |
+------------------------------------+----------+------------------------+

Benchmark hidden because not significant (6): dumps_dataclass_nested_50x100, dumps_class_nested_py_branch_and_leaf_200x400, create_10_000_run_trees, create_5_000_run_trees, dumps_pydanticv1_nested_50x100, dumps_pydantic_nested_50x100

from __future__ import annotations

@@ -40,6 +40,7 @@
_ExperimentManagerMixin,
_extract_feedback_keys,
_ForwardResults,
_is_langchain_runnable,
_load_examples_map,
_load_experiment,
_load_tqdm,
@@ -940,7 +941,7 @@
def _ensure_async_traceable(
target: ATARGET_T,
) -> rh.SupportsLangsmithExtra[[dict], Awaitable]:
if not asyncio.iscoroutinefunction(target):
if not asyncio.iscoroutinefunction(target) and not _is_langchain_runnable(target):
if callable(target):
raise ValueError(
"Target must be an async function. For sync functions, use evaluate."
@@ -961,7 +962,10 @@
)
if rh.is_traceable_function(target):
return target # type: ignore
return rh.traceable(name="AsyncTarget")(target)
else:
if _is_langchain_runnable(target):
target = target.ainvoke # type: ignore[attr-defined]
return rh.traceable(name="AsyncTarget")(target)


def _aresolve_data(
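A note on what the change above amounts to in practice: an async target that is a LangChain Runnable is no longer rejected by the coroutine check; it is coerced to its bound `ainvoke` method and wrapped as the traced "AsyncTarget". A minimal usage sketch (the dataset name and evaluator are hypothetical; assumes `langchain_core` is installed and LangSmith credentials are configured):

```python
import asyncio

from langchain_core.runnables import RunnableLambda
from langsmith.evaluation import aevaluate


async def apredict(inputs: dict) -> dict:
    # Toy async target; wrapping it in a Runnable is what exercises the new branch.
    return {"output": inputs["in"] + 1}


def exact_match(run, example):
    # Hypothetical evaluator comparing run outputs to the reference outputs.
    return {"score": run.outputs == example.outputs}


async def main() -> None:
    chain = RunnableLambda(apredict)  # a Runnable, not a coroutine function
    # After this change the Runnable can be passed directly; internally it is
    # unwrapped to chain.ainvoke and traced under the name "AsyncTarget".
    await aevaluate(chain, data="my-dataset", evaluators=[exact_match])


# asyncio.run(main())  # left commented out: needs a real dataset and API key
```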
39 changes: 30 additions & 9 deletions python/langsmith/evaluation/_runner.py
@@ -58,6 +58,7 @@

if TYPE_CHECKING:
import pandas as pd
from langchain_core.runnables import Runnable

DataFrame = pd.DataFrame
else:
@@ -96,7 +97,7 @@


def evaluate(
target: TARGET_T,
target: Union[TARGET_T, Runnable],
/,
data: DATA_T,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
@@ -878,12 +879,12 @@ def _print_comparative_experiment_start(
)


def _is_callable(target: Union[TARGET_T, Iterable[schemas.Run]]) -> bool:
return callable(target) or (hasattr(target, "invoke") and callable(target.invoke))
def _is_callable(target: Union[TARGET_T, Iterable[schemas.Run], Runnable]) -> bool:
return callable(target) or _is_langchain_runnable(target)


def _evaluate(
target: Union[TARGET_T, Iterable[schemas.Run]],
target: Union[TARGET_T, Iterable[schemas.Run], Runnable],
/,
data: DATA_T,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
@@ -1664,12 +1665,13 @@ def _resolve_data(


def _ensure_traceable(
target: TARGET_T | rh.SupportsLangsmithExtra[[dict], dict],
target: TARGET_T | rh.SupportsLangsmithExtra[[dict], dict] | Runnable,
) -> rh.SupportsLangsmithExtra[[dict], dict]:
"""Ensure the target function is traceable."""
if not callable(target):
if not _is_callable(target):
raise ValueError(
"Target must be a callable function. For example:\n\n"
"Target must be a callable function or a langchain/langgraph object. For "
"example:\n\n"
"def predict(inputs: dict) -> dict:\n"
" # do work, like chain.invoke(inputs)\n"
" return {...}\n\n"
@@ -1679,9 +1681,11 @@ def _ensure_traceable(
")"
)
if rh.is_traceable_function(target):
fn = target
fn: rh.SupportsLangsmithExtra[[dict], dict] = target
else:
fn = rh.traceable(name="Target")(target)
if _is_langchain_runnable(target):
target = target.invoke # type: ignore[union-attr]
fn = rh.traceable(name="Target")(cast(Callable, target))
return fn
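For a Runnable target, the sync path above now does roughly the equivalent of tracing the bound `invoke` method; an illustrative sketch with a hypothetical chain (not the exact internal code path):

```python
from langchain_core.runnables import RunnableLambda
from langsmith import run_helpers as rh

chain = RunnableLambda(lambda inputs: {"output": inputs["in"] + 1})

# Approximately what _ensure_traceable does once _is_langchain_runnable(chain)
# is True: wrap the bound .invoke rather than the Runnable object itself.
traced = rh.traceable(name="Target")(chain.invoke)
print(traced({"in": 41}))  # {'output': 42}; recorded as a "Target" run when tracing is enabled
```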


@@ -1923,3 +1927,20 @@ def _flatten_experiment_results(
}
for x in results[start:end]
]


@functools.lru_cache(maxsize=1)
def _import_langchain_runnable() -> Optional[type]:
try:
from langchain_core.runnables import Runnable

return Runnable
except ImportError:
return None


def _is_langchain_runnable(o: Any) -> bool:
if (Runnable := _import_langchain_runnable()) and isinstance(o, Runnable):
Collaborator review comment: could just be bool()

return True
else:
return False
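A sketch of the reviewer's `bool()` suggestion above, collapsing the branch into a single expression (a possible follow-up, not what this commit contains; it reuses the `_import_langchain_runnable` helper defined above):

```python
from typing import Any


def _is_langchain_runnable(o: Any) -> bool:
    # Truthy only when langchain_core is importable AND o is a Runnable instance.
    return bool((Runnable := _import_langchain_runnable()) and isinstance(o, Runnable))
```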
6 changes: 2 additions & 4 deletions python/langsmith/run_helpers.py
@@ -151,9 +151,7 @@ def tracing_context(
get_run_tree_context = get_current_run_tree


def is_traceable_function(
func: Callable[P, R],
) -> TypeGuard[SupportsLangsmithExtra[P, R]]:
def is_traceable_function(func: Any) -> TypeGuard[SupportsLangsmithExtra[P, R]]:
"""Check if a function is @traceable decorated."""
return (
_is_traceable_function(func)
@@ -1445,7 +1443,7 @@ def _handle_container_end(
LOGGER.warning(f"Unable to process trace outputs: {repr(e)}")


def _is_traceable_function(func: Callable) -> bool:
def _is_traceable_function(func: Any) -> bool:
return getattr(func, "__langsmith_traceable__", False)


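The looser `Any` signatures let these helpers accept objects that are not statically typed as `Callable[P, R]`, such as a Runnable or its bound methods. A rough illustration of the intended call pattern (hypothetical chain; assumes `langchain_core` is installed):

```python
from langchain_core.runnables import RunnableLambda
from langsmith import run_helpers as rh

chain = RunnableLambda(lambda inputs: {"output": inputs["in"] + 1})

# Neither the Runnable nor its bound .invoke carries the @traceable marker,
# so both checks are expected to return False rather than raise.
print(rh.is_traceable_function(chain))         # False
print(rh.is_traceable_function(chain.invoke))  # False
```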
39 changes: 28 additions & 11 deletions python/tests/unit_tests/evaluation/test_runner.py
@@ -120,7 +120,8 @@ def _wait_until(condition: Callable, timeout: int = 8):

@pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher")
@pytest.mark.parametrize("blocking", [False, True])
def test_evaluate_results(blocking: bool) -> None:
@pytest.mark.parametrize("as_runnable", [False, True])
def test_evaluate_results(blocking: bool, as_runnable: bool) -> None:
session = mock.Mock()
ds_name = "my-dataset"
ds_id = "00886375-eb2a-4038-9032-efff60309896"
@@ -180,6 +181,15 @@ def predict(inputs: dict) -> dict:
ordering_of_stuff.append("predict")
return {"output": inputs["in"] + 1}

if as_runnable:
try:
from langchain_core.runnables import RunnableLambda
except ImportError:
pytest.skip("langchain-core not installed.")
return
else:
predict = RunnableLambda(predict)

def score_value_first(run, example):
ordering_of_stuff.append("evaluate")
return {"score": 0.3}
@@ -263,26 +273,24 @@ async def my_other_func(inputs: dict, other_val: int):
with pytest.raises(ValueError, match=match):
evaluate(functools.partial(my_other_func, other_val=3), data="foo")

if sys.version_info < (3, 10):
return
try:
from langchain_core.runnables import RunnableLambda
except ImportError:
pytest.skip("langchain-core not installed.")

@RunnableLambda
def foo(inputs: dict):
return "bar"

with pytest.raises(ValueError, match=match):
evaluate(foo.ainvoke, data="foo")
if sys.version_info < (3, 10):
return
with pytest.raises(ValueError, match=match):
evaluate(functools.partial(foo.ainvoke, inputs={"foo": "bar"}), data="foo")
evaluate(
functools.partial(RunnableLambda(my_func).ainvoke, inputs={"foo": "bar"}),
data="foo",
)


@pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher")
@pytest.mark.parametrize("blocking", [False, True])
async def test_aevaluate_results(blocking: bool) -> None:
@pytest.mark.parametrize("as_runnable", [False, True])
async def test_aevaluate_results(blocking: bool, as_runnable: bool) -> None:
session = mock.Mock()
ds_name = "my-dataset"
ds_id = "00886375-eb2a-4038-9032-efff60309896"
@@ -343,6 +351,15 @@ async def predict(inputs: dict) -> dict:
ordering_of_stuff.append("predict")
return {"output": inputs["in"] + 1}

if as_runnable:
try:
from langchain_core.runnables import RunnableLambda
except ImportError:
pytest.skip("langchain-core not installed.")
return
else:
predict = RunnableLambda(predict)

async def score_value_first(run, example):
ordering_of_stuff.append("evaluate")
return {"score": 0.3}
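Outside the mocked client used in these tests, the behavior exercised by the new `as_runnable` parametrization looks roughly like this (hypothetical dataset and evaluator; requires LangSmith credentials):

```python
from langchain_core.runnables import RunnableLambda
from langsmith.evaluation import evaluate


def predict(inputs: dict) -> dict:
    return {"output": inputs["in"] + 1}


def correct(run, example):
    # Hypothetical evaluator; the keys depend on how the dataset is structured.
    return {"score": int(run.outputs["output"] == example.outputs["answer"])}


# Both forms are accepted after this PR; the Runnable is traced via .invoke.
evaluate(predict, data="my-dataset", evaluators=[correct])
evaluate(RunnableLambda(predict), data="my-dataset", evaluators=[correct])
```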