python[patch]: pass Runnable to evaluate #1204

Merged: 6 commits, Nov 13, 2024
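
This change lets a LangChain/LangGraph Runnable be passed directly as the target of evaluate and aevaluate: runnables are detected with a lazily imported check (_is_langchain_runnable) and wrapped around their invoke/ainvoke methods before tracing. A short, hedged sketch of what this enables, assuming langchain-core is installed; the dataset name and evaluator below are illustrative placeholders, not part of this PR:

from langchain_core.runnables import RunnableLambda
from langsmith.evaluation import evaluate

def predict(inputs: dict) -> dict:
    # Toy target; the mocked datasets in this PR's tests use an "in" key.
    return {"output": inputs["in"] + 1}

def score(run, example) -> dict:
    # Illustrative evaluator with a fixed score, mirroring the PR's tests.
    return {"score": 0.3}

# With this change, a Runnable is accepted as the target: evaluate() detects it
# via _is_langchain_runnable and traces its .invoke method instead of rejecting
# it as non-callable.
results = evaluate(
    RunnableLambda(predict),
    data="my-dataset",  # illustrative dataset name
    evaluators=[score],
)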
14 changes: 10 additions & 4 deletions python/langsmith/evaluation/_arunner.py
@@ -1,4 +1,4 @@
"""V2 Evaluation Interface."""

Check notice on line 1 in python/langsmith/evaluation/_arunner.py (GitHub Actions / benchmark)

Benchmark results:

create_5_000_run_trees: Mean +- std dev: 612 ms +- 47 ms
create_10_000_run_trees: Mean +- std dev: 1.19 sec +- 0.06 sec
create_20_000_run_trees: Mean +- std dev: 1.19 sec +- 0.06 sec
dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 703 us +- 8 us
dumps_class_nested_py_leaf_50x100: Mean +- std dev: 25.1 ms +- 0.2 ms
dumps_class_nested_py_leaf_100x200: Mean +- std dev: 103 ms +- 2 ms
dumps_dataclass_nested_50x100: Mean +- std dev: 25.3 ms +- 0.3 ms
dumps_pydantic_nested_50x100: Mean +- std dev: 66.8 ms +- 16.5 ms
    WARNING: this result may be unstable; the standard deviation (16.5 ms) is 25% of the mean (66.8 ms). Try rerunning the benchmark with more runs, values and/or loops, or run 'python -m pyperf system tune' to reduce system jitter.
dumps_pydanticv1_nested_50x100: Mean +- std dev: 220 ms +- 30 ms
    WARNING: this result may be unstable; the standard deviation (30.3 ms) is 14% of the mean (220 ms). Try rerunning the benchmark with more runs, values and/or loops, or run 'python -m pyperf system tune' to reduce system jitter.

Check notice on line 1 in python/langsmith/evaluation/_arunner.py (GitHub Actions / benchmark)

Comparison against main:

+------------------------------------+---------+-----------------------+
| Benchmark                          | main    | changes               |
+====================================+=========+=======================+
| dumps_class_nested_py_leaf_100x200 | 105 ms  | 103 ms: 1.02x faster  |
+------------------------------------+---------+-----------------------+
| dumps_dataclass_nested_50x100      | 25.7 ms | 25.3 ms: 1.02x faster |
+------------------------------------+---------+-----------------------+
| dumps_class_nested_py_leaf_50x100  | 25.2 ms | 25.1 ms: 1.00x faster |
+------------------------------------+---------+-----------------------+
| Geometric mean                     | (ref)   | 1.00x faster          |
+------------------------------------+---------+-----------------------+

Benchmark hidden because not significant (6): dumps_pydantic_nested_50x100, dumps_class_nested_py_branch_and_leaf_200x400, create_5_000_run_trees, dumps_pydanticv1_nested_50x100, create_10_000_run_trees, create_20_000_run_trees

from __future__ import annotations

@@ -40,6 +40,7 @@
_ExperimentManagerMixin,
_extract_feedback_keys,
_ForwardResults,
_is_langchain_runnable,
_load_examples_map,
_load_experiment,
_load_tqdm,
@@ -379,8 +380,10 @@
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
) -> AsyncExperimentResults:
is_async_target = asyncio.iscoroutinefunction(target) or (
hasattr(target, "__aiter__") and asyncio.iscoroutine(target.__aiter__())
is_async_target = (
asyncio.iscoroutinefunction(target)
or (hasattr(target, "__aiter__") and asyncio.iscoroutine(target.__aiter__()))
or _is_langchain_runnable(target)
)
client = client or rt.get_cached_client()
runs = None if is_async_target else cast(Iterable[schemas.Run], target)
@@ -940,7 +943,7 @@
def _ensure_async_traceable(
target: ATARGET_T,
) -> rh.SupportsLangsmithExtra[[dict], Awaitable]:
if not asyncio.iscoroutinefunction(target):
if not asyncio.iscoroutinefunction(target) and not _is_langchain_runnable(target):
if callable(target):
raise ValueError(
"Target must be an async function. For sync functions, use evaluate."
@@ -961,7 +964,10 @@
)
if rh.is_traceable_function(target):
return target # type: ignore
return rh.traceable(name="AsyncTarget")(target)
else:
if _is_langchain_runnable(target):
target = target.ainvoke # type: ignore[attr-defined]
return rh.traceable(name="AsyncTarget")(target)


def _aresolve_data(
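
The _arunner.py changes above cover the async path: a Runnable target is now treated as async-capable and routed through its .ainvoke method by _ensure_async_traceable. A hedged sketch of the resulting usage, again assuming langchain-core is installed and with illustrative dataset/evaluator names:

import asyncio

from langchain_core.runnables import RunnableLambda
from langsmith.evaluation import aevaluate

async def predict(inputs: dict) -> dict:
    return {"output": inputs["in"] + 1}

async def score(run, example) -> dict:
    return {"score": 0.3}  # fixed placeholder score, as in this PR's tests

async def main() -> None:
    # The Runnable is accepted directly; its .ainvoke method is traced.
    await aevaluate(
        RunnableLambda(predict),
        data="my-dataset",  # illustrative dataset name
        evaluators=[score],
    )

asyncio.run(main())
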
41 changes: 29 additions & 12 deletions python/langsmith/evaluation/_runner.py
@@ -58,6 +58,7 @@

if TYPE_CHECKING:
import pandas as pd
from langchain_core.runnables import Runnable

DataFrame = pd.DataFrame
else:
@@ -96,7 +97,7 @@


def evaluate(
target: TARGET_T,
target: Union[TARGET_T, Runnable],
/,
data: DATA_T,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
@@ -878,12 +879,12 @@ def _print_comparative_experiment_start(
)


def _is_callable(target: Union[TARGET_T, Iterable[schemas.Run]]) -> bool:
return callable(target) or (hasattr(target, "invoke") and callable(target.invoke))
def _is_callable(target: Union[TARGET_T, Iterable[schemas.Run], Runnable]) -> bool:
return callable(target) or _is_langchain_runnable(target)


def _evaluate(
target: Union[TARGET_T, Iterable[schemas.Run]],
target: Union[TARGET_T, Iterable[schemas.Run], Runnable],
/,
data: DATA_T,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
@@ -1664,12 +1665,13 @@ def _resolve_data(


def _ensure_traceable(
target: TARGET_T | rh.SupportsLangsmithExtra[[dict], dict],
target: TARGET_T | rh.SupportsLangsmithExtra[[dict], dict] | Runnable,
) -> rh.SupportsLangsmithExtra[[dict], dict]:
"""Ensure the target function is traceable."""
if not callable(target):
if not _is_callable(target):
raise ValueError(
"Target must be a callable function. For example:\n\n"
"Target must be a callable function or a langchain/langgraph object. For "
"example:\n\n"
"def predict(inputs: dict) -> dict:\n"
" # do work, like chain.invoke(inputs)\n"
" return {...}\n\n"
@@ -1679,9 +1681,11 @@ def _ensure_traceable(
")"
)
if rh.is_traceable_function(target):
fn = target
fn: rh.SupportsLangsmithExtra[[dict], dict] = target
else:
fn = rh.traceable(name="Target")(target)
if _is_langchain_runnable(target):
target = target.invoke # type: ignore[union-attr]
fn = rh.traceable(name="Target")(cast(Callable, target))
return fn


@@ -1709,9 +1713,8 @@ def _resolve_experiment(
return experiment_, runs
# If we have runs, that means the experiment was already started.
if runs is not None:
if runs is not None:
runs_, runs = itertools.tee(runs)
first_run = next(runs_)
runs_, runs = itertools.tee(runs)
first_run = next(runs_)
experiment_ = client.read_project(project_id=first_run.session_id)
if not experiment_.name:
raise ValueError("Experiment name not found for provided runs.")
@@ -1923,3 +1926,17 @@ def _flatten_experiment_results(
}
for x in results[start:end]
]


@functools.lru_cache(maxsize=1)
def _import_langchain_runnable() -> Optional[type]:
try:
from langchain_core.runnables import Runnable

return Runnable
except ImportError:
return None


def _is_langchain_runnable(o: Any) -> bool:
return bool((Runnable := _import_langchain_runnable()) and isinstance(o, Runnable))
6 changes: 2 additions & 4 deletions python/langsmith/run_helpers.py
@@ -151,9 +151,7 @@ def tracing_context(
get_run_tree_context = get_current_run_tree


def is_traceable_function(
func: Callable[P, R],
) -> TypeGuard[SupportsLangsmithExtra[P, R]]:
def is_traceable_function(func: Any) -> TypeGuard[SupportsLangsmithExtra[P, R]]:
"""Check if a function is @traceable decorated."""
return (
_is_traceable_function(func)
@@ -1445,7 +1443,7 @@ def _handle_container_end(
LOGGER.warning(f"Unable to process trace outputs: {repr(e)}")


def _is_traceable_function(func: Callable) -> bool:
def _is_traceable_function(func: Any) -> bool:
return getattr(func, "__langsmith_traceable__", False)


2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.142"
version = "0.1.143"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <[email protected]>"]
license = "MIT"
43 changes: 31 additions & 12 deletions python/tests/unit_tests/evaluation/test_runner.py
@@ -80,7 +80,9 @@ def request(self, verb: str, endpoint: str, *args, **kwargs):
res = MagicMock()
res.json.return_value = {
"runs": [
r for r in self.runs.values() if "reference_example_id" in r
r
for r in self.runs.values()
if r["trace_id"] == r["id"] and r.get("reference_example_id")
]
}
return res
@@ -120,7 +122,8 @@ def _wait_until(condition: Callable, timeout: int = 8):

@pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher")
@pytest.mark.parametrize("blocking", [False, True])
def test_evaluate_results(blocking: bool) -> None:
@pytest.mark.parametrize("as_runnable", [False, True])
def test_evaluate_results(blocking: bool, as_runnable: bool) -> None:
session = mock.Mock()
ds_name = "my-dataset"
ds_id = "00886375-eb2a-4038-9032-efff60309896"
@@ -180,6 +183,15 @@ def predict(inputs: dict) -> dict:
ordering_of_stuff.append("predict")
return {"output": inputs["in"] + 1}

if as_runnable:
try:
from langchain_core.runnables import RunnableLambda
except ImportError:
pytest.skip("langchain-core not installed.")
return
else:
predict = RunnableLambda(predict)

def score_value_first(run, example):
ordering_of_stuff.append("evaluate")
return {"score": 0.3}
@@ -263,26 +275,24 @@ async def my_other_func(inputs: dict, other_val: int):
with pytest.raises(ValueError, match=match):
evaluate(functools.partial(my_other_func, other_val=3), data="foo")

if sys.version_info < (3, 10):
return
try:
from langchain_core.runnables import RunnableLambda
except ImportError:
pytest.skip("langchain-core not installed.")

@RunnableLambda
def foo(inputs: dict):
return "bar"

with pytest.raises(ValueError, match=match):
evaluate(foo.ainvoke, data="foo")
if sys.version_info < (3, 10):
return
with pytest.raises(ValueError, match=match):
evaluate(functools.partial(foo.ainvoke, inputs={"foo": "bar"}), data="foo")
evaluate(
functools.partial(RunnableLambda(my_func).ainvoke, inputs={"foo": "bar"}),
data="foo",
)


@pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher")
@pytest.mark.parametrize("blocking", [False, True])
async def test_aevaluate_results(blocking: bool) -> None:
@pytest.mark.parametrize("as_runnable", [False, True])
async def test_aevaluate_results(blocking: bool, as_runnable: bool) -> None:
session = mock.Mock()
ds_name = "my-dataset"
ds_id = "00886375-eb2a-4038-9032-efff60309896"
@@ -343,6 +353,15 @@ async def predict(inputs: dict) -> dict:
ordering_of_stuff.append("predict")
return {"output": inputs["in"] + 1}

if as_runnable:
try:
from langchain_core.runnables import RunnableLambda
except ImportError:
pytest.skip("langchain-core not installed.")
return
else:
predict = RunnableLambda(predict)

async def score_value_first(run, example):
ordering_of_stuff.append("evaluate")
return {"score": 0.3}