python[patch]: comparison evaluator simplification (#1240)
- simpler comparison args
- simpler comparison returns
baskaryan authored Nov 21, 2024
1 parent 9d52613 commit e3ab54c
Showing 4 changed files with 229 additions and 53 deletions.
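
For orientation, a minimal sketch (not part of this diff) of the two simplifications named in the commit message: comparison evaluators may now declare any of the positional arguments runs, example, inputs, outputs, and reference_outputs, and may return a bare list of scores that is paired with the runs by position. The function name below (response_length) is illustrative only.

def response_length(outputs: list, reference_outputs: dict) -> list:
    # "outputs" is the list of output dicts from the runs being compared;
    # "reference_outputs" is the example's reference outputs.
    # Returning a plain list is now accepted: the i-th score is attributed
    # to the i-th run when the result is coerced to ComparisonEvaluationResult.
    return [len(o["response"]) for o in outputs]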
4 changes: 1 addition & 3 deletions python/langsmith/evaluation/_runner.py
@@ -801,9 +801,7 @@ def evaluate_and_submit_feedback(
) as executor:
futures = []
for example_id, runs_list in tqdm(runs_dict.items()):
results[example_id] = {
"runs": runs_list,
}
results[example_id] = {"runs": runs_list}
for comparator in comparators:
if max_concurrency > 1:
future = executor.submit(
152 changes: 105 additions & 47 deletions python/langsmith/evaluation/evaluator.py
@@ -435,6 +435,10 @@ def __init__(
func (Callable): A function that takes a `Run` and an optional `Example` as
arguments, and returns an `EvaluationResult` or `EvaluationResults`.
"""
func = _normalize_comparison_evaluator_func(func)
if afunc:
afunc = _normalize_comparison_evaluator_func(afunc) # type: ignore[assignment]

wraps(func)(self)
from langsmith import run_helpers # type: ignore

@@ -505,7 +509,7 @@ def compare_runs(
example,
langsmith_extra={"run_id": source_run_id, "tags": tags},
)
return self._format_results(result, source_run_id)
return self._format_results(result, source_run_id, runs)

async def acompare_runs(
self, runs: Sequence[Run], example: Optional[Example] = None
@@ -516,7 +520,7 @@ async def acompare_runs(
provided arguments.
Args:
run (Run): The run to be evaluated.
runs (Run): The runs to be evaluated.
example (Optional[Example]): An optional example to be used
in the evaluation.
@@ -533,7 +537,7 @@ async def acompare_runs(
example,
langsmith_extra={"run_id": source_run_id, "tags": tags},
)
return self._format_results(result, source_run_id)
return self._format_results(result, source_run_id, runs)

def __call__(
self, runs: Sequence[Run], example: Optional[Example] = None
@@ -567,53 +571,31 @@ def _get_tags(runs: Sequence[Run]) -> List[str]:
tags.append("experiment:" + str(run.session_id))
return tags

def _coerce_evaluation_result(
self,
result: Union[EvaluationResult, dict],
source_run_id: uuid.UUID,
allow_no_key: bool = False,
) -> EvaluationResult:
if isinstance(result, EvaluationResult):
if not result.source_run_id:
result.source_run_id = source_run_id
return result
try:
if "key" not in result:
if allow_no_key:
result["key"] = self._name
return EvaluationResult(**{"source_run_id": source_run_id, **result})
except ValidationError as e:
raise ValueError(
"Expected an EvaluationResult object, or dict with a metric"
f" 'key' and optional 'score'; got {result}"
) from e

def _coerce_evaluation_results(
self,
results: Union[dict, EvaluationResults],
source_run_id: uuid.UUID,
) -> Union[EvaluationResult, EvaluationResults]:
if "results" in results:
cp = results.copy()
cp["results"] = [
self._coerce_evaluation_result(r, source_run_id=source_run_id)
for r in results["results"]
]
return EvaluationResults(**cp)

return self._coerce_evaluation_result(
cast(dict, results), allow_no_key=True, source_run_id=source_run_id
)

def _format_results(
self,
result: Union[dict, ComparisonEvaluationResult],
result: Union[dict, list, ComparisonEvaluationResult],
source_run_id: uuid.UUID,
runs: Sequence[Run],
) -> ComparisonEvaluationResult:
if isinstance(result, ComparisonEvaluationResult):
if not result.source_run_id:
result.source_run_id = source_run_id
return result
elif isinstance(result, list):
result = {
"scores": {run.id: score for run, score in zip(runs, result)},
"key": self._name,
"source_run_id": source_run_id,
}
elif isinstance(result, dict):
if "key" not in result:
result["key"] = self._name
else:
msg = (
"Expected 'dict', 'list' or 'ComparisonEvaluationResult' result "
f"object. Received: {result=}"
)
raise ValueError(msg)
try:
return ComparisonEvaluationResult(
**{"source_run_id": source_run_id, **result}
@@ -669,13 +651,15 @@ def _normalize_evaluator_func(
else:
if inspect.iscoroutinefunction(func):

async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT:
async def awrapper(
run: Run, example: Optional[Example]
) -> _RUNNABLE_OUTPUT:
arg_map = {
"run": run,
"example": example,
"inputs": example.inputs,
"inputs": example.inputs if example else {},
"outputs": run.outputs or {},
"reference_outputs": example.outputs or {},
"reference_outputs": example.outputs or {} if example else {},
}
args = (arg_map[arg] for arg in positional_args)
return await func(*args)
@@ -693,9 +677,83 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT:
arg_map = {
"run": run,
"example": example,
"inputs": example.inputs,
"inputs": example.inputs if example else {},
"outputs": run.outputs or {},
"reference_outputs": example.outputs or {},
"reference_outputs": example.outputs or {} if example else {},
}
args = (arg_map[arg] for arg in positional_args)
return func(*args)

wrapper.__name__ = (
getattr(func, "__name__")
if hasattr(func, "__name__")
else wrapper.__name__
)
return wrapper # type: ignore[return-value]


def _normalize_comparison_evaluator_func(
func: Callable,
) -> Union[
Callable[[Sequence[Run], Optional[Example]], _COMPARISON_OUTPUT],
Callable[[Sequence[Run], Optional[Example]], Awaitable[_COMPARISON_OUTPUT]],
]:
supported_args = ("runs", "example", "inputs", "outputs", "reference_outputs")
sig = inspect.signature(func)
positional_args = [
pname
for pname, p in sig.parameters.items()
if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY)
]
if not positional_args or (
not all(pname in supported_args for pname in positional_args)
and len(positional_args) != 2
):
msg = (
f"Invalid evaluator function. Must have at least one positional "
f"argument. Supported positional arguments are {supported_args}. Please "
f"see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators"
# noqa: E501
)
raise ValueError(msg)
# For backwards compatibility we assume custom arg names are List[Run] and
# List[Example] types, respectively.
elif not all(
pname in supported_args for pname in positional_args
) or positional_args == ["runs", "example"]:
return func
else:
if inspect.iscoroutinefunction(func):

async def awrapper(
runs: Sequence[Run], example: Optional[Example]
) -> _COMPARISON_OUTPUT:
arg_map = {
"runs": runs,
"example": example,
"inputs": example.inputs if example else {},
"outputs": [run.outputs or {} for run in runs],
"reference_outputs": example.outputs or {} if example else {},
}
args = (arg_map[arg] for arg in positional_args)
return await func(*args)

awrapper.__name__ = (
getattr(func, "__name__")
if hasattr(func, "__name__")
else awrapper.__name__
)
return awrapper # type: ignore[return-value]

else:

def wrapper(runs: Sequence[Run], example: Example) -> _COMPARISON_OUTPUT:
arg_map = {
"runs": runs,
"example": example,
"inputs": example.inputs if example else {},
"outputs": [run.outputs or {} for run in runs],
"reference_outputs": example.outputs or {} if example else {},
}
args = (arg_map[arg] for arg in positional_args)
return func(*args)
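Conceptually, the new list branch in _format_results reduces to a positional zip of the runs against the returned scores; a standalone sketch under the assumption that both sequences have the same length (the helper name is hypothetical):

def scores_by_run_id(runs: list, scores: list) -> dict:
    # Mirrors the list case added to _format_results: the i-th score is
    # keyed by the i-th run's id. The real method additionally fills in
    # "key" and "source_run_id" before building ComparisonEvaluationResult.
    return {run.id: score for run, score in zip(runs, scores)}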
2 changes: 1 addition & 1 deletion python/tests/unit_tests/evaluation/test_evaluator.py
@@ -51,7 +51,7 @@ def sample_evaluator(run: Run, example: Optional[Example]) -> EvaluationResult:
assert result.score == 1.0


async def test_dynamie_comparison_run_evaluator():
async def test_dynamic_comparison_run_evaluator():
def foo(runs: list, example):
return ComparisonEvaluationResult(key="bar", scores={uuid.uuid4(): 3.1})

124 changes: 122 additions & 2 deletions python/tests/unit_tests/evaluation/test_runner.py
@@ -22,6 +22,7 @@
from langsmith.evaluation._arunner import aevaluate, aevaluate_existing
from langsmith.evaluation._runner import evaluate_existing
from langsmith.evaluation.evaluator import (
_normalize_comparison_evaluator_func,
_normalize_evaluator_func,
_normalize_summary_evaluator,
)
@@ -58,7 +59,14 @@ def request(self, verb: str, endpoint: str, *args, **kwargs):
response = MagicMock()
response.json.return_value = res
return response

elif (
endpoint
== f"http://localhost:1984/sessions/{self.created_session['id']}"
): # type: ignore
res = self.created_session # type: ignore
response = MagicMock()
response.json.return_value = res
return response
else:
self.should_fail = True
raise ValueError(f"Unknown endpoint: {endpoint}")
@@ -94,6 +102,14 @@ def request(self, verb: str, endpoint: str, *args, **kwargs):
response = MagicMock()
response.json.return_value = {}
return response
elif endpoint == "http://localhost:1984/datasets/comparative":
response = MagicMock()
self.created_comparative_experiment = json.loads(kwargs["data"]) | {
"tenant_id": self.tenant_id,
"modified_at": datetime.now(),
}
response.json.return_value = self.created_comparative_experiment
return response

else:
raise ValueError(f"Unknown endpoint: {endpoint}")
@@ -303,7 +319,10 @@ def score_value(run, example):
return {"score": 0.7}

ex_results = evaluate_existing(
fake_request.created_session["name"], evaluators=[score_value], client=client
fake_request.created_session["name"],
evaluators=[score_value],
client=client,
blocking=blocking,
)
second_item = next(itertools.islice(iter(ex_results), 1, 2))
first_list = list(ex_results)
@@ -669,3 +688,104 @@ def summary_eval_unknown_positional_args(runs, examples, foo):
def test__normalize_summary_evaluator_invalid(evaluator: Callable) -> None:
with pytest.raises(ValueError, match="Invalid evaluator function."):
_normalize_summary_evaluator(evaluator)


def comparison_eval(runs, example):
return [len(r.outputs["response"]) for r in runs]


def comparison_eval_simple(inputs, outputs, reference_outputs):
return [len(o["response"]) for o in outputs]


def comparison_eval_no_inputs(outputs, reference_outputs):
return [min(len(o["response"]), len(reference_outputs["answer"])) for o in outputs]


@pytest.mark.parametrize(
"evaluator",
[comparison_eval, comparison_eval_simple, comparison_eval_no_inputs],
)
def test__normalize_comparison_evaluator(evaluator: Callable) -> None:
runs = [
ls_schemas.Run(
name="foo",
start_time=datetime.now(),
run_type="chain",
id=uuid.uuid4(),
dotted_order="a",
outputs={"response": "c" * 2},
),
ls_schemas.Run(
name="foo",
start_time=datetime.now(),
run_type="chain",
id=uuid.uuid4(),
dotted_order="d",
outputs={"response": "e" * 3},
),
]
example = ls_schemas.Example(
id=uuid.uuid4(), inputs={"in": "b"}, outputs={"answer": "f" * 4}
)
normalized = _normalize_comparison_evaluator_func(evaluator)
assert normalized(runs, example) == [2, 3]


async def acomparison_eval(runs, example):
return [len(r.outputs["response"]) for r in runs]


async def acomparison_eval_simple(inputs, outputs, reference_outputs):
return [len(o["response"]) for o in outputs]


async def acomparison_eval_no_inputs(outputs, reference_outputs):
return [min(len(o["response"]), len(reference_outputs["answer"])) for o in outputs]


@pytest.mark.parametrize(
"evaluator",
[acomparison_eval, acomparison_eval_simple, acomparison_eval_no_inputs],
)
async def test__normalize_comparison_evaluator_async(evaluator: Callable) -> None:
runs = [
ls_schemas.Run(
name="foo",
start_time=datetime.now(),
run_type="chain",
id=uuid.uuid4(),
dotted_order="a",
outputs={"response": "c" * 2},
),
ls_schemas.Run(
name="foo",
start_time=datetime.now(),
run_type="chain",
id=uuid.uuid4(),
dotted_order="d",
outputs={"response": "e" * 3},
),
]
example = ls_schemas.Example(
id=uuid.uuid4(), inputs={"in": "b"}, outputs={"answer": "f" * 4}
)
normalized = _normalize_comparison_evaluator_func(evaluator)
assert await normalized(runs, example) == [2, 3]


def comparison_eval_kwargs(*, runs, example):
return


def comparison_eval_unknown_positional_args(runs, example, foo):
return


@pytest.mark.parametrize(
"evaluator",
[comparison_eval_kwargs, comparison_eval_unknown_positional_args],
)
def test__normalize_comparison_evaluator_invalid(evaluator: Callable) -> None:
with pytest.raises(ValueError, match="Invalid evaluator function."):
_normalize_comparison_evaluator_func(evaluator)
