python[patch]: evaluate local mode (#1224)
Adds an `upload_results` flag that disables tracing of all runs (target and
evaluator) and skips creating an experiment in LangSmith:

```python
from langsmith import evaluate

results = evaluate(
    lambda x: x, 
    data="Sample Dataset 3", 
    evaluators=[lambda inputs: {"score": 1, "key": "correct"}],
    upload_results=False
)
```
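
The change lands in the async runner (`_arunner.py`) as well, so `aevaluate` accepts the same flag. A minimal async sketch, assuming the same "Sample Dataset 3" dataset exists and that `aevaluate` is importable from the top-level `langsmith` package:

```python
import asyncio

from langsmith import aevaluate


async def target(inputs: dict) -> dict:
    # Trivial target that echoes its inputs back.
    return inputs


async def main() -> None:
    # With upload_results=False, no experiment is created and no runs or
    # evaluator feedback are uploaded; results only live in memory.
    results = await aevaluate(
        target,
        data="Sample Dataset 3",
        evaluators=[lambda inputs: {"score": 1, "key": "correct"}],
        upload_results=False,
    )
    async for row in results:
        print(row["evaluation_results"])


asyncio.run(main())
```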

---------

Co-authored-by: William Fu-Hinthorn <[email protected]>
baskaryan and hinthornw authored Nov 22, 2024
1 parent c141d50 commit 6cf7c9b
Showing 6 changed files with 254 additions and 149 deletions.
53 changes: 36 additions & 17 deletions python/langsmith/evaluation/_arunner.py
@@ -31,6 +31,7 @@
from langsmith import run_trees as rt
from langsmith import utils as ls_utils
from langsmith._internal import _aiter as aitertools
from langsmith._internal._beta_decorator import _warn_once
from langsmith.evaluation._runner import (
AEVALUATOR_T,
DATA_T,
@@ -83,6 +84,7 @@ async def aevaluate(
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
upload_results: bool = True,
) -> AsyncExperimentResults:
r"""Evaluate an async target system or function on a given dataset.
@@ -241,6 +243,8 @@ async def aevaluate(
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
if not upload_results:
_warn_once("'upload_results' parameter is in beta.")
if experiment and experiment_prefix:
raise ValueError(
"Expected at most one of 'experiment' or 'experiment_prefix',"
@@ -260,6 +264,7 @@
client=client,
blocking=blocking,
experiment=experiment,
upload_results=upload_results,
)


@@ -379,6 +384,7 @@ async def _aevaluate(
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
upload_results: bool = True,
) -> AsyncExperimentResults:
is_async_target = (
asyncio.iscoroutinefunction(target)
@@ -401,6 +407,7 @@
description=description,
num_repetitions=num_repetitions,
runs=runs,
upload_results=upload_results,
).astart()
cache_dir = ls_utils.get_cache_dir(None)
if cache_dir is not None:
@@ -461,6 +468,7 @@ def __init__(
summary_results: Optional[AsyncIterable[EvaluationResults]] = None,
description: Optional[str] = None,
num_repetitions: int = 1,
upload_results: bool = True,
):
super().__init__(
experiment=experiment,
@@ -476,6 +484,7 @@
self._evaluation_results = evaluation_results
self._summary_results = summary_results
self._num_repetitions = num_repetitions
self._upload_results = upload_results

async def aget_examples(self) -> AsyncIterator[schemas.Example]:
if self._examples is None:
@@ -535,7 +544,7 @@ async def astart(self) -> _AsyncExperimentManager:
"No examples found in the dataset."
"Please ensure the data provided to aevaluate is not empty."
)
project = self._get_project(first_example)
project = self._get_project(first_example) if self._upload_results else None
self._print_experiment_start(project, first_example)
self._metadata["num_repetitions"] = self._num_repetitions
return self.__class__(
@@ -545,6 +554,7 @@
client=self.client,
runs=self._runs,
evaluation_results=self._evaluation_results,
upload_results=self._upload_results,
)

async def awith_predictions(
@@ -561,6 +571,7 @@ async def awith_predictions(
metadata=self._metadata,
client=self.client,
runs=(pred["run"] async for pred in r2),
upload_results=self._upload_results,
)

async def awith_evaluators(
@@ -580,6 +591,7 @@
runs=(result["run"] async for result in r2),
evaluation_results=(result["evaluation_results"] async for result in r3),
summary_results=self._summary_results,
upload_results=self._upload_results,
)

async def awith_summary_evaluators(
@@ -596,6 +608,7 @@
runs=self.aget_runs(),
evaluation_results=self._evaluation_results,
summary_results=aggregate_feedback_gen,
upload_results=self._upload_results,
)

async def aget_results(self) -> AsyncIterator[ExperimentResultRow]:
@@ -675,7 +688,7 @@ async def _arun_evaluators(
**current_context,
"project_name": "evaluators",
"metadata": metadata,
"enabled": True,
"enabled": "local" if not self._upload_results else True,
"client": self.client,
}
):
@@ -689,10 +702,12 @@
example=example,
)
eval_results["results"].extend(
self.client._select_eval_results(evaluator_response)
)
if self._upload_results:
self.client._log_evaluation_feedback(
evaluator_response, run=run, _executor=executor
)
)
except Exception as e:
try:
feedback_keys = _extract_feedback_keys(evaluator)
@@ -709,11 +724,12 @@
]
)
eval_results["results"].extend(
# TODO: This is a hack
self.client._select_eval_results(error_response)
)
if self._upload_results:
self.client._log_evaluation_feedback(
error_response, run=run, _executor=executor
)
)
except Exception as e2:
logger.debug(f"Error parsing feedback keys: {e2}")
pass
@@ -744,7 +760,7 @@ async def _aapply_summary_evaluators(
runs.append(run)
examples.append(example)
aggregate_feedback = []
project_id = self._get_experiment().id
project_id = self._get_experiment().id if self._upload_results else None
current_context = rh.get_tracing_context()
metadata = {
**(current_context["metadata"] or {}),
@@ -758,7 +774,7 @@
**current_context,
"project_name": "evaluators",
"metadata": metadata,
"enabled": True,
"enabled": "local" if not self._upload_results else True,
"client": self.client,
}
):
@@ -770,16 +786,17 @@
fn_name=evaluator.__name__,
)
aggregate_feedback.extend(flattened_results)
for result in flattened_results:
feedback = result.dict(exclude={"target_run_id"})
evaluator_info = feedback.pop("evaluator_info", None)
await aitertools.aio_to_thread(
self.client.create_feedback,
**feedback,
run_id=None,
project_id=project_id,
source_info=evaluator_info,
)
if self._upload_results:
for result in flattened_results:
feedback = result.dict(exclude={"target_run_id"})
evaluator_info = feedback.pop("evaluator_info", None)
await aitertools.aio_to_thread(
self.client.create_feedback,
**feedback,
run_id=None,
project_id=project_id,
source_info=evaluator_info,
)
except Exception as e:
logger.error(
f"Error running summary evaluator {repr(evaluator)}: {e}",
@@ -815,6 +832,8 @@ async def _get_dataset_splits(self) -> Optional[list[str]]:
return list(splits)

async def _aend(self) -> None:
if not self._upload_results:
return
experiment = self._experiment
if experiment is None:
raise ValueError("Experiment not started yet.")

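Even in local mode, the returned results can still be inspected in memory. A rough sketch of reading evaluator feedback from the sync call in the commit message, assuming each result row exposes an `evaluation_results` dict with a `results` list:

```python
from langsmith import evaluate

# Local-mode run: no experiment, no traced runs, no uploaded feedback.
results = evaluate(
    lambda inputs: inputs,
    data="Sample Dataset 3",
    evaluators=[lambda inputs: {"score": 1, "key": "correct"}],
    upload_results=False,
)

# Each row carries the run, the example, and the evaluator feedback.
for row in results:
    for res in row["evaluation_results"]["results"]:
        print(res.key, res.score)
```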