python[patch]: evaluate local mode #1224

python/langsmith/evaluation/_arunner.py

@@ -1,4 +1,4 @@
"""V2 Evaluation Interface."""

from __future__ import annotations

@@ -83,6 +83,7 @@
    client: Optional[langsmith.Client] = None,
    blocking: bool = True,
    experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
+   upload_results: bool = True,
) -> AsyncExperimentResults:
    r"""Evaluate an async target system or function on a given dataset.

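For orientation, here is a minimal usage sketch of the flag added above, assuming the public `aevaluate` API. The target function, evaluator, and dataset name are illustrative, and the dataset is still read through the LangSmith client even when nothing is uploaded.

```python
import asyncio

from langsmith.evaluation import aevaluate


async def target(inputs: dict) -> dict:
    # Hypothetical system under test.
    return {"answer": str(inputs.get("question", "")).strip().lower()}


def exact_match(run, example) -> dict:
    # Hypothetical evaluator comparing the prediction to the reference output.
    return {
        "key": "exact_match",
        "score": int(run.outputs["answer"] == example.outputs["answer"]),
    }


async def main() -> None:
    results = await aevaluate(
        target,
        data="my-dataset",        # illustrative dataset name
        evaluators=[exact_match],
        upload_results=False,     # local mode: no experiment, runs, or feedback are uploaded
    )
    async for row in results:    # rows stay in memory on the experiment manager
        print(row["evaluation_results"])


if __name__ == "__main__":
    asyncio.run(main())
```
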
@@ -260,6 +261,7 @@
        client=client,
        blocking=blocking,
        experiment=experiment,
+       upload_results=upload_results,
    )

@@ -379,6 +381,7 @@
    client: Optional[langsmith.Client] = None,
    blocking: bool = True,
    experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
+   upload_results: bool = True,
) -> AsyncExperimentResults:
    is_async_target = (
        asyncio.iscoroutinefunction(target)

@@ -401,6 +404,7 @@
        description=description,
        num_repetitions=num_repetitions,
        runs=runs,
+       upload_results=upload_results,
    ).astart()
    cache_dir = ls_utils.get_cache_dir(None)
    if cache_dir is not None:

@@ -461,6 +465,7 @@
        summary_results: Optional[AsyncIterable[EvaluationResults]] = None,
        description: Optional[str] = None,
        num_repetitions: int = 1,
+       upload_results: bool = True,
    ):
        super().__init__(
            experiment=experiment,

@@ -476,6 +481,7 @@
        self._evaluation_results = evaluation_results
        self._summary_results = summary_results
        self._num_repetitions = num_repetitions
+       self._upload_results = upload_results

    async def aget_examples(self) -> AsyncIterator[schemas.Example]:
        if self._examples is None:

@@ -535,7 +541,7 @@
                "No examples found in the dataset."
                "Please ensure the data provided to aevaluate is not empty."
            )
-       project = self._get_project(first_example)
+       project = self._get_project(first_example) if self._upload_results else None
        self._print_experiment_start(project, first_example)
        self._metadata["num_repetitions"] = self._num_repetitions
        return self.__class__(

@@ -545,6 +551,7 @@
            client=self.client,
            runs=self._runs,
            evaluation_results=self._evaluation_results,
+           upload_results=self._upload_results,
        )

    async def awith_predictions(

@@ -561,6 +568,7 @@
            metadata=self._metadata,
            client=self.client,
            runs=(pred["run"] async for pred in r2),
+           upload_results=self._upload_results,
        )

    async def awith_evaluators(

@@ -580,6 +588,7 @@
            runs=(result["run"] async for result in r2),
            evaluation_results=(result["evaluation_results"] async for result in r3),
            summary_results=self._summary_results,
+           upload_results=self._upload_results,
        )

    async def awith_summary_evaluators(

@@ -596,6 +605,7 @@
            runs=self.aget_runs(),
            evaluation_results=self._evaluation_results,
            summary_results=aggregate_feedback_gen,
+           upload_results=self._upload_results,
        )

    async def aget_results(self) -> AsyncIterator[ExperimentResultRow]:

@@ -675,7 +685,7 @@
                **current_context,
                "project_name": "evaluators",
                "metadata": metadata,
-               "enabled": True,
+               "enabled": "local" if not self._upload_results else True,
                "client": self.client,
            }
        ):

Review thread on the "enabled" line:
Reviewer: What does "local" mean here?
Reply: means we're tracing (ie tracking intermediate steps) but not sending to langsmith
Reply: In that case, where do they get stored?
Reply: we're keeping all the runs in memory currently (on the ExperimentManager)

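Given the thread above (runs and feedback stay in memory on the experiment manager when `upload_results=False`), the only place to read scores from is the returned results object. A hedged sketch of aggregating them locally; the row and field names follow the ExperimentResultRow and EvaluationResults structures and should be treated as assumptions:

```python
from collections import defaultdict
from typing import Any, DefaultDict, Dict, List


async def summarize_local_scores(results: Any) -> Dict[str, float]:
    """Average each feedback key across the in-memory experiment results."""
    scores: DefaultDict[str, List[float]] = defaultdict(list)
    async for row in results:  # each row carries "run", "example", "evaluation_results"
        for res in row["evaluation_results"]["results"]:
            if res.score is not None:
                scores[res.key].append(float(res.score))
    return {key: sum(vals) / len(vals) for key, vals in scores.items()}
```
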
@@ -689,10 +699,12 @@
                    example=example,
                )
                eval_results["results"].extend(
                    self.client._select_eval_results(evaluator_response)
                )
+               if self._upload_results:
                    self.client._log_evaluation_feedback(
                        evaluator_response, run=run, _executor=executor
                    )
-               )
            except Exception as e:
                try:
                    feedback_keys = _extract_feedback_keys(evaluator)

@@ -709,11 +721,12 @@
                        ]
                    )
                    eval_results["results"].extend(
                        # TODO: This is a hack
                        self.client._select_eval_results(error_response)
                    )
+                   if self._upload_results:
                        self.client._log_evaluation_feedback(
                            error_response, run=run, _executor=executor
                        )
-                   )
                except Exception as e2:
                    logger.debug(f"Error parsing feedback keys: {e2}")
                    pass

@@ -758,7 +771,7 @@
                **current_context,
                "project_name": "evaluators",
                "metadata": metadata,
-               "enabled": True,
+               "enabled": "local" if not self._upload_results else True,
                "client": self.client,
            }
        ):

@@ -770,16 +783,17 @@
                        fn_name=evaluator.__name__,
                    )
                    aggregate_feedback.extend(flattened_results)
-                   for result in flattened_results:
-                       feedback = result.dict(exclude={"target_run_id"})
-                       evaluator_info = feedback.pop("evaluator_info", None)
-                       await aitertools.aio_to_thread(
-                           self.client.create_feedback,
-                           **feedback,
-                           run_id=None,
-                           project_id=project_id,
-                           source_info=evaluator_info,
-                       )
+                   if self._upload_results:
+                       for result in flattened_results:
+                           feedback = result.dict(exclude={"target_run_id"})
+                           evaluator_info = feedback.pop("evaluator_info", None)
+                           await aitertools.aio_to_thread(
+                               self.client.create_feedback,
+                               **feedback,
+                               run_id=None,
+                               project_id=project_id,
+                               source_info=evaluator_info,
+                           )
                except Exception as e:
                    logger.error(
                        f"Error running summary evaluator {repr(evaluator)}: {e}",

Review thread on lines +789 to +799:
Reviewer: A couple of comments here:
Reply: fwiw the only change here was condition in L786 (rest is just indentation)
Reply: Ya we can asyncio.gather()
Reply: re: background processing, it doesn't get put in the background queue because we lack a trace_id in this case

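On the asyncio.gather() suggestion in the thread above, a hedged sketch of what batching the per-result feedback uploads could look like. It is an illustration of the idea, not the implementation in this PR; asyncio.to_thread stands in for aitertools.aio_to_thread, and the helper name is hypothetical.

```python
import asyncio
from typing import Any, Callable, Iterable


async def upload_feedback_concurrently(
    results: Iterable[Any],
    create_feedback: Callable[..., Any],
    project_id: Any,
) -> None:
    """Fan out the blocking create_feedback calls instead of awaiting them one by one."""

    def _upload(result: Any) -> None:
        # Mirrors the per-result handling in the diff above.
        feedback = result.dict(exclude={"target_run_id"})
        evaluator_info = feedback.pop("evaluator_info", None)
        create_feedback(
            **feedback,
            run_id=None,
            project_id=project_id,
            source_info=evaluator_info,
        )

    # asyncio.to_thread keeps the synchronous client calls off the event loop,
    # and gather dispatches them concurrently.
    await asyncio.gather(*(asyncio.to_thread(_upload, result) for result in results))
```
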
@@ -815,6 +829,8 @@
        return list(splits)

    async def _aend(self) -> None:
+       if not self._upload_results:
+           return
        experiment = self._experiment
        if experiment is None:
            raise ValueError("Experiment not started yet.")

Reviewer: flyby comment: the fact that we have to update so many call sites feels pretty problematic
Reply: wdym call sites?
Reply: Our data model for evals is indeed too complicated lol - tracer sessions, feedback, runs in multiple sessions, ay caramba