python[patch]: evaluate local mode (#1224)
Adds an `upload_results` flag that disables tracing of all runs (target and
evaluator) and skips creating an experiment in LangSmith:

```python
from langsmith import evaluate

results = evaluate(
    lambda x: x, 
    data="Sample Dataset 3", 
    evaluators=[lambda inputs: {"score": 1, "key": "correct"}],
    upload_results=False
)
```
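
The change lands in the async runner (`_arunner.py`) as well, so `aevaluate` accepts the same flag. A minimal async sketch, assuming the same "Sample Dataset 3" dataset exists and that `aevaluate` is importable from the top-level `langsmith` package:

```python
import asyncio

from langsmith import aevaluate


async def target(inputs: dict) -> dict:
    # Trivial target that echoes its inputs back.
    return inputs


async def main() -> None:
    # With upload_results=False, no experiment is created and no runs or
    # evaluator feedback are uploaded; results only live in memory.
    results = await aevaluate(
        target,
        data="Sample Dataset 3",
        evaluators=[lambda inputs: {"score": 1, "key": "correct"}],
        upload_results=False,
    )
    async for row in results:
        print(row["evaluation_results"])


asyncio.run(main())
```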

---------

Co-authored-by: William Fu-Hinthorn <[email protected]>
baskaryan and hinthornw authored Nov 22, 2024
1 parent c141d50 commit 6cf7c9b
Showing 6 changed files with 254 additions and 149 deletions.
53 changes: 36 additions & 17 deletions python/langsmith/evaluation/_arunner.py
@@ -31,6 +31,7 @@
from langsmith import run_trees as rt
from langsmith import utils as ls_utils
from langsmith._internal import _aiter as aitertools
from langsmith._internal._beta_decorator import _warn_once
from langsmith.evaluation._runner import (
AEVALUATOR_T,
DATA_T,
@@ -83,6 +84,7 @@ async def aevaluate(
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
upload_results: bool = True,
) -> AsyncExperimentResults:
r"""Evaluate an async target system or function on a given dataset.
@@ -241,6 +243,8 @@ async def aevaluate(
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
if not upload_results:
_warn_once("'upload_results' parameter is in beta.")
if experiment and experiment_prefix:
raise ValueError(
"Expected at most one of 'experiment' or 'experiment_prefix',"
@@ -260,6 +264,7 @@
client=client,
blocking=blocking,
experiment=experiment,
upload_results=upload_results,
)


@@ -379,6 +384,7 @@ async def _aevaluate(
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
upload_results: bool = True,
) -> AsyncExperimentResults:
is_async_target = (
asyncio.iscoroutinefunction(target)
@@ -401,6 +407,7 @@
description=description,
num_repetitions=num_repetitions,
runs=runs,
upload_results=upload_results,
).astart()
cache_dir = ls_utils.get_cache_dir(None)
if cache_dir is not None:
@@ -461,6 +468,7 @@ def __init__(
summary_results: Optional[AsyncIterable[EvaluationResults]] = None,
description: Optional[str] = None,
num_repetitions: int = 1,
upload_results: bool = True,
):
super().__init__(
experiment=experiment,
@@ -476,6 +484,7 @@
self._evaluation_results = evaluation_results
self._summary_results = summary_results
self._num_repetitions = num_repetitions
self._upload_results = upload_results

async def aget_examples(self) -> AsyncIterator[schemas.Example]:
if self._examples is None:
@@ -535,7 +544,7 @@ async def astart(self) -> _AsyncExperimentManager:
"No examples found in the dataset."
"Please ensure the data provided to aevaluate is not empty."
)
project = self._get_project(first_example)
project = self._get_project(first_example) if self._upload_results else None
self._print_experiment_start(project, first_example)
self._metadata["num_repetitions"] = self._num_repetitions
return self.__class__(
@@ -545,6 +554,7 @@
client=self.client,
runs=self._runs,
evaluation_results=self._evaluation_results,
upload_results=self._upload_results,
)

async def awith_predictions(
@@ -561,6 +571,7 @@ async def awith_predictions(
metadata=self._metadata,
client=self.client,
runs=(pred["run"] async for pred in r2),
upload_results=self._upload_results,
)

async def awith_evaluators(
@@ -580,6 +591,7 @@
runs=(result["run"] async for result in r2),
evaluation_results=(result["evaluation_results"] async for result in r3),
summary_results=self._summary_results,
upload_results=self._upload_results,
)

async def awith_summary_evaluators(
@@ -596,6 +608,7 @@
runs=self.aget_runs(),
evaluation_results=self._evaluation_results,
summary_results=aggregate_feedback_gen,
upload_results=self._upload_results,
)

async def aget_results(self) -> AsyncIterator[ExperimentResultRow]:
@@ -675,7 +688,7 @@ async def _arun_evaluators(
**current_context,
"project_name": "evaluators",
"metadata": metadata,
"enabled": True,
"enabled": "local" if not self._upload_results else True,
"client": self.client,
}
):
@@ -689,10 +702,12 @@
example=example,
)
eval_results["results"].extend(
self.client._select_eval_results(evaluator_response)
)
if self._upload_results:
self.client._log_evaluation_feedback(
evaluator_response, run=run, _executor=executor
)
)
except Exception as e:
try:
feedback_keys = _extract_feedback_keys(evaluator)
@@ -709,11 +724,12 @@
]
)
eval_results["results"].extend(
# TODO: This is a hack
self.client._select_eval_results(error_response)
)
if self._upload_results:
self.client._log_evaluation_feedback(
error_response, run=run, _executor=executor
)
)
except Exception as e2:
logger.debug(f"Error parsing feedback keys: {e2}")
pass
@@ -744,7 +760,7 @@ async def _aapply_summary_evaluators(
runs.append(run)
examples.append(example)
aggregate_feedback = []
project_id = self._get_experiment().id
project_id = self._get_experiment().id if self._upload_results else None
current_context = rh.get_tracing_context()
metadata = {
**(current_context["metadata"] or {}),
@@ -758,7 +774,7 @@
**current_context,
"project_name": "evaluators",
"metadata": metadata,
"enabled": True,
"enabled": "local" if not self._upload_results else True,
"client": self.client,
}
):
@@ -770,16 +786,17 @@
fn_name=evaluator.__name__,
)
aggregate_feedback.extend(flattened_results)
for result in flattened_results:
feedback = result.dict(exclude={"target_run_id"})
evaluator_info = feedback.pop("evaluator_info", None)
await aitertools.aio_to_thread(
self.client.create_feedback,
**feedback,
run_id=None,
project_id=project_id,
source_info=evaluator_info,
)
if self._upload_results:
for result in flattened_results:
feedback = result.dict(exclude={"target_run_id"})
evaluator_info = feedback.pop("evaluator_info", None)
await aitertools.aio_to_thread(
self.client.create_feedback,
**feedback,
run_id=None,
project_id=project_id,
source_info=evaluator_info,
)
except Exception as e:
logger.error(
f"Error running summary evaluator {repr(evaluator)}: {e}",
@@ -815,6 +832,8 @@ async def _get_dataset_splits(self) -> Optional[list[str]]:
return list(splits)

async def _aend(self) -> None:
if not self._upload_results:
return
experiment = self._experiment
if experiment is None:
raise ValueError("Experiment not started yet.")

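Even in local mode, the returned results can still be inspected in memory. A rough sketch of reading evaluator feedback from the sync call in the commit message, assuming each result row exposes an `evaluation_results` dict with a `results` list:

```python
from langsmith import evaluate

# Local-mode run: no experiment, no traced runs, no uploaded feedback.
results = evaluate(
    lambda inputs: inputs,
    data="Sample Dataset 3",
    evaluators=[lambda inputs: {"score": 1, "key": "correct"}],
    upload_results=False,
)

# Each row carries the run, the example, and the evaluator feedback.
for row in results:
    for res in row["evaluation_results"]["results"]:
        print(res.key, res.score)
```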