From 0a2f1d5c35ed711f8aed302ed309937864741527 Mon Sep 17 00:00:00 2001
From: William Fu-Hinthorn <13333726+hinthornw@users.noreply.github.com>
Date: Fri, 13 Dec 2024 08:33:21 -0800
Subject: [PATCH] gs

---
 python/langsmith/evaluation/_runner.py | 54 ++++++++++++++++----------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index b54c8b858..e98792517 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -651,6 +651,7 @@ def evaluate_comparative(
     metadata: Optional[dict] = None,
     load_nested: bool = False,
     randomize_order: bool = False,
+    upload_results: bool = True,
 ) -> ComparativeExperimentResults:
     r"""Evaluate existing experiment runs against each other.
 
@@ -675,6 +676,8 @@ def evaluate_comparative(
             Default is to only load the top-level root runs.
         randomize_order (bool): Whether to randomize the order of the outputs
             for each evaluation. Default is False.
+        upload_results (bool): Whether to upload the results to LangSmith.
+            Default is True.
 
     Returns:
         ComparativeExperimentResults: The results of the comparative evaluation.
@@ -910,6 +913,7 @@ def evaluate_comparative(
 
     comparators = [comparison_evaluator(evaluator) for evaluator in evaluators or []]
     results: dict = {}
+    tracing_mode = "local" if not upload_results else True
 
     def evaluate_and_submit_feedback(
         runs_list: list[schemas.Run],
@@ -920,10 +924,26 @@ def evaluate_and_submit_feedback(
         feedback_group_id = uuid.uuid4()
         if randomize_order:
             random.shuffle(runs_list)
-        with rh.tracing_context(project_name="evaluators", client=client):
+        current_context = rh.get_tracing_context()
+        metadata = (current_context["metadata"] or {}) | {
+            "experiment": comparative_experiment.name,
+            "experiment_id": comparative_experiment.id,
+            "reference_example_id": example.id,
+            "reference_run_ids": [r.id for r in runs_list],
+        }
+        with rh.tracing_context(
+            **(
+                current_context
+                | {
+                    "project_name": "evaluators",
+                    "metadata": metadata,
+                    "enabled": tracing_mode,
+                    "client": client,
+                }
+            )
+        ):
             result = comparator.compare_runs(runs_list, example)
-            if client is None:
-                raise ValueError("Client is required to submit feedback.")
+
         comments = (
             {str(rid): result.comment for rid in result.scores}
             if isinstance(result.comment, str)
@@ -1548,17 +1568,14 @@ def _run_evaluators(
         executor: cf.ThreadPoolExecutor,
     ) -> ExperimentResultRow:
         current_context = rh.get_tracing_context()
-        metadata = {
-            **(current_context["metadata"] or {}),
-            **{
-                "experiment": self.experiment_name,
-                "reference_example_id": current_results["example"].id,
-                "reference_run_id": current_results["run"].id,
-            },
+        metadata = (current_context["metadata"] or {}) | {
+            "experiment": self.experiment_name,
+            "reference_example_id": current_results["example"].id,
+            "reference_run_id": current_results["run"].id,
         }
         with rh.tracing_context(
-            **{
-                **current_context,
+            current_context
+            | {
                 "project_name": "evaluators",
                 "metadata": metadata,
                 "enabled": "local" if not self._upload_results else True,
@@ -1681,16 +1698,13 @@ def _apply_summary_evaluators(
         with ls_utils.ContextThreadPoolExecutor() as executor:
             project_id = self._get_experiment().id if self._upload_results else None
             current_context = rh.get_tracing_context()
-            metadata = {
-                **(current_context["metadata"] or {}),
-                **{
-                    "experiment": self.experiment_name,
-                    "experiment_id": project_id,
-                },
+            metadata = (current_context["metadata"] or {}) | {
+                "experiment": self.experiment_name,
+                "experiment_id": project_id,
             }
             with rh.tracing_context(
-                **{
-                    **current_context,
+                current_context
+                | {
                     "project_name": "evaluators",
                     "metadata": metadata,
                     "client": self.client,