From 388764f085ec9abc0a94043684bd1e47b5f93e32 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 11 Nov 2024 10:35:01 -0800 Subject: [PATCH 01/31] rfc: evaluators can return primitives --- python/langsmith/evaluation/evaluator.py | 53 +++++++++++++++++------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 065f5b16b..e12a1a6ab 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -224,7 +224,7 @@ def _coerce_evaluation_result( self, result: Union[EvaluationResult, dict], source_run_id: uuid.UUID, - allow_no_key: bool = False, + default_name: str = "", ) -> EvaluationResult: if isinstance(result, EvaluationResult): if not result.source_run_id: @@ -237,8 +237,7 @@ def _coerce_evaluation_result( f" 'key' and optional 'score'; got empty result: {result}" ) if "key" not in result: - if allow_no_key: - result["key"] = self._name + result["key"] = default_name if all(k not in result for k in ("score", "value", "comment")): raise ValueError( "Expected an EvaluationResult object, or dict with a metric" @@ -259,33 +258,45 @@ def _coerce_evaluation_results( if "results" in results: cp = results.copy() cp["results"] = [ - self._coerce_evaluation_result(r, source_run_id=source_run_id) - for r in results["results"] + self._coerce_evaluation_result( + r, source_run_id=source_run_id, default_name=f"{self._name}_{i+1}" + ) + for i, r in enumerate(results["results"]) ] return EvaluationResults(**cp) return self._coerce_evaluation_result( - cast(dict, results), allow_no_key=True, source_run_id=source_run_id + cast(dict, results), source_run_id=source_run_id, default_name=self._name ) def _format_result( self, - result: Union[EvaluationResult, EvaluationResults, dict], + result: Union[ + EvaluationResult, EvaluationResults, dict, str, int, bool, float, list + ], source_run_id: uuid.UUID, ) -> Union[EvaluationResult, EvaluationResults]: - if isinstance(result, EvaluationResult): - if not result.source_run_id: - result.source_run_id = source_run_id - return result - if not result: + if not result and not isinstance(result, (int, float, bool)): raise ValueError( "Expected an EvaluationResult or EvaluationResults object, or a" " dict with key and one of score or value, EvaluationResults," f" got {result}" ) - if not isinstance(result, dict): + + if isinstance(result, EvaluationResult): + if not result.source_run_id: + result.source_run_id = source_run_id + return result + elif isinstance(result, list): + result = {"results": [_primitive_to_result_dict(r) for r in result]} # type: ignore[misc] + elif isinstance(result, (bool, int, float, str, dict)): + result = _primitive_to_result_dict( + cast(Union[bool, int, float, str, dict], result) + ) + else: raise ValueError( - f"Expected a dict, EvaluationResult, or EvaluationResults, got {result}" + f"Expected a dict, str, bool, int, float, list, EvaluationResult, or " + f"EvaluationResults. 
Got {result}" ) return self._coerce_evaluation_results(result, source_run_id) @@ -632,3 +643,17 @@ def comparison_evaluator( ) -> DynamicComparisonRunEvaluator: """Create a comaprison evaluator from a function.""" return DynamicComparisonRunEvaluator(func) + + +def _primitive_to_result_dict(result: Union[float, str, int, bool, dict]) -> dict: + if isinstance(result, (bool, float, int)): + return {"score": result} + elif isinstance(result, str): + return {"value": result} + elif isinstance(result, dict): + return result + else: + raise ValueError( + f"Expected evaluation result to be int, float, str, bool, or dict. " + f"Received: {result}" + ) From a18c13981363dfda6f2b5d971879eb952f801827 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 11 Nov 2024 11:55:22 -0800 Subject: [PATCH 02/31] fmt --- python/langsmith/evaluation/_runner.py | 12 +++++ python/langsmith/evaluation/evaluator.py | 52 +++++++------------ .../unit_tests/evaluation/test_evaluator.py | 33 ++++++++++-- .../unit_tests/evaluation/test_runner.py | 46 +++++++++++++--- 4 files changed, 98 insertions(+), 45 deletions(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 111986b76..ee16da764 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -7,6 +7,7 @@ import concurrent.futures as cf import datetime import functools +import importlib import inspect import itertools import logging @@ -277,6 +278,17 @@ def evaluate( " # ... other parameters\n" ")" ) + if not callable(target): + if importlib.util.find_spec("langchain_core"): + from langchain_core.runnables import Runnable + + if isinstance(target, Runnable): + target = target.invoke + + if not callable(target): + msg = "" + raise ValueError(msg) + if experiment and experiment_prefix: raise ValueError( "Expected at most one of 'experiment' or 'experiment_prefix'," diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index e12a1a6ab..2470d61cc 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -224,7 +224,7 @@ def _coerce_evaluation_result( self, result: Union[EvaluationResult, dict], source_run_id: uuid.UUID, - default_name: str = "", + allow_no_key: bool = False, ) -> EvaluationResult: if isinstance(result, EvaluationResult): if not result.source_run_id: @@ -236,8 +236,8 @@ def _coerce_evaluation_result( "Expected an EvaluationResult object, or dict with a metric" f" 'key' and optional 'score'; got empty result: {result}" ) - if "key" not in result: - result["key"] = default_name + if "key" not in result and allow_no_key: + result["key"] = self._name if all(k not in result for k in ("score", "value", "comment")): raise ValueError( "Expected an EvaluationResult object, or dict with a metric" @@ -258,15 +258,13 @@ def _coerce_evaluation_results( if "results" in results: cp = results.copy() cp["results"] = [ - self._coerce_evaluation_result( - r, source_run_id=source_run_id, default_name=f"{self._name}_{i+1}" - ) + self._coerce_evaluation_result(r, source_run_id=source_run_id) for i, r in enumerate(results["results"]) ] return EvaluationResults(**cp) return self._coerce_evaluation_result( - cast(dict, results), source_run_id=source_run_id, default_name=self._name + cast(dict, results), source_run_id=source_run_id, allow_no_key=True ) def _format_result( @@ -276,23 +274,27 @@ def _format_result( ], source_run_id: uuid.UUID, ) -> Union[EvaluationResult, EvaluationResults]: - if not result and 
not isinstance(result, (int, float, bool)): + if isinstance(result, (bool, float, int)): + result = {"score": result} + elif not result: raise ValueError( - "Expected an EvaluationResult or EvaluationResults object, or a" - " dict with key and one of score or value, EvaluationResults," - f" got {result}" + f"Expected a non-empty dict, str, bool, int, float, list, " + f"EvaluationResult, or EvaluationResults. Got {result}" ) - - if isinstance(result, EvaluationResult): + elif isinstance(result, EvaluationResult): if not result.source_run_id: result.source_run_id = source_run_id return result elif isinstance(result, list): - result = {"results": [_primitive_to_result_dict(r) for r in result]} # type: ignore[misc] - elif isinstance(result, (bool, int, float, str, dict)): - result = _primitive_to_result_dict( - cast(Union[bool, int, float, str, dict], result) - ) + if not all(isinstance(x, dict) for x in result): + raise ValueError( + f"Expected a list of dicts or EvaluationResult. Received {result}." + ) + result = {"results": result} # type: ignore[misc] + elif isinstance(result, str): + result = {"value": result} + elif isinstance(result, dict): + pass else: raise ValueError( f"Expected a dict, str, bool, int, float, list, EvaluationResult, or " @@ -643,17 +645,3 @@ def comparison_evaluator( ) -> DynamicComparisonRunEvaluator: """Create a comaprison evaluator from a function.""" return DynamicComparisonRunEvaluator(func) - - -def _primitive_to_result_dict(result: Union[float, str, int, bool, dict]) -> dict: - if isinstance(result, (bool, float, int)): - return {"score": result} - elif isinstance(result, str): - return {"value": result} - elif isinstance(result, dict): - return result - else: - raise ValueError( - f"Expected evaluation result to be int, float, str, bool, or dict. 
" - f"Received: {result}" - ) diff --git a/python/tests/unit_tests/evaluation/test_evaluator.py b/python/tests/unit_tests/evaluation/test_evaluator.py index c3b907701..09f1d7eb3 100644 --- a/python/tests/unit_tests/evaluation/test_evaluator.py +++ b/python/tests/unit_tests/evaluation/test_evaluator.py @@ -321,8 +321,8 @@ async def sample_evaluator( assert result["results"][1].score == 2.0 -@pytest.mark.parametrize("response", [None, {}, {"accuracy": 5}]) -async def test_evaluator_raises_for_null_ouput(response: Any): +@pytest.mark.parametrize("response", [None, {}, []]) +async def test_evaluator_raises_for_null_output(response: Any): @run_evaluator # type: ignore def bad_evaluator(run: schemas.Run, example: schemas.Example): return response @@ -334,13 +334,36 @@ async def abad_evaluator(run: schemas.Run, example: schemas.Example): fake_run = MagicMock() fake_example = MagicMock() - with pytest.raises(ValueError, match="Expected an EvaluationResult "): + with pytest.raises(ValueError, match="Expected a non-empty "): bad_evaluator.evaluate_run(fake_run, fake_example) - with pytest.raises(ValueError, match="Expected an EvaluationResult "): + with pytest.raises(ValueError, match="Expected a non-empty "): await bad_evaluator.aevaluate_run(fake_run, fake_example) - with pytest.raises(ValueError, match="Expected an EvaluationResult "): + with pytest.raises(ValueError, match="Expected a non-empty "): + await abad_evaluator.aevaluate_run(fake_run, fake_example) + + +@pytest.mark.parametrize("response", [[5], {"accuracy": 5}]) +async def test_evaluator_raises_for_bad_output(response: Any): + @run_evaluator # type: ignore + def bad_evaluator(run: schemas.Run, example: schemas.Example): + return response + + @run_evaluator # type: ignore + async def abad_evaluator(run: schemas.Run, example: schemas.Example): + return response + + fake_run = MagicMock() + fake_example = MagicMock() + + with pytest.raises(ValueError, match="Expected"): + bad_evaluator.evaluate_run(fake_run, fake_example) + + with pytest.raises(ValueError, match="Expected"): + await bad_evaluator.aevaluate_run(fake_run, fake_example) + + with pytest.raises(ValueError, match="Expected"): await abad_evaluator.aevaluate_run(fake_run, fake_example) diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index d20960d3e..9f61723da 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -184,11 +184,26 @@ def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} + def eval_float(run, example): + ordering_of_stuff.append("evaluate") + return 0.2 + + def eval_str(run, example): + ordering_of_stuff.append("evaluate") + return "good" + + def eval_list(run, example): + ordering_of_stuff.append("evaluate") + return [ + {"score": True, "key": "list_eval_bool"}, + {"score": 1, "key": "list_eval_int"}, + ] + results = evaluate( predict, client=client, data=dev_split, - evaluators=[score_value_first], + evaluators=[score_value_first, eval_float, eval_str, eval_list], num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -219,14 +234,14 @@ def score_value_first(run, example): assert fake_request.created_session _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2) + _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5) _wait_until(lambda: slow_index is not None) # Want it to be interleaved - assert 
ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS + assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS * 2) - 2 + assert slow_index == (N_PREDS - 1) * 5 def score_value(run, example): return {"score": 0.7} @@ -347,11 +362,26 @@ async def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} + async def eval_float(run, example): + ordering_of_stuff.append("evaluate") + return 0.2 + + async def eval_str(run, example): + ordering_of_stuff.append("evaluate") + return "good" + + async def eval_list(run, example): + ordering_of_stuff.append("evaluate") + return [ + {"score": True, "key": "list_eval_bool"}, + {"score": 1, "key": "list_eval_int"}, + ] + results = await aevaluate( predict, client=client, data=dev_split, - evaluators=[score_value_first], + evaluators=[score_value_first, eval_float, eval_str, eval_list], num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -387,14 +417,14 @@ async def score_value_first(run, example): assert fake_request.created_session _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2) + _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5) _wait_until(lambda: slow_index is not None) # Want it to be interleaved - assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS + assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS assert slow_index is not None # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS * 2) - 2 + assert slow_index == (N_PREDS - 1) * 5 assert fake_request.created_session["name"] From 7ea48979abf76b2b12e7ef8ed750fcbc0cfa1dd3 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 11 Nov 2024 11:56:58 -0800 Subject: [PATCH 03/31] undo --- python/langsmith/evaluation/_runner.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index ee16da764..111986b76 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -7,7 +7,6 @@ import concurrent.futures as cf import datetime import functools -import importlib import inspect import itertools import logging @@ -278,17 +277,6 @@ def evaluate( " # ... 
other parameters\n" ")" ) - if not callable(target): - if importlib.util.find_spec("langchain_core"): - from langchain_core.runnables import Runnable - - if isinstance(target, Runnable): - target = target.invoke - - if not callable(target): - msg = "" - raise ValueError(msg) - if experiment and experiment_prefix: raise ValueError( "Expected at most one of 'experiment' or 'experiment_prefix'," From 33a621cd4eec7ef95da9c5889f9f830a2374578f Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Wed, 13 Nov 2024 13:16:19 -0800 Subject: [PATCH 04/31] Lazy internal import (#1198) --- python/langsmith/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/langsmith/utils.py b/python/langsmith/utils.py index cfbc071d2..f7c257d1f 100644 --- a/python/langsmith/utils.py +++ b/python/langsmith/utils.py @@ -40,7 +40,6 @@ from urllib3.util import Retry # type: ignore[import-untyped] from langsmith import schemas as ls_schemas -from langsmith._internal import _patch as patch_urllib3 _LOGGER = logging.getLogger(__name__) @@ -494,6 +493,8 @@ def with_cache( 'pip install -U "langsmith[vcr]"' ) # Fix concurrency issue in vcrpy's patching + from langsmith._internal import _patch as patch_urllib3 + patch_urllib3.patch_urllib3() def _filter_request_headers(request: Any) -> Any: From cf80a82aafd47d6038caa33236d58527755d5d51 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 14 Nov 2024 09:53:12 -0800 Subject: [PATCH 05/31] fmt --- python/langsmith/evaluation/_runner.py | 2 +- python/langsmith/evaluation/evaluator.py | 2 +- .../unit_tests/evaluation/test_runner.py | 32 +++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index ceb5d8561..382bc6399 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1848,7 +1848,7 @@ def extract_evaluation_results_keys(node, variables): try: tree = ast.parse(python_code) function_def = tree.body[0] - if not isinstance(function_def, ast.FunctionDef): + if not isinstance(function_def, (ast.FunctionDef, ast.AsyncFunctionDef)): return [] variables = {} diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 2470d61cc..21e475d6c 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -259,7 +259,7 @@ def _coerce_evaluation_results( cp = results.copy() cp["results"] = [ self._coerce_evaluation_result(r, source_run_id=source_run_id) - for i, r in enumerate(results["results"]) + for r in results["results"] ] return EvaluationResults(**cp) diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 58fd8b46f..38cee0488 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -275,6 +275,22 @@ def score_value(run, example): assert r["run"].reference_example_id in dev_xample_ids assert not fake_request.should_fail + # Returning list of non-dicts not supported. 
+ def bad_eval_list(run, example): + ordering_of_stuff.append("evaluate") + return ["foo", 1] + + results = evaluate( + predict, + client=client, + data=dev_split, + evaluators=[bad_eval_list], + num_repetitions=NUM_REPETITIONS, + blocking=blocking, + ) + for r in results: + assert r["evaluation_results"]["results"][0].extra == {"error": True} + def test_evaluate_raises_for_async(): async def my_func(inputs: dict): @@ -461,3 +477,19 @@ async def score_value(run, example): assert r["evaluation_results"]["results"][0].score == 0.7 assert r["run"].reference_example_id in dev_xample_ids assert not fake_request.should_fail + # Returning list of non-dicts not supported. + + async def bad_eval_list(run, example): + ordering_of_stuff.append("evaluate") + return ["foo", 1] + + results = await aevaluate( + predict, + client=client, + data=dev_split, + evaluators=[bad_eval_list], + num_repetitions=NUM_REPETITIONS, + blocking=blocking, + ) + async for r in results: + assert r["evaluation_results"]["results"][0].extra == {"error": True} From 2df6407c85ddce2d014f8cbc2f3441bfe8b08b99 Mon Sep 17 00:00:00 2001 From: jakerachleff Date: Thu, 14 Nov 2024 16:17:24 -0800 Subject: [PATCH 06/31] perf: cut down cpu time of aevaluate by 30% on 1-4MB examples with this one trick (#1217) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A huge percentage of our `aevaluate` calls were trying to serialize part of the example to send in the evaluator trace (where we used to show the first 10k characters). Let's just not do that. ## Experiment view before and after Screenshot 2024-11-14 at 3 22 21 PM Screenshot 2024-11-14 at 3 23 06 PM ## flame graphs before and after Screenshot 2024-11-14 at 2 43 36 PM Screenshot 2024-11-14 at 3 32 50 PM ## Benchmarking code ``` async def run_abenchmark( n_examples=200, min_size=1000000, max_size=4000000, n_evaluators=5, min_llm_time=0.2, max_llm_time=1.2, n_concurrency=None ): # setup dataset inputs = [ {"key": "a" * randint(min_size / 2, max_size / 2)} for _ in range(n_examples) ] outputs = [ {"key": "b" * randint(min_size / 2, max_size / 2)} for _ in range(n_examples) ] if ls_client.has_dataset(dataset_name="jake_benchmarking"): ls_client.delete_dataset(dataset_name="jake_benchmarking") print("Creating dataset...") dataset = ls_client.create_dataset("jake_benchmarking") print("Uploading examples...") for i in range(0, n_examples, UPLOAD_BATCH_SIZE): ls_client.create_examples( dataset_id=dataset.id, inputs=inputs[i:i+UPLOAD_BATCH_SIZE], outputs=outputs[i:i+UPLOAD_BATCH_SIZE] ) # setup evaluators evaluators = [] for i in range(n_evaluators): evaluators.append(create_aevaluator(f"jake_benchmarking_{i}", uniform(min_llm_time, max_llm_time))) async def target(input): await asyncio.sleep(uniform(min_llm_time, max_llm_time)) return {"value": "b" * len(input["key"])} print("Running evaluation...") await aevaluate( target, data=dataset.id, evaluators=evaluators, max_concurrency=n_concurrency, client=ls_client ) ``` --- python/langsmith/schemas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py index 80e112e46..dff6d212a 100644 --- a/python/langsmith/schemas.py +++ b/python/langsmith/schemas.py @@ -124,6 +124,10 @@ def url(self) -> Optional[str]: return f"{self._host_url}{path}" return None + def __repr__(self): + """Return a string representation of the RunBase object.""" + return f"{self.__class__}(id={self.id}, dataset_id={self.dataset_id}, link='{self.url}')" + class 
ExampleSearch(ExampleBase): """Example returned via search.""" From 2cc6e1fb7a96a653ddd087bf930f15ef0fd5a8e1 Mon Sep 17 00:00:00 2001 From: jakerachleff Date: Thu, 14 Nov 2024 23:28:50 -0800 Subject: [PATCH 07/31] nit: stop calling importlib on every api req (#1219) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each call is like 1-3ms, and we call to get the version as part of every client request's headers. If you're running an eval over a lot of traces (say 10 evaluators on 1k examples), that's slow. Esp if just doing custom code evaluators which should run fast. ### Flame Graphs before and after (ran on 2k smallish examples w 10 evaluators) Screenshot 2024-11-14 at 5 25 43 PM Screenshot 2024-11-14 at 5 25 53 PM --------- Co-authored-by: William FH <13333726+hinthornw@users.noreply.github.com> --- python/langsmith/__init__.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/langsmith/__init__.py b/python/langsmith/__init__.py index f3a1de90a..9055cb5f6 100644 --- a/python/langsmith/__init__.py +++ b/python/langsmith/__init__.py @@ -1,5 +1,6 @@ """LangSmith Client.""" +from importlib import metadata from typing import TYPE_CHECKING, Any if TYPE_CHECKING: @@ -21,15 +22,17 @@ ContextThreadPoolExecutor, ) +# Avoid calling into importlib on every call to __version__ +version = "" +try: + version = metadata.version(__package__) +except metadata.PackageNotFoundError: + pass + def __getattr__(name: str) -> Any: if name == "__version__": - try: - from importlib import metadata - - return metadata.version(__package__) - except metadata.PackageNotFoundError: - return "" + return version elif name == "Client": from langsmith.client import Client From 47671be397dc74a9116851afb3018cd2b43f25d7 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Fri, 15 Nov 2024 14:57:47 -0800 Subject: [PATCH 08/31] python[patch]: evaluate accept Dataset (#1222) Seems unnecessary that after i create a dataset i cant just pass in the dataset object to evaluate --- python/langsmith/evaluation/_runner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 382bc6399..8c2fe800c 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -67,7 +67,7 @@ TARGET_T = Callable[[dict], dict] # Data format: dataset-name, dataset_id, or examples -DATA_T = Union[str, uuid.UUID, Iterable[schemas.Example]] +DATA_T = Union[str, uuid.UUID, Iterable[schemas.Example], schemas.Dataset] # Summary evaluator runs over the whole dataset # and reports aggregate metric(s) SUMMARY_EVALUATOR_T = Union[ @@ -1661,6 +1661,8 @@ def _resolve_data( return client.list_examples(dataset_name=data) elif isinstance(data, uuid.UUID): return client.list_examples(dataset_id=data) + elif isinstance(data, schemas.Dataset): + return client.list_examples(dataset_id=data.id) return data From 9336fce0f22bb0d8128a4feb9407917bf6a611cb Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Mon, 18 Nov 2024 09:39:49 -0800 Subject: [PATCH 09/31] python[patch]: accept simple evaluators (#1200) can write evaluators like this: ```python from langsmith import evaluate def simp(inputs: dict, outputs: dict, reference_outputs: dict) -> dict: return {"results": [ {"score": inputs == outputs, "key": 'identity'}, {"score": outputs == reference_outputs, "key": "correct"} ]} evaluate( 
(lambda x: x), data="Sample Dataset 3", evaluators=[simp], ) ``` example experiment: left-tray-86 https://dev.smith.langchain.com/public/e7782ea0-3de5-4352-8cd4-7b2cdbb03e4c/d --------- Co-authored-by: William FH <13333726+hinthornw@users.noreply.github.com> --- python/langsmith/evaluation/_runner.py | 1 + python/langsmith/evaluation/evaluator.py | 78 ++++++++++- .../unit_tests/evaluation/test_runner.py | 121 ++++++++++++++---- 3 files changed, 175 insertions(+), 25 deletions(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 8c2fe800c..8ed55f6bf 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -87,6 +87,7 @@ [schemas.Run, Optional[schemas.Example]], Union[EvaluationResult, EvaluationResults], ], + Callable[..., Union[dict, EvaluationResults, EvaluationResult]], ] AEVALUATOR_T = Union[ Callable[ diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 21e475d6c..feb0e95e4 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -194,6 +194,10 @@ def __init__( func (Callable): A function that takes a `Run` and an optional `Example` as arguments, and returns a dict or `ComparisonEvaluationResult`. """ + func = _normalize_evaluator_func(func) + if afunc: + afunc = _normalize_evaluator_func(afunc) # type: ignore[assignment] + wraps(func)(self) from langsmith import run_helpers # type: ignore @@ -288,7 +292,7 @@ def _format_result( elif isinstance(result, list): if not all(isinstance(x, dict) for x in result): raise ValueError( - f"Expected a list of dicts or EvaluationResult. Received {result}." + f"Expected a list of dicts or EvaluationResults. Received {result}." ) result = {"results": result} # type: ignore[misc] elif isinstance(result, str): @@ -645,3 +649,75 @@ def comparison_evaluator( ) -> DynamicComparisonRunEvaluator: """Create a comaprison evaluator from a function.""" return DynamicComparisonRunEvaluator(func) + + +def _normalize_evaluator_func( + func: Callable, +) -> Union[ + Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], + Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]], +]: + supported_args = ("run", "example", "inputs", "outputs", "reference_outputs") + sig = inspect.signature(func) + positional_args = [ + pname + for pname, p in sig.parameters.items() + if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) + ] + if not positional_args or ( + not all(pname in supported_args for pname in positional_args) + and len(positional_args) != 2 + ): + msg = ( + f"Invalid evaluator function. Must have at least one positional " + f"argument. Supported positional arguments are {supported_args}. Please " + f"see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" + # noqa: E501 + ) + raise ValueError(msg) + elif not all( + pname in supported_args for pname in positional_args + ) or positional_args == ["run", "example"]: + # For backwards compatibility we assume custom arg names are Run and Example + # types, respectively. 
+ return func + else: + if inspect.iscoroutinefunction(func): + + async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + arg_map = { + "run": run, + "example": example, + "inputs": example.inputs, + "outputs": run.outputs or {}, + "reference_outputs": example.outputs or {}, + } + args = (arg_map[arg] for arg in positional_args) + return await func(*args) + + awrapper.__name__ = ( + getattr(func, "__name__") + if hasattr(func, "__name__") + else awrapper.__name__ + ) + return awrapper # type: ignore[return-value] + + else: + + def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + arg_map = { + "run": run, + "example": example, + "inputs": example.inputs, + "outputs": run.outputs or {}, + "reference_outputs": example.outputs or {}, + } + args = (arg_map[arg] for arg in positional_args) + return func(*args) + + wrapper.__name__ = ( + getattr(func, "__name__") + if hasattr(func, "__name__") + else wrapper.__name__ + ) + return wrapper # type: ignore[return-value] diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 38cee0488..408d4508d 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -21,6 +21,7 @@ from langsmith.client import Client from langsmith.evaluation._arunner import aevaluate, aevaluate_existing from langsmith.evaluation._runner import evaluate_existing +from langsmith.evaluation.evaluator import _normalize_evaluator_func class FakeRequest: @@ -120,6 +121,16 @@ def _wait_until(condition: Callable, timeout: int = 8): raise TimeoutError("Condition not met") +def _create_example(idx: int) -> ls_schemas.Example: + return ls_schemas.Example( + id=uuid.uuid4(), + inputs={"in": idx}, + outputs={"answer": idx + 1}, + dataset_id="00886375-eb2a-4038-9032-efff60309896", + created_at=datetime.now(timezone.utc), + ) + + @pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher") @pytest.mark.parametrize("blocking", [False, True]) @pytest.mark.parametrize("as_runnable", [False, True]) @@ -128,15 +139,6 @@ def test_evaluate_results(blocking: bool, as_runnable: bool) -> None: ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - def _create_example(idx: int) -> ls_schemas.Example: - return ls_schemas.Example( - id=uuid.uuid4(), - inputs={"in": idx}, - outputs={"answer": idx + 1}, - dataset_id=ds_id, - created_at=datetime.now(timezone.utc), - ) - SPLIT_SIZE = 3 NUM_REPETITIONS = 4 ds_examples = [_create_example(i) for i in range(10)] @@ -196,6 +198,14 @@ def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} + def score_unpacked_inputs_outputs(inputs, outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + def score_unpacked_inputs_outputs_reference(inputs, outputs, reference_outputs): + ordering_of_stuff.append("evaluate") + return {"score": reference_outputs["answer"]} + def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -211,11 +221,20 @@ def eval_list(run, example): {"score": 1, "key": "list_eval_int"}, ] + evaluators = [ + score_value_first, + score_unpacked_inputs_outputs, + score_unpacked_inputs_outputs_reference, + eval_float, + eval_str, + eval_list, + ] + results = evaluate( predict, client=client, data=dev_split, - evaluators=[score_value_first, eval_float, eval_str, eval_list], + evaluators=evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -242,18 +261,19 @@ def 
eval_list(run, example): for r in results: assert r["run"].outputs["output"] == r["example"].inputs["in"] + 1 # type: ignore assert set(r["run"].outputs.keys()) == {"output"} # type: ignore + assert len(r["evaluation_results"]["results"]) == len(evaluators) + 1 assert fake_request.created_session _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5) + _wait_until(lambda: len(ordering_of_stuff) == (N_PREDS * (len(evaluators) + 1))) _wait_until(lambda: slow_index is not None) # Want it to be interleaved assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS - 1) * 5 + assert slow_index == (len(evaluators) + 1) * (N_PREDS - 1) def score_value(run, example): return {"score": 0.7} @@ -291,6 +311,25 @@ def bad_eval_list(run, example): for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} + # test invalid evaluators + # args need to be positional + def eval1(*, inputs, outputs): + pass + + # if more than 2 positional args, they must all have default arg names + # (run, example, ...) + def eval2(x, y, inputs): + pass + + evaluators = [eval1, eval2] + + for eval_ in evaluators: + with pytest.raises(ValueError, match="Invalid evaluator function."): + _normalize_evaluator_func(eval_) + + with pytest.raises(ValueError, match="Invalid evaluator function."): + evaluate((lambda x: x), data=ds_examples, evaluators=[eval_], client=client) + def test_evaluate_raises_for_async(): async def my_func(inputs: dict): @@ -328,15 +367,6 @@ async def test_aevaluate_results(blocking: bool, as_runnable: bool) -> None: ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - def _create_example(idx: int) -> ls_schemas.Example: - return ls_schemas.Example( - id=uuid.uuid4(), - inputs={"in": idx}, - outputs={"answer": idx + 1}, - dataset_id=ds_id, - created_at=datetime.now(timezone.utc), - ) - SPLIT_SIZE = 3 NUM_REPETITIONS = 4 ds_examples = [_create_example(i) for i in range(10)] @@ -397,6 +427,16 @@ async def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} + async def score_unpacked_inputs_outputs(inputs, outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + async def score_unpacked_inputs_outputs_reference( + inputs, outputs, reference_outputs + ): + ordering_of_stuff.append("evaluate") + return {"score": reference_outputs["answer"]} + async def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -412,11 +452,20 @@ async def eval_list(run, example): {"score": 1, "key": "list_eval_int"}, ] + evaluators = [ + score_value_first, + score_unpacked_inputs_outputs, + score_unpacked_inputs_outputs_reference, + eval_float, + eval_str, + eval_list, + ] + results = await aevaluate( predict, client=client, data=dev_split, - evaluators=[score_value_first, eval_float, eval_str, eval_list], + evaluators=evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -452,14 +501,14 @@ async def eval_list(run, example): assert fake_request.created_session _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5) + _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * (len(evaluators) + 1)) _wait_until(lambda: slow_index is not None) # Want it to be interleaved 
assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS assert slow_index is not None # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS - 1) * 5 + assert slow_index == (N_PREDS - 1) * (len(evaluators) + 1) assert fake_request.created_session["name"] @@ -493,3 +542,27 @@ async def bad_eval_list(run, example): ) async for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} + + # test invalid evaluators + # args need to be positional + async def eval1(*, inputs, outputs): + pass + + # if more than 2 positional args, they must all have default arg names + # (run, example, ...) + async def eval2(x, y, inputs): + pass + + evaluators = [eval1, eval2] + + async def atarget(x): + return x + + for eval_ in evaluators: + with pytest.raises(ValueError, match="Invalid evaluator function."): + _normalize_evaluator_func(eval_) + + with pytest.raises(ValueError, match="Invalid evaluator function."): + await aevaluate( + atarget, data=ds_examples, evaluators=[eval_], client=client + ) From cecf5b54726c5bb335dcff362572aeac5a50498d Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Mon, 18 Nov 2024 09:56:32 -0800 Subject: [PATCH 10/31] NotMyPy (#1221) --- python/langsmith/_internal/_patch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/langsmith/_internal/_patch.py b/python/langsmith/_internal/_patch.py index ad3cfe25a..e7bf88bdf 100644 --- a/python/langsmith/_internal/_patch.py +++ b/python/langsmith/_internal/_patch.py @@ -1,8 +1,7 @@ -# mypy: disable-error-code="import-untyped" import functools -from urllib3 import __version__ as urllib3version # noqa -from urllib3 import connection # noqa +from urllib3 import __version__ as urllib3version # type: ignore[import-untyped] +from urllib3 import connection # type: ignore[import-untyped] def _ensure_str(s, encoding="utf-8", errors="strict") -> str: From 3fce7b72c8d38b138666a7363baa5c78d9e9dd19 Mon Sep 17 00:00:00 2001 From: Erick Friis Date: Mon, 18 Nov 2024 18:19:38 -0800 Subject: [PATCH 11/31] orjson optional wip (#1223) --- .../langsmith/_internal/_background_thread.py | 24 ++++-- python/langsmith/_internal/_operations.py | 9 +- python/langsmith/_internal/_orjson.py | 84 +++++++++++++++++++ python/langsmith/_internal/_serde.py | 18 ++-- python/langsmith/_testing.py | 4 +- python/langsmith/client.py | 31 ++++--- python/poetry.lock | 4 +- python/pyproject.toml | 2 +- python/tests/unit_tests/test_client.py | 8 +- python/tests/unit_tests/test_operations.py | 9 +- 10 files changed, 147 insertions(+), 46 deletions(-) create mode 100644 python/langsmith/_internal/_orjson.py diff --git a/python/langsmith/_internal/_background_thread.py b/python/langsmith/_internal/_background_thread.py index b6aee1f4e..844851996 100644 --- a/python/langsmith/_internal/_background_thread.py +++ b/python/langsmith/_internal/_background_thread.py @@ -155,13 +155,25 @@ def tracing_control_thread_func(client_ref: weakref.ref[Client]) -> None: # 1 for this func, 1 for getrefcount, 1 for _get_data_type_cached num_known_refs = 3 + def keep_thread_active() -> bool: + # if `client.cleanup()` was called, stop thread + if client and client._manual_cleanup: + return False + if not threading.main_thread().is_alive(): + # main thread is dead. 
should not be active + return False + + if hasattr(sys, "getrefcount"): + # check if client refs count indicates we're the only remaining + # reference to the client + return sys.getrefcount(client) > num_known_refs + len(sub_threads) + else: + # in PyPy, there is no sys.getrefcount attribute + # for now, keep thread alive + return True + # loop until - while ( - # the main thread dies - threading.main_thread().is_alive() - # or we're the only remaining reference to the client - and sys.getrefcount(client) > num_known_refs + len(sub_threads) - ): + while keep_thread_active(): for thread in sub_threads: if not thread.is_alive(): sub_threads.remove(thread) diff --git a/python/langsmith/_internal/_operations.py b/python/langsmith/_internal/_operations.py index e1e99d6e2..66decff0f 100644 --- a/python/langsmith/_internal/_operations.py +++ b/python/langsmith/_internal/_operations.py @@ -5,9 +5,8 @@ import uuid from typing import Literal, Optional, Union, cast -import orjson - from langsmith import schemas as ls_schemas +from langsmith._internal import _orjson from langsmith._internal._multipart import MultipartPart, MultipartPartsAndContext from langsmith._internal._serde import dumps_json as _dumps_json @@ -169,12 +168,12 @@ def combine_serialized_queue_operations( if op._none is not None and op._none != create_op._none: # TODO optimize this more - this would currently be slowest # for large payloads - create_op_dict = orjson.loads(create_op._none) + create_op_dict = _orjson.loads(create_op._none) op_dict = { - k: v for k, v in orjson.loads(op._none).items() if v is not None + k: v for k, v in _orjson.loads(op._none).items() if v is not None } create_op_dict.update(op_dict) - create_op._none = orjson.dumps(create_op_dict) + create_op._none = _orjson.dumps(create_op_dict) if op.inputs is not None: create_op.inputs = op.inputs diff --git a/python/langsmith/_internal/_orjson.py b/python/langsmith/_internal/_orjson.py new file mode 100644 index 000000000..ecd9e20bc --- /dev/null +++ b/python/langsmith/_internal/_orjson.py @@ -0,0 +1,84 @@ +"""Stubs for orjson operations, compatible with PyPy via a json fallback.""" + +try: + from orjson import ( + OPT_NON_STR_KEYS, + OPT_SERIALIZE_DATACLASS, + OPT_SERIALIZE_NUMPY, + OPT_SERIALIZE_UUID, + Fragment, + JSONDecodeError, + dumps, + loads, + ) + +except ImportError: + import dataclasses + import json + import uuid + from typing import Any, Callable, Optional + + OPT_NON_STR_KEYS = 1 + OPT_SERIALIZE_DATACLASS = 2 + OPT_SERIALIZE_NUMPY = 4 + OPT_SERIALIZE_UUID = 8 + + class Fragment: # type: ignore + def __init__(self, payloadb: bytes): + self.payloadb = payloadb + + from json import JSONDecodeError # type: ignore + + def dumps( # type: ignore + obj: Any, + /, + default: Optional[Callable[[Any], Any]] = None, + option: int = 0, + ) -> bytes: # type: ignore + # for now, don't do anything for this case because `json.dumps` + # automatically encodes non-str keys as str by default, unlike orjson + # enable_non_str_keys = bool(option & OPT_NON_STR_KEYS) + + enable_serialize_numpy = bool(option & OPT_SERIALIZE_NUMPY) + enable_serialize_dataclass = bool(option & OPT_SERIALIZE_DATACLASS) + enable_serialize_uuid = bool(option & OPT_SERIALIZE_UUID) + + class CustomEncoder(json.JSONEncoder): # type: ignore + def encode(self, o: Any) -> str: + if isinstance(o, Fragment): + return o.payloadb.decode("utf-8") # type: ignore + return super().encode(o) + + def default(self, o: Any) -> Any: + if enable_serialize_uuid and isinstance(o, uuid.UUID): + return str(o) + if 
enable_serialize_numpy and hasattr(o, "tolist"): + # even objects like np.uint16(15) have a .tolist() function + return o.tolist() + if ( + enable_serialize_dataclass + and dataclasses.is_dataclass(o) + and not isinstance(o, type) + ): + return dataclasses.asdict(o) + if default is not None: + return default(o) + + return super().default(o) + + return json.dumps(obj, cls=CustomEncoder).encode("utf-8") + + def loads(payload: bytes, /) -> Any: # type: ignore + return json.loads(payload) + + +__all__ = [ + "loads", + "dumps", + "Fragment", + "JSONDecodeError", + "OPT_SERIALIZE_NUMPY", + "OPT_SERIALIZE_DATACLASS", + "OPT_SERIALIZE_UUID", + "OPT_NON_STR_KEYS", +] diff --git a/python/langsmith/_internal/_serde.py b/python/langsmith/_internal/_serde.py index e77f7319d..1bf8865c1 100644 --- a/python/langsmith/_internal/_serde.py +++ b/python/langsmith/_internal/_serde.py @@ -12,7 +12,7 @@ import uuid from typing import Any -import orjson +from langsmith._internal import _orjson try: from zoneinfo import ZoneInfo # type: ignore[import-not-found] @@ -133,13 +133,13 @@ def dumps_json(obj: Any) -> bytes: The JSON formatted string. """ try: - return orjson.dumps( + return _orjson.dumps( obj, default=_serialize_json, - option=orjson.OPT_SERIALIZE_NUMPY - | orjson.OPT_SERIALIZE_DATACLASS - | orjson.OPT_SERIALIZE_UUID - | orjson.OPT_NON_STR_KEYS, + option=_orjson.OPT_SERIALIZE_NUMPY + | _orjson.OPT_SERIALIZE_DATACLASS + | _orjson.OPT_SERIALIZE_UUID + | _orjson.OPT_NON_STR_KEYS, ) except TypeError as e: # Usually caused by UTF surrogate characters @@ -150,9 +150,9 @@ def dumps_json(obj: Any) -> bytes: ensure_ascii=True, ).encode("utf-8") try: - result = orjson.dumps( - orjson.loads(result.decode("utf-8", errors="surrogateescape")) + result = _orjson.dumps( + _orjson.loads(result.decode("utf-8", errors="surrogateescape")) ) - except orjson.JSONDecodeError: + except _orjson.JSONDecodeError: result = _elide_surrogates(result) return result diff --git a/python/langsmith/_testing.py b/python/langsmith/_testing.py index 8dd72fbcb..9eaa0877f 100644 --- a/python/langsmith/_testing.py +++ b/python/langsmith/_testing.py @@ -12,7 +12,6 @@ from pathlib import Path from typing import Any, Callable, Optional, Sequence, Tuple, TypeVar, overload -import orjson from typing_extensions import TypedDict from langsmith import client as ls_client @@ -21,6 +20,7 @@ from langsmith import run_trees as rt from langsmith import schemas as ls_schemas from langsmith import utils as ls_utils +from langsmith._internal import _orjson try: import pytest # type: ignore @@ -374,7 +374,7 @@ def _serde_example_values(values: VT) -> VT: if values is None: return values bts = ls_client._dumps_json(values) - return orjson.loads(bts) + return _orjson.loads(bts) class _LangSmithTestSuite: diff --git a/python/langsmith/client.py b/python/langsmith/client.py index eb397b4c4..8348b57d1 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -55,7 +55,6 @@ ) from urllib import parse as urllib_parse -import orjson import requests from requests import adapters as requests_adapters from requests_toolbelt import ( # type: ignore[import-untyped] @@ -69,6 +68,7 @@ from langsmith import env as ls_env from langsmith import schemas as ls_schemas from langsmith import utils as ls_utils +from langsmith._internal import _orjson from langsmith._internal._background_thread import ( TracingQueueItem, ) @@ -368,6 +368,7 @@ class Client: "_info", "_write_api_urls", "_settings", + "_manual_cleanup", ] def __init__( @@ -516,6 +517,8 @@ def __init__( 
self._settings: Union[ls_schemas.LangSmithSettings, None] = None + self._manual_cleanup = False + def _repr_html_(self) -> str: """Return an HTML representation of the instance with a link to the URL. @@ -1252,7 +1255,7 @@ def _hide_run_inputs(self, inputs: dict): if self._hide_inputs is True: return {} if self._anonymizer: - json_inputs = orjson.loads(_dumps_json(inputs)) + json_inputs = _orjson.loads(_dumps_json(inputs)) return self._anonymizer(json_inputs) if self._hide_inputs is False: return inputs @@ -1262,7 +1265,7 @@ def _hide_run_outputs(self, outputs: dict): if self._hide_outputs is True: return {} if self._anonymizer: - json_outputs = orjson.loads(_dumps_json(outputs)) + json_outputs = _orjson.loads(_dumps_json(outputs)) return self._anonymizer(json_outputs) if self._hide_outputs is False: return outputs @@ -1282,20 +1285,20 @@ def _batch_ingest_run_ops( # form the partial body and ids for op in ops: if isinstance(op, SerializedRunOperation): - curr_dict = orjson.loads(op._none) + curr_dict = _orjson.loads(op._none) if op.inputs: - curr_dict["inputs"] = orjson.Fragment(op.inputs) + curr_dict["inputs"] = _orjson.Fragment(op.inputs) if op.outputs: - curr_dict["outputs"] = orjson.Fragment(op.outputs) + curr_dict["outputs"] = _orjson.Fragment(op.outputs) if op.events: - curr_dict["events"] = orjson.Fragment(op.events) + curr_dict["events"] = _orjson.Fragment(op.events) if op.attachments: logger.warning( "Attachments are not supported when use_multipart_endpoint " "is False" ) ids_and_partial_body[op.operation].append( - (f"trace={op.trace_id},id={op.id}", orjson.dumps(curr_dict)) + (f"trace={op.trace_id},id={op.id}", _orjson.dumps(curr_dict)) ) elif isinstance(op, SerializedFeedbackOperation): logger.warning( @@ -1321,7 +1324,7 @@ def _batch_ingest_run_ops( and body_size + len(body_deque[0][1]) > size_limit_bytes ): self._post_batch_ingest_runs( - orjson.dumps(body_chunks), + _orjson.dumps(body_chunks), _context=f"\n{key}: {'; '.join(context_ids[key])}", ) body_size = 0 @@ -1329,12 +1332,12 @@ def _batch_ingest_run_ops( context_ids.clear() curr_id, curr_body = body_deque.popleft() body_size += len(curr_body) - body_chunks[key].append(orjson.Fragment(curr_body)) + body_chunks[key].append(_orjson.Fragment(curr_body)) context_ids[key].append(curr_id) if body_size: context = "; ".join(f"{k}: {'; '.join(v)}" for k, v in context_ids.items()) self._post_batch_ingest_runs( - orjson.dumps(body_chunks), _context="\n" + context + _orjson.dumps(body_chunks), _context="\n" + context ) def batch_ingest_runs( @@ -2759,7 +2762,7 @@ def create_dataset( "POST", "/datasets", headers={**self._headers, "Content-Type": "application/json"}, - data=orjson.dumps(dataset), + data=_orjson.dumps(dataset), ) ls_utils.raise_for_status_with_text(response) @@ -5675,6 +5678,10 @@ def push_prompt( ) return url + def cleanup(self) -> None: + """Manually trigger cleanup of the background thread.""" + self._manual_cleanup = True + def convert_prompt_to_openai_format( messages: Any, diff --git a/python/poetry.lock b/python/poetry.lock index a2e1c3667..2b362f986 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -2070,4 +2070,4 @@ vcr = [] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "ca8fa5c9a82d58bea646d5e7e1089175111ddec2c24cd0b19920d1afd4dd93da" +content-hash = "a5a6c61cba1b5ce9cf739700a780c2df63ff7aaa482c29de9910418263318586" diff --git a/python/pyproject.toml b/python/pyproject.toml index 81645c912..0278d6ddc 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -31,7 +31,7 @@ pydantic = [ { version = "^2.7.4", python = ">=3.12.4" }, ] requests = "^2" -orjson = "^3.9.14" +orjson = { version = "^3.9.14", markers = "platform_python_implementation != 'PyPy'" } httpx = ">=0.23.0,<1" requests-toolbelt = "^1.0.0" diff --git a/python/tests/unit_tests/test_client.py b/python/tests/unit_tests/test_client.py index 5dc1bbe1e..feec2c2f6 100644 --- a/python/tests/unit_tests/test_client.py +++ b/python/tests/unit_tests/test_client.py @@ -22,7 +22,6 @@ from unittest.mock import MagicMock, patch import dataclasses_json -import orjson import pytest import requests from multipart import MultipartParser, MultipartPart, parse_options_header @@ -33,6 +32,7 @@ import langsmith.utils as ls_utils from langsmith import AsyncClient, EvaluationResult, run_trees from langsmith import schemas as ls_schemas +from langsmith._internal import _orjson from langsmith._internal._serde import _serialize_json from langsmith.client import ( Client, @@ -848,7 +848,7 @@ class MyNamedTuple(NamedTuple): "set_with_class": set([MyClass(1)]), "my_mock": MagicMock(text="Hello, world"), } - res = orjson.loads(_dumps_json(to_serialize)) + res = _orjson.loads(_dumps_json(to_serialize)) assert ( "model_dump" not in caplog.text ), f"Unexpected error logs were emitted: {caplog.text}" @@ -898,7 +898,7 @@ def __repr__(self) -> str: my_cyclic = CyclicClass(other=CyclicClass(other=None)) my_cyclic.other.other = my_cyclic # type: ignore - res = orjson.loads(_dumps_json({"cyclic": my_cyclic})) + res = _orjson.loads(_dumps_json({"cyclic": my_cyclic})) assert res == {"cyclic": "my_cycles..."} expected = {"foo": "foo", "bar": 1} @@ -1142,7 +1142,7 @@ def test_batch_ingest_run_splits_large_batches( op for call in mock_session.request.call_args_list for reqs in ( - orjson.loads(call[1]["data"]).values() if call[0][0] == "POST" else [] + _orjson.loads(call[1]["data"]).values() if call[0][0] == "POST" else [] ) for op in reqs ] diff --git a/python/tests/unit_tests/test_operations.py b/python/tests/unit_tests/test_operations.py index a6b5cdeb3..43d06ebc5 100644 --- a/python/tests/unit_tests/test_operations.py +++ b/python/tests/unit_tests/test_operations.py @@ -1,5 +1,4 @@ -import orjson - +from langsmith._internal import _orjson from langsmith._internal._operations import ( SerializedFeedbackOperation, SerializedRunOperation, @@ -14,7 +13,7 @@ def test_combine_serialized_queue_operations(): operation="post", id="id1", trace_id="trace_id1", - _none=orjson.dumps({"a": 1}), + _none=_orjson.dumps({"a": 1}), inputs="inputs1", outputs="outputs1", events="events1", @@ -24,7 +23,7 @@ def test_combine_serialized_queue_operations(): operation="patch", id="id1", trace_id="trace_id1", - _none=orjson.dumps({"b": "2"}), + _none=_orjson.dumps({"b": "2"}), inputs="inputs1-patched", outputs="outputs1-patched", events="events1", @@ -87,7 +86,7 @@ def test_combine_serialized_queue_operations(): operation="post", id="id1", trace_id="trace_id1", - _none=orjson.dumps({"a": 1, "b": "2"}), + _none=_orjson.dumps({"a": 1, "b": "2"}), inputs="inputs1-patched", outputs="outputs1-patched", 
events="events1", From db2f12846992a4139e9f95d7a86f7f0b8f46d24a Mon Sep 17 00:00:00 2001 From: Erick Friis Date: Mon, 18 Nov 2024 18:24:59 -0800 Subject: [PATCH 12/31] release 0.1.144rc1 (#1226) --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 0278d6ddc..191d61b22 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langsmith" -version = "0.1.143" +version = "0.1.144rc1" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." authors = ["LangChain "] license = "MIT" From a638687017f7c0adb970b9a0086d651294277a0c Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Tue, 19 Nov 2024 06:49:38 -0800 Subject: [PATCH 13/31] Revert "orjson optional wip (#1223)" (#1228) This reverts commit 3fce7b72c8d38b138666a7363baa5c78d9e9dd19. --- .../langsmith/_internal/_background_thread.py | 24 ++---- python/langsmith/_internal/_operations.py | 9 +- python/langsmith/_internal/_orjson.py | 84 ------------------- python/langsmith/_internal/_serde.py | 18 ++-- python/langsmith/_testing.py | 4 +- python/langsmith/client.py | 31 +++---- python/poetry.lock | 4 +- python/pyproject.toml | 2 +- python/tests/unit_tests/test_client.py | 8 +- python/tests/unit_tests/test_operations.py | 9 +- 10 files changed, 46 insertions(+), 147 deletions(-) delete mode 100644 python/langsmith/_internal/_orjson.py diff --git a/python/langsmith/_internal/_background_thread.py b/python/langsmith/_internal/_background_thread.py index 844851996..b6aee1f4e 100644 --- a/python/langsmith/_internal/_background_thread.py +++ b/python/langsmith/_internal/_background_thread.py @@ -155,25 +155,13 @@ def tracing_control_thread_func(client_ref: weakref.ref[Client]) -> None: # 1 for this func, 1 for getrefcount, 1 for _get_data_type_cached num_known_refs = 3 - def keep_thread_active() -> bool: - # if `client.cleanup()` was called, stop thread - if client and client._manual_cleanup: - return False - if not threading.main_thread().is_alive(): - # main thread is dead. 
should not be active - return False - - if hasattr(sys, "getrefcount"): - # check if client refs count indicates we're the only remaining - # reference to the client - return sys.getrefcount(client) > num_known_refs + len(sub_threads) - else: - # in PyPy, there is no sys.getrefcount attribute - # for now, keep thread alive - return True - # loop until - while keep_thread_active(): + while ( + # the main thread dies + threading.main_thread().is_alive() + # or we're the only remaining reference to the client + and sys.getrefcount(client) > num_known_refs + len(sub_threads) + ): for thread in sub_threads: if not thread.is_alive(): sub_threads.remove(thread) diff --git a/python/langsmith/_internal/_operations.py b/python/langsmith/_internal/_operations.py index 66decff0f..e1e99d6e2 100644 --- a/python/langsmith/_internal/_operations.py +++ b/python/langsmith/_internal/_operations.py @@ -5,8 +5,9 @@ import uuid from typing import Literal, Optional, Union, cast +import orjson + from langsmith import schemas as ls_schemas -from langsmith._internal import _orjson from langsmith._internal._multipart import MultipartPart, MultipartPartsAndContext from langsmith._internal._serde import dumps_json as _dumps_json @@ -168,12 +169,12 @@ def combine_serialized_queue_operations( if op._none is not None and op._none != create_op._none: # TODO optimize this more - this would currently be slowest # for large payloads - create_op_dict = _orjson.loads(create_op._none) + create_op_dict = orjson.loads(create_op._none) op_dict = { - k: v for k, v in _orjson.loads(op._none).items() if v is not None + k: v for k, v in orjson.loads(op._none).items() if v is not None } create_op_dict.update(op_dict) - create_op._none = _orjson.dumps(create_op_dict) + create_op._none = orjson.dumps(create_op_dict) if op.inputs is not None: create_op.inputs = op.inputs diff --git a/python/langsmith/_internal/_orjson.py b/python/langsmith/_internal/_orjson.py deleted file mode 100644 index ecd9e20bc..000000000 --- a/python/langsmith/_internal/_orjson.py +++ /dev/null @@ -1,84 +0,0 @@ -"""Stubs for orjson operations, compatible with PyPy via a json fallback.""" - -try: - from orjson import ( - OPT_NON_STR_KEYS, - OPT_SERIALIZE_DATACLASS, - OPT_SERIALIZE_NUMPY, - OPT_SERIALIZE_UUID, - Fragment, - JSONDecodeError, - dumps, - loads, - ) - -except ImportError: - import dataclasses - import json - import uuid - from typing import Any, Callable, Optional - - OPT_NON_STR_KEYS = 1 - OPT_SERIALIZE_DATACLASS = 2 - OPT_SERIALIZE_NUMPY = 4 - OPT_SERIALIZE_UUID = 8 - - class Fragment: # type: ignore - def __init__(self, payloadb: bytes): - self.payloadb = payloadb - - from json import JSONDecodeError # type: ignore - - def dumps( # type: ignore - obj: Any, - /, - default: Optional[Callable[[Any], Any]] = None, - option: int = 0, - ) -> bytes: # type: ignore - # for now, don't do anything for this case because `json.dumps` - # automatically encodes non-str keys as str by default, unlike orjson - # enable_non_str_keys = bool(option & OPT_NON_STR_KEYS) - - enable_serialize_numpy = bool(option & OPT_SERIALIZE_NUMPY) - enable_serialize_dataclass = bool(option & OPT_SERIALIZE_DATACLASS) - enable_serialize_uuid = bool(option & OPT_SERIALIZE_UUID) - - class CustomEncoder(json.JSONEncoder): # type: ignore - def encode(self, o: Any) -> str: - if isinstance(o, Fragment): - return o.payloadb.decode("utf-8") # type: ignore - return super().encode(o) - - def default(self, o: Any) -> Any: - if enable_serialize_uuid and isinstance(o, uuid.UUID): - return str(o) - if 
enable_serialize_numpy and hasattr(o, "tolist"): - # even objects like np.uint16(15) have a .tolist() function - return o.tolist() - if ( - enable_serialize_dataclass - and dataclasses.is_dataclass(o) - and not isinstance(o, type) - ): - return dataclasses.asdict(o) - if default is not None: - return default(o) - - return super().default(o) - - return json.dumps(obj, cls=CustomEncoder).encode("utf-8") - - def loads(payload: bytes, /) -> Any: # type: ignore - return json.loads(payload) - - -__all__ = [ - "loads", - "dumps", - "Fragment", - "JSONDecodeError", - "OPT_SERIALIZE_NUMPY", - "OPT_SERIALIZE_DATACLASS", - "OPT_SERIALIZE_UUID", - "OPT_NON_STR_KEYS", -] diff --git a/python/langsmith/_internal/_serde.py b/python/langsmith/_internal/_serde.py index 1bf8865c1..e77f7319d 100644 --- a/python/langsmith/_internal/_serde.py +++ b/python/langsmith/_internal/_serde.py @@ -12,7 +12,7 @@ import uuid from typing import Any -from langsmith._internal import _orjson +import orjson try: from zoneinfo import ZoneInfo # type: ignore[import-not-found] @@ -133,13 +133,13 @@ def dumps_json(obj: Any) -> bytes: The JSON formatted string. """ try: - return _orjson.dumps( + return orjson.dumps( obj, default=_serialize_json, - option=_orjson.OPT_SERIALIZE_NUMPY - | _orjson.OPT_SERIALIZE_DATACLASS - | _orjson.OPT_SERIALIZE_UUID - | _orjson.OPT_NON_STR_KEYS, + option=orjson.OPT_SERIALIZE_NUMPY + | orjson.OPT_SERIALIZE_DATACLASS + | orjson.OPT_SERIALIZE_UUID + | orjson.OPT_NON_STR_KEYS, ) except TypeError as e: # Usually caused by UTF surrogate characters @@ -150,9 +150,9 @@ def dumps_json(obj: Any) -> bytes: ensure_ascii=True, ).encode("utf-8") try: - result = _orjson.dumps( - _orjson.loads(result.decode("utf-8", errors="surrogateescape")) + result = orjson.dumps( + orjson.loads(result.decode("utf-8", errors="surrogateescape")) ) - except _orjson.JSONDecodeError: + except orjson.JSONDecodeError: result = _elide_surrogates(result) return result diff --git a/python/langsmith/_testing.py b/python/langsmith/_testing.py index 9eaa0877f..8dd72fbcb 100644 --- a/python/langsmith/_testing.py +++ b/python/langsmith/_testing.py @@ -12,6 +12,7 @@ from pathlib import Path from typing import Any, Callable, Optional, Sequence, Tuple, TypeVar, overload +import orjson from typing_extensions import TypedDict from langsmith import client as ls_client @@ -20,7 +21,6 @@ from langsmith import run_trees as rt from langsmith import schemas as ls_schemas from langsmith import utils as ls_utils -from langsmith._internal import _orjson try: import pytest # type: ignore @@ -374,7 +374,7 @@ def _serde_example_values(values: VT) -> VT: if values is None: return values bts = ls_client._dumps_json(values) - return _orjson.loads(bts) + return orjson.loads(bts) class _LangSmithTestSuite: diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 8348b57d1..eb397b4c4 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -55,6 +55,7 @@ ) from urllib import parse as urllib_parse +import orjson import requests from requests import adapters as requests_adapters from requests_toolbelt import ( # type: ignore[import-untyped] @@ -68,7 +69,6 @@ from langsmith import env as ls_env from langsmith import schemas as ls_schemas from langsmith import utils as ls_utils -from langsmith._internal import _orjson from langsmith._internal._background_thread import ( TracingQueueItem, ) @@ -368,7 +368,6 @@ class Client: "_info", "_write_api_urls", "_settings", - "_manual_cleanup", ] def __init__( @@ -517,8 +516,6 @@ def __init__( 
self._settings: Union[ls_schemas.LangSmithSettings, None] = None - self._manual_cleanup = False - def _repr_html_(self) -> str: """Return an HTML representation of the instance with a link to the URL. @@ -1255,7 +1252,7 @@ def _hide_run_inputs(self, inputs: dict): if self._hide_inputs is True: return {} if self._anonymizer: - json_inputs = _orjson.loads(_dumps_json(inputs)) + json_inputs = orjson.loads(_dumps_json(inputs)) return self._anonymizer(json_inputs) if self._hide_inputs is False: return inputs @@ -1265,7 +1262,7 @@ def _hide_run_outputs(self, outputs: dict): if self._hide_outputs is True: return {} if self._anonymizer: - json_outputs = _orjson.loads(_dumps_json(outputs)) + json_outputs = orjson.loads(_dumps_json(outputs)) return self._anonymizer(json_outputs) if self._hide_outputs is False: return outputs @@ -1285,20 +1282,20 @@ def _batch_ingest_run_ops( # form the partial body and ids for op in ops: if isinstance(op, SerializedRunOperation): - curr_dict = _orjson.loads(op._none) + curr_dict = orjson.loads(op._none) if op.inputs: - curr_dict["inputs"] = _orjson.Fragment(op.inputs) + curr_dict["inputs"] = orjson.Fragment(op.inputs) if op.outputs: - curr_dict["outputs"] = _orjson.Fragment(op.outputs) + curr_dict["outputs"] = orjson.Fragment(op.outputs) if op.events: - curr_dict["events"] = _orjson.Fragment(op.events) + curr_dict["events"] = orjson.Fragment(op.events) if op.attachments: logger.warning( "Attachments are not supported when use_multipart_endpoint " "is False" ) ids_and_partial_body[op.operation].append( - (f"trace={op.trace_id},id={op.id}", _orjson.dumps(curr_dict)) + (f"trace={op.trace_id},id={op.id}", orjson.dumps(curr_dict)) ) elif isinstance(op, SerializedFeedbackOperation): logger.warning( @@ -1324,7 +1321,7 @@ def _batch_ingest_run_ops( and body_size + len(body_deque[0][1]) > size_limit_bytes ): self._post_batch_ingest_runs( - _orjson.dumps(body_chunks), + orjson.dumps(body_chunks), _context=f"\n{key}: {'; '.join(context_ids[key])}", ) body_size = 0 @@ -1332,12 +1329,12 @@ def _batch_ingest_run_ops( context_ids.clear() curr_id, curr_body = body_deque.popleft() body_size += len(curr_body) - body_chunks[key].append(_orjson.Fragment(curr_body)) + body_chunks[key].append(orjson.Fragment(curr_body)) context_ids[key].append(curr_id) if body_size: context = "; ".join(f"{k}: {'; '.join(v)}" for k, v in context_ids.items()) self._post_batch_ingest_runs( - _orjson.dumps(body_chunks), _context="\n" + context + orjson.dumps(body_chunks), _context="\n" + context ) def batch_ingest_runs( @@ -2762,7 +2759,7 @@ def create_dataset( "POST", "/datasets", headers={**self._headers, "Content-Type": "application/json"}, - data=_orjson.dumps(dataset), + data=orjson.dumps(dataset), ) ls_utils.raise_for_status_with_text(response) @@ -5678,10 +5675,6 @@ def push_prompt( ) return url - def cleanup(self) -> None: - """Manually trigger cleanup of the background thread.""" - self._manual_cleanup = True - def convert_prompt_to_openai_format( messages: Any, diff --git a/python/poetry.lock b/python/poetry.lock index 2b362f986..a2e1c3667 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -2070,4 +2070,4 @@ vcr = [] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "a5a6c61cba1b5ce9cf739700a780c2df63ff7aaa482c29de9910418263318586" +content-hash = "ca8fa5c9a82d58bea646d5e7e1089175111ddec2c24cd0b19920d1afd4dd93da" diff --git a/python/pyproject.toml b/python/pyproject.toml index 191d61b22..fc1d71da3 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -31,7 +31,7 @@ pydantic = [ { version = "^2.7.4", python = ">=3.12.4" }, ] requests = "^2" -orjson = { version = "^3.9.14", markers = "platform_python_implementation != 'PyPy'" } +orjson = "^3.9.14" httpx = ">=0.23.0,<1" requests-toolbelt = "^1.0.0" diff --git a/python/tests/unit_tests/test_client.py b/python/tests/unit_tests/test_client.py index feec2c2f6..5dc1bbe1e 100644 --- a/python/tests/unit_tests/test_client.py +++ b/python/tests/unit_tests/test_client.py @@ -22,6 +22,7 @@ from unittest.mock import MagicMock, patch import dataclasses_json +import orjson import pytest import requests from multipart import MultipartParser, MultipartPart, parse_options_header @@ -32,7 +33,6 @@ import langsmith.utils as ls_utils from langsmith import AsyncClient, EvaluationResult, run_trees from langsmith import schemas as ls_schemas -from langsmith._internal import _orjson from langsmith._internal._serde import _serialize_json from langsmith.client import ( Client, @@ -848,7 +848,7 @@ class MyNamedTuple(NamedTuple): "set_with_class": set([MyClass(1)]), "my_mock": MagicMock(text="Hello, world"), } - res = _orjson.loads(_dumps_json(to_serialize)) + res = orjson.loads(_dumps_json(to_serialize)) assert ( "model_dump" not in caplog.text ), f"Unexpected error logs were emitted: {caplog.text}" @@ -898,7 +898,7 @@ def __repr__(self) -> str: my_cyclic = CyclicClass(other=CyclicClass(other=None)) my_cyclic.other.other = my_cyclic # type: ignore - res = _orjson.loads(_dumps_json({"cyclic": my_cyclic})) + res = orjson.loads(_dumps_json({"cyclic": my_cyclic})) assert res == {"cyclic": "my_cycles..."} expected = {"foo": "foo", "bar": 1} @@ -1142,7 +1142,7 @@ def test_batch_ingest_run_splits_large_batches( op for call in mock_session.request.call_args_list for reqs in ( - _orjson.loads(call[1]["data"]).values() if call[0][0] == "POST" else [] + orjson.loads(call[1]["data"]).values() if call[0][0] == "POST" else [] ) for op in reqs ] diff --git a/python/tests/unit_tests/test_operations.py b/python/tests/unit_tests/test_operations.py index 43d06ebc5..a6b5cdeb3 100644 --- a/python/tests/unit_tests/test_operations.py +++ b/python/tests/unit_tests/test_operations.py @@ -1,4 +1,5 @@ -from langsmith._internal import _orjson +import orjson + from langsmith._internal._operations import ( SerializedFeedbackOperation, SerializedRunOperation, @@ -13,7 +14,7 @@ def test_combine_serialized_queue_operations(): operation="post", id="id1", trace_id="trace_id1", - _none=_orjson.dumps({"a": 1}), + _none=orjson.dumps({"a": 1}), inputs="inputs1", outputs="outputs1", events="events1", @@ -23,7 +24,7 @@ def test_combine_serialized_queue_operations(): operation="patch", id="id1", trace_id="trace_id1", - _none=_orjson.dumps({"b": "2"}), + _none=orjson.dumps({"b": "2"}), inputs="inputs1-patched", outputs="outputs1-patched", events="events1", @@ -86,7 +87,7 @@ def test_combine_serialized_queue_operations(): operation="post", id="id1", trace_id="trace_id1", - _none=_orjson.dumps({"a": 1, "b": "2"}), + _none=orjson.dumps({"a": 1, "b": "2"}), inputs="inputs1-patched", outputs="outputs1-patched", 
events="events1", From 79f3008a00c0aa015efb69e1271d3c101c3f7e12 Mon Sep 17 00:00:00 2001 From: Erick Friis Date: Tue, 19 Nov 2024 14:30:50 -0800 Subject: [PATCH 14/31] python[patch]: orjson optional take 2 (#1230) --- .../langsmith/_internal/_background_thread.py | 26 ++++-- python/langsmith/_internal/_operations.py | 9 +- python/langsmith/_internal/_orjson.py | 84 +++++++++++++++++++ python/langsmith/_internal/_serde.py | 18 ++-- python/langsmith/_testing.py | 4 +- python/langsmith/client.py | 31 ++++--- python/poetry.lock | 4 +- python/pyproject.toml | 2 +- python/tests/unit_tests/test_client.py | 8 +- python/tests/unit_tests/test_operations.py | 9 +- 10 files changed, 149 insertions(+), 46 deletions(-) create mode 100644 python/langsmith/_internal/_orjson.py diff --git a/python/langsmith/_internal/_background_thread.py b/python/langsmith/_internal/_background_thread.py index b6aee1f4e..c0f3d46ab 100644 --- a/python/langsmith/_internal/_background_thread.py +++ b/python/langsmith/_internal/_background_thread.py @@ -155,13 +155,27 @@ def tracing_control_thread_func(client_ref: weakref.ref[Client]) -> None: # 1 for this func, 1 for getrefcount, 1 for _get_data_type_cached num_known_refs = 3 + def keep_thread_active() -> bool: + # if `client.cleanup()` was called, stop thread + if not client or ( + hasattr(client, "_manual_cleanup") and client._manual_cleanup + ): + return False + if not threading.main_thread().is_alive(): + # main thread is dead. should not be active + return False + + if hasattr(sys, "getrefcount"): + # check if client refs count indicates we're the only remaining + # reference to the client + return sys.getrefcount(client) > num_known_refs + len(sub_threads) + else: + # in PyPy, there is no sys.getrefcount attribute + # for now, keep thread alive + return True + # loop until - while ( - # the main thread dies - threading.main_thread().is_alive() - # or we're the only remaining reference to the client - and sys.getrefcount(client) > num_known_refs + len(sub_threads) - ): + while keep_thread_active(): for thread in sub_threads: if not thread.is_alive(): sub_threads.remove(thread) diff --git a/python/langsmith/_internal/_operations.py b/python/langsmith/_internal/_operations.py index e1e99d6e2..66decff0f 100644 --- a/python/langsmith/_internal/_operations.py +++ b/python/langsmith/_internal/_operations.py @@ -5,9 +5,8 @@ import uuid from typing import Literal, Optional, Union, cast -import orjson - from langsmith import schemas as ls_schemas +from langsmith._internal import _orjson from langsmith._internal._multipart import MultipartPart, MultipartPartsAndContext from langsmith._internal._serde import dumps_json as _dumps_json @@ -169,12 +168,12 @@ def combine_serialized_queue_operations( if op._none is not None and op._none != create_op._none: # TODO optimize this more - this would currently be slowest # for large payloads - create_op_dict = orjson.loads(create_op._none) + create_op_dict = _orjson.loads(create_op._none) op_dict = { - k: v for k, v in orjson.loads(op._none).items() if v is not None + k: v for k, v in _orjson.loads(op._none).items() if v is not None } create_op_dict.update(op_dict) - create_op._none = orjson.dumps(create_op_dict) + create_op._none = _orjson.dumps(create_op_dict) if op.inputs is not None: create_op.inputs = op.inputs diff --git a/python/langsmith/_internal/_orjson.py b/python/langsmith/_internal/_orjson.py new file mode 100644 index 000000000..ecd9e20bc --- /dev/null +++ b/python/langsmith/_internal/_orjson.py @@ -0,0 +1,84 @@ +"""Stubs for 
orjson operations, compatible with PyPy via a json fallback.""" + +try: + from orjson import ( + OPT_NON_STR_KEYS, + OPT_SERIALIZE_DATACLASS, + OPT_SERIALIZE_NUMPY, + OPT_SERIALIZE_UUID, + Fragment, + JSONDecodeError, + dumps, + loads, + ) + +except ImportError: + import dataclasses + import json + import uuid + from typing import Any, Callable, Optional + + OPT_NON_STR_KEYS = 1 + OPT_SERIALIZE_DATACLASS = 2 + OPT_SERIALIZE_NUMPY = 4 + OPT_SERIALIZE_UUID = 8 + + class Fragment: # type: ignore + def __init__(self, payloadb: bytes): + self.payloadb = payloadb + + from json import JSONDecodeError # type: ignore + + def dumps( # type: ignore + obj: Any, + /, + default: Optional[Callable[[Any], Any]] = None, + option: int = 0, + ) -> bytes: # type: ignore + # for now, don't do anything for this case because `json.dumps` + # automatically encodes non-str keys as str by default, unlike orjson + # enable_non_str_keys = bool(option & OPT_NON_STR_KEYS) + + enable_serialize_numpy = bool(option & OPT_SERIALIZE_NUMPY) + enable_serialize_dataclass = bool(option & OPT_SERIALIZE_DATACLASS) + enable_serialize_uuid = bool(option & OPT_SERIALIZE_UUID) + + class CustomEncoder(json.JSONEncoder): # type: ignore + def encode(self, o: Any) -> str: + if isinstance(o, Fragment): + return o.payloadb.decode("utf-8") # type: ignore + return super().encode(o) + + def default(self, o: Any) -> Any: + if enable_serialize_uuid and isinstance(o, uuid.UUID): + return str(o) + if enable_serialize_numpy and hasattr(o, "tolist"): + # even objects like np.uint16(15) have a .tolist() function + return o.tolist() + if ( + enable_serialize_dataclass + and dataclasses.is_dataclass(o) + and not isinstance(o, type) + ): + return dataclasses.asdict(o) + if default is not None: + return default(o) + + return super().default(o) + + return json.dumps(obj, cls=CustomEncoder).encode("utf-8") + + def loads(payload: bytes, /) -> Any: # type: ignore + return json.loads(payload) + + +__all__ = [ + "loads", + "dumps", + "Fragment", + "JSONDecodeError", + "OPT_SERIALIZE_NUMPY", + "OPT_SERIALIZE_DATACLASS", + "OPT_SERIALIZE_UUID", + "OPT_NON_STR_KEYS", +] diff --git a/python/langsmith/_internal/_serde.py b/python/langsmith/_internal/_serde.py index e77f7319d..1bf8865c1 100644 --- a/python/langsmith/_internal/_serde.py +++ b/python/langsmith/_internal/_serde.py @@ -12,7 +12,7 @@ import uuid from typing import Any -import orjson +from langsmith._internal import _orjson try: from zoneinfo import ZoneInfo # type: ignore[import-not-found] @@ -133,13 +133,13 @@ def dumps_json(obj: Any) -> bytes: The JSON formatted string. 
""" try: - return orjson.dumps( + return _orjson.dumps( obj, default=_serialize_json, - option=orjson.OPT_SERIALIZE_NUMPY - | orjson.OPT_SERIALIZE_DATACLASS - | orjson.OPT_SERIALIZE_UUID - | orjson.OPT_NON_STR_KEYS, + option=_orjson.OPT_SERIALIZE_NUMPY + | _orjson.OPT_SERIALIZE_DATACLASS + | _orjson.OPT_SERIALIZE_UUID + | _orjson.OPT_NON_STR_KEYS, ) except TypeError as e: # Usually caused by UTF surrogate characters @@ -150,9 +150,9 @@ def dumps_json(obj: Any) -> bytes: ensure_ascii=True, ).encode("utf-8") try: - result = orjson.dumps( - orjson.loads(result.decode("utf-8", errors="surrogateescape")) + result = _orjson.dumps( + _orjson.loads(result.decode("utf-8", errors="surrogateescape")) ) - except orjson.JSONDecodeError: + except _orjson.JSONDecodeError: result = _elide_surrogates(result) return result diff --git a/python/langsmith/_testing.py b/python/langsmith/_testing.py index 8dd72fbcb..9eaa0877f 100644 --- a/python/langsmith/_testing.py +++ b/python/langsmith/_testing.py @@ -12,7 +12,6 @@ from pathlib import Path from typing import Any, Callable, Optional, Sequence, Tuple, TypeVar, overload -import orjson from typing_extensions import TypedDict from langsmith import client as ls_client @@ -21,6 +20,7 @@ from langsmith import run_trees as rt from langsmith import schemas as ls_schemas from langsmith import utils as ls_utils +from langsmith._internal import _orjson try: import pytest # type: ignore @@ -374,7 +374,7 @@ def _serde_example_values(values: VT) -> VT: if values is None: return values bts = ls_client._dumps_json(values) - return orjson.loads(bts) + return _orjson.loads(bts) class _LangSmithTestSuite: diff --git a/python/langsmith/client.py b/python/langsmith/client.py index eb397b4c4..8348b57d1 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -55,7 +55,6 @@ ) from urllib import parse as urllib_parse -import orjson import requests from requests import adapters as requests_adapters from requests_toolbelt import ( # type: ignore[import-untyped] @@ -69,6 +68,7 @@ from langsmith import env as ls_env from langsmith import schemas as ls_schemas from langsmith import utils as ls_utils +from langsmith._internal import _orjson from langsmith._internal._background_thread import ( TracingQueueItem, ) @@ -368,6 +368,7 @@ class Client: "_info", "_write_api_urls", "_settings", + "_manual_cleanup", ] def __init__( @@ -516,6 +517,8 @@ def __init__( self._settings: Union[ls_schemas.LangSmithSettings, None] = None + self._manual_cleanup = False + def _repr_html_(self) -> str: """Return an HTML representation of the instance with a link to the URL. 
@@ -1252,7 +1255,7 @@ def _hide_run_inputs(self, inputs: dict): if self._hide_inputs is True: return {} if self._anonymizer: - json_inputs = orjson.loads(_dumps_json(inputs)) + json_inputs = _orjson.loads(_dumps_json(inputs)) return self._anonymizer(json_inputs) if self._hide_inputs is False: return inputs @@ -1262,7 +1265,7 @@ def _hide_run_outputs(self, outputs: dict): if self._hide_outputs is True: return {} if self._anonymizer: - json_outputs = orjson.loads(_dumps_json(outputs)) + json_outputs = _orjson.loads(_dumps_json(outputs)) return self._anonymizer(json_outputs) if self._hide_outputs is False: return outputs @@ -1282,20 +1285,20 @@ def _batch_ingest_run_ops( # form the partial body and ids for op in ops: if isinstance(op, SerializedRunOperation): - curr_dict = orjson.loads(op._none) + curr_dict = _orjson.loads(op._none) if op.inputs: - curr_dict["inputs"] = orjson.Fragment(op.inputs) + curr_dict["inputs"] = _orjson.Fragment(op.inputs) if op.outputs: - curr_dict["outputs"] = orjson.Fragment(op.outputs) + curr_dict["outputs"] = _orjson.Fragment(op.outputs) if op.events: - curr_dict["events"] = orjson.Fragment(op.events) + curr_dict["events"] = _orjson.Fragment(op.events) if op.attachments: logger.warning( "Attachments are not supported when use_multipart_endpoint " "is False" ) ids_and_partial_body[op.operation].append( - (f"trace={op.trace_id},id={op.id}", orjson.dumps(curr_dict)) + (f"trace={op.trace_id},id={op.id}", _orjson.dumps(curr_dict)) ) elif isinstance(op, SerializedFeedbackOperation): logger.warning( @@ -1321,7 +1324,7 @@ def _batch_ingest_run_ops( and body_size + len(body_deque[0][1]) > size_limit_bytes ): self._post_batch_ingest_runs( - orjson.dumps(body_chunks), + _orjson.dumps(body_chunks), _context=f"\n{key}: {'; '.join(context_ids[key])}", ) body_size = 0 @@ -1329,12 +1332,12 @@ def _batch_ingest_run_ops( context_ids.clear() curr_id, curr_body = body_deque.popleft() body_size += len(curr_body) - body_chunks[key].append(orjson.Fragment(curr_body)) + body_chunks[key].append(_orjson.Fragment(curr_body)) context_ids[key].append(curr_id) if body_size: context = "; ".join(f"{k}: {'; '.join(v)}" for k, v in context_ids.items()) self._post_batch_ingest_runs( - orjson.dumps(body_chunks), _context="\n" + context + _orjson.dumps(body_chunks), _context="\n" + context ) def batch_ingest_runs( @@ -2759,7 +2762,7 @@ def create_dataset( "POST", "/datasets", headers={**self._headers, "Content-Type": "application/json"}, - data=orjson.dumps(dataset), + data=_orjson.dumps(dataset), ) ls_utils.raise_for_status_with_text(response) @@ -5675,6 +5678,10 @@ def push_prompt( ) return url + def cleanup(self) -> None: + """Manually trigger cleanup of the background thread.""" + self._manual_cleanup = True + def convert_prompt_to_openai_format( messages: Any, diff --git a/python/poetry.lock b/python/poetry.lock index a2e1c3667..2b362f986 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -2070,4 +2070,4 @@ vcr = [] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "ca8fa5c9a82d58bea646d5e7e1089175111ddec2c24cd0b19920d1afd4dd93da" +content-hash = "a5a6c61cba1b5ce9cf739700a780c2df63ff7aaa482c29de9910418263318586" diff --git a/python/pyproject.toml b/python/pyproject.toml index fc1d71da3..191d61b22 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -31,7 +31,7 @@ pydantic = [ { version = "^2.7.4", python = ">=3.12.4" }, ] requests = "^2" -orjson = "^3.9.14" +orjson = { version = "^3.9.14", markers = "platform_python_implementation != 'PyPy'" } httpx = ">=0.23.0,<1" requests-toolbelt = "^1.0.0" diff --git a/python/tests/unit_tests/test_client.py b/python/tests/unit_tests/test_client.py index 5dc1bbe1e..feec2c2f6 100644 --- a/python/tests/unit_tests/test_client.py +++ b/python/tests/unit_tests/test_client.py @@ -22,7 +22,6 @@ from unittest.mock import MagicMock, patch import dataclasses_json -import orjson import pytest import requests from multipart import MultipartParser, MultipartPart, parse_options_header @@ -33,6 +32,7 @@ import langsmith.utils as ls_utils from langsmith import AsyncClient, EvaluationResult, run_trees from langsmith import schemas as ls_schemas +from langsmith._internal import _orjson from langsmith._internal._serde import _serialize_json from langsmith.client import ( Client, @@ -848,7 +848,7 @@ class MyNamedTuple(NamedTuple): "set_with_class": set([MyClass(1)]), "my_mock": MagicMock(text="Hello, world"), } - res = orjson.loads(_dumps_json(to_serialize)) + res = _orjson.loads(_dumps_json(to_serialize)) assert ( "model_dump" not in caplog.text ), f"Unexpected error logs were emitted: {caplog.text}" @@ -898,7 +898,7 @@ def __repr__(self) -> str: my_cyclic = CyclicClass(other=CyclicClass(other=None)) my_cyclic.other.other = my_cyclic # type: ignore - res = orjson.loads(_dumps_json({"cyclic": my_cyclic})) + res = _orjson.loads(_dumps_json({"cyclic": my_cyclic})) assert res == {"cyclic": "my_cycles..."} expected = {"foo": "foo", "bar": 1} @@ -1142,7 +1142,7 @@ def test_batch_ingest_run_splits_large_batches( op for call in mock_session.request.call_args_list for reqs in ( - orjson.loads(call[1]["data"]).values() if call[0][0] == "POST" else [] + _orjson.loads(call[1]["data"]).values() if call[0][0] == "POST" else [] ) for op in reqs ] diff --git a/python/tests/unit_tests/test_operations.py b/python/tests/unit_tests/test_operations.py index a6b5cdeb3..43d06ebc5 100644 --- a/python/tests/unit_tests/test_operations.py +++ b/python/tests/unit_tests/test_operations.py @@ -1,5 +1,4 @@ -import orjson - +from langsmith._internal import _orjson from langsmith._internal._operations import ( SerializedFeedbackOperation, SerializedRunOperation, @@ -14,7 +13,7 @@ def test_combine_serialized_queue_operations(): operation="post", id="id1", trace_id="trace_id1", - _none=orjson.dumps({"a": 1}), + _none=_orjson.dumps({"a": 1}), inputs="inputs1", outputs="outputs1", events="events1", @@ -24,7 +23,7 @@ def test_combine_serialized_queue_operations(): operation="patch", id="id1", trace_id="trace_id1", - _none=orjson.dumps({"b": "2"}), + _none=_orjson.dumps({"b": "2"}), inputs="inputs1-patched", outputs="outputs1-patched", events="events1", @@ -87,7 +86,7 @@ def test_combine_serialized_queue_operations(): operation="post", id="id1", trace_id="trace_id1", - _none=orjson.dumps({"a": 1, "b": "2"}), + _none=_orjson.dumps({"a": 1, "b": "2"}), inputs="inputs1-patched", outputs="outputs1-patched", 
events="events1", From 4062079d878fe8c16fca8fdf8ef5ac0661f0f831 Mon Sep 17 00:00:00 2001 From: Erick Friis Date: Tue, 19 Nov 2024 14:31:54 -0800 Subject: [PATCH 15/31] skip flaky js integration test (#1232) saw some flakiness in comparative evals stuff too, but not skipping those because seems less common if we do want to remove comparative evals: https://github.com/langchain-ai/langsmith-sdk/pull/1232/commits/c07164a846d9cd131e7bb737c68e40dfe103c27b --- js/src/tests/vercel.int.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/src/tests/vercel.int.test.ts b/js/src/tests/vercel.int.test.ts index b98fbb893..28a2280be 100644 --- a/js/src/tests/vercel.int.test.ts +++ b/js/src/tests/vercel.int.test.ts @@ -102,7 +102,7 @@ test("generateText with image", async () => { expect(storedRun.id).toEqual(runId); }); -test("streamText", async () => { +test.skip("streamText", async () => { const runId = uuid(); const result = await streamText({ model: openai("gpt-4o-mini"), From 707e56dbc49c5bd78067eba05c44ad01cbf8528f Mon Sep 17 00:00:00 2001 From: Erick Friis Date: Tue, 19 Nov 2024 14:36:00 -0800 Subject: [PATCH 16/31] infra: use benchmark-fast in pybench ci (#1160) --- .github/workflows/py-bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/py-bench.yml b/.github/workflows/py-bench.yml index 20ce118d1..66058d0d5 100644 --- a/.github/workflows/py-bench.yml +++ b/.github/workflows/py-bench.yml @@ -43,7 +43,7 @@ jobs: run: | { echo 'OUTPUT<> "$GITHUB_OUTPUT" - name: Compare benchmarks From ec10b88db58ca01f928e95411776efc46204d13d Mon Sep 17 00:00:00 2001 From: Erick Friis Date: Tue, 19 Nov 2024 16:48:46 -0800 Subject: [PATCH 17/31] timeout_ms docstring (#1234) --- python/langsmith/client.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 8348b57d1..5e906507d 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -399,8 +399,9 @@ def __init__( environment variable. retry_config : Retry or None, default=None Retry configuration for the HTTPAdapter. - timeout_ms : int or None, default=None - Timeout in milliseconds for the HTTPAdapter. + timeout_ms : int, tuple[int, int], or None, default=None + Timeout for the HTTPAdapter. Can also be a 2-tuple of + (connect timeout, read timeout) to set them separately. web_url : str or None, default=None URL for the LangSmith web app. Default is auto-inferred from the ENDPOINT. 
From 7670d0f33deab1444e5ab1667c2f34e33a64193d Mon Sep 17 00:00:00 2001 From: Jacob Lee Date: Wed, 20 Nov 2024 08:52:04 -0800 Subject: [PATCH 18/31] fix(js): Fix multipart upload with overridden node fetch (#1235) --- js/package.json | 5 +-- js/src/client.ts | 46 +++++++++++++++++++-------- js/src/index.ts | 2 +- js/src/tests/batch_client.int.test.ts | 32 +++++++++++++++++++ js/src/tests/batch_client.test.ts | 46 ++++++++++++++++++++------- js/yarn.lock | 37 ++++++++++++++------- 6 files changed, 129 insertions(+), 39 deletions(-) diff --git a/js/package.json b/js/package.json index 51c247496..1d6e0e2ee 100644 --- a/js/package.json +++ b/js/package.json @@ -1,6 +1,6 @@ { "name": "langsmith", - "version": "0.2.5", + "version": "0.2.6", "description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.", "packageManager": "yarn@1.22.19", "files": [ @@ -114,7 +114,7 @@ "@faker-js/faker": "^8.4.1", "@jest/globals": "^29.5.0", "@langchain/core": "^0.3.14", - "@langchain/langgraph": "^0.2.18", + "@langchain/langgraph": "^0.2.20", "@langchain/openai": "^0.3.11", "@opentelemetry/sdk-trace-base": "^1.26.0", "@opentelemetry/sdk-trace-node": "^1.26.0", @@ -133,6 +133,7 @@ "eslint-plugin-prettier": "^4.2.1", "jest": "^29.5.0", "langchain": "^0.3.3", + "node-fetch": "^2.7.0", "openai": "^4.67.3", "prettier": "^2.8.8", "ts-jest": "^29.1.0", diff --git a/js/src/client.ts b/js/src/client.ts index 8c094e7fd..6e75751bb 100644 --- a/js/src/client.ts +++ b/js/src/client.ts @@ -1150,33 +1150,51 @@ export class Client { private async _sendMultipartRequest(parts: MultipartPart[], context: string) { try { - const formData = new FormData(); + // Create multipart form data manually using Blobs + const boundary = + "----LangSmithFormBoundary" + Math.random().toString(36).slice(2); + const chunks: Blob[] = []; + for (const part of parts) { - formData.append(part.name, part.payload); + // Add field boundary + chunks.push(new Blob([`--${boundary}\r\n`])); + chunks.push( + new Blob([ + `Content-Disposition: form-data; name="${part.name}"\r\n`, + `Content-Type: ${part.payload.type}\r\n\r\n`, + ]) + ); + chunks.push(part.payload); + chunks.push(new Blob(["\r\n"])); } - // Log the form data - await this.batchIngestCaller.call( + + // Add final boundary + chunks.push(new Blob([`--${boundary}--\r\n`])); + + // Combine all chunks into a single Blob + const body = new Blob(chunks); + + // Convert Blob to ArrayBuffer for compatibility + const arrayBuffer = await body.arrayBuffer(); + + const res = await this.batchIngestCaller.call( _getFetchImplementation(), `${this.apiUrl}/runs/multipart`, { method: "POST", headers: { ...this.headers, + "Content-Type": `multipart/form-data; boundary=${boundary}`, }, - body: formData, + body: arrayBuffer, signal: AbortSignal.timeout(this.timeout_ms), ...this.fetchOptions, } ); - } catch (e) { - let errorMessage = "Failed to multipart ingest runs"; - // eslint-disable-next-line no-instanceof/no-instanceof - if (e instanceof Error) { - errorMessage += `: ${e.stack || e.message}`; - } else { - errorMessage += `: ${String(e)}`; - } - console.warn(`${errorMessage.trim()}\n\nContext: ${context}`); + await raiseForStatus(res, "ingest multipart runs", true); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } catch (e: any) { + console.warn(`${e.message.trim()}\n\nContext: ${context}`); } } diff --git a/js/src/index.ts b/js/src/index.ts index 3c5fedef9..e0831adc2 100644 --- a/js/src/index.ts +++ b/js/src/index.ts @@ -14,4 +14,4 @@ export { RunTree, type 
RunTreeConfig } from "./run_trees.js"; export { overrideFetchImplementation } from "./singletons/fetch.js"; // Update using yarn bump-version -export const __version__ = "0.2.5"; +export const __version__ = "0.2.6"; diff --git a/js/src/tests/batch_client.int.test.ts b/js/src/tests/batch_client.int.test.ts index 5e4126c74..d87adfe1a 100644 --- a/js/src/tests/batch_client.int.test.ts +++ b/js/src/tests/batch_client.int.test.ts @@ -2,6 +2,7 @@ import { v4 as uuidv4 } from "uuid"; import * as fs from "node:fs"; import * as path from "node:path"; import { fileURLToPath } from "node:url"; +import nodeFetch from "node-fetch"; import { Client } from "../client.js"; import { RunTree, convertToDottedOrderFormat } from "../run_trees.js"; @@ -11,6 +12,7 @@ import { waitUntilRunFound, } from "./utils.js"; import { traceable } from "../traceable.js"; +import { overrideFetchImplementation } from "../singletons/fetch.js"; test.concurrent( "Test persist update run", @@ -229,6 +231,7 @@ test.concurrent( outputs: { output: ["Hi"] }, dotted_order: dottedOrder, trace_id: runId, + end_time: Math.floor(new Date().getTime() / 1000), }); await Promise.all([ @@ -282,3 +285,32 @@ test.skip("very large runs", async () => { await langchainClient.deleteProject({ projectName }); }, 180_000); + +test("multipart should work with overridden node-fetch", async () => { + overrideFetchImplementation(nodeFetch); + + const langchainClient = new Client({ + autoBatchTracing: true, + timeout_ms: 120_000, + }); + + const projectName = "__test_node_fetch" + uuidv4().substring(0, 4); + await deleteProject(langchainClient, projectName); + + await traceable( + async () => { + return "testing with node fetch"; + }, + { + project_name: projectName, + client: langchainClient, + tracingEnabled: true, + } + )(); + + await langchainClient.awaitPendingTraceBatches(); + + await Promise.all([waitUntilProjectFound(langchainClient, projectName)]); + + await langchainClient.deleteProject({ projectName }); +}); diff --git a/js/src/tests/batch_client.test.ts b/js/src/tests/batch_client.test.ts index f81d24bfa..e3b358aa6 100644 --- a/js/src/tests/batch_client.test.ts +++ b/js/src/tests/batch_client.test.ts @@ -7,24 +7,36 @@ import { convertToDottedOrderFormat } from "../run_trees.js"; import { _getFetchImplementation } from "../singletons/fetch.js"; import { RunCreate } from "../schemas.js"; -const parseMockRequestBody = async (body: string | FormData) => { +const parseMockRequestBody = async (body: string | ArrayBuffer) => { if (typeof body === "string") { return JSON.parse(body); } // Typing is missing - const entries: any[] = Array.from((body as any).entries()); + const rawMultipart = new TextDecoder().decode(body); + // Parse the multipart form data boundary from the raw text + const boundary = rawMultipart.split("\r\n")[0].trim(); + // Split the multipart body into individual parts + const parts = rawMultipart.split(boundary).slice(1, -1); + + const entries: [string, any][] = parts.map((part) => { + const [headers, ...contentParts] = part.trim().split("\r\n\r\n"); + const content = contentParts.join("\r\n\r\n"); + // Extract the name from Content-Disposition header + const nameMatch = headers.match(/name="([^"]+)"/); + const name = nameMatch ? 
nameMatch[1] : ""; + return [name, content.trim()]; + }); const reconstructedBody: any = { post: [], patch: [], }; for (const [key, value] of entries) { let [method, id, type] = key.split("."); - const text = await value.text(); let parsedValue; try { - parsedValue = JSON.parse(text); + parsedValue = JSON.parse(value); } catch (e) { - parsedValue = text; + parsedValue = value; } // if (method === "attachment") { // for (const item of reconstructedBody.post) { @@ -131,7 +143,7 @@ describe.each(ENDPOINT_TYPES)( _getFetchImplementation(), expectedTraceURL, expect.objectContaining({ - body: expect.any(endpointType === "batch" ? String : FormData), + body: expect.any(endpointType === "batch" ? String : ArrayBuffer), }) ); }); @@ -245,7 +257,7 @@ describe.each(ENDPOINT_TYPES)( _getFetchImplementation(), expectedTraceURL, expect.objectContaining({ - body: expect.any(endpointType === "batch" ? String : FormData), + body: expect.any(endpointType === "batch" ? String : ArrayBuffer), }) ); }); @@ -326,7 +338,7 @@ describe.each(ENDPOINT_TYPES)( _getFetchImplementation(), expectedTraceURL, expect.objectContaining({ - body: expect.any(endpointType === "batch" ? String : FormData), + body: expect.any(endpointType === "batch" ? String : ArrayBuffer), }) ); }); @@ -612,9 +624,21 @@ describe.each(ENDPOINT_TYPES)( const calledRequestParam: any = callSpy.mock.calls[0][2]; const calledRequestParam2: any = callSpy.mock.calls[1][2]; + const firstBatchBody = await parseMockRequestBody( + calledRequestParam?.body + ); + const secondBatchBody = await parseMockRequestBody( + calledRequestParam2?.body + ); + + const initialBatchBody = + firstBatchBody.post.length === 10 ? firstBatchBody : secondBatchBody; + const followupBatchBody = + firstBatchBody.post.length === 10 ? secondBatchBody : firstBatchBody; + // Queue should drain as soon as size limit is reached, // sending both batches - expect(await parseMockRequestBody(calledRequestParam?.body)).toEqual({ + expect(initialBatchBody).toEqual({ post: runIds.slice(0, 10).map((runId, i) => expect.objectContaining({ id: runId, @@ -628,7 +652,7 @@ describe.each(ENDPOINT_TYPES)( patch: [], }); - expect(await parseMockRequestBody(calledRequestParam2?.body)).toEqual({ + expect(followupBatchBody).toEqual({ post: runIds.slice(10).map((runId, i) => expect.objectContaining({ id: runId, @@ -903,7 +927,7 @@ describe.each(ENDPOINT_TYPES)( _getFetchImplementation(), expectedTraceURL, expect.objectContaining({ - body: expect.any(endpointType === "batch" ? String : FormData), + body: expect.any(endpointType === "batch" ? 
String : ArrayBuffer), }) ); }); diff --git a/js/yarn.lock b/js/yarn.lock index eed0130ac..92dc3ffb1 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1395,13 +1395,23 @@ dependencies: uuid "^10.0.0" -"@langchain/langgraph@^0.2.18": - version "0.2.18" - resolved "https://registry.yarnpkg.com/@langchain/langgraph/-/langgraph-0.2.18.tgz#254e9fa2462cfa4c468a87e2d34e09d629eb3d2f" - integrity sha512-72ZLIpyVMsuI2FdAnttgLx++rZZ1yKvu1A35d1A9XbcQGSfmfouXGQ7GMhVOFgmldVB3AlKKGOoahGpTwyXsbg== +"@langchain/langgraph-sdk@~0.0.20": + version "0.0.25" + resolved "https://registry.yarnpkg.com/@langchain/langgraph-sdk/-/langgraph-sdk-0.0.25.tgz#35feaf85436d5c9cc3131520db07dde07db28416" + integrity sha512-12QYwGuhZ9HPbXFb+SNyUse3V8Hx8d375ytHPuMPZ5nS+UTFozIBxLzCTE0QMoXt0Hfr4X772W/YP6P+v751gw== + dependencies: + "@types/json-schema" "^7.0.15" + p-queue "^6.6.2" + p-retry "4" + uuid "^9.0.0" + +"@langchain/langgraph@^0.2.20": + version "0.2.20" + resolved "https://registry.yarnpkg.com/@langchain/langgraph/-/langgraph-0.2.20.tgz#0773b9262351ed07a125d6ae6188def519488d3c" + integrity sha512-MMD4G++gHs+5OO5Uu75gduskTboJ8Q7ZAwzd1s64a1Y/38pdgDqJdYRHRCGpx8eeCuKhsRzV2Sssnl5lujfj8w== dependencies: "@langchain/langgraph-checkpoint" "~0.0.10" - double-ended-queue "^2.1.0-0" + "@langchain/langgraph-sdk" "~0.0.20" uuid "^10.0.0" zod "^3.23.8" @@ -1634,6 +1644,11 @@ expect "^29.0.0" pretty-format "^29.0.0" +"@types/json-schema@^7.0.15": + version "7.0.15" + resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841" + integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA== + "@types/json-schema@^7.0.9": version "7.0.12" resolved "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.12.tgz" @@ -2378,11 +2393,6 @@ dotenv@^16.1.3: resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-16.1.3.tgz#0c67e90d0ddb48d08c570888f709b41844928210" integrity sha512-FYssxsmCTtKL72fGBSvb1K9dRz0/VZeWqFme/vSb7r7323x4CRaHu4LvQ5JG3+s6yt2YPbBrkpiEODktfyjI9A== -double-ended-queue@^2.1.0-0: - version "2.1.0-0" - resolved "https://registry.yarnpkg.com/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz#103d3527fd31528f40188130c841efdd78264e5c" - integrity sha512-+BNfZ+deCo8hMNpDqDnvT+c0XpJ5cUa6mqYq89bho2Ifze4URTqRkcwR399hWoTrTkbZ/XJYDgP6rc7pRgffEQ== - electron-to-chromium@^1.4.411: version "1.4.414" resolved "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.414.tgz" @@ -3948,7 +3958,7 @@ node-domexception@1.0.0: resolved "https://registry.yarnpkg.com/node-domexception/-/node-domexception-1.0.0.tgz#6888db46a1f71c0b76b3f7555016b63fe64766e5" integrity sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ== -node-fetch@^2.6.7: +node-fetch@^2.6.7, node-fetch@^2.7.0: version "2.7.0" resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.7.0.tgz#d0f0fa6e3e2dc1d27efcd8ad99d550bda94d187d" integrity sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A== @@ -4807,6 +4817,11 @@ uuid@^10.0.0: resolved "https://registry.yarnpkg.com/uuid/-/uuid-10.0.0.tgz#5a95aa454e6e002725c79055fd42aaba30ca6294" integrity sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ== +uuid@^9.0.0: + version "9.0.1" + resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.1.tgz#e188d4c8853cc722220392c424cd637f32293f30" + integrity 
sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA== + v8-compile-cache-lib@^3.0.1: version "3.0.1" resolved "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz" From 27d8cafa1d793797d1db083d3b94e3046d06568e Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:58:55 -0500 Subject: [PATCH 19/31] python[patch]: accept simple summary eval signature (#1227) --- python/langsmith/evaluation/_runner.py | 45 ++++++++ .../unit_tests/evaluation/test_runner.py | 102 +++++++++++++++++- 2 files changed, 146 insertions(+), 1 deletion(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 8ed55f6bf..6a7f3ab3d 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1584,6 +1584,7 @@ def _wrap_summary_evaluators( ) -> List[SUMMARY_EVALUATOR_T]: def _wrap(evaluator: SUMMARY_EVALUATOR_T) -> SUMMARY_EVALUATOR_T: eval_name = getattr(evaluator, "__name__", "BatchEvaluator") + evaluator = _normalize_summary_evaluator(evaluator) @functools.wraps(evaluator) def _wrapper_inner( @@ -1943,3 +1944,47 @@ def _import_langchain_runnable() -> Optional[type]: def _is_langchain_runnable(o: Any) -> bool: return bool((Runnable := _import_langchain_runnable()) and isinstance(o, Runnable)) + + +def _normalize_summary_evaluator(func: Callable) -> SUMMARY_EVALUATOR_T: + supported_args = ("runs", "examples", "inputs", "outputs", "reference_outputs") + sig = inspect.signature(func) + positional_args = [ + pname + for pname, p in sig.parameters.items() + if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) + ] + if not positional_args or ( + not all(pname in supported_args for pname in positional_args) + and len(positional_args) != 2 + ): + msg = ( + f"Invalid evaluator function. Must have at least one positional " + f"argument. Supported positional arguments are {supported_args}." + ) + raise ValueError(msg) + # For backwards compatibility we assume custom arg names are Sequence[Run] and + # Sequence[Example] types, respectively. 
+ elif not all( + pname in supported_args for pname in positional_args + ) or positional_args == ["runs", "examples"]: + return func + else: + + def wrapper( + runs: Sequence[schemas.Run], examples: Sequence[schemas.Example] + ) -> Union[EvaluationResult, EvaluationResults]: + arg_map = { + "runs": runs, + "examples": examples, + "inputs": [example.inputs for example in examples], + "outputs": [run.outputs or {} for run in runs], + "reference_outputs": [example.outputs or {} for example in examples], + } + args = (arg_map[arg] for arg in positional_args) + return func(*args) + + wrapper.__name__ = ( + getattr(func, "__name__") if hasattr(func, "__name__") else wrapper.__name__ + ) + return wrapper # type: ignore[return-value] diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 408d4508d..c2a0b8f2d 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -20,7 +20,7 @@ from langsmith import schemas as ls_schemas from langsmith.client import Client from langsmith.evaluation._arunner import aevaluate, aevaluate_existing -from langsmith.evaluation._runner import evaluate_existing +from langsmith.evaluation._runner import _normalize_summary_evaluator, evaluate_existing from langsmith.evaluation.evaluator import _normalize_evaluator_func @@ -221,6 +221,15 @@ def eval_list(run, example): {"score": 1, "key": "list_eval_int"}, ] + def summary_eval_runs_examples(runs_, examples_): + return {"score": len(runs_[0].dotted_order)} + + def summary_eval_inputs_outputs(inputs, outputs): + return {"score": len([x["in"] for x in inputs])} + + def summary_eval_outputs_reference(outputs, reference_outputs): + return {"score": len([x["answer"] for x in reference_outputs])} + evaluators = [ score_value_first, score_unpacked_inputs_outputs, @@ -230,11 +239,18 @@ def eval_list(run, example): eval_list, ] + summary_evaluators = [ + summary_eval_runs_examples, + summary_eval_inputs_outputs, + summary_eval_outputs_reference, + ] + results = evaluate( predict, client=client, data=dev_split, evaluators=evaluators, + summary_evaluators=summary_evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -262,6 +278,11 @@ def eval_list(run, example): assert r["run"].outputs["output"] == r["example"].inputs["in"] + 1 # type: ignore assert set(r["run"].outputs.keys()) == {"output"} # type: ignore assert len(r["evaluation_results"]["results"]) == len(evaluators) + 1 + assert all( + er.score is not None or er.value is not None + for er in r["evaluation_results"]["results"] + ) + assert len(results._summary_results["results"]) == len(summary_evaluators) assert fake_request.created_session _wait_until(lambda: fake_request.runs) @@ -452,6 +473,15 @@ async def eval_list(run, example): {"score": 1, "key": "list_eval_int"}, ] + def summary_eval_runs_examples(runs_, examples_): + return {"score": len(runs_[0].dotted_order)} + + def summary_eval_inputs_outputs(inputs, outputs): + return {"score": len([x["in"] for x in inputs])} + + def summary_eval_outputs_reference(outputs, reference_outputs): + return {"score": len([x["answer"] for x in reference_outputs])} + evaluators = [ score_value_first, score_unpacked_inputs_outputs, @@ -461,11 +491,18 @@ async def eval_list(run, example): eval_list, ] + summary_evaluators = [ + summary_eval_runs_examples, + summary_eval_inputs_outputs, + summary_eval_outputs_reference, + ] + results = await aevaluate( predict, client=client, data=dev_split, 
evaluators=evaluators, + summary_evaluators=summary_evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -497,6 +534,11 @@ async def eval_list(run, example): async for r in results: assert r["run"].outputs["output"] == r["example"].inputs["in"] + 1 # type: ignore assert set(r["run"].outputs.keys()) == {"output"} # type: ignore + assert all( + er.score is not None or er.value is not None + for er in r["evaluation_results"]["results"] + ) + assert len(results._summary_results["results"]) == len(summary_evaluators) assert fake_request.created_session _wait_until(lambda: fake_request.runs) @@ -566,3 +608,61 @@ async def atarget(x): await aevaluate( atarget, data=ds_examples, evaluators=[eval_], client=client ) + + +def summary_eval_runs_examples(runs_, examples_): + return {"score": len(runs_[0].dotted_order)} + + +def summary_eval_inputs_outputs(inputs, outputs): + return {"score": max([len(x["in"]) for x in inputs])} + + +def summary_eval_outputs_reference(outputs, reference_outputs): + return {"score": min([len(x["response"]) for x in outputs])} + + +@pytest.mark.parametrize( + "evaluator", + [ + summary_eval_runs_examples, + summary_eval_inputs_outputs, + summary_eval_outputs_reference, + ], +) +def test__normalize_summary_evaluator(evaluator: Callable) -> None: + normalized = _normalize_summary_evaluator(evaluator) + runs = [ + ls_schemas.Run( + name="foo", + start_time=datetime.now(), + run_type="chain", + id=uuid.uuid4(), + dotted_order="a" * 12, + outputs={"response": "c" * 12}, + ) + ] + examples = [ + ls_schemas.Example( + id=uuid.uuid4(), + inputs={"in": "b" * 12}, + ) + ] + assert normalized(runs, examples)["score"] == 12 + + +def summary_eval_kwargs(*, runs, examples): + return + + +def summary_eval_unknown_positional_args(runs, examples, foo): + return + + +@pytest.mark.parametrize( + "evaluator", + [summary_eval_kwargs, summary_eval_unknown_positional_args], +) +def test__normalize_summary_evaluator_invalid(evaluator: Callable) -> None: + with pytest.raises(ValueError, match="Invalid evaluator function."): + _normalize_summary_evaluator(evaluator) From a07d3d6564f8d0be3dfd79228767db0031056fad Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:25:21 -0800 Subject: [PATCH 20/31] Bump cross-spawn from 7.0.3 to 7.0.6 in /js (#1233) Bumps [cross-spawn](https://github.com/moxystudio/node-cross-spawn) from 7.0.3 to 7.0.6.
Changelog

Sourced from cross-spawn's changelog.

7.0.6 (2024-11-18)

Bug Fixes

  • update cross-spawn version to 7.0.5 in package-lock.json (f700743)

7.0.5 (2024-11-07)

Bug Fixes

  • fix escaping bug introduced by backtracking (640d391)

7.0.4 (2024-11-07)

Bug Fixes

Commits
  • 77cd97f chore(release): 7.0.6
  • 6717de4 chore: upgrade standard-version
  • f700743 fix: update cross-spawn version to 7.0.5 in package-lock.json
  • 9a7e3b2 chore: fix build status badge
  • 0852683 chore(release): 7.0.5
  • 640d391 fix: fix escaping bug introduced by backtracking
  • bff0c87 chore: remove codecov
  • a7c6abc chore: replace travis with github workflows
  • 9b9246e chore(release): 7.0.4
  • 5ff3a07 fix: disable regexp backtracking (#160)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=cross-spawn&package-manager=npm_and_yarn&previous-version=7.0.3&new-version=7.0.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:

- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)

You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/langchain-ai/langsmith-sdk/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/yarn.lock b/js/yarn.lock index 92dc3ffb1..1c7b89807 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -2283,9 +2283,9 @@ cross-env@^7.0.3: cross-spawn "^7.0.1" cross-spawn@^7.0.1, cross-spawn@^7.0.2, cross-spawn@^7.0.3: - version "7.0.3" - resolved "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz" - integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w== + version "7.0.6" + resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.6.tgz#8a58fe78f00dcd70c370451759dfbfaf03e8ee9f" + integrity sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA== dependencies: path-key "^3.1.0" shebang-command "^2.0.0" From b95f6d2ca1bb364cd5de4c924fc47db945817d2b Mon Sep 17 00:00:00 2001 From: Isaac Francisco <78627776+isahers1@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:04:05 -0800 Subject: [PATCH 21/31] sdk patch: raise errors on update/create examples when passing mismatching length sequences (#1238) --- python/langsmith/client.py | 31 ++++++++++++++ python/tests/integration_tests/test_client.py | 41 +++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 5e906507d..1647b790d 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -3413,6 +3413,22 @@ def create_examples( if dataset_id is None: dataset_id = self.read_dataset(dataset_name=dataset_name).id + + sequence_args = { + "outputs": outputs, + "metadata": metadata, + "splits": splits, + "ids": ids, + "source_run_ids": source_run_ids, + } + # Since inputs are required, we will check against them + input_len = len(inputs) + for arg_name, arg_value in sequence_args.items(): + if arg_value is not None and len(arg_value) != input_len: + raise ValueError( + f"Length of {arg_name} ({len(arg_value)}) does not match" + f" length of inputs ({input_len})" + ) examples = [ { "inputs": in_, @@ -3816,6 +3832,21 @@ def update_examples( Dict[str, Any] The response from the server (specifies the number of examples updated). 
""" + sequence_args = { + "inputs": inputs, + "outputs": outputs, + "metadata": metadata, + "splits": splits, + "dataset_ids": dataset_ids, + } + # Since inputs are required, we will check against them + examples_len = len(example_ids) + for arg_name, arg_value in sequence_args.items(): + if arg_value is not None and len(arg_value) != examples_len: + raise ValueError( + f"Length of {arg_name} ({len(arg_value)}) does not match" + f" length of examples ({examples_len})" + ) examples = [ { "id": id_, diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index 57a6e2171..9bea700cd 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -1018,3 +1018,44 @@ def create_encoder(*args, **kwargs): myobj["key_1"] assert not caplog.records + + +def test_examples_length_validation(langchain_client: Client) -> None: + """Test that mismatched lengths raise ValueError for create and update examples.""" + dataset_name = "__test_examples_length_validation" + uuid4().hex[:4] + dataset = langchain_client.create_dataset(dataset_name=dataset_name) + + # Test create_examples validation + inputs = [{"text": "hello"}, {"text": "world"}] + outputs = [{"response": "hi"}] # One less than inputs + with pytest.raises(ValueError) as exc_info: + langchain_client.create_examples( + inputs=inputs, outputs=outputs, dataset_id=dataset.id + ) + assert "Length of outputs (1) does not match length of inputs (2)" in str( + exc_info.value + ) + + # Create some valid examples for testing update + langchain_client.create_examples( + inputs=[{"text": "hello"}, {"text": "world"}], + outputs=[{"response": "hi"}, {"response": "earth"}], + dataset_id=dataset.id, + ) + example_ids = [ + example.id for example in langchain_client.list_examples(dataset_id=dataset.id) + ] + + # Test update_examples validation + with pytest.raises(ValueError) as exc_info: + langchain_client.update_examples( + example_ids=example_ids, + inputs=[{"text": "new hello"}], # One less than example_ids + outputs=[{"response": "new hi"}, {"response": "new earth"}], + ) + assert "Length of inputs (1) does not match length of examples (2)" in str( + exc_info.value + ) + + # Clean up + langchain_client.delete_dataset(dataset_id=dataset.id) From 232b9ca6671002477993c514eb8f46223b116ab5 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:32:19 -0800 Subject: [PATCH 22/31] Support comments in evaluate_comparative (#1211) --- python/langsmith/evaluation/_runner.py | 12 +++++++++++- python/langsmith/evaluation/evaluator.py | 3 +++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 6a7f3ab3d..2ab94700c 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -647,16 +647,20 @@ def evaluate_comparative( ... }, ... ) ... tool_args = completion.choices[0].message.tool_calls[0].function.arguments - ... preference = json.loads(tool_args)["preferred_option"] + ... loaded_args = json.loads(tool_args) + ... preference = loaded_args["preferred_option"] + ... comment = loaded_args["reasoning"] ... if preference == "A": ... return { ... "key": "ranked_preference", ... "scores": {runs[0].id: 1, runs[1].id: 0}, + ... "comment": comment, ... } ... else: ... return { ... "key": "ranked_preference", ... "scores": {runs[0].id: 0, runs[1].id: 1}, + ... "comment": comment, ... 
} >>> def score_length_difference(runs: list, example: schemas.Example): ... # Just return whichever response is longer. @@ -781,12 +785,18 @@ def evaluate_and_submit_feedback( result = comparator.compare_runs(runs_list, example) if client is None: raise ValueError("Client is required to submit feedback.") + comments = ( + {str(rid): result.comment for rid in result.scores} + if isinstance(result.comment, str) + else (result.comment or {}) + ) for run_id, score in result.scores.items(): executor.submit( client.create_feedback, run_id=run_id, key=result.key, score=score, + comment=comments.get(str(run_id)), comparative_experiment_id=comparative_experiment.id, source_run_id=result.source_run_id, feedback_group_id=feedback_group_id, diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index feb0e95e4..668f99e9d 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -158,6 +158,9 @@ class ComparisonEvaluationResult(BaseModel): """The scores for each run in the comparison.""" source_run_id: Optional[Union[uuid.UUID, str]] = None """The ID of the trace of the evaluator itself.""" + comment: Optional[Union[str, Dict[Union[uuid.UUID, str], str]]] = None + """Comment for the scores. If a string, it's shared across all target runs. + If a dict, it maps run IDs to individual comments.""" _COMPARISON_OUTPUT = Union[ComparisonEvaluationResult, dict] From a3700ddb5e319c70a7d674a7cbf69875813455bc Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Wed, 20 Nov 2024 14:11:04 -0800 Subject: [PATCH 23/31] [python] Ensure timezone (#1239) --- python/langsmith/schemas.py | 6 ++++-- python/pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py index dff6d212a..38643d877 100644 --- a/python/langsmith/schemas.py +++ b/python/langsmith/schemas.py @@ -21,9 +21,9 @@ from typing_extensions import NotRequired, TypedDict try: - from pydantic.v1 import ( # type: ignore[import] + from pydantic.v1 import ( BaseModel, - Field, + Field, # type: ignore[import] PrivateAttr, StrictBool, StrictFloat, @@ -571,6 +571,8 @@ def __init__(self, _host_url: Optional[str] = None, **kwargs: Any) -> None: """Initialize a Run object.""" super().__init__(**kwargs) self._host_url = _host_url + if self.start_time.tzinfo is None: + self.start_time = self.start_time.replace(tzinfo=timezone.utc) @property def url(self) -> Optional[str]: diff --git a/python/pyproject.toml b/python/pyproject.toml index 191d61b22..d6425e255 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langsmith" -version = "0.1.144rc1" +version = "0.1.144" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
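As an aside on the schemas.py change above: naive `start_time` values are now assumed to be UTC and given an explicit timezone when a `Run` is constructed. A minimal standalone sketch of that convention (plain Python, no LangSmith imports):

```python
from datetime import datetime, timezone

def ensure_utc(dt: datetime) -> datetime:
    # Naive timestamps are treated as UTC, mirroring the Run.__init__ change above.
    return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt

print(ensure_utc(datetime(2024, 11, 20, 14, 11)))  # 2024-11-20 14:11:00+00:00
```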
authors = ["LangChain "] license = "MIT" From 09ab56bef2ee57a5acd61b020b4aac3ae1f9c1be Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Wed, 20 Nov 2024 16:32:29 -0800 Subject: [PATCH 24/31] Attachments in patch (#1212) Co-authored-by: isaac hershenson --- python/langsmith/client.py | 8 ++++++ python/langsmith/run_trees.py | 30 +++++++++++++++++++++ python/tests/unit_tests/test_run_helpers.py | 25 +++++++++++++---- 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 1647b790d..9dd1442a1 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -1620,6 +1620,9 @@ def update_run( events: Optional[Sequence[dict]] = None, extra: Optional[Dict] = None, tags: Optional[List[str]] = None, + attachments: Optional[ + Dict[str, tuple[str, bytes] | ls_schemas.Attachment] + ] = None, **kwargs: Any, ) -> None: """Update a run in the LangSmith API. @@ -1644,6 +1647,9 @@ def update_run( The extra information for the run. tags : List[str] or None, default=None The tags for the run. + attachments: dict[str, ls_schemas.Attachment] or None, default=None + A dictionary of attachments to add to the run. The keys are the attachment names, + and the values are Attachment objects containing the data and mime type. **kwargs : Any Kwargs are ignored. """ @@ -1658,6 +1664,8 @@ def update_run( "session_id": kwargs.pop("session_id", None), "session_name": kwargs.pop("session_name", None), } + if attachments: + data["attachments"] = attachments use_multipart = ( self.tracing_queue is not None # batch ingest requires trace_id and dotted_order to be set diff --git a/python/langsmith/run_trees.py b/python/langsmith/run_trees.py index d3083e4f6..63f0cb4e5 100644 --- a/python/langsmith/run_trees.py +++ b/python/langsmith/run_trees.py @@ -301,6 +301,15 @@ def post(self, exclude_child_runs: bool = True) -> None: """Post the run tree to the API asynchronously.""" kwargs = self._get_dicts_safe() self.client.create_run(**kwargs) + if attachments := kwargs.get("attachments"): + keys = [str(name) for name in attachments] + self.events.append( + { + "name": "uploaded_attachment", + "time": datetime.now(timezone.utc).isoformat(), + "message": set(keys), + } + ) if not exclude_child_runs: for child_run in self.child_runs: child_run.post(exclude_child_runs=False) @@ -309,6 +318,26 @@ def patch(self) -> None: """Patch the run tree to the API in a background thread.""" if not self.end_time: self.end() + attachments = self.attachments + try: + # Avoid loading the same attachment twice + if attachments: + uploaded = next( + ( + ev + for ev in self.events + if ev.get("name") == "uploaded_attachment" + ), + None, + ) + if uploaded: + attachments = { + a: v + for a, v in attachments.items() + if a not in uploaded["message"] + } + except Exception as e: + logger.warning(f"Error filtering attachments to upload: {e}") self.client.update_run( name=self.name, run_id=self.id, @@ -322,6 +351,7 @@ def patch(self) -> None: events=self.events, tags=self.tags, extra=self.extra, + attachments=attachments, ) def wait(self) -> None: diff --git a/python/tests/unit_tests/test_run_helpers.py b/python/tests/unit_tests/test_run_helpers.py index dbbbe1adf..34df400e7 100644 --- a/python/tests/unit_tests/test_run_helpers.py +++ b/python/tests/unit_tests/test_run_helpers.py @@ -1714,7 +1714,11 @@ def my_func( val: int, att1: ls_schemas.Attachment, att2: Annotated[tuple, ls_schemas.Attachment], + run_tree: RunTree, ): + 
run_tree.attachments["anoutput"] = ls_schemas.Attachment( + mime_type="text/plain", data=b"noidea" + ) return "foo" mock_client = _get_mock_client( @@ -1739,11 +1743,15 @@ def my_func( ) assert result == "foo" - calls = _get_calls(mock_client) - datas = _get_multipart_data(calls) + for _ in range(10): + calls = _get_calls(mock_client) + datas = _get_multipart_data(calls) + if len(datas) >= 7: + break + time.sleep(1) - # main run, inputs, outputs, events, att1, att2 - assert len(datas) == 6 + # main run, inputs, outputs, events, att1, att2, anoutput + assert len(datas) == 7 # First 4 are type application/json (run, inputs, outputs, events) trace_id = datas[0][0].split(".")[1] _, (_, run_stuff) = next( @@ -1760,7 +1768,7 @@ def my_func( data for data in datas if data[0] == f"post.{trace_id}.inputs" ) assert json.loads(inputs) == {"val": 42} - # last two are the mime types provided + # last three are the mime types provided _, (mime_type1, content1) = next( data for data in datas if data[0] == f"attachment.{trace_id}.att1" ) @@ -1772,3 +1780,10 @@ def my_func( ) assert mime_type2 == "application/octet-stream" assert content2 == b"content2" + + # Assert that anoutput is uploaded + _, (mime_type_output, content_output) = next( + data for data in datas if data[0] == f"attachment.{trace_id}.anoutput" + ) + assert mime_type_output == "text/plain" + assert content_output == b"noidea" From 4ef9789d930ccbf5500ae31e01e46f8df29d7b82 Mon Sep 17 00:00:00 2001 From: Jacob Lee Date: Thu, 21 Nov 2024 08:11:00 -0800 Subject: [PATCH 25/31] fix(js): Add LANGSMITH_ env vars to metadata (#1243) --- js/package.json | 2 +- js/src/index.ts | 2 +- js/src/utils/env.ts | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/js/package.json b/js/package.json index 1d6e0e2ee..4b8881b4d 100644 --- a/js/package.json +++ b/js/package.json @@ -1,6 +1,6 @@ { "name": "langsmith", - "version": "0.2.6", + "version": "0.2.7", "description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.", "packageManager": "yarn@1.22.19", "files": [ diff --git a/js/src/index.ts b/js/src/index.ts index e0831adc2..daf8c12a8 100644 --- a/js/src/index.ts +++ b/js/src/index.ts @@ -14,4 +14,4 @@ export { RunTree, type RunTreeConfig } from "./run_trees.js"; export { overrideFetchImplementation } from "./singletons/fetch.js"; // Update using yarn bump-version -export const __version__ = "0.2.6"; +export const __version__ = "0.2.7"; diff --git a/js/src/utils/env.ts b/js/src/utils/env.ts index e02eae3a8..5330fa4de 100644 --- a/js/src/utils/env.ts +++ b/js/src/utils/env.ts @@ -132,11 +132,16 @@ export function getLangChainEnvVarsMetadata(): Record { "LANGCHAIN_TRACING_V2", "LANGCHAIN_PROJECT", "LANGCHAIN_SESSION", + "LANGSMITH_API_KEY", + "LANGSMITH_ENDPOINT", + "LANGSMITH_TRACING_V2", + "LANGSMITH_PROJECT", + "LANGSMITH_SESSION", ]; for (const [key, value] of Object.entries(allEnvVars)) { if ( - key.startsWith("LANGCHAIN_") && + (key.startsWith("LANGCHAIN_") || key.startsWith("LANGSMITH_")) && typeof value === "string" && !excluded.includes(key) && !key.toLowerCase().includes("key") && From 25e606b696703adb64d8c18d1652ffd134fe2cda Mon Sep 17 00:00:00 2001 From: Predrag Gruevski <2348618+obi1kenobi@users.noreply.github.com> Date: Thu, 21 Nov 2024 11:32:43 -0500 Subject: [PATCH 26/31] Add placeholder workflow for building PyO3 wheels. (#1244) GitHub doesn't allow running new workflows from branches if the workflow by that name doesn't already exist on the main branch. 
I'm creating this placeholder workflow to work around that, so I can then trigger and test the workflow from my own branch. --- .../workflows/build_langsmith_pyo3_wheels.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .github/workflows/build_langsmith_pyo3_wheels.yml diff --git a/.github/workflows/build_langsmith_pyo3_wheels.yml b/.github/workflows/build_langsmith_pyo3_wheels.yml new file mode 100644 index 000000000..1e36fccc4 --- /dev/null +++ b/.github/workflows/build_langsmith_pyo3_wheels.yml @@ -0,0 +1,16 @@ +name: Build langsmith_pyo3 wheels + +on: + push: + branches: + - main + pull_request: + branches: + - main + workflow_dispatch: + +jobs: + hello-world: + runs-on: ubuntu-20.04 + steps: + - run: echo 'hello world' From 9d526137e4a28ff809b273a93957a2d924b4aaf9 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Thu, 21 Nov 2024 13:05:54 -0500 Subject: [PATCH 27/31] python[patch]: summary evaluator simpler returns (#1241) --- python/langsmith/evaluation/_arunner.py | 2 +- python/langsmith/evaluation/_runner.py | 56 +-------- python/langsmith/evaluation/evaluator.py | 119 ++++++++++++++---- .../unit_tests/evaluation/test_runner.py | 13 +- 4 files changed, 106 insertions(+), 84 deletions(-) diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index a2c3b2705..f37901a4b 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -35,7 +35,6 @@ AEVALUATOR_T, DATA_T, EVALUATOR_T, - SUMMARY_EVALUATOR_T, ExperimentResultRow, _ExperimentManagerMixin, _extract_feedback_keys, @@ -52,6 +51,7 @@ _wrap_summary_evaluators, ) from langsmith.evaluation.evaluator import ( + SUMMARY_EVALUATOR_T, EvaluationResult, EvaluationResults, RunEvaluator, diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 2ab94700c..0f677f389 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -45,12 +45,14 @@ from langsmith import schemas from langsmith import utils as ls_utils from langsmith.evaluation.evaluator import ( + SUMMARY_EVALUATOR_T, ComparisonEvaluationResult, DynamicComparisonRunEvaluator, DynamicRunEvaluator, EvaluationResult, EvaluationResults, RunEvaluator, + _normalize_summary_evaluator, comparison_evaluator, run_evaluator, ) @@ -70,16 +72,6 @@ DATA_T = Union[str, uuid.UUID, Iterable[schemas.Example], schemas.Dataset] # Summary evaluator runs over the whole dataset # and reports aggregate metric(s) -SUMMARY_EVALUATOR_T = Union[ - Callable[ - [Sequence[schemas.Run], Sequence[schemas.Example]], - Union[EvaluationResult, EvaluationResults], - ], - Callable[ - [List[schemas.Run], List[schemas.Example]], - Union[EvaluationResult, EvaluationResults], - ], -] # Row-level evaluator EVALUATOR_T = Union[ RunEvaluator, @@ -1954,47 +1946,3 @@ def _import_langchain_runnable() -> Optional[type]: def _is_langchain_runnable(o: Any) -> bool: return bool((Runnable := _import_langchain_runnable()) and isinstance(o, Runnable)) - - -def _normalize_summary_evaluator(func: Callable) -> SUMMARY_EVALUATOR_T: - supported_args = ("runs", "examples", "inputs", "outputs", "reference_outputs") - sig = inspect.signature(func) - positional_args = [ - pname - for pname, p in sig.parameters.items() - if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) - ] - if not positional_args or ( - not all(pname in supported_args for pname in positional_args) - and len(positional_args) != 2 - ): - msg = ( 
- f"Invalid evaluator function. Must have at least one positional " - f"argument. Supported positional arguments are {supported_args}." - ) - raise ValueError(msg) - # For backwards compatibility we assume custom arg names are Sequence[Run] and - # Sequence[Example] types, respectively. - elif not all( - pname in supported_args for pname in positional_args - ) or positional_args == ["runs", "examples"]: - return func - else: - - def wrapper( - runs: Sequence[schemas.Run], examples: Sequence[schemas.Example] - ) -> Union[EvaluationResult, EvaluationResults]: - arg_map = { - "runs": runs, - "examples": examples, - "inputs": [example.inputs for example in examples], - "outputs": [run.outputs or {} for run in runs], - "reference_outputs": [example.outputs or {} for example in examples], - } - args = (arg_map[arg] for arg in positional_args) - return func(*args) - - wrapper.__name__ = ( - getattr(func, "__name__") if hasattr(func, "__name__") else wrapper.__name__ - ) - return wrapper # type: ignore[return-value] diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 668f99e9d..9056ecfe7 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -1,5 +1,7 @@ """This module contains the evaluator classes for evaluating runs.""" +from __future__ import annotations + import asyncio import inspect import uuid @@ -19,6 +21,8 @@ from typing_extensions import TypedDict +from langsmith import schemas + try: from pydantic.v1 import ( # type: ignore[import] BaseModel, @@ -281,33 +285,11 @@ def _format_result( ], source_run_id: uuid.UUID, ) -> Union[EvaluationResult, EvaluationResults]: - if isinstance(result, (bool, float, int)): - result = {"score": result} - elif not result: - raise ValueError( - f"Expected a non-empty dict, str, bool, int, float, list, " - f"EvaluationResult, or EvaluationResults. Got {result}" - ) - elif isinstance(result, EvaluationResult): + if isinstance(result, EvaluationResult): if not result.source_run_id: result.source_run_id = source_run_id return result - elif isinstance(result, list): - if not all(isinstance(x, dict) for x in result): - raise ValueError( - f"Expected a list of dicts or EvaluationResults. Received {result}." - ) - result = {"results": result} # type: ignore[misc] - elif isinstance(result, str): - result = {"value": result} - elif isinstance(result, dict): - pass - else: - raise ValueError( - f"Expected a dict, str, bool, int, float, list, EvaluationResult, or " - f"EvaluationResults. Got {result}" - ) - + result = _format_evaluator_result(result) return self._coerce_evaluation_results(result, source_run_id) @property @@ -724,3 +706,92 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: else wrapper.__name__ ) return wrapper # type: ignore[return-value] + + +def _format_evaluator_result( + result: Union[EvaluationResults, dict, str, int, bool, float, list], +) -> Union[EvaluationResults, dict]: + if isinstance(result, (bool, float, int)): + result = {"score": result} + elif not result: + raise ValueError( + f"Expected a non-empty dict, str, bool, int, float, list, " + f"EvaluationResult, or EvaluationResults. Got {result}" + ) + elif isinstance(result, list): + if not all(isinstance(x, dict) for x in result): + raise ValueError( + f"Expected a list of dicts or EvaluationResults. Received {result}." 
+ ) + result = {"results": result} # type: ignore[misc] + elif isinstance(result, str): + result = {"value": result} + elif isinstance(result, dict): + pass + else: + raise ValueError( + f"Expected a dict, str, bool, int, float, list, EvaluationResult, or " + f"EvaluationResults. Got {result}" + ) + return result + + +SUMMARY_EVALUATOR_T = Union[ + Callable[ + [Sequence[schemas.Run], Sequence[schemas.Example]], + Union[EvaluationResult, EvaluationResults], + ], + Callable[ + [List[schemas.Run], List[schemas.Example]], + Union[EvaluationResult, EvaluationResults], + ], +] + + +def _normalize_summary_evaluator(func: Callable) -> SUMMARY_EVALUATOR_T: + supported_args = ("runs", "examples", "inputs", "outputs", "reference_outputs") + sig = inspect.signature(func) + positional_args = [ + pname + for pname, p in sig.parameters.items() + if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) + ] + if not positional_args or ( + not all(pname in supported_args for pname in positional_args) + and len(positional_args) != 2 + ): + msg = ( + f"Invalid evaluator function. Must have at least one positional " + f"argument. Supported positional arguments are {supported_args}." + ) + if positional_args: + msg += f" Received positional arguments {positional_args}." + raise ValueError(msg) + # For backwards compatibility we assume custom arg names are Sequence[Run] and + # Sequence[Example] types, respectively. + elif not all( + pname in supported_args for pname in positional_args + ) or positional_args == ["runs", "examples"]: + return func + else: + + def wrapper( + runs: Sequence[schemas.Run], examples: Sequence[schemas.Example] + ) -> Union[EvaluationResult, EvaluationResults]: + arg_map = { + "runs": runs, + "examples": examples, + "inputs": [example.inputs for example in examples], + "outputs": [run.outputs or {} for run in runs], + "reference_outputs": [example.outputs or {} for example in examples], + } + args = (arg_map[arg] for arg in positional_args) + result = func(*args) + if isinstance(result, EvaluationResult): + return result + return _format_evaluator_result(result) # type: ignore[return-value] + + wrapper.__name__ = ( + getattr(func, "__name__") if hasattr(func, "__name__") else wrapper.__name__ + ) + return wrapper # type: ignore[return-value] diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index c2a0b8f2d..7f45c22c7 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -20,8 +20,11 @@ from langsmith import schemas as ls_schemas from langsmith.client import Client from langsmith.evaluation._arunner import aevaluate, aevaluate_existing -from langsmith.evaluation._runner import _normalize_summary_evaluator, evaluate_existing -from langsmith.evaluation.evaluator import _normalize_evaluator_func +from langsmith.evaluation._runner import evaluate_existing +from langsmith.evaluation.evaluator import ( + _normalize_evaluator_func, + _normalize_summary_evaluator, +) class FakeRequest: @@ -225,10 +228,10 @@ def summary_eval_runs_examples(runs_, examples_): return {"score": len(runs_[0].dotted_order)} def summary_eval_inputs_outputs(inputs, outputs): - return {"score": len([x["in"] for x in inputs])} + return [{"score": len([x["in"] for x in inputs])}] def summary_eval_outputs_reference(outputs, reference_outputs): - return {"score": len([x["answer"] for x in reference_outputs])} + return len([x["answer"] for x in reference_outputs]) evaluators = [ 
score_value_first, @@ -619,7 +622,7 @@ def summary_eval_inputs_outputs(inputs, outputs): def summary_eval_outputs_reference(outputs, reference_outputs): - return {"score": min([len(x["response"]) for x in outputs])} + return min([len(x["response"]) for x in outputs]) @pytest.mark.parametrize( From e3ab54c4b37361219d3450e337418e63204eba37 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Thu, 21 Nov 2024 13:18:29 -0500 Subject: [PATCH 28/31] python[patch]: comparison evaluator simplification (#1240) - simpler comparison args - simpler comparison returns --- python/langsmith/evaluation/_runner.py | 4 +- python/langsmith/evaluation/evaluator.py | 152 ++++++++++++------ .../unit_tests/evaluation/test_evaluator.py | 2 +- .../unit_tests/evaluation/test_runner.py | 124 +++++++++++++- 4 files changed, 229 insertions(+), 53 deletions(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 0f677f389..556e697d4 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -801,9 +801,7 @@ def evaluate_and_submit_feedback( ) as executor: futures = [] for example_id, runs_list in tqdm(runs_dict.items()): - results[example_id] = { - "runs": runs_list, - } + results[example_id] = {"runs": runs_list} for comparator in comparators: if max_concurrency > 1: future = executor.submit( diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 9056ecfe7..02fab3b71 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -435,6 +435,10 @@ def __init__( func (Callable): A function that takes a `Run` and an optional `Example` as arguments, and returns an `EvaluationResult` or `EvaluationResults`. """ + func = _normalize_comparison_evaluator_func(func) + if afunc: + afunc = _normalize_comparison_evaluator_func(afunc) # type: ignore[assignment] + wraps(func)(self) from langsmith import run_helpers # type: ignore @@ -505,7 +509,7 @@ def compare_runs( example, langsmith_extra={"run_id": source_run_id, "tags": tags}, ) - return self._format_results(result, source_run_id) + return self._format_results(result, source_run_id, runs) async def acompare_runs( self, runs: Sequence[Run], example: Optional[Example] = None @@ -516,7 +520,7 @@ async def acompare_runs( provided arguments. Args: - run (Run): The run to be evaluated. + runs (Run): The runs to be evaluated. example (Optional[Example]): An optional example to be used in the evaluation. 
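Before the comparison changes continue below, here is what the summary-evaluator simplification from the previous patch looks like from the caller's side: positional arguments named `inputs`, `outputs`, or `reference_outputs` are filled in by `_normalize_summary_evaluator`, and a bare numeric return is coerced into a score. A sketch, assuming dataset outputs carry an `"answer"` field (that field name is illustrative):

```python
def exact_match_rate(outputs: list, reference_outputs: list) -> float:
    # `outputs` and `reference_outputs` are lists of dicts aligned per example.
    matches = sum(
        o.get("answer") == ref.get("answer")
        for o, ref in zip(outputs, reference_outputs)
    )
    return matches / len(outputs) if outputs else 0.0
```

Passed via `summary_evaluators=[exact_match_rate]`, the float is wrapped into `{"score": ...}` and, as before, the metric key falls back to the function name.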
@@ -533,7 +537,7 @@ async def acompare_runs( example, langsmith_extra={"run_id": source_run_id, "tags": tags}, ) - return self._format_results(result, source_run_id) + return self._format_results(result, source_run_id, runs) def __call__( self, runs: Sequence[Run], example: Optional[Example] = None @@ -567,53 +571,31 @@ def _get_tags(runs: Sequence[Run]) -> List[str]: tags.append("experiment:" + str(run.session_id)) return tags - def _coerce_evaluation_result( - self, - result: Union[EvaluationResult, dict], - source_run_id: uuid.UUID, - allow_no_key: bool = False, - ) -> EvaluationResult: - if isinstance(result, EvaluationResult): - if not result.source_run_id: - result.source_run_id = source_run_id - return result - try: - if "key" not in result: - if allow_no_key: - result["key"] = self._name - return EvaluationResult(**{"source_run_id": source_run_id, **result}) - except ValidationError as e: - raise ValueError( - "Expected an EvaluationResult object, or dict with a metric" - f" 'key' and optional 'score'; got {result}" - ) from e - - def _coerce_evaluation_results( - self, - results: Union[dict, EvaluationResults], - source_run_id: uuid.UUID, - ) -> Union[EvaluationResult, EvaluationResults]: - if "results" in results: - cp = results.copy() - cp["results"] = [ - self._coerce_evaluation_result(r, source_run_id=source_run_id) - for r in results["results"] - ] - return EvaluationResults(**cp) - - return self._coerce_evaluation_result( - cast(dict, results), allow_no_key=True, source_run_id=source_run_id - ) - def _format_results( self, - result: Union[dict, ComparisonEvaluationResult], + result: Union[dict, list, ComparisonEvaluationResult], source_run_id: uuid.UUID, + runs: Sequence[Run], ) -> ComparisonEvaluationResult: if isinstance(result, ComparisonEvaluationResult): if not result.source_run_id: result.source_run_id = source_run_id return result + elif isinstance(result, list): + result = { + "scores": {run.id: score for run, score in zip(runs, result)}, + "key": self._name, + "source_run_id": source_run_id, + } + elif isinstance(result, dict): + if "key" not in result: + result["key"] = self._name + else: + msg = ( + "Expected 'dict', 'list' or 'ComparisonEvaluationResult' result " + f"object. 
Received: {result=}" + ) + raise ValueError(msg) try: return ComparisonEvaluationResult( **{"source_run_id": source_run_id, **result} @@ -669,13 +651,15 @@ def _normalize_evaluator_func( else: if inspect.iscoroutinefunction(func): - async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + async def awrapper( + run: Run, example: Optional[Example] + ) -> _RUNNABLE_OUTPUT: arg_map = { "run": run, "example": example, - "inputs": example.inputs, + "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "reference_outputs": example.outputs or {}, + "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) return await func(*args) @@ -693,9 +677,83 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: arg_map = { "run": run, "example": example, - "inputs": example.inputs, + "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "reference_outputs": example.outputs or {}, + "reference_outputs": example.outputs or {} if example else {}, + } + args = (arg_map[arg] for arg in positional_args) + return func(*args) + + wrapper.__name__ = ( + getattr(func, "__name__") + if hasattr(func, "__name__") + else wrapper.__name__ + ) + return wrapper # type: ignore[return-value] + + +def _normalize_comparison_evaluator_func( + func: Callable, +) -> Union[ + Callable[[Sequence[Run], Optional[Example]], _COMPARISON_OUTPUT], + Callable[[Sequence[Run], Optional[Example]], Awaitable[_COMPARISON_OUTPUT]], +]: + supported_args = ("runs", "example", "inputs", "outputs", "reference_outputs") + sig = inspect.signature(func) + positional_args = [ + pname + for pname, p in sig.parameters.items() + if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) + ] + if not positional_args or ( + not all(pname in supported_args for pname in positional_args) + and len(positional_args) != 2 + ): + msg = ( + f"Invalid evaluator function. Must have at least one positional " + f"argument. Supported positional arguments are {supported_args}. Please " + f"see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" + # noqa: E501 + ) + raise ValueError(msg) + # For backwards compatibility we assume custom arg names are List[Run] and + # List[Example] types, respectively. 
+ elif not all( + pname in supported_args for pname in positional_args + ) or positional_args == ["runs", "example"]: + return func + else: + if inspect.iscoroutinefunction(func): + + async def awrapper( + runs: Sequence[Run], example: Optional[Example] + ) -> _COMPARISON_OUTPUT: + arg_map = { + "runs": runs, + "example": example, + "inputs": example.inputs if example else {}, + "outputs": [run.outputs or {} for run in runs], + "reference_outputs": example.outputs or {} if example else {}, + } + args = (arg_map[arg] for arg in positional_args) + return await func(*args) + + awrapper.__name__ = ( + getattr(func, "__name__") + if hasattr(func, "__name__") + else awrapper.__name__ + ) + return awrapper # type: ignore[return-value] + + else: + + def wrapper(runs: Sequence[Run], example: Example) -> _COMPARISON_OUTPUT: + arg_map = { + "runs": runs, + "example": example, + "inputs": example.inputs if example else {}, + "outputs": [run.outputs or {} for run in runs], + "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) return func(*args) diff --git a/python/tests/unit_tests/evaluation/test_evaluator.py b/python/tests/unit_tests/evaluation/test_evaluator.py index 09f1d7eb3..1fd4ee740 100644 --- a/python/tests/unit_tests/evaluation/test_evaluator.py +++ b/python/tests/unit_tests/evaluation/test_evaluator.py @@ -51,7 +51,7 @@ def sample_evaluator(run: Run, example: Optional[Example]) -> EvaluationResult: assert result.score == 1.0 -async def test_dynamie_comparison_run_evaluator(): +async def test_dynamic_comparison_run_evaluator(): def foo(runs: list, example): return ComparisonEvaluationResult(key="bar", scores={uuid.uuid4(): 3.1}) diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 7f45c22c7..020d9724c 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -22,6 +22,7 @@ from langsmith.evaluation._arunner import aevaluate, aevaluate_existing from langsmith.evaluation._runner import evaluate_existing from langsmith.evaluation.evaluator import ( + _normalize_comparison_evaluator_func, _normalize_evaluator_func, _normalize_summary_evaluator, ) @@ -58,7 +59,14 @@ def request(self, verb: str, endpoint: str, *args, **kwargs): response = MagicMock() response.json.return_value = res return response - + elif ( + endpoint + == f"http://localhost:1984/sessions/{self.created_session['id']}" + ): # type: ignore + res = self.created_session # type: ignore + response = MagicMock() + response.json.return_value = res + return response else: self.should_fail = True raise ValueError(f"Unknown endpoint: {endpoint}") @@ -94,6 +102,14 @@ def request(self, verb: str, endpoint: str, *args, **kwargs): response = MagicMock() response.json.return_value = {} return response + elif endpoint == "http://localhost:1984/datasets/comparative": + response = MagicMock() + self.created_comparative_experiment = json.loads(kwargs["data"]) | { + "tenant_id": self.tenant_id, + "modified_at": datetime.now(), + } + response.json.return_value = self.created_comparative_experiment + return response else: raise ValueError(f"Unknown endpoint: {endpoint}") @@ -303,7 +319,10 @@ def score_value(run, example): return {"score": 0.7} ex_results = evaluate_existing( - fake_request.created_session["name"], evaluators=[score_value], client=client + fake_request.created_session["name"], + evaluators=[score_value], + client=client, + blocking=blocking, ) 
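The comparison-evaluator normalization above admits a similar shorthand for pairwise evaluators: name `inputs`, `outputs`, and `reference_outputs` positionally and return one score per compared run; `_format_results` zips the list against the runs and keys it by the function name. A sketch (the `"response"` field is illustrative):

```python
def prefer_shorter(inputs: dict, outputs: list, reference_outputs: dict) -> list:
    # `outputs` holds one output dict per experiment being compared.
    lengths = [len(str(o.get("response", ""))) for o in outputs]
    best = min(range(len(lengths)), key=lengths.__getitem__)
    return [1 if i == best else 0 for i in range(len(lengths))]
```

Such a function can be handed to `evaluate_comparative` the same way as the `(runs, example)`-style examples in the docstring above.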
second_item = next(itertools.islice(iter(ex_results), 1, 2)) first_list = list(ex_results) @@ -669,3 +688,104 @@ def summary_eval_unknown_positional_args(runs, examples, foo): def test__normalize_summary_evaluator_invalid(evaluator: Callable) -> None: with pytest.raises(ValueError, match="Invalid evaluator function."): _normalize_summary_evaluator(evaluator) + + +def comparison_eval(runs, example): + return [len(r.outputs["response"]) for r in runs] + + +def comparison_eval_simple(inputs, outputs, reference_outputs): + return [len(o["response"]) for o in outputs] + + +def comparison_eval_no_inputs(outputs, reference_outputs): + return [min(len(o["response"]), len(reference_outputs["answer"])) for o in outputs] + + +@pytest.mark.parametrize( + "evaluator", + [comparison_eval, comparison_eval_simple, comparison_eval_no_inputs], +) +def test__normalize_comparison_evaluator(evaluator: Callable) -> None: + runs = [ + ls_schemas.Run( + name="foo", + start_time=datetime.now(), + run_type="chain", + id=uuid.uuid4(), + dotted_order="a", + outputs={"response": "c" * 2}, + ), + ls_schemas.Run( + name="foo", + start_time=datetime.now(), + run_type="chain", + id=uuid.uuid4(), + dotted_order="d", + outputs={"response": "e" * 3}, + ), + ] + example = ls_schemas.Example( + id=uuid.uuid4(), inputs={"in": "b"}, outputs={"answer": "f" * 4} + ) + normalized = _normalize_comparison_evaluator_func(evaluator) + assert normalized(runs, example) == [2, 3] + + +async def acomparison_eval(runs, example): + return [len(r.outputs["response"]) for r in runs] + + +async def acomparison_eval_simple(inputs, outputs, reference_outputs): + return [len(o["response"]) for o in outputs] + + +async def acomparison_eval_no_inputs(outputs, reference_outputs): + return [min(len(o["response"]), len(reference_outputs["answer"])) for o in outputs] + + +@pytest.mark.parametrize( + "evaluator", + [acomparison_eval, acomparison_eval_simple, acomparison_eval_no_inputs], +) +async def test__normalize_comparison_evaluator_async(evaluator: Callable) -> None: + runs = [ + ls_schemas.Run( + name="foo", + start_time=datetime.now(), + run_type="chain", + id=uuid.uuid4(), + dotted_order="a", + outputs={"response": "c" * 2}, + ), + ls_schemas.Run( + name="foo", + start_time=datetime.now(), + run_type="chain", + id=uuid.uuid4(), + dotted_order="d", + outputs={"response": "e" * 3}, + ), + ] + example = ls_schemas.Example( + id=uuid.uuid4(), inputs={"in": "b"}, outputs={"answer": "f" * 4} + ) + normalized = _normalize_comparison_evaluator_func(evaluator) + assert await normalized(runs, example) == [2, 3] + + +def comparison_eval_kwargs(*, runs, example): + return + + +def comparison_eval_unknown_positional_args(runs, example, foo): + return + + +@pytest.mark.parametrize( + "evaluator", + [comparison_eval_kwargs, comparison_eval_unknown_positional_args], +) +def test__normalize_comparison_evaluator_invalid(evaluator: Callable) -> None: + with pytest.raises(ValueError, match="Invalid evaluator function."): + _normalize_comparison_evaluator_func(evaluator) From c141d50ec4e74022c8a89602e94bfe38ba51c76b Mon Sep 17 00:00:00 2001 From: David <77736444+davidx33@users.noreply.github.com> Date: Fri, 22 Nov 2024 09:42:45 -0800 Subject: [PATCH 29/31] recompile pattern if needed before its used to mask sensitive info (#1248) when the processor is initialized, I recompile the pattern if necessary --- python/langsmith/anonymizer.py | 25 +++++++---- python/tests/unit_tests/test_anonymizer.py | 50 +++++++++++++++++++++- 2 files changed, 65 insertions(+), 10 
deletions(-) diff --git a/python/langsmith/anonymizer.py b/python/langsmith/anonymizer.py index 02954d460..e04ee0c37 100644 --- a/python/langsmith/anonymizer.py +++ b/python/langsmith/anonymizer.py @@ -90,7 +90,21 @@ class RuleNodeProcessor(StringNodeProcessor): def __init__(self, rules: List[StringNodeRule]): """Initialize the processor with a list of rules.""" - self.rules = rules + self.rules = [ + { + "pattern": ( + rule["pattern"] + if isinstance(rule["pattern"], re.Pattern) + else re.compile(rule["pattern"]) + ), + "replace": ( + rule["replace"] + if isinstance(rule.get("replace"), str) + else "[redacted]" + ), + } + for rule in rules + ] def mask_nodes(self, nodes: List[StringNode]) -> List[StringNode]: """Mask nodes using the rules.""" @@ -98,14 +112,7 @@ def mask_nodes(self, nodes: List[StringNode]) -> List[StringNode]: for item in nodes: new_value = item["value"] for rule in self.rules: - new_value = rule["pattern"].sub( - ( - rule["replace"] - if isinstance(rule["replace"], str) - else "[redacted]" - ), - new_value, - ) + new_value = rule["pattern"].sub(rule["replace"], new_value) if new_value != item["value"]: result.append(StringNode(value=new_value, path=item["path"])) return result diff --git a/python/tests/unit_tests/test_anonymizer.py b/python/tests/unit_tests/test_anonymizer.py index 147f46d1c..bd6284bf5 100644 --- a/python/tests/unit_tests/test_anonymizer.py +++ b/python/tests/unit_tests/test_anonymizer.py @@ -9,7 +9,7 @@ from pydantic import BaseModel from langsmith import Client, traceable, tracing_context -from langsmith.anonymizer import StringNodeRule, create_anonymizer +from langsmith.anonymizer import RuleNodeProcessor, StringNodeRule, create_anonymizer EMAIL_REGEX = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}") UUID_REGEX = re.compile( @@ -139,3 +139,51 @@ def my_func(body: str, from_: MyInput) -> MyOutput: if "inputs" in patched_data: assert patched_data["inputs"] == expected_inputs assert patched_data["outputs"] == expected_outputs + + +def test_rule_node_processor_scrub_sensitive_info(): + rules = [ + StringNodeRule(pattern=re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), replace="[ssn]"), + StringNodeRule( + pattern=re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"), + replace="[email]", + ), + StringNodeRule( + pattern=re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), replace="[phone]" + ), + ] + processor = RuleNodeProcessor(rules) + + nodes = [ + {"value": "My SSN is 123-45-6789.", "path": ["field1"]}, + {"value": "Contact me at john.doe@example.com.", "path": ["field2"]}, + {"value": "Call me on 123-456-7890.", "path": ["field3"]}, + ] + + expected = [ + {"value": "My SSN is [ssn].", "path": ["field1"]}, + {"value": "Contact me at [email].", "path": ["field2"]}, + {"value": "Call me on [phone].", "path": ["field3"]}, + ] + + result = processor.mask_nodes(nodes) + + assert result == expected + + +def test_rule_node_processor_default_replace(): + rules = [ + StringNodeRule(pattern=re.compile(r"sensitive")), + ] + processor = RuleNodeProcessor(rules) + + nodes = [ + {"value": "This contains sensitive data", "path": ["field1"]}, + ] + + expected = [ + {"value": "This contains [redacted] data", "path": ["field1"]}, + ] + + result = processor.mask_nodes(nodes) + assert result == expected From 6cf7c9b1e7e8fb64e46c88fe070d80c364f6c057 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Fri, 22 Nov 2024 13:51:12 -0500 Subject: [PATCH 30/31] python[patch]: `evaluate` local mode (#1224) Adds an `upload_results` 
flag to avoid tracing any runs (target or evaluator) or creating an experiment in langsmith: ```python from langsmith import evaluate results = evaluate( lambda x: x, data="Sample Dataset 3", evaluators=[lambda inputs: {"score": 1, "key": "correct"}], upload_results=False ) ``` --------- Co-authored-by: William Fu-Hinthorn <13333726+hinthornw@users.noreply.github.com> --- python/langsmith/evaluation/_arunner.py | 53 ++++--- python/langsmith/evaluation/_runner.py | 109 ++++++++------ python/langsmith/run_helpers.py | 29 ++-- python/langsmith/utils.py | 3 +- .../unit_tests/evaluation/test_runner.py | 133 +++++++++++------- python/tests/unit_tests/test_run_helpers.py | 76 +++++++--- 6 files changed, 254 insertions(+), 149 deletions(-) diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index f37901a4b..a8799b083 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -31,6 +31,7 @@ from langsmith import run_trees as rt from langsmith import utils as ls_utils from langsmith._internal import _aiter as aitertools +from langsmith._internal._beta_decorator import _warn_once from langsmith.evaluation._runner import ( AEVALUATOR_T, DATA_T, @@ -83,6 +84,7 @@ async def aevaluate( client: Optional[langsmith.Client] = None, blocking: bool = True, experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None, + upload_results: bool = True, ) -> AsyncExperimentResults: r"""Evaluate an async target system or function on a given dataset. @@ -241,6 +243,8 @@ async def aevaluate( ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... """ # noqa: E501 + if not upload_results: + _warn_once("'upload_results' parameter is in beta.") if experiment and experiment_prefix: raise ValueError( "Expected at most one of 'experiment' or 'experiment_prefix'," @@ -260,6 +264,7 @@ async def aevaluate( client=client, blocking=blocking, experiment=experiment, + upload_results=upload_results, ) @@ -379,6 +384,7 @@ async def _aevaluate( client: Optional[langsmith.Client] = None, blocking: bool = True, experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None, + upload_results: bool = True, ) -> AsyncExperimentResults: is_async_target = ( asyncio.iscoroutinefunction(target) @@ -401,6 +407,7 @@ async def _aevaluate( description=description, num_repetitions=num_repetitions, runs=runs, + upload_results=upload_results, ).astart() cache_dir = ls_utils.get_cache_dir(None) if cache_dir is not None: @@ -461,6 +468,7 @@ def __init__( summary_results: Optional[AsyncIterable[EvaluationResults]] = None, description: Optional[str] = None, num_repetitions: int = 1, + upload_results: bool = True, ): super().__init__( experiment=experiment, @@ -476,6 +484,7 @@ def __init__( self._evaluation_results = evaluation_results self._summary_results = summary_results self._num_repetitions = num_repetitions + self._upload_results = upload_results async def aget_examples(self) -> AsyncIterator[schemas.Example]: if self._examples is None: @@ -535,7 +544,7 @@ async def astart(self) -> _AsyncExperimentManager: "No examples found in the dataset." "Please ensure the data provided to aevaluate is not empty." 
) - project = self._get_project(first_example) + project = self._get_project(first_example) if self._upload_results else None self._print_experiment_start(project, first_example) self._metadata["num_repetitions"] = self._num_repetitions return self.__class__( @@ -545,6 +554,7 @@ async def astart(self) -> _AsyncExperimentManager: client=self.client, runs=self._runs, evaluation_results=self._evaluation_results, + upload_results=self._upload_results, ) async def awith_predictions( @@ -561,6 +571,7 @@ async def awith_predictions( metadata=self._metadata, client=self.client, runs=(pred["run"] async for pred in r2), + upload_results=self._upload_results, ) async def awith_evaluators( @@ -580,6 +591,7 @@ async def awith_evaluators( runs=(result["run"] async for result in r2), evaluation_results=(result["evaluation_results"] async for result in r3), summary_results=self._summary_results, + upload_results=self._upload_results, ) async def awith_summary_evaluators( @@ -596,6 +608,7 @@ async def awith_summary_evaluators( runs=self.aget_runs(), evaluation_results=self._evaluation_results, summary_results=aggregate_feedback_gen, + upload_results=self._upload_results, ) async def aget_results(self) -> AsyncIterator[ExperimentResultRow]: @@ -675,7 +688,7 @@ async def _arun_evaluators( **current_context, "project_name": "evaluators", "metadata": metadata, - "enabled": True, + "enabled": "local" if not self._upload_results else True, "client": self.client, } ): @@ -689,10 +702,12 @@ async def _arun_evaluators( example=example, ) eval_results["results"].extend( + self.client._select_eval_results(evaluator_response) + ) + if self._upload_results: self.client._log_evaluation_feedback( evaluator_response, run=run, _executor=executor ) - ) except Exception as e: try: feedback_keys = _extract_feedback_keys(evaluator) @@ -709,11 +724,12 @@ async def _arun_evaluators( ] ) eval_results["results"].extend( - # TODO: This is a hack + self.client._select_eval_results(error_response) + ) + if self._upload_results: self.client._log_evaluation_feedback( error_response, run=run, _executor=executor ) - ) except Exception as e2: logger.debug(f"Error parsing feedback keys: {e2}") pass @@ -744,7 +760,7 @@ async def _aapply_summary_evaluators( runs.append(run) examples.append(example) aggregate_feedback = [] - project_id = self._get_experiment().id + project_id = self._get_experiment().id if self._upload_results else None current_context = rh.get_tracing_context() metadata = { **(current_context["metadata"] or {}), @@ -758,7 +774,7 @@ async def _aapply_summary_evaluators( **current_context, "project_name": "evaluators", "metadata": metadata, - "enabled": True, + "enabled": "local" if not self._upload_results else True, "client": self.client, } ): @@ -770,16 +786,17 @@ async def _aapply_summary_evaluators( fn_name=evaluator.__name__, ) aggregate_feedback.extend(flattened_results) - for result in flattened_results: - feedback = result.dict(exclude={"target_run_id"}) - evaluator_info = feedback.pop("evaluator_info", None) - await aitertools.aio_to_thread( - self.client.create_feedback, - **feedback, - run_id=None, - project_id=project_id, - source_info=evaluator_info, - ) + if self._upload_results: + for result in flattened_results: + feedback = result.dict(exclude={"target_run_id"}) + evaluator_info = feedback.pop("evaluator_info", None) + await aitertools.aio_to_thread( + self.client.create_feedback, + **feedback, + run_id=None, + project_id=project_id, + source_info=evaluator_info, + ) except Exception as e: logger.error( 
f"Error running summary evaluator {repr(evaluator)}: {e}", @@ -815,6 +832,8 @@ async def _get_dataset_splits(self) -> Optional[list[str]]: return list(splits) async def _aend(self) -> None: + if not self._upload_results: + return experiment = self._experiment if experiment is None: raise ValueError("Experiment not started yet.") diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 556e697d4..2339601c6 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -44,6 +44,7 @@ from langsmith import run_trees as rt from langsmith import schemas from langsmith import utils as ls_utils +from langsmith._internal._beta_decorator import _warn_once from langsmith.evaluation.evaluator import ( SUMMARY_EVALUATOR_T, ComparisonEvaluationResult, @@ -103,6 +104,7 @@ def evaluate( client: Optional[langsmith.Client] = None, blocking: bool = True, experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None, + upload_results: bool = True, ) -> ExperimentResults: r"""Evaluate a target system or function on a given dataset. @@ -259,6 +261,8 @@ def evaluate( ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... """ # noqa: E501 + if not upload_results: + _warn_once("'upload_results' parameter is in beta.") if callable(target) and rh.is_async(target): raise ValueError( "Async functions are not supported by `evaluate`. " @@ -290,6 +294,7 @@ def evaluate( client=client, blocking=blocking, experiment=experiment, + upload_results=upload_results, ) @@ -898,6 +903,7 @@ def _evaluate( client: Optional[langsmith.Client] = None, blocking: bool = True, experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None, + upload_results: bool = True, ) -> ExperimentResults: # Initialize the experiment manager. client = client or rt.get_cached_client() @@ -918,6 +924,7 @@ def _evaluate( # If provided, we don't need to create a new experiment. runs=runs, # Create or resolve the experiment. 
+ upload_results=upload_results, ).start() cache_dir = ls_utils.get_cache_dir(None) cache_path = ( @@ -1104,9 +1111,9 @@ def _get_project(self, first_example: schemas.Example) -> schemas.TracerSession: return project def _print_experiment_start( - self, project: schemas.TracerSession, first_example: schemas.Example + self, project: Optional[schemas.TracerSession], first_example: schemas.Example ) -> None: - if project.url: + if project and project.url: # TODO: Make this a public API project_url = project.url.split("?")[0] dataset_id = first_example.dataset_id @@ -1162,6 +1169,7 @@ def __init__( summary_results: Optional[Iterable[EvaluationResults]] = None, description: Optional[str] = None, num_repetitions: int = 1, + upload_results: bool = True, ): super().__init__( experiment=experiment, @@ -1175,6 +1183,7 @@ def __init__( self._evaluation_results = evaluation_results self._summary_results = summary_results self._num_repetitions = num_repetitions + self._upload_results = upload_results @property def examples(self) -> Iterable[schemas.Example]: @@ -1215,7 +1224,7 @@ def runs(self) -> Iterable[schemas.Run]: def start(self) -> _ExperimentManager: first_example = next(itertools.islice(self.examples, 1)) - project = self._get_project(first_example) + project = self._get_project(first_example) if self._upload_results else None self._print_experiment_start(project, first_example) self._metadata["num_repetitions"] = self._num_repetitions return self.__class__( @@ -1225,6 +1234,7 @@ def start(self) -> _ExperimentManager: client=self.client, runs=self._runs, evaluation_results=self._evaluation_results, + upload_results=self._upload_results, ) def with_predictions( @@ -1245,6 +1255,7 @@ def with_predictions( metadata=self._metadata, client=self.client, runs=(pred["run"] for pred in r2), + upload_results=self._upload_results, # TODO: Can't do multiple prediction rounds rn. 
) @@ -1276,6 +1287,7 @@ def with_evaluators( runs=(result["run"] for result in r2), evaluation_results=(result["evaluation_results"] for result in r3), summary_results=self._summary_results, + upload_results=self._upload_results, ) def with_summary_evaluators( @@ -1296,6 +1308,7 @@ def with_summary_evaluators( runs=self.runs, evaluation_results=self._evaluation_results, summary_results=aggregate_feedback_gen, + upload_results=self._upload_results, ) def get_results(self) -> Iterable[ExperimentResultRow]: @@ -1332,7 +1345,12 @@ def _predict( if max_concurrency == 0: for example in self.examples: yield _forward( - fn, example, self.experiment_name, self._metadata, self.client + fn, + example, + self.experiment_name, + self._metadata, + self.client, + self._upload_results, ) else: @@ -1345,6 +1363,7 @@ def _predict( self.experiment_name, self._metadata, self.client, + self._upload_results, ) for example in self.examples ] @@ -1373,7 +1392,7 @@ def _run_evaluators( **current_context, "project_name": "evaluators", "metadata": metadata, - "enabled": True, + "enabled": "local" if not self._upload_results else True, "client": self.client, } ): @@ -1386,12 +1405,15 @@ def _run_evaluators( run=run, example=example, ) + eval_results["results"].extend( + self.client._select_eval_results(evaluator_response) + ) + if self._upload_results: # TODO: This is a hack self.client._log_evaluation_feedback( evaluator_response, run=run, _executor=executor ) - ) except Exception as e: try: feedback_keys = _extract_feedback_keys(evaluator) @@ -1408,17 +1430,19 @@ def _run_evaluators( ] ) eval_results["results"].extend( + self.client._select_eval_results(error_response) + ) + if self._upload_results: # TODO: This is a hack self.client._log_evaluation_feedback( error_response, run=run, _executor=executor ) - ) except Exception as e2: logger.debug(f"Error parsing feedback keys: {e2}") pass logger.error( f"Error running evaluator {repr(evaluator)} on" - f" run {run.id}: {repr(e)}", + f" run {run.id if run else ''}: {repr(e)}", exc_info=True, ) return ExperimentResultRow( @@ -1447,7 +1471,7 @@ def _score( self._run_evaluators, evaluators, current_results, - executor=executor, + executor, ) else: futures = set() @@ -1457,7 +1481,7 @@ def _score( self._run_evaluators, evaluators, current_results, - executor=executor, + executor, ) ) try: @@ -1481,7 +1505,7 @@ def _apply_summary_evaluators( examples.append(example) aggregate_feedback = [] with ls_utils.ContextThreadPoolExecutor() as executor: - project_id = self._get_experiment().id + project_id = self._get_experiment().id if self._upload_results else None current_context = rh.get_tracing_context() metadata = { **(current_context["metadata"] or {}), @@ -1496,7 +1520,7 @@ def _apply_summary_evaluators( "project_name": "evaluators", "metadata": metadata, "client": self.client, - "enabled": True, + "enabled": "local" if not self._upload_results else True, } ): for evaluator in summary_evaluators: @@ -1508,16 +1532,17 @@ def _apply_summary_evaluators( fn_name=evaluator.__name__, ) aggregate_feedback.extend(flattened_results) - for result in flattened_results: - feedback = result.dict(exclude={"target_run_id"}) - evaluator_info = feedback.pop("evaluator_info", None) - executor.submit( - self.client.create_feedback, - **feedback, - run_id=None, - project_id=project_id, - source_info=evaluator_info, - ) + if self._upload_results: + for result in flattened_results: + feedback = result.dict(exclude={"target_run_id"}) + evaluator_info = feedback.pop("evaluator_info", None) + 
executor.submit( + self.client.create_feedback, + **feedback, + run_id=None, + project_id=project_id, + source_info=evaluator_info, + ) except Exception as e: logger.error( f"Error running summary evaluator {repr(evaluator)}: {e}", @@ -1551,6 +1576,8 @@ def _get_dataset_splits(self) -> Optional[list[str]]: return list(splits) def _end(self) -> None: + if not self._upload_results: + return experiment = self._experiment if experiment is None: raise ValueError("Experiment not started yet.") @@ -1619,6 +1646,7 @@ def _forward( experiment_name: str, metadata: dict, client: langsmith.Client, + upload_results: bool, ) -> _ForwardResults: run: Optional[schemas.RunBase] = None @@ -1626,33 +1654,26 @@ def _get_run(r: rt.RunTree) -> None: nonlocal run run = r - with rh.tracing_context(enabled=True): + with rh.tracing_context(enabled="local" if not upload_results else True): + example_version = ( + example.modified_at.isoformat() + if example.modified_at + else example.created_at.isoformat() + ) + langsmith_extra = rh.LangSmithExtra( + reference_example_id=example.id, + on_end=_get_run, + project_name=experiment_name, + metadata={**metadata, "example_version": example_version}, + client=client, + ) try: - fn( - example.inputs, - langsmith_extra=rh.LangSmithExtra( - reference_example_id=example.id, - on_end=_get_run, - project_name=experiment_name, - metadata={ - **metadata, - "example_version": ( - example.modified_at.isoformat() - if example.modified_at - else example.created_at.isoformat() - ), - }, - client=client, - ), - ) + fn(example.inputs, langsmith_extra=langsmith_extra) except Exception as e: logger.error( f"Error running target function: {e}", exc_info=True, stacklevel=1 ) - return _ForwardResults( - run=cast(schemas.Run, run), - example=example, - ) + return _ForwardResults(run=cast(schemas.Run, run), example=example) def _resolve_data( diff --git a/python/langsmith/run_helpers.py b/python/langsmith/run_helpers.py index 7510b75ee..80be362c0 100644 --- a/python/langsmith/run_helpers.py +++ b/python/langsmith/run_helpers.py @@ -24,6 +24,7 @@ Generic, Iterator, List, + Literal, Mapping, Optional, Protocol, @@ -58,7 +59,9 @@ _PROJECT_NAME = contextvars.ContextVar[Optional[str]]("_PROJECT_NAME", default=None) _TAGS = contextvars.ContextVar[Optional[List[str]]]("_TAGS", default=None) _METADATA = contextvars.ContextVar[Optional[Dict[str, Any]]]("_METADATA", default=None) -_TRACING_ENABLED = contextvars.ContextVar[Optional[bool]]( + + +_TRACING_ENABLED = contextvars.ContextVar[Optional[Union[bool, Literal["local"]]]]( "_TRACING_ENABLED", default=None ) _CLIENT = contextvars.ContextVar[Optional[ls_client.Client]]("_CLIENT", default=None) @@ -100,7 +103,7 @@ def tracing_context( tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Any]] = None, parent: Optional[Union[run_trees.RunTree, Mapping, str]] = None, - enabled: Optional[bool] = None, + enabled: Optional[Union[bool, Literal["local"]]] = None, client: Optional[ls_client.Client] = None, **kwargs: Any, ) -> Generator[None, None, None]: @@ -935,8 +938,9 @@ def _setup(self) -> run_trees.RunTree: attachments=self.attachments or {}, ) - if enabled: + if enabled is True: self.new_run.post() + if enabled: _TAGS.set(tags_) _METADATA.set(metadata) _PARENT_RUN_TREE.set(self.new_run) @@ -974,7 +978,7 @@ def _teardown( self.new_run.end(error=tb) if self.old_ctx is not None: enabled = utils.tracing_is_enabled(self.old_ctx) - if enabled: + if enabled is True: self.new_run.patch() _set_tracing_context(self.old_ctx) @@ -1218,7 +1222,7 @@ def 
_container_end( """End the run.""" run_tree = container.get("new_run") if run_tree is None: - # Tracing enabled + # Tracing not enabled return outputs_ = outputs if isinstance(outputs, dict) else {"output": outputs} error_ = None @@ -1226,7 +1230,8 @@ def _container_end( stacktrace = utils._format_exc() error_ = f"{repr(error)}\n\n{stacktrace}" run_tree.end(outputs=outputs_, error=error_) - run_tree.patch() + if utils.tracing_is_enabled() is True: + run_tree.patch() on_end = container.get("on_end") if on_end is not None and callable(on_end): try: @@ -1328,7 +1333,8 @@ def _setup_run( id_ = langsmith_extra.get("run_id") if not parent_run_ and not utils.tracing_is_enabled(): utils.log_once( - logging.DEBUG, "LangSmith tracing is enabled, returning original function." + logging.DEBUG, + "LangSmith tracing is not enabled, returning original function.", ) return _TraceableContainer( new_run=None, @@ -1410,10 +1416,11 @@ def _setup_run( client=client_, # type: ignore attachments=attachments, ) - try: - new_run.post() - except BaseException as e: - LOGGER.error(f"Failed to post run {new_run.id}: {e}") + if utils.tracing_is_enabled() is True: + try: + new_run.post() + except BaseException as e: + LOGGER.error(f"Failed to post run {new_run.id}: {e}") response_container = _TraceableContainer( new_run=new_run, project_name=selected_project, diff --git a/python/langsmith/utils.py b/python/langsmith/utils.py index f7c257d1f..3e8956d1a 100644 --- a/python/langsmith/utils.py +++ b/python/langsmith/utils.py @@ -24,6 +24,7 @@ Iterable, Iterator, List, + Literal, Mapping, Optional, Sequence, @@ -91,7 +92,7 @@ class LangSmithMissingAPIKeyWarning(LangSmithWarning): """Warning for missing API key.""" -def tracing_is_enabled(ctx: Optional[dict] = None) -> bool: +def tracing_is_enabled(ctx: Optional[dict] = None) -> Union[bool, Literal["local"]]: """Return True if tracing is enabled.""" from langsmith.run_helpers import get_current_run_tree, get_tracing_context diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 020d9724c..72dee2128 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -153,7 +153,10 @@ def _create_example(idx: int) -> ls_schemas.Example: @pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher") @pytest.mark.parametrize("blocking", [False, True]) @pytest.mark.parametrize("as_runnable", [False, True]) -def test_evaluate_results(blocking: bool, as_runnable: bool) -> None: +@pytest.mark.parametrize("upload_results", [False, True]) +def test_evaluate_results( + blocking: bool, as_runnable: bool, upload_results: bool +) -> None: session = mock.Mock() ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" @@ -272,6 +275,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs): summary_evaluators=summary_evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, + upload_results=upload_results, ) if not blocking: deltas = [] @@ -303,40 +307,45 @@ def summary_eval_outputs_reference(outputs, reference_outputs): ) assert len(results._summary_results["results"]) == len(summary_evaluators) - assert fake_request.created_session - _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == (N_PREDS * (len(evaluators) + 1))) - _wait_until(lambda: slow_index is not None) - # Want it to be interleaved - assert ordering_of_stuff[:N_PREDS] != ["predict"] * 
N_PREDS + if upload_results: + assert fake_request.created_session + _wait_until(lambda: fake_request.runs) + _wait_until(lambda: len(ordering_of_stuff) == (N_PREDS * (len(evaluators) + 1))) + _wait_until(lambda: slow_index is not None) + # Want it to be interleaved + assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS + else: + assert not fake_request.created_session # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval assert slow_index == (len(evaluators) + 1) * (N_PREDS - 1) - def score_value(run, example): - return {"score": 0.7} + if upload_results: - ex_results = evaluate_existing( - fake_request.created_session["name"], - evaluators=[score_value], - client=client, - blocking=blocking, - ) - second_item = next(itertools.islice(iter(ex_results), 1, 2)) - first_list = list(ex_results) - second_list = list(ex_results) - second_item_after = next(itertools.islice(iter(ex_results), 1, 2)) - assert len(first_list) == len(second_list) == SPLIT_SIZE * NUM_REPETITIONS - assert first_list == second_list - assert second_item == second_item_after - dev_xample_ids = [e.id for e in dev_split] - for r in ex_results: - assert r["example"].id in dev_xample_ids - assert r["evaluation_results"]["results"][0].score == 0.7 - assert r["run"].reference_example_id in dev_xample_ids - assert not fake_request.should_fail + def score_value(run, example): + return {"score": 0.7} + + ex_results = evaluate_existing( + fake_request.created_session["name"], + evaluators=[score_value], + client=client, + blocking=blocking, + ) + second_item = next(itertools.islice(iter(ex_results), 1, 2)) + first_list = list(ex_results) + second_list = list(ex_results) + second_item_after = next(itertools.islice(iter(ex_results), 1, 2)) + assert len(first_list) == len(second_list) == SPLIT_SIZE * NUM_REPETITIONS + assert first_list == second_list + assert second_item == second_item_after + dev_xample_ids = [e.id for e in dev_split] + for r in ex_results: + assert r["example"].id in dev_xample_ids + assert r["evaluation_results"]["results"][0].score == 0.7 + assert r["run"].reference_example_id in dev_xample_ids + assert not fake_request.should_fail # Returning list of non-dicts not supported. 
def bad_eval_list(run, example): @@ -405,7 +414,10 @@ async def my_other_func(inputs: dict, other_val: int): @pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher") @pytest.mark.parametrize("blocking", [False, True]) @pytest.mark.parametrize("as_runnable", [False, True]) -async def test_aevaluate_results(blocking: bool, as_runnable: bool) -> None: +@pytest.mark.parametrize("upload_results", [False, True]) +async def test_aevaluate_results( + blocking: bool, as_runnable: bool, upload_results: bool +) -> None: session = mock.Mock() ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" @@ -527,6 +539,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs): summary_evaluators=summary_evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, + upload_results=upload_results, ) if not blocking: deltas = [] @@ -562,36 +575,44 @@ def summary_eval_outputs_reference(outputs, reference_outputs): ) assert len(results._summary_results["results"]) == len(summary_evaluators) - assert fake_request.created_session - _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * (len(evaluators) + 1)) - _wait_until(lambda: slow_index is not None) - # Want it to be interleaved - assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS - assert slow_index is not None - # It's delayed, so it'll be the penultimate event - # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS - 1) * (len(evaluators) + 1) - assert fake_request.created_session["name"] + if upload_results: + assert fake_request.created_session + _wait_until(lambda: fake_request.runs) + _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * (len(evaluators) + 1)) + _wait_until(lambda: slow_index is not None) + # Want it to be interleaved + assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS + assert slow_index is not None + # It's delayed, so it'll be the penultimate event + # Will run all other preds and evals, then this, then the last eval + assert slow_index == (N_PREDS - 1) * (len(evaluators) + 1) + + assert fake_request.created_session["name"] + else: + assert not fake_request.created_session async def score_value(run, example): return {"score": 0.7} - ex_results = await aevaluate_existing( - fake_request.created_session["name"], evaluators=[score_value], client=client - ) - all_results = [r async for r in ex_results] - assert len(all_results) == SPLIT_SIZE * NUM_REPETITIONS - dev_xample_ids = [e.id for e in dev_split] - async for r in ex_results: - assert r["example"].id in dev_xample_ids - assert r["evaluation_results"]["results"][0].score == 0.7 - assert r["run"].reference_example_id in dev_xample_ids - assert not fake_request.should_fail - # Returning list of non-dicts not supported. + if upload_results: + ex_results = await aevaluate_existing( + fake_request.created_session["name"], + evaluators=[score_value], + client=client, + blocking=blocking, + ) + all_results = [r async for r in ex_results] + assert len(all_results) == SPLIT_SIZE * NUM_REPETITIONS + dev_xample_ids = [e.id for e in dev_split] + async for r in ex_results: + assert r["example"].id in dev_xample_ids + assert r["evaluation_results"]["results"][0].score == 0.7 + assert r["run"].reference_example_id in dev_xample_ids + assert not fake_request.should_fail + # Returning list of non-dicts not supported. 
async def bad_eval_list(run, example): ordering_of_stuff.append("evaluate") return ["foo", 1] @@ -603,6 +624,7 @@ async def bad_eval_list(run, example): evaluators=[bad_eval_list], num_repetitions=NUM_REPETITIONS, blocking=blocking, + upload_results=upload_results, ) async for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} @@ -628,7 +650,12 @@ async def atarget(x): with pytest.raises(ValueError, match="Invalid evaluator function."): await aevaluate( - atarget, data=ds_examples, evaluators=[eval_], client=client + atarget, + data=ds_examples, + evaluators=[eval_], + client=client, + upload_results=upload_results, + blocking=blocking, ) diff --git a/python/tests/unit_tests/test_run_helpers.py b/python/tests/unit_tests/test_run_helpers.py index 34df400e7..2c5658a62 100644 --- a/python/tests/unit_tests/test_run_helpers.py +++ b/python/tests/unit_tests/test_run_helpers.py @@ -7,12 +7,22 @@ import time import uuid import warnings -from typing import Any, AsyncGenerator, Generator, List, Optional, Set, Tuple, cast +from typing import ( + Any, + AsyncGenerator, + Generator, + List, + Optional, + Set, + Tuple, + Union, + cast, +) from unittest.mock import MagicMock, patch import pytest from requests_toolbelt import MultipartEncoder -from typing_extensions import Annotated +from typing_extensions import Annotated, Literal import langsmith from langsmith import Client @@ -913,7 +923,8 @@ def _get_run(r: RunTree) -> None: assert len(child_runs[2].child_runs) == 1 # type: ignore -def test_traceable_regular(): +@pytest.mark.parametrize("enabled", [True, "local"]) +def test_traceable_regular(enabled: Union[bool, Literal["local"]]): @traceable def some_sync_func(query: str, **kwargs: Any) -> list: assert kwargs == {"a": 1, "b": 2} @@ -962,7 +973,7 @@ def _get_run(r: RunTree) -> None: run = r mock_client_ = _get_mock_client() - with tracing_context(enabled=True): + with tracing_context(enabled=enabled): all_chunks = my_answer( "some_query", langsmith_extra={"on_end": _get_run, "client": mock_client_} ) @@ -988,9 +999,12 @@ def _get_run(r: RunTree) -> None: "summarize_answers", ] assert len(child_runs[2].child_runs) == 1 # type: ignore + mock_calls = _get_calls(mock_client_) + assert len(mock_calls) == (0 if enabled == "local" else 1) -async def test_traceable_async(): +@pytest.mark.parametrize("enabled", [True, "local"]) +async def test_traceable_async(enabled: Union[bool, Literal["local"]]): @traceable def some_sync_func(query: str) -> list: return [query, query] @@ -1045,7 +1059,7 @@ def _get_run(r: RunTree) -> None: run = r mock_client_ = _get_mock_client() - with tracing_context(enabled=True): + with tracing_context(enabled=enabled): all_chunks = await my_answer( "some_query", langsmith_extra={"on_end": _get_run, "client": mock_client_} ) @@ -1071,9 +1085,12 @@ def _get_run(r: RunTree) -> None: "summarize_answers", ] assert len(child_runs[2].child_runs) == 1 # type: ignore + mock_calls = _get_calls(mock_client_) + assert len(mock_calls) == (0 if enabled == "local" else 1) -def test_traceable_to_trace(): +@pytest.mark.parametrize("enabled", [True, "local"]) +def test_traceable_to_trace(enabled: Union[bool, Literal["local"]]): @traceable def parent_fn(a: int, b: int) -> int: with langsmith.trace(name="child_fn", inputs={"a": a, "b": b}) as run_tree: @@ -1087,9 +1104,10 @@ def _get_run(r: RunTree) -> None: nonlocal run run = r - with tracing_context(enabled=True): + mock_client_ = _get_mock_client() + with tracing_context(enabled=enabled): result = parent_fn( - 1, 2, 
langsmith_extra={"on_end": _get_run, "client": _get_mock_client()} + 1, 2, langsmith_extra={"on_end": _get_run, "client": mock_client_} ) assert result == 3 @@ -1103,9 +1121,12 @@ def _get_run(r: RunTree) -> None: assert len(child_runs) == 1 assert child_runs[0].name == "child_fn" assert child_runs[0].inputs == {"a": 1, "b": 2} + mock_calls = _get_calls(mock_client_) + assert len(mock_calls) == (0 if enabled == "local" else 1) -async def test_traceable_to_atrace(): +@pytest.mark.parametrize("enabled", [True, "local"]) +async def test_traceable_to_atrace(enabled: Union[bool, Literal["local"]]): @traceable async def great_grandchild_fn(a: int, b: int) -> int: return a + b @@ -1134,9 +1155,10 @@ def _get_run(r: RunTree) -> None: nonlocal run run = r - with tracing_context(enabled=True): + mock_client_ = _get_mock_client() + with tracing_context(enabled=enabled): result = await parent_fn( - 1, 2, langsmith_extra={"on_end": _get_run, "client": _get_mock_client()} + 1, 2, langsmith_extra={"on_end": _get_run, "client": mock_client_} ) assert result == 3 @@ -1162,15 +1184,18 @@ def _get_run(r: RunTree) -> None: ggc = grandchild.child_runs[1] assert ggc.name == "great_grandchild_fn" assert ggc.inputs == {"a": 1, "b": 2} + mock_calls = _get_calls(mock_client_) + assert len(mock_calls) == (0 if enabled == "local" else 1) -def test_trace_to_traceable(): +@pytest.mark.parametrize("enabled", [True, "local"]) +def test_trace_to_traceable(enabled: Union[bool, Literal["local"]]): @traceable def child_fn(a: int, b: int) -> int: return a + b mock_client_ = _get_mock_client() - with tracing_context(enabled=True): + with tracing_context(enabled=enabled): rid = uuid.uuid4() with langsmith.trace( name="parent_fn", inputs={"a": 1, "b": 2}, client=mock_client_, run_id=rid @@ -1190,7 +1215,7 @@ def child_fn(a: int, b: int) -> int: assert child_runs[0].inputs == {"a": 1, "b": 2} -def test_client_passed_when_traceable_parent(): +def test_client_not_passed_when_traceable_parent(): mock_client = _get_mock_client() rt = RunTree(name="foo", client=mock_client) headers = rt.to_headers() @@ -1201,14 +1226,7 @@ def my_run(foo: str): my_run(foo="bar", langsmith_extra={"parent": headers, "client": mock_client}) mock_calls = _get_calls(mock_client) - assert len(mock_calls) == 1 - call = mock_client.session.request.call_args - assert call.args[0] == "POST" - assert call.args[1].startswith("https://api.smith.langchain.com") - body = json.loads(call.kwargs["data"]) - assert body["post"] - assert body["post"][0]["inputs"] == {"foo": "bar"} - assert body["post"][0]["outputs"] == {"baz": "buzz"} + assert len(mock_calls) == 0 def test_client_passed_when_trace_parent(): @@ -1231,6 +1249,18 @@ def test_client_passed_when_trace_parent(): assert body["post"][0]["outputs"] == {"bar": "baz"} +def test_client_not_called_when_enabled_local(): + mock_client = _get_mock_client() + headers = RunTree(name="foo", client=mock_client).to_headers() + with tracing_context(enabled="local"): + with trace( + name="foo", inputs={"foo": "bar"}, parent=headers, client=mock_client + ) as rt: + rt.outputs["bar"] = "baz" + calls = _get_calls(mock_client) + assert len(calls) == 0 + + def test_from_runnable_config(): try: from langchain_core.tools import tool # type: ignore From c3053e63d0905b562066f2fcd0d24cb9d69d1f93 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Fri, 22 Nov 2024 15:06:13 -0800 Subject: [PATCH 31/31] Refrain from overwriting experiment metadata (#1250) --- python/langsmith/evaluation/_arunner.py 
| 8 ++++++-- python/langsmith/evaluation/_runner.py | 8 ++++++-- python/pyproject.toml | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index a8799b083..073014970 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -843,8 +843,12 @@ async def _aend(self) -> None: project_metadata["dataset_splits"] = await self._get_dataset_splits() self.client.update_project( experiment.id, - end_time=datetime.datetime.now(datetime.timezone.utc), - metadata=project_metadata, + end_time=experiment.end_time + or datetime.datetime.now(datetime.timezone.utc), + metadata={ + **experiment.metadata, + **project_metadata, + }, ) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 2339601c6..f4e2076ec 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1587,8 +1587,12 @@ def _end(self) -> None: project_metadata["dataset_splits"] = self._get_dataset_splits() self.client.update_project( experiment.id, - end_time=datetime.datetime.now(datetime.timezone.utc), - metadata=project_metadata, + end_time=experiment.end_time + or datetime.datetime.now(datetime.timezone.utc), + metadata={ + **experiment.metadata, + **project_metadata, + }, ) diff --git a/python/pyproject.toml b/python/pyproject.toml index d6425e255..12e89ba74 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langsmith" -version = "0.1.144" +version = "0.1.145" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." authors = ["LangChain "] license = "MIT"
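
A minimal usage sketch of the two opt-out paths this series adds, assuming the patched SDK is installed: tracing_context(enabled="local") builds run trees but skips the client post()/patch() calls that are now guarded by utils.tracing_is_enabled() is True, while evaluate(..., upload_results=False) runs the target and evaluators without creating an experiment session, as the new upload_results parametrization in test_runner.py exercises. The dataset name and lambda evaluator below are placeholders, not values from the patches.

    from langsmith import traceable
    from langsmith.evaluation import evaluate
    from langsmith.run_helpers import tracing_context

    @traceable
    def pipeline(query: str) -> str:
        # Stand-in target; any traceable callable works here.
        return query.upper()

    # "local" keeps the run tree in memory (parent/child structure and
    # on_end callbacks still fire) but sends nothing to the LangSmith API.
    with tracing_context(enabled="local"):
        pipeline("hello")

    # upload_results=False evaluates without creating a tracer session.
    # "my-dataset" and the evaluator are placeholders, so the call is left
    # commented out here.
    # results = evaluate(
    #     pipeline,
    #     data="my-dataset",
    #     evaluators=[lambda run, example: {"key": "ok", "score": 1.0}],
    #     upload_results=False,
    # )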
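
The final patch's merge-don't-overwrite behavior in _end()/_aend() can be read as the plain dict-spread below: an end_time already recorded on the experiment wins, and previously stored metadata keys survive alongside the freshly computed project metadata. The sample values are illustrative only.

    import datetime

    existing_end_time = None                          # illustrative prior state
    existing_metadata = {"revision_id": "abc123"}     # illustrative prior state
    computed_metadata = {"dataset_splits": ["base"]}  # freshly computed

    end_time = existing_end_time or datetime.datetime.now(datetime.timezone.utc)
    merged_metadata = {**existing_metadata, **computed_metadata}
    # merged_metadata == {"revision_id": "abc123", "dataset_splits": ["base"]}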