LangSmith 0.2.x #1247

Merged: 24 commits, Dec 5, 2024
Changes shown from 16 commits
2 changes: 1 addition & 1 deletion js/package.json
@@ -1,6 +1,6 @@
{

GitHub Actions / benchmark: Benchmark results

create_5_000_run_trees: Mean +- std dev: 683 ms +- 114 ms (warning: result may be unstable; std dev is 17% of the mean)
create_10_000_run_trees: Mean +- std dev: 1.32 sec +- 0.16 sec (warning: result may be unstable; std dev is 12% of the mean)
create_20_000_run_trees: Mean +- std dev: 1.41 sec +- 0.18 sec (warning: result may be unstable; std dev is 13% of the mean)
dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 705 us +- 9 us
dumps_class_nested_py_leaf_50x100: Mean +- std dev: 25.4 ms +- 1.8 ms
dumps_class_nested_py_leaf_100x200: Mean +- std dev: 104 ms +- 2 ms
dumps_dataclass_nested_50x100: Mean +- std dev: 25.2 ms +- 0.3 ms
dumps_pydantic_nested_50x100: Mean +- std dev: 70.3 ms +- 15.6 ms (warning: result may be unstable; std dev is 22% of the mean)
dumps_pydanticv1_nested_50x100: Mean +- std dev: 196 ms +- 3 ms

For the unstable results, pyperf suggests rerunning with more runs, values and/or loops, running `python -m pyperf system tune` to reduce system jitter, using pyperf stats, pyperf dump and pyperf hist to analyze results, and --quiet to hide these warnings.

GitHub Actions / benchmark: Comparison against main

| Benchmark                                     | main     | changes                |
|-----------------------------------------------|----------|------------------------|
| dumps_pydanticv1_nested_50x100                | 223 ms   | 196 ms: 1.14x faster   |
| create_5_000_run_trees                        | 731 ms   | 683 ms: 1.07x faster   |
| create_10_000_run_trees                       | 1.41 sec | 1.32 sec: 1.07x faster |
| dumps_dataclass_nested_50x100                 | 25.6 ms  | 25.2 ms: 1.02x faster  |
| dumps_class_nested_py_leaf_100x200            | 105 ms   | 104 ms: 1.01x faster   |
| dumps_class_nested_py_leaf_50x100             | 25.4 ms  | 25.4 ms: 1.00x slower  |
| dumps_class_nested_py_branch_and_leaf_200x400 | 704 us   | 705 us: 1.00x slower   |
| create_20_000_run_trees                       | 1.41 sec | 1.41 sec: 1.00x slower |
| dumps_pydantic_nested_50x100                  | 68.3 ms  | 70.3 ms: 1.03x slower  |
| Geometric mean                                | (ref)    | 1.03x faster           |
"name": "langsmith",
"version": "0.2.8",
"version": "0.2.9",
"description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.",
"packageManager": "[email protected]",
"files": [
2 changes: 1 addition & 1 deletion js/src/index.ts
@@ -18,4 +18,4 @@ export { RunTree, type RunTreeConfig } from "./run_trees.js";
export { overrideFetchImplementation } from "./singletons/fetch.js";

// Update using yarn bump-version
export const __version__ = "0.2.8";
export const __version__ = "0.2.9";
16 changes: 9 additions & 7 deletions python/langsmith/client.py
@@ -5825,7 +5825,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[EXPERIMENT_T] = None,
@@ -5844,7 +5844,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[EXPERIMENT_T] = None,
@@ -5866,7 +5866,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[EXPERIMENT_T] = None,
@@ -5894,7 +5894,8 @@ def evaluate(
Defaults to None.
description (str | None): A free-form text description for the experiment.
max_concurrency (int | None): The maximum number of concurrent
evaluations to run. Defaults to None (max number of workers).
evaluations to run. If None then no limit is set. If 0 then no concurrency.
Defaults to 0.
blocking (bool): Whether to block until the evaluation is complete.
Defaults to True.
num_repetitions (int): The number of times to run the evaluation.
@@ -6077,7 +6078,7 @@ async def aevaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
@@ -6102,8 +6103,9 @@ async def aevaluate(
experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
Defaults to None.
description (Optional[str]): A description of the experiment.
max_concurrency (Optional[int]): The maximum number of concurrent
evaluations to run. Defaults to None.
max_concurrency (int | None): The maximum number of concurrent
evaluations to run. If None then no limit is set. If 0 then no concurrency.
Defaults to 0.
num_repetitions (int): The number of times to run the evaluation.
Each item in the dataset will be run and evaluated this many times.
Defaults to 1.
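A minimal usage sketch of the new `max_concurrency` default on `Client.evaluate` (the dataset name and target below are hypothetical; the evaluator shape mirrors this PR's unit tests):

```python
from langsmith import Client

client = Client()

def target(inputs: dict) -> dict:
    # Hypothetical target function; swap in your own chain or model call.
    return {"answer": "42"}

def score_value(run, example):
    # Evaluator shape as used in this PR's unit tests.
    return {"score": 0.7}

# New 0.2.x default (max_concurrency=0): evaluations run with no concurrency.
results = client.evaluate(target, data="my-dataset", evaluators=[score_value])

# Pass None to remove the limit, or a positive integer to cap concurrency.
results = client.evaluate(
    target,
    data="my-dataset",
    evaluators=[score_value],
    max_concurrency=4,
)
```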
13 changes: 8 additions & 5 deletions python/langsmith/evaluation/_arunner.py
@@ -84,7 +84,7 @@ async def aevaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -110,8 +110,9 @@
experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
Defaults to None.
description (Optional[str]): A description of the experiment.
max_concurrency (Optional[int]): The maximum number of concurrent
evaluations to run. Defaults to None.
max_concurrency (int | None): The maximum number of concurrent
evaluations to run. If None then no limit is set. If 0 then no concurrency.
Defaults to 0.
num_repetitions (int): The number of times to run the evaluation.
Each item in the dataset will be run and evaluated this many times.
Defaults to 1.
@@ -332,7 +333,7 @@ async def aevaluate_existing(
evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
client: Optional[langsmith.Client] = None,
load_nested: bool = False,
blocking: bool = True,
@@ -345,7 +346,9 @@
summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): Optional sequence of evaluators
to apply over the entire dataset.
metadata (Optional[dict]): Optional metadata to include in the evaluation results.
max_concurrency (Optional[int]): Optional maximum number of concurrent evaluations.
max_concurrency (int | None): The maximum number of concurrent
evaluations to run. If None then no limit is set. If 0 then no concurrency.
Defaults to 0.
client (Optional[langsmith.Client]): Optional Langsmith client to use for evaluation.
load_nested: Whether to load all child runs for the experiment.
Default is to only load the top-level root runs.
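The async entry point follows the same convention; a short sketch under the same assumptions (hypothetical dataset name and target):

```python
import asyncio

from langsmith import aevaluate

async def predict(inputs: dict) -> dict:
    # Hypothetical async target.
    return {"answer": "42"}

def score_value(run, example):
    return {"score": 0.7}

async def main():
    # Default max_concurrency=0: evaluations run with no concurrency.
    await aevaluate(predict, data="my-dataset", evaluators=[score_value])
    # max_concurrency=None removes the limit (the previous default behaviour).
    await aevaluate(
        predict, data="my-dataset", evaluators=[score_value], max_concurrency=None
    )

asyncio.run(main())
```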
33 changes: 23 additions & 10 deletions python/langsmith/evaluation/_runner.py
@@ -101,7 +101,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -121,7 +121,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -142,7 +142,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -171,7 +171,8 @@ def evaluate(
Defaults to None.
description (str | None): A free-form text description for the experiment.
max_concurrency (int | None): The maximum number of concurrent
evaluations to run. Defaults to None (max number of workers).
evaluations to run. If None then no limit is set. If 0 then no concurrency.
Defaults to 0.
client (langsmith.Client | None): The LangSmith client to use.
Defaults to None.
blocking (bool): Whether to block until the evaluation is complete.
@@ -440,7 +441,7 @@ def evaluate_existing(
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
client: Optional[langsmith.Client] = None,
load_nested: bool = False,
blocking: bool = True,
@@ -454,7 +455,9 @@
summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): Optional sequence of evaluators
to apply over the entire dataset.
metadata (Optional[dict]): Optional metadata to include in the evaluation results.
max_concurrency (Optional[int]): Optional maximum number of concurrent evaluations.
max_concurrency (int | None): The maximum number of concurrent
evaluations to run. If None then no limit is set. If 0 then no concurrency.
Defaults to 0.
client (Optional[langsmith.Client]): Optional Langsmith client to use for evaluation.
load_nested: Whether to load all child runs for the experiment.
Default is to only load the top-level root runs.
@@ -1597,7 +1600,7 @@ def _score(
(e.g. from a previous prediction step)
"""
with ls_utils.ContextThreadPoolExecutor(
max_workers=max_concurrency
max_workers=max_concurrency or 1
) as executor:
if max_concurrency == 0:
context = copy_context()
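For context, a simplified sketch of the dispatch pattern this hunk implements (not the library's exact code), using stdlib primitives in place of `ls_utils.ContextThreadPoolExecutor`:

```python
from concurrent.futures import ThreadPoolExecutor
from contextvars import copy_context

def run_all(tasks, max_concurrency):
    # `max_concurrency or 1` avoids an invalid max_workers=0 when no
    # concurrency is requested; the pool goes unused in that case.
    with ThreadPoolExecutor(max_workers=max_concurrency or 1) as executor:
        if max_concurrency == 0:
            # No concurrency: run each task inline in a copy of the current
            # context so context variables (e.g. tracing state) carry over.
            context = copy_context()
            return [context.run(task) for task in tasks]
        # Otherwise fan the tasks out to the worker pool.
        futures = [executor.submit(task) for task in tasks]
        return [f.result() for f in futures]
```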
@@ -1815,14 +1818,24 @@ def _get_run(r: rt.RunTree) -> None:
return _ForwardResults(run=cast(schemas.Run, run), example=example)


def _is_valid_uuid(value: str) -> bool:
try:
uuid.UUID(value)
return True
except ValueError:
return False


def _resolve_data(
data: DATA_T, *, client: langsmith.Client
) -> Iterable[schemas.Example]:
"""Return the examples for the given dataset."""
if isinstance(data, str):
return client.list_examples(dataset_name=data)
elif isinstance(data, uuid.UUID):
if isinstance(data, uuid.UUID):
return client.list_examples(dataset_id=data)
elif isinstance(data, str) and _is_valid_uuid(data):
return client.list_examples(dataset_id=uuid.UUID(data))
elif isinstance(data, str):
return client.list_examples(dataset_name=data)
elif isinstance(data, schemas.Dataset):
return client.list_examples(dataset_id=data.id)
return data
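With the reordered checks and the new `_is_valid_uuid` helper, `_resolve_data` treats a UUID-formatted string as a dataset ID rather than a dataset name. A sketch of the three equivalent ways to reference a dataset (name and ID below are hypothetical):

```python
import uuid

from langsmith import Client, evaluate

client = Client()
dataset_id = uuid.UUID("12345678-1234-5678-1234-567812345678")  # hypothetical ID

def target(inputs: dict) -> dict:
    return {"answer": "42"}  # hypothetical target

evaluate(target, data="my-dataset", client=client)      # resolved by dataset name
evaluate(target, data=dataset_id, client=client)        # resolved by dataset ID
evaluate(target, data=str(dataset_id), client=client)   # UUID string: now also resolved by ID
```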
9 changes: 3 additions & 6 deletions python/poetry.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.147"
version = "0.2.0"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <[email protected]>"]
license = "MIT"
@@ -25,7 +25,7 @@ packages = [{ include = "langsmith" }]
langsmith = "langsmith.cli.main:main"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
python = ">=3.9,<4.0"
pydantic = [
{ version = ">=1,<3", python = "<3.12.4" },
{ version = "^2.7.4", python = ">=3.12.4" },
2 changes: 1 addition & 1 deletion python/tests/evaluation/test_evaluation.py
@@ -474,7 +474,7 @@ async def predict(inputs: dict):
data=ds_name,
)

with pytest.raises(ValueError, match=match_val):
with pytest.raises(ValueError, match="Must specify 'data'"):
await aevaluate(
predict,
data=[],
11 changes: 5 additions & 6 deletions python/tests/unit_tests/evaluation/test_runner.py
@@ -16,11 +16,8 @@

import pytest

from langsmith import evaluate
from langsmith import Client, aevaluate, evaluate
from langsmith import schemas as ls_schemas
from langsmith.client import Client
from langsmith.evaluation._arunner import aevaluate, aevaluate_existing
from langsmith.evaluation._runner import evaluate_existing
from langsmith.evaluation.evaluator import (
_normalize_comparison_evaluator_func,
_normalize_evaluator_func,
@@ -276,6 +273,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs):
num_repetitions=NUM_REPETITIONS,
blocking=blocking,
upload_results=upload_results,
max_concurrency=None,
)
if not blocking:
deltas = []
@@ -327,7 +325,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs):
def score_value(run, example):
return {"score": 0.7}

ex_results = evaluate_existing(
ex_results = evaluate(
fake_request.created_session["name"],
evaluators=[score_value],
client=client,
@@ -549,6 +547,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs):
num_repetitions=NUM_REPETITIONS,
blocking=blocking,
upload_results=upload_results,
max_concurrency=None,
)
if not blocking:
deltas = []
Expand Down Expand Up @@ -606,7 +605,7 @@ async def score_value(run, example):
return {"score": 0.7}

if upload_results:
ex_results = await aevaluate_existing(
ex_results = await aevaluate(
fake_request.created_session["name"],
evaluators=[score_value],
client=client,
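As exercised in the updated tests above, `evaluate` and `aevaluate` can now be pointed directly at an existing experiment instead of going through `evaluate_existing`/`aevaluate_existing`; a short sketch with a hypothetical experiment name:

```python
from langsmith import Client, evaluate

client = Client()

def score_value(run, example):
    return {"score": 0.7}

# Re-score an already-created experiment by passing its name (or ID) as the target.
results = evaluate(
    "my-existing-experiment",  # hypothetical experiment name
    evaluators=[score_value],
    client=client,
)
```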