From b2fd1c85811790bcd3fa94415801bd077d4f4ba8 Mon Sep 17 00:00:00 2001 From: jakerachleff Date: Thu, 5 Dec 2024 10:38:44 -0800 Subject: [PATCH] python[minor], js[patch]: release py0.2.0, js0.2.9 (#1247) v0.2 --------- Co-authored-by: Bagatur Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> --- js/package.json | 2 +- js/src/evaluation/_runner.ts | 70 +++++++++++-------- js/src/evaluation/evaluate_comparative.ts | 2 +- js/src/evaluation/evaluator.ts | 21 +++--- js/src/index.ts | 2 +- js/src/tests/evaluate.int.test.ts | 65 +++-------------- python/langsmith/client.py | 21 ++++-- python/langsmith/evaluation/_arunner.py | 18 +++-- python/langsmith/evaluation/_runner.py | 38 +++++++--- python/poetry.lock | 9 +-- python/pyproject.toml | 4 +- python/tests/evaluation/test_evaluation.py | 2 +- .../unit_tests/evaluation/test_runner.py | 11 ++- 13 files changed, 132 insertions(+), 133 deletions(-) diff --git a/js/package.json b/js/package.json index ea27664f6..a209a4f5c 100644 --- a/js/package.json +++ b/js/package.json @@ -1,6 +1,6 @@ { "name": "langsmith", - "version": "0.2.8", + "version": "0.2.9", "description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.", "packageManager": "yarn@1.22.19", "files": [ diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts index 7a373297d..cac6b5f4c 100644 --- a/js/src/evaluation/_runner.ts +++ b/js/src/evaluation/_runner.ts @@ -58,17 +58,17 @@ export type SummaryEvaluatorT = | DeprecatedSyncSummaryEvaluator | DeprecatedAsyncSummaryEvaluator | ((args: { - runs?: Array; - examples?: Array; - inputs?: Array>; - outputs?: Array>; + runs: Array; + examples: Array; + inputs: Array>; + outputs: Array>; referenceOutputs?: Array>; }) => EvaluationResult | EvaluationResults) | ((args: { - runs?: Array; - examples?: Array; - inputs?: Array>; - outputs?: Array>; + runs: Array; + examples: Array; + inputs: Array>; + outputs: Array>; referenceOutputs?: Array>; }) => Promise); @@ -93,17 +93,17 @@ export type EvaluatorT = | DeprecatedFunctionEvaluator | DeprecatedAsyncFunctionEvaluator | ((args: { - run?: Run; - example?: Example; - inputs?: Record; - outputs?: Record; + run: Run; + example: Example; + inputs: Record; + outputs: Record; referenceOutputs?: Record; }) => EvaluationResult | EvaluationResults) | ((args: { - run?: Run; - example?: Example; - inputs?: Record; - outputs?: Record; + run: Run; + example: Example; + inputs: Record; + outputs: Record; referenceOutputs?: Record; }) => Promise); @@ -130,11 +130,6 @@ interface _ExperimentManagerArgs { } type BaseEvaluateOptions = { - /** - * The dataset to evaluate on. Can be a dataset name, a list of - * examples, or a generator of examples. - */ - data: DataT; /** * Metadata to attach to the experiment. * @default undefined @@ -178,6 +173,11 @@ export interface EvaluateOptions extends BaseEvaluateOptions { * @default undefined */ summaryEvaluators?: Array; + /** + * The dataset to evaluate on. Can be a dataset name, a list of + * examples, or a generator of examples. + */ + data: DataT; } export interface ComparativeEvaluateOptions extends BaseEvaluateOptions { @@ -934,8 +934,10 @@ async function _evaluate( ); let manager = await new _ExperimentManager({ - data: Array.isArray(fields.data) ? undefined : fields.data, - examples: Array.isArray(fields.data) ? fields.data : undefined, + data: Array.isArray(standardFields.data) ? undefined : standardFields.data, + examples: Array.isArray(standardFields.data) + ? 
standardFields.data + : undefined, client, metadata: fields.metadata, experiment: experiment_ ?? fields.experimentPrefix, @@ -1063,10 +1065,12 @@ function _resolveData( async function wrapSummaryEvaluators( evaluators: SummaryEvaluatorT[], optionsArray?: Partial[] -): Promise { +): Promise< + Array +> { async function _wrap( evaluator: SummaryEvaluatorT - ): Promise { + ): Promise { const evalName = evaluator.name || "BatchEvaluator"; const wrapperInner = ( @@ -1087,10 +1091,10 @@ async function wrapSummaryEvaluators( return Promise.resolve( ( evaluator as (args: { - runs?: Run[]; - examples?: Example[]; - inputs?: Record[]; - outputs?: Record[]; + runs: Run[]; + examples: Example[]; + inputs: Record[]; + outputs: Record[]; referenceOutputs?: Record[]; }) => EvaluationResult | EvaluationResults )({ @@ -1103,7 +1107,9 @@ async function wrapSummaryEvaluators( ); } // Otherwise use the traditional (runs, examples) signature - return Promise.resolve(evaluator(runs, examples)); + return Promise.resolve( + (evaluator as DeprecatedSyncSummaryEvaluator)(runs, examples) + ); }, { ...optionsArray, name: evalName } ); @@ -1119,7 +1125,9 @@ async function wrapSummaryEvaluators( return wrapperInner; } - const results: SummaryEvaluatorT[] = []; + const results: Array< + DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator + > = []; for (let i = 0; i < evaluators.length; i++) { results.push(await _wrap(evaluators[i])); } diff --git a/js/src/evaluation/evaluate_comparative.ts b/js/src/evaluation/evaluate_comparative.ts index c70b6c0ab..06bee8343 100644 --- a/js/src/evaluation/evaluate_comparative.ts +++ b/js/src/evaluation/evaluate_comparative.ts @@ -79,7 +79,7 @@ export type _ComparativeEvaluator = (args: { runs: Run[]; example: Example; inputs: Record; - outputs?: Record[]; + outputs: Record[]; referenceOutputs?: Record; }) => ComparisonEvaluationResultRow | Promise; diff --git a/js/src/evaluation/evaluator.ts b/js/src/evaluation/evaluator.ts index cad4707f1..4e64460d3 100644 --- a/js/src/evaluation/evaluator.ts +++ b/js/src/evaluation/evaluator.ts @@ -96,18 +96,23 @@ export type RunEvaluatorLike = example?: Example ) => Promise) | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults) + | (( + run: Run, + example: Example + ) => Promise) + | ((run: Run, example: Example) => EvaluationResult | EvaluationResults) | ((args: { - run?: Run; - example?: Example; - inputs?: Record; - outputs?: Record; + run: Run; + example: Example; + inputs: Record; + outputs: Record; referenceOutputs?: Record; }) => EvaluationResult | EvaluationResults) | ((args: { - run?: Run; - example?: Example; - inputs?: Record; - outputs?: Record; + run: Run; + example: Example; + inputs: Record; + outputs: Record; referenceOutputs?: Record; }) => Promise); diff --git a/js/src/index.ts b/js/src/index.ts index d648d23c5..04e2bd53e 100644 --- a/js/src/index.ts +++ b/js/src/index.ts @@ -18,4 +18,4 @@ export { RunTree, type RunTreeConfig } from "./run_trees.js"; export { overrideFetchImplementation } from "./singletons/fetch.js"; // Update using yarn bump-version -export const __version__ = "0.2.8"; +export const __version__ = "0.2.9"; diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts index e3e524708..ae281b561 100644 --- a/js/src/tests/evaluate.int.test.ts +++ b/js/src/tests/evaluate.int.test.ts @@ -3,6 +3,7 @@ import { EvaluationResults, } from "../evaluation/evaluator.js"; import { evaluate } from "../evaluation/_runner.js"; +import { waitUntilRunFound } from "./utils.js"; 
import { Example, Run, TracerSession } from "../schemas.js"; import { Client } from "../index.js"; import { afterAll, beforeAll } from "@jest/globals"; @@ -1115,6 +1116,8 @@ test("evaluate handles partial summary evaluator parameters correctly", async () }); test("evaluate handles comparative target with ComparativeEvaluateOptions", async () => { + const client = new Client(); + // First, create two experiments to compare const targetFunc1 = (input: Record) => { return { @@ -1139,13 +1142,18 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn description: "Second experiment for comparison", }); + await Promise.all( + [exp1, exp2].flatMap(({ results }) => + results.flatMap(({ run }) => waitUntilRunFound(client, run.id)) + ) + ); // Create comparative evaluator const comparativeEvaluator = ({ runs, example, }: { - runs?: Run[]; - example?: Example; + runs: Run[]; + example: Example; }) => { if (!runs || !example) throw new Error("Missing required parameters"); @@ -1167,7 +1175,6 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn const compareRes = await evaluate( [exp1.experimentName, exp2.experimentName], { - data: TESTING_DATASET_NAME, evaluators: [comparativeEvaluator], description: "Comparative evaluation test", randomizeOrder: true, @@ -1177,6 +1184,7 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn // Verify we got ComparisonEvaluationResults expect(compareRes.experimentName).toBeDefined(); + expect(compareRes.experimentName).toBeDefined(); expect(compareRes.results).toBeDefined(); expect(Array.isArray(compareRes.results)).toBe(true); @@ -1212,59 +1220,8 @@ test("evaluate enforces correct evaluator types for comparative evaluation at ru await expect( // @ts-expect-error - Should error because standardEvaluator is not a ComparativeEvaluator evaluate([exp1.experimentName, exp2.experimentName], { - data: TESTING_DATASET_NAME, evaluators: [standardEvaluator], description: "Should fail at runtime", }) ).rejects.toThrow(); // You might want to be more specific about the error message }); - -test("evaluate comparative options includes comparative-specific fields", async () => { - const exp1 = await evaluate( - (input: Record) => ({ foo: input.input + 1 }), - { - data: TESTING_DATASET_NAME, - } - ); - - const exp2 = await evaluate( - (input: Record) => ({ foo: input.input + 2 }), - { - data: TESTING_DATASET_NAME, - } - ); - - const comparativeEvaluator = ({ - runs, - example, - }: { - runs?: Run[]; - example?: Example; - }) => { - if (!runs || !example) throw new Error("Missing required parameters"); - return { - key: "comparative_score", - scores: Object.fromEntries( - runs.map((run) => [ - run.id, - run.outputs?.foo === example.outputs?.output ? 
1 : 0, - ]) - ), - }; - }; - - // Test that comparative-specific options work - const compareRes = await evaluate( - [exp1.experimentName, exp2.experimentName], - { - data: TESTING_DATASET_NAME, - evaluators: [comparativeEvaluator], - randomizeOrder: true, // Comparative-specific option - loadNested: true, // Comparative-specific option - description: "Testing comparative-specific options", - } - ); - - expect(compareRes.experimentName).toBeDefined(); - expect(compareRes.results).toBeDefined(); -}); diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 1735889f5..e0b3b83e3 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -5842,7 +5842,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, blocking: bool = True, experiment: Optional[EXPERIMENT_T] = None, @@ -5861,7 +5861,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, blocking: bool = True, experiment: Optional[EXPERIMENT_T] = None, @@ -5883,7 +5883,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, blocking: bool = True, experiment: Optional[EXPERIMENT_T] = None, @@ -5911,7 +5911,8 @@ def evaluate( Defaults to None. description (str | None): A free-form text description for the experiment. max_concurrency (int | None): The maximum number of concurrent - evaluations to run. Defaults to None (max number of workers). + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. blocking (bool): Whether to block until the evaluation is complete. Defaults to True. num_repetitions (int): The number of times to run the evaluation. @@ -6053,6 +6054,8 @@ def evaluate( ... summary_evaluators=[precision], ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + + .. versionadded:: 0.2.0 """ # noqa: E501 from langsmith.evaluation._runner import evaluate as evaluate_ @@ -6094,7 +6097,7 @@ async def aevaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, blocking: bool = True, experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None, @@ -6119,8 +6122,9 @@ async def aevaluate( experiment_prefix (Optional[str]): A prefix to provide for your experiment name. Defaults to None. description (Optional[str]): A description of the experiment. - max_concurrency (Optional[int]): The maximum number of concurrent - evaluations to run. Defaults to None. + max_concurrency (int | None): The maximum number of concurrent + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. num_repetitions (int): The number of times to run the evaluation. Each item in the dataset will be run and evaluated this many times. Defaults to 1. @@ -6259,6 +6263,9 @@ async def aevaluate( ... ) ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + + .. 
versionadded:: 0.2.0 + """ # noqa: E501 from langsmith.evaluation._arunner import aevaluate as aevaluate_ diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 878d7fe4c..4d5c063f6 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -84,7 +84,7 @@ async def aevaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, client: Optional[langsmith.Client] = None, blocking: bool = True, @@ -110,8 +110,9 @@ async def aevaluate( experiment_prefix (Optional[str]): A prefix to provide for your experiment name. Defaults to None. description (Optional[str]): A description of the experiment. - max_concurrency (Optional[int]): The maximum number of concurrent - evaluations to run. Defaults to None. + max_concurrency (int | None): The maximum number of concurrent + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. num_repetitions (int): The number of times to run the evaluation. Each item in the dataset will be run and evaluated this many times. Defaults to 1. @@ -254,6 +255,11 @@ async def aevaluate( ... ) ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + + .. versionchanged:: 0.2.0 + + 'max_concurrency' default updated from None (no limit on concurrency) + to 0 (no concurrency at all). """ # noqa: E501 if isinstance(target, (str, uuid.UUID, schemas.TracerSession)): invalid_args = { @@ -332,7 +338,7 @@ async def aevaluate_existing( evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None, summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None, metadata: Optional[dict] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, client: Optional[langsmith.Client] = None, load_nested: bool = False, blocking: bool = True, @@ -345,7 +351,9 @@ async def aevaluate_existing( summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): Optional sequence of evaluators to apply over the entire dataset. metadata (Optional[dict]): Optional metadata to include in the evaluation results. - max_concurrency (Optional[int]): Optional maximum number of concurrent evaluations. + max_concurrency (int | None): The maximum number of concurrent + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. client (Optional[langsmith.Client]): Optional Langsmith client to use for evaluation. load_nested: Whether to load all child runs for the experiment. Default is to only load the top-level root runs. 
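
A minimal, hypothetical sketch of what the max_concurrency change above means for callers: from 0.2.0 the default is 0 (evaluations run serially), so code that relied on the old unbounded default has to opt back in explicitly. The dataset name, target lambda, and evaluator below are placeholders for illustration only, not part of this patch.

    from langsmith import Client, evaluate

    client = Client()

    def exact_match(outputs: dict, reference_outputs: dict) -> dict:
        # Toy row-level evaluator; returns a score dict like the SDK's own tests do.
        return {"score": float(outputs == reference_outputs)}

    results = evaluate(
        lambda inputs: {"answer": inputs["question"]},  # placeholder target function
        data="my-dataset",         # placeholder dataset name
        evaluators=[exact_match],
        max_concurrency=4,         # or None to restore the pre-0.2.0 "no limit" behavior
        client=client,
    )
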
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 20df5a795..bf7505284 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -101,7 +101,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, client: Optional[langsmith.Client] = None, blocking: bool = True, @@ -121,7 +121,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, client: Optional[langsmith.Client] = None, blocking: bool = True, @@ -142,7 +142,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, client: Optional[langsmith.Client] = None, blocking: bool = True, @@ -171,7 +171,8 @@ def evaluate( Defaults to None. description (str | None): A free-form text description for the experiment. max_concurrency (int | None): The maximum number of concurrent - evaluations to run. Defaults to None (max number of workers). + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. client (langsmith.Client | None): The LangSmith client to use. Defaults to None. blocking (bool): Whether to block until the evaluation is complete. @@ -317,6 +318,11 @@ def evaluate( ... summary_evaluators=[precision], ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + + .. versionchanged:: 0.2.0 + + 'max_concurrency' default updated from None (no limit on concurrency) + to 0 (no concurrency at all). """ # noqa: E501 if isinstance(target, (str, uuid.UUID, schemas.TracerSession)): invalid_args = { @@ -440,7 +446,7 @@ def evaluate_existing( evaluators: Optional[Sequence[EVALUATOR_T]] = None, summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None, metadata: Optional[dict] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, client: Optional[langsmith.Client] = None, load_nested: bool = False, blocking: bool = True, @@ -454,7 +460,9 @@ def evaluate_existing( summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): Optional sequence of evaluators to apply over the entire dataset. metadata (Optional[dict]): Optional metadata to include in the evaluation results. - max_concurrency (Optional[int]): Optional maximum number of concurrent evaluations. + max_concurrency (int | None): The maximum number of concurrent + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. client (Optional[langsmith.Client]): Optional Langsmith client to use for evaluation. load_nested: Whether to load all child runs for the experiment. Default is to only load the top-level root runs. @@ -1597,7 +1605,7 @@ def _score( (e.g. 
from a previous prediction step) """ with ls_utils.ContextThreadPoolExecutor( - max_workers=max_concurrency + max_workers=max_concurrency or 1 ) as executor: if max_concurrency == 0: context = copy_context() @@ -1815,14 +1823,24 @@ def _get_run(r: rt.RunTree) -> None: return _ForwardResults(run=cast(schemas.Run, run), example=example) +def _is_valid_uuid(value: str) -> bool: + try: + uuid.UUID(value) + return True + except ValueError: + return False + + def _resolve_data( data: DATA_T, *, client: langsmith.Client ) -> Iterable[schemas.Example]: """Return the examples for the given dataset.""" - if isinstance(data, str): - return client.list_examples(dataset_name=data) - elif isinstance(data, uuid.UUID): + if isinstance(data, uuid.UUID): return client.list_examples(dataset_id=data) + elif isinstance(data, str) and _is_valid_uuid(data): + return client.list_examples(dataset_id=uuid.UUID(data)) + elif isinstance(data, str): + return client.list_examples(dataset_name=data) elif isinstance(data, schemas.Dataset): return client.list_examples(dataset_id=data.id) return data diff --git a/python/poetry.lock b/python/poetry.lock index 598ceaaf5..77d4b5c08 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "annotated-types" @@ -11,9 +11,6 @@ files = [ {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] -[package.dependencies] -typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} - [[package]] name = "anyio" version = "4.5.2" @@ -2115,5 +2112,5 @@ vcr = [] [metadata] lock-version = "2.0" -python-versions = ">=3.8.1,<4.0" -content-hash = "5eb7654ca6ae40c61164f6da76b8b4bd6baf0ef0967c77251bd01efad9d7d320" +python-versions = ">=3.9,<4.0" +content-hash = "c7acc8c8f123bf7968b265a0f0cdd0b679d88559bfbff33488bff25bb4f54f0f" diff --git a/python/pyproject.toml b/python/pyproject.toml index fd0f7798e..315d26d72 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langsmith" -version = "0.1.147" +version = "0.2.0" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
authors = ["LangChain "] license = "MIT" @@ -25,7 +25,7 @@ packages = [{ include = "langsmith" }] langsmith = "langsmith.cli.main:main" [tool.poetry.dependencies] -python = ">=3.8.1,<4.0" +python = ">=3.9,<4.0" pydantic = [ { version = ">=1,<3", python = "<3.12.4" }, { version = "^2.7.4", python = ">=3.12.4" }, diff --git a/python/tests/evaluation/test_evaluation.py b/python/tests/evaluation/test_evaluation.py index b709550ab..2bf09c110 100644 --- a/python/tests/evaluation/test_evaluation.py +++ b/python/tests/evaluation/test_evaluation.py @@ -474,7 +474,7 @@ async def predict(inputs: dict): data=ds_name, ) - with pytest.raises(ValueError, match=match_val): + with pytest.raises(ValueError, match="Must specify 'data'"): await aevaluate( predict, data=[], diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index a6c60c549..132af656a 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -16,11 +16,8 @@ import pytest -from langsmith import evaluate +from langsmith import Client, aevaluate, evaluate from langsmith import schemas as ls_schemas -from langsmith.client import Client -from langsmith.evaluation._arunner import aevaluate, aevaluate_existing -from langsmith.evaluation._runner import evaluate_existing from langsmith.evaluation.evaluator import ( _normalize_comparison_evaluator_func, _normalize_evaluator_func, @@ -276,6 +273,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs): num_repetitions=NUM_REPETITIONS, blocking=blocking, upload_results=upload_results, + max_concurrency=None, ) if not blocking: deltas = [] @@ -327,7 +325,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs): def score_value(run, example): return {"score": 0.7} - ex_results = evaluate_existing( + ex_results = evaluate( fake_request.created_session["name"], evaluators=[score_value], client=client, @@ -549,6 +547,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs): num_repetitions=NUM_REPETITIONS, blocking=blocking, upload_results=upload_results, + max_concurrency=None, ) if not blocking: deltas = [] @@ -606,7 +605,7 @@ async def score_value(run, example): return {"score": 0.7} if upload_results: - ex_results = await aevaluate_existing( + ex_results = await aevaluate( fake_request.created_session["name"], evaluators=[score_value], client=client,