LangSmith 0.2.x #1247

Merged · 24 commits · Dec 5, 2024
2 changes: 1 addition & 1 deletion js/package.json
@@ -1,6 +1,6 @@
{

GitHub Actions / benchmark: Benchmark results (annotation on js/package.json, line 1)

create_5_000_run_trees:                        Mean +- std dev: 700 ms +- 76 ms      (WARNING: may be unstable; std dev 76.2 ms is 11% of the mean)
create_10_000_run_trees:                       Mean +- std dev: 1.45 sec +- 0.20 sec (WARNING: may be unstable; std dev 197 ms is 14% of the mean)
create_20_000_run_trees:                       Mean +- std dev: 1.46 sec +- 0.15 sec (WARNING: may be unstable; std dev 146 ms is 10% of the mean)
dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 701 us +- 7 us
dumps_class_nested_py_leaf_50x100:             Mean +- std dev: 25.2 ms +- 0.5 ms
dumps_class_nested_py_leaf_100x200:            Mean +- std dev: 104 ms +- 3 ms
dumps_dataclass_nested_50x100:                 Mean +- std dev: 25.2 ms +- 0.2 ms
dumps_pydantic_nested_50x100:                  Mean +- std dev: 72.4 ms +- 17.4 ms   (WARNING: may be unstable; std dev 17.4 ms is 24% of the mean)
dumps_pydanticv1_nested_50x100:                Mean +- std dev: 200 ms +- 3 ms

For the unstable results, pyperf suggests rerunning with more runs, values and/or loops, running 'python -m pyperf system tune' to reduce system jitter, using pyperf stats, pyperf dump and pyperf hist to analyze results, and passing --quiet to hide these warnings.

GitHub Actions / benchmark: Comparison against main (annotation on js/package.json, line 1)

+-----------------------------------------------+----------+------------------------+
| Benchmark                                     | main     | changes                |
+===============================================+==========+========================+
| dumps_pydanticv1_nested_50x100                | 218 ms   | 200 ms: 1.09x faster   |
+-----------------------------------------------+----------+------------------------+
| create_5_000_run_trees                        | 712 ms   | 700 ms: 1.02x faster   |
+-----------------------------------------------+----------+------------------------+
| dumps_class_nested_py_branch_and_leaf_200x400 | 702 us   | 701 us: 1.00x faster   |
+-----------------------------------------------+----------+------------------------+
| dumps_dataclass_nested_50x100                 | 25.2 ms  | 25.2 ms: 1.00x faster  |
+-----------------------------------------------+----------+------------------------+
| dumps_class_nested_py_leaf_100x200            | 104 ms   | 104 ms: 1.00x slower   |
+-----------------------------------------------+----------+------------------------+
| dumps_class_nested_py_leaf_50x100             | 25.2 ms  | 25.2 ms: 1.00x slower  |
+-----------------------------------------------+----------+------------------------+
| create_10_000_run_trees                       | 1.38 sec | 1.45 sec: 1.05x slower |
+-----------------------------------------------+----------+------------------------+
| create_20_000_run_trees                       | 1.37 sec | 1.46 sec: 1.06x slower |
+-----------------------------------------------+----------+------------------------+
| dumps_pydantic_nested_50x100                  | 64.8 ms  | 72.4 ms: 1.12x slower  |
+-----------------------------------------------+----------+------------------------+
| Geometric mean                                | (ref)    | 1.01x slower           |
+-----------------------------------------------+----------+------------------------+
"name": "langsmith",
"version": "0.2.8",
"version": "0.2.9",
"description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.",
"packageManager": "[email protected]",
"files": [
70 changes: 39 additions & 31 deletions js/src/evaluation/_runner.ts
@@ -58,17 +58,17 @@ export type SummaryEvaluatorT =
| DeprecatedSyncSummaryEvaluator
| DeprecatedAsyncSummaryEvaluator
| ((args: {
runs?: Array<Run>;
examples?: Array<Example>;
inputs?: Array<Record<string, any>>;
outputs?: Array<Record<string, any>>;
runs: Array<Run>;
examples: Array<Example>;
inputs: Array<Record<string, any>>;
outputs: Array<Record<string, any>>;
referenceOutputs?: Array<Record<string, any>>;
}) => EvaluationResult | EvaluationResults)
| ((args: {
runs?: Array<Run>;
examples?: Array<Example>;
inputs?: Array<Record<string, any>>;
outputs?: Array<Record<string, any>>;
runs: Array<Run>;
examples: Array<Example>;
inputs: Array<Record<string, any>>;
outputs: Array<Record<string, any>>;
referenceOutputs?: Array<Record<string, any>>;
}) => Promise<EvaluationResult | EvaluationResults>);

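For illustration, a minimal summary evaluator written against the tightened signature (the four array arguments are now required; only `referenceOutputs` stays optional) might look like the sketch below. The `exactMatchRate` name, the scoring logic, and the import paths are assumptions, not part of this PR.

```ts
import type { Run, Example } from "langsmith/schemas";
import type { EvaluationResult } from "langsmith/evaluation";

// Hypothetical summary evaluator: reports the fraction of runs whose outputs
// exactly match the corresponding reference outputs.
function exactMatchRate(args: {
  runs: Run[];
  examples: Example[];
  inputs: Record<string, any>[];
  outputs: Record<string, any>[];
  referenceOutputs?: Record<string, any>[];
}): EvaluationResult {
  const matches = args.outputs.filter(
    (output, i) =>
      JSON.stringify(output) === JSON.stringify(args.referenceOutputs?.[i])
  ).length;
  return { key: "exact_match_rate", score: matches / args.outputs.length };
}
```

Such a function would then be passed via `summaryEvaluators: [exactMatchRate]` in the evaluate options.
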
@@ -93,17 +93,17 @@ export type EvaluatorT =
| DeprecatedFunctionEvaluator
| DeprecatedAsyncFunctionEvaluator
| ((args: {
run?: Run;
example?: Example;
inputs?: Record<string, any>;
outputs?: Record<string, any>;
run: Run;
example: Example;
inputs: Record<string, any>;
outputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => EvaluationResult | EvaluationResults)
| ((args: {
run?: Run;
example?: Example;
inputs?: Record<string, any>;
outputs?: Record<string, any>;
run: Run;
example: Example;
inputs: Record<string, any>;
outputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => Promise<EvaluationResult | EvaluationResults>);

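A corresponding sketch of a row-level evaluator under the new typing, assuming an `answer` field in the outputs; because `run`, `example`, `inputs`, and `outputs` are now required, the defensive `if (!run || !example)` guards seen in older examples are no longer needed.

```ts
import type { Run, Example } from "langsmith/schemas";
import type { EvaluationResult } from "langsmith/evaluation";

// Hypothetical row-level evaluator comparing one run's output to the
// example's reference output.
const correctness = (args: {
  run: Run;
  example: Example;
  inputs: Record<string, any>;
  outputs: Record<string, any>;
  referenceOutputs?: Record<string, any>;
}): EvaluationResult => ({
  key: "correctness",
  score: args.outputs.answer === args.referenceOutputs?.answer ? 1 : 0,
});
```
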
@@ -130,11 +130,6 @@ interface _ExperimentManagerArgs {
}

type BaseEvaluateOptions = {
/**
* The dataset to evaluate on. Can be a dataset name, a list of
* examples, or a generator of examples.
*/
data: DataT;
/**
* Metadata to attach to the experiment.
* @default undefined
@@ -178,6 +173,11 @@ export interface EvaluateOptions extends BaseEvaluateOptions {
* @default undefined
*/
summaryEvaluators?: Array<SummaryEvaluatorT>;
/**
* The dataset to evaluate on. Can be a dataset name, a list of
* examples, or a generator of examples.
*/
data: DataT;
}

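Since `data` now lives on `EvaluateOptions` rather than the shared base options, a standard (non-comparative) evaluation still passes it directly; it can be a dataset name or an `Example[]`, which is what the `Array.isArray` branch in `_evaluate` below distinguishes. A hedged sketch, reusing the evaluator sketches above (the dataset name and target are placeholders):

```ts
import { evaluate } from "langsmith/evaluation";

// Hypothetical target function for illustration only.
const target = async (inputs: Record<string, any>) => ({
  answer: `echo: ${inputs.question}`,
});

await evaluate(target, {
  data: "my-qa-dataset", // placeholder dataset name; an Example[] also works
  evaluators: [correctness], // row-level sketch above
  summaryEvaluators: [exactMatchRate], // summary sketch above
  experimentPrefix: "langsmith-0.2-sketch",
});
```
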
export interface ComparativeEvaluateOptions extends BaseEvaluateOptions {
@@ -934,8 +934,10 @@ async function _evaluate(
);

let manager = await new _ExperimentManager({
data: Array.isArray(fields.data) ? undefined : fields.data,
examples: Array.isArray(fields.data) ? fields.data : undefined,
data: Array.isArray(standardFields.data) ? undefined : standardFields.data,
examples: Array.isArray(standardFields.data)
? standardFields.data
: undefined,
client,
metadata: fields.metadata,
experiment: experiment_ ?? fields.experimentPrefix,
@@ -1063,10 +1065,12 @@ function _resolveData(
async function wrapSummaryEvaluators(
evaluators: SummaryEvaluatorT[],
optionsArray?: Partial<RunTreeConfig>[]
): Promise<SummaryEvaluatorT[]> {
): Promise<
Array<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator>
> {
async function _wrap(
evaluator: SummaryEvaluatorT
): Promise<SummaryEvaluatorT> {
): Promise<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator> {
const evalName = evaluator.name || "BatchEvaluator";

const wrapperInner = (
@@ -1087,10 +1091,10 @@ async function wrapSummaryEvaluators(
return Promise.resolve(
(
evaluator as (args: {
runs?: Run[];
examples?: Example[];
inputs?: Record<string, any>[];
outputs?: Record<string, any>[];
runs: Run[];
examples: Example[];
inputs: Record<string, any>[];
outputs: Record<string, any>[];
referenceOutputs?: Record<string, any>[];
}) => EvaluationResult | EvaluationResults
)({
@@ -1103,7 +1107,9 @@ async function wrapSummaryEvaluators(
);
}
// Otherwise use the traditional (runs, examples) signature
return Promise.resolve(evaluator(runs, examples));
return Promise.resolve(
(evaluator as DeprecatedSyncSummaryEvaluator)(runs, examples)
);
},
{ ...optionsArray, name: evalName }
);
@@ -1119,7 +1125,9 @@ async function wrapSummaryEvaluators(
return wrapperInner;
}

const results: SummaryEvaluatorT[] = [];
const results: Array<
DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator
> = [];
for (let i = 0; i < evaluators.length; i++) {
results.push(await _wrap(evaluators[i]));
}
2 changes: 1 addition & 1 deletion js/src/evaluation/evaluate_comparative.ts
@@ -79,7 +79,7 @@ export type _ComparativeEvaluator = (args: {
runs: Run[];
example: Example;
inputs: Record<string, any>;
outputs?: Record<string, any>[];
outputs: Record<string, any>[];
referenceOutputs?: Record<string, any>;
}) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>;

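With `outputs` now required on `_ComparativeEvaluator`, a comparative evaluator can index into the per-experiment outputs without a null check. A minimal sketch, mirroring the shape used in the integration test later in this diff (the name, the `answer` field, and the assumption that `outputs` is ordered like `runs` are illustrative only):

```ts
import type { Run, Example } from "langsmith/schemas";

// Hypothetical comparative evaluator: `outputs` holds one output record per
// compared run (order assumed to match `runs`).
const preferExactMatch = (args: {
  runs: Run[];
  example: Example;
  inputs: Record<string, any>;
  outputs: Record<string, any>[];
  referenceOutputs?: Record<string, any>;
}) => ({
  key: "prefer_exact_match",
  scores: Object.fromEntries(
    args.runs.map((run, i) => [
      run.id,
      args.outputs[i]?.answer === args.referenceOutputs?.answer ? 1 : 0,
    ])
  ),
});
```
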
21 changes: 13 additions & 8 deletions js/src/evaluation/evaluator.ts
@@ -96,18 +96,23 @@ export type RunEvaluatorLike =
example?: Example
) => Promise<EvaluationResult | EvaluationResults>)
| ((run: Run, example?: Example) => EvaluationResult | EvaluationResults)
| ((
run: Run,
example: Example
) => Promise<EvaluationResult | EvaluationResults>)
| ((run: Run, example: Example) => EvaluationResult | EvaluationResults)
| ((args: {
run?: Run;
example?: Example;
inputs?: Record<string, any>;
outputs?: Record<string, any>;
run: Run;
example: Example;
inputs: Record<string, any>;
outputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => EvaluationResult | EvaluationResults)
| ((args: {
run?: Run;
example?: Example;
inputs?: Record<string, any>;
outputs?: Record<string, any>;
run: Run;
example: Example;
inputs: Record<string, any>;
outputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => Promise<EvaluationResult | EvaluationResults>);

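The newly added overloads also cover the positional `(run, example)` form with a non-optional `example`. A brief sketch (names and fields assumed):

```ts
import type { Run, Example } from "langsmith/schemas";

// Hypothetical positional-style evaluator; `example` is required under the
// new overload, so no undefined check is needed.
const matchesReference = (run: Run, example: Example) => ({
  key: "matches_reference",
  score: run.outputs?.answer === example.outputs?.answer ? 1 : 0,
});
```
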
2 changes: 1 addition & 1 deletion js/src/index.ts
@@ -18,4 +18,4 @@ export { RunTree, type RunTreeConfig } from "./run_trees.js";
export { overrideFetchImplementation } from "./singletons/fetch.js";

// Update using yarn bump-version
export const __version__ = "0.2.8";
export const __version__ = "0.2.9";
65 changes: 11 additions & 54 deletions js/src/tests/evaluate.int.test.ts
@@ -3,6 +3,7 @@ import {
EvaluationResults,
} from "../evaluation/evaluator.js";
import { evaluate } from "../evaluation/_runner.js";
import { waitUntilRunFound } from "./utils.js";
import { Example, Run, TracerSession } from "../schemas.js";
import { Client } from "../index.js";
import { afterAll, beforeAll } from "@jest/globals";
@@ -1115,6 +1116,8 @@ test("evaluate handles partial summary evaluator parameters correctly", async ()
});

test("evaluate handles comparative target with ComparativeEvaluateOptions", async () => {
const client = new Client();

// First, create two experiments to compare
const targetFunc1 = (input: Record<string, any>) => {
return {
@@ -1139,13 +1142,18 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn
description: "Second experiment for comparison",
});

await Promise.all(
[exp1, exp2].flatMap(({ results }) =>
results.flatMap(({ run }) => waitUntilRunFound(client, run.id))
)
);
// Create comparative evaluator
const comparativeEvaluator = ({
runs,
example,
}: {
runs?: Run[];
example?: Example;
runs: Run[];
example: Example;
}) => {
if (!runs || !example) throw new Error("Missing required parameters");

@@ -1167,7 +1175,6 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn
const compareRes = await evaluate(
[exp1.experimentName, exp2.experimentName],
{
data: TESTING_DATASET_NAME,
evaluators: [comparativeEvaluator],
description: "Comparative evaluation test",
randomizeOrder: true,
@@ -1177,6 +1184,7 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn

// Verify we got ComparisonEvaluationResults
expect(compareRes.experimentName).toBeDefined();
expect(compareRes.experimentName).toBeDefined();
expect(compareRes.results).toBeDefined();
expect(Array.isArray(compareRes.results)).toBe(true);

@@ -1212,59 +1220,8 @@ test("evaluate enforces correct evaluator types for comparative evaluation at ru
await expect(
// @ts-expect-error - Should error because standardEvaluator is not a ComparativeEvaluator
evaluate([exp1.experimentName, exp2.experimentName], {
data: TESTING_DATASET_NAME,
evaluators: [standardEvaluator],
description: "Should fail at runtime",
})
).rejects.toThrow(); // You might want to be more specific about the error message
});

test("evaluate comparative options includes comparative-specific fields", async () => {
const exp1 = await evaluate(
(input: Record<string, any>) => ({ foo: input.input + 1 }),
{
data: TESTING_DATASET_NAME,
}
);

const exp2 = await evaluate(
(input: Record<string, any>) => ({ foo: input.input + 2 }),
{
data: TESTING_DATASET_NAME,
}
);

const comparativeEvaluator = ({
runs,
example,
}: {
runs?: Run[];
example?: Example;
}) => {
if (!runs || !example) throw new Error("Missing required parameters");
return {
key: "comparative_score",
scores: Object.fromEntries(
runs.map((run) => [
run.id,
run.outputs?.foo === example.outputs?.output ? 1 : 0,
])
),
};
};

// Test that comparative-specific options work
const compareRes = await evaluate(
[exp1.experimentName, exp2.experimentName],
{
data: TESTING_DATASET_NAME,
evaluators: [comparativeEvaluator],
randomizeOrder: true, // Comparative-specific option
loadNested: true, // Comparative-specific option
description: "Testing comparative-specific options",
}
);

expect(compareRes.experimentName).toBeDefined();
expect(compareRes.results).toBeDefined();
});
21 changes: 14 additions & 7 deletions python/langsmith/client.py
@@ -5842,7 +5842,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[EXPERIMENT_T] = None,
@@ -5861,7 +5861,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[EXPERIMENT_T] = None,
@@ -5883,7 +5883,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[EXPERIMENT_T] = None,
@@ -5911,7 +5911,8 @@ def evaluate(
Defaults to None.
description (str | None): A free-form text description for the experiment.
max_concurrency (int | None): The maximum number of concurrent
evaluations to run. Defaults to None (max number of workers).
evaluations to run. If None then no limit is set. If 0 then no concurrency.
Defaults to 0.
blocking (bool): Whether to block until the evaluation is complete.
Defaults to True.
num_repetitions (int): The number of times to run the evaluation.
@@ -6053,6 +6054,8 @@ def evaluate(
... summary_evaluators=[precision],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...

.. versionadded:: 0.2.0
""" # noqa: E501
from langsmith.evaluation._runner import evaluate as evaluate_

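To make the changed default concrete: a hedged Python sketch of calling `Client.evaluate` under the new semantics, where `max_concurrency=0` means run serially (the new default), `None` means no limit, and a positive integer caps concurrency; the same default applies to `aevaluate` below. The dataset name, target, and evaluator here are placeholders, not part of this PR.

```python
from langsmith import Client

client = Client()

def target(inputs: dict) -> dict:
    # Placeholder target function for illustration only.
    return {"answer": str(inputs["question"]).upper()}

def exact_match(outputs: dict, reference_outputs: dict) -> bool:
    # Placeholder evaluator for illustration only.
    return outputs["answer"] == reference_outputs["answer"]

results = client.evaluate(
    target,
    data="my-qa-dataset",   # placeholder dataset name
    evaluators=[exact_match],
    max_concurrency=4,      # explicit cap; omitting it now means serial (0)
)
```
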
@@ -6094,7 +6097,7 @@ async def aevaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
@@ -6119,8 +6122,9 @@ async def aevaluate(
experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
Defaults to None.
description (Optional[str]): A description of the experiment.
max_concurrency (Optional[int]): The maximum number of concurrent
evaluations to run. Defaults to None.
max_concurrency (int | None): The maximum number of concurrent
evaluations to run. If None then no limit is set. If 0 then no concurrency.
Defaults to 0.
num_repetitions (int): The number of times to run the evaluation.
Each item in the dataset will be run and evaluated this many times.
Defaults to 1.
@@ -6259,6 +6263,9 @@ async def aevaluate(
... )
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...

.. versionadded:: 0.2.0

""" # noqa: E501
from langsmith.evaluation._arunner import aevaluate as aevaluate_
