From b2fd1c85811790bcd3fa94415801bd077d4f4ba8 Mon Sep 17 00:00:00 2001 From: jakerachleff Date: Thu, 5 Dec 2024 10:38:44 -0800 Subject: [PATCH] python[minor], js[patch]: release py0.2.0, js0.2.9 (#1247) v0.2 --------- Co-authored-by: Bagatur Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> --- js/package.json | 2 +- js/src/evaluation/_runner.ts | 70 +++++++++++-------- js/src/evaluation/evaluate_comparative.ts | 2 +- js/src/evaluation/evaluator.ts | 21 +++--- js/src/index.ts | 2 +- js/src/tests/evaluate.int.test.ts | 65 +++-------------- python/langsmith/client.py | 21 ++++-- python/langsmith/evaluation/_arunner.py | 18 +++-- python/langsmith/evaluation/_runner.py | 38 +++++++--- python/poetry.lock | 9 +-- python/pyproject.toml | 4 +- python/tests/evaluation/test_evaluation.py | 2 +- .../unit_tests/evaluation/test_runner.py | 11 ++- 13 files changed, 132 insertions(+), 133 deletions(-) diff --git a/js/package.json b/js/package.json index ea27664f6..a209a4f5c 100644 --- a/js/package.json +++ b/js/package.json @@ -1,6 +1,6 @@ { "name": "langsmith", - "version": "0.2.8", + "version": "0.2.9", "description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.", "packageManager": "yarn@1.22.19", "files": [ diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts index 7a373297d..cac6b5f4c 100644 --- a/js/src/evaluation/_runner.ts +++ b/js/src/evaluation/_runner.ts @@ -58,17 +58,17 @@ export type SummaryEvaluatorT = | DeprecatedSyncSummaryEvaluator | DeprecatedAsyncSummaryEvaluator | ((args: { - runs?: Array; - examples?: Array; - inputs?: Array>; - outputs?: Array>; + runs: Array; + examples: Array; + inputs: Array>; + outputs: Array>; referenceOutputs?: Array>; }) => EvaluationResult | EvaluationResults) | ((args: { - runs?: Array; - examples?: Array; - inputs?: Array>; - outputs?: Array>; + runs: Array; + examples: Array; + inputs: Array>; + outputs: Array>; referenceOutputs?: Array>; }) => Promise); @@ -93,17 +93,17 @@ export type EvaluatorT = | DeprecatedFunctionEvaluator | DeprecatedAsyncFunctionEvaluator | ((args: { - run?: Run; - example?: Example; - inputs?: Record; - outputs?: Record; + run: Run; + example: Example; + inputs: Record; + outputs: Record; referenceOutputs?: Record; }) => EvaluationResult | EvaluationResults) | ((args: { - run?: Run; - example?: Example; - inputs?: Record; - outputs?: Record; + run: Run; + example: Example; + inputs: Record; + outputs: Record; referenceOutputs?: Record; }) => Promise); @@ -130,11 +130,6 @@ interface _ExperimentManagerArgs { } type BaseEvaluateOptions = { - /** - * The dataset to evaluate on. Can be a dataset name, a list of - * examples, or a generator of examples. - */ - data: DataT; /** * Metadata to attach to the experiment. * @default undefined @@ -178,6 +173,11 @@ export interface EvaluateOptions extends BaseEvaluateOptions { * @default undefined */ summaryEvaluators?: Array; + /** + * The dataset to evaluate on. Can be a dataset name, a list of + * examples, or a generator of examples. + */ + data: DataT; } export interface ComparativeEvaluateOptions extends BaseEvaluateOptions { @@ -934,8 +934,10 @@ async function _evaluate( ); let manager = await new _ExperimentManager({ - data: Array.isArray(fields.data) ? undefined : fields.data, - examples: Array.isArray(fields.data) ? fields.data : undefined, + data: Array.isArray(standardFields.data) ? undefined : standardFields.data, + examples: Array.isArray(standardFields.data) + ? 
standardFields.data + : undefined, client, metadata: fields.metadata, experiment: experiment_ ?? fields.experimentPrefix, @@ -1063,10 +1065,12 @@ function _resolveData( async function wrapSummaryEvaluators( evaluators: SummaryEvaluatorT[], optionsArray?: Partial[] -): Promise { +): Promise< + Array +> { async function _wrap( evaluator: SummaryEvaluatorT - ): Promise { + ): Promise { const evalName = evaluator.name || "BatchEvaluator"; const wrapperInner = ( @@ -1087,10 +1091,10 @@ async function wrapSummaryEvaluators( return Promise.resolve( ( evaluator as (args: { - runs?: Run[]; - examples?: Example[]; - inputs?: Record[]; - outputs?: Record[]; + runs: Run[]; + examples: Example[]; + inputs: Record[]; + outputs: Record[]; referenceOutputs?: Record[]; }) => EvaluationResult | EvaluationResults )({ @@ -1103,7 +1107,9 @@ async function wrapSummaryEvaluators( ); } // Otherwise use the traditional (runs, examples) signature - return Promise.resolve(evaluator(runs, examples)); + return Promise.resolve( + (evaluator as DeprecatedSyncSummaryEvaluator)(runs, examples) + ); }, { ...optionsArray, name: evalName } ); @@ -1119,7 +1125,9 @@ async function wrapSummaryEvaluators( return wrapperInner; } - const results: SummaryEvaluatorT[] = []; + const results: Array< + DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator + > = []; for (let i = 0; i < evaluators.length; i++) { results.push(await _wrap(evaluators[i])); } diff --git a/js/src/evaluation/evaluate_comparative.ts b/js/src/evaluation/evaluate_comparative.ts index c70b6c0ab..06bee8343 100644 --- a/js/src/evaluation/evaluate_comparative.ts +++ b/js/src/evaluation/evaluate_comparative.ts @@ -79,7 +79,7 @@ export type _ComparativeEvaluator = (args: { runs: Run[]; example: Example; inputs: Record; - outputs?: Record[]; + outputs: Record[]; referenceOutputs?: Record; }) => ComparisonEvaluationResultRow | Promise; diff --git a/js/src/evaluation/evaluator.ts b/js/src/evaluation/evaluator.ts index cad4707f1..4e64460d3 100644 --- a/js/src/evaluation/evaluator.ts +++ b/js/src/evaluation/evaluator.ts @@ -96,18 +96,23 @@ export type RunEvaluatorLike = example?: Example ) => Promise) | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults) + | (( + run: Run, + example: Example + ) => Promise) + | ((run: Run, example: Example) => EvaluationResult | EvaluationResults) | ((args: { - run?: Run; - example?: Example; - inputs?: Record; - outputs?: Record; + run: Run; + example: Example; + inputs: Record; + outputs: Record; referenceOutputs?: Record; }) => EvaluationResult | EvaluationResults) | ((args: { - run?: Run; - example?: Example; - inputs?: Record; - outputs?: Record; + run: Run; + example: Example; + inputs: Record; + outputs: Record; referenceOutputs?: Record; }) => Promise); diff --git a/js/src/index.ts b/js/src/index.ts index d648d23c5..04e2bd53e 100644 --- a/js/src/index.ts +++ b/js/src/index.ts @@ -18,4 +18,4 @@ export { RunTree, type RunTreeConfig } from "./run_trees.js"; export { overrideFetchImplementation } from "./singletons/fetch.js"; // Update using yarn bump-version -export const __version__ = "0.2.8"; +export const __version__ = "0.2.9"; diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts index e3e524708..ae281b561 100644 --- a/js/src/tests/evaluate.int.test.ts +++ b/js/src/tests/evaluate.int.test.ts @@ -3,6 +3,7 @@ import { EvaluationResults, } from "../evaluation/evaluator.js"; import { evaluate } from "../evaluation/_runner.js"; +import { waitUntilRunFound } from "./utils.js"; 
import { Example, Run, TracerSession } from "../schemas.js"; import { Client } from "../index.js"; import { afterAll, beforeAll } from "@jest/globals"; @@ -1115,6 +1116,8 @@ test("evaluate handles partial summary evaluator parameters correctly", async () }); test("evaluate handles comparative target with ComparativeEvaluateOptions", async () => { + const client = new Client(); + // First, create two experiments to compare const targetFunc1 = (input: Record) => { return { @@ -1139,13 +1142,18 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn description: "Second experiment for comparison", }); + await Promise.all( + [exp1, exp2].flatMap(({ results }) => + results.flatMap(({ run }) => waitUntilRunFound(client, run.id)) + ) + ); // Create comparative evaluator const comparativeEvaluator = ({ runs, example, }: { - runs?: Run[]; - example?: Example; + runs: Run[]; + example: Example; }) => { if (!runs || !example) throw new Error("Missing required parameters"); @@ -1167,7 +1175,6 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn const compareRes = await evaluate( [exp1.experimentName, exp2.experimentName], { - data: TESTING_DATASET_NAME, evaluators: [comparativeEvaluator], description: "Comparative evaluation test", randomizeOrder: true, @@ -1177,6 +1184,7 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn // Verify we got ComparisonEvaluationResults expect(compareRes.experimentName).toBeDefined(); + expect(compareRes.experimentName).toBeDefined(); expect(compareRes.results).toBeDefined(); expect(Array.isArray(compareRes.results)).toBe(true); @@ -1212,59 +1220,8 @@ test("evaluate enforces correct evaluator types for comparative evaluation at ru await expect( // @ts-expect-error - Should error because standardEvaluator is not a ComparativeEvaluator evaluate([exp1.experimentName, exp2.experimentName], { - data: TESTING_DATASET_NAME, evaluators: [standardEvaluator], description: "Should fail at runtime", }) ).rejects.toThrow(); // You might want to be more specific about the error message }); - -test("evaluate comparative options includes comparative-specific fields", async () => { - const exp1 = await evaluate( - (input: Record) => ({ foo: input.input + 1 }), - { - data: TESTING_DATASET_NAME, - } - ); - - const exp2 = await evaluate( - (input: Record) => ({ foo: input.input + 2 }), - { - data: TESTING_DATASET_NAME, - } - ); - - const comparativeEvaluator = ({ - runs, - example, - }: { - runs?: Run[]; - example?: Example; - }) => { - if (!runs || !example) throw new Error("Missing required parameters"); - return { - key: "comparative_score", - scores: Object.fromEntries( - runs.map((run) => [ - run.id, - run.outputs?.foo === example.outputs?.output ? 
1 : 0, - ]) - ), - }; - }; - - // Test that comparative-specific options work - const compareRes = await evaluate( - [exp1.experimentName, exp2.experimentName], - { - data: TESTING_DATASET_NAME, - evaluators: [comparativeEvaluator], - randomizeOrder: true, // Comparative-specific option - loadNested: true, // Comparative-specific option - description: "Testing comparative-specific options", - } - ); - - expect(compareRes.experimentName).toBeDefined(); - expect(compareRes.results).toBeDefined(); -}); diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 1735889f5..e0b3b83e3 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -5842,7 +5842,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, blocking: bool = True, experiment: Optional[EXPERIMENT_T] = None, @@ -5861,7 +5861,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, blocking: bool = True, experiment: Optional[EXPERIMENT_T] = None, @@ -5883,7 +5883,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, blocking: bool = True, experiment: Optional[EXPERIMENT_T] = None, @@ -5911,7 +5911,8 @@ def evaluate( Defaults to None. description (str | None): A free-form text description for the experiment. max_concurrency (int | None): The maximum number of concurrent - evaluations to run. Defaults to None (max number of workers). + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. blocking (bool): Whether to block until the evaluation is complete. Defaults to True. num_repetitions (int): The number of times to run the evaluation. @@ -6053,6 +6054,8 @@ def evaluate( ... summary_evaluators=[precision], ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + + .. versionadded:: 0.2.0 """ # noqa: E501 from langsmith.evaluation._runner import evaluate as evaluate_ @@ -6094,7 +6097,7 @@ async def aevaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, blocking: bool = True, experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None, @@ -6119,8 +6122,9 @@ async def aevaluate( experiment_prefix (Optional[str]): A prefix to provide for your experiment name. Defaults to None. description (Optional[str]): A description of the experiment. - max_concurrency (Optional[int]): The maximum number of concurrent - evaluations to run. Defaults to None. + max_concurrency (int | None): The maximum number of concurrent + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. num_repetitions (int): The number of times to run the evaluation. Each item in the dataset will be run and evaluated this many times. Defaults to 1. @@ -6259,6 +6263,9 @@ async def aevaluate( ... ) ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + + .. 
versionadded:: 0.2.0 + """ # noqa: E501 from langsmith.evaluation._arunner import aevaluate as aevaluate_ diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 878d7fe4c..4d5c063f6 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -84,7 +84,7 @@ async def aevaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, client: Optional[langsmith.Client] = None, blocking: bool = True, @@ -110,8 +110,9 @@ async def aevaluate( experiment_prefix (Optional[str]): A prefix to provide for your experiment name. Defaults to None. description (Optional[str]): A description of the experiment. - max_concurrency (Optional[int]): The maximum number of concurrent - evaluations to run. Defaults to None. + max_concurrency (int | None): The maximum number of concurrent + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. num_repetitions (int): The number of times to run the evaluation. Each item in the dataset will be run and evaluated this many times. Defaults to 1. @@ -254,6 +255,11 @@ async def aevaluate( ... ) ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + + .. versionchanged:: 0.2.0 + + 'max_concurrency' default updated from None (no limit on concurrency) + to 0 (no concurrency at all). """ # noqa: E501 if isinstance(target, (str, uuid.UUID, schemas.TracerSession)): invalid_args = { @@ -332,7 +338,7 @@ async def aevaluate_existing( evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None, summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None, metadata: Optional[dict] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, client: Optional[langsmith.Client] = None, load_nested: bool = False, blocking: bool = True, @@ -345,7 +351,9 @@ async def aevaluate_existing( summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): Optional sequence of evaluators to apply over the entire dataset. metadata (Optional[dict]): Optional metadata to include in the evaluation results. - max_concurrency (Optional[int]): Optional maximum number of concurrent evaluations. + max_concurrency (int | None): The maximum number of concurrent + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. client (Optional[langsmith.Client]): Optional Langsmith client to use for evaluation. load_nested: Whether to load all child runs for the experiment. Default is to only load the top-level root runs. 
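
A minimal, hypothetical sketch of what the max_concurrency change above means for callers: from 0.2.0 the default is 0 (evaluations run serially), so code that relied on the old unbounded default has to opt back in explicitly. The dataset name, target lambda, and evaluator below are placeholders for illustration only, not part of this patch.

    from langsmith import Client, evaluate

    client = Client()

    def exact_match(outputs: dict, reference_outputs: dict) -> dict:
        # Toy row-level evaluator; returns a score dict like the SDK's own tests do.
        return {"score": float(outputs == reference_outputs)}

    results = evaluate(
        lambda inputs: {"answer": inputs["question"]},  # placeholder target function
        data="my-dataset",         # placeholder dataset name
        evaluators=[exact_match],
        max_concurrency=4,         # or None to restore the pre-0.2.0 "no limit" behavior
        client=client,
    )
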
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 20df5a795..bf7505284 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -101,7 +101,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, client: Optional[langsmith.Client] = None, blocking: bool = True, @@ -121,7 +121,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, client: Optional[langsmith.Client] = None, blocking: bool = True, @@ -142,7 +142,7 @@ def evaluate( metadata: Optional[dict] = None, experiment_prefix: Optional[str] = None, description: Optional[str] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, num_repetitions: int = 1, client: Optional[langsmith.Client] = None, blocking: bool = True, @@ -171,7 +171,8 @@ def evaluate( Defaults to None. description (str | None): A free-form text description for the experiment. max_concurrency (int | None): The maximum number of concurrent - evaluations to run. Defaults to None (max number of workers). + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. client (langsmith.Client | None): The LangSmith client to use. Defaults to None. blocking (bool): Whether to block until the evaluation is complete. @@ -317,6 +318,11 @@ def evaluate( ... summary_evaluators=[precision], ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + + .. versionchanged:: 0.2.0 + + 'max_concurrency' default updated from None (no limit on concurrency) + to 0 (no concurrency at all). """ # noqa: E501 if isinstance(target, (str, uuid.UUID, schemas.TracerSession)): invalid_args = { @@ -440,7 +446,7 @@ def evaluate_existing( evaluators: Optional[Sequence[EVALUATOR_T]] = None, summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None, metadata: Optional[dict] = None, - max_concurrency: Optional[int] = None, + max_concurrency: Optional[int] = 0, client: Optional[langsmith.Client] = None, load_nested: bool = False, blocking: bool = True, @@ -454,7 +460,9 @@ def evaluate_existing( summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): Optional sequence of evaluators to apply over the entire dataset. metadata (Optional[dict]): Optional metadata to include in the evaluation results. - max_concurrency (Optional[int]): Optional maximum number of concurrent evaluations. + max_concurrency (int | None): The maximum number of concurrent + evaluations to run. If None then no limit is set. If 0 then no concurrency. + Defaults to 0. client (Optional[langsmith.Client]): Optional Langsmith client to use for evaluation. load_nested: Whether to load all child runs for the experiment. Default is to only load the top-level root runs. @@ -1597,7 +1605,7 @@ def _score( (e.g. 
from a previous prediction step) """ with ls_utils.ContextThreadPoolExecutor( - max_workers=max_concurrency + max_workers=max_concurrency or 1 ) as executor: if max_concurrency == 0: context = copy_context() @@ -1815,14 +1823,24 @@ def _get_run(r: rt.RunTree) -> None: return _ForwardResults(run=cast(schemas.Run, run), example=example) +def _is_valid_uuid(value: str) -> bool: + try: + uuid.UUID(value) + return True + except ValueError: + return False + + def _resolve_data( data: DATA_T, *, client: langsmith.Client ) -> Iterable[schemas.Example]: """Return the examples for the given dataset.""" - if isinstance(data, str): - return client.list_examples(dataset_name=data) - elif isinstance(data, uuid.UUID): + if isinstance(data, uuid.UUID): return client.list_examples(dataset_id=data) + elif isinstance(data, str) and _is_valid_uuid(data): + return client.list_examples(dataset_id=uuid.UUID(data)) + elif isinstance(data, str): + return client.list_examples(dataset_name=data) elif isinstance(data, schemas.Dataset): return client.list_examples(dataset_id=data.id) return data diff --git a/python/poetry.lock b/python/poetry.lock index 598ceaaf5..77d4b5c08 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "annotated-types" @@ -11,9 +11,6 @@ files = [ {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] -[package.dependencies] -typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} - [[package]] name = "anyio" version = "4.5.2" @@ -2115,5 +2112,5 @@ vcr = [] [metadata] lock-version = "2.0" -python-versions = ">=3.8.1,<4.0" -content-hash = "5eb7654ca6ae40c61164f6da76b8b4bd6baf0ef0967c77251bd01efad9d7d320" +python-versions = ">=3.9,<4.0" +content-hash = "c7acc8c8f123bf7968b265a0f0cdd0b679d88559bfbff33488bff25bb4f54f0f" diff --git a/python/pyproject.toml b/python/pyproject.toml index fd0f7798e..315d26d72 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langsmith" -version = "0.1.147" +version = "0.2.0" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
authors = ["LangChain "] license = "MIT" @@ -25,7 +25,7 @@ packages = [{ include = "langsmith" }] langsmith = "langsmith.cli.main:main" [tool.poetry.dependencies] -python = ">=3.8.1,<4.0" +python = ">=3.9,<4.0" pydantic = [ { version = ">=1,<3", python = "<3.12.4" }, { version = "^2.7.4", python = ">=3.12.4" }, diff --git a/python/tests/evaluation/test_evaluation.py b/python/tests/evaluation/test_evaluation.py index b709550ab..2bf09c110 100644 --- a/python/tests/evaluation/test_evaluation.py +++ b/python/tests/evaluation/test_evaluation.py @@ -474,7 +474,7 @@ async def predict(inputs: dict): data=ds_name, ) - with pytest.raises(ValueError, match=match_val): + with pytest.raises(ValueError, match="Must specify 'data'"): await aevaluate( predict, data=[], diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index a6c60c549..132af656a 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -16,11 +16,8 @@ import pytest -from langsmith import evaluate +from langsmith import Client, aevaluate, evaluate from langsmith import schemas as ls_schemas -from langsmith.client import Client -from langsmith.evaluation._arunner import aevaluate, aevaluate_existing -from langsmith.evaluation._runner import evaluate_existing from langsmith.evaluation.evaluator import ( _normalize_comparison_evaluator_func, _normalize_evaluator_func, @@ -276,6 +273,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs): num_repetitions=NUM_REPETITIONS, blocking=blocking, upload_results=upload_results, + max_concurrency=None, ) if not blocking: deltas = [] @@ -327,7 +325,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs): def score_value(run, example): return {"score": 0.7} - ex_results = evaluate_existing( + ex_results = evaluate( fake_request.created_session["name"], evaluators=[score_value], client=client, @@ -549,6 +547,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs): num_repetitions=NUM_REPETITIONS, blocking=blocking, upload_results=upload_results, + max_concurrency=None, ) if not blocking: deltas = [] @@ -606,7 +605,7 @@ async def score_value(run, example): return {"score": 0.7} if upload_results: - ex_results = await aevaluate_existing( + ex_results = await aevaluate( fake_request.created_session["name"], evaluators=[score_value], client=client,