Skip to content

Commit

Permalink
python[minor], js[patch]: release py0.2.0, js0.2.9 (#1247)
Browse files Browse the repository at this point in the history
v0.2
---------

Co-authored-by: Bagatur <[email protected]>
Co-authored-by: Bagatur <[email protected]>
  • Loading branch information
3 people authored Dec 5, 2024
1 parent 59a8c09 commit b2fd1c8
Show file tree
Hide file tree
Showing 13 changed files with 132 additions and 133 deletions.
2 changes: 1 addition & 1 deletion js/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "langsmith",
"version": "0.2.8",
"version": "0.2.9",
"description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.",
"packageManager": "[email protected]",
"files": [
Expand Down
70 changes: 39 additions & 31 deletions js/src/evaluation/_runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,17 @@ export type SummaryEvaluatorT =
| DeprecatedSyncSummaryEvaluator
| DeprecatedAsyncSummaryEvaluator
| ((args: {
runs?: Array<Run>;
examples?: Array<Example>;
inputs?: Array<Record<string, any>>;
outputs?: Array<Record<string, any>>;
runs: Array<Run>;
examples: Array<Example>;
inputs: Array<Record<string, any>>;
outputs: Array<Record<string, any>>;
referenceOutputs?: Array<Record<string, any>>;
}) => EvaluationResult | EvaluationResults)
| ((args: {
runs?: Array<Run>;
examples?: Array<Example>;
inputs?: Array<Record<string, any>>;
outputs?: Array<Record<string, any>>;
runs: Array<Run>;
examples: Array<Example>;
inputs: Array<Record<string, any>>;
outputs: Array<Record<string, any>>;
referenceOutputs?: Array<Record<string, any>>;
}) => Promise<EvaluationResult | EvaluationResults>);

Expand All @@ -93,17 +93,17 @@ export type EvaluatorT =
| DeprecatedFunctionEvaluator
| DeprecatedAsyncFunctionEvaluator
| ((args: {
run?: Run;
example?: Example;
inputs?: Record<string, any>;
outputs?: Record<string, any>;
run: Run;
example: Example;
inputs: Record<string, any>;
outputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => EvaluationResult | EvaluationResults)
| ((args: {
run?: Run;
example?: Example;
inputs?: Record<string, any>;
outputs?: Record<string, any>;
run: Run;
example: Example;
inputs: Record<string, any>;
outputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => Promise<EvaluationResult | EvaluationResults>);

Expand All @@ -130,11 +130,6 @@ interface _ExperimentManagerArgs {
}

type BaseEvaluateOptions = {
/**
* The dataset to evaluate on. Can be a dataset name, a list of
* examples, or a generator of examples.
*/
data: DataT;
/**
* Metadata to attach to the experiment.
* @default undefined
Expand Down Expand Up @@ -178,6 +173,11 @@ export interface EvaluateOptions extends BaseEvaluateOptions {
* @default undefined
*/
summaryEvaluators?: Array<SummaryEvaluatorT>;
/**
* The dataset to evaluate on. Can be a dataset name, a list of
* examples, or a generator of examples.
*/
data: DataT;
}

export interface ComparativeEvaluateOptions extends BaseEvaluateOptions {
Expand Down Expand Up @@ -934,8 +934,10 @@ async function _evaluate(
);

let manager = await new _ExperimentManager({
data: Array.isArray(fields.data) ? undefined : fields.data,
examples: Array.isArray(fields.data) ? fields.data : undefined,
data: Array.isArray(standardFields.data) ? undefined : standardFields.data,
examples: Array.isArray(standardFields.data)
? standardFields.data
: undefined,
client,
metadata: fields.metadata,
experiment: experiment_ ?? fields.experimentPrefix,
Expand Down Expand Up @@ -1063,10 +1065,12 @@ function _resolveData(
async function wrapSummaryEvaluators(
evaluators: SummaryEvaluatorT[],
optionsArray?: Partial<RunTreeConfig>[]
): Promise<SummaryEvaluatorT[]> {
): Promise<
Array<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator>
> {
async function _wrap(
evaluator: SummaryEvaluatorT
): Promise<SummaryEvaluatorT> {
): Promise<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator> {
const evalName = evaluator.name || "BatchEvaluator";

const wrapperInner = (
Expand All @@ -1087,10 +1091,10 @@ async function wrapSummaryEvaluators(
return Promise.resolve(
(
evaluator as (args: {
runs?: Run[];
examples?: Example[];
inputs?: Record<string, any>[];
outputs?: Record<string, any>[];
runs: Run[];
examples: Example[];
inputs: Record<string, any>[];
outputs: Record<string, any>[];
referenceOutputs?: Record<string, any>[];
}) => EvaluationResult | EvaluationResults
)({
Expand All @@ -1103,7 +1107,9 @@ async function wrapSummaryEvaluators(
);
}
// Otherwise use the traditional (runs, examples) signature
return Promise.resolve(evaluator(runs, examples));
return Promise.resolve(
(evaluator as DeprecatedSyncSummaryEvaluator)(runs, examples)
);
},
{ ...optionsArray, name: evalName }
);
Expand All @@ -1119,7 +1125,9 @@ async function wrapSummaryEvaluators(
return wrapperInner;
}

const results: SummaryEvaluatorT[] = [];
const results: Array<
DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator
> = [];
for (let i = 0; i < evaluators.length; i++) {
results.push(await _wrap(evaluators[i]));
}
Expand Down
2 changes: 1 addition & 1 deletion js/src/evaluation/evaluate_comparative.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ export type _ComparativeEvaluator = (args: {
runs: Run[];
example: Example;
inputs: Record<string, any>;
outputs?: Record<string, any>[];
outputs: Record<string, any>[];
referenceOutputs?: Record<string, any>;
}) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>;

Expand Down
21 changes: 13 additions & 8 deletions js/src/evaluation/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,18 +96,23 @@ export type RunEvaluatorLike =
example?: Example
) => Promise<EvaluationResult | EvaluationResults>)
| ((run: Run, example?: Example) => EvaluationResult | EvaluationResults)
| ((
run: Run,
example: Example
) => Promise<EvaluationResult | EvaluationResults>)
| ((run: Run, example: Example) => EvaluationResult | EvaluationResults)
| ((args: {
run?: Run;
example?: Example;
inputs?: Record<string, any>;
outputs?: Record<string, any>;
run: Run;
example: Example;
inputs: Record<string, any>;
outputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => EvaluationResult | EvaluationResults)
| ((args: {
run?: Run;
example?: Example;
inputs?: Record<string, any>;
outputs?: Record<string, any>;
run: Run;
example: Example;
inputs: Record<string, any>;
outputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => Promise<EvaluationResult | EvaluationResults>);

Expand Down
2 changes: 1 addition & 1 deletion js/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ export { RunTree, type RunTreeConfig } from "./run_trees.js";
export { overrideFetchImplementation } from "./singletons/fetch.js";

// Update using yarn bump-version
export const __version__ = "0.2.8";
export const __version__ = "0.2.9";
65 changes: 11 additions & 54 deletions js/src/tests/evaluate.int.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
EvaluationResults,
} from "../evaluation/evaluator.js";
import { evaluate } from "../evaluation/_runner.js";
import { waitUntilRunFound } from "./utils.js";
import { Example, Run, TracerSession } from "../schemas.js";
import { Client } from "../index.js";
import { afterAll, beforeAll } from "@jest/globals";
Expand Down Expand Up @@ -1115,6 +1116,8 @@ test("evaluate handles partial summary evaluator parameters correctly", async ()
});

test("evaluate handles comparative target with ComparativeEvaluateOptions", async () => {
const client = new Client();

// First, create two experiments to compare
const targetFunc1 = (input: Record<string, any>) => {
return {
Expand All @@ -1139,13 +1142,18 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn
description: "Second experiment for comparison",
});

await Promise.all(
[exp1, exp2].flatMap(({ results }) =>
results.flatMap(({ run }) => waitUntilRunFound(client, run.id))
)
);
// Create comparative evaluator
const comparativeEvaluator = ({
runs,
example,
}: {
runs?: Run[];
example?: Example;
runs: Run[];
example: Example;
}) => {
if (!runs || !example) throw new Error("Missing required parameters");

Expand All @@ -1167,7 +1175,6 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn
const compareRes = await evaluate(
[exp1.experimentName, exp2.experimentName],
{
data: TESTING_DATASET_NAME,
evaluators: [comparativeEvaluator],
description: "Comparative evaluation test",
randomizeOrder: true,
Expand All @@ -1177,6 +1184,7 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", asyn

// Verify we got ComparisonEvaluationResults
expect(compareRes.experimentName).toBeDefined();
expect(compareRes.experimentName).toBeDefined();
expect(compareRes.results).toBeDefined();
expect(Array.isArray(compareRes.results)).toBe(true);

Expand Down Expand Up @@ -1212,59 +1220,8 @@ test("evaluate enforces correct evaluator types for comparative evaluation at ru
await expect(
// @ts-expect-error - Should error because standardEvaluator is not a ComparativeEvaluator
evaluate([exp1.experimentName, exp2.experimentName], {
data: TESTING_DATASET_NAME,
evaluators: [standardEvaluator],
description: "Should fail at runtime",
})
).rejects.toThrow(); // You might want to be more specific about the error message
});

test("evaluate comparative options includes comparative-specific fields", async () => {
const exp1 = await evaluate(
(input: Record<string, any>) => ({ foo: input.input + 1 }),
{
data: TESTING_DATASET_NAME,
}
);

const exp2 = await evaluate(
(input: Record<string, any>) => ({ foo: input.input + 2 }),
{
data: TESTING_DATASET_NAME,
}
);

const comparativeEvaluator = ({
runs,
example,
}: {
runs?: Run[];
example?: Example;
}) => {
if (!runs || !example) throw new Error("Missing required parameters");
return {
key: "comparative_score",
scores: Object.fromEntries(
runs.map((run) => [
run.id,
run.outputs?.foo === example.outputs?.output ? 1 : 0,
])
),
};
};

// Test that comparative-specific options work
const compareRes = await evaluate(
[exp1.experimentName, exp2.experimentName],
{
data: TESTING_DATASET_NAME,
evaluators: [comparativeEvaluator],
randomizeOrder: true, // Comparative-specific option
loadNested: true, // Comparative-specific option
description: "Testing comparative-specific options",
}
);

expect(compareRes.experimentName).toBeDefined();
expect(compareRes.results).toBeDefined();
});
21 changes: 14 additions & 7 deletions python/langsmith/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5842,7 +5842,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[EXPERIMENT_T] = None,
Expand All @@ -5861,7 +5861,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[EXPERIMENT_T] = None,
Expand All @@ -5883,7 +5883,7 @@ def evaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[EXPERIMENT_T] = None,
Expand Down Expand Up @@ -5911,7 +5911,8 @@ def evaluate(
Defaults to None.
description (str | None): A free-form text description for the experiment.
max_concurrency (int | None): The maximum number of concurrent
evaluations to run. Defaults to None (max number of workers).
evaluations to run. If None then no limit is set. If 0 then no concurrency.
Defaults to 0.
blocking (bool): Whether to block until the evaluation is complete.
Defaults to True.
num_repetitions (int): The number of times to run the evaluation.
Expand Down Expand Up @@ -6053,6 +6054,8 @@ def evaluate(
... summary_evaluators=[precision],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
.. versionadded:: 0.2.0
""" # noqa: E501
from langsmith.evaluation._runner import evaluate as evaluate_

Expand Down Expand Up @@ -6094,7 +6097,7 @@ async def aevaluate(
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
Expand All @@ -6119,8 +6122,9 @@ async def aevaluate(
experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
Defaults to None.
description (Optional[str]): A description of the experiment.
max_concurrency (Optional[int]): The maximum number of concurrent
evaluations to run. Defaults to None.
max_concurrency (int | None): The maximum number of concurrent
evaluations to run. If None then no limit is set. If 0 then no concurrency.
Defaults to 0.
num_repetitions (int): The number of times to run the evaluation.
Each item in the dataset will be run and evaluated this many times.
Defaults to 1.
Expand Down Expand Up @@ -6259,6 +6263,9 @@ async def aevaluate(
... )
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
.. versionadded:: 0.2.0
""" # noqa: E501
from langsmith.evaluation._arunner import aevaluate as aevaluate_

Expand Down
Loading

0 comments on commit b2fd1c8

Please sign in to comment.