diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts
index 6c74afa34..5ff1e48d2 100644
--- a/js/src/evaluation/_runner.ts
+++ b/js/src/evaluation/_runner.ts
@@ -2,7 +2,10 @@ import { Client, RunTree, RunTreeConfig } from "../index.js";
 import {
   AttachmentInfo,
   BaseRun,
+  EvaluationResult,
+  EvaluationResults,
   Example,
+  ExperimentResultRow,
   KVMap,
   Run,
   TracerSession,
@@ -15,12 +18,7 @@ import { atee } from "../utils/atee.js";
 import { getLangChainEnvVarsMetadata } from "../utils/env.js";
 import { printErrorStackTrace } from "../utils/error.js";
 import { randomName } from "./_random_name.js";
-import {
-  EvaluationResult,
-  EvaluationResults,
-  RunEvaluator,
-  runEvaluator,
-} from "./evaluator.js";
+import { RunEvaluator, runEvaluator } from "./evaluator.js";
 import { LangSmithConflictError } from "../utils/error.js";
 import { v4 as uuidv4 } from "uuid";
 import {
@@ -64,6 +62,18 @@ type DeprecatedAsyncSummaryEvaluator = (
   examples: Array<Example>
 ) => Promise<EvaluationResult | EvaluationResults>;
 
+type SyncSummaryEvaluator = (
+  runs: Array<Run>,
+  examples: Array<Example>,
+  evaluationResults: Array<ExperimentResultRow>
+) => EvaluationResult | EvaluationResults;
+
+type AsyncSummaryEvaluator = (
+  runs: Array<Run>,
+  examples: Array<Example>,
+  evaluationResults: Array<ExperimentResultRow>
+) => Promise<EvaluationResult | EvaluationResults>;
+
 // Summary evaluator runs over the whole dataset
 export type SummaryEvaluatorT =
   | DeprecatedSyncSummaryEvaluator
@@ -74,6 +84,7 @@ export type SummaryEvaluatorT =
       inputs: Array<Record<string, any>>;
       outputs: Array<Record<string, any>>;
       referenceOutputs?: Array<Record<string, any>>;
+      evaluationResults?: Array<ExperimentResultRow>;
     }) => EvaluationResult | EvaluationResults)
   | ((args: {
       runs: Array<Run>;
@@ -81,6 +92,7 @@ export type SummaryEvaluatorT =
      inputs: Array<Record<string, any>>;
       outputs: Array<Record<string, any>>;
       referenceOutputs?: Array<Record<string, any>>;
+      evaluationResults?: Array<ExperimentResultRow>;
     }) => Promise<EvaluationResult | EvaluationResults>);
 
 /** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
@@ -133,13 +145,17 @@ interface _ExperimentManagerArgs {
   runs?: AsyncGenerator<Run>;
   evaluationResults?: AsyncGenerator<EvaluationResults>;
   summaryResults?: AsyncGenerator<
-    (runsArray: Run[]) => AsyncGenerator<EvaluationResults>,
+    (
+      runsArray: Run[],
+      evaluationResults: ExperimentResultRow[]
+    ) => AsyncGenerator<EvaluationResults>,
     any,
     unknown
   >;
   examples?: Example[];
   numRepetitions?: number;
   _runsArray?: Run[];
+  _evaluationResultsArray?: ExperimentResultRow[];
   includeAttachments?: boolean;
 }
 
@@ -235,12 +251,6 @@ export function evaluate(
   return _evaluate(target, options);
 }
 
-export interface ExperimentResultRow {
-  run: Run;
-  example: Example;
-  evaluationResults: EvaluationResults;
-}
-
 /**
  * Manage the execution of experiments.
 *
@@ -255,7 +265,10 @@ export class _ExperimentManager {
   _evaluationResults?: AsyncGenerator<EvaluationResults>;
 
   _summaryResults?: AsyncGenerator<
-    (runsArray: Run[]) => AsyncGenerator<EvaluationResults>,
+    (
+      runsArray: Run[],
+      evaluationResults: ExperimentResultRow[]
+    ) => AsyncGenerator<EvaluationResults>,
     any,
     unknown
   >;
@@ -266,6 +279,8 @@ export class _ExperimentManager {
 
   _runsArray?: Run[];
 
+  _evaluationResultsArray?: ExperimentResultRow[];
+
   client: Client;
 
   _experiment?: TracerSession;
@@ -558,6 +573,7 @@ export class _ExperimentManager {
       client: this.client,
       runs: this.runs,
       _runsArray: this._runsArray,
+      _evaluationResultsArray: this._evaluationResultsArray,
       evaluationResults: this._evaluationResults,
       summaryResults: aggregateFeedbackGen,
       includeAttachments: this._includeAttachments,
@@ -578,7 +594,15 @@ export class _ExperimentManager {
     for await (const evaluationResult of this.evaluationResults) {
       evaluationResults.push(evaluationResult);
     }
+    if (!this._evaluationResultsArray) {
+      this._evaluationResultsArray = [];
+    }
     for (let i = 0; i < this._runsArray.length; i++) {
+      this._evaluationResultsArray.push({
+        run: this._runsArray[i],
+        example: examples[i],
+        evaluationResults: evaluationResults[i],
+      });
       yield {
         run: this._runsArray[i],
         example: examples[i],
@@ -598,7 +622,8 @@ export class _ExperimentManager {
       // This is because runs array is not available until after this generator
       // is set, so we need to pass it like so.
       for await (const evaluationResults of evaluationResultsGenerator(
-        this._runsArray ?? []
+        this._runsArray ?? [],
+        this._evaluationResultsArray ?? []
       )) {
         results.push(...evaluationResults.results);
       }
@@ -752,7 +777,12 @@ export class _ExperimentManager {
 
   async *_applySummaryEvaluators(
     summaryEvaluators: Array<SummaryEvaluatorT>
-  ): AsyncGenerator<(runsArray: Run[]) => AsyncGenerator<EvaluationResults>> {
+  ): AsyncGenerator<
+    (
+      runsArray: Run[],
+      evaluationResults: ExperimentResultRow[]
+    ) => AsyncGenerator<EvaluationResults>
+  > {
     const projectId = this._getExperiment().id;
     const examples = await this.getExamples();
 
@@ -770,13 +800,18 @@
 
     yield async function* (
       this: _ExperimentManager,
-      runsArray: Run[]
+      runsArray: Run[],
+      evaluationResults: ExperimentResultRow[]
    ): AsyncGenerator<EvaluationResults> {
       const aggregateFeedback = [];
 
       for (const evaluator of wrappedEvaluators) {
         try {
-          const summaryEvalResult = await evaluator(runsArray, examples);
+          const summaryEvalResult = await evaluator(
+            runsArray,
+            examples,
+            evaluationResults
+          );
           const flattenedResults =
             this.client._selectEvalResults(summaryEvalResult);
 
@@ -1114,17 +1149,16 @@ function _resolveData(
 async function wrapSummaryEvaluators(
   evaluators: SummaryEvaluatorT[],
   optionsArray?: Partial<RunTreeConfig>[]
-): Promise<
-  Array<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator>
-> {
+): Promise<Array<AsyncSummaryEvaluator>> {
   async function _wrap(
     evaluator: SummaryEvaluatorT
-  ): Promise<DeprecatedAsyncSummaryEvaluator> {
+  ): Promise<AsyncSummaryEvaluator> {
     const evalName = evaluator.name || "BatchEvaluator";
 
     const wrapperInner = (
       runs: Run[],
-      examples: Example[]
+      examples: Example[],
+      evaluationResults: ExperimentResultRow[]
     ): Promise<EvaluationResult | EvaluationResults> => {
       const wrapperSuperInner = traceable(
         (
@@ -1145,6 +1179,7 @@ async function wrapSummaryEvaluators(
                   inputs: Record<string, any>[];
                   outputs: Record<string, any>[];
                   referenceOutputs?: Record<string, any>[];
+                  evaluationResults?: ExperimentResultRow[];
                 }) => EvaluationResult | EvaluationResults
               )({
                 runs,
@@ -1152,12 +1187,17 @@ async function wrapSummaryEvaluators(
                 inputs,
                 outputs,
                 referenceOutputs,
+                evaluationResults,
               })
             );
           }
           // Otherwise use the traditional (runs, examples) signature
           return Promise.resolve(
-            (evaluator as DeprecatedSyncSummaryEvaluator)(runs, examples)
+            (evaluator as SyncSummaryEvaluator)(
+              runs,
+              examples,
+              evaluationResults
+            )
           );
         },
         { ...optionsArray, name: evalName }
@@ -1174,9 +1214,7 @@
     return wrapperInner;
   }
 
-  const results: Array<
-    DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator
-  > = [];
+  const results: Array<AsyncSummaryEvaluator> = [];
   for (let i = 0; i < evaluators.length; i++) {
     results.push(await _wrap(evaluators[i]));
   }
diff --git a/js/src/evaluation/evaluator.ts b/js/src/evaluation/evaluator.ts
index cd7ce1fdd..02af85635 100644
--- a/js/src/evaluation/evaluator.ts
+++ b/js/src/evaluation/evaluator.ts
@@ -1,87 +1,13 @@
 import {
+  EvaluationResult,
+  EvaluationResults,
   Example,
-  FeedbackConfig,
   Run,
-  ScoreType,
-  ValueType,
 } from "../schemas.js";
 import { v4 as uuidv4 } from "uuid";
 import { TraceableFunction, traceable } from "../traceable.js";
 import { RunTreeConfig } from "../run_trees.js";
 
-/**
- * Represents a categorical class.
- */
-export type Category = {
-  /**
-   * The value of the category.
-   */
-  value?: number;
-  /**
-   * The label of the category.
-   */
-  label: string;
-};
-
-/**
- * Represents the result of an evaluation.
- */
-export type EvaluationResult = {
-  /**
-   * The key associated with the evaluation result.
-   */
-  key: string;
-  /**
-   * The score of the evaluation result.
-   */
-  score?: ScoreType;
-  /**
-   * The value of the evaluation result.
-   */
-  value?: ValueType;
-  /**
-   * A comment associated with the evaluation result.
-   */
-  comment?: string;
-  /**
-   * A correction record associated with the evaluation result.
-   */
-  correction?: Record<string, unknown>;
-  /**
-   * Information about the evaluator.
-   */
-  evaluatorInfo?: Record<string, unknown>;
-  /**
-   * The source run ID of the evaluation result.
-   * If set, a link to the source run will be available in the UI.
-   */
-  sourceRunId?: string;
-  /**
-   * The target run ID of the evaluation result.
-   * If this is not set, the target run ID is assumed to be
-   * the root of the trace.
-   */
-  targetRunId?: string;
-
-  /**
-   * The feedback config associated with the evaluation result.
-   * If set, this will be used to define how a feedback key
-   * should be interpreted.
-   */
-  feedbackConfig?: FeedbackConfig;
-};
-
-/**
- * Batch evaluation results, if your evaluator wishes
- * to return multiple scores.
- */
-export type EvaluationResults = {
-  /**
-   * The evaluation results.
-   */
-  results: Array<EvaluationResult>;
-};
-
 export interface RunEvaluator {
   evaluateRun(
     run: Run,
diff --git a/js/src/schemas.ts b/js/src/schemas.ts
index 9fe4e8e16..8ae50ec06 100644
--- a/js/src/schemas.ts
+++ b/js/src/schemas.ts
@@ -650,3 +650,82 @@ export type UsageMetadata = {
    */
   output_token_details?: OutputTokenDetails;
 };
+
+/**
+ * Represents a categorical class.
+ */
+export type Category = {
+  /**
+   * The value of the category.
+   */
+  value?: number;
+  /**
+   * The label of the category.
+   */
+  label: string;
+};
+
+/**
+ * Represents the result of an evaluation.
+ */
+export type EvaluationResult = {
+  /**
+   * The key associated with the evaluation result.
+   */
+  key: string;
+  /**
+   * The score of the evaluation result.
+   */
+  score?: ScoreType;
+  /**
+   * The value of the evaluation result.
+   */
+  value?: ValueType;
+  /**
+   * A comment associated with the evaluation result.
+   */
+  comment?: string;
+  /**
+   * A correction record associated with the evaluation result.
+   */
+  correction?: Record<string, unknown>;
+  /**
+   * Information about the evaluator.
+   */
+  evaluatorInfo?: Record<string, unknown>;
+  /**
+   * The source run ID of the evaluation result.
+   * If set, a link to the source run will be available in the UI.
+   */
+  sourceRunId?: string;
+  /**
+   * The target run ID of the evaluation result.
+   * If this is not set, the target run ID is assumed to be
+   * the root of the trace.
+   */
+  targetRunId?: string;
+
+  /**
+   * The feedback config associated with the evaluation result.
+   * If set, this will be used to define how a feedback key
+   * should be interpreted.
+   */
+  feedbackConfig?: FeedbackConfig;
+};
+
+/**
+ * Batch evaluation results, if your evaluator wishes
+ * to return multiple scores.
+ */
+export type EvaluationResults = {
+  /**
+   * The evaluation results.
+   */
+  results: Array<EvaluationResult>;
+};
+
+export interface ExperimentResultRow {
+  run: Run;
+  example: Example;
+  evaluationResults: EvaluationResults;
+}
diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts
index ae281b561..04e3ebf81 100644
--- a/js/src/tests/evaluate.int.test.ts
+++ b/js/src/tests/evaluate.int.test.ts
@@ -1,10 +1,13 @@
+import { evaluate } from "../evaluation/_runner.js";
+import { waitUntilRunFound } from "./utils.js";
 import {
   EvaluationResult,
   EvaluationResults,
-} from "../evaluation/evaluator.js";
-import { evaluate } from "../evaluation/_runner.js";
-import { waitUntilRunFound } from "./utils.js";
-import { Example, Run, TracerSession } from "../schemas.js";
+  Example,
+  ExperimentResultRow,
+  Run,
+  TracerSession,
+} from "../schemas.js";
 import { Client } from "../index.js";
 import { afterAll, beforeAll } from "@jest/globals";
 import { RunnableLambda, RunnableSequence } from "@langchain/core/runnables";
@@ -1225,3 +1228,58 @@ test("evaluate enforces correct evaluator types for comparative evaluation at ru
     })
   ).rejects.toThrow(); // You might want to be more specific about the error message
 });
+
+test("summary evaluators receive evaluator results", async () => {
+  const client = new Client();
+
+  async function target(
+    inputs: Record<string, any>
+  ): Promise<{ answer: string }> {
+    return { answer: inputs?.input.toString() };
+  }
+
+  // Define evaluator
+  const evaluator = () => {
+    return {
+      key: "foo",
+      score: 1,
+    };
+  };
+
+  // Define summary evaluator
+  const summaryEvaluator = ({
+    runs,
+    examples,
+    inputs,
+    outputs,
+    referenceOutputs,
+    evaluationResults,
+  }: {
+    runs: Array<Run>;
+    examples: Array<Example>;
+    inputs: Array<Record<string, any>>;
+    outputs: Array<Record<string, any>>;
+    referenceOutputs?: Array<Record<string, any>>;
+    evaluationResults?: Array<ExperimentResultRow>;
+  }): EvaluationResult => {
+    expect(evaluationResults?.length).toBe(2);
+    expect(evaluationResults?.[0].evaluationResults.results[0].key).toBe("foo");
+    expect(evaluationResults?.[0].evaluationResults.results[0].score).toBe(1);
+    return {
+      key: "summary_evaluator",
+      score: 1,
+    };
+  };
+
+  // Run evaluation
+  const results = await evaluate(target, {
+    data: TESTING_DATASET_NAME,
+    evaluators: [evaluator],
+    summaryEvaluators: [summaryEvaluator],
+    numRepetitions: 1,
+    client: client,
+  });
+  expect(results.summaryResults.results.length).toBe(1);
+  expect(results.summaryResults.results[0].score).toBe(1);
+  expect(results.summaryResults.results[0].key).toBe("summary_evaluator");
+});
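
Usage note (not part of the patch): with this change, a summary evaluator can aggregate the per-example feedback that the row-level evaluators already produced, via the new `evaluationResults` argument, instead of re-scoring the raw runs. Below is a minimal sketch under stated assumptions: the published `langsmith` package entrypoints `langsmith/evaluation` and `langsmith/schemas` re-export the symbols touched here (the test above uses the in-repo relative paths), the dataset name and evaluator keys are hypothetical, and the snippet runs in an ESM module where top-level await is allowed.

import { evaluate } from "langsmith/evaluation";
import type { EvaluationResult, ExperimentResultRow } from "langsmith/schemas";

// Row-level evaluator (object-parameter form): scores each run against
// its reference output.
const exactMatch = ({
  outputs,
  referenceOutputs,
}: {
  outputs?: Record<string, any>;
  referenceOutputs?: Record<string, any>;
}): EvaluationResult => ({
  key: "exact_match",
  score: outputs?.answer === referenceOutputs?.answer ? 1 : 0,
});

// Summary evaluator: reads the per-row feedback from `evaluationResults`
// (one ExperimentResultRow per run/example pair) rather than recomputing
// scores from the raw runs.
const passRate = ({
  evaluationResults,
}: {
  evaluationResults?: Array<ExperimentResultRow>;
}): EvaluationResult => {
  const rows = evaluationResults ?? [];
  const passed = rows.filter((row) =>
    row.evaluationResults.results.some(
      (r) => r.key === "exact_match" && r.score === 1
    )
  ).length;
  return { key: "pass_rate", score: rows.length ? passed / rows.length : 0 };
};

await evaluate(
  async (inputs: Record<string, any>) => ({ answer: inputs.answer }),
  {
    data: "my-dataset", // hypothetical dataset name
    evaluators: [exactMatch],
    summaryEvaluators: [passRate],
  }
);

Because `evaluationResults` is an optional property of the object-parameter signature, existing summary evaluators keep working unchanged; only evaluators that destructure the new field see the per-row results.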