Skip to content

Commit

Permalink
draft
Browse files Browse the repository at this point in the history
  • Loading branch information
isahers1 committed Dec 20, 2024
1 parent 7f7d7af commit 0dfce67
Show file tree
Hide file tree
Showing 4 changed files with 208 additions and 107 deletions.
92 changes: 65 additions & 27 deletions js/src/evaluation/_runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ import { Client, RunTree, RunTreeConfig } from "../index.js";
import {
AttachmentInfo,
BaseRun,
EvaluationResult,
EvaluationResults,
Example,
ExperimentResultRow,
KVMap,
Run,
TracerSession,
Expand All @@ -15,12 +18,7 @@ import { atee } from "../utils/atee.js";
import { getLangChainEnvVarsMetadata } from "../utils/env.js";
import { printErrorStackTrace } from "../utils/error.js";
import { randomName } from "./_random_name.js";
import {
EvaluationResult,
EvaluationResults,
RunEvaluator,
runEvaluator,
} from "./evaluator.js";
import { RunEvaluator, runEvaluator } from "./evaluator.js";
import { LangSmithConflictError } from "../utils/error.js";
import { v4 as uuidv4 } from "uuid";
import {
Expand Down Expand Up @@ -64,6 +62,18 @@ type DeprecatedAsyncSummaryEvaluator = (
examples: Array<Example>
) => Promise<EvaluationResult | EvaluationResults>;

/**
 * Positional-argument summary evaluator that produces its result synchronously.
 *
 * In addition to the runs and examples, it receives the per-row experiment
 * results as a third argument.
 */
type SyncSummaryEvaluator = (
  runs: Run[],
  examples: Example[],
  evaluationResults: ExperimentResultRow[]
) => EvaluationResult | EvaluationResults;

/**
 * Positional-argument summary evaluator that resolves its result
 * asynchronously.
 *
 * Takes the same three arguments as the synchronous variant: the runs, the
 * examples, and the per-row experiment results.
 */
type AsyncSummaryEvaluator = (
  runs: Run[],
  examples: Example[],
  evaluationResults: ExperimentResultRow[]
) => Promise<EvaluationResult | EvaluationResults>;

// Summary evaluator runs over the whole dataset
export type SummaryEvaluatorT =
| DeprecatedSyncSummaryEvaluator
Expand All @@ -74,13 +84,15 @@ export type SummaryEvaluatorT =
inputs: Array<Record<string, any>>;
outputs: Array<Record<string, any>>;
referenceOutputs?: Array<Record<string, any>>;
evaluationResults?: Array<ExperimentResultRow>;
}) => EvaluationResult | EvaluationResults)
| ((args: {
runs: Array<Run>;
examples: Array<Example>;
inputs: Array<Record<string, any>>;
outputs: Array<Record<string, any>>;
referenceOutputs?: Array<Record<string, any>>;
evaluationResults?: Array<ExperimentResultRow>;
}) => Promise<EvaluationResult | EvaluationResults>);

/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
Expand Down Expand Up @@ -133,13 +145,17 @@ interface _ExperimentManagerArgs {
runs?: AsyncGenerator<Run>;
evaluationResults?: AsyncGenerator<EvaluationResults>;
summaryResults?: AsyncGenerator<
(runsArray: Run[]) => AsyncGenerator<EvaluationResults, any, unknown>,
(
runsArray: Run[],
evaluationResults: ExperimentResultRow[]
) => AsyncGenerator<EvaluationResults, any, unknown>,
any,
unknown
>;
examples?: Example[];
numRepetitions?: number;
_runsArray?: Run[];
_evaluationResultsArray?: ExperimentResultRow[];
includeAttachments?: boolean;
}

Expand Down Expand Up @@ -235,12 +251,6 @@ export function evaluate(
return _evaluate(target, options);
}

/**
 * One row of experiment output: a run, the example it is paired with, and
 * the evaluation results collected for that pair.
 */
export interface ExperimentResultRow {
  /** The run for this row. */
  run: Run;
  /** The example paired with the run. */
  example: Example;
  /** Evaluation results gathered for this run/example pair. */
  evaluationResults: EvaluationResults;
}

/**
* Manage the execution of experiments.
*
Expand All @@ -255,7 +265,10 @@ export class _ExperimentManager {
_evaluationResults?: AsyncGenerator<EvaluationResults>;

_summaryResults?: AsyncGenerator<
(runsArray: Run[]) => AsyncGenerator<EvaluationResults, any, unknown>,
(
runsArray: Run[],
evaluationResults: ExperimentResultRow[]
) => AsyncGenerator<EvaluationResults, any, unknown>,
any,
unknown
>;
Expand All @@ -266,6 +279,8 @@ export class _ExperimentManager {

_runsArray?: Run[];

_evaluationResultsArray?: ExperimentResultRow[];

client: Client;

_experiment?: TracerSession;
Expand Down Expand Up @@ -558,6 +573,7 @@ export class _ExperimentManager {
client: this.client,
runs: this.runs,
_runsArray: this._runsArray,
_evaluationResultsArray: this._evaluationResultsArray,
evaluationResults: this._evaluationResults,
summaryResults: aggregateFeedbackGen,
includeAttachments: this._includeAttachments,
Expand All @@ -578,7 +594,15 @@ export class _ExperimentManager {
for await (const evaluationResult of this.evaluationResults) {
evaluationResults.push(evaluationResult);
}
if (!this._evaluationResultsArray) {
this._evaluationResultsArray = [];
}
for (let i = 0; i < this._runsArray.length; i++) {
this._evaluationResultsArray.push({
run: this._runsArray[i],
example: examples[i],
evaluationResults: evaluationResults[i],
});
yield {
run: this._runsArray[i],
example: examples[i],
Expand All @@ -598,7 +622,8 @@ export class _ExperimentManager {
// This is because runs array is not available until after this generator
// is set, so we need to pass it like so.
for await (const evaluationResults of evaluationResultsGenerator(
this._runsArray ?? []
this._runsArray ?? [],
this._evaluationResultsArray ?? []
)) {
results.push(...evaluationResults.results);
}
Expand Down Expand Up @@ -752,7 +777,12 @@ export class _ExperimentManager {

async *_applySummaryEvaluators(
summaryEvaluators: Array<SummaryEvaluatorT>
): AsyncGenerator<(runsArray: Run[]) => AsyncGenerator<EvaluationResults>> {
): AsyncGenerator<
(
runsArray: Run[],
evaluationResults: ExperimentResultRow[]
) => AsyncGenerator<EvaluationResults>
> {
const projectId = this._getExperiment().id;
const examples = await this.getExamples();

Expand All @@ -770,13 +800,18 @@ export class _ExperimentManager {

yield async function* (
this: _ExperimentManager,
runsArray: Run[]
runsArray: Run[],
evaluationResults: ExperimentResultRow[]
): AsyncGenerator<EvaluationResults> {
const aggregateFeedback = [];

for (const evaluator of wrappedEvaluators) {
try {
const summaryEvalResult = await evaluator(runsArray, examples);
const summaryEvalResult = await evaluator(
runsArray,
examples,
evaluationResults
);

const flattenedResults =
this.client._selectEvalResults(summaryEvalResult);
Expand Down Expand Up @@ -1114,17 +1149,16 @@ function _resolveData(
async function wrapSummaryEvaluators(
evaluators: SummaryEvaluatorT[],
optionsArray?: Partial<RunTreeConfig>[]
): Promise<
Array<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator>
> {
): Promise<Array<AsyncSummaryEvaluator | SyncSummaryEvaluator>> {
async function _wrap(
evaluator: SummaryEvaluatorT
): Promise<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator> {
): Promise<AsyncSummaryEvaluator | SyncSummaryEvaluator> {
const evalName = evaluator.name || "BatchEvaluator";

const wrapperInner = (
runs: Run[],
examples: Example[]
examples: Example[],
evaluationResults: ExperimentResultRow[]
): Promise<EvaluationResult | EvaluationResults> => {
const wrapperSuperInner = traceable(
(
Expand All @@ -1145,19 +1179,25 @@ async function wrapSummaryEvaluators(
inputs: Record<string, any>[];
outputs: Record<string, any>[];
referenceOutputs?: Record<string, any>[];
evaluationResults?: ExperimentResultRow[];
}) => EvaluationResult | EvaluationResults
)({
runs,
examples,
inputs,
outputs,
referenceOutputs,
evaluationResults,
})
);
}
// Otherwise use the positional (runs, examples, evaluationResults) signature;
// legacy two-argument evaluators simply ignore the extra argument
return Promise.resolve(
(evaluator as DeprecatedSyncSummaryEvaluator)(runs, examples)
(evaluator as SyncSummaryEvaluator)(
runs,
examples,
evaluationResults
)
);
},
{ ...optionsArray, name: evalName }
Expand All @@ -1174,9 +1214,7 @@ async function wrapSummaryEvaluators(
return wrapperInner;
}

const results: Array<
DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator
> = [];
const results: Array<AsyncSummaryEvaluator | SyncSummaryEvaluator> = [];
for (let i = 0; i < evaluators.length; i++) {
results.push(await _wrap(evaluators[i]));
}
Expand Down
78 changes: 2 additions & 76 deletions js/src/evaluation/evaluator.ts
Original file line number Diff line number Diff line change
@@ -1,87 +1,13 @@
import {
EvaluationResult,
EvaluationResults,
Example,
FeedbackConfig,
Run,
ScoreType,
ValueType,
} from "../schemas.js";
import { v4 as uuidv4 } from "uuid";
import { TraceableFunction, traceable } from "../traceable.js";
import { RunTreeConfig } from "../run_trees.js";

/**
 * A single categorical class: a required label plus an optional numeric value.
 */
export type Category = {
  /** The value of the category (optional). */
  value?: number;
  /** The label of the category. */
  label: string;
};

/**
 * The outcome of a single evaluation, identified by its `key`.
 * Every field other than `key` is optional.
 */
export type EvaluationResult = {
  /** Key that identifies this evaluation result. */
  key: string;
  /** Score assigned by the evaluation. */
  score?: ScoreType;
  /** Value assigned by the evaluation. */
  value?: ValueType;
  /** Free-form comment attached to the result. */
  comment?: string;
  /** Correction record attached to the result. */
  correction?: Record<string, unknown>;
  /** Information about the evaluator that produced the result. */
  evaluatorInfo?: Record<string, unknown>;
  /**
   * ID of the source run for this result.
   * When set, the UI offers a link to the source run.
   */
  sourceRunId?: string;
  /**
   * ID of the run this result targets.
   * When unset, the target is assumed to be the root of the trace.
   */
  targetRunId?: string;
  /**
   * Feedback config for this result.
   * When set, it defines how the feedback key should be interpreted.
   */
  feedbackConfig?: FeedbackConfig;
};

/**
 * Container for a batch of evaluation results, for evaluators that
 * want to report more than one score.
 */
export type EvaluationResults = {
  /** The individual evaluation results. */
  results: EvaluationResult[];
};

export interface RunEvaluator {
evaluateRun(
run: Run,
Expand Down
Loading

0 comments on commit 0dfce67

Please sign in to comment.