diff --git a/docs/evaluation/how_to_guides/summary.mdx b/docs/evaluation/how_to_guides/summary.mdx
index adedbe20..25116b7d 100644
--- a/docs/evaluation/how_to_guides/summary.mdx
+++ b/docs/evaluation/how_to_guides/summary.mdx
@@ -18,22 +18,20 @@ Below, we'll implement a very simple summary evaluator that computes overall pas
     python`
       def pass_50(outputs: list[dict], reference_outputs: list[dict]) -> bool:
           """Pass if >50% of all results are correct."""
-          correct = sum([out["output"] == ref["label"] for out, ref in zip(outputs, reference_outputs)])
+          correct = sum([out["class"] == ref["label"] for out, ref in zip(outputs, reference_outputs)])
           return correct / len(outputs) > 0.5
     `,
     typescript`
-      import { Run, Example } from "langsmith/schemas";
-
-      function summaryEval(runs: Run[], examples: Example[]) {
+      function summaryEval({ outputs, referenceOutputs }: { outputs: Record<string, any>[], referenceOutputs?: Record<string, any>[] }) {
         let correct = 0;
-        for (let i = 0; i < runs.length; i++) {
-          if (runs[i].outputs["output"] === examples[i].outputs["label"]) {
+        for (let i = 0; i < outputs.length; i++) {
+          if (outputs[i]["class"] === referenceOutputs[i]["label"]) {
             correct += 1;
           }
         }
-        return { key: "pass", score: correct / runs.length > 0.5 };
+        return { key: "pass", score: correct / outputs.length > 0.5 };
       }
     `,
   ]}
@@ -67,9 +65,30 @@ You can then pass this evaluator to the `evaluate` method as follows:
       )
     `,
     typescript`
-      await evaluate((inputs) => labelQuery(inputs["input"]), {
+
+      import { Client } from "langsmith";
+      import { evaluate } from "langsmith/evaluation";
+      import type { EvaluationResult } from "langsmith/evaluation";
+
+      const client = new Client();
+      const datasetName = "Toxic queries";
+      const dataset = await client.clonePublicDataset(
+        "https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d",
+        { datasetName: datasetName }
+      );
+
+      function correct({ outputs, referenceOutputs }: { outputs: Record<string, any>, referenceOutputs?: Record<string, any> }): EvaluationResult {
+        const score = outputs["class"] === referenceOutputs?.["label"];
+        return { key: "correct", score };
+      }
+
+      function badClassifier(inputs: Record<string, any>): { class: string } {
+        return { class: "Not toxic" };
+      }
+
+      await evaluate(badClassifier, {
         data: datasetName,
-        evaluators: [correctLabel],
+        evaluators: [correct],
         summaryEvaluators: [summaryEval],
         experimentPrefix: "Toxic Queries",
       });
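
Reviewer note: a minimal spot check of the new `summaryEval` signature, using made-up predictions and labels (not part of the docs change). It assumes the function as defined in the hunk above; with 2 of 3 predictions matching, the pass rate is 2/3 > 0.5, so `score` comes out `true`:

```typescript
// summaryEval exactly as in the updated docs snippet above.
function summaryEval({ outputs, referenceOutputs }: { outputs: Record<string, any>[], referenceOutputs?: Record<string, any>[] }) {
  let correct = 0;
  for (let i = 0; i < outputs.length; i++) {
    if (outputs[i]["class"] === referenceOutputs[i]["label"]) {
      correct += 1;
    }
  }
  return { key: "pass", score: correct / outputs.length > 0.5 };
}

// Hypothetical data for illustration only.
const outputs = [{ class: "Toxic" }, { class: "Not toxic" }, { class: "Toxic" }];
const referenceOutputs = [{ label: "Toxic" }, { label: "Toxic" }, { label: "Toxic" }];

console.log(summaryEval({ outputs, referenceOutputs }));
// -> { key: "pass", score: true } (2 of 3 correct; 2/3 > 0.5)
```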