diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index bc021428..dfd7fc2e 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -77,4 +77,4 @@ list 5 concrete questions that should be investigated to determine if the idea i ## Related - [Run an evaluation (synchronously)](../../how_to_guides/evaluation/evaluate_llm_application) -- [Handle model rate limits](./how_to_guides/evaluation/rate_limiting) +- [Handle model rate limits](../../how_to_guides/evaluation/rate_limiting) diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx index 0db42b13..475cbf8f 100644 --- a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx @@ -85,7 +85,7 @@ Currently Python only =0.1.145`"})` from langsmith import evaluate, wrappers from openai import AsyncOpenAI # Assumes you've installed pydantic. 
diff --git a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx index 2f6d6655..c8a0b8f7 100644 --- a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx +++ b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx @@ -23,8 +23,8 @@ For maximal control of evaluator logic, we can write a custom evaluator and run =0.1.145`"})` + from langsmith import evaluate, traceable, wrappers, Client from openai import OpenAI # Assumes you've installed pydantic from pydantic import BaseModel @@ -50,34 +50,30 @@ for the answer is logically valid and consistent with question and the answer.\\ messages=[{"role": "system", "content": instructions,}, {"role": "user", "content": msg}], response_format=Response ) - return response.choices[0].messages.parsed.reasoning_is_valid + return response.choices[0].message.parsed.reasoning_is_valid # Optionally add the 'traceable' decorator to trace the inputs/outputs of this function. @traceable def dummy_app(inputs: dict) -> dict: return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"} + ls_client = Client() + questions = ["how will the universe end", "are we alone"] + dataset = ls_client.create_dataset("big questions") + ls_client.create_examples(dataset_id=dataset.id, inputs=[{"question": q} for q in questions]) + results = evaluate( dummy_app, - data="dataset_name", + data=dataset, evaluators=[valid_reasoning] ) `, - typescript` - import type { EvaluationResult } from "langsmith/evaluation"; - import type { Run, Example } from "langsmith/schemas"; - - `, ]} /> See [here](../../how_to_guides/evaluation/custom_evaluator) for more on how to write a custom evaluator. -## Builtin evaluator via the UI - -See [here](../../how_to_guides/evaluation/builtin_evaluators) for how to use LangSmith's builtin evaluators. 
- ## Prebuilt evaluator via `langchain` See [here](../../how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) for how to use prebuilt evaluators from `langchain`. diff --git a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx index cd97786d..5c61b3a7 100644 --- a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx +++ b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx @@ -29,7 +29,7 @@ Here are some examples: =0.1.145`"})` def numerical_metric(inputs: dict, outputs: dict, reference_outputs: dict) -> float: # Evaluation logic... diff --git a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx index dec2f56e..069b3337 100644 --- a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx +++ b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx @@ -6,22 +6,26 @@ import { # How to return multiple scores in one evaluator -Sometimes it is useful for a [custom evaluator function](../../how_to_guides/evaluation/custom_evaluator) to return multiple metrics. +Sometimes it is useful for a [custom evaluator function](../../how_to_guides/evaluation/custom_evaluator) or [summary evaluator function](../../how_to_guides/evaluation/summary) to return multiple metrics. For example, if you have multiple metrics being generated by an LLM judge, you can save time and money by making a single LLM call that generates multiple metrics instead of making multiple LLM calls. -To return multiple scores, simply return a list of dictionaries/objects of the following form: +To return multiple scores using the Python SDK, simply return a list of dictionaries/objects of the following form: ```python -{ - [ - # 'key' is the metric name - # 'score' is the value of a numerical metric - {"key": string, "score": number}, - # 'value' is the value of a categorical metric - {"key": string, "value": string}, - ... 
# You may log as many as you wish - ] -} +[ + # 'key' is the metric name + # 'score' is the value of a numerical metric + {"key": string, "score": number}, + # 'value' is the value of a categorical metric + {"key": string, "value": string}, + ... # You may log as many as you wish +] +``` + +To do so with the JS/TS SDK, return an object with a 'results' key whose value is a list of the above form: + +```js +{results: [{ key: string, score: number }, ...]}; ``` Each of these dictionaries can contain any or all of the [feedback fields](/reference/data_formats/feedback_data_format); check out the linked document for more information. @@ -31,7 +35,7 @@ Example: =0.1.145`"})` def multiple_scores(outputs: dict, reference_outputs: dict) -> list[dict]: # Replace with real evaluation logic. precision = 0.8 diff --git a/docs/evaluation/how_to_guides/evaluation/summary.mdx b/docs/evaluation/how_to_guides/evaluation/summary.mdx index a248905b..97fd68bf 100644 --- a/docs/evaluation/how_to_guides/evaluation/summary.mdx +++ b/docs/evaluation/how_to_guides/evaluation/summary.mdx @@ -4,7 +4,7 @@ import { typescript, } from "@site/src/components/InstructionsWithCode"; -# How to run an aggregate evaluation +# How to define a summary evaluator Some metrics can only be defined on the entire experiment level as opposed to the individual runs of the experiment. For example, you may want to compute the overall pass rate or f1 score of your evaluation target across all examples in the dataset. diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 698f00fd..9fa120a6 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -26,9 +26,8 @@ Evaluate and improve your application before deploying it. 
- [Define a custom evaluator](./how_to_guides/evaluation/custom_evaluator) - [Define an LLM-as-a-judge evaluator](./how_to_guides/evaluation/llm_as_judge) - [Define a pairwise evaluator](./how_to_guides/evaluation/evaluate_pairwise) +- [Define a summary evaluator](./how_to_guides/evaluation/summary) - [Use an off-the-shelf evaluator via the SDK (Python only)](./how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) -- [Use an off-the-shelf evaluator via the UI](./how_to_guides/evaluation/builtin_evaluators) -- [Evaluate aggregate experiment results](./how_to_guides/evaluation/summary) - [Evaluate intermediate steps](./how_to_guides/evaluation/evaluate_on_intermediate_steps) - [Return multiple metrics in one evaluator](./how_to_guides/evaluation/multiple_scores) - [Return categorical vs numerical metrics](./how_to_guides/evaluation/metric_type)