diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index 6741c635..bc021428 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -65,8 +65,9 @@ list 5 concrete questions that should be investigated to determine if the idea i researcher_app, data=dataset, evaluators=[concise], - max_concurrency=2, # Optional, no max by default - experiment_prefix="gpt-4o-mini-baseline" # Optional, random by default + # Optional, no max_concurrency by default but it is recommended to set one. + max_concurrency=2, + experiment_prefix="gpt-4o-mini-baseline" # Optional, random by default. ) `, @@ -76,3 +77,4 @@ list 5 concrete questions that should be investigated to determine if the idea i ## Related - [Run an evaluation (synchronously)](../../how_to_guides/evaluation/evaluate_llm_application) +- [Handle model rate limits](../../how_to_guides/evaluation/rate_limiting) diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx index 8058c8c4..0db42b13 100644 --- a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx @@ -41,7 +41,7 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r import type { Run, Example } from "langsmith/schemas"; function correct(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.outputs === example.outputs?.output; + const score = rootRun.outputs?.output === example.outputs?.output; return { key: "correct", score }; } `, @@ -53,11 +53,16 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r Custom evaluator functions must have specific argument names. 
They can take any subset of the following arguments: +Python and JS/TS + +- `run: langsmith.schemas.Run`: The full Run object generated by the application on the given example. +- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metadata (if available). + +Currently Python only + - `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset. - `outputs: dict`: A dictionary of the outputs generated by the application on the given `inputs`. - `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available. -- `run: langsmith.schemas.Run`: The full Run object generated by the application on the given example. -- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available). For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. @@ -65,9 +70,14 @@ For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs` Custom evaluators are expected to return one of the following types: +Python and JS/TS + +- `dict`: dicts of the form `{"score" | "value": ..., "name": ...}` allow you to customize the metric type ("score" for numerical and "value" for categorical) and metric name. This is useful if, for example, you want to log an integer as a categorical metric. + +Currently Python only + - `int | float | bool`: this is interepreted as an continuous metric that can be averaged, sorted, etc. The function name is used as the name of the metric. - `str`: this is intepreted as a categorical metric. The function name is used as the name of the metric. 
-- `dict`: dicts of the form `{"score" | "value": ..., "name": ...}` allow you to customize the metric type ("score" for numerical and "value" for categorical) and metric name. This if useful if, for example, you want to log an integer as a categorical metric. - `list[dict]`: return multiple metrics using a single function. ## Additional examples @@ -81,14 +91,17 @@ Custom evaluators are expected to return one of the following types: # Assumes you've installed pydantic. from pydantic import BaseModel + # Compare actual and reference outputs def correct(outputs: dict, reference_outputs: dict) -> bool: """Check if the answer exactly matches the expected answer.""" return outputs["answer"] == reference_outputs["answer"] + # Just evaluate actual outputs def concision(outputs: dict) -> int: """Score how concise the answer is. 1 is the most concise, 5 is the least concise.""" return min(len(outputs["answer"]) // 1000, 4) + 1 + # Use an LLM-as-a-judge oai_client = wrappers.wrap_openai(AsyncOpenAI()) async def valid_reasoning(inputs: dict, outputs: dict) -> bool: @@ -119,15 +132,6 @@ answer is logically valid and consistent with question and the answer.""" evaluators=[correct, concision, valid_reasoning] ) `, - typescript` - import type { EvaluationResult } from "langsmith/evaluation"; - import type { Run, Example } from "langsmith/schemas"; - - function correct(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.outputs === example.outputs?.output; - return { key: "correct", score }; - } - `, ]} /> diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index 7d368195..fdefed61 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -345,7 +345,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i // 
Row-level evaluator function correct(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.outputs === example.outputs?.output; + const score = rootRun.outputs?.output === example.outputs?.output; return { key: "correct", score }; } diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx index 24f6c710..d68b48b7 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx @@ -93,10 +93,9 @@ which asks the LLM to decide which is better between two AI assistant responses. :::info Optional LangChain Usage -In the Python example below, we are pulling [this structured prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) from the [LangChain Hub](../../../prompt_engineering/how_to_guides/prompts/langchain_hub) and using it with a LangChain LLM wrapper. -The prompt asks the LLM to decide which is better between two AI assistant responses. It uses structured output to parse the AI's response: 0, 1, or 2. +In the Python example below, we are pulling [this structured prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) from the [LangChain Hub](../../../prompt_engineering/how_to_guides/prompts/langchain_hub) and using it with a LangChain chat model wrapper. -**Usage of LangChain is totally optional.** To illustrate this point, the TypeScript example below uses the OpenAI API directly. +**Usage of LangChain is totally optional.** To illustrate this point, the TypeScript example uses the OpenAI SDK directly. 
::: diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx index 26308cdb..ce7ae1ed 100644 --- a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx @@ -125,6 +125,7 @@ Let's create a simple dataset of questions and expected responses: outputs=[{"answers": a} for a in answers], ) `, + ]} /> diff --git a/vercel.json b/vercel.json index 4ceba045..ddf7eee3 100644 --- a/vercel.json +++ b/vercel.json @@ -185,6 +185,10 @@ { "source": "/tutorials/Developers/optimize_classifier", "destination": "/prompt_engineering/tutorials/optimize_classifier" + }, + { + "source": "/evaluation/how_to_guides/evaluation/evaluate_llm_application#evaluate-on-a-particular-version-of-a-dataset", + "destination": "/evaluation/how_to_guides/evaluation/dataset_version" } ], "builds": [