diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx
index b2bf0361..7d368195 100644
--- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx
@@ -160,7 +160,7 @@ Since we have labels for this task, our evaluator can directly check if the actu
=0.1.145`" })`
     def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
         return outputs["output"] == reference_outputs["label"]
 `,
@@ -169,7 +169,7 @@ Since we have labels for this task, our evaluator can directly check if the actu
     import type { Run, Example } from "langsmith/schemas";

     function correct(rootRun: Run, example: Example): EvaluationResult {
-      const score = rootRun.outputs?.outputs === example.outputs?.output;
+      const score = rootRun.outputs?.output === example.outputs?.outputs;
       return { key: "correct", score };
     }
 `,
diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx
index ca7b7f87..24f6c710 100644
--- a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx
@@ -27,7 +27,7 @@ If you haven't already created experiments to compare, check out our [quick star
 ## `evaluate_comparative` args

 :::note
-Pairwise evaluations currently require `langsmith` SDK Python version `>=0.1.55` or JS version `>=0.1.24`.
+Pairwise evaluations require `langsmith` SDK Python version `>=0.1.145` or JS version `>=0.1.24`.
 :::

 At its simplest, `evaluate_comparative` / `evaluateComparative` function takes the following arguments:
@@ -57,21 +57,31 @@ Pairwise evaluators are just functions with an expected signature.
 Custom evaluator functions must have specific argument names. They can take any subset of the following arguments:

+Python and JS/TS
+
+- `runs: list[langsmith.schemas.Run]`: A two-item list of the full Run objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run.
+- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metadata (if available).
+
+Currently Python only
+
 - `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset.
 - `outputs: list[dict]`: A two-item list of the outputs produced by each experiment on the given inputs.
 - `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available.
-- `runs: list[langsmith.schemas.Run]`: A two-item list of the full Run objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run.
-- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available).

-For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application.
+For most Python use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `runs` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application.
 ### Evaluator output

 Custom evaluators are expected to return one of the following types:

+Python and JS/TS
+
 - `dict`: dictionary with keys:
   - `key`, which represents the feedback key that will be logged
   - `scores`, which is a mapping from run ID to score for that run.
+
+Currently Python only
+
 - `list[int | float | bool]`: a two-item list of scores. The list is assumed to have the same order as the `runs` / `outputs` evaluator args. The evaluator function name is used for the feedback key.

 Note that you should choose a feedback key that is distinct from standard feedbacks on your run. We recommend prefixing pairwise feedback keys with `pairwise_` or `ranked_`.

@@ -81,7 +91,7 @@ Note that you should choose a feedback key that is distinct from standard feedba
 The following example uses [a prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) which asks the LLM to decide which is better between two AI assistant responses. It uses structured output to parse the AI's response: 0, 1, or 2.

-:::note Optional LangChain Usage
+:::info Optional LangChain Usage

 In the Python example below, we are pulling [this structured prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) from the [LangChain Hub](../../../prompt_engineering/how_to_guides/prompts/langchain_hub) and using it with a LangChain LLM wrapper.

 The prompt asks the LLM to decide which is better between two AI assistant responses. It uses structured output to parse the AI's response: 0, 1, or 2.

@@ -92,7 +102,7 @@ The prompt asks the LLM to decide which is better between two AI assistant respo
=0.1.145`"})`
     from langchain import hub
     from langchain.chat_models import init_chat_model
     from langsmith import evaluate_comparative
diff --git a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx
index 9ca62d03..40566c2a 100644
--- a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx
@@ -6,14 +6,15 @@ import {

 # How to evaluate a `langchain` runnable

-`langchain` [Runnable](https://python.langchain.com/docs/concepts/runnables/) objects (such as chat models, retrievers, chains, etc.) can be passed directly into `evaluate()` / `aevaluate()`.
-
-:::info
+:::info Key concepts

-`langchain` refers to the [Python](https://python.langchain.com) and [JS/TS](https://js.langchain.com) OSS frameworks for building LLM applications.
+- `langchain`: [Python](https://python.langchain.com) and [JS/TS](https://js.langchain.com)
+- Runnable: [Python](https://python.langchain.com/docs/concepts/runnables/) and [JS/TS](https://js.langchain.com/docs/concepts/runnables/)

 :::

+`langchain` [Runnable](https://python.langchain.com/docs/concepts/runnables/) objects (such as chat models, retrievers, chains, etc.) can be passed directly into `evaluate()` / `aevaluate()`.
+
 ## Setup

 Let's define a simple chain to evaluate. First, install all the required packages:
@@ -101,7 +102,7 @@ To evaluate our chain we can pass it directly to the `evaluate()` / `aevaluate()

         actual = outputs["output"]
         expected = reference_outputs["label"]
-        assert actual == expected
+        return actual == expected

     results = await aevaluate(
         chain,
@@ -112,11 +113,17 @@ To evaluate our chain we can pass it directly to the `evaluate()` / `aevaluate()
 `,
     typescript`
     import { evaluate } from "langsmith/evaluation";
+    import { Client } from "langsmith";
+
+    const client = new Client();
+    const dataset = await client.clonePublicDataset(
+      "https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d"
+    );

     await evaluate(chain, {
-      data: datasetName,
+      data: dataset.name,
       evaluators: [correct],
-      experimentPrefix: "Toxic Queries",
+      experimentPrefix: "gpt-4o, ba
     });
 `,
diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx
index 4f6429fe..26308cdb 100644
--- a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx
@@ -93,10 +93,6 @@ Lets construct a simple ReACT agent to start:

     app = workflow.compile()
 `,
-    typescript`
-// ToDo
-`,
-
   ]}
 />

@@ -129,10 +125,6 @@ Let's create a simple dataset of questions and expected responses:
         outputs=[{"answers": a} for a in answers],
     )
 `,
-    typescript`
-    // ToDo
-  `,
-
   ]}
 />

@@ -172,9 +164,6 @@ And a simple evaluator:
         )
         return response.content.upper() == "CORRECT"
 `,
-    typescript`
-    // ToDo
-  `,
   ]}
 />

@@ -213,9 +202,6 @@ If any of you nodes are defined as async, you'll need to use `aevaluate`

     )
 `,
-    typescript`
-    // ToDo
-  `,
   ]}
 />

@@ -244,9 +230,6 @@ For example, we can look at the messages to assert that the model invoked the 's
         experiment_prefix="claude-3.5-baseline", # optional
     )
 `,
-    typescript`
-    // ToDo
-  `,
   ]}
 />

@@ -280,9 +263,6 @@ See more about what arguments you can pass to custom evaluators in this [how-to
         experiment_prefix="claude-3.5-baseline", # optional
     )
 `,
-    typescript`
-    // ToDo
-  `,
   ]}
 />

@@ -307,9 +287,6 @@ In this case we can even continue using the evaluators we've been using.
         experiment_prefix="claude-3.5-model-node", # optional
     )
 `,
-    typescript`
-    // ToDo
-  `,
   ]}
 />

@@ -458,9 +435,6 @@ In this case we can even continue using the evaluators we've been using.
         experiment_prefix="claude-3.5-baseline", # optional
     )

-`,
-    typescript`
-// ToDo
 `,
   ]}
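
For reference, here is a minimal sketch of the Python-only pairwise evaluator signature and list-style return format described in the `evaluate_pairwise.mdx` changes above. It assumes `langsmith>=0.1.145` per the updated note; the experiment names are placeholders, and the length-based preference is purely illustrative.

    from langsmith import evaluate_comparative

    def pairwise_prefer_shorter(inputs: dict, outputs: list[dict]) -> list:
        # Two-item list of scores, in the same order as `outputs`:
        # the shorter response scores 1, the other scores 0 (ties score both 1).
        lengths = [len(str(o)) for o in outputs]
        return [1 if length == min(lengths) else 0 for length in lengths]

    evaluate_comparative(
        # Placeholder names -- any two experiments run over the same dataset.
        ("my-experiment-gpt-4o", "my-experiment-claude-3.5"),
        evaluators=[pairwise_prefer_shorter],
    )

Because the evaluator returns a bare list, the feedback key defaults to the function name, which is why it carries the `pairwise_` prefix recommended above.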
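The dict return format (supported in both Python and JS/TS per the docs above) maps run IDs to scores. The sketch below uses the `runs` / `example` arguments and assumes both the run outputs and the example outputs store the answer under an "output" key; adjust to your own schema.

    from langsmith.schemas import Example, Run

    def ranked_match(runs: list[Run], example: Example) -> dict:
        # Score 1 for any run whose "output" matches the reference output, else 0.
        # The "output" key is an assumption about the dataset/app schema.
        reference = (example.outputs or {}).get("output")
        scores = {
            str(run.id): int((run.outputs or {}).get("output") == reference)
            for run in runs
        }
        return {"key": "ranked_match", "scores": scores}

Passing `evaluators=[ranked_match]` to `evaluate_comparative` would log this feedback under the `ranked_match` key for both runs.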