From 63478f4c7c1fa8ff1912ba94d205f99a2827e6c3 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Tue, 3 Dec 2024 13:12:03 -0800 Subject: [PATCH 01/21] build python sdk reference into docs (#557) note: add ~1min to docs build time (total ~2min with this change) --- Makefile | 24 ++++++++++++++++++++++++ docs/reference/index.md | 2 +- docusaurus.config.js | 14 ++++++++++++-- package.json | 4 ++-- vercel.json | 4 ++++ 5 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..477e6923 --- /dev/null +++ b/Makefile @@ -0,0 +1,24 @@ +install-vercel-deps: + yum -y update + yum install gcc bzip2-devel libffi-devel zlib-devel wget tar gzip rsync -y + +PYTHON = .venv/bin/python + +build-api-ref: + git clone -b bagatur/update_py_api_ref --depth=1 https://github.com/langchain-ai/langsmith-sdk.git + python3 -m venv .venv + . .venv/bin/activate + $(PYTHON) -m pip install --upgrade pip + $(PYTHON) -m pip install --upgrade uv + cd langsmith-sdk && ../$(PYTHON) -m uv pip install -r python/docs/requirements.txt + $(PYTHON) langsmith-sdk/python/docs/create_api_rst.py + LC_ALL=C $(PYTHON) -m sphinx -T -E -b html -d langsmith-sdk/python/docs/_build/doctrees -c langsmith-sdk/python/docs langsmith-sdk/python/docs langsmith-sdk/python/docs/_build/html -j auto + $(PYTHON) langsmith-sdk/python/docs/scripts/custom_formatter.py langsmith-sdk/docs/_build/html/ + + +vercel-build: install-vercel-deps build-api-ref + mkdir -p static/reference/python + mv langsmith-sdk/python/docs/_build/html/* static/reference/python/ + rm -rf langsmith-sdk + NODE_OPTIONS="--max-old-space-size=5000" yarn run docusaurus build + diff --git a/docs/reference/index.md b/docs/reference/index.md index 9c1ffb59..dee88f9a 100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -14,7 +14,7 @@ Technical reference that covers components, APIs, and other aspects of LangSmith ### SDK -- [Python SDK Reference](https://langsmith-sdk.readthedocs.io/en/latest/) +- [Python SDK Reference](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python) - [LangChain off-the-shelf evaluators (Python only)](./reference/sdk_reference/langchain_evaluators) ### Common data types diff --git a/docusaurus.config.js b/docusaurus.config.js index 33c16111..615e9960 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -134,9 +134,19 @@ const config = { position: "right", }, { - href: "https://api.smith.langchain.com/redoc", - label: "Go to API Docs", + type: "dropdown", + label: "API Reference", position: "left", + items: [ + { + label: "REST", + href: "https://api.smith.langchain.com/redoc", + }, + { + label: "Python", + to: "https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python", + }, + ], }, ], }, diff --git a/package.json b/package.json index 54d64b71..2eb6fdf6 100644 --- a/package.json +++ b/package.json @@ -7,8 +7,8 @@ "docusaurus": "docusaurus", "test": "jest", "start": "rm -rf ./docs/api && docusaurus start", - "build": "rm -rf ./build && docusaurus build", - "vercel-build": "git submodule update --init --recursive && python3 -m ensurepip --default-pip && python3 -m pip install -r subdirectories/scripts/requirements.txt && python3 subdirectories/scripts/build_cookbook.py && ls docs && npm run build", + "build": "make vercel-build", + "vercel-build": "ls .; cat Makefile; make vercel-build", "swizzle": "docusaurus swizzle", "deploy": "docusaurus 
deploy", "clear": "docusaurus clear", diff --git a/vercel.json b/vercel.json index 75bd5c29..7cbcf89c 100644 --- a/vercel.json +++ b/vercel.json @@ -201,6 +201,10 @@ { "source": "/evaluation/how_to_guides/human_feedback/:path*", "destination": "/evaluation/how_to_guides/:path*" + }, + { + "source": "/reference/python(/?)", + "destination": "/reference/python/reference" } ], "builds": [ From 162c4124e8ff083f1b32599356732b2324589f97 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 3 Dec 2024 13:24:24 -0800 Subject: [PATCH 02/21] bump captions --- docs/evaluation/how_to_guides/async.mdx | 2 +- docs/evaluation/how_to_guides/custom_evaluator.mdx | 4 ++-- docs/evaluation/how_to_guides/evaluate_llm_application.mdx | 6 +++--- docs/evaluation/how_to_guides/evaluate_pairwise.mdx | 4 ++-- docs/evaluation/how_to_guides/langchain_runnable.mdx | 2 +- docs/evaluation/how_to_guides/langgraph.mdx | 6 +++--- docs/evaluation/how_to_guides/llm_as_judge.mdx | 2 +- docs/evaluation/how_to_guides/local.mdx | 2 +- docs/evaluation/how_to_guides/metric_type.mdx | 2 +- docs/evaluation/how_to_guides/multiple_scores.mdx | 2 +- docs/evaluation/index.mdx | 2 +- 11 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/evaluation/how_to_guides/async.mdx b/docs/evaluation/how_to_guides/async.mdx index facb4b81..8e186433 100644 --- a/docs/evaluation/how_to_guides/async.mdx +++ b/docs/evaluation/how_to_guides/async.mdx @@ -25,7 +25,7 @@ You can see how to use it [here](./evaluate_llm_application). =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` from langsmith import aevaluate, wrappers, Client from openai import AsyncOpenAI diff --git a/docs/evaluation/how_to_guides/custom_evaluator.mdx b/docs/evaluation/how_to_guides/custom_evaluator.mdx index 93ac7ef0..4128dff7 100644 --- a/docs/evaluation/how_to_guides/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/custom_evaluator.mdx @@ -20,7 +20,7 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` from langsmith import evaluate def correct(outputs: dict, reference_outputs: dict) -> bool: @@ -85,7 +85,7 @@ Currently Python only =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` from langsmith import evaluate, wrappers from openai import AsyncOpenAI # Assumes you've installed pydantic. 
diff --git a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx index 84a8cff5..96f0a6a7 100644 --- a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx @@ -160,7 +160,7 @@ Since we have labels for this task, our evaluator can directly check if the actu =0.1.145`" })` + python({ caption: "Requires `langsmith>=0.2.0`" })` def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool: return outputs["output"] == reference_outputs["label"] `, @@ -191,7 +191,7 @@ The key arguments are: =0.1.145`" })` + python({ caption: "Requires `langsmith>=0.2.0`" })` from langsmith import evaluate results = evaluate( @@ -232,7 +232,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` from langsmith import Client, evaluate, traceable, wrappers from openai import OpenAI diff --git a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx index f21ff146..0f9020ca 100644 --- a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx +++ b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx @@ -27,7 +27,7 @@ If you haven't already created experiments to compare, check out our [quick star ## `evaluate_comparative` args :::note -Pairwise evaluations require `langsmith` SDK Python version `>=0.1.145` or JS version `>=0.1.24`. +Pairwise evaluations require `langsmith` SDK Python version `>=0.2.0` or JS version `>=0.1.24`. ::: At its simplest, `evaluate_comparative` / `evaluateComparative` function takes the following arguments: @@ -101,7 +101,7 @@ In the Python example below, we are pulling [this structured prompt](https://smi =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` from langchain import hub from langchain.chat_models import init_chat_model from langsmith import evaluate_comparative diff --git a/docs/evaluation/how_to_guides/langchain_runnable.mdx b/docs/evaluation/how_to_guides/langchain_runnable.mdx index faf2b216..54e2ff91 100644 --- a/docs/evaluation/how_to_guides/langchain_runnable.mdx +++ b/docs/evaluation/how_to_guides/langchain_runnable.mdx @@ -85,7 +85,7 @@ To evaluate our chain we can pass it directly to the `evaluate()` / `aevaluate() =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` from langsmith import aevaluate, Client client = Client() diff --git a/docs/evaluation/how_to_guides/langgraph.mdx b/docs/evaluation/how_to_guides/langgraph.mdx index 557c2a6c..66f791b2 100644 --- a/docs/evaluation/how_to_guides/langgraph.mdx +++ b/docs/evaluation/how_to_guides/langgraph.mdx @@ -136,7 +136,7 @@ And a simple evaluator: =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` judge_llm = init_chat_model("gpt-4o") @@ -184,7 +184,7 @@ If any of you nodes are defined as async, you'll need to use `aevaluate` =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` from langsmith import aevaluate def example_to_state(inputs: dict) -> dict: @@ -217,7 +217,7 @@ For example, we can look at the messages to assert that the model invoked the 's =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` def right_tool(outputs: dict) -> bool: tool_calls = outputs["messages"][1].tool_calls diff --git a/docs/evaluation/how_to_guides/llm_as_judge.mdx b/docs/evaluation/how_to_guides/llm_as_judge.mdx index b4d7ba8a..5a572c24 100644 --- 
a/docs/evaluation/how_to_guides/llm_as_judge.mdx +++ b/docs/evaluation/how_to_guides/llm_as_judge.mdx @@ -23,7 +23,7 @@ For maximal control of evaluator logic, we can write a custom evaluator and run =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` from langsmith import evaluate, traceable, wrappers, Client from openai import OpenAI # Assumes you've installed pydantic diff --git a/docs/evaluation/how_to_guides/local.mdx b/docs/evaluation/how_to_guides/local.mdx index 8ab10d62..10e40852 100644 --- a/docs/evaluation/how_to_guides/local.mdx +++ b/docs/evaluation/how_to_guides/local.mdx @@ -22,7 +22,7 @@ Let's take a look at an example: =0.1.145`. Example also uses `pandas`."})` + python({caption: "Requires `langsmith>=0.2.0`. Example also uses `pandas`."})` from langsmith import evaluate, Client # 1. Create and/or select your dataset diff --git a/docs/evaluation/how_to_guides/metric_type.mdx b/docs/evaluation/how_to_guides/metric_type.mdx index 68610753..93ad9361 100644 --- a/docs/evaluation/how_to_guides/metric_type.mdx +++ b/docs/evaluation/how_to_guides/metric_type.mdx @@ -23,7 +23,7 @@ Here are some examples: =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` def numerical_metric(inputs: dict, outputs: dict, reference_outputs: dict) -> float: # Evaluation logic... diff --git a/docs/evaluation/how_to_guides/multiple_scores.mdx b/docs/evaluation/how_to_guides/multiple_scores.mdx index 17f3fb9d..e42cc175 100644 --- a/docs/evaluation/how_to_guides/multiple_scores.mdx +++ b/docs/evaluation/how_to_guides/multiple_scores.mdx @@ -35,7 +35,7 @@ Example: =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` def multiple_scores(outputs: dict, reference_outputs: dict) -> list[dict]: # Replace with real evaluation logic. precision = 0.8 diff --git a/docs/evaluation/index.mdx b/docs/evaluation/index.mdx index f8d368a3..e1c2b544 100644 --- a/docs/evaluation/index.mdx +++ b/docs/evaluation/index.mdx @@ -54,7 +54,7 @@ export LANGCHAIN_API_KEY=`), =0.1.145`"})` + python({caption: "Requires `langsmith>=0.2.0`"})` from langsmith import evaluate, Client # 1. Create and/or select your dataset From bca028aaa9983304d53a9a22f4998bd6b96254ef Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 3 Dec 2024 13:25:01 -0800 Subject: [PATCH 03/21] checkout main sdk docs --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 477e6923..eabd8090 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ install-vercel-deps: PYTHON = .venv/bin/python build-api-ref: - git clone -b bagatur/update_py_api_ref --depth=1 https://github.com/langchain-ai/langsmith-sdk.git + git clone --depth=1 https://github.com/langchain-ai/langsmith-sdk.git python3 -m venv .venv . 
.venv/bin/activate $(PYTHON) -m pip install --upgrade pip From 07d15f04cdc5d2741719c86ebd85080bd1ecc0e8 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 3 Dec 2024 17:26:50 -0800 Subject: [PATCH 04/21] wip --- docs/evaluation/how_to_guides/custom_evaluator.mdx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/evaluation/how_to_guides/custom_evaluator.mdx b/docs/evaluation/how_to_guides/custom_evaluator.mdx index 4128dff7..d54237bc 100644 --- a/docs/evaluation/how_to_guides/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/custom_evaluator.mdx @@ -36,12 +36,14 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r evaluators=[correct] ) `, - typescript` + typescript({caption: "Requires `langsmith>=0.2.9`"})` import type { EvaluationResult } from "langsmith/evaluation"; - import type { Run, Example } from "langsmith/schemas"; - function correct(run: Run, example: Example): EvaluationResult { - const score = run.outputs?.output === example.outputs?.output; + function correct({ outputs, referenceOutputs }: { + outputs: Record; + referenceOutputs?: Record; + }): EvaluationResult { + const score = outputs?.answer === referenceOutputs?.answer; return { key: "correct", score }; } `, From f63a7e3c598bf2699da0dfdd09b1dff1371c4211 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Tue, 3 Dec 2024 17:32:17 -0800 Subject: [PATCH 05/21] Bagatur/sidebar groups (#564) --- sidebars.js | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sidebars.js b/sidebars.js index eb048e28..ab2cec42 100644 --- a/sidebars.js +++ b/sidebars.js @@ -162,6 +162,10 @@ module.exports = { link: { type: "doc", id: "prompt_engineering/concepts/index" }, }, "langgraph_cloud", + { + type: 'html', + value: '', + }, { type: "category", label: "Administration", @@ -205,7 +209,6 @@ module.exports = { ], link: { type: "doc", id: "administration/tutorials/index" }, }, - "administration/pricing", ], link: { type: "doc", id: "administration/concepts/index" }, }, @@ -222,6 +225,11 @@ module.exports = { ], link: { type: "doc", id: "self_hosting/index" }, }, + "administration/pricing", + { + type: 'html', + value: '', + }, { type: "category", label: "Reference", From 3c7bb50b0103dc4f801cc3dee046e459db22a24c Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 3 Dec 2024 17:49:40 -0800 Subject: [PATCH 06/21] wip --- docs/index.mdx | 53 ++++++++++++++++++++++++------------ src/components/QuickStart.js | 5 ++-- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/docs/index.mdx b/docs/index.mdx index 8ad45eab..25c4be90 100644 --- a/docs/index.mdx +++ b/docs/index.mdx @@ -27,7 +27,9 @@ import { RegionalUrl } from "@site/src/components/RegionalUrls"; # Get started with LangSmith -**LangSmith** is a platform for building production-grade LLM applications. It allows you to closely monitor and evaluate your application, so you can ship quickly and with confidence. Use of [LangChain's open source frameworks](https://python.langchain.com) is not necessary - LangSmith works on its own! +**LangSmith** is a platform for building production-grade LLM applications. +It allows you to closely monitor and evaluate your application, so you can ship quickly and with confidence. +Use of LangChain's open source frameworks [langchain](https://python.langchain.com) and [langgraph](https://langchain-ai.github.io/langgraph/) is not necessary - LangSmith works on its own! ## 1. 
Install LangSmith @@ -99,35 +101,50 @@ Evaluation requires a system to test, data to serve as test cases, and optionall value: "python", label: "Python", language: "python", - content: `from langsmith import Client, evaluate -client = Client()\n + content: `from langsmith import Client, traceable + +client = Client() + # Define dataset: these are your test cases -dataset_name = "Sample Dataset" -dataset = client.create_dataset(dataset_name, description="A sample dataset in LangSmith.") +dataset = client.create_dataset( + "Sample Dataset", + description="A sample dataset in LangSmith.", +) client.create_examples( inputs=[ {"postfix": "to LangSmith"}, {"postfix": "to Evaluations in LangSmith"}, ], outputs=[ - {"output": "Welcome to LangSmith"}, - {"output": "Welcome to Evaluations in LangSmith"}, + {"response": "Welcome to LangSmith"}, + {"response": "Welcome to Evaluations in LangSmith"}, ], dataset_id=dataset.id, -)\n -# Define your evaluator -def exact_match(run, example): - return {"score": run.outputs["output"] == example.outputs["output"]}\n -experiment_results = evaluate( - lambda input: "Welcome " + input['postfix'], # Your AI system goes here - data=dataset_name, # The data to predict and grade over +) + +# Define an interface to your application (tracing optional) +@traceable +def dummy_app(inputs: dict) -> dict: + return {"response": "Welcome " + inputs["postfix"]} + +# Define your evaluator(s) +def exact_match(outputs: dict, reference_outputs: dict) -> bool: + return outputs["response"] == reference_outputs["response"] + +# Run the evaluation +experiment_results = client.evaluate( + dummy_app, # Your AI system goes here + data=dataset, # The data to predict and grade over evaluators=[exact_match], # The evaluators to score the results experiment_prefix="sample-experiment", # The name of the experiment - metadata={ - "version": "1.0.0", - "revision_id": "beta" - }, + metadata={"version": "1.0.0", "revision_id": "beta"}, # Metadata about the experiment ) + +# Analyze the results via the UI or programmatically +# If you have 'pandas' installed you can view the results as a +# pandas DataFrame by uncommenting below: + +# experiment_results.to_pandas() `, }, typescript` diff --git a/src/components/QuickStart.js b/src/components/QuickStart.js index c4f87ac0..8a3a6e60 100644 --- a/src/components/QuickStart.js +++ b/src/components/QuickStart.js @@ -103,10 +103,9 @@ patch_run(parent_run_id, {"answer": chat_completion.choices[0].message.content}) export const PythonSDKTracingCode = () => `import openai -from langsmith.wrappers import wrap_openai -from langsmith import traceable\n +from langsmith import wrappers, traceable\n # Auto-trace LLM calls in-context -client = wrap_openai(openai.Client())\n +client = wrappers.wrap_openai(openai.Client())\n @traceable # Auto-trace this function def pipeline(user_input: str): result = client.chat.completions.create( From b5a1683b1ff0374c4bf7871d84ec99a8ba7097b9 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 3 Dec 2024 18:11:01 -0800 Subject: [PATCH 07/21] fmt --- docs/index.mdx | 72 ++++++++++++++++++++++++++++---------------------- sidebars.js | 10 ++++--- 2 files changed, 46 insertions(+), 36 deletions(-) diff --git a/docs/index.mdx b/docs/index.mdx index 25c4be90..482fd517 100644 --- a/docs/index.mdx +++ b/docs/index.mdx @@ -27,9 +27,9 @@ import { RegionalUrl } from "@site/src/components/RegionalUrls"; # Get started with LangSmith -**LangSmith** is a platform for building production-grade LLM applications. 
-It allows you to closely monitor and evaluate your application, so you can ship quickly and with confidence. -Use of LangChain's open source frameworks [langchain](https://python.langchain.com) and [langgraph](https://langchain-ai.github.io/langgraph/) is not necessary - LangSmith works on its own! +**LangSmith** is a platform for building production-grade LLM applications. +It allows you to closely monitor and evaluate your application, so you can ship quickly and with confidence. +LangChain's open source frameworks [langchain](https://python.langchain.com) and [langgraph](https://langchain-ai.github.io/langgraph/) work seemlessly with LangSmith but are not necessary - LangSmith works on its own! ## 1. Install LangSmith @@ -106,53 +106,60 @@ Evaluation requires a system to test, data to serve as test cases, and optionall client = Client() # Define dataset: these are your test cases + dataset = client.create_dataset( - "Sample Dataset", - description="A sample dataset in LangSmith.", +"Sample Dataset", +description="A sample dataset in LangSmith.", ) client.create_examples( - inputs=[ - {"postfix": "to LangSmith"}, - {"postfix": "to Evaluations in LangSmith"}, - ], - outputs=[ - {"response": "Welcome to LangSmith"}, - {"response": "Welcome to Evaluations in LangSmith"}, - ], - dataset_id=dataset.id, +inputs=[ +{"postfix": "to LangSmith"}, +{"postfix": "to Evaluations in LangSmith"}, +], +outputs=[ +{"response": "Welcome to LangSmith"}, +{"response": "Welcome to Evaluations in LangSmith"}, +], +dataset_id=dataset.id, ) # Define an interface to your application (tracing optional) + @traceable def dummy_app(inputs: dict) -> dict: - return {"response": "Welcome " + inputs["postfix"]} +return {"response": "Welcome " + inputs["postfix"]} # Define your evaluator(s) + def exact_match(outputs: dict, reference_outputs: dict) -> bool: - return outputs["response"] == reference_outputs["response"] +return outputs["response"] == reference_outputs["response"] # Run the evaluation + experiment_results = client.evaluate( - dummy_app, # Your AI system goes here - data=dataset, # The data to predict and grade over - evaluators=[exact_match], # The evaluators to score the results - experiment_prefix="sample-experiment", # The name of the experiment - metadata={"version": "1.0.0", "revision_id": "beta"}, # Metadata about the experiment +dummy_app, # Your AI system goes here +data=dataset, # The data to predict and grade over +evaluators=[exact_match], # The evaluators to score the results +experiment_prefix="sample-experiment", # The name of the experiment +metadata={"version": "1.0.0", "revision_id": "beta"}, # Metadata about the experiment ) -# Analyze the results via the UI or programmatically -# If you have 'pandas' installed you can view the results as a -# pandas DataFrame by uncommenting below: +# Analyze the results via the UI or programmatically + +# If you have 'pandas' installed you can view the results as a + +# pandas DataFrame by uncommenting below: # experiment_results.to_pandas() + `, }, typescript` - import { Client, Run, Example } from "langsmith"; - import { EvaluationResult, evaluate } from "langsmith/evaluation"; - +import { Client, Run, Example } from "langsmith"; +import { EvaluationResult, evaluate } from "langsmith/evaluation"; + const client = new Client(); - + // Define dataset: these are your test cases const datasetName = "Sample Dataset"; const dataset = await client.createDataset(datasetName, { @@ -169,7 +176,7 @@ experiment_results = client.evaluate( ], datasetId: dataset.id, 
}); - + // Define your evaluator const exactMatch = async ( run: Run, @@ -180,7 +187,7 @@ experiment_results = client.evaluate( score: run.outputs?.output === example?.outputs?.output, }; }; - + await evaluate( (input: { postfix: string }) => ({ output: \`Welcome $\{input.postfix\}\` }), { @@ -193,8 +200,9 @@ experiment_results = client.evaluate( } ); `, - ]} - groupId="client-language" + +]} +groupId="client-language" /> - Click the link printed out by your evaluation run to access the LangSmith experiments UI, diff --git a/sidebars.js b/sidebars.js index ab2cec42..6b99e9d1 100644 --- a/sidebars.js +++ b/sidebars.js @@ -163,8 +163,9 @@ module.exports = { }, "langgraph_cloud", { - type: 'html', - value: '', + type: "html", + value: + '', }, { type: "category", @@ -227,8 +228,9 @@ module.exports = { }, "administration/pricing", { - type: 'html', - value: '', + type: "html", + value: + '', }, { type: "category", From 9f8b513642355b2c3d12087352ca78d05712f2cf Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 3 Dec 2024 18:13:57 -0800 Subject: [PATCH 08/21] wip --- docs/index.mdx | 104 ++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 57 deletions(-) diff --git a/docs/index.mdx b/docs/index.mdx index 482fd517..d3dc0028 100644 --- a/docs/index.mdx +++ b/docs/index.mdx @@ -8,9 +8,8 @@ import Tabs from "@theme/Tabs"; import CodeBlock from "@theme/CodeBlock"; import { CodeTabs, - PythonBlock, - TypeScriptBlock, typescript, + python, } from "@site/src/components/InstructionsWithCode"; import { LangChainInstallationCodeTabs, @@ -97,63 +96,54 @@ Evaluation requires a system to test, data to serve as test cases, and optionall dict: -return {"response": "Welcome " + inputs["postfix"]} - -# Define your evaluator(s) - -def exact_match(outputs: dict, reference_outputs: dict) -> bool: -return outputs["response"] == reference_outputs["response"] - -# Run the evaluation - -experiment_results = client.evaluate( -dummy_app, # Your AI system goes here -data=dataset, # The data to predict and grade over -evaluators=[exact_match], # The evaluators to score the results -experiment_prefix="sample-experiment", # The name of the experiment -metadata={"version": "1.0.0", "revision_id": "beta"}, # Metadata about the experiment -) - -# Analyze the results via the UI or programmatically - -# If you have 'pandas' installed you can view the results as a - -# pandas DataFrame by uncommenting below: - -# experiment_results.to_pandas() + python` + from langsmith import Client, traceable + + client = Client() + + # Define dataset: these are your test cases + dataset = client.create_dataset( + "Sample Dataset", + description="A sample dataset in LangSmith.", + ) + + client.create_examples( + inputs=[ + {"postfix": "to LangSmith"}, + {"postfix": "to Evaluations in LangSmith"}, + ], + outputs=[ + {"response": "Welcome to LangSmith"}, + {"response": "Welcome to Evaluations in LangSmith"}, + ], + dataset_id=dataset.id, + ) + + # Define an interface to your application (tracing optional) + @traceable + def dummy_app(inputs: dict) -> dict: + return {"response": "Welcome " + inputs["postfix"]} + + # Define your evaluator(s) + def exact_match(outputs: dict, reference_outputs: dict) -> bool: + return outputs["response"] == reference_outputs["response"] + + # Run the evaluation + experiment_results = client.evaluate( + dummy_app, # Your AI system goes here + data=dataset, # The data to predict and grade over + evaluators=[exact_match], # The evaluators to score the results + 
experiment_prefix="sample-experiment", # The name of the experiment + metadata={"version": "1.0.0", "revision_id": "beta"}, # Metadata about the experiment + ) + + # Analyze the results via the UI or programmatically + # If you have 'pandas' installed you can view the results as a + # pandas DataFrame by uncommenting below: + + # experiment_results.to_pandas() `, - }, typescript` import { Client, Run, Example } from "langsmith"; import { EvaluationResult, evaluate } from "langsmith/evaluation"; From cb4c76c211469ad2fed6ef0768214f3ef733aa4b Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 3 Dec 2024 18:29:45 -0800 Subject: [PATCH 09/21] wip --- .../how_to_guides/custom_evaluator.mdx | 4 ++-- docs/index.mdx | 23 ++++++++----------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/docs/evaluation/how_to_guides/custom_evaluator.mdx b/docs/evaluation/how_to_guides/custom_evaluator.mdx index d54237bc..f5995630 100644 --- a/docs/evaluation/how_to_guides/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/custom_evaluator.mdx @@ -39,10 +39,10 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r typescript({caption: "Requires `langsmith>=0.2.9`"})` import type { EvaluationResult } from "langsmith/evaluation"; - function correct({ outputs, referenceOutputs }: { + const correct = async ({ outputs, referenceOutputs }: { outputs: Record; referenceOutputs?: Record; - }): EvaluationResult { + }): Promise => { const score = outputs?.answer === referenceOutputs?.answer; return { key: "correct", score }; } diff --git a/docs/index.mdx b/docs/index.mdx index d3dc0028..3e3c7c4a 100644 --- a/docs/index.mdx +++ b/docs/index.mdx @@ -145,7 +145,7 @@ Evaluation requires a system to test, data to serve as test cases, and optionall `, typescript` -import { Client, Run, Example } from "langsmith"; +import { Client } from "langsmith"; import { EvaluationResult, evaluate } from "langsmith/evaluation"; const client = new Client(); @@ -161,32 +161,29 @@ import { EvaluationResult, evaluate } from "langsmith/evaluation"; { postfix: "to Evaluations in LangSmith" }, ], outputs: [ - { output: "Welcome to LangSmith" }, - { output: "Welcome to Evaluations in LangSmith" }, + { response: "Welcome to LangSmith" }, + { response: "Welcome to Evaluations in LangSmith" }, ], datasetId: dataset.id, }); // Define your evaluator - const exactMatch = async ( - run: Run, - example: Example - ): Promise => { + const exactMatch = async ({ outputs, referenceOutputs }: { + outputs?: Record; + referenceOutputs?: Record; + }): Promise => { return { key: "exact_match", - score: run.outputs?.output === example?.outputs?.output, + score: outputs?.response === referenceOutputs?.response, }; }; await evaluate( - (input: { postfix: string }) => ({ output: \`Welcome $\{input.postfix\}\` }), + (input: { postfix: string }) => ({ response: \`Welcome $\{input.postfix\}\` }), { data: datasetName, evaluators: [exactMatch], - metadata: { - version: "1.0.0", - revision_id: "beta", - }, + metadata: { version: "1.0.0", revision_id: "beta" }, } ); `, From 74ebfbce7e063cd59ecc6f6a75b8ee945e3d2d31 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 4 Dec 2024 10:05:51 -0800 Subject: [PATCH 10/21] wip --- docs/evaluation/how_to_guides/async.mdx | 4 +- .../create_few_shot_evaluators.mdx | 2 +- .../how_to_guides/custom_evaluator.mdx | 15 ++--- .../evaluate_llm_application.mdx | 6 +- .../how_to_guides/evaluate_pairwise.mdx | 66 +++++++++---------- .../how_to_guides/rate_limiting.mdx | 2 +- docs/evaluation/index.mdx | 18 
++--- docs/evaluation/tutorials/evaluation.mdx | 2 +- docs/index.mdx | 7 +- docs/reference/sdk_reference/_category_.json | 5 -- docs/reference/sdk_reference/index.mdx | 3 - vercel.json | 4 ++ 12 files changed, 62 insertions(+), 72 deletions(-) delete mode 100644 docs/reference/sdk_reference/_category_.json delete mode 100644 docs/reference/sdk_reference/index.mdx diff --git a/docs/evaluation/how_to_guides/async.mdx b/docs/evaluation/how_to_guides/async.mdx index 8e186433..d51266c8 100644 --- a/docs/evaluation/how_to_guides/async.mdx +++ b/docs/evaluation/how_to_guides/async.mdx @@ -8,8 +8,8 @@ import { CodeTabs, python } from "@site/src/components/InstructionsWithCode"; ::: -We can run evaluations asynchronously via the SDK using [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html), -which accepts all of the same arguments as [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) but expects the application function to be asynchronous. +We can run evaluations asynchronously via the SDK using [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate), +which accepts all of the same arguments as [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) but expects the application function to be asynchronous. You can learn more about how to use the `evaluate()` function [here](./evaluate_llm_application). :::info Python only diff --git a/docs/evaluation/how_to_guides/create_few_shot_evaluators.mdx b/docs/evaluation/how_to_guides/create_few_shot_evaluators.mdx index 4bf8f696..ecdbbb78 100644 --- a/docs/evaluation/how_to_guides/create_few_shot_evaluators.mdx +++ b/docs/evaluation/how_to_guides/create_few_shot_evaluators.mdx @@ -2,7 +2,7 @@ sidebar_position: 10 --- -How to create few-shot evaluators +# How to create few-shot evaluators Using LLM-as-a-Judge evaluators can be very helpful when you can't evaluate your system programmatically. However, improving/iterating on these prompts can add unnecessary overhead to the development process of an LLM-based application - you now need to maintain both your application **and** your evaluators. To make this process easier, LangSmith allows diff --git a/docs/evaluation/how_to_guides/custom_evaluator.mdx b/docs/evaluation/how_to_guides/custom_evaluator.mdx index f5995630..2660d4a7 100644 --- a/docs/evaluation/how_to_guides/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/custom_evaluator.mdx @@ -13,7 +13,7 @@ import { ::: Custom evaluators are just functions that take a dataset example and the resulting application output, and return one or more metrics. -These functions can be passed directly into [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) / [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html). +These functions can be passed directly into [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate). 
## Basic example @@ -55,19 +55,16 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r Custom evaluator functions must have specific argument names. They can take any subset of the following arguments: -Python and JS/TS - -- `run: langsmith.schemas.Run`: The full Run object generated by the application on the given example. -- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available). - -Currently Python only - +- `run: Run`: The full [Run](/reference/data_formats/run_data_format) object generated by the application on the given example. +- `example: Example`: The full dataset [Example](/reference/data_formats/example_data_format), including the example inputs, outputs (if available), and metdata (if available). - `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset. - `outputs: dict`: A dictionary of the outputs generated by the application on the given `inputs`. -- `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available. +- `reference_outputs/referenceOutputs: dict`: A dictionary of the reference outputs associated with the example, if available. For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. +When using JS/TS these should all be passed in as part of a single object argument. + ## Evaluator output Custom evaluators are expected to return one of the following types: diff --git a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx index 96f0a6a7..141cbfe0 100644 --- a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx @@ -16,11 +16,11 @@ import { ::: -In this guide we'll go over how to evaluate an application using the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) method in the LangSmith SDK. +In this guide we'll go over how to evaluate an application using the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) method in the LangSmith SDK. :::tip -For larger evaluation jobs in Python we recommend using [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html), the asynchronous version of `evaluate()`. +For larger evaluation jobs in Python we recommend using [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate), the asynchronous version of `evaluate()`. It is still worthwhile to read this guide first, as the two have nearly identical interfaces, and then read the how-to guide on [running an evaluation asynchronously](./async). @@ -180,7 +180,7 @@ See [here](.#define-an-evaluator) for more on how to define evaluators. ## Run the evaluation -We'll use the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) / [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html) methods to run the evaluation. 
+We'll use the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate) methods to run the evaluation. The key arguments are: diff --git a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx index 0f9020ca..08398dcc 100644 --- a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx +++ b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx @@ -20,22 +20,22 @@ import { LangSmith supports evaluating **existing** experiments in a comparative manner. This allows you to score the outputs from multiple experiments against each other, rather than being confined to evaluating outputs one at a time. Think [LMSYS Chatbot Arena](https://chat.lmsys.org/) - this is the same concept! -To do this, use the [evaluate_comparative](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate_comparative.html) / `evaluateComparative` function with two existing experiments. +To do this, use the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) function with two existing experiments. If you haven't already created experiments to compare, check out our [quick start](../) or our [how-to guide](./evaluate_llm_application) to get started with evaluations. -## `evaluate_comparative` args +## `evaluate()` comparative args -:::note -Pairwise evaluations require `langsmith` SDK Python version `>=0.2.0` or JS version `>=0.1.24`. +:::info +This guide requires `langsmith` Python version `>=0.2.0` or JS version `>=0.2.9`. ::: -At its simplest, `evaluate_comparative` / `evaluateComparative` function takes the following arguments: +At its simplest, `evaluate` / `aevaluate` function takes the following arguments: -| Argument | Description | -| ------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `experiments` | A list of the two **existing experiments** you would like to evaluate against each other. These can be uuids or experiment names. | -| `evaluators` | A list of the pairwise evaluators that you would like to attach to this evaluation. See the section below for how to define these. | +| Argument | Description | +| ------------ | ---------------------------------------------------------------------------------------------------------------------------------- | +| `target` | A list of the two **existing experiments** you would like to evaluate against each other. These can be uuids or experiment names. | +| `evaluators` | A list of the pairwise evaluators that you would like to attach to this evaluation. See the section below for how to define these. | Along with these, you can also pass in the following optional args: @@ -59,16 +59,13 @@ Custom evaluator functions must have specific argument names. They can take any Python and JS/TS -- `runs: list[langsmith.schemas.Run]`: A two-item list of the full Run objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run. -- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available). 
- -Currently Python only - +- `runs: list[Run]`: A two-item list of the full [Run](/reference/data_formats/run_data_format) objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run. +- `example: Example`: The full dataset [Example](/reference/data_formats/example_data_format), including the example inputs, outputs (if available), and metdata (if available). - `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset. - `outputs: list[dict]`: A two-item list of the outputs produced by each experiment on the given inputs. -- `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available. +- `reference_outputs` / `referenceOutputs: dict`: A dictionary of the reference outputs associated with the example, if available. -For most Python use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. +For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. ### Evaluator output @@ -104,47 +101,46 @@ In the Python example below, we are pulling [this structured prompt](https://smi python({caption: "Requires `langsmith>=0.2.0`"})` from langchain import hub from langchain.chat_models import init_chat_model - from langsmith import evaluate_comparative + from langsmith import evaluate + # See the prompt: https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2 prompt = hub.pull("langchain-ai/pairwise-evaluation-2") model = init_chat_model("gpt-4o") chain = prompt | model def ranked_preference(inputs: dict, outputs: list[dict]) -> list: + # Assumes example inputs have a 'question' key and experiment + # outputs have an 'answer' key. response = chain.invoke({ "question": inputs["question"], - "answer_a": outputs[0].get("output", "N/A"), - "answer_b": outputs[1].get("output", "N/A"), + "answer_a": outputs[0].get("answer", "N/A"), + "answer_b": outputs[1].get("answer", "N/A"), }) - preference = response["Preference"] - if preference == 1: + if response["Preference"] == 1: scores = [1, 0] - elif preference == 2: + elif response["Preference"] == 2: scores = [0, 1] else: scores = [0, 0] return scores - evaluate_comparative( - # Replace the following array with the names or IDs of your experiments - ["my-experiment-name-1", "my-experiment-name-2"], + evaluate( + ["experiment-1", "experiment-2"], # Replace with the names/IDs of your experiments evaluators=[ranked_preference], + randomize_order=True, + max_concurrency=4, ) `, - typescript({ - caption: - "Note: LangChain support inside `evaluate` / `evaluateComparative` is not supported yet. 
See [this issue](https://github.com/langchain-ai/langsmith-sdk/issues/598) for more details.", - })` - import type { Run, Example } from "langsmith"; - import { evaluateComparative } from "langsmith/evaluation"; + typescript({caption: "Requires `langsmith>=0.2.9`"})` + import { evaluate} from "langsmith/evaluation"; import { wrapOpenAI } from "langsmith/wrappers"; import OpenAI from "openai"; + import { z } from "zod"; const openai = wrapOpenAI(new OpenAI()); - import { z } from "zod"; - async function evaluatePairwise(runs: Run[], example: Example) { + async function rankedPreference({ inputs, outputs }: { inputs: Record, outputs: Record }) { const scores: Record = {}; const [runA, runB] = runs; @@ -228,8 +224,8 @@ In the Python example below, we are pulling [this structured prompt](https://smi return { key: "ranked_preference", scores }; } - await evaluateComparative(["earnest-name-40", "reflecting-pump-91"], { - evaluators: [evaluatePairwise], + await evaluate(["earnest-name-40", "reflecting-pump-91"], { + evaluators: [rankedPreference], }); `, diff --git a/docs/evaluation/how_to_guides/rate_limiting.mdx b/docs/evaluation/how_to_guides/rate_limiting.mdx index 240bf482..41331bd6 100644 --- a/docs/evaluation/how_to_guides/rate_limiting.mdx +++ b/docs/evaluation/how_to_guides/rate_limiting.mdx @@ -83,7 +83,7 @@ See some examples of how to do this in the [OpenAI docs](https://platform.openai ## Limiting max_concurrency Limiting the number of concurrent calls you're making to your application and evaluators is another way to decrease the frequency of model calls you're making, and in that way avoid rate limit errors. -`max_concurrency` can be set directly on the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) / [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html) functions. +`max_concurrency` can be set directly on the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate) functions. `), =0.2.0`"})` - from langsmith import evaluate, Client + from langsmith import Client # 1. Create and/or select your dataset client = Client() @@ -72,7 +72,7 @@ export LANGCHAIN_API_KEY=`), return {"answer": inputs["question"] + " is a good question. I don't know the answer."} # 4. Run an evaluation - evaluate( + experiment_results = client.evaluate( chatbot, data=dataset, evaluators=[is_concise], @@ -80,10 +80,11 @@ export LANGCHAIN_API_KEY=`), ) `, - typescript`import { Client } from "langsmith"; + typescript({caption: "Requires `langsmith>=0.2.9`"})` +import { Client } from "langsmith"; import { evaluate } from "langsmith/evaluation"; import type { EvaluationResult } from "langsmith/evaluation"; -import type { Run, Example } from "langsmith/schemas";\n + // 1. Define a dataset const client = new Client(); const datasetName = "my first dataset" @@ -92,16 +93,15 @@ const dataset = await client.clonePublicDataset( { datasetName: datasetName } )\n // 2. 
Define an evaluator -function isConcise(rootRun: Run, example: Example): EvaluationResult { -const score = rootRun.outputs?.outputs.length < 3 \* example.outputs?.answer.length; +function isConcise({ outputs, referenceOutputs }: { outputs?: Record, referenceOutputs?: Record }): EvaluationResult { +const score = outputs?.answer.length < 3 \* referenceOutputs?.answer.length; return { key: "is_concise", score: score }; }\n // 3. Run an evaluation -// For more info on evaluators, see: https://docs.smith.langchain.com/concepts/evaluation#evaluators await evaluate( -(exampleInput) => { +(inputs: { question: string }) => { return { -answer: exampleInput.question + " Good question. I don't know the answer" +answer: inputs.question + " Good question. I don't know the answer" }; }, { data: datasetName, diff --git a/docs/evaluation/tutorials/evaluation.mdx b/docs/evaluation/tutorials/evaluation.mdx index f10acb3a..917e81d1 100644 --- a/docs/evaluation/tutorials/evaluation.mdx +++ b/docs/evaluation/tutorials/evaluation.mdx @@ -19,7 +19,7 @@ At a high level, in this tutorial we will go over how to: - _Track results over time_ - _Set up automated testing to run in CI/CD_ -For more information on the evaluation workflows LangSmith supports, check out the [how-to guides](../../evaluation/how_to_guides), or see the reference docs for [evaluate](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) and its asynchronous [aevaluate](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html) counterpart. +For more information on the evaluation workflows LangSmith supports, check out the [how-to guides](../../evaluation/how_to_guides), or see the reference docs for [evaluate](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) and its asynchronous [aevaluate](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate) counterpart. Lots to cover, let's dive in! 
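The Python counterpart of the TypeScript `isConcise` evaluator shown above falls outside the visible hunk context; for orientation, a sketch of that row-level check under the dict-based signature (assuming, as the TypeScript version does, that outputs and reference outputs carry an `answer` key):

```python
# Sketch only: row-level "is_concise" check in the dict-based evaluator style.
# Assumes both the app output and the reference output expose an "answer" key.
def is_concise(outputs: dict, reference_outputs: dict) -> bool:
    # Pass when the answer is under three times the reference answer's length.
    return len(outputs["answer"]) < 3 * len(reference_outputs["answer"])
```
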
diff --git a/docs/index.mdx b/docs/index.mdx
index 3e3c7c4a..d3298379 100644
--- a/docs/index.mdx
+++ b/docs/index.mdx
@@ -167,7 +167,7 @@ import { EvaluationResult, evaluate } from "langsmith/evaluation";
     datasetId: dataset.id,
   });

-  // Define your evaluator
+  // Define your evaluator(s)
   const exactMatch = async ({ outputs, referenceOutputs }: {
     outputs?: Record<string, any>;
     referenceOutputs?: Record<string, any>;
@@ -178,8 +178,9 @@ import { EvaluationResult, evaluate } from "langsmith/evaluation";
     };
   };

-  await evaluate(
-    (input: { postfix: string }) => ({ response: \`Welcome $\{input.postfix\}\` }),
+  // Run the evaluation
+  const experimentResults = await evaluate(
+    (inputs: { postfix: string }) => ({ response: \`Welcome $\{inputs.postfix\}\` }),
     {
       data: datasetName,
       evaluators: [exactMatch],
diff --git a/docs/reference/sdk_reference/_category_.json b/docs/reference/sdk_reference/_category_.json
deleted file mode 100644
index b9f47770..00000000
--- a/docs/reference/sdk_reference/_category_.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "label": "SDK reference",
-  "collapsed": false,
-  "collapsible": true
-}
diff --git a/docs/reference/sdk_reference/index.mdx b/docs/reference/sdk_reference/index.mdx
deleted file mode 100644
index bbb9e33a..00000000
--- a/docs/reference/sdk_reference/index.mdx
+++ /dev/null
@@ -1,3 +0,0 @@
-# SDK Reference
-
-- [Python SDK Reference](https://langsmith-sdk.readthedocs.io/en/latest/)
diff --git a/vercel.json b/vercel.json
index 7cbcf89c..e5808ad4 100644
--- a/vercel.json
+++ b/vercel.json
@@ -205,6 +205,10 @@
     {
       "source": "/reference/python(/?)",
       "destination": "/reference/python/reference"
+    },
+    {
+      "source": "/reference/sdk_reference(/?)",
+      "destination": "/reference/"
     }
   ],
   "builds": [

From d7ea5551e72a6400ca981de3236ecc83a45becdc Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Wed, 4 Dec 2024 10:19:46 -0800
Subject: [PATCH 11/21] wip

---
 .../evaluate_existing_experiment.mdx          | 59 ++++++++++---------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx b/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx
index 7c7ac33f..5c8bd0f3 100644
--- a/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx
+++ b/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx
@@ -4,20 +4,20 @@ sidebar_position: 6
 ---

 # How to evaluate an existing experiment (Python only)

-:::note
+:::info
-Currently, `evaluate_existing` is only supported in the Python SDK.
+Evaluation of existing experiments is currently only supported in the Python SDK.
 :::

 If you have already run an experiment and want to add additional evaluation metrics, you
-can apply any evaluators to the experiment using the `evaluate_existing` method.
+can apply any evaluators to the experiment using the `evaluate()` / `aevaluate()` methods.

 ```python
-from langsmith import evaluate_existing
+from langsmith import evaluate

-def always_half(run, example):
-    return {"score": 0.5}
+def always_half(inputs: dict, outputs: dict) -> float:
+    return 0.5

-experiment_name = "my-experiment:abcd123" # Replace with an actual experiment name or ID
+experiment_name = "my-experiment:abc" # Replace with an actual experiment name or ID
-evaluate_existing(experiment_name, evaluators=[always_half])
+evaluate(experiment_name, evaluators=[always_half])
 ```

 ## Example

 Suppose you are evaluating a semantic router.
You may first run an experiment: ```python from langsmith import evaluate + def semantic_router(inputs: dict): return {"class": 1} -def accuracy(run, example): - prediction = run.outputs["class"] - expected = example.outputs["label"] - return {"score": prediction == expected} +def accuracy(outputs: dict, reference_outputs: dict) -> bool: + prediction = outputs["class"] + expected = reference_outputs["label"] + return prediction == expected -results = evaluate(semantic_router, data="Router Classification Dataset", evaluators=[accuracy]) +results = evaluate( + semantic_router, + data="Router Classification Dataset", + evaluators=[accuracy], +) experiment_name = results.experiment_name ``` -Later, you realize you want to add precision and recall summary metrics. The `evaluate_existing` method accepts the same arguments as the `evaluate` method, replacing the `target` system with the `experiment` you wish to add metrics to, meaning -you can add both instance-level `evaluator`'s and aggregate `summary_evaluator`'s. +Later, you realize you want to add precision and recall summary metrics. You can rerun `evaluate()` this time with the extra metrics, +which allows you to add both instance-level `evaluator`'s and aggregate `summary_evaluator`'s. ```python -from langsmith import evaluate_existing - -def precision(runs: list, examples: list): - true_positives = sum([1 for run, example in zip(runs, examples) if run.outputs["class"] == example.outputs["label"]]) - false_positives = sum([1 for run, example in zip(runs, examples) if run.outputs["class"] != example.outputs["label"]]) - return {"score": true_positives / (true_positives + false_positives)} - -def recall(runs: list, examples: list): - true_positives = sum([1 for run, example in zip(runs, examples) if run.outputs["class"] == example.outputs["label"]]) - false_negatives = sum([1 for run, example in zip(runs, examples) if run.outputs["class"] != example.outputs["label"]]) - return {"score": true_positives / (true_positives + false_negatives)} +from langsmith import evaluate -evaluate_existing(experiment_name, summary_evaluators=[precision, recall]) +# Note that now we take list of dicts as inputs instead of just dicts. +def precision_recall(outputs: list[dict], reference_outputs: list[dict]) -> list[dict]: + true_positives = sum([ref["label"] == 1 and out["class"] == 1 for out, ref in zip(outputs, reference_outputs)]) + predicted_positives = len([out for out in outputs if out["class"] == 1]) + actual_positives = len([ref for ref in reference_outputs if ref["label"] == 1]) + return [ + {"score": true_positives / predicted_positives, "key": "precision"}, + {"score": true_positives / actual_positives, "key": "recall"} + ] + +evaluate(experiment_name, summary_evaluators=[precision_recall]) ``` The precision and recall metrics will now be available in the LangSmith UI for the `experiment_name` experiment. - -As is the case with the `evaluate` function, there is an identical, asynchronous `aevaluate_existing` function that can be used to evaluate experiments asynchronously. 
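One way to sanity-check the list-based `precision_recall` summary evaluator introduced above is to run its arithmetic on a few toy rows before wiring it into an experiment. The values below are illustrative only, and the `or 1` guards are an added assumption to avoid dividing by zero on degenerate inputs:

```python
# Toy rows exercising the precision/recall arithmetic shown above (illustrative).
outputs = [{"class": 1}, {"class": 0}, {"class": 1}]
reference_outputs = [{"label": 1}, {"label": 1}, {"label": 0}]

true_positives = sum(o["class"] == 1 and r["label"] == 1 for o, r in zip(outputs, reference_outputs))
predicted_positives = sum(o["class"] == 1 for o in outputs) or 1  # guard against divide-by-zero
actual_positives = sum(r["label"] == 1 for r in reference_outputs) or 1

print({"precision": true_positives / predicted_positives, "recall": true_positives / actual_positives})
# -> {'precision': 0.5, 'recall': 0.5}
```
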
From d3c95bf6be9381583ee3cf7d41ad185f3ea89313 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 4 Dec 2024 11:52:00 -0800 Subject: [PATCH 12/21] wip --- docs/evaluation/how_to_guides/async.mdx | 7 ++- .../evaluate_existing_experiment.mdx | 5 +- .../evaluate_llm_application.mdx | 46 ++++++++++--------- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/docs/evaluation/how_to_guides/async.mdx b/docs/evaluation/how_to_guides/async.mdx index d51266c8..4f9cd78f 100644 --- a/docs/evaluation/how_to_guides/async.mdx +++ b/docs/evaluation/how_to_guides/async.mdx @@ -26,7 +26,7 @@ You can see how to use it [here](./evaluate_llm_application). groupId="client-language" tabs={[ python({caption: "Requires `langsmith>=0.2.0`"})` - from langsmith import aevaluate, wrappers, Client + from langsmith import wrappers, Client from openai import AsyncOpenAI # Optionally wrap the OpenAI client to trace all model calls. @@ -61,7 +61,10 @@ list 5 concrete questions that should be investigated to determine if the idea i inputs=[{"idea": e} for e in examples, ) - results = await aevaluate( + # Can equivalently use the 'aevaluate' function directly: + # from langsmith import aevaluate + # await aevaluate(...) + results = await ls_client.aevaluate( researcher_app, data=dataset, evaluators=[concise], diff --git a/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx b/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx index 5c8bd0f3..ec34aa68 100644 --- a/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx +++ b/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx @@ -31,10 +31,9 @@ from langsmith import evaluate def semantic_router(inputs: dict): return {"class": 1} +# Assumes dataset examples have reference outputs with a "label" key. def accuracy(outputs: dict, reference_outputs: dict) -> bool: - prediction = outputs["class"] - expected = reference_outputs["label"] - return prediction == expected + return outputs["class"] == reference_outputs["label"] results = evaluate( semantic_router, diff --git a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx index 141cbfe0..643bb1c3 100644 --- a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx @@ -18,11 +18,12 @@ import { In this guide we'll go over how to evaluate an application using the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) method in the LangSmith SDK. -:::tip +:::tip Async jobs in Python -For larger evaluation jobs in Python we recommend using [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate), the asynchronous version of `evaluate()`. -It is still worthwhile to read this guide first, as the two have nearly identical interfaces, -and then read the how-to guide on [running an evaluation asynchronously](./async). +For larger evaluation jobs in Python we recommend using [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate), the asynchronous version of [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate). 
+It is still worthwhile to read this guide first, as the two have identical interfaces, before reading the how-to guide on [running an evaluation asynchronously](./async). + +In JS/TS evaluate() is already asynchronous so no separate method is needed. ::: @@ -42,7 +43,7 @@ First we need an application to evaluate. Let's create a simple toxicity classif # Optionally add the 'traceable' decorator to trace the inputs/outputs of this function. @traceable - def toxicity_classifier(inputs: dict) -> str: + def toxicity_classifier(inputs: dict) -> dict: instructions = ( "Please review the user query below and determine if it contains any form of toxic behavior, " "such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does " @@ -55,7 +56,7 @@ First we need an application to evaluate. Let's create a simple toxicity classif result = oai_client.chat.completions.create( messages=messages, model="gpt-4o-mini", temperature=0 ) - return result.choices[0].message.content + return {"class": result.choices[0].message.content} `, typescript` import { OpenAI } from "openai"; @@ -113,10 +114,11 @@ We need a [Dataset](../concepts#datasets) to evaluate our application on. Our da dataset_name = "Toxic Queries" dataset = ls_client.create_dataset(dataset_name=dataset_name) - inputs, outputs = zip( - *[({"text": text}, {"label": label}) for text, label in labeled_texts] + ls_client.create_examples( + inputs=[{"text": text} for text, _ in labeled_texts], + outputs=[{"label": label} for _, label in labeled_texts], + dataset_id=dataset.id, ) - ls_client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) `, typescript` import { Client } from "langsmith"; @@ -162,14 +164,13 @@ Since we have labels for this task, our evaluator can directly check if the actu tabs={[ python({ caption: "Requires `langsmith>=0.2.0`" })` def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool: - return outputs["output"] == reference_outputs["label"] + return outputs["class"] == reference_outputs["label"] `, typescript` import type { EvaluationResult } from "langsmith/evaluation"; - import type { Run, Example } from "langsmith/schemas"; - function correct(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.output === example.outputs?.outputs; + function correct({ outputs, referenceOutputs }: { outputs: Record, referenceOutputs?: Record }): EvaluationResult { + const score = outputs.output === referenceOutputs?.outputs; return { key: "correct", score }; } `, @@ -184,7 +185,7 @@ We'll use the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-l The key arguments are: -- a function that takes an input dictionary and returns an output dictionary or object +- a target function that takes an input dictionary and returns an output dictionary. The `example.inputs` field of each [Example](/reference/data_formats/example_data_format) is what gets passed to the target function. In this case our `toxicity_classifier` is already set up to take in example inputs so we can use it directly. - `data` - the name OR UUID of the LangSmith dataset to evaluate on, or an iterator of examples - `evaluators` - a list of evaluators to score the outputs of the function @@ -192,9 +193,9 @@ The key arguments are: groupId="client-language" tabs={[ python({ caption: "Requires `langsmith>=0.2.0`" })` - from langsmith import evaluate - - results = evaluate( + # Can equivalently use the 'evaluate' function directly: + # from langsmith import evaluate; evaluate(...) 
+ results = ls_client.evaluate( toxicity_classifier, data=dataset_name, evaluators=[correct], @@ -233,7 +234,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i groupId="client-language" tabs={[ python({caption: "Requires `langsmith>=0.2.0`"})` - from langsmith import Client, evaluate, traceable, wrappers + from langsmith import Client, traceable, wrappers from openai import OpenAI # Step 1. Define an application @@ -269,17 +270,18 @@ _If you've annotated your code for tracing, you can open the trace of each row i dataset_name = "Toxic Queries" dataset = ls_client.create_dataset(dataset_name=dataset_name) - inputs, outputs = zip( - *[({"text": text}, {"label": label}) for text, label in labeled_texts] + ls_client.create_examples( + inputs=[{"text": text} for text, _ in labeled_texts], + outputs=[{"label": label} for _, label in labeled_texts], + dataset_id=dataset.id, ) - ls_client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) # Step 3. Define an evaluator def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool: return outputs["output"] == reference_outputs["label"] # Step 4. Run the evaluation - results = evaluate( + results = ls_client.evaluate( toxicity_classifier, data=dataset_name, evaluators=[correct], From 6720e9d3fa881bb4dcd956bd7711c61f7d1074eb Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 4 Dec 2024 12:11:50 -0800 Subject: [PATCH 13/21] wip --- docs/evaluation/how_to_guides/summary.mdx | 43 +++++++++++++---------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/docs/evaluation/how_to_guides/summary.mdx b/docs/evaluation/how_to_guides/summary.mdx index 761043eb..adedbe20 100644 --- a/docs/evaluation/how_to_guides/summary.mdx +++ b/docs/evaluation/how_to_guides/summary.mdx @@ -16,17 +16,10 @@ Below, we'll implement a very simple summary evaluator that computes overall pas groupId="client-language" tabs={[ python` - from langsmith.schemas import Example, Run - - def summary_eval(runs: list[Run], examples: list[Example]) -> dict: - correct = 0 - for i, run in enumerate(runs): - if run.outputs["output"] == examples[i].outputs["label"]: - correct += 1 - if correct / len(runs) > 0.5: - return {"key": "pass", "score": True} - else: - return {"key": "pass", "score": False} + def pass_50(outputs: list[dict], reference_outputs: list[dict]) -> bool: + """Pass if >50% of all results are correct.""" + correct = sum([out["output"] == ref["label"] for out, ref in zip(outputs, reference_outputs)]) + return correct / len(outputs) > 0.5 `, typescript` import { Run, Example } from "langsmith/schemas"; @@ -52,12 +45,25 @@ You can then pass this evaluator to the `evaluate` method as follows: groupId="client-language" tabs={[ python` - results = evaluate( - lambda inputs: label_query(inputs["text"]), - data=dataset_name, - evaluators=[correct_label], - summary_evaluators=[summary_eval], - experiment_prefix="Toxic Queries", + from langsmith import Client + + ls_client = Client() + dataset = ls_client.clone_public_dataset( + "https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d + ) + + def bad_classifier(inputs: dict) -> dict: + return {"class": "Not toxic"} + + def correct(outputs: dict, reference_outputs: dict) -> bool: + """Row-level correctness evaluator.""" + return outputs["class"] == reference_outputs["label"] + + results = ls_client.evaluate( + bad_classified, + data=dataset, + evaluators=[correct], + summary_evaluators=[pass_50], ) `, typescript` @@ -68,7 +74,8 @@ You can then 
pass this evaluator to the `evaluate` method as follows: experimentPrefix: "Toxic Queries", }); `, - ]} + +]} /> In the LangSmith UI, you'll the summary evaluator's score displayed with the corresponding key. From 02c1069a9cfba788aebbe10c8a8ab8b83b0a142c Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 4 Dec 2024 13:39:36 -0800 Subject: [PATCH 14/21] wip --- docs/evaluation/how_to_guides/summary.mdx | 37 +++++++++++++++++------ 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/docs/evaluation/how_to_guides/summary.mdx b/docs/evaluation/how_to_guides/summary.mdx index adedbe20..25116b7d 100644 --- a/docs/evaluation/how_to_guides/summary.mdx +++ b/docs/evaluation/how_to_guides/summary.mdx @@ -18,22 +18,20 @@ Below, we'll implement a very simple summary evaluator that computes overall pas python` def pass_50(outputs: list[dict], reference_outputs: list[dict]) -> bool: """Pass if >50% of all results are correct.""" - correct = sum([out["output"] == ref["label"] for out, ref in zip(outputs, reference_outputs)]) + correct = sum([out["class"] == ref["label"] for out, ref in zip(outputs, reference_outputs)]) return correct / len(outputs) > 0.5 `, typescript` - import { Run, Example } from "langsmith/schemas"; - - function summaryEval(runs: Run[], examples: Example[]) { + function summaryEval({ outputs, referenceOutputs }: { outputs: Record[], referenceOutputs?: Record[]}) { let correct = 0; - for (let i = 0; i < runs.length; i++) { - if (runs[i].outputs["output"] === examples[i].outputs["label"]) { + for (let i = 0; i < outputs.length; i++) { + if (outputs[i]["output"] === referenceOutputs[i]["label"]) { correct += 1; } } - return { key: "pass", score: correct / runs.length > 0.5 }; + return { key: "pass", score: correct / outputs.length > 0.5 }; } `, ]} @@ -67,9 +65,30 @@ You can then pass this evaluator to the `evaluate` method as follows: ) `, typescript` - await evaluate((inputs) => labelQuery(inputs["input"]), { + + import { Client } from "langsmith"; + import { evaluate } from "langsmith/evaluation"; + import type { EvaluationResult } from "langsmith/evaluation"; + + const client = new Client(); + const datasetName = "Toxic queries"; + const dataset = await client.clonePublicDataset( + "https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d, + { datasetName: datasetName } + ); + + function correct({ outputs, referenceOutputs }: { outputs: Record, referenceOutputs?: Record }): EvaluationResult { + const score = outputs["class"] === referenceOutputs?["label"]; + return { key: "correct", score }; + } + + function badClassifier(inputs: Record): { class: string } { + return { class: "Not toxic" }; + } + + await evaluate(badClassifier, { data: datasetName, - evaluators: [correctLabel], + evaluators: [correct], summaryEvaluators: [summaryEval], experimentPrefix: "Toxic Queries", }); From f97d51b60dee2d19298753fe6f466bc9dc6d575d Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 4 Dec 2024 14:14:59 -0800 Subject: [PATCH 15/21] wip --- .../how_to_guides/dataset_subset.mdx | 2 +- .../how_to_guides/dataset_version.mdx | 39 ++++++++++----- .../evaluate_existing_experiment.mdx | 48 ++----------------- docs/evaluation/how_to_guides/index.md | 10 ++-- 4 files changed, 37 insertions(+), 62 deletions(-) diff --git a/docs/evaluation/how_to_guides/dataset_subset.mdx b/docs/evaluation/how_to_guides/dataset_subset.mdx index efc914c9..8aa2babf 100644 --- a/docs/evaluation/how_to_guides/dataset_subset.mdx +++ b/docs/evaluation/how_to_guides/dataset_subset.mdx @@ -85,4 +85,4 @@ You 
can use the `list_examples` / `listExamples` method to evaluate on one or mu ## Related -- More on [how to filter datasets](./manage_datasets_programmatically#list-examples-by-structured-filter) +- Learn more about how to fetch views of a dataset [here](./manage_datasets_programmatically#fetch-datasets) diff --git a/docs/evaluation/how_to_guides/dataset_version.mdx b/docs/evaluation/how_to_guides/dataset_version.mdx index 564c1295..235dfa04 100644 --- a/docs/evaluation/how_to_guides/dataset_version.mdx +++ b/docs/evaluation/how_to_guides/dataset_version.mdx @@ -13,22 +13,36 @@ Additionally, it might be helpful to read the [guide on fetching examples](./man ::: -You can take advantage of the fact that `evaluate` allows passing in an iterable of examples to evaluate on a particular version of a dataset. -Simply use `list_examples` / `listExamples` to fetch examples from a particular version tag using `as_of` / `asOf`. +## Using `list_examples` + +You can take advantage of the fact that `evaluate` / `aevaluate` allows passing in an iterable of examples to evaluate on a particular version of a dataset. +Simply use `list_examples` / `listExamples` to fetch examples from a particular version tag using `as_of` / `asOf` and pass that in to the `data` argument. bool: + return outputs["class"] == reference_outputs["label"] + + results = ls_client.evaluate( + lambda inputs: {"class": "Not toxic"}, + # Pass in filtered data here: + # highlight-next-line + data=ls_client.list_examples( + # highlight-next-line + dataset_name="Toxic Queries", + # highlight-next-line + as_of="latest", # specify version here + # highlight-next-line + ), + evaluators=[correct], ) `, typescript` @@ -40,8 +54,11 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular asOf: "latest", }), evaluators: [correctLabel], - experimentPrefix: "Toxic Queries", }); `, ]} /> + +## Related + +- Learn more about how to fetch views of a dataset [here](./manage_datasets_programmatically#fetch-datasets) diff --git a/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx b/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx index ec34aa68..489aa6c2 100644 --- a/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx +++ b/docs/evaluation/how_to_guides/evaluate_existing_experiment.mdx @@ -9,7 +9,8 @@ Evaluation of existing experiments is currently only supported in the Python SDK ::: If you have already run an experiment and want to add additional evaluation metrics, you -can apply any evaluators to the experiment using the `evaluate()` / `aevaluate()` methods. +can apply any evaluators to the experiment using the `evaluate()` / `aevaluate()` methods as before. +Just pass in the experiment name / ID instead of a target function: ```python from langsmith import evaluate @@ -18,48 +19,5 @@ def always_half(inputs: dict, outputs: dict) -> float: return 0.5 experiment_name = "my-experiment:abc" # Replace with an actual experiment name or ID -evaluate_existing(experiment_name, evaluators=[always_half]) +evaluate(experiment_name, evaluators=[always_half]) ``` - -## Example - -Suppose you are evaluating a semantic router. You may first run an experiment: - -```python -from langsmith import evaluate - -def semantic_router(inputs: dict): - return {"class": 1} - -# Assumes dataset examples have reference outputs with a "label" key. 
-def accuracy(outputs: dict, reference_outputs: dict) -> bool: - return outputs["class"] == reference_outputs["label"] - -results = evaluate( - semantic_router, - data="Router Classification Dataset", - evaluators=[accuracy], -) -experiment_name = results.experiment_name -``` - -Later, you realize you want to add precision and recall summary metrics. You can rerun `evaluate()` this time with the extra metrics, -which allows you to add both instance-level `evaluator`'s and aggregate `summary_evaluator`'s. - -```python -from langsmith import evaluate - -# Note that now we take list of dicts as inputs instead of just dicts. -def precision_recall(outputs: list[dict], reference_outputs: list[dict]) -> list[dict]: - true_positives = sum([ref["label"] == 1 and out["class"] == 1 for out, ref in zip(outputs, reference_outputs)]) - predicted_positives = len([out for out in outputs if out["class"] == 1]) - actual_positives = len([ref for ref in reference_outputs if ref["label"] == 1]) - return [ - {"score": true_positives / predicted_positives, "key": "precision"}, - {"score": true_positives / actual_positives, "key": "recall"} - ] - -evaluate(experiment_name, summary_evaluators=[precision_recall]) -``` - -The precision and recall metrics will now be available in the LangSmith UI for the `experiment_name` experiment. diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 06c671c0..f3cff1f7 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -12,14 +12,14 @@ Evaluate and improve your application before deploying it. ### Run an evaluation -- [Run an evaluation](./how_to_guides/evaluate_llm_application) +- [Run an evaluation with the SDK](./how_to_guides/evaluate_llm_application) - [Run an evaluation asynchronously](./how_to_guides/async) - [Run an evaluation comparing two experiments](./how_to_guides/evaluate_pairwise) - [Evaluate a `langchain` runnable](./how_to_guides/langchain_runnable) - [Evaluate a `langgraph` graph](./how_to_guides/langgraph) - [Evaluate an existing experiment (Python only)](./how_to_guides/evaluate_existing_experiment) -- [Run an evaluation via the REST API](./how_to_guides/run_evals_api_only) - [Run an evaluation from the UI](./how_to_guides/run_evaluation_from_prompt_playground) +- [Run an evaluation via the REST API](./how_to_guides/run_evals_api_only) ### Define an evaluator @@ -82,9 +82,9 @@ Manage datasets in LangSmith used by your evaluations. 
- [Export a dataset from the UI](./how_to_guides/manage_datasets_in_application#export-a-dataset) - [Create a dataset split from the UI](./how_to_guides/manage_datasets_in_application#create-and-manage-dataset-splits) - [Filter examples from the UI](./how_to_guides/manage_datasets_in_application#filter-examples) -- [Create a dataset via the SDK](./how_to_guides/manage_datasets_programmatically#create-a-dataset) -- [Fetch a dataset via the SDK](./how_to_guides/manage_datasets_programmatically#fetch-datasets) -- [Update a dataset via the SDK](./how_to_guides/manage_datasets_programmatically#update-examples) +- [Create a dataset with the SDK](./how_to_guides/manage_datasets_programmatically#create-a-dataset) +- [Fetch a dataset with the SDK](./how_to_guides/manage_datasets_programmatically#fetch-datasets) +- [Update a dataset with the SDK](./how_to_guides/manage_datasets_programmatically#update-examples) - [Version a dataset](./how_to_guides/version_datasets) - [Share/unshare a dataset publicly](./how_to_guides/share_dataset) - [Export filtered traces from an experiment to a dataset](./how_to_guides/export_filtered_traces_to_dataset) From 9601489d102fe05bb4c579e371c25f81830bd2ff Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 4 Dec 2024 15:49:05 -0800 Subject: [PATCH 16/21] wip --- .../how_to_guides/dataset_version.mdx | 9 +- .../evaluate_on_intermediate_steps.mdx | 341 +++++++++++------- .../how_to_guides/evaluate_pairwise.mdx | 10 +- 3 files changed, 213 insertions(+), 147 deletions(-) diff --git a/docs/evaluation/how_to_guides/dataset_version.mdx b/docs/evaluation/how_to_guides/dataset_version.mdx index 235dfa04..791b879a 100644 --- a/docs/evaluation/how_to_guides/dataset_version.mdx +++ b/docs/evaluation/how_to_guides/dataset_version.mdx @@ -25,7 +25,7 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular from langsmith import Client ls_client = Client() - + # Assumes actual outputs have a 'class' key. # Assumes example outputs have a 'label' key. def correct(outputs: dict, reference_outputs: dict) -> bool: @@ -37,7 +37,7 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular # highlight-next-line data=ls_client.list_examples( # highlight-next-line - dataset_name="Toxic Queries", + dataset_name="Toxic Queries", # highlight-next-line as_of="latest", # specify version here # highlight-next-line @@ -47,7 +47,7 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular `, typescript` import { evaluate } from "langsmith/evaluation"; - + await evaluate((inputs) => labelText(inputs["input"]), { data: langsmith.listExamples({ datasetName: datasetName, @@ -56,7 +56,8 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular evaluators: [correctLabel], }); `, - ]} + +]} /> ## Related diff --git a/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx b/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx index a22f5df1..9b067025 100644 --- a/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx +++ b/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx @@ -21,65 +21,108 @@ For example, for retrieval-augmented generation (RAG), you might want to In this guide, we will use a simple, fully-custom evaluator for evaluating criteria 1 and an LLM-based evaluator for evaluating criteria 2 to highlight both scenarios. 
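Both evaluators rely on the same underlying pattern: walk the trace to reach the intermediate step of interest. A minimal sketch of that pattern, assuming a child run named `retrieve` as in the example pipeline below (the helper and step names are illustrative, not part of the SDK):

```python
from typing import Optional

from langsmith.schemas import Run


def find_nested_run(run: Run, name: str) -> Optional[Run]:
    """Depth-first search of the run tree for a child run with the given name."""
    if run.name == name:
        return run
    for child in run.child_runs or []:
        found = find_nested_run(child, name)
        if found is not None:
            return found
    return None


def used_nonempty_query(run: Run) -> bool:
    """Sketch of a custom evaluator that inspects an intermediate step."""
    # "retrieve" is an assumed step name; adjust it to match your own trace.
    retrieve_run = find_nested_run(run, "retrieve")
    if retrieve_run is None:
        return False
    # Intermediate runs expose .inputs and .outputs just like the root run.
    return bool(retrieve_run.inputs.get("query"))
```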
-In order to evaluate the intermediate steps of your pipeline, your evaluator function should traverse and process the `root_run`/`rootRun` argument, which is a `Run` object that contains the intermediate steps of your pipeline. +In order to evaluate the intermediate steps of your pipeline, your evaluator function should traverse and process the `run`/`rootRun` argument, which is a `Run` object that contains the intermediate steps of your pipeline. ## 1. Define your LLM pipeline The below RAG pipeline consists of 1) generating a Wikipedia query given the input question, 2) retrieving relevant documents from Wikipedia, and 3) generating an answer given the retrieved documents. +First lets install all the dependencies for this example: + = 2: - return results\n -@traceable -def generate_answer(question, context): - messages = [ - {"role": "system", "content": f"Answer the user's question based ONLY on the content below:\\n\\n{context}"}, - {"role": "user", "content": question} - ] - result = openai.chat.completions.create(messages=messages, model="gpt-4o-mini", temperature=0) - return result.choices[0].message.content\n -@traceable -def rag_pipeline(question): - query = generate_wiki_search(question) - context = "\\n\\n".join([doc["page_content"] for doc in retrieve(query)]) - answer = generate_answer(question, context) - return answer`), + { + value: "python", + label: "Python", + language: "bash", + content: `pip install -U langsmith langchain langchain-openai wikipedia`, + }, + { + value: "typescript", + label: "TypeScript", + language: "bash", + content: `yarn add langsmith langchain @langchain/openai wikipedia`, + }, + ]} + groupId="client-language" +/> + +=0.2.0`' })` + import openai + import wikipedia as wp + + from langsmith import traceable, wrappers + + oai_client = wrappers.wrap_openai(openai.Client()) + + @traceable + def generate_wiki_search(question: str) -> str: + """Generate the query to search in wikipedia.""" + instructions = ( + "Generate a search query to pass into wikipedia to answer the user's question. " + "Return only the search query and nothing more. " + "This will passed in directly to the wikipedia search engine." 
+ ) + messages = [ + {"role": "system", "content": instructions}, + {"role": "user", "content": question} + ] + result = oai_client.chat.completions.create( + messages=messages, + model="gpt-4o-mini", + temperature=0, + ) + return result.choices[0].message.content + + @traceable(run_type="retriever") + def retrieve(query: str) -> list: + """Get up to two search wikipedia results.""" + results = [] + for term in wp.search(query, results = 10): + try: + page = wp.page(term, auto_suggest=False) + results.append({ + "page_content": page.summary, + "type": "Document", + "metadata": {"url": page.url} + }) + except wp.DisambiguationError: + pass + if len(results) >= 2: + return results + + @traceable + def generate_answer(question: str, context: str) -> str: + """Answer the question based on the retrieved information.""" + instructions = f"Answer the user's question based ONLY on the content below:\\n\\n{context}" + messages = [ + {"role": "system", "content": instructions}, + {"role": "user", "content": question} + ] + result = oai_client.chat.completions.create( + messages=messages, + model="gpt-4o-mini", + temperature=0 + ) + return result.choices[0].message.content + + @traceable + def qa_pipeline(question: str) -> str: + """The full pipeline.""" + query = generate_wiki_search(question) + context = "\\n\\n".join([doc["page_content"] for doc in retrieve(query)]) + return generate_answer(question, context)`, typescript` import OpenAI from "openai"; import wiki from "wikipedia"; import { Client } from "langsmith"; import { traceable } from "langsmith/traceable"; import { wrapOpenAI } from "langsmith/wrappers"; - + const openai = wrapOpenAI(new OpenAI()); - + const generateWikiSearch = traceable( async (input: { question: string }) => { const messages = [ @@ -90,18 +133,18 @@ def rag_pipeline(question): }, { role: "user" as const, content: input.question }, ]; - + const chatCompletion = await openai.chat.completions.create({ model: "gpt-4o-mini", messages: messages, temperature: 0, }); - + return chatCompletion.choices[0].message.content ?? 
""; }, { name: "generateWikiSearch" } ); - + const retrieve = traceable( async (input: { query: string; numDocuments: number }) => { const { results } = await wiki.search(input.query, { limit: 10 }); @@ -110,7 +153,7 @@ def rag_pipeline(question): type: "Document"; metadata: { url: string }; }> = []; - + for (const result of results) { if (finalResults.length >= input.numDocuments) { // Just return the top 2 pages for now @@ -124,12 +167,12 @@ def rag_pipeline(question): metadata: { url: page.fullurl }, }); } - + return finalResults; }, { name: "retrieve", run_type: "retriever" } ); - + const generateAnswer = traceable( async (input: { question: string; context: string }) => { const messages = [ @@ -139,7 +182,7 @@ def rag_pipeline(question): }, { role: "user" as const, content: input.question }, ]; - + const chatCompletion = await openai.chat.completions.create({ model: "gpt-4o-mini", messages: messages, @@ -149,7 +192,7 @@ def rag_pipeline(question): }, { name: "generateAnswer" } ); - + const ragPipeline = traceable( async ({ question }: { question: string }, numDocuments: number = 2) => { const query = await generateWikiSearch({ question }); @@ -162,8 +205,9 @@ def rag_pipeline(question): }, { name: "ragPipeline" } );`, - ]} - groupId="client-language" + +]} +groupId="client-language" /> This pipeline will produce a trace that looks something like: @@ -178,25 +222,23 @@ We are building a very simple dataset with a couple of examples to evaluate the python` from langsmith import Client - client = Client() - - examples = [ - ("What is LangChain?", "LangChain is an open-source framework for building applications using large language models."), - ("What is LangSmith?", "LangSmith is an observability and evaluation tool for LLM products, built by LangChain Inc.") - ] - + ls_client = Client() dataset_name = "Wikipedia RAG" - if not client.has_dataset(dataset_name=dataset_name): - dataset = client.create_dataset(dataset_name=dataset_name) - inputs, outputs = zip( - *[({"input": input}, {"expected": expected}) for input, expected in examples] + + if not ls_client.has_dataset(dataset_name=dataset_name): + dataset = ls_client.create_dataset(dataset_name=dataset_name) + ls_client.create_examples( + inputs=[ + {"question": "What is LangChain?"}, + {"question": "What is LangSmith?"}, + ], + dataset_id=dataset.id ) - client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) `, typescript` import { Client } from "langsmith"; const client = new Client(); - + const examples = [ [ "What is LangChain?", @@ -207,17 +249,18 @@ We are building a very simple dataset with a couple of examples to evaluate the "LangSmith is an observability and evaluation tool for LLM products, built by LangChain Inc.", ], ]; - + const datasetName = "Wikipedia RAG"; - + const inputs = examples.map(([input, _]) => ({ input })); const outputs = examples.map(([_, expected]) => ({ expected })); - + const dataset = await client.createDataset(datasetName); await client.createExamples({ datasetId: dataset.id, inputs, outputs }); `, - ]} - groupId="client-language" + +]} +groupId="client-language" /> ## 3. Define your custom evaluators @@ -225,61 +268,81 @@ We are building a very simple dataset with a couple of examples to evaluate the As mentioned above, we will define two evaluators: one that evaluates the relevance of the retrieved documents w.r.t the input query and another that evaluates the hallucination of the generated answer w.r.t the retrieved documents. 
We will be using LangChain LLM wrappers, along with [`with_structured_output`](https://python.langchain.com/v0.1/docs/modules/model_io/chat/structured_output/) to define the evaluator for hallucination. -The key here is that the evaluator function should traverse the `root_run` / `rootRun` argument to access the intermediate steps of the pipeline. The evaluator can then process the inputs and outputs of the intermediate steps to evaluate according to the desired criteria. +The key here is that the evaluator function should traverse the `run` / `rootRun` argument to access the intermediate steps of the pipeline. The evaluator can then process the inputs and outputs of the intermediate steps to evaluate according to the desired criteria. dict: - """ - A very simple evaluator that checks to see if the input of the retrieval step exists - in the retrieved docs. - """ - rag_pipeline_run = next(run for run in root_run.child_runs if run.name == "rag_pipeline") - retrieve_run = next(run for run in rag_pipeline_run.child_runs if run.name == "retrieve") - page_contents = "\\n\\n".join(doc["page_content"] for doc in retrieve_run.outputs["output"]) - score = retrieve_run.inputs["query"] in page_contents - return {"key": "simple_document_relevance", "score": score}\n -def hallucination(root_run: Run, example: Example) -> dict: - """ - A simple evaluator that checks to see the answer is grounded in the documents - """ - # Get documents and answer - rag_pipeline_run = next(run for run in root_run.child_runs if run.name == "rag_pipeline") - retrieve_run = next(run for run in rag_pipeline_run.child_runs if run.name == "retrieve") - page_contents = "\\n\\n".join(doc["page_content"] for doc in retrieve_run.outputs["output"]) - generation = rag_pipeline_run.outputs["output"]\n - # Data model - class GradeHallucinations(BaseModel): - """Binary score for hallucination present in generation answer."""\n - binary_score: int = Field(description="Answer is grounded in the facts, 1 or 0")\n - # LLM with function call - llm = ChatOpenAI(model="gpt-4o-mini", temperature=0) - structured_llm_grader = llm.with_structured_output(GradeHallucinations)\n - # Prompt - system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \\n - Give a binary score 1 or 0, where 1 means that the answer is grounded in / supported by the set of facts.""" - hallucination_prompt = ChatPromptTemplate.from_messages( - [ - ("system", system), - ("human", "Set of facts: \\n\\n {documents} \\n\\n LLM generation: {generation}"), + python({ caption: "Example uses `langchain` for convenience, this is not required." })` + from langchain.chat_models import init_chat_model + from langsmith.schemas import Run + from pydantic import BaseModel, Field + + def document_relevance(run: Run) -> bool: + """Checks if retriever input exists in the retrieved docs.""" + qa_pipeline_run = next( + r for run in run.child_runs if r.name == "qa_pipeline" + ) + retrieve_run = next( + r for run in qa_pipeline_run.child_runs if r.name == "retrieve" + ) + page_contents = "\\n\\n".join( + doc["page_content"] for doc in retrieve_run.outputs["output"] + ) + return retrieve_run.inputs["query"] in page_contents + + + def no_hallucination(run: Run) -> bool: + """Check if the answer is grounded in the documents. + + Return True if there is no hallucination, False otherwise. 
+ """ + # Get documents and answer + qa_pipeline_run = next( + run for run in run.child_runs if run.name == "qa_pipeline" + ) + retrieve_run = next( + run for run in qa_pipeline_run.child_runs if run.name == "retrieve" + ) + retrieved_content = "\\n\\n".join( + doc["page_content"] for doc in retrieve_run.outputs["output"] + ) + + # Data model + class GradeHallucinations(BaseModel): + """Binary score for hallucination present in generation answer.""" + + is_grounded: bool = Field(..., description="True if the answer is grounded in the facts, False otherwise.") + + # LLM with structured outputs + # For more see: https://python.langchain.com/docs/how_to/structured_output/ + llm = init_chat_model("gpt-4o-mini", temperature=0) + structured_llm= llm.with_structured_output( + GradeHallucinations, + method="json_schema", + strict=True, + ) + + # Prompt + instructions = ( + "You are a grader assessing whether an LLM generation is grounded in / " + "supported by a set of retrieved facts. Give a binary score 1 or 0, " + "where 1 means that the answer is grounded in / supported by the set of facts." + ) + messages = [ + {"role": "system", "content": instructions}, + {"role": "user", "content": "Set of facts: \\n\\n {retrieved_content} \\n\\n LLM generation: {generation}"}, ] - )\n - hallucination_grader = hallucination_prompt | structured_llm_grader - score = hallucination_grader.invoke({"documents": page_contents, "generation": generation}) - return {"key": "answer_hallucination", "score": int(score.binary_score)}`), + + grade = structured_llm.invoke(messages) + return grade.is_grounded`, typescript` import { EvaluationResult } from "langsmith/evaluation"; import { Run, Example } from "langsmith/schemas"; - + import { ChatPromptTemplate } from "@langchain/core/prompts"; import { ChatOpenAI } from "@langchain/openai"; import { z } from "zod"; - + function findNestedRun(run: Run, search: (run: Run) => boolean): Run | null { const queue: Run[] = [run]; while (queue.length > 0) { @@ -289,31 +352,31 @@ def hallucination(root_run: Run, example: Example) -> dict: } return null; } - + // A very simple evaluator that checks to see if the input of the retrieval step exists // in the retrieved docs. 
function documentRelevance(rootRun: Run, example: Example): EvaluationResult { const retrieveRun = findNestedRun(rootRun, (run) => run.name === "retrieve"); const docs: Array<{ page_content: string }> | undefined = retrieveRun.outputs?.outputs; - + const pageContents = docs?.map((doc) => doc.page_content).join("\\n\\n"); const score = pageContents.includes(retrieveRun.inputs?.query); return { key: "simple_document_relevance", score }; } - + async function hallucination( rootRun: Run, example: Example ): Promise { const rag = findNestedRun(rootRun, (run) => run.name === "ragPipeline"); const retrieve = findNestedRun(rootRun, (run) => run.name === "retrieve"); - + const docs: Array<{ page_content: string }> | undefined = retrieve.outputs?.outputs; - + const documents = docs?.map((doc) => doc.page_content).join("\\n\\n"); - + const prompt = ChatPromptTemplate.fromMessages<{ documents: string; generation: string; @@ -330,7 +393,7 @@ def hallucination(root_run: Run, example: Example) -> dict: "Set of facts: \\n\\n {documents} \\n\\n LLM generation: {generation}", ], ]); - + const llm = new ChatOpenAI({ model: "gpt-4o-mini", temperature: 0, @@ -343,18 +406,19 @@ def hallucination(root_run: Run, example: Example) -> dict: }) .describe("Binary score for hallucination present in generation answer.") ); - + const grader = prompt.pipe(llm); const score = await grader.invoke({ documents, generation: rag.outputs?.outputs, }); - + return { key: "answer_hallucination", score: score.binary_score }; } `, - ]} - groupId="client-language" + +]} +groupId="client-language" /> ## 4. Evaluate the pipeline @@ -364,26 +428,29 @@ Finally, we'll run `evaluate` with the custom evaluators defined above. dict: + """Wrap the qa_pipeline so it can accept the Example.inputs dict as input.""" + return {"answer": qa_pipeline(inputs["question"])} + + experiment_results = ls_client.evaluate( + qa_wrapper, data=dataset_name, - evaluators=[document_relevance, hallucination], + evaluators=[document_relevance, no_hallucination], experiment_prefix="rag-wiki-oai" ) `, typescript` import { evaluate } from "langsmith/evaluation"; - + await evaluate((inputs) => ragPipeline({ question: inputs.input }), { data: datasetName, evaluators: [hallucination, documentRelevance], experimentPrefix: "rag-wiki-oai", }); `, - ]} - groupId="client-language" + +]} +groupId="client-language" /> The experiment will contain the results of the evaluation, including the scores and comments from the evaluators: diff --git a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx index 08398dcc..e828aee3 100644 --- a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx +++ b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx @@ -57,15 +57,13 @@ Pairwise evaluators are just functions with an expected signature. Custom evaluator functions must have specific argument names. They can take any subset of the following arguments: -Python and JS/TS - -- `runs: list[Run]`: A two-item list of the full [Run](/reference/data_formats/run_data_format) objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run. -- `example: Example`: The full dataset [Example](/reference/data_formats/example_data_format), including the example inputs, outputs (if available), and metdata (if available). - `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset. 
-- `outputs: list[dict]`: A two-item list of the outputs produced by each experiment on the given inputs. +- `outputs: list[dict]`: A two-item list of the dict outputs produced by each experiment on the given inputs. - `reference_outputs` / `referenceOutputs: dict`: A dictionary of the reference outputs associated with the example, if available. +- `runs: list[Run]`: A two-item list of the full [Run](/reference/data_formats/run_data_format) objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run. +- `example: Example`: The full dataset [Example](/reference/data_formats/example_data_format), including the example inputs, outputs (if available), and metdata (if available). -For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. +For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs` / `referenceOutputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. ### Evaluator output From a745fe86098bebfed55eecfe96413948634deaad Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 4 Dec 2024 15:53:57 -0800 Subject: [PATCH 17/21] wip --- .../how_to_guides/evaluate_on_intermediate_steps.mdx | 4 +--- docs/evaluation/how_to_guides/index.md | 2 +- docs/evaluation/how_to_guides/local.mdx | 8 ++++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx b/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx index 9b067025..fc0957b1 100644 --- a/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx +++ b/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx @@ -10,7 +10,7 @@ import { typescript, } from "@site/src/components/InstructionsWithCode"; -# How to evaluate intermediate steps +# How to evaluate an application's intermediate steps While, in many scenarios, it is sufficient to evaluate the final output of your task, in some cases you might want to evaluate the intermediate steps of your pipeline. @@ -27,8 +27,6 @@ In order to evaluate the intermediate steps of your pipeline, your evaluator fun The below RAG pipeline consists of 1) generating a Wikipedia query given the input question, 2) retrieving relevant documents from Wikipedia, and 3) generating an answer given the retrieved documents. -First lets install all the dependencies for this example: - =0.2.0`. Example also uses `pandas`."})` - from langsmith import evaluate, Client + from langsmith import Client # 1. Create and/or select your dataset - client = Client() - dataset = client.clone_public_dataset( + ls_client = Client() + dataset = ls_client.clone_public_dataset( "https://smith.langchain.com/public/a63525f9-bdf2-4512-83e3-077dc9417f96/d" ) @@ -40,7 +40,7 @@ Let's take a look at an example: return {"answer": inputs["question"] + " is a good question. I don't know the answer."} # 4. 
Run an evaluation - experiment = evaluate( + experiment = ls_client.evaluate( chatbot, data=dataset, evaluators=[is_concise], From 66ae51a00524b5a9b5e054911e59d92385b00bc2 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 5 Dec 2024 07:22:21 -0800 Subject: [PATCH 18/21] cr --- docs/evaluation/how_to_guides/async.mdx | 4 +- .../how_to_guides/custom_evaluator.mdx | 105 +++++++++++++++++- .../evaluate_llm_application.mdx | 6 +- .../evaluate_on_intermediate_steps.mdx | 45 ++++---- docs/evaluation/index.mdx | 4 +- docs/index.mdx | 2 + 6 files changed, 135 insertions(+), 31 deletions(-) diff --git a/docs/evaluation/how_to_guides/async.mdx b/docs/evaluation/how_to_guides/async.mdx index 4f9cd78f..7fcb251b 100644 --- a/docs/evaluation/how_to_guides/async.mdx +++ b/docs/evaluation/how_to_guides/async.mdx @@ -68,8 +68,8 @@ list 5 concrete questions that should be investigated to determine if the idea i researcher_app, data=dataset, evaluators=[concise], - # Optional, no max_concurrency by default but it is recommended to set one. - max_concurrency=2, + # Optional, add concurrency. + max_concurrency=2, # Optional, add concurrency. experiment_prefix="gpt-4o-mini-baseline" # Optional, random by default. ) `, diff --git a/docs/evaluation/how_to_guides/custom_evaluator.mdx b/docs/evaluation/how_to_guides/custom_evaluator.mdx index 2660d4a7..2727a3b4 100644 --- a/docs/evaluation/how_to_guides/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/custom_evaluator.mdx @@ -86,14 +86,15 @@ Currently Python only tabs={[ python({caption: "Requires `langsmith>=0.2.0`"})` from langsmith import evaluate, wrappers + from langsmith.schemas import Run, Example from openai import AsyncOpenAI # Assumes you've installed pydantic. from pydantic import BaseModel - # Compare actual and reference outputs - def correct(outputs: dict, reference_outputs: dict) -> bool: + # We can still pass in Run and Example objects if we'd like + def correct_old_signature(run: Run, example: Example) -> dict: """Check if the answer exactly matches the expected answer.""" - return outputs["answer"] == reference_outputs["answer"] + return {"key": "correct", "score": run.outputs["answer"] == example.outputs["answer"]} # Just evaluate actual outputs def concision(outputs: dict) -> int: @@ -128,9 +129,105 @@ answer is logically valid and consistent with question and the answer.""" results = evaluate( dummy_app, data="dataset_name", - evaluators=[correct, concision, valid_reasoning] + evaluators=[correct_old_signature, concision, valid_reasoning] ) `, + typescript` + import { Client } from "langsmith"; + import { evaluate } from "langsmith/evaluation"; + import { Run, Example } from "langsmith/schemas"; + import OpenAI from "openai"; + + // Type definitions + interface AppInputs { + question: string; + } + + interface AppOutputs { + answer: string; + reasoning: string; + } + + interface Response { + reasoning_is_valid: boolean; + } + + // Old signature evaluator + function correctOldSignature(run: Run, example: Example) { + return { + key: "correct", + score: run.outputs?.["answer"] === example.outputs?.["answer"], + }; + } + + // Output-only evaluator + function concision({ outputs }: { outputs: AppOutputs }) { + return { + key: "concision", + score: Math.min(Math.floor(outputs.answer.length / 1000), 4) + 1, + }; + } + + // LLM-as-judge evaluator + const openai = new OpenAI(); + + async function validReasoning({ + inputs, + outputs + }: { + inputs: AppInputs; + outputs: AppOutputs; + }) { + const instructions = \`\ + Given the following 
question, answer, and reasoning, determine if the reasoning for the \ + answer is logically valid and consistent with question and the answer.\`; + + const msg = \`Question: \${inputs.question}\nAnswer: \${outputs.answer}\\nReasoning: \${outputs.reasoning}\`; + + const response = await openai.chat.completions.create({ + model: "gpt-4", + messages: [ + { role: "system", content: instructions }, + { role: "user", content: msg } + ], + response_format: { type: "json_object" }, + functions: [{ + name: "parse_response", + parameters: { + type: "object", + properties: { + reasoning_is_valid: { + type: "boolean", + description: "Whether the reasoning is valid" + } + }, + required: ["reasoning_is_valid"] + } + }] + }); + + const parsed = JSON.parse(response.choices[0].message.content ?? "{}") as Response; + + return { + key: "valid_reasoning", + score: parsed.reasoning_is_valid ? 1 : 0 + }; + } + + // Example application + function dummyApp(inputs: AppInputs): AppOutputs { + return { + answer: "hmm i'm not sure", + reasoning: "i didn't understand the question" + }; + } + + const results = await evaluate(dummyApp, { + data: "dataset_name", + evaluators: [correctOldSignature, concision, validReasoning], + client: new Client() + }); + ` ]} /> diff --git a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx index 643bb1c3..2c07b802 100644 --- a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx @@ -201,6 +201,7 @@ The key arguments are: evaluators=[correct], experiment_prefix="gpt-4o-mini, baseline", # optional, experiment name prefix description="Testing the baseline system.", # optional, experiment description + max_concurrency=4, # optional, add concurrency ) `, typescript` @@ -210,6 +211,7 @@ The key arguments are: data: datasetName, evaluators: [correct], experimentPrefix: "gpt-4o-mini, baseline", // optional, experiment name prefix + maxConcurrency: 4, // optional, add concurrency }); `, ]} @@ -281,12 +283,14 @@ _If you've annotated your code for tracing, you can open the trace of each row i return outputs["output"] == reference_outputs["label"] # Step 4. Run the evaluation + # Client.evaluate() and evaluate() behave the same. 
results = ls_client.evaluate( toxicity_classifier, data=dataset_name, evaluators=[correct], experiment_prefix="gpt-4o-mini, simple", # optional, experiment name prefix description="Testing the baseline system.", # optional, experiment description + max_concurrency=4, # optional, add concurrency ) `, typescript` @@ -355,7 +359,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i data: datasetName, evaluators: [correct], experimentPrefix: "gpt-4o-mini, simple", // optional, experiment name prefix - + maxConcurrency: 4, // optional, add concurrency }); `, diff --git a/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx b/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx index fc0957b1..772c5976 100644 --- a/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx +++ b/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx @@ -48,12 +48,11 @@ The below RAG pipeline consists of 1) generating a Wikipedia query given the inp =0.2.0`' })` - import openai import wikipedia as wp - + from openai import OpenAI from langsmith import traceable, wrappers - oai_client = wrappers.wrap_openai(openai.Client()) + oai_client = wrappers.wrap_openai(OpenAI()) @traceable def generate_wiki_search(question: str) -> str: @@ -289,6 +288,20 @@ The key here is that the evaluator function should traverse the `run` / `rootRun return retrieve_run.inputs["query"] in page_contents + # Data model + class GradeHallucinations(BaseModel): + """Binary score for hallucination present in generation answer.""" + + is_grounded: bool = Field(..., description="True if the answer is grounded in the facts, False otherwise.") + + # LLM with structured outputs for grading hallucinations + # For more see: https://python.langchain.com/docs/how_to/structured_output/ + grader_llm= init_chat_model("gpt-4o-mini", temperature=0).with_structured_output( + GradeHallucinations, + method="json_schema", + strict=True, + ) + def no_hallucination(run: Run) -> bool: """Check if the answer is grounded in the documents. @@ -296,42 +309,28 @@ The key here is that the evaluator function should traverse the `run` / `rootRun """ # Get documents and answer qa_pipeline_run = next( - run for run in run.child_runs if run.name == "qa_pipeline" + r for r in run.child_runs if r.name == "qa_pipeline" ) retrieve_run = next( - run for run in qa_pipeline_run.child_runs if run.name == "retrieve" + r for r in qa_pipeline_run.child_runs if r.name == "retrieve" ) retrieved_content = "\\n\\n".join( doc["page_content"] for doc in retrieve_run.outputs["output"] ) - # Data model - class GradeHallucinations(BaseModel): - """Binary score for hallucination present in generation answer.""" - - is_grounded: bool = Field(..., description="True if the answer is grounded in the facts, False otherwise.") - - # LLM with structured outputs - # For more see: https://python.langchain.com/docs/how_to/structured_output/ - llm = init_chat_model("gpt-4o-mini", temperature=0) - structured_llm= llm.with_structured_output( - GradeHallucinations, - method="json_schema", - strict=True, - ) - - # Prompt + # Construct prompt instructions = ( "You are a grader assessing whether an LLM generation is grounded in / " "supported by a set of retrieved facts. Give a binary score 1 or 0, " "where 1 means that the answer is grounded in / supported by the set of facts." 
) + messages = [ {"role": "system", "content": instructions}, - {"role": "user", "content": "Set of facts: \\n\\n {retrieved_content} \\n\\n LLM generation: {generation}"}, + {"role": "user", "content": f"Set of facts:\\n{retrieved_content}\\n\\nLLM generation: {run.outputs['answer']}"}, ] - grade = structured_llm.invoke(messages) + grade = grader_llm.invoke(messages) return grade.is_grounded`, typescript` import { EvaluationResult } from "langsmith/evaluation"; diff --git a/docs/evaluation/index.mdx b/docs/evaluation/index.mdx index 18b53608..c119cfc7 100644 --- a/docs/evaluation/index.mdx +++ b/docs/evaluation/index.mdx @@ -76,7 +76,8 @@ export LANGCHAIN_API_KEY=`), chatbot, data=dataset, evaluators=[is_concise], - experiment_prefix="my first experiment " + experiment_prefix="my first experiment ", + max_concurrency=4, ) `, @@ -107,6 +108,7 @@ answer: inputs.question + " Good question. I don't know the answer" data: datasetName, evaluators: [isConcise], experimentPrefix: "my first experiment ", +maxConcurrency: 4, });`, ]} groupId="client-language" diff --git a/docs/index.mdx b/docs/index.mdx index d3298379..bb870143 100644 --- a/docs/index.mdx +++ b/docs/index.mdx @@ -135,6 +135,7 @@ Evaluation requires a system to test, data to serve as test cases, and optionall evaluators=[exact_match], # The evaluators to score the results experiment_prefix="sample-experiment", # The name of the experiment metadata={"version": "1.0.0", "revision_id": "beta"}, # Metadata about the experiment + max_concurrency=4, # Add concurrency. ) # Analyze the results via the UI or programmatically @@ -185,6 +186,7 @@ import { EvaluationResult, evaluate } from "langsmith/evaluation"; data: datasetName, evaluators: [exactMatch], metadata: { version: "1.0.0", revision_id: "beta" }, + maxConcurrency: 4, } ); `, From c2ed61d4ea12182ebe0a17f6a99a32f6cc937725 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 5 Dec 2024 07:53:12 -0800 Subject: [PATCH 19/21] cr --- docs/evaluation/how_to_guides/evaluate_llm_application.mdx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx index 2c07b802..87150858 100644 --- a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx @@ -18,13 +18,16 @@ import { In this guide we'll go over how to evaluate an application using the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) method in the LangSmith SDK. -:::tip Async jobs in Python +:::tip Running large jobs For larger evaluation jobs in Python we recommend using [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate), the asynchronous version of [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate). It is still worthwhile to read this guide first, as the two have identical interfaces, before reading the how-to guide on [running an evaluation asynchronously](./async). In JS/TS evaluate() is already asynchronous so no separate method is needed. +It is also important to configure the `max_concurrency`/`maxConcurrency` arg when running large jobs. +This parallelizes evaluation by effectively splitting the dataset across threads. 
+ ::: ## Define an application From 5beab48e7bb6de115c0b33ba3ffa54c306f4b2cf Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 5 Dec 2024 08:05:08 -0800 Subject: [PATCH 20/21] links --- docs/evaluation/how_to_guides/async.mdx | 4 ++-- docs/evaluation/how_to_guides/custom_evaluator.mdx | 2 +- docs/evaluation/how_to_guides/evaluate_llm_application.mdx | 6 +++--- docs/evaluation/how_to_guides/evaluate_pairwise.mdx | 2 +- docs/evaluation/how_to_guides/rate_limiting.mdx | 3 ++- docs/evaluation/tutorials/evaluation.mdx | 2 +- docs/reference/index.md | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/evaluation/how_to_guides/async.mdx b/docs/evaluation/how_to_guides/async.mdx index 7fcb251b..9c35a0c4 100644 --- a/docs/evaluation/how_to_guides/async.mdx +++ b/docs/evaluation/how_to_guides/async.mdx @@ -8,8 +8,8 @@ import { CodeTabs, python } from "@site/src/components/InstructionsWithCode"; ::: -We can run evaluations asynchronously via the SDK using [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate), -which accepts all of the same arguments as [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) but expects the application function to be asynchronous. +We can run evaluations asynchronously via the SDK using [aevaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate), +which accepts all of the same arguments as [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate) but expects the application function to be asynchronous. You can learn more about how to use the `evaluate()` function [here](./evaluate_llm_application). :::info Python only diff --git a/docs/evaluation/how_to_guides/custom_evaluator.mdx b/docs/evaluation/how_to_guides/custom_evaluator.mdx index 2727a3b4..de567cae 100644 --- a/docs/evaluation/how_to_guides/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/custom_evaluator.mdx @@ -13,7 +13,7 @@ import { ::: Custom evaluators are just functions that take a dataset example and the resulting application output, and return one or more metrics. -These functions can be passed directly into [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate). +These functions can be passed directly into [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate). 
## Basic example diff --git a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx index 87150858..b93e2cfb 100644 --- a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx @@ -16,11 +16,11 @@ import { ::: -In this guide we'll go over how to evaluate an application using the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) method in the LangSmith SDK. +In this guide we'll go over how to evaluate an application using the [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate) method in the LangSmith SDK. :::tip Running large jobs -For larger evaluation jobs in Python we recommend using [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate), the asynchronous version of [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate). +For larger evaluation jobs in Python we recommend using [aevaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate), the asynchronous version of [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate). It is still worthwhile to read this guide first, as the two have identical interfaces, before reading the how-to guide on [running an evaluation asynchronously](./async). In JS/TS evaluate() is already asynchronous so no separate method is needed. @@ -184,7 +184,7 @@ See [here](.#define-an-evaluator) for more on how to define evaluators. ## Run the evaluation -We'll use the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate) methods to run the evaluation. +We'll use the [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate) methods to run the evaluation. The key arguments are: diff --git a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx index e828aee3..a9d8473c 100644 --- a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx +++ b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx @@ -20,7 +20,7 @@ import { LangSmith supports evaluating **existing** experiments in a comparative manner. This allows you to score the outputs from multiple experiments against each other, rather than being confined to evaluating outputs one at a time. Think [LMSYS Chatbot Arena](https://chat.lmsys.org/) - this is the same concept! -To do this, use the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) function with two existing experiments. 
+To do this, use the [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate) function with two existing experiments.

 If you haven't already created experiments to compare, check out our [quick start](../) or our [how-to guide](./evaluate_llm_application) to get started with evaluations.

diff --git a/docs/evaluation/how_to_guides/rate_limiting.mdx b/docs/evaluation/how_to_guides/rate_limiting.mdx
index 41331bd6..41fda7f7 100644
--- a/docs/evaluation/how_to_guides/rate_limiting.mdx
+++ b/docs/evaluation/how_to_guides/rate_limiting.mdx
@@ -83,7 +83,8 @@ See some examples of how to do this in the [OpenAI docs](https://platform.openai
 ## Limiting max_concurrency

 Limiting the number of concurrent calls you're making to your application and evaluators is another way to decrease the frequency of model calls you're making, and in that way avoid rate limit errors.
-`max_concurrency` can be set directly on the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate) functions.
+`max_concurrency` can be set directly on the [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate) functions.
+This parallelizes evaluation by effectively splitting the dataset across threads.

Date: Thu, 5 Dec 2024 09:49:52 -0800
Subject: [PATCH 21/21] fix

---
 .../evaluate_llm_application.mdx | 20 +++++++++++++++----
 .../how_to_guides/evaluate_pairwise.mdx | 7 ++++---
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx
index b93e2cfb..e8ac1937 100644
--- a/docs/evaluation/how_to_guides/evaluate_llm_application.mdx
+++ b/docs/evaluation/how_to_guides/evaluate_llm_application.mdx
@@ -169,10 +169,16 @@ Since we have labels for this task, our evaluator can directly check if the actu
       def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
           return outputs["class"] == reference_outputs["label"]
     `,
-    typescript`
+    typescript({ caption: "Requires `langsmith>=0.2.9`" })`
       import type { EvaluationResult } from "langsmith/evaluation";
-      function correct({ outputs, referenceOutputs }: { outputs: Record<string, any>, referenceOutputs?: Record<string, any> }): EvaluationResult {
+      function correct({
+        outputs,
+        referenceOutputs,
+      }: {
+        outputs: Record<string, any>;
+        referenceOutputs?: Record<string, any>;
+      }): EvaluationResult {
        const score = outputs.output === referenceOutputs?.outputs;
        return { key: "correct", score };
      }
    `,
@@ -353,8 +359,14 @@ _If you've annotated your code for tracing, you can open the trace of each row i
   await langsmith.createExamples({ inputs, outputs, datasetId: toxicDataset.id });

   // Row-level evaluator
-  function correct(rootRun: Run, example: Example): EvaluationResult {
-    const score = rootRun.outputs?.output === example.outputs?.outputs;
+  function correct({
+    outputs,
+    referenceOutputs,
+  }: {
+    outputs: Record<string, any>;
+    referenceOutputs?: Record<string, any>;
+  }): EvaluationResult {
+    const score = outputs.output === referenceOutputs?.outputs;
     return { key: "correct", score };
   }

diff --git a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx
index a9d8473c..e487393c 100644
--- a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx
+++ b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx
@@ -20,7 +20,7 @@ import {

 LangSmith supports evaluating **existing** experiments in a comparative manner. This allows you to score the outputs from multiple experiments against each other, rather than being confined to evaluating outputs one at a time.
 Think [LMSYS Chatbot Arena](https://chat.lmsys.org/) - this is the same concept!
-To do this, use the [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate) function with two existing experiments.
+To do this, use the [evaluate()](https://langsmith-docs-git-bagatur-rfcbuiltinsdkref-langchain.vercel.app/reference/python/evaluation/langsmith.evaluation._runner.evaluate) function with two existing experiments.

 If you haven't already created experiments to compare, check out our [quick start](../) or our [how-to guide](./evaluate_llm_application) to get started with evaluations.

@@ -132,20 +132,21 @@ In the Python example below, we are pulling [this structured prompt](https://smi
     `,
     typescript({caption: "Requires `langsmith>=0.2.9`"})`
       import { evaluate} from "langsmith/evaluation";
+      import { Run } from "langsmith/schemas";
       import { wrapOpenAI } from "langsmith/wrappers";
       import OpenAI from "openai";
       import { z } from "zod";

       const openai = wrapOpenAI(new OpenAI());

-      async function rankedPreference({ inputs, outputs }: { inputs: Record<string, any>, outputs: Record<string, any> }) {
+      async function rankedPreference({ inputs, runs }: { inputs: Record<string, any>, runs: Run[] }) {
        const scores: Record<string, number> = {};
        const [runA, runB] = runs;
        if (!runA || !runB) throw new Error("Expected at least two runs");

        const payload = {
-         question: example.inputs?.question,
+         question: inputs.question,
          answer_a: runA?.outputs?.output ?? "N/A",
          answer_b: runB?.outputs?.output ?? "N/A",
        };
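
Taken together, the hunks in these two patches keep referencing the same Python SDK surface: the plain-function evaluator signature, `aevaluate()` as the asynchronous counterpart of `evaluate()`, and the `max_concurrency` argument for throttling concurrent calls. The sketch below shows how those pieces might fit together in one script. It is not part of any patch in this series; the dataset name `"toxic-queries"`, the placeholder target function, and the experiment prefix are illustrative assumptions only.

```python
# Minimal sketch (not from the patch series): a plain-function evaluator passed
# to aevaluate() with max_concurrency, per the documentation changed above.
# Assumptions: langsmith>=0.2.0 installed, LANGSMITH_API_KEY set, and a dataset
# named "toxic-queries" with {"text": ...} inputs and {"label": ...} reference outputs.
import asyncio

from langsmith import aevaluate


async def toxicity_classifier(inputs: dict) -> dict:
    # Placeholder target; a real application would call an LLM here.
    text = inputs["text"]
    return {"class": "Toxic" if "hate" in text.lower() else "Not toxic"}


def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    # Custom evaluators are just functions: example fields in, metric out.
    return outputs["class"] == reference_outputs["label"]


async def main() -> None:
    # max_concurrency caps how many target/evaluator calls run at once,
    # which also helps avoid provider rate limits.
    await aevaluate(
        toxicity_classifier,
        data="toxic-queries",  # assumed dataset name
        evaluators=[correct],
        max_concurrency=4,
        experiment_prefix="async-toxicity",  # assumed prefix
    )


if __name__ == "__main__":
    asyncio.run(main())
```

Lowering `max_concurrency` trades evaluation speed for fewer simultaneous model calls, the same lever the rate_limiting.mdx hunk describes for staying under rate limits.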