From 6c23581fc57b62971446999ce66e1bf3a96c803b Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Fri, 22 Nov 2024 21:17:41 -0500
Subject: [PATCH] fmt

---
 .../manage_datasets_programmatically.mdx     |  6 ++---
 .../how_to_guides/evaluation/async.mdx       | 26 ++++++++-----------
 .../evaluation/builtin_evaluators.mdx        |  0
 .../evaluation/evaluate_llm_application.mdx  |  4 +--
 .../how_to_guides/evaluation/langgraph.mdx   | 10 +++----
 docs/evaluation/how_to_guides/index.md       |  2 +-
 6 files changed, 22 insertions(+), 26 deletions(-)
 delete mode 100644 docs/evaluation/how_to_guides/evaluation/builtin_evaluators.mdx

diff --git a/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx b/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx
index c6daa3be..08a9627a 100644
--- a/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx
+++ b/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx
@@ -382,9 +382,9 @@ Additionally, you can also chain multiple filters together using the `and` opera
   tabs={[
     PythonBlock(
       `examples = client.list_examples(
-    dataset_name=dataset_name,
-    filter='and(not(has(metadata, \\'{"foo": "bar"}\\')), exists(metadata, "tenant_id"))'
-    )`
+    dataset_name=dataset_name,
+    filter='and(not(has(metadata, \\'{"foo": "bar"}\\')), exists(metadata, "tenant_id"))'
+)`
     ),
     TypeScriptBlock(
       `const examples = await client.listExamples({datasetName: datasetName, filter: 'and(not(has(metadata, \\'{"foo": "bar"}\\')), exists(metadata, "tenant_id"))'});`
diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx
index 92ab9f2c..6741c635 100644
--- a/docs/evaluation/how_to_guides/evaluation/async.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/async.mdx
@@ -1,8 +1,4 @@
-import {
-  CodeTabs,
-  python,
-  typescript,
-} from "@site/src/components/InstructionsWithCode";
+import { CodeTabs, python } from "@site/src/components/InstructionsWithCode";
 
 # How to run an evaluation asynchronously
 
@@ -29,7 +25,7 @@ You can see how to use it [here](../../how_to_guides/evaluation/evaluate_llm_app
 =0.1.145`"})`
     from langsmith import aevaluate, wrappers, Client
     from openai import AsyncOpenAI
@@ -57,22 +53,22 @@ list 5 concrete questions that should be investigated to determine if the idea i
         return len(output["output"]) < 3 * len(inputs["idea"])
 
     ls_client = Client()
-    # TODO
-    dataset = ...
-    results = aevaluate(
+    examples = ["universal basic income", "nuclear fusion", "hyperloop", "nuclear powered rockets"]
+    dataset = ls_client.create_dataset("research ideas")
+    ls_client.create_examples(
+        dataset_name=dataset.name,
+        inputs=[{"idea": e} for e in examples],
+    )
+
+    results = await aevaluate(
         researcher_app,
         data=dataset,
         evaluators=[concise],
         max_concurrency=2, # Optional, no max by default
-        experiment_prefix="gpt-4o-mini, baseline" # Optional, random by default
+        experiment_prefix="gpt-4o-mini-baseline" # Optional, random by default
     )
     `,
-    typescript`
-    import type { EvaluationResult } from "langsmith/evaluation";
-    import type { Run, Example } from "langsmith/schemas";
-
-    `,
   ]}
 />
diff --git a/docs/evaluation/how_to_guides/evaluation/builtin_evaluators.mdx b/docs/evaluation/how_to_guides/evaluation/builtin_evaluators.mdx
deleted file mode 100644
index e69de29b..00000000
diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx
index 822e381d..b2bf0361 100644
--- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx
@@ -12,7 +12,7 @@ import {
 
 :::info Key concepts
 
-[Evaluations](../../concepts#applying-evaluations) | [Evaluators](../../concepts#evaluators) | [Datasets](../../concepts#datasets) | [Experiments](../../concepts#experiments)
+[Evaluations](../../concepts#applying-evaluations) | [Evaluators](../../concepts#evaluators) | [Datasets](../../concepts#datasets)
 
 :::
 
@@ -232,7 +232,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i
 =0.1.145`"})`
     from langsmith import Client, evaluate, traceable, wrappers
     from openai import OpenAI
diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx
index 82ae0e95..4f6429fe 100644
--- a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx
@@ -204,7 +204,7 @@ If any of your nodes are defined as async, you'll need to use `aevaluate`
     # Remember that langgraph graphs are also langchain runnables.
     target = example_to_state | app
 
-    experiment_results = aevaluate(
+    experiment_results = await aevaluate(
         target,
         data="weather agent",
         evaluators=[correct],
@@ -236,7 +236,7 @@ For example, we can look at the messages to assert that the model invoked the 's
         tool_calls = outputs["messages"][1].tool_calls
         return bool(tool_calls and tool_calls[0]["name"] == "search")
 
-    experiment_results = aevaluate(
+    experiment_results = await aevaluate(
         target,
         data="weather agent",
         evaluators=[correct, right_tool],
@@ -272,7 +272,7 @@ See more about what arguments you can pass to custom evaluators in this [how-to
         right_tool = bool(tool_calls and tool_calls[0]["name"] == "search")
         return {"key": "right_tool", "value": right_tool}
 
-    experiment_results = aevaluate(
+    experiment_results = await aevaluate(
         target,
         data="weather agent",
         evaluators=[correct, right_tool_from_run],
@@ -299,7 +299,7 @@ In this case we can even continue using the evaluators we've been using.
 
     node_target = example_to_state | app.nodes["agent"]
 
-    node_experiment_results = aevaluate(
+    node_experiment_results = await aevaluate(
         node_target,
         data="weather agent",
         evaluators=[right_tool_from_run],
@@ -450,7 +450,7 @@ In this case we can even continue using the evaluators we've been using.
     # Run evaluation
-    experiment_results = aevaluate(
+    experiment_results = await aevaluate(
         target,
         data="weather agent",
         evaluators=[correct, right_tool],
diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md
index 5fe3ea1e..698f00fd 100644
--- a/docs/evaluation/how_to_guides/index.md
+++ b/docs/evaluation/how_to_guides/index.md
@@ -19,7 +19,7 @@ Evaluate and improve your application before deploying it.
 - [Evaluate a `langgraph` graph](./how_to_guides/evaluation/langgraph)
 - [Run an evaluation of an existing experiment](./how_to_guides/evaluation/evaluate_existing_experiment)
 - [Run an evaluation via the REST API](./how_to_guides/evaluation/run_evals_api_only)
-- [Run an evaluation from the prompt playground](./how_to_guides/evaluation/run_evaluation_from_prompt_playground)
+- [Run an evaluation from the UI](./how_to_guides/evaluation/run_evaluation_from_prompt_playground)
 
 ### Define an evaluator
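
Note on the async.mdx hunk above: `aevaluate` is a coroutine, so the snippet is meant to run inside an async context. Below is a minimal, self-contained sketch of that flow under stated assumptions: the `researcher_app` stub and the "research ideas" dataset name are illustrative stand-ins for the guide's real GPT-4o-mini app, and `LANGSMITH_API_KEY` is assumed to be set in the environment.

```python
# Sketch only: mirrors the flow added to async.mdx. Requires langsmith>=0.1.145.
# researcher_app is a stand-in target, not the guide's actual application.
import asyncio

from langsmith import Client, aevaluate


async def researcher_app(inputs: dict) -> dict:
    # A real target would call an LLM here; this stub just echoes the idea.
    return {"output": f"Open questions about {inputs['idea']}"}


def concise(inputs: dict, outputs: dict) -> bool:
    # Same heuristic as the guide: the answer shouldn't be much longer than the idea.
    return len(outputs["output"]) < 3 * len(inputs["idea"])


async def main() -> None:
    ls_client = Client()

    examples = ["universal basic income", "nuclear fusion"]
    dataset = ls_client.create_dataset("research ideas")  # errors if it already exists
    ls_client.create_examples(
        dataset_name=dataset.name,
        inputs=[{"idea": e} for e in examples],
    )

    # aevaluate must be awaited, hence the async main() wrapper.
    await aevaluate(
        researcher_app,
        data=dataset,
        evaluators=[concise],
        max_concurrency=2,
        experiment_prefix="gpt-4o-mini-baseline",
    )


asyncio.run(main())
```

In a notebook you can `await aevaluate(...)` directly instead of wrapping the call in `asyncio.run`.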
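
For the chained-filter hunk in manage_datasets_programmatically.mdx, the same call written as plain Python rather than a PythonBlock template might look like the sketch below; `"my-dataset"` is a placeholder name.

```python
# Sketch of the chained metadata filter; "my-dataset" is a placeholder dataset name.
from langsmith import Client

client = Client()
examples = client.list_examples(
    dataset_name="my-dataset",
    # Keep examples whose metadata does not contain {"foo": "bar"} and that do have a tenant_id key.
    filter='and(not(has(metadata, \'{"foo": "bar"}\')), exists(metadata, "tenant_id"))',
)
for example in examples:
    print(example.id, example.metadata)
```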