From f7f83a954e7114c611509ccbcd7b747ccf6e2f80 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 12 Nov 2024 17:48:28 -0800 Subject: [PATCH 01/29] wip: eval how to revamp --- .../export_filtered_traces_to_dataset.mdx | 2 +- .../manage_datasets_in_application.mdx | 2 +- .../manage_datasets_programmatically.mdx | 2 +- .../how_to_guides/datasets/share_dataset.mdx | 2 +- .../datasets/version_datasets.mdx | 2 +- .../how_to_guides/evaluation/async.mdx | 0 .../evaluation/audit_evaluator_scores.mdx | 2 +- .../evaluation/bind_evaluator_to_dataset.mdx | 2 +- .../evaluation/check_evaluator.mdx | 0 .../evaluation/compare_experiment_results.mdx | 2 +- .../evaluation/create_few_shot_evaluators.mdx | 2 +- .../evaluation/custom_evaluator.mdx | 0 .../evaluation/dataset_subset.mdx | 83 ++++ .../evaluation/dataset_version.mdx | 45 ++ .../evaluate_existing_experiment.mdx | 2 +- .../evaluation/evaluate_llm_application.mdx | 441 +----------------- .../evaluate_on_intermediate_steps.mdx | 2 +- .../evaluation/evaluate_pairwise.mdx | 2 +- .../fetch_perf_metrics_experiment.mdx | 4 +- .../evaluation/filter_experiments_ui.mdx | 2 +- .../evaluation/langchain_runnable.mdx | 76 +++ .../how_to_guides/evaluation/large_job.mdx | 0 .../how_to_guides/evaluation/metric_type.mdx | 0 .../evaluation/multiple_scores.mdx | 65 +++ .../evaluation/rate_limiting.mdx | 0 .../how_to_guides/evaluation/repetition.mdx | 39 ++ .../evaluation/run_evals_api_only.mdx | 24 +- .../run_evaluation_from_prompt_playground.mdx | 2 +- .../how_to_guides/evaluation/summary.mdx | 76 +++ .../how_to_guides/evaluation/unit_testing.mdx | 10 +- .../upload_existing_experiments.mdx | 2 +- ...use_langchain_off_the_shelf_evaluators.mdx | 6 +- docs/evaluation/how_to_guides/index.md | 78 ++-- 33 files changed, 492 insertions(+), 485 deletions(-) create mode 100644 docs/evaluation/how_to_guides/evaluation/async.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/check_evaluator.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/dataset_version.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/large_job.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/metric_type.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/repetition.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/summary.mdx diff --git a/docs/evaluation/how_to_guides/datasets/export_filtered_traces_to_dataset.mdx b/docs/evaluation/how_to_guides/datasets/export_filtered_traces_to_dataset.mdx index 9a8526fe..167755f9 100644 --- a/docs/evaluation/how_to_guides/datasets/export_filtered_traces_to_dataset.mdx +++ b/docs/evaluation/how_to_guides/datasets/export_filtered_traces_to_dataset.mdx @@ -2,7 +2,7 @@ sidebar_position: 6 --- -# Export filtered traces from experiment to dataset +# How to export filtered traces from experiment to dataset After running an offline evaluation in LangSmith, you may want to export traces that met some evaluation criteria to a dataset. 
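As a rough companion sketch using the Python SDK rather than the UI (the experiment name, the `"correctness"` feedback key, and the score threshold below are all hypothetical placeholders), one way to pull an experiment's qualifying traces into a dataset programmatically might look like this:

```python
from langsmith import Client

client = Client()

# Hypothetical names -- substitute your own experiment and dataset.
experiment_name = "my-experiment-1234"
dataset = client.create_dataset("filtered-experiment-traces")

# Fetch the experiment's root runs and keep the ones whose average
# "correctness" feedback score fell below 1 (i.e. the failures).
runs = client.list_runs(project_name=experiment_name, is_root=True)
filtered = [
    run
    for run in runs
    if (run.feedback_stats or {}).get("correctness", {}).get("avg", 1) < 1
]

# Add the filtered runs' inputs and outputs to the dataset as examples.
client.create_examples(
    inputs=[run.inputs for run in filtered],
    outputs=[run.outputs for run in filtered],
    dataset_id=dataset.id,
)
```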
diff --git a/docs/evaluation/how_to_guides/datasets/manage_datasets_in_application.mdx b/docs/evaluation/how_to_guides/datasets/manage_datasets_in_application.mdx index 07c2df7c..7820a550 100644 --- a/docs/evaluation/how_to_guides/datasets/manage_datasets_in_application.mdx +++ b/docs/evaluation/how_to_guides/datasets/manage_datasets_in_application.mdx @@ -2,7 +2,7 @@ sidebar_position: 1 --- -# Manage datasets in the application +# How to manage datasets in the UI :::tip Recommended Reading Before diving into this content, it might be helpful to read the following: diff --git a/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx b/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx index d0042f28..c6daa3be 100644 --- a/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx +++ b/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx @@ -8,7 +8,7 @@ import { TypeScriptBlock, } from "@site/src/components/InstructionsWithCode"; -# Manage datasets programmatically +# How to manage datasets programmatically You can use the Python and TypeScript SDK to manage datasets programmatically. This includes creating, updating, and deleting datasets, as well as adding examples to them. diff --git a/docs/evaluation/how_to_guides/datasets/share_dataset.mdx b/docs/evaluation/how_to_guides/datasets/share_dataset.mdx index 496af963..ff9aa918 100644 --- a/docs/evaluation/how_to_guides/datasets/share_dataset.mdx +++ b/docs/evaluation/how_to_guides/datasets/share_dataset.mdx @@ -4,7 +4,7 @@ sidebar_position: 4 import { RegionalUrl } from "@site/src/components/RegionalUrls"; -# Share or unshare a dataset publicly +# How to share or unshare a dataset publicly :::caution diff --git a/docs/evaluation/how_to_guides/datasets/version_datasets.mdx b/docs/evaluation/how_to_guides/datasets/version_datasets.mdx index 510b7815..be7cc8bd 100644 --- a/docs/evaluation/how_to_guides/datasets/version_datasets.mdx +++ b/docs/evaluation/how_to_guides/datasets/version_datasets.mdx @@ -2,7 +2,7 @@ sidebar_position: 3 --- -# Version datasets +# How to version datasets In LangSmith, datasets are versioned. This means that every time you add, update, or delete examples in your dataset, a new version of the dataset is created. diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx new file mode 100644 index 00000000..e69de29b diff --git a/docs/evaluation/how_to_guides/evaluation/audit_evaluator_scores.mdx b/docs/evaluation/how_to_guides/evaluation/audit_evaluator_scores.mdx index 5b74d3c9..0aaf9268 100644 --- a/docs/evaluation/how_to_guides/evaluation/audit_evaluator_scores.mdx +++ b/docs/evaluation/how_to_guides/evaluation/audit_evaluator_scores.mdx @@ -8,7 +8,7 @@ import { python, } from "@site/src/components/InstructionsWithCode"; -# Audit evaluator scores +# How to audit evaluator scores LLM-as-a-judge evaluators don't always get it right. Because of this, it is often useful for a human to manually audit the scores left by an evaluator and correct them where necessary. LangSmith allows you to make corrections on evaluator scores in the UI or SDK. 
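As a minimal sketch of recording such a correction from the Python SDK (the feedback ID below is a placeholder, and this assumes the client's `update_feedback` method accepts a `correction` payload):

```python
from langsmith import Client

client = Client()

# Placeholder ID of the evaluator feedback you want to audit and correct.
feedback_id = "00000000-0000-0000-0000-000000000000"

# Record a human correction on top of the evaluator's original score.
client.update_feedback(feedback_id, correction={"score": 1})
```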
diff --git a/docs/evaluation/how_to_guides/evaluation/bind_evaluator_to_dataset.mdx b/docs/evaluation/how_to_guides/evaluation/bind_evaluator_to_dataset.mdx
index 505f8a06..88236b54 100644
--- a/docs/evaluation/how_to_guides/evaluation/bind_evaluator_to_dataset.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/bind_evaluator_to_dataset.mdx
@@ -2,7 +2,7 @@
 sidebar_position: 2
 ---
 
-# Bind an evaluator to a dataset in the UI
+# How to bind an evaluator to a dataset in the UI
 
 While you can specify evaluators to grade the results of your experiments programmatically (see [this guide](./evaluate_llm_application) for more information), you can also bind evaluators to a dataset in the UI.
 This allows you to configure automatic evaluators that grade your experiment results. We have support for both LLM-based evaluators, and custom python code evaluators.
diff --git a/docs/evaluation/how_to_guides/evaluation/check_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/check_evaluator.mdx
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/evaluation/how_to_guides/evaluation/compare_experiment_results.mdx b/docs/evaluation/how_to_guides/evaluation/compare_experiment_results.mdx
index 3e435122..9875f4db 100644
--- a/docs/evaluation/how_to_guides/evaluation/compare_experiment_results.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/compare_experiment_results.mdx
@@ -2,7 +2,7 @@
 sidebar_position: 5
 ---
 
-# Compare experiment results
+# How to compare experiment results
 
 Oftentimes, when you are iterating on your LLM application (such as changing the model or the prompt), you will want to compare the results of different experiments.
 
diff --git a/docs/evaluation/how_to_guides/evaluation/create_few_shot_evaluators.mdx b/docs/evaluation/how_to_guides/evaluation/create_few_shot_evaluators.mdx
index cbee2703..e50b3965 100644
--- a/docs/evaluation/how_to_guides/evaluation/create_few_shot_evaluators.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/create_few_shot_evaluators.mdx
@@ -2,7 +2,7 @@
 sidebar_position: 10
 ---
 
-# Create few-shot evaluators
+# How to create few-shot evaluators
 
 Using LLM-as-a-Judge evaluators can be very helpful when you can't evaluate your system programmatically. However, improving/iterating on these prompts can add unnecessary
 overhead to the development process of an LLM-based application - you now need to maintain both your application **and** your evaluators. To make this process easier, LangSmith allows
diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx b/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx
new file mode 100644
index 00000000..4515a454
--- /dev/null
+++ b/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx
@@ -0,0 +1,83 @@
+import {
+  CodeTabs,
+  python,
+  typescript,
+} from "@site/src/components/InstructionsWithCode";
+
+# How to evaluate on a split / filtered view of a dataset
+
+:::tip Recommended reading
+
+Before diving into this content, it might be helpful to read:
+
+- [guide on fetching examples](../datasets/manage_datasets_programmatically#fetch-examples).
+- [guide on creating/managing dataset splits](../datasets/manage_datasets_in_application#create-and-manage-dataset-splits) + +::: + +# How to: Evaluate on a filtered view of a dataset + +You can use the `list_examples` / `listExamples` method to fetch a subset of examples from a dataset to evaluate on. You can refer to guide above to learn more about the different ways to fetch examples. + +One common workflow is to fetch examples that have a certain metadata key-value pair. + + labelText(inputs["input"]), { + data: langsmith.listExamples({ + datasetName: datasetName, + metadata: {"desired_key": "desired_value"}, + }), + evaluators: [correctLabel], + experimentPrefix: "Toxic Queries", + }); + `, + ]} +/> + + +## Evaluate on a dataset split + +You can use the `list_examples` / `listExamples` method to evaluate on one or multiple splits of your dataset. The `splits` param takes a list of the splits you would like to evaluate. + + labelText(inputs["input"]), { + data: langsmith.listExamples({ + datasetName: datasetName, + splits: ["test", "training"], + }), + evaluators: [correctLabel], + experimentPrefix: "Toxic Queries", + }); + `, + ]} +/> diff --git a/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx b/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx new file mode 100644 index 00000000..01009afe --- /dev/null +++ b/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx @@ -0,0 +1,45 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to evaluate on a specific dataset version + +:::tip Recommended reading + +Before diving into this content, it might be helpful to read the [guide on versioning datasets](../datasets/version_datasets). +Additionally, it might be helpful to read the [guide on fetching examples](../datasets/manage_datasets_programmatically#fetch-examples). + +::: + +You can take advantage of the fact that `evaluate` allows passing in an iterable of examples to evaluate on a particular version of a dataset. +Simply use `list_examples` / `listExamples` to fetch examples from a particular version tag using `as_of` / `asOf`. + + labelText(inputs["input"]), { + data: langsmith.listExamples({ + datasetName: datasetName, + asOf: "latest", + }), + evaluators: [correctLabel], + experimentPrefix: "Toxic Queries", + }); + `, + ]} +/> diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_existing_experiment.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_existing_experiment.mdx index 309f969d..229be6be 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_existing_experiment.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_existing_experiment.mdx @@ -2,7 +2,7 @@ sidebar_position: 6 --- -# Evaluate an existing experiment +# How to evaluate an existing experiment :::note Currently, `evaluate_existing` is only supported in the Python SDK. 
diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index faea0989..b050e216 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -10,9 +10,9 @@ import { TypeScriptBlock, } from "@site/src/components/InstructionsWithCode"; -# Evaluate an LLM Application +# How to run an evaluation -:::tip Recommended Reading +:::tip Recommended reading Before diving into this content, it might be helpful to read the following: - [Conceptual guide on evaluation](../../concepts) @@ -24,18 +24,7 @@ Before diving into this content, it might be helpful to read the following: Evaluating the performance of your LLM application is a critical step in the development process. LangSmith makes it easy to run evaluations and track evaluation performance over time. This section provides guidance on how to evaluate the performance of your LLM application. -## Run an evaluation - -At a high-level, the evaluation process involves the following steps: - -1. Define your LLM application or target task. -2. Creating or selecting a dataset to evaluate your LLM application. Your evaluation criteria may or may not require expected outputs in the dataset. -3. Configuring evaluators to score the outputs of your LLM application, sometimes against expected outputs. -4. Running the evaluation and viewing the results. - -The following example involves evaluating a very simple LLM pipeline as classifier to label input data as `"Toxic"` or `"Not toxic"`. - -### Step 1: Define your target task +## Step 1: Define your application logic In this case, we are defining a simple evaluation target consisting of an LLM pipeline that classifies text as toxic or non-toxic. We've optionally enabled tracing to capture the inputs and outputs of each step in the pipeline. @@ -49,18 +38,20 @@ To understand how to annotate your code for tracing, please refer to [this guide from langsmith import traceable, wrappers from openai import Client - openai = wrappers.wrap_openai(Client()) + client = wrappers.wrap_openai(Client()) @traceable def label_text(text): + system = ( + "Please review the user query below and determine if it contains any form of toxic behavior, " + "such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, " + "and 'Not toxic' if it doesn't." + ) messages = [ - { - "role": "system", - "content": "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't.", - }, + {"role": "system", "content": system}, {"role": "user", "content": text}, ] - result = openai.chat.completions.create( + result = client.chat.completions.create( messages=messages, model="gpt-4o-mini", temperature=0 ) return result.choices[0].message.content @@ -93,7 +84,7 @@ To understand how to annotate your code for tracing, please refer to [this guide ]} /> -### Step 2: Create or select a dataset +## Step 2: Create or select a dataset In this case, we are creating a dataset to evaluate the performance of our LLM application. The dataset contains examples of toxic and non-toxic text. @@ -163,7 +154,7 @@ The values in the examples can also take any form, such as strings, numbers, lis ]} /> -### Step 3. Configure evaluators to score the outputs +## Step 3. 
Configure evaluators to score the outputs In this case, we are using a dead-simple evaluator that compares the output of our LLM pipeline to the expected output in the dataset. Writing evaluators is discussed in more detail in the [following section](#custom-evaluators). @@ -171,10 +162,13 @@ Writing evaluators is discussed in more detail in the [following section](#custo dict: - score = root_run.outputs.get("output") == example.outputs.get("label") - return {"score": int(score), "key": "correct_label"}`), + python` + from langsmith.schemas import Example, Run + + def correct_label(root_run: Run, example: Example) -> dict: + score = root_run.outputs.get("output") == example.outputs.get("label") + return {"score": int(score), "key": "correct_label"} + `, typescript` import type { EvaluationResult } from "langsmith/evaluation"; import type { Run, Example } from "langsmith/schemas"; @@ -188,7 +182,7 @@ def correct_label(root_run: Run, example: Example) -> dict: ]} /> -### Step 4. Run the evaluation and view the results +## Step 4. Run the evaluation and view the results You can use the `evaluate` method in Python and TypeScript to run an evaluation. @@ -236,396 +230,3 @@ _If you've annotated your code for tracing, you can open the trace of each row i ![](../evaluation/static/view_experiment.gif) -## Use custom evaluators - -At a high-level, an evaluator judges an invocation of your LLM application against a reference example, and returns an evaluation score. - -In LangSmith evaluators, we represent this process as a function that takes in a [`Run`](/reference/data_formats/run_data_format) -(representing the LLM app invocation) and an [`Example`](/reference/data_formats/example_data_format) -(representing the data point to evaluate), and returns [`Feedback`](/reference/data_formats/feedback_data_format) (representing the evaluator's score of the LLM app invocation). - -Here is an example of a very simple custom evaluator that compares the output of a model to the expected output in the dataset: - - dict: - score = root_run.outputs.get("output") == example.outputs.get("label") - return {"score": int(score), "key": "correct_label"}`), - typescript` - import type { EvaluationResult } from "langsmith/evaluation"; - import type { Run, Example } from "langsmith/schemas"; - - // Row-level evaluator - function correctLabel(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.outputs === example.outputs?.output; - return { key: "correct_label", score }; - } - `, - ]} -/> - -:::note default feedback key - -If the "key" field is not provided, the default key name will be the name of the evaluator function. - -::: - -:::tip advanced use-cases - -- **Evaluate on intermediate steps**: The `run` object comes from LangSmith tracing, so it allows you to see any traced intermediate steps of your LLM application. See [this guide](./evaluate_on_intermediate_steps) for more info on evaluating on intermediate steps. -- **Return multiple scores**: you can return multiple scores from a single evaluator. Please check out [the example below](#return-multiple-scores) for more information. - ::: - -## Evaluate on a particular version of a dataset - -:::tip Recommended Reading - -Before diving into this content, it might be helpful to read the [guide on versioning datasets](../datasets/version_datasets). -Additionally, it might be helpful to read the [guide on fetching examples](../datasets/manage_datasets_programmatically#fetch-examples). 
- -::: - -You can take advantage of the fact that `evaluate` allows passing in an iterable of examples to evaluate on a particular version of a dataset. -Simply use `list_examples` / `listExamples` to fetch examples from a particular version tag using `as_of` / `asOf`. - - labelText(inputs["input"]), { - data: langsmith.listExamples({ - datasetName: datasetName, - asOf: "latest", - }), - evaluators: [correctLabel], - experimentPrefix: "Toxic Queries", - }); - `, - ]} -/> - -## Evaluate on a subset of a dataset - -:::tip Recommended Reading - -Before diving into this content, it might be helpful to read the [guide on fetching examples](../datasets/manage_datasets_programmatically#fetch-examples). - -::: - -You can use the `list_examples` / `listExamples` method to fetch a subset of examples from a dataset to evaluate on. You can refer to guide above to learn more about the different ways to fetch examples. - -One common workflow is to fetch examples that have a certain metadata key-value pair. - - labelText(inputs["input"]), { - data: langsmith.listExamples({ - datasetName: datasetName, - metadata: {"desired_key": "desired_value"}, - }), - evaluators: [correctLabel], - experimentPrefix: "Toxic Queries", - }); - `, - ]} -/> - -## Evaluate on a dataset split - -:::tip Recommended Reading - -Before reading, it might be useful to check out the [guide on creating/managing dataset splits](../datasets/manage_datasets_in_application#create-and-manage-dataset-splits). - -::: - -You can use the `list_examples` / `listExamples` method to evaluate on one or multiple splits of your dataset. The `splits` param takes a list of the splits you would like to evaluate. - - labelText(inputs["input"]), { - data: langsmith.listExamples({ - datasetName: datasetName, - splits: ["test", "training"], - }), - evaluators: [correctLabel], - experimentPrefix: "Toxic Queries", - }); - `, - ]} -/> - -## Evaluate on a dataset with repetitions - -The optional `num_repetitions` param to the `evaluate` function allows you to specify how many times -to run/evaluate each example in your dataset. For instance, if you have 5 examples and set -`num_repetitions=5`, each example will be run 5 times, for a total of 25 runs. This can be useful for reducing -noise in systems prone to high variability, such as agents. - - labelText(inputs["input"]), { - data: datasetName, - evaluators: [correctLabel], - experimentPrefix: "Toxic Queries", - numReptitions=3, - }); - `, - ]} -/> - -## Use a summary evaluator - -Some metrics can only be defined on the entire experiment level as opposed to the individual runs of the experiment. -For example, you may want to compute the overall pass rate or f1 score of your evaluation target across all examples in the dataset. -These are called `summary_evaluators`. Instead of taking in a single `Run` and `Example`, these evaluators take a list of each. 
- -Below, we'll implement a very simple summary evaluator that computes overall pass rate: - - dict: - correct = 0 - for i, run in enumerate(runs): - if run.outputs["output"] == examples[i].outputs["label"]: - correct += 1 - if correct / len(runs) > 0.5: - return {"key": "pass", "score": True} - else: - return {"key": "pass", "score": False} - `, - typescript` - import { Run, Example } from "langsmith/schemas"; - - function summaryEval(runs: Run[], examples: Example[]) { - let correct = 0; - - for (let i = 0; i < runs.length; i++) { - if (runs[i].outputs["output"] === examples[i].outputs["label"]) { - correct += 1; - } - } - - return { key: "pass", score: correct / runs.length > 0.5 }; - } - `, - ]} -/> - -You can then pass this evaluator to the `evaluate` method as follows: - - labelQuery(inputs["input"]), { - data: datasetName, - evaluators: [correctLabel], - summaryEvaluators: [summaryEval], - experimentPrefix: "Toxic Queries", - }); - `, - ]} -/> - -In the LangSmith UI, you'll the summary evaluator's score displayed with the corresponding key. - -![](../evaluation/static/summary_eval.png) - -## Evaluate a LangChain runnable - -You can configure a `LangChain` runnable to be evaluated by passing `runnable.invoke` it to the `evaluate` method in Python, or just the `runnable` in TypeScript. - -First, define your `LangChain` runnable: - - - -Then, pass the `runnable.invoke` method to the `evaluate` method. Note that the input variables of the runnable must match the keys of the example inputs. - - - -The runnable is traced appropriately for each output. - -![](../evaluation/static/runnable_eval.png) - -## Return multiple scores - -In most cases, each evaluator returns a single key or categorical value. Alternatively, you can return evaluation metrics from a single evaluator. This is useful if your metrics share intermediate values. For example, precision and recall but rely on the same true and false positives and negative values, or you may have an LLM generate multiple metrics in a single shot. - -To return multiple scores, simply return a dictionary/object of the following form: - -```python -{ - "results": [ - {"key":string, "score": number}, - {"key":string, "score": number}, - # You may log as many as you wish - ] -} -``` - -Each of these dictionaries can contain any or all of the [feedback fields](/reference/data_formats/feedback_data_format); check out the linked document for more information. - -Example: - - dict: - # Your evaluation logic here - return { - "results": [ - {"key": "precision", "score": 0.8}, - {"key": "recall", "score": 0.9}, - {"key": "f1", "score": 0.85}, - ] - } - `), - typescript({ - caption: - "Support for multiple scores is available in `langsmith@0.1.32` and higher", - })` - import type { Run, Example } from "langsmith/schemas"; - - function multipleScores(rootRun: Run, example: Example) { - // Your evaluation logic here - return { - results: [ - { key: "precision", score: 0.8 }, - { key: "recall", score: 0.9 }, - { key: "f1", score: 0.85 }, - ], - }; - } - `, - ]} -/> - -Rows from the resulting experiment will display each of the scores. 
- -![](../evaluation/static/multiple_scores.png) diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_on_intermediate_steps.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_on_intermediate_steps.mdx index 504a8042..9c9c3a14 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_on_intermediate_steps.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_on_intermediate_steps.mdx @@ -10,7 +10,7 @@ import { typescript, } from "@site/src/components/InstructionsWithCode"; -# Evaluate on intermediate steps +# How to evaluate intermediate steps While, in many scenarios, it is sufficient to evaluate the final output of your task, in some cases you might want to evaluate the intermediate steps of your pipeline. diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx index a75eeaf8..11287e70 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx @@ -9,7 +9,7 @@ import { typescript, } from "@site/src/components/InstructionsWithCode"; -# Run pairwise evaluations +# How to run pairwise evaluations :::tip Recommended Reading Before diving into this content, it might be helpful to read the following: diff --git a/docs/evaluation/how_to_guides/evaluation/fetch_perf_metrics_experiment.mdx b/docs/evaluation/how_to_guides/evaluation/fetch_perf_metrics_experiment.mdx index 74ac1771..2017a6ff 100644 --- a/docs/evaluation/how_to_guides/evaluation/fetch_perf_metrics_experiment.mdx +++ b/docs/evaluation/how_to_guides/evaluation/fetch_perf_metrics_experiment.mdx @@ -8,7 +8,7 @@ import { TypeScriptBlock, } from "@site/src/components/InstructionsWithCode"; -# Fetch performance metrics for an experiment +# How to fetch performance metrics for an experiment :::tip Experiments, Projects, and Sessions @@ -108,7 +108,7 @@ from langsmith import Client client = Client() -# Create a dataset + Create a dataset examples = [ ("Harrison", "Hello Harrison"), ("Ankush", "Hello Ankush"), diff --git a/docs/evaluation/how_to_guides/evaluation/filter_experiments_ui.mdx b/docs/evaluation/how_to_guides/evaluation/filter_experiments_ui.mdx index f31ca391..6f32dfc2 100644 --- a/docs/evaluation/how_to_guides/evaluation/filter_experiments_ui.mdx +++ b/docs/evaluation/how_to_guides/evaluation/filter_experiments_ui.mdx @@ -6,7 +6,7 @@ import { TypeScriptBlock, } from "@site/src/components/InstructionsWithCode"; -# Filter experiments in the UI +# How to filter experiments in the UI LangSmith lets you filter your previous experiments by feedback scores and metadata to make it easy to find only the experiments you care about. diff --git a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx new file mode 100644 index 00000000..8c8a2065 --- /dev/null +++ b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx @@ -0,0 +1,76 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to evaluate a LangChain runnable + +You can configure a `LangChain` runnable to be evaluated by passing `runnable.invoke` it to the `evaluate` method in Python, or just the `runnable` in TypeScript. + +First, define your `LangChain` runnable: + + + +Then, pass the `runnable.invoke` method to the `evaluate` method. Note that the input variables of the runnable must match the keys of the example inputs. 
+ + + +The runnable is traced appropriately for each output. + +![](../evaluation/static/runnable_eval.png) diff --git a/docs/evaluation/how_to_guides/evaluation/large_job.mdx b/docs/evaluation/how_to_guides/evaluation/large_job.mdx new file mode 100644 index 00000000..e69de29b diff --git a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx new file mode 100644 index 00000000..e69de29b diff --git a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx new file mode 100644 index 00000000..c5f50a35 --- /dev/null +++ b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx @@ -0,0 +1,65 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to return multiple scores in one evaluator + +In most cases, each evaluator returns a single key or categorical value. Alternatively, you can return evaluation metrics from a single evaluator. This is useful if your metrics share intermediate values. For example, precision and recall but rely on the same true and false positives and negative values, or you may have an LLM generate multiple metrics in a single shot. + +To return multiple scores, simply return a dictionary/object of the following form: + +```python +{ + "results": [ + {"key":string, "score": number}, + {"key":string, "score": number}, + # You may log as many as you wish + ] +} +``` + +Each of these dictionaries can contain any or all of the [feedback fields](/reference/data_formats/feedback_data_format); check out the linked document for more information. + +Example: + + dict: + # Your evaluation logic here + return { + "results": [ + {"key": "precision", "score": 0.8}, + {"key": "recall", "score": 0.9}, + {"key": "f1", "score": 0.85}, + ] + } + `, + typescript({ + caption: + "Support for multiple scores is available in `langsmith@0.1.32` and higher", + })` + import type { Run, Example } from "langsmith/schemas"; + + function multipleScores(rootRun: Run, example: Example) { + // Your evaluation logic here + return { + results: [ + { key: "precision", score: 0.8 }, + { key: "recall", score: 0.9 }, + { key: "f1", score: 0.85 }, + ], + }; + } + `, + ]} +/> + +Rows from the resulting experiment will display each of the scores. + +![](../evaluation/static/multiple_scores.png) \ No newline at end of file diff --git a/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx b/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx new file mode 100644 index 00000000..e69de29b diff --git a/docs/evaluation/how_to_guides/evaluation/repetition.mdx b/docs/evaluation/how_to_guides/evaluation/repetition.mdx new file mode 100644 index 00000000..2f8c0a45 --- /dev/null +++ b/docs/evaluation/how_to_guides/evaluation/repetition.mdx @@ -0,0 +1,39 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to evaluate with repetitions + +The optional `num_repetitions` param to the `evaluate` function allows you to specify how many times +to run/evaluate each example in your dataset. For instance, if you have 5 examples and set +`num_repetitions=5`, each example will be run 5 times, for a total of 25 runs. This can be useful for reducing +noise in systems prone to high variability, such as agents. 
+
+<CodeTabs
+  groupId="client-language"
+  tabs={[
+    python`
+      from langsmith.evaluation import evaluate
+
+      results = evaluate(
+          lambda inputs: label_text(inputs["text"]),
+          data=dataset_name,
+          evaluators=[correct_label],
+          experiment_prefix="Toxic Queries",
+          num_repetitions=3,
+      )
+    `,
+    typescript`
+      import { evaluate } from "langsmith/evaluation";
+
+      await evaluate((inputs) => labelText(inputs["input"]), {
+        data: datasetName,
+        evaluators: [correctLabel],
+        experimentPrefix: "Toxic Queries",
+        numRepetitions: 3,
+      });
+    `,
+  ]}
+/>
diff --git a/docs/evaluation/how_to_guides/evaluation/run_evals_api_only.mdx b/docs/evaluation/how_to_guides/evaluation/run_evals_api_only.mdx
index 9c8e323b..b431fe98 100644
--- a/docs/evaluation/how_to_guides/evaluation/run_evals_api_only.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/run_evals_api_only.mdx
@@ -9,7 +9,7 @@ import {
 } from "@site/src/components/InstructionsWithCode";
 import { RegionalUrl } from "@site/src/components/RegionalUrls";
 
-# Run evals with the REST API
+# How to use the REST API
 
 :::tip Recommended Reading
 Before diving into this content, it might be helpful to read the following:
@@ -40,7 +40,7 @@ from uuid import uuid4
 
 client = Client()
 
-# Create a dataset
+# Create a dataset
 examples = [
     ("Shut up, idiot", "Toxic"),
     ("You're a wonderful person", "Not toxic"),
@@ -63,8 +63,8 @@ client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
 First, pull all of the examples you'd want to use in your experiment.
 
 ```python
-# Pick a dataset id. In this case, we are using the dataset we created above.
-# Spec: https://api.smith.langchain.com/redoc#tag/examples/operation/delete_example_api_v1_examples__example_id__delete
+# Pick a dataset id. In this case, we are using the dataset we created above.
+# Spec: https://api.smith.langchain.com/redoc#tag/examples/operation/delete_example_api_v1_examples__example_id__delete
 dataset_id = dataset.id
 
 params = { "dataset": dataset_id }
@@ -152,9 +152,9 @@ def run_completion_on_example(example, model_name, experiment_id):
 We are going to run completions on all examples using two models: gpt-3.5-turbo and gpt-4o-mini.
 
 ```python
-# Create a new experiment using the /sessions endpoint
-# An experiment is a collection of runs with a reference to the dataset used
-# Spec: https://api.smith.langchain.com/redoc#tag/tracer-sessions/operation/create_tracer_session_api_v1_sessions_post
+# Create a new experiment using the /sessions endpoint
+# An experiment is a collection of runs with a reference to the dataset used
+# Spec: https://api.smith.langchain.com/redoc#tag/tracer-sessions/operation/create_tracer_session_api_v1_sessions_post
 
 model_names = ("gpt-3.5-turbo", "gpt-4o-mini")
 experiment_ids = []
@@ -194,8 +194,8 @@ Next, we'll demonstrate how to run a pairwise experiment. In a pairwise experime
 For more information, check out [this guide](../evaluation/evaluate_pairwise).
```python -# A comparative experiment allows you to provide a preferential ranking on the outputs of two or more experiments -# Spec: https://api.smith.langchain.com/redoc#tag/datasets/operation/create_comparative_experiment_api_v1_datasets_comparative_post + A comparative experiment allows you to provide a preferential ranking on the outputs of two or more experiments + Spec: https://api.smith.langchain.com/redoc#tag/datasets/operation/create_comparative_experiment_api_v1_datasets_comparative_post resp = requests.post( "https://api.smith.langchain.com/api/v1/datasets/comparative", json={ @@ -213,9 +213,9 @@ resp = requests.post( comparative_experiment = resp.json() comparative_experiment_id = comparative_experiment["id"] -# You can iterate over the runs in the experiments belonging to the comparative experiment and preferentially rank the outputs + You can iterate over the runs in the experiments belonging to the comparative experiment and preferentially rank the outputs -# Fetch the comparative experiment + Fetch the comparative experiment resp = requests.get( f"https://api.smith.langchain.com/api/v1/datasets/{str(dataset_id)}/comparative", params={"id": comparative_experiment_id}, @@ -228,7 +228,7 @@ experiment_ids = [info["id"] for info in comparative_experiment["experiments_inf from collections import defaultdict example_id_to_runs_map = defaultdict(list) -# Spec: https://api.smith.langchain.com/redoc#tag/run/operation/query_runs_api_v1_runs_query_post + Spec: https://api.smith.langchain.com/redoc#tag/run/operation/query_runs_api_v1_runs_query_post runs = requests.post( f"https://api.smith.langchain.com/api/v1/runs/query", headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]}, diff --git a/docs/evaluation/how_to_guides/evaluation/run_evaluation_from_prompt_playground.mdx b/docs/evaluation/how_to_guides/evaluation/run_evaluation_from_prompt_playground.mdx index c7b99141..b2dee48b 100644 --- a/docs/evaluation/how_to_guides/evaluation/run_evaluation_from_prompt_playground.mdx +++ b/docs/evaluation/how_to_guides/evaluation/run_evaluation_from_prompt_playground.mdx @@ -2,7 +2,7 @@ sidebar_position: 2 --- -# Run an evaluation from the prompt playground +# How to run an evaluation from the prompt playground While you can kick off experiments easily using the sdk, as outlined [here](./evaluate_llm_application), it's often useful to run experiments directly in the prompt playground. diff --git a/docs/evaluation/how_to_guides/evaluation/summary.mdx b/docs/evaluation/how_to_guides/evaluation/summary.mdx new file mode 100644 index 00000000..2abc0c5b --- /dev/null +++ b/docs/evaluation/how_to_guides/evaluation/summary.mdx @@ -0,0 +1,76 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to run an aggregate evaluation + +Some metrics can only be defined on the entire experiment level as opposed to the individual runs of the experiment. +For example, you may want to compute the overall pass rate or f1 score of your evaluation target across all examples in the dataset. +These are called `summary_evaluators`. Instead of taking in a single `Run` and `Example`, these evaluators take a list of each. 
+ +Below, we'll implement a very simple summary evaluator that computes overall pass rate: + + dict: + correct = 0 + for i, run in enumerate(runs): + if run.outputs["output"] == examples[i].outputs["label"]: + correct += 1 + if correct / len(runs) > 0.5: + return {"key": "pass", "score": True} + else: + return {"key": "pass", "score": False} + `, + typescript` + import { Run, Example } from "langsmith/schemas"; + + function summaryEval(runs: Run[], examples: Example[]) { + let correct = 0; + + for (let i = 0; i < runs.length; i++) { + if (runs[i].outputs["output"] === examples[i].outputs["label"]) { + correct += 1; + } + } + + return { key: "pass", score: correct / runs.length > 0.5 }; + } + `, + ]} +/> + +You can then pass this evaluator to the `evaluate` method as follows: + + labelQuery(inputs["input"]), { + data: datasetName, + evaluators: [correctLabel], + summaryEvaluators: [summaryEval], + experimentPrefix: "Toxic Queries", + }); + `, + ]} +/> + +In the LangSmith UI, you'll the summary evaluator's score displayed with the corresponding key. + +![](../evaluation/static/summary_eval.png) \ No newline at end of file diff --git a/docs/evaluation/how_to_guides/evaluation/unit_testing.mdx b/docs/evaluation/how_to_guides/evaluation/unit_testing.mdx index bc2c2f53..b43eab1b 100644 --- a/docs/evaluation/how_to_guides/evaluation/unit_testing.mdx +++ b/docs/evaluation/how_to_guides/evaluation/unit_testing.mdx @@ -2,7 +2,7 @@ sidebar_position: 7 --- -# Test LLM applications (Python only) +# How to unit test applications (Python only) LangSmith functional tests are assertions and expectations designed to **quickly** identify obvious bugs and regressions in your AI system. Relative to evaluations, tests typically are designed to be **fast** and **cheap** to run, focusing on **specific** functionality and edge cases. We recommend using LangSmith to track any unit tests, end-to-end integration tests, or other specific assertions that touch an LLM or other non-deterministic part of your AI system. @@ -18,7 +18,7 @@ To write a LangSmith functional test, decorate your test function with `@unit`. If you want to track the full nested trace of the system or component being tested, you can mark those functions with `@traceable`. 
For example: ```python -# my_app/main.py + my_app/main.py from langsmith import traceable @traceable # Optional @@ -31,7 +31,7 @@ def generate_sql(user_query): Then define your test: ```python tests/test_my_app.py -# tests/test_my_app.py + tests/test_my_app.py from langsmith import unit from my_app.main import generate_sql @@ -78,8 +78,8 @@ def user_query(): def expected_sql(): return "SELECT * FROM customers" -# output_keys indicate which test arguments to save as 'outputs' in the dataset (Optional) -# Otherwise, all arguments are saved as 'inputs' + output_keys indicate which test arguments to save as 'outputs' in the dataset (Optional) + Otherwise, all arguments are saved as 'inputs' @unit(output_keys=["expected_sql"]) def test_sql_generation_with_fixture(user_query, expected_sql): sql = generate_sql(user_query) diff --git a/docs/evaluation/how_to_guides/evaluation/upload_existing_experiments.mdx b/docs/evaluation/how_to_guides/evaluation/upload_existing_experiments.mdx index 41f9c59f..c9c8551d 100644 --- a/docs/evaluation/how_to_guides/evaluation/upload_existing_experiments.mdx +++ b/docs/evaluation/how_to_guides/evaluation/upload_existing_experiments.mdx @@ -9,7 +9,7 @@ import { } from "@site/src/components/InstructionsWithCode"; import { RegionalUrl } from "@site/src/components/RegionalUrls"; -# Upload experiments run outside of LangSmith with the REST API +# How to upload experiments run outside of LangSmith with the REST API Some users prefer to manage their datasets and run their experiments outside of LangSmith, but want to use the LangSmith UI to view the results. This is supported via our `/datasets/upload-experiment` endpoint. diff --git a/docs/evaluation/how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators.mdx b/docs/evaluation/how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators.mdx index 4fa20643..3b723033 100644 --- a/docs/evaluation/how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators.mdx +++ b/docs/evaluation/how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators.mdx @@ -2,7 +2,7 @@ sidebar_position: 4 --- -# Use LangChain off-the-shelf evaluators (Python only) +# How to use off-the-shelf evaluators (Python only) :::tip Recommended Reading Before diving into this content, it might be helpful to read the following: @@ -25,7 +25,7 @@ from langsmith import Client client = Client() -# Create a dataset + Create a dataset examples = [ ("Ankush", "Hello Ankush"), ("Harrison", "Hello Harrison"), @@ -164,7 +164,7 @@ To measure the similarity between a predicted string and a reference, you can us - The `"embedding_distance"` evaluator computes the distance between the text embeddings of the prediction and reference ```python -# !pip install rapidfuzz + !pip install rapidfuzz from langsmith.evaluation import LangChainStringEvaluator, evaluate string_distance_evaluator = LangChainStringEvaluator( diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 6ca14f90..d31f9a50 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -1,63 +1,85 @@ # Evaluation how-to guides -Step-by-step guides that cover key tasks and operations for doing evaluating and testing your applications in LangSmith. +These guides answer “How do I….?” format questions. +They are goal-oriented and concrete, and are meant to help you complete a specific task. +For conceptual explanations see the [Conceptual guide](./concepts). +For end-to-end walkthroughs see [Tutorials](./tutorials). 
+For comprehensive descriptions of every class and function see the [API Reference](https://langsmith-sdk.readthedocs.io/en/latest/evaluation.html). + +## Offline evaluation + +Evaluate and improve your application. + +### Run an evaluation +- [Run an evaluation using the SDK](./how_to_guides/evaluation/evaluate_llm_application) +- [Run an evaluation asynchronously](./how_to_guides/evaluation/async) +- [Run an evaluation comparing two experiments](./how_to_guides/evaluation/evaluate_pairwise) +- [Run an evaluation of a LangChain / LangGraph object](./how_to_guides/evaluation/langchain_runnable) +- [Run an evaluation of an existing experiment](./how_to_guides/evaluation/evaluate_existing_experiment) +- [Run an evaluation using the REST API](./how_to_guides/evaluation/run_evals_api_only) +- [Run an evaluation in the prompt playground](./how_to_guides/evaluation/run_evaluation_from_prompt_playground) -## Evaluation SDK & API +### Define an evaluator +- [Define a custom evaluator](./how_to_guides/evaluation/custom_evaluator) +- [Use an off-the-shelf evaluator (Python only)](./how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) +- [Evaluate aggregate experiment results](./how_to_guides/evaluation/summary) +- [Evaluate intermediate steps](./how_to_guides/evaluation/evaluate_on_intermediate_steps) +- [Return multiple metrics in one evaluator](./how_to_guides/evaluation/multiple_scores) +- [Return categorical and continuous metrics](./how_to_guides/evaluation/metric_type) +- [Check your evaluator setup](./how_to_guides/evaluation/check_evaluator) -Write evaluations to test and improve your application. +### Configure the data +- [Evaluate on a split / filtered view of a dataset](./how_to_guides/evaluation/dataset_subset) +- [Evaluate on a specific dataset version](./how_to_guides/evaluation/dataset_version) -- [Evaluate an LLM application in the SDK](./how_to_guides/evaluation/evaluate_llm_application) -- [Define a custom evaluator](./how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators) -- [Evaluate on intermediate steps](./how_to_guides/evaluation/evaluate_on_intermediate_steps) -- [Use LangChain off-the-shelf evaluators (Python only)](./how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) -- [Evaluate an existing experiment](./how_to_guides/evaluation/evaluate_existing_experiment) -- [Run a pairwise evaluation](./how_to_guides/evaluation/evaluate_pairwise) -- [Run evals using the API only](./how_to_guides/evaluation/run_evals_api_only) +### Configure an evaluation job +- [Evaluate with repetitions](./how_to_guides/evaluation/repetition) +- [Run a large evaluation job](./how_to_guides/evaluation/large_job) +- [Handle rate limiting](./how_to_guides/evaluation/rate_limiting) ## Unit testing -Run assertions and expectations designed to quickly identify obvious bugs and regressions in your AI system, natively in your favorite testing library. +Unit test your system to identify bugs and regressions. -- [Unit test LLM applications (Python only)](./how_to_guides/evaluation/unit_testing) +- [Unit test applications (Python only)](./how_to_guides/evaluation/unit_testing) -## Auto-evaluation +## Online evaluation -Set up auto-evaluators that LangSmith will automatically run on your experiments. +Evaluate and monitor your system's live performance on production data. 
-- [Set up an Auto-Evaluator to run on all experiments](./how_to_guides/evaluation/bind_evaluator_to_dataset) -- [Create few-shot evaluators](./how_to_guides/evaluation/create_few_shot_evaluators) +- [Set up an online evaluator](../../observability/how_to_guides/monitoring/online_evaluations) +- [Create a few-shot evaluator](./how_to_guides/evaluation/create_few_shot_evaluators) -## Online evaluation +## Automatic evaluation -Set up evaluations to run on incoming traces to understand your application's behavior in production. +Set up evaluators that automatically run for all experiments against a dataset. -- [Set up online evaluations](../../observability/how_to_guides/monitoring/online_evaluations) -- [Create few-shot evaluators](./how_to_guides/evaluation/create_few_shot_evaluators) +- [Set up an auto-evaluator](./how_to_guides/evaluation/bind_evaluator_to_dataset) +- [Create a few-shot evaluator](./how_to_guides/evaluation/create_few_shot_evaluators) -## Experiments +## Analyzing experiment results -Use the experiments UI & API to understand your evaluations. +Use the UI & API to understand your experiment results. -- [Run an evaluation in the prompt playground](./how_to_guides/evaluation/run_evaluation_from_prompt_playground) - [Compare experiments with the comparison view](./how_to_guides/evaluation/compare_experiment_results) - [Filter experiments](./how_to_guides/evaluation/filter_experiments_ui) - [View pairwise experiments](./how_to_guides/evaluation/evaluate_pairwise#view-pairwise-experiments) - [Fetch experiment results in the SDK](./how_to_guides/evaluation/fetch_perf_metrics_experiment) - [Upload experiments run outside of LangSmith with the REST API](./how_to_guides/evaluation/upload_existing_experiments) -## Datasets +## Dataset management -Manage datasets in LangSmith used by your offline evaluations (as well as other downstream applications). +Manage datasets in LangSmith used by your evaluations. -- [Manage datasets in the application](./how_to_guides/datasets/manage_datasets_in_application) +- [Manage datasets from the UI](./how_to_guides/datasets/manage_datasets_in_application) - [Manage datasets programmatically](./how_to_guides/datasets/manage_datasets_programmatically) - [Version datasets](./how_to_guides/datasets/version_datasets) - [Share or unshare a dataset publicly](./how_to_guides/datasets/share_dataset) - [Export filtered traces from an experiment to a dataset](./how_to_guides/datasets/export_filtered_traces_to_dataset) -## Annotation Queues and Human Feedback +## Annotation queues and human feedback -Collect feedback from subject matter experts and users to improve your LLM applications. +Collect feedback from subject matter experts and users to improve your applications. 
- [Use annotation queues](./how_to_guides/human_feedback/annotation_queues) - [Capture user feedback from your application to traces](./how_to_guides/human_feedback/attach_user_feedback) From 3ae30e22972d28be44986ec9c73df94a77f5fc09 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 13 Nov 2024 19:03:42 -0800 Subject: [PATCH 02/29] wip --- .../evaluation/dataset_subset.mdx | 4 +- .../evaluation/dataset_version.mdx | 2 +- .../evaluate_existing_experiment.mdx | 6 +- .../evaluation/evaluate_llm_application.mdx | 207 ++++++++++++------ .../evaluate_on_intermediate_steps.mdx | 2 +- .../evaluation/evaluate_pairwise.mdx | 2 +- .../fetch_perf_metrics_experiment.mdx | 2 +- .../evaluation/langchain_runnable.mdx | 2 +- .../how_to_guides/evaluation/repetition.mdx | 2 +- docs/evaluation/how_to_guides/index.md | 4 +- docs/evaluation/tutorials/agents.mdx | 2 +- docs/evaluation/tutorials/backtesting.mdx | 2 +- docs/evaluation/tutorials/evaluation.mdx | 4 +- docs/evaluation/tutorials/rag.mdx | 4 +- docs/evaluation/tutorials/swe-benchmark.mdx | 4 +- 15 files changed, 167 insertions(+), 82 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx b/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx index 4515a454..e1ee8ecf 100644 --- a/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx +++ b/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx @@ -25,7 +25,7 @@ One common workflow is to fetch examples that have a certain metadata key-value groupId="client-language" tabs={[ python` - from langsmith.evaluation import evaluate + from langsmith import evaluate results = evaluate( lambda inputs: label_text(inputs["text"]), @@ -58,7 +58,7 @@ You can use the `list_examples` / `listExamples` method to evaluate on one or mu groupId="client-language" tabs={[ python` - from langsmith.evaluation import evaluate + from langsmith import evaluate results = evaluate( lambda inputs: label_text(inputs["text"]), diff --git a/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx b/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx index 01009afe..c61aed1e 100644 --- a/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx +++ b/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx @@ -20,7 +20,7 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular groupId="client-language" tabs={[ python` - from langsmith.evaluation import evaluate + from langsmith import evaluate results = evaluate( lambda inputs: label_text(inputs["text"]), diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_existing_experiment.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_existing_experiment.mdx index 229be6be..78b5fa07 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_existing_experiment.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_existing_experiment.mdx @@ -12,7 +12,7 @@ If you have already run an experiment and want to add additional evaluation metr can apply any evaluators to the experiment using the `evaluate_existing` method. ```python -from langsmith.evaluation import evaluate_existing +from langsmith import evaluate_existing def always_half(run, example): return {"score": 0.5} @@ -26,7 +26,7 @@ evaluate_existing(experiment_name, evaluators=[always_half]) Suppose you are evaluating a semantic router. 
You may first run an experiment: ```python -from langsmith.evaluation import evaluate +from langsmith import evaluate def semantic_router(inputs: dict): return {"class": 1} @@ -43,7 +43,7 @@ Later, you realize you want to add precision and recall summary metrics. The `ev you can add both instance-level `evaluator`'s and aggregate `summary_evaluator`'s. ```python -from langsmith.evaluation import evaluate_existing +from langsmith import evaluate_existing def precision(runs: list, examples: list): true_positives = sum([1 for run, example in zip(runs, examples) if run.outputs["class"] == example.outputs["label"]]) diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index b050e216..358940d7 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -12,23 +12,22 @@ import { # How to run an evaluation -:::tip Recommended reading -Before diving into this content, it might be helpful to read the following: +:::info Key concepts -- [Conceptual guide on evaluation](../../concepts) -- [How-to guide on managing datasets](../datasets/manage_datasets_in_application) -- [How-to guide on managing datasets programmatically](../datasets/manage_datasets_programmatically) +- [Evaluations](../../concepts#applying-evaluations) +- [Evaluators](../../concepts#evaluators) +- [Datasets](../../concepts#datasets) +- [Experiments](../../concepts#experiments) ::: -Evaluating the performance of your LLM application is a critical step in the development process. LangSmith makes it easy to run evaluations and track evaluation performance over time. -This section provides guidance on how to evaluate the performance of your LLM application. +In this guide we'll go over how to evaluate an application using the LangSmith SDKs. -## Step 1: Define your application logic +## Step 1: Define an application -In this case, we are defining a simple evaluation target consisting of an LLM pipeline that classifies text as toxic or non-toxic. -We've optionally enabled tracing to capture the inputs and outputs of each step in the pipeline. +First we need an application to evaluate. Let's create a simple toxicity classifier for this example. +We've optionally enabled tracing to capture the inputs and outputs of each step in the pipeline. To understand how to annotate your code for tracing, please refer to [this guide](../../../observability/how_to_guides/tracing/annotate_code). str: system = ( "Please review the user query below and determine if it contains any form of toxic behavior, " - "such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, " + "such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does " "and 'Not toxic' if it doesn't." 
) messages = [ {"role": "system", "content": system}, - {"role": "user", "content": text}, + {"role": "user", "content": inputs["text"]}, ] - result = client.chat.completions.create( + result = oai_client.chat.completions.create( messages=messages, model="gpt-4o-mini", temperature=0 ) return result.choices[0].message.content @@ -61,11 +60,11 @@ To understand how to annotate your code for tracing, please refer to [this guide import { wrapOpenAI } from "langsmith/wrappers"; import { traceable } from "langsmith/traceable"; - const client = wrapOpenAI(new OpenAI()); + const oaiClient = wrapOpenAI(new OpenAI()); - const labelText = traceable( + const toxicityClassifier = traceable( async (text: string) => { - const result = await client.chat.completions.create({ + const result = await oaiClient.chat.completions.create({ messages: [ { role: "system", @@ -79,23 +78,14 @@ To understand how to annotate your code for tracing, please refer to [this guide return result.choices[0].message.content; }, - { name: "labelText" } + { name: "toxicityClassifier" } );`, ]} /> ## Step 2: Create or select a dataset -In this case, we are creating a dataset to evaluate the performance of our LLM application. The dataset contains examples of toxic and non-toxic text. - -Each `Example` in the dataset contains three dictionaries / objects: - -- `outputs`: The reference labels or other context found in your dataset -- `inputs`: The inputs to your pipeline -- `metadata`: Any other metadata you have stored in that example within the dataset - -These dictionaries / objects can have arbitrary keys and values, but the keys must be consistent across all examples in the dataset. -The values in the examples can also take any form, such as strings, numbers, lists, or dictionaries, but for this example, we are simply using strings. +Now we need a [Dataset](../../concepts#datasets) to evaluate our application on. Our dataset will contain labeled [examples](../../concepts#examples) of toxic and non-toxic text. , Array<{ outputs: string }>] >( ([inputs, outputs], item) => [ @@ -154,10 +143,12 @@ The values in the examples can also take any form, such as strings, numbers, lis ]} /> -## Step 3. Configure evaluators to score the outputs +See [here](../../how_to_guides#dataset-management) for more on dataset management. -In this case, we are using a dead-simple evaluator that compares the output of our LLM pipeline to the expected output in the dataset. -Writing evaluators is discussed in more detail in the [following section](#custom-evaluators). +## Step 3. Define an evaluator + +[Evaluators](../../concepts#evaluators) are functions for scoring your application's outputs. They take in the example inputs, actual outputs, and, when present, the reference (example) outputs. +Since we have labels for this task, our evaluator can directly check if the actual outputs match the reference outputs. 
dict: + def correct(root_run: Run, example: Example) -> dict: score = root_run.outputs.get("output") == example.outputs.get("label") - return {"score": int(score), "key": "correct_label"} + return {"score": int(score)} `, typescript` import type { EvaluationResult } from "langsmith/evaluation"; import type { Run, Example } from "langsmith/schemas"; // Row-level evaluator - function correctLabel(rootRun: Run, example: Example): EvaluationResult { + function correct(rootRun: Run, example: Example): EvaluationResult { const score = rootRun.outputs?.outputs === example.outputs?.output; - return { key: "correct_label", score }; + return { key: "correct", score }; } `, ]} /> -## Step 4. Run the evaluation and view the results +See [here](../../how_to_guide#define-an-evaluator) for more on how to define evaluators. + +## Step 4. Run the evaluation -You can use the `evaluate` method in Python and TypeScript to run an evaluation. +We'll use the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) / [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html) methods to run the evaluation. -At its simplest, the `evaluate` method takes the following arguments: +The key arguments are: -- a function that takes an input dictionary or object and returns an output dictionary or object +- a function that takes an input dictionary and returns an output dictionary or object - `data` - the name OR UUID of the LangSmith dataset to evaluate on, or an iterator of examples - `evaluators` - a list of evaluators to score the outputs of the function -- `experiment_prefix` - a string to prefix the experiment name with. A name will be generated if not provided. labelText(inputs["input"]), { + await evaluate((inputs) => toxicityClassifier(inputs["input"]), { data: datasetName, - evaluators: [correctLabel], - experimentPrefix: "Toxic Queries", + evaluators: [correct], + experimentPrefix: gpt-4o-mini, simple", # optional, experiment name prefix + }); `, ]} /> -Each invocation of `evaluate` produces an experiment which is bound to the dataset, and can be viewed in the LangSmith UI. -Evaluation scores are stored against each individual output produced by the target task as feedback, with the name and score configured in the evaluator. +See [here](../../how_to_guides#run-an-evaluation) for other ways to kick off evaluations and [here](../../how_to_guides#configure-an-evaluation-job) for how to configure evaluation jobs. + +## Step 5. Explore the results + +Each invocation of `evaluate()` creates an [Experiment](../../concepts#experiments) which can be viewed in the LangSmith UI or queried via the SDK. +Evaluation scores are stored against each actual output as feedback. _If you've annotated your code for tracing, you can open the trace of each row in a side panel view._ ![](../evaluation/static/view_experiment.gif) +## Reference code + +
+Click to see consolidated code snippet + str: + system = ( + "Please review the user query below and determine if it contains any form of toxic behavior, " + "such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does " + "and 'Not toxic' if it doesn't." + ) + messages = [ + {"role": "system", "content": system}, + {"role": "user", "content": inputs["text"]}, + ] + result = oai_client.chat.completions.create( + messages=messages, model="gpt-4o-mini", temperature=0 + ) + return result.choices[0].message.content + + # Step 2. Create a dataset + client = Client() + + labeled_texts = [ + ("Shut up, idiot", "Toxic"), + ("You're a wonderful person", "Not toxic"), + ("This is the worst thing ever", "Toxic"), + ("I had a great day today", "Not toxic"), + ("Nobody likes you", "Toxic"), + ("This is unacceptable. I want to speak to the manager.", "Not toxic"), + ] + + dataset_name = "Toxic Queries" + dataset = client.create_dataset(dataset_name=dataset_name) + inputs, outputs = zip( + *[({"text": text}, {"label": label}) for text, label in labeled_texts] + ) + client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) + + # Step 3. Define an evaluator + def correct(root_run: Run, example: Example) -> dict: + score = root_run.outputs.get("output") == example.outputs.get("label") + return {"score": int(score)} + + # Step 4. Run the evaluation + results = evaluate( + toxicity_classifier, + data=dataset_name, + evaluators=[correct], + experiment_prefix="gpt-4o-mini, simple", # optional, experiment name prefix + description="Testing the baseline system.", # optional, experiment description + ) + `, + typescript` + import { OpenAI } from "openai"; + import { wrapOpenAI } from "langsmith/wrappers"; + import { traceable } from "langsmith/traceable"; + + const oaiClient = wrapOpenAI(new OpenAI()); + + const toxicityClassifier = traceable( + async (text: string) => { + const result = await oaiClient.chat.completions.create({ + messages: [ + { + role: "system", + content: "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't.", + }, + { role: "user", content: text }, + ], + model: "gpt-4o-mini", + temperature: 0, + }); + + return result.choices[0].message.content; + }, + { name: "toxicityClassifier" } + );`, + ]} +/> +
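Step 5 above notes that experiments can also be queried via the SDK, but no code is shown for that. A minimal sketch, assuming a recent `langsmith` Python release in which the object returned by `evaluate()` exposes `experiment_name` and `to_pandas()` (the latter requires `pandas` to be installed):

```python
# `results` is the object returned by the evaluate() call above.
df = results.to_pandas()  # one row per example: inputs, outputs, reference outputs, feedback
print(results.experiment_name)  # name of the experiment created in LangSmith
print(df.head())
```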
\ No newline at end of file diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_on_intermediate_steps.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_on_intermediate_steps.mdx index 9c9c3a14..a9d9688a 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_on_intermediate_steps.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_on_intermediate_steps.mdx @@ -364,7 +364,7 @@ Finally, we'll run `evaluate` with the custom evaluators defined above. dict: return {"score": 1, "key": "foo"}\n -from langsmith.evaluation import evaluate\n +from langsmith import evaluate\n results = evaluate( lambda inputs: "Hello " + inputs["input"], data=dataset_name, diff --git a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx index 8c8a2065..1e5e1139 100644 --- a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx @@ -50,7 +50,7 @@ Then, pass the `runnable.invoke` method to the `evaluate` method. Note that the groupId="client-language" tabs={[ python` - from langsmith.evaluation import evaluate + from langsmith import evaluate results = evaluate( chain.invoke, diff --git a/docs/evaluation/how_to_guides/evaluation/repetition.mdx b/docs/evaluation/how_to_guides/evaluation/repetition.mdx index 2f8c0a45..5f32b11b 100644 --- a/docs/evaluation/how_to_guides/evaluation/repetition.mdx +++ b/docs/evaluation/how_to_guides/evaluation/repetition.mdx @@ -15,7 +15,7 @@ noise in systems prone to high variability, such as agents. groupId="client-language" tabs={[ python` - from langsmith.evaluation import evaluate + from langsmith import evaluate results = evaluate( lambda inputs: label_text(inputs["text"]), diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index d31f9a50..035dcab4 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -4,14 +4,14 @@ These guides answer “How do I….?” format questions. They are goal-oriented and concrete, and are meant to help you complete a specific task. For conceptual explanations see the [Conceptual guide](./concepts). For end-to-end walkthroughs see [Tutorials](./tutorials). -For comprehensive descriptions of every class and function see the [API Reference](https://langsmith-sdk.readthedocs.io/en/latest/evaluation.html). +For comprehensive descriptions of every class and function see the [API reference](https://langsmith-sdk.readthedocs.io/en/latest/evaluation.html). ## Offline evaluation Evaluate and improve your application. 
### Run an evaluation -- [Run an evaluation using the SDK](./how_to_guides/evaluation/evaluate_llm_application) +- [Run an evaluation](./how_to_guides/evaluation/evaluate_llm_application) - [Run an evaluation asynchronously](./how_to_guides/evaluation/async) - [Run an evaluation comparing two experiments](./how_to_guides/evaluation/evaluate_pairwise) - [Run an evaluation of a LangChain / LangGraph object](./how_to_guides/evaluation/langchain_runnable) diff --git a/docs/evaluation/tutorials/agents.mdx b/docs/evaluation/tutorials/agents.mdx index e6338a62..c819f52e 100644 --- a/docs/evaluation/tutorials/agents.mdx +++ b/docs/evaluation/tutorials/agents.mdx @@ -438,7 +438,7 @@ def answer_evaluator(run, example) -> dict: `Create evaluation` ```python -from langsmith.evaluation import evaluate +from langsmith import evaluate experiment_results = evaluate( predict_sql_agent_answer, diff --git a/docs/evaluation/tutorials/backtesting.mdx b/docs/evaluation/tutorials/backtesting.mdx index 7881bcc9..674afe1b 100644 --- a/docs/evaluation/tutorials/backtesting.mdx +++ b/docs/evaluation/tutorials/backtesting.mdx @@ -135,7 +135,7 @@ Now we have the dataset and prod runs saved as an experiment. Let's run inference on our new system to compare. ```python -from langsmith.evaluation import evaluate +from langsmith import evaluate def predict(example_input: dict): # The dataset includes serialized messages that we diff --git a/docs/evaluation/tutorials/evaluation.mdx b/docs/evaluation/tutorials/evaluation.mdx index ea295150..f10acb3a 100644 --- a/docs/evaluation/tutorials/evaluation.mdx +++ b/docs/evaluation/tutorials/evaluation.mdx @@ -250,7 +250,7 @@ def langsmith_app_1(inputs): output = my_app_1(inputs["question"]) return {"output": output} -from langsmith.evaluation import evaluate +from langsmith import evaluate experiment_results = evaluate( langsmith_app_1, # Your AI system @@ -288,7 +288,7 @@ def langsmith_app_2(inputs): output = my_app_2(inputs["question"]) return {"output": output} -from langsmith.evaluation import evaluate +from langsmith import evaluate experiment_results = evaluate( langsmith_app_2, # Your AI system diff --git a/docs/evaluation/tutorials/rag.mdx b/docs/evaluation/tutorials/rag.mdx index 154aca18..3ff6eddf 100644 --- a/docs/evaluation/tutorials/rag.mdx +++ b/docs/evaluation/tutorials/rag.mdx @@ -245,7 +245,7 @@ Now, we kick off evaluation: - `answer_evaluator`: Passes RAG chain answer, question, and ground truth answer to an evaluator ```python -from langsmith.evaluation import evaluate +from langsmith import evaluate experiment_results = evaluate( predict_rag_answer, @@ -414,7 +414,7 @@ https://youtu.be/yx3JMAaNggQ?feature=shared ```python from langsmith.schemas import Example, Run -from langsmith.evaluation import evaluate +from langsmith import evaluate def document_relevance_grader(root_run: Run, example: Example) -> dict: """ diff --git a/docs/evaluation/tutorials/swe-benchmark.mdx b/docs/evaluation/tutorials/swe-benchmark.mdx index e715c1ec..aa7ee4b0 100644 --- a/docs/evaluation/tutorials/swe-benchmark.mdx +++ b/docs/evaluation/tutorials/swe-benchmark.mdx @@ -90,7 +90,7 @@ import creating_split from "./static/creating_split.mp4"; Running evaluation over SWE-bench works a little differently than most evals you will typically run on LangSmith since we don't have a reference output. Because of this, we first generate all of our outputs without running an evaluator (note how the `evaluate` call doesn't have the `evaluators` parameter set). 
In this case we returned a dummy predict function, but you can insert your agent logic inside the `predict` function to make it work as intended. ```python -from langsmith.evaluation import evaluate +from langsmith import evaluate from langsmith import Client client = Client() @@ -276,7 +276,7 @@ evaluate_predictions(dataset,predictions,max_workers=8,force_rebuild=False,cache Now, we can actually send our evaluation feedback to LangSmith by using the `evaluate_existing` function. Our evaluate function is incredibly simple in this case, because the `convert_runs_to_langsmith_feedback` function above made our life very easy by saving all the feedback to a single file. ```python -from langsmith.evaluation import evaluate_existing +from langsmith import evaluate_existing from langsmith.schemas import Example, Run def swe_bench_evaluator(run: Run, example: Example): From b7d4341b15e4a44962dce0af813ae095d4854a96 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 14 Nov 2024 09:15:11 -0800 Subject: [PATCH 03/29] wip --- .../evaluation/evaluate_llm_application.mdx | 50 +++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index 358940d7..6464c512 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -226,7 +226,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i ## Reference code
-Click to see consolidated code snippet +Click to see a consolidated code snippet , Array<{ outputs: string }>] + >( + ([inputs, outputs], item) => [ + [...inputs, { input: item[0] }], + [...outputs, { outputs: item[1] }], + ], + [[], []] + ); + + const datasetName = "Toxic Queries"; + const toxicDataset = await langsmith.createDataset(datasetName); + await langsmith.createExamples({ inputs, outputs, datasetId: toxicDataset.id }); + + // Row-level evaluator + function correct(rootRun: Run, example: Example): EvaluationResult { + const score = rootRun.outputs?.outputs === example.outputs?.output; + return { key: "correct", score }; + } + + await evaluate((inputs) => toxicityClassifier(inputs["input"]), { + data: datasetName, + evaluators: [correct], + experimentPrefix: gpt-4o-mini, simple", # optional, experiment name prefix + + }); + `, ]} />
\ No newline at end of file From e4efc7d554d0a27c3b087b56080ff93cc4a314f8 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 14 Nov 2024 13:46:12 -0800 Subject: [PATCH 04/29] wip --- .../evaluation/evaluate_llm_application.mdx | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index 6464c512..181775a1 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -93,7 +93,7 @@ Now we need a [Dataset](../../concepts#datasets) to evaluate our application on. python` from langsmith import Client - client = Client() + ls_client = Client() labeled_texts = [ ("Shut up, idiot", "Toxic"), @@ -105,11 +105,11 @@ Now we need a [Dataset](../../concepts#datasets) to evaluate our application on. ] dataset_name = "Toxic Queries" - dataset = client.create_dataset(dataset_name=dataset_name) + dataset = ls_client.create_dataset(dataset_name=dataset_name) inputs, outputs = zip( *[({"text": text}, {"label": label}) for text, label in labeled_texts] ) - client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) + ls_client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) `, typescript` import { Client } from "langsmith"; @@ -154,11 +154,8 @@ Since we have labels for this task, our evaluator can directly check if the actu groupId="client-language" tabs={[ python` - from langsmith.schemas import Example, Run - - def correct(root_run: Run, example: Example) -> dict: - score = root_run.outputs.get("output") == example.outputs.get("label") - return {"score": int(score)} + def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool: + return outputs["output"] == reference_outputs["label"] `, typescript` import type { EvaluationResult } from "langsmith/evaluation"; @@ -232,7 +229,6 @@ _If you've annotated your code for tracing, you can open the trace of each row i tabs={[ python` from langsmith import Client, evaluate, traceable, wrappers - from langsmith.schemas import Example, Run from openai import OpenAI # Step 1. Define an application @@ -255,7 +251,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i return result.choices[0].message.content # Step 2. Create a dataset - client = Client() + ls_client = Client() labeled_texts = [ ("Shut up, idiot", "Toxic"), @@ -267,16 +263,15 @@ _If you've annotated your code for tracing, you can open the trace of each row i ] dataset_name = "Toxic Queries" - dataset = client.create_dataset(dataset_name=dataset_name) + dataset = ls_client.create_dataset(dataset_name=dataset_name) inputs, outputs = zip( *[({"text": text}, {"label": label}) for text, label in labeled_texts] ) - client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) + ls_client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) # Step 3. Define an evaluator - def correct(root_run: Run, example: Example) -> dict: - score = root_run.outputs.get("output") == example.outputs.get("label") - return {"score": int(score)} + def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool: + return outputs["output"] == reference_outputs["label"] # Step 4. 
Run the evaluation results = evaluate( From e258077dfb4dcbb6bc1a308848ab289cc329e06b Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 14 Nov 2024 13:48:02 -0800 Subject: [PATCH 05/29] fix --- .../how_to_guides/evaluation/evaluate_llm_application.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index 181775a1..bcc6bba5 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -170,7 +170,7 @@ Since we have labels for this task, our evaluator can directly check if the actu ]} /> -See [here](../../how_to_guide#define-an-evaluator) for more on how to define evaluators. +See [here](../../how_to_guides#define-an-evaluator) for more on how to define evaluators. ## Step 4. Run the evaluation From 26a8101cc9c1af9eefd3bf0ea344c614f963d7ef Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 14 Nov 2024 16:22:54 -0800 Subject: [PATCH 06/29] wip --- .../evaluation/builtin_evaluators.mdx | 0 .../evaluation/custom_evaluator.mdx | 129 ++++++++++++++++++ .../evaluation/evaluate_llm_application.mdx | 22 ++- .../how_to_guides/evaluation/llm_as_judge.mdx | 0 docs/evaluation/how_to_guides/index.md | 8 +- 5 files changed, 143 insertions(+), 16 deletions(-) create mode 100644 docs/evaluation/how_to_guides/evaluation/builtin_evaluators.mdx create mode 100644 docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx diff --git a/docs/evaluation/how_to_guides/evaluation/builtin_evaluators.mdx b/docs/evaluation/how_to_guides/evaluation/builtin_evaluators.mdx new file mode 100644 index 00000000..e69de29b diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx index e69de29b..2a9f238b 100644 --- a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx @@ -0,0 +1,129 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to define a custom evaluator + +:::info Key concepts + +- [Evaluators](../../concepts#evaluators) + +::: + +Custom evaluators are just functions that take a dataset example and the resulting application output, and return one or more metrics. +These functions can be passed directly into [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) / [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html). + +## Basic example + + bool: + """Check if the answer exactly matches the expected answer.""" + return outputs["answer"] == reference_outputs["answer"] + + def dummy_app(inputs: dict) -> dict: + return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"} + + results = evaluate( + dummy_app, + data="dataset_name", + evaluators=[correct] + ) + `, + typescript` + import type { EvaluationResult } from "langsmith/evaluation"; + import type { Run, Example } from "langsmith/schemas"; + + function correct(rootRun: Run, example: Example): EvaluationResult { + const score = rootRun.outputs?.outputs === example.outputs?.output; + return { key: "correct", score }; + } + `, + ]} +/> + +## Evaluator args + +Custom evaluator functions must have specific argument names. 
They can take any subset of the following arguments: +- `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset. +- `outputs: dict`: A dictionary of the outputs generated by the application on the given `inputs`. +- `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available. +- `run: langsmith.schemas.Run`: The full Run object generated by the application on the given example. +- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available). + +For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. + +## Evaluator output + +Custom evaluators are expected to return one of the following types: +- `int | float | bool`: this is interepreted as an continuous metric that can be averaged, sorted, etc. The function name is used as the name of the metric. +- `str`: this is intepreted as a categorical metric. The function name is used as the name of the metric. +- `dict`: dicts of the form `{"score" | "value": ..., "name": ...}` allow you to customize the metric type ("score" for numerical and "value" for categorical) and metric name. This if useful if, for example, you want to log an integer as a categorical metric. +- `list[dict]`: return multiple metrics using a single function. + +## Additional examples + + bool: + """Check if the answer exactly matches the expected answer.""" + return outputs["answer"] == reference_outputs["answer"] + + def concision(outputs: dict) -> int: + """Score how concise the answer is. 1 is the most concise, 5 is the least concise.""" + return min(len(outputs["answer"]) // 1000, 4) + 1 + + oai_client = wrappers.wrap_openai(AsyncOpenAI()) + + async def valid_reasoning(inputs: dict, outputs: dict) -> bool: + """Use an LLM to judge if the reasoning and the answer are consistent.""" + instructions = """ +Given the following question, answer, and reasoning, determine if the reasoning for the +answer is logically valid and consistent with question and the answer.""" + + class Response(BaseModel): + reasoning_is_valid: bool + + msg = f"Question: {inputs['question']}\\nAnswer: {outputs['answer']}\\nReasoning: {outputs['reasoning']}" + response = await oai_client.beta.chat.completions.parse( + messages=[{"role": "system", "content": instructions,}, {"role": "user", "content": msg}], + response_format=Response + ) + return response.reasoning_is_valid + + def dummy_app(inputs: dict) -> dict: + return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"} + + results = evaluate( + dummy_app, + data="dataset_name", + evaluators=[correct, concision, valid_reasoning] + ) + `, + typescript` + import type { EvaluationResult } from "langsmith/evaluation"; + import type { Run, Example } from "langsmith/schemas"; + + function correct(rootRun: Run, example: Example): EvaluationResult { + const score = rootRun.outputs?.outputs === example.outputs?.output; + return { key: "correct", score }; + } + `, + ]} +/> + +## Related +- [Evaluate aggregate experiment results](../../how_to_guides/summary): Define summary evaluators, which compute metrics for an entire experiment. 
+- [Run an evaluation comparing two experiments](../../how_to_guides/evaluate_pairwise): Define pairwise evaluators, which compute metrics by comparing two (or more) experiments against each other. \ No newline at end of file diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index bcc6bba5..cab4c046 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -6,8 +6,6 @@ import { CodeTabs, python, typescript, - PythonBlock, - TypeScriptBlock, } from "@site/src/components/InstructionsWithCode"; # How to run an evaluation @@ -21,15 +19,12 @@ import { ::: -In this guide we'll go over how to evaluate an application using the LangSmith SDKs. +In this guide we'll go over how to evaluate an application using the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) method in the LangSmith SDK. ## Step 1: Define an application First we need an application to evaluate. Let's create a simple toxicity classifier for this example. -We've optionally enabled tracing to capture the inputs and outputs of each step in the pipeline. -To understand how to annotate your code for tracing, please refer to [this guide](../../../observability/how_to_guides/tracing/annotate_code). - +We've optionally enabled tracing to capture the inputs and outputs of each step in the pipeline. +To understand how to annotate your code for tracing, please refer to [this guide](../../../observability/how_to_guides/tracing/annotate_code). + ## Step 2: Create or select a dataset -Now we need a [Dataset](../../concepts#datasets) to evaluate our application on. Our dataset will contain labeled [examples](../../concepts#examples) of toxic and non-toxic text. +We need a [Dataset](../../concepts#datasets) to evaluate our application on. Our dataset will contain labeled [examples](../../concepts#examples) of toxic and non-toxic text. toxicityClassifier(inputs["input"]), { data: datasetName, evaluators: [correct], - experimentPrefix: gpt-4o-mini, simple", # optional, experiment name prefix - + experimentPrefix: "gpt-4o-mini, baseline", // optional, experiment name prefix }); `, ]} @@ -347,7 +343,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i await evaluate((inputs) => toxicityClassifier(inputs["input"]), { data: datasetName, evaluators: [correct], - experimentPrefix: gpt-4o-mini, simple", # optional, experiment name prefix + experimentPrefix: "gpt-4o-mini, simple", // optional, experiment name prefix }); `, diff --git a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx new file mode 100644 index 00000000..e69de29b diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 035dcab4..066bdbc5 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -16,12 +16,14 @@ Evaluate and improve your application. 
- [Run an evaluation comparing two experiments](./how_to_guides/evaluation/evaluate_pairwise) - [Run an evaluation of a LangChain / LangGraph object](./how_to_guides/evaluation/langchain_runnable) - [Run an evaluation of an existing experiment](./how_to_guides/evaluation/evaluate_existing_experiment) -- [Run an evaluation using the REST API](./how_to_guides/evaluation/run_evals_api_only) -- [Run an evaluation in the prompt playground](./how_to_guides/evaluation/run_evaluation_from_prompt_playground) +- [Run an evaluation via the REST API](./how_to_guides/evaluation/run_evals_api_only) +- [Run an evaluation from the prompt playground](./how_to_guides/evaluation/run_evaluation_from_prompt_playground) ### Define an evaluator - [Define a custom evaluator](./how_to_guides/evaluation/custom_evaluator) -- [Use an off-the-shelf evaluator (Python only)](./how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) +- [Define an LLM-as-a-judge evaluator](./how_to_guides/evaluation/llm_as_judge) +- [Use an off-the-shelf evaluator via the SDK (Python only)](./how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) +- [Use an off-the-shelf evaluator via the UI](./how_to_guides/evaluation/builtin_evaluators) - [Evaluate aggregate experiment results](./how_to_guides/evaluation/summary) - [Evaluate intermediate steps](./how_to_guides/evaluation/evaluate_on_intermediate_steps) - [Return multiple metrics in one evaluator](./how_to_guides/evaluation/multiple_scores) From ef5b3aefd1d0c0b2ef42620d1ffbc36906923873 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 15 Nov 2024 08:19:46 -0800 Subject: [PATCH 07/29] wip --- .../how_to_guides/evaluation/async.mdx | 60 +++++++++ .../evaluation/custom_evaluator.mdx | 114 ++++++++++-------- .../evaluation/dataset_subset.mdx | 1 - .../evaluation/evaluate_llm_application.mdx | 34 +++--- .../how_to_guides/evaluation/llm_as_judge.mdx | 77 ++++++++++++ .../evaluation/multiple_scores.mdx | 7 +- .../how_to_guides/evaluation/summary.mdx | 2 +- docs/evaluation/how_to_guides/index.md | 15 ++- 8 files changed, 233 insertions(+), 77 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index e69de29b..ed94d9ed 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -0,0 +1,60 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to run an evaluation asynchronously + +We can run evaluations asynchronously via the SDK using [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html), which accepts all of the same arguments as the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) but expects the application function to be asynchronous. + +:::note + +This guide is only relevant when using the Python SDK. In JS/TypeScript the `evaluate()` function is already async. You can see how to use it [here](../../how_to_guides/evaluate_llm_application). + +::: + +## Using `aevaluate()` + + str: + instructions = """You are an excellent researcher. 
Given a high-level research idea, \\ +list 5 concrete questions that should be investigated to determine if the idea is a worthwhile \\ +one to pursue.""" + + response = await oai_client.beta.chat.completions.parse( + messages=[ + {"role": "system", "content": instructions}, + {"role": "user", "content": inputs["idea"]}, + ], + ) + return response.choices[0].message.content + + results = aevaluate( + researcher_app, + data="dataset_name", + evaluators=[correct, concision, valid_reasoning] + ) + `, + typescript` + import type { EvaluationResult } from "langsmith/evaluation"; + import type { Run, Example } from "langsmith/schemas"; + + function correct(rootRun: Run, example: Example): EvaluationResult { + const score = rootRun.outputs?.outputs === example.outputs?.output; + return { key: "correct", score }; + } + `, + +]} +/> diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx index 2a9f238b..c6529eed 100644 --- a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx @@ -12,7 +12,7 @@ import { ::: -Custom evaluators are just functions that take a dataset example and the resulting application output, and return one or more metrics. +Custom evaluators are just functions that take a dataset example and the resulting application output, and return one or more metrics. These functions can be passed directly into [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) / [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html). ## Basic example @@ -21,36 +21,38 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r groupId="client-language" tabs={[ python` - from langsmith import evaluate + from langsmith import evaluate - def correct(outputs: dict, reference_outputs: dict) -> bool: + def correct(outputs: dict, reference_outputs: dict) -> bool: """Check if the answer exactly matches the expected answer.""" - return outputs["answer"] == reference_outputs["answer"] - - def dummy_app(inputs: dict) -> dict: - return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"} - - results = evaluate( - dummy_app, - data="dataset_name", - evaluators=[correct] - ) + return outputs["answer"] == reference_outputs["answer"] + + def dummy_app(inputs: dict) -> dict: + return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"} + + results = evaluate( + dummy_app, + data="dataset_name", + evaluators=[correct] + ) `, typescript` import type { EvaluationResult } from "langsmith/evaluation"; import type { Run, Example } from "langsmith/schemas"; - + function correct(rootRun: Run, example: Example): EvaluationResult { const score = rootRun.outputs?.outputs === example.outputs?.output; return { key: "correct", score }; } `, - ]} + +]} /> ## Evaluator args Custom evaluator functions must have specific argument names. They can take any subset of the following arguments: + - `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset. - `outputs: dict`: A dictionary of the outputs generated by the application on the given `inputs`. - `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available. 
@@ -62,6 +64,7 @@ For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs` ## Evaluator output Custom evaluators are expected to return one of the following types: + - `int | float | bool`: this is interepreted as an continuous metric that can be averaged, sorted, etc. The function name is used as the name of the metric. - `str`: this is intepreted as a categorical metric. The function name is used as the name of the metric. - `dict`: dicts of the form `{"score" | "value": ..., "name": ...}` allow you to customize the metric type ("score" for numerical and "value" for categorical) and metric name. This if useful if, for example, you want to log an integer as a categorical metric. @@ -73,57 +76,62 @@ Custom evaluators are expected to return one of the following types: groupId="client-language" tabs={[ python` - from langsmith import evaluate, wrappers - from openai import AsyncOpenAI - + from langsmith import evaluate, wrappers + from openai import AsyncOpenAI + # Assumes you've installed pydantic. + from pydantic import BaseModel - def correct(outputs: dict, reference_outputs: dict) -> bool: - """Check if the answer exactly matches the expected answer.""" - return outputs["answer"] == reference_outputs["answer"] - - def concision(outputs: dict) -> int: - """Score how concise the answer is. 1 is the most concise, 5 is the least concise.""" - return min(len(outputs["answer"]) // 1000, 4) + 1 - - oai_client = wrappers.wrap_openai(AsyncOpenAI()) - - async def valid_reasoning(inputs: dict, outputs: dict) -> bool: - """Use an LLM to judge if the reasoning and the answer are consistent.""" - instructions = """ -Given the following question, answer, and reasoning, determine if the reasoning for the + def correct(outputs: dict, reference_outputs: dict) -> bool: + """Check if the answer exactly matches the expected answer.""" + return outputs["answer"] == reference_outputs["answer"] + + def concision(outputs: dict) -> int: + """Score how concise the answer is. 
1 is the most concise, 5 is the least concise.""" + return min(len(outputs["answer"]) // 1000, 4) + 1 + + oai_client = wrappers.wrap_openai(AsyncOpenAI()) + + async def valid_reasoning(inputs: dict, outputs: dict) -> bool: + """Use an LLM to judge if the reasoning and the answer are consistent.""" + + instructions = """\\ + +Given the following question, answer, and reasoning, determine if the reasoning for the \\ answer is logically valid and consistent with question and the answer.""" - - class Response(BaseModel): - reasoning_is_valid: bool - - msg = f"Question: {inputs['question']}\\nAnswer: {outputs['answer']}\\nReasoning: {outputs['reasoning']}" - response = await oai_client.beta.chat.completions.parse( - messages=[{"role": "system", "content": instructions,}, {"role": "user", "content": msg}], - response_format=Response + + class Response(BaseModel): + reasoning_is_valid: bool + + msg = f"Question: {inputs['question']}\\nAnswer: {outputs['answer']}\\nReasoning: {outputs['reasoning']}" + response = await oai_client.beta.chat.completions.parse( + messages=[{"role": "system", "content": instructions,}, {"role": "user", "content": msg}], + response_format=Response + ) + return response.choices[0].message.parsed.reasoning_is_valid + + def dummy_app(inputs: dict) -> dict: + return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"} + + results = evaluate( + dummy_app, + data="dataset_name", + evaluators=[correct, concision, valid_reasoning] ) - return response.reasoning_is_valid - - def dummy_app(inputs: dict) -> dict: - return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"} - - results = evaluate( - dummy_app, - data="dataset_name", - evaluators=[correct, concision, valid_reasoning] - ) `, typescript` import type { EvaluationResult } from "langsmith/evaluation"; import type { Run, Example } from "langsmith/schemas"; - + function correct(rootRun: Run, example: Example): EvaluationResult { const score = rootRun.outputs?.outputs === example.outputs?.output; return { key: "correct", score }; } `, - ]} + +]} /> ## Related + - [Evaluate aggregate experiment results](../../how_to_guides/summary): Define summary evaluators, which compute metrics for an entire experiment. -- [Run an evaluation comparing two experiments](../../how_to_guides/evaluate_pairwise): Define pairwise evaluators, which compute metrics by comparing two (or more) experiments against each other. \ No newline at end of file +- [Run an evaluation comparing two experiments](../../how_to_guides/evaluate_pairwise): Define pairwise evaluators, which compute metrics by comparing two (or more) experiments against each other. diff --git a/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx b/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx index e1ee8ecf..5ef78843 100644 --- a/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx +++ b/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx @@ -49,7 +49,6 @@ One common workflow is to fetch examples that have a certain metadata key-value ]} /> - ## Evaluate on a dataset split You can use the `list_examples` / `listExamples` method to evaluate on one or multiple splits of your dataset. The `splits` param takes a list of the splits you would like to evaluate. 
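A rough Python sketch of evaluating on selected splits, where the dataset name, split names, application, and `correct` evaluator are all placeholders for whatever you already have:

```python
from langsmith import Client, evaluate

ls_client = Client()

def correct(outputs: dict, reference_outputs: dict) -> bool:
    # Placeholder evaluator; swap in your own.
    return outputs.get("output") == reference_outputs.get("label")

# Only pull examples from the chosen splits; these split names are hypothetical.
examples = ls_client.list_examples(dataset_name="Toxic Queries", splits=["test", "critical"])

results = evaluate(
    lambda inputs: {"output": "Not toxic"},  # stand-in for your real application
    data=examples,
    evaluators=[correct],
)
```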
diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index cab4c046..2776c2cf 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -32,17 +32,19 @@ First we need an application to evaluate. Let's create a simple toxicity classif from langsmith import traceable, wrappers from openai import OpenAI + # Optionally wrap the OpenAI client to trace all model calls. oai_client = wrappers.wrap_openai(OpenAI()) + # Optionally add the 'traceable' decorator to trace the inputs/outputs of this function. @traceable def toxicity_classifier(inputs: dict) -> str: - system = ( + instructions = ( "Please review the user query below and determine if it contains any form of toxic behavior, " "such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does " "and 'Not toxic' if it doesn't." ) messages = [ - {"role": "system", "content": system}, + {"role": "system", "content": instructions}, {"role": "user", "content": inputs["text"]}, ] result = oai_client.chat.completions.create( @@ -55,8 +57,10 @@ First we need an application to evaluate. Let's create a simple toxicity classif import { wrapOpenAI } from "langsmith/wrappers"; import { traceable } from "langsmith/traceable"; + # Optionally wrap the OpenAI client to trace all model calls. const oaiClient = wrapOpenAI(new OpenAI()); + # Optionally add the 'traceable' wrapper to trace the inputs/outputs of this function. const toxicityClassifier = traceable( async (text: string) => { const result = await oaiClient.chat.completions.create({ @@ -169,7 +173,7 @@ Since we have labels for this task, our evaluator can directly check if the actu See [here](../../how_to_guides#define-an-evaluator) for more on how to define evaluators. -## Step 4. Run the evaluation +## Step 4. Run the evaluation We'll use the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) / [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html) methods to run the evaluation. @@ -285,15 +289,15 @@ _If you've annotated your code for tracing, you can open the trace of each row i import type { Run, Example } from "langsmith/schemas"; import { traceable } from "langsmith/traceable"; import { wrapOpenAI } from "langsmith/wrappers"; - - + + const oaiClient = wrapOpenAI(new OpenAI()); - + const toxicityClassifier = traceable( async (text: string) => { const result = await oaiClient.chat.completions.create({ messages: [ - { + { role: "system", content: "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't.", }, @@ -302,14 +306,14 @@ _If you've annotated your code for tracing, you can open the trace of each row i model: "gpt-4o-mini", temperature: 0, }); - + return result.choices[0].message.content; }, { name: "toxicityClassifier" } ); - + const langsmith = new Client(); - + // create a dataset const labeledTexts = [ ["Shut up, idiot", "Toxic"], @@ -319,7 +323,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i ["Nobody likes you", "Toxic"], ["This is unacceptable. 
I want to speak to the manager.", "Not toxic"], ]; - + const [inputs, outputs] = labeledTexts.reduce< [Array<{ input: string }>, Array<{ outputs: string }>] >( @@ -329,7 +333,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i ], [[], []] ); - + const datasetName = "Toxic Queries"; const toxicDataset = await langsmith.createDataset(datasetName); await langsmith.createExamples({ inputs, outputs, datasetId: toxicDataset.id }); @@ -347,6 +351,8 @@ _If you've annotated your code for tracing, you can open the trace of each row i }); `, - ]} + +]} /> - \ No newline at end of file + + diff --git a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx index e69de29b..1c8b4c56 100644 --- a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx +++ b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx @@ -0,0 +1,77 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to define an LLM-as-a-judge evaluator + +:::info Key concepts + +- [LLM-as-a-judge evaluator](../../concepts#llm-as-judge) + +::: + +LLM applications can be difficult to evaluate because often they're generating conversational text for which there's no single "right" answer. +An imperfect but valuable way to evaluate such applications is to use a second LLM to judge the outputs of the first. + +## Custom evaluator via SDK + +For maximal control of evaluator logic, we can write a custom evaluator and run it using the SDK. + + bool: + """Use an LLM to judge if the reasoning and the answer are consistent.""" + + instructions = """\\ + +Given the following question, answer, and reasoning, determine if the reasoning \\ +for the answer is logically valid and consistent with question and the answer.\\ +""" + + class Response(BaseModel): + reasoning_is_valid: bool + + msg = f"Question: {inputs['question']}\\nAnswer: {outputs['answer']}\\nReasoning: {outputs['reasoning']}" + response = oai_client.beta.chat.completions.parse( + messages=[{"role": "system", "content": instructions,}, {"role": "user", "content": msg}], + response_format=Response + ) + return response.choices[0].messages.parsed.reasoning_is_valid + + # Optionally add the 'traceable' decorator to trace the inputs/outputs of this function. + @traceable + def dummy_app(inputs: dict) -> dict: + return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"} + + results = evaluate( + dummy_app, + data="dataset_name", + evaluators=[valid_reasoning] + ) + `, + typescript` + import type { EvaluationResult } from "langsmith/evaluation"; + import type { Run, Example } from "langsmith/schemas"; + + `, + +]} +/> + +See [here](../../how_to_guides/custom_evaluator) for more on how to write a custom evaluator. 
+ +## Builtin evaluator via the UI + +## Prebuilt evaluator via LangChain diff --git a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx index c5f50a35..33e5be15 100644 --- a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx +++ b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx @@ -45,7 +45,7 @@ Example: "Support for multiple scores is available in `langsmith@0.1.32` and higher", })` import type { Run, Example } from "langsmith/schemas"; - + function multipleScores(rootRun: Run, example: Example) { // Your evaluation logic here return { @@ -57,9 +57,10 @@ Example: }; } `, - ]} + +]} /> Rows from the resulting experiment will display each of the scores. -![](../evaluation/static/multiple_scores.png) \ No newline at end of file +![](../evaluation/static/multiple_scores.png) diff --git a/docs/evaluation/how_to_guides/evaluation/summary.mdx b/docs/evaluation/how_to_guides/evaluation/summary.mdx index 2abc0c5b..a248905b 100644 --- a/docs/evaluation/how_to_guides/evaluation/summary.mdx +++ b/docs/evaluation/how_to_guides/evaluation/summary.mdx @@ -73,4 +73,4 @@ You can then pass this evaluator to the `evaluate` method as follows: In the LangSmith UI, you'll the summary evaluator's score displayed with the corresponding key. -![](../evaluation/static/summary_eval.png) \ No newline at end of file +![](../evaluation/static/summary_eval.png) diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 066bdbc5..81a4ad97 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -1,9 +1,9 @@ # Evaluation how-to guides -These guides answer “How do I….?” format questions. -They are goal-oriented and concrete, and are meant to help you complete a specific task. -For conceptual explanations see the [Conceptual guide](./concepts). -For end-to-end walkthroughs see [Tutorials](./tutorials). +These guides answer “How do I….?” format questions. +They are goal-oriented and concrete, and are meant to help you complete a specific task. +For conceptual explanations see the [Conceptual guide](./concepts). +For end-to-end walkthroughs see [Tutorials](./tutorials). For comprehensive descriptions of every class and function see the [API reference](https://langsmith-sdk.readthedocs.io/en/latest/evaluation.html). ## Offline evaluation @@ -11,15 +11,18 @@ For comprehensive descriptions of every class and function see the [API referenc Evaluate and improve your application. 
### Run an evaluation + - [Run an evaluation](./how_to_guides/evaluation/evaluate_llm_application) - [Run an evaluation asynchronously](./how_to_guides/evaluation/async) - [Run an evaluation comparing two experiments](./how_to_guides/evaluation/evaluate_pairwise) -- [Run an evaluation of a LangChain / LangGraph object](./how_to_guides/evaluation/langchain_runnable) +- [Run an evaluation of a LangChain chain](./how_to_guides/evaluation/langchain_runnable) +- [Run an evaluation of a LangGraph graph](./how_to_guides/evaluation/langgraph) - [Run an evaluation of an existing experiment](./how_to_guides/evaluation/evaluate_existing_experiment) - [Run an evaluation via the REST API](./how_to_guides/evaluation/run_evals_api_only) - [Run an evaluation from the prompt playground](./how_to_guides/evaluation/run_evaluation_from_prompt_playground) ### Define an evaluator + - [Define a custom evaluator](./how_to_guides/evaluation/custom_evaluator) - [Define an LLM-as-a-judge evaluator](./how_to_guides/evaluation/llm_as_judge) - [Use an off-the-shelf evaluator via the SDK (Python only)](./how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) @@ -31,10 +34,12 @@ Evaluate and improve your application. - [Check your evaluator setup](./how_to_guides/evaluation/check_evaluator) ### Configure the data + - [Evaluate on a split / filtered view of a dataset](./how_to_guides/evaluation/dataset_subset) - [Evaluate on a specific dataset version](./how_to_guides/evaluation/dataset_version) ### Configure an evaluation job + - [Evaluate with repetitions](./how_to_guides/evaluation/repetition) - [Run a large evaluation job](./how_to_guides/evaluation/large_job) - [Handle rate limiting](./how_to_guides/evaluation/rate_limiting) From ebbd9459d65f97c5a7e33f8a81f24714f2ce4bd9 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 15 Nov 2024 09:04:55 -0800 Subject: [PATCH 08/29] wip --- .../how_to_guides/evaluation/async.mdx | 57 +++++++++---------- .../evaluation/custom_evaluator.mdx | 1 + .../how_to_guides/evaluation/langgraph.mdx | 0 .../how_to_guides/evaluation/llm_as_judge.mdx | 5 +- 4 files changed, 32 insertions(+), 31 deletions(-) create mode 100644 docs/evaluation/how_to_guides/evaluation/langgraph.mdx diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index ed94d9ed..90a60c7d 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -20,41 +20,40 @@ This guide is only relevant when using the Python SDK. In JS/TypeScript the `eva groupId="client-language" tabs={[ python` - from langsmith import aevaluate, wrappers - from openai import AsyncOpenAI - # Assumes you've installed pydantic. - from pydantic import BaseModel - - oai_client = wrappers.wrap_openai(AsyncOpenAI()) - - async def researcher_app(inputs: dict) -> str: - instructions = """You are an excellent researcher. Given a high-level research idea, \\ -list 5 concrete questions that should be investigated to determine if the idea is a worthwhile \\ -one to pursue.""" - - response = await oai_client.beta.chat.completions.parse( - messages=[ - {"role": "system", "content": instructions}, - {"role": "user", "content": inputs["idea"]}, - ], + from langsmith import aevaluate, wrappers + from openai import AsyncOpenAI + + # Optionally wrap the OpenAI client to trace all model calls. 
+ oai_client = wrappers.wrap_openai(AsyncOpenAI()) + + # Optionally add the 'traceable' decorator to trace the inputs/outputs of this function. + @traceable + async def researcher_app(inputs: dict) -> str: + instructions = """You are an excellent researcher. Given a high-level research idea, \\ +list 5 concrete questions that should be investigated to determine if the idea is worth pursuing.""" + + response = await oai_client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": instructions}, + {"role": "user", "content": inputs["idea"]}, + ], + ) + return response.choices[0].message.content + + results = await aevaluate( + researcher_app, + data="dataset_name", + evaluators=[correct, concision, valid_reasoning] ) - return response.choices[0].message.content - - results = aevaluate( - researcher_app, - data="dataset_name", - evaluators=[correct, concision, valid_reasoning] - ) `, typescript` import type { EvaluationResult } from "langsmith/evaluation"; import type { Run, Example } from "langsmith/schemas"; - function correct(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.outputs === example.outputs?.output; - return { key: "correct", score }; - } `, - ]} /> diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx index c6529eed..dea67624 100644 --- a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx @@ -104,6 +104,7 @@ answer is logically valid and consistent with question and the answer.""" msg = f"Question: {inputs['question']}\\nAnswer: {outputs['answer']}\\nReasoning: {outputs['reasoning']}" response = await oai_client.beta.chat.completions.parse( + model="gpt-4o-mini", messages=[{"role": "system", "content": instructions,}, {"role": "user", "content": msg}], response_format=Response ) diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx new file mode 100644 index 00000000..e69de29b diff --git a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx index 1c8b4c56..d464713d 100644 --- a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx +++ b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx @@ -13,7 +13,8 @@ import { ::: LLM applications can be difficult to evaluate because often they're generating conversational text for which there's no single "right" answer. -An imperfect but valuable way to evaluate such applications is to use a second LLM to judge the outputs of the first. +An imperfect but valuable way to evaluate such applications is to use a second LLM to judge the outputs of the first. +This can be especially useful if a smaller model is used in the application and a larger, better model is used for evaluation.
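As a concrete sketch of that last point, the application below runs on a small model while the judge uses a larger one; the models, prompts, and dataset name are illustrative, not part of the original guide:

```python
from langsmith import evaluate, wrappers
from openai import OpenAI

oai_client = wrappers.wrap_openai(OpenAI())

def app(inputs: dict) -> dict:
    # The application itself uses a small, cheap model.
    response = oai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": inputs["question"]}],
    )
    return {"answer": response.choices[0].message.content}

def helpful(inputs: dict, outputs: dict) -> bool:
    # The judge uses a larger model than the application.
    response = oai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Reply 'Y' if the answer is helpful for the question, otherwise 'N'."},
            {"role": "user", "content": f"Question: {inputs['question']}\nAnswer: {outputs['answer']}"},
        ],
    )
    return response.choices[0].message.content.strip().upper().startswith("Y")

results = evaluate(app, data="dataset_name", evaluators=[helpful])
```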
## Custom evaluator via SDK @@ -35,7 +36,6 @@ For maximal control of evaluator logic, we can write a custom evaluator and run """Use an LLM to judge if the reasoning and the answer are consistent.""" instructions = """\\ - Given the following question, answer, and reasoning, determine if the reasoning \\ for the answer is logically valid and consistent with question and the answer.\\ """ @@ -45,6 +45,7 @@ for the answer is logically valid and consistent with question and the answer.\\ msg = f"Question: {inputs['question']}\\nAnswer: {outputs['answer']}\\nReasoning: {outputs['reasoning']}" response = oai_client.beta.chat.completions.parse( + model="gpt-4o", messages=[{"role": "system", "content": instructions,}, {"role": "user", "content": msg}], response_format=Response ) From 028de027a2d9755346c1454e5eb2b13ef14d503e Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 15 Nov 2024 09:07:07 -0800 Subject: [PATCH 09/29] links --- docs/evaluation/how_to_guides/evaluation/async.mdx | 2 +- docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx | 4 ++-- docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index 90a60c7d..b10d78af 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -10,7 +10,7 @@ We can run evaluations asynchronously via the SDK using [aevaluate()](https://la :::note -This guide is only relevant when using the Python SDK. In JS/TypeScript the `evaluate()` function is already async. You can see how to use it [here](../../how_to_guides/evaluate_llm_application). +This guide is only relevant when using the Python SDK. In JS/TypeScript the `evaluate()` function is already async. You can see how to use it [here](../../how_to_guides/evaluation/evaluate_llm_application). ::: diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx index dea67624..8058c8c4 100644 --- a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx @@ -134,5 +134,5 @@ answer is logically valid and consistent with question and the answer.""" ## Related -- [Evaluate aggregate experiment results](../../how_to_guides/summary): Define summary evaluators, which compute metrics for an entire experiment. -- [Run an evaluation comparing two experiments](../../how_to_guides/evaluate_pairwise): Define pairwise evaluators, which compute metrics by comparing two (or more) experiments against each other. +- [Evaluate aggregate experiment results](../../how_to_guides/evaluation/summary): Define summary evaluators, which compute metrics for an entire experiment. +- [Run an evaluation comparing two experiments](../../how_to_guides/evaluation/evaluate_pairwise): Define pairwise evaluators, which compute metrics by comparing two (or more) experiments against each other. 
diff --git a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx index d464713d..47926705 100644 --- a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx +++ b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx @@ -71,7 +71,7 @@ for the answer is logically valid and consistent with question and the answer.\\ ]} /> -See [here](../../how_to_guides/custom_evaluator) for more on how to write a custom evaluator. +See [here](../../how_to_guides/evaluation/custom_evaluator) for more on how to write a custom evaluator. ## Builtin evaluator via the UI From 7d021f6ed7f075d1a832621cce08bbb8115fe9b1 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 15 Nov 2024 09:39:45 -0800 Subject: [PATCH 10/29] intro --- docs/evaluation/index.mdx | 47 ++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/docs/evaluation/index.mdx b/docs/evaluation/index.mdx index d1021bdc..ce7f4449 100644 --- a/docs/evaluation/index.mdx +++ b/docs/evaluation/index.mdx @@ -67,25 +67,32 @@ export LANGCHAIN_API_KEY=`), dict: - score = len(root_run.outputs["output"]) < 3 \* len(example.outputs["answer"]) - return {"key": "is_concise", "score": int(score)}\n -# 3. Run an evaluation -evaluate( - lambda x: x["question"] + "is a good question. I don't know the answer.", - data=dataset.name, - evaluators=[is_concise_enough], - experiment_prefix="my first experiment " -) -`), - TypeScriptBlock(`import { Client } from "langsmith"; + python` + from langsmith import evaluate, Client + + # 1. Create and/or select your dataset + client = Client() + dataset = client.clone_public_dataset( + "https://smith.langchain.com/public/a63525f9-bdf2-4512-83e3-077dc9417f96/d" + ) + + # 2. Define an evaluator + def is_concise(outputs: dict, reference_outputs: dict) -> bool: + return len(outputs["answer"]) < (3 * len(reference_outputs["answer"])) + + # 3. Define the interface to your app + def chatbot(inputs: dict) -> dict: + return {"answer": inputs["question"] + "is a good question. I don't know the answer."} + + # 4. Run an evaluation + evaluate( + chatbot, + data=dataset.name, + evaluators=[is_concise], + experiment_prefix="my first experiment " + ) +`, + typescript`import { Client } from "langsmith"; import { evaluate } from "langsmith/evaluation"; import type { EvaluationResult } from "langsmith/evaluation"; import type { Run, Example } from "langsmith/schemas";\n @@ -112,7 +119,7 @@ answer: exampleInput.question + " Good question. 
I don't know the answer" data: datasetName, evaluators: [isConcise], experimentPrefix: "my first experiment ", -});`), +});`, ]} groupId="client-language" /> From 7520c481e8fa4bc0e6e275f2da14f277af60e52b Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 15 Nov 2024 10:15:48 -0800 Subject: [PATCH 11/29] wip --- .../evaluation/langchain_runnable.mdx | 67 ++++++++++++++----- docs/evaluation/how_to_guides/index.md | 4 +- 2 files changed, 51 insertions(+), 20 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx index 1e5e1139..3e721aac 100644 --- a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx @@ -4,28 +4,54 @@ import { typescript, } from "@site/src/components/InstructionsWithCode"; -# How to evaluate a LangChain runnable +# How to evaluate a `langchain` runnable -You can configure a `LangChain` runnable to be evaluated by passing `runnable.invoke` it to the `evaluate` method in Python, or just the `runnable` in TypeScript. +`langchain` [Runnable](https://python.langchain.com/docs/concepts/runnables/) objects (such as chat models, retrievers, chains, etc.) can be passed directly into `evaluate()` / `aevaluate()`. -First, define your `LangChain` runnable: +:::info + +`langchain` refers to the [Python](https://python.langchain.com) and [TypeScript](https://js.langchain.com) OSS frameworks for building LLM applications. + +::: + +First, install all the required packages + + + +Now define your chain: -Then, pass the `runnable.invoke` method to the `evaluate` method. Note that the input variables of the runnable must match the keys of the example inputs. +Then pass the chain to the `evaluate()` method. Note that the input variables of the chain must match the keys of the example inputs. In this case, the example inputs should have the form `{"text": "..."}`. 
Date: Fri, 15 Nov 2024 15:22:41 -0800 Subject: [PATCH 12/29] wip --- .../evaluation/langchain_runnable.mdx | 55 ++++++++++++------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx index 3e721aac..deae826e 100644 --- a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx @@ -34,7 +34,7 @@ First, install all the required packages groupId="client-language" /> -Now define your chain: +Now define your chain bool: + # Since our chain outputs a string not a dict, this string + # gets stored under the default "output" key in the outputs dict: + actual = outputs["output"] + expected = reference_outputs["label"] + + assert actual == expected + + results = evaluate( + chain, + data=dataset, + evaluators=[correct], + experiment_prefix="gpt-4o, baseline", + ) `, typescript` import { evaluate } from "langsmith/evaluation"; From 25c88a08189be7c58471535a24e418a6518fb41f Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 15 Nov 2024 18:37:59 -0800 Subject: [PATCH 13/29] wip --- .../how_to_guides/evaluation/async.mdx | 42 +++++++++++++------ .../evaluation/evaluate_llm_application.mdx | 16 ++++--- .../evaluation/langchain_runnable.mdx | 18 ++++---- .../how_to_guides/evaluation/langgraph.mdx | 7 ++++ .../how_to_guides/evaluation/llm_as_judge.mdx | 3 +- .../evaluation/multiple_scores.mdx | 34 ++++++++------- docs/evaluation/index.mdx | 5 ++- 7 files changed, 82 insertions(+), 43 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index b10d78af..cc2c9c59 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -6,11 +6,15 @@ import { # How to run an evaluation asynchronously -We can run evaluations asynchronously via the SDK using [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html), which accepts all of the same arguments as the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) but expects the application function to be asynchronous. +We can run evaluations asynchronously via the SDK using [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html), +which accepts all of the same arguments as [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) but expects the application function to be asynchronous. +You can learn more about how to use the `evaluate()` function [here](../../how_to_guides/evaluation/evaluate_llm_application). -:::note +:::info Python only -This guide is only relevant when using the Python SDK. In JS/TypeScript the `evaluate()` function is already async. You can see how to use it [here](../../how_to_guides/evaluation/evaluate_llm_application). +This guide is only relevant when using the Python SDK. +In JS/TypeScript the `evaluate()` function is already async. +You can see how to use it [here](../../how_to_guides/evaluation/evaluate_llm_application). ::: @@ -20,34 +24,42 @@ This guide is only relevant when using the Python SDK. 
In JS/TypeScript the `eva groupId="client-language" tabs={[ python` - from langsmith import aevaluate, wrappers + from langsmith import aevaluate, wrappers, Client from openai import AsyncOpenAI # Optionally wrap the OpenAI client to trace all model calls. oai_client = wrappers.wrap_openai(AsyncOpenAI()) + # Optionally add the 'traceable' decorator to trace the inputs/outputs of this function. @traceable async def researcher_app(inputs: dict) -> str: instructions = """You are an excellent researcher. Given a high-level research idea, \\ + list 5 concrete questions that should be investigated to determine if the idea is worth pursuing.""" - + response = await oai_client.chat.completions.create( model="gpt-4o-mini", messages=[ - {"role": "system", "content": instructions}, + {"role": "system", "content": instructions}, {"role": "user", "content": inputs["idea"]}, - ], + ], ) return response.choices[0].message.content - # Optionally add the 'traceable' decorator to trace the inputs/outputs of this function. - @traceable - async def + # Evaluator functions can be sync or async + def concise(inputs: dict, output: dict) -> bool: + return len(output["output"]) < 3 * len(inputs["idea"]) + + ls_client = Client() + # TODO + dataset = ... results = aevaluate( researcher_app, - data="dataset_name", - evaluators=[correct, concision, valid_reasoning] + data=dataset, + evaluators=[concise], + max_concurrency=2, # Optional, no max by default + experiment_prefix="gpt-4o-mini, baseline" # Optional, random by default ) `, typescript` @@ -55,5 +67,11 @@ list 5 concrete questions that should be investigated to determine if the idea i import type { Run, Example } from "langsmith/schemas"; `, + ]} /> + +## Related + +- [Run an evaluation (synchronously)](../../how_to_guides/evaluation/evaluate_llm_application) +- [Run a large evaluation job](../../how_to_guides/evaluation/large_job): Learn about the key `aevaluate()` parameters to configure when running large evaluation jobs. diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index 2776c2cf..c196e74e 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -21,7 +21,7 @@ import { In this guide we'll go over how to evaluate an application using the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) method in the LangSmith SDK. -## Step 1: Define an application +## Define an application First we need an application to evaluate. Let's create a simple toxicity classifier for this example. @@ -85,7 +85,7 @@ First we need an application to evaluate. Let's create a simple toxicity classif We've optionally enabled tracing to capture the inputs and outputs of each step in the pipeline. To understand how to annotate your code for tracing, please refer to [this guide](../../../observability/how_to_guides/tracing/annotate_code). -## Step 2: Create or select a dataset +## Create or select a dataset We need a [Dataset](../../concepts#datasets) to evaluate our application on. Our dataset will contain labeled [examples](../../concepts#examples) of toxic and non-toxic text. @@ -147,7 +147,7 @@ We need a [Dataset](../../concepts#datasets) to evaluate our application on. Our See [here](../../how_to_guides#dataset-management) for more on dataset management. -## Step 3. 
Define an evaluator +## Define an evaluator [Evaluators](../../concepts#evaluators) are functions for scoring your application's outputs. They take in the example inputs, actual outputs, and, when present, the reference outputs. Since we have labels for this task, our evaluator can directly check if the actual outputs match the reference outputs. @@ -173,7 +173,7 @@ Since we have labels for this task, our evaluator can directly check if the actu See [here](../../how_to_guides#define-an-evaluator) for more on how to define evaluators. -## Step 4. Run the evaluation +## Run the evaluation We'll use the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) / [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html) methods to run the evaluation. @@ -211,7 +211,7 @@ The key arguments are: See [here](../../how_to_guides#run-an-evaluation) for other ways to kick off evaluations and [here](../../how_to_guides#configure-an-evaluation-job) for how to configure evaluation jobs. -## Step 5. Explore the results +## Explore the results Each invocation of `evaluate()` creates an [Experiment](../../concepts#experiments) which can be viewed in the LangSmith UI or queried via the SDK. Evaluation scores are stored against each actual output as feedback. @@ -356,3 +356,9 @@ _If you've annotated your code for tracing, you can open the trace of each row i /> + +## Related + +- [Run an evaluation asynchronously](../../how_to_guides/evaluation/async) +- [Run an evaluation via the REST API](../../how_to_guides/evaluation/run_evals_api_only) +- [Run an evaluation from the prompt playground](../../how_to_guides/evaluation/run_evaluation_from_prompt_playground) diff --git a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx index deae826e..bfab559e 100644 --- a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx @@ -8,7 +8,7 @@ import { `langchain` [Runnable](https://python.langchain.com/docs/concepts/runnables/) objects (such as chat models, retrievers, chains, etc.) can be passed directly into `evaluate()` / `aevaluate()`. -:::info +:::info `langchain` refers to the [Python](https://python.langchain.com) and [TypeScript](https://js.langchain.com) OSS frameworks for building LLM applications. @@ -61,17 +61,18 @@ Now define your chain import { ChatOpenAI } from "@langchain/openai"; import { ChatPromptTemplate } from "@langchain/core/prompts"; import { StringOutputParser } from "@langchain/core/output_parsers"; - + const prompt = ChatPromptTemplate.fromMessages([ ["system", "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't."], ["user", "{text}"] ]); const chatModel = new ChatOpenAI(); const outputParser = new StringOutputParser(); - + const chain = prompt.pipe(chatModel).pipe(outputParser); `, - ]} + +]} /> Then pass the chain to the `evaluate()` method. Note that the input variables of the chain must match the keys of the example inputs. In this case, the example inputs should have the form `{"text": "..."}`. @@ -89,11 +90,11 @@ Then pass the chain to the `evaluate()` method. 
Note that the input variables of dataset = client.clone_public_dataset( "https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d" ) - + def correct(outputs: dict, reference_outputs: dict) -> bool: # Since our chain outputs a string not a dict, this string # gets stored under the default "output" key in the outputs dict: - actual = outputs["output"] + actual = outputs["output"] expected = reference_outputs["label"] assert actual == expected @@ -107,14 +108,15 @@ Then pass the chain to the `evaluate()` method. Note that the input variables of `, typescript` import { evaluate } from "langsmith/evaluation"; - + await evaluate(chain, { data: datasetName, evaluators: [correctLabel], experimentPrefix: "Toxic Queries", }); `, - ]} + +]} /> The runnable is traced appropriately for each output. diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx index e69de29b..0e836ef2 100644 --- a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx @@ -0,0 +1,7 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to evaluate a `langgraph` graph diff --git a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx index 47926705..47946955 100644 --- a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx +++ b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx @@ -13,7 +13,7 @@ import { ::: LLM applications can be difficult to evaluate because often they're generating conversational text for which there's no single "right" answer. -An imperfect but valuable way to evaluate such applications is to use a second LLM to judge the outputs of the first. +An imperfect but valuable way to evaluate such applications is to use a second LLM to judge the outputs of the first. This can be especially useful if a smaller model is used in the application and a larger, better model is used for evaluation. ## Custom evaluator via SDK @@ -36,6 +36,7 @@ For maximal control of evaluator logic, we can write a custom evaluator and run """Use an LLM to judge if the reasoning and the answer are consistent.""" instructions = """\\ + Given the following question, answer, and reasoning, determine if the reasoning \\ for the answer is logically valid and consistent with question and the answer.\\ """ diff --git a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx index 33e5be15..dec2f56e 100644 --- a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx +++ b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx @@ -6,16 +6,20 @@ import { # How to return multiple scores in one evaluator -In most cases, each evaluator returns a single key or categorical value. Alternatively, you can return evaluation metrics from a single evaluator. This is useful if your metrics share intermediate values. For example, precision and recall but rely on the same true and false positives and negative values, or you may have an LLM generate multiple metrics in a single shot. +Sometimes it is useful for a [custom evaluator function](../../how_to_guides/evaluation/custom_evaluator) to return multiple metrics. 
+For example, if you have multiple metrics being generated by an LLM judge, you can save time and money by making a single LLM call that generates multiple metrics instead of making multiple LLM calls. -To return multiple scores, simply return a dictionary/object of the following form: +To return multiple scores, simply return a list of dictionaries/objects of the following form: ```python { - "results": [ - {"key":string, "score": number}, - {"key":string, "score": number}, - # You may log as many as you wish + [ + # 'key' is the metric name + # 'score' is the value of a numerical metric + {"key": string, "score": number}, + # 'value' is the value of a categorical metric + {"key": string, "value": string}, + ... # You may log as many as you wish ] } ``` @@ -28,17 +32,17 @@ Example: groupId="client-language" tabs={[ python` - from langsmith.schemas import Example, Run\n + def multiple_scores(outputs: dict, reference_outputs: dict) -> list[dict]: + # Replace with real evaluation logic. + precision = 0.8 + recall = 0.9 + f1 = 0.85 - def multiple_scores(root_run: Run, example: Example) -> dict: - # Your evaluation logic here - return { - "results": [ - {"key": "precision", "score": 0.8}, - {"key": "recall", "score": 0.9}, - {"key": "f1", "score": 0.85}, + return [ + {"key": "precision", "score": precision}, + {"key": "recall", "score": recall}, + {"key": "f1", "score": f1}, ] - } `, typescript({ caption: diff --git a/docs/evaluation/index.mdx b/docs/evaluation/index.mdx index ce7f4449..9c41ffba 100644 --- a/docs/evaluation/index.mdx +++ b/docs/evaluation/index.mdx @@ -91,6 +91,7 @@ export LANGCHAIN_API_KEY=`), evaluators=[is_concise], experiment_prefix="my first experiment " ) + `, typescript`import { Client } from "langsmith"; import { evaluate } from "langsmith/evaluation"; @@ -120,8 +121,8 @@ data: datasetName, evaluators: [isConcise], experimentPrefix: "my first experiment ", });`, - ]} - groupId="client-language" +]} +groupId="client-language" /> ## 5. View Experiments UI From ac3e7f37f2a5ecc00b15e34c846778b3dc2df2ae Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 15 Nov 2024 19:31:56 -0800 Subject: [PATCH 14/29] wip --- .../evaluation/dataset_subset.mdx | 2 +- .../evaluation/dataset_version.mdx | 4 +- .../how_to_guides/evaluation/llm_as_judge.mdx | 6 +- .../how_to_guides/evaluation/metric_type.mdx | 74 +++++++++++++++++++ .../evaluation/rate_limiting.mdx | 46 ++++++++++++ docs/evaluation/how_to_guides/index.md | 4 +- 6 files changed, 131 insertions(+), 5 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx b/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx index 5ef78843..547fbcab 100644 --- a/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx +++ b/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx @@ -15,7 +15,7 @@ Before diving into this content, it might be helpful to read: ::: -# How to: Evaluate on a filtered view of a dataset +## Evaluate on a filtered view of a dataset You can use the `list_examples` / `listExamples` method to fetch a subset of examples from a dataset to evaluate on. You can refer to guide above to learn more about the different ways to fetch examples. 
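For instance, a filtered evaluation might look like the following sketch. The dataset name and split name are placeholders, and the target function and evaluator are intentionally trivial so the snippet stays self-contained.

```python
from langsmith import Client, evaluate

client = Client()

# Only fetch examples in the "test" split of the dataset.
test_split = client.list_examples(dataset_name="my dataset", splits=["test"])

# Trivial target and evaluator, just to make the example runnable end to end.
def my_app(inputs: dict) -> dict:
    return {"answer": "Hello, " + inputs["question"]}

def has_answer(inputs: dict, outputs: dict) -> bool:
    return bool(outputs["answer"])

results = evaluate(
    my_app,
    data=test_split,   # any iterable of examples can be passed as `data`
    evaluators=[has_answer],
)
```

The same pattern works for other filters — anything `list_examples` returns (metadata filters, dataset versions, and so on) can be passed as `data`.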
diff --git a/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx b/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx index c61aed1e..e592bcad 100644 --- a/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx +++ b/docs/evaluation/how_to_guides/evaluation/dataset_version.mdx @@ -21,10 +21,12 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular tabs={[ python` from langsmith import evaluate + + latest_data=client.list_examples(dataset_name=toxic_dataset_name, as_of="latest") results = evaluate( lambda inputs: label_text(inputs["text"]), - data=client.list_examples(dataset_name=toxic_dataset_name, as_of="latest"), + data=latest_data, evaluators=[correct_label], experiment_prefix="Toxic Queries", ) diff --git a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx index 47946955..2f6d6655 100644 --- a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx +++ b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx @@ -76,4 +76,8 @@ See [here](../../how_to_guides/evaluation/custom_evaluator) for more on how to w ## Builtin evaluator via the UI -## Prebuilt evaluator via LangChain +See [here](../../how_to_guides/evaluation/builtin_evaluators) for how to use LangSmith's builtin evaluators. + +## Prebuilt evaluator via `langchain` + +See [here](../../how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) for how to use prebuilt evaluators from `langchain`. diff --git a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx index e69de29b..cd97786d 100644 --- a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx +++ b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx @@ -0,0 +1,74 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to return categorical vs numerical metrics + +:::info Key concepts + +- Metrics + +::: + +LangSmith supports both categorical and numerical metrics, and you can return either when writing a [custom evaluator](../../how_to_guides/evaluation/custom_evaluator). + +For an evaluator result to be logged as a numerical metric, it must returned as: + +- an `int`, `float`, or `bool` +- a dict of the form `{"key": "metric_name", "score": int | float | bool}` + +For an evaluator result to be logged as a categorical metric, it must be returned as: + +- a `str` +- a dict of the form `{"key": "metric_name", "value": str | int | float | bool}` + +Here are some examples: + + float: + # Evaluation logic... + + return 0.8 + + # Equivalently + # return {"score": 0.8} + + # Or + # return {"key": "numerical_metric", "score": 0.8} + + def categorical_metric(inputs: dict, outputs: dict, reference_outputs: dict) -> str: + # Evaluation logic... 
+ + return "english" + + # Equivalently + # return {"key": "categorical_metric", "score": "english"} + + # Or + # return {"score": "english"} + `, + typescript({ + caption: + "Support for multiple scores is available in `langsmith@0.1.32` and higher", + })` + import type { Run, Example } from "langsmith/schemas"; + + function multipleScores(rootRun: Run, example: Example) { + // Your evaluation logic here + return { + results: [ + { key: "precision", score: 0.8 }, + { key: "recall", score: 0.9 }, + { key: "f1", score: 0.85 }, + ], + }; + } + `, + +]} +/> diff --git a/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx b/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx index e69de29b..babeab1f 100644 --- a/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx +++ b/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx @@ -0,0 +1,46 @@ +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; + +# How to handle model rate limits + +A common issue when running large enough evaluation jobs with high enough concurrency is running into third-party rate limit errors, usually from model providers. + +## Using `langchain` + +If you're using `langchain` ChatModels in your application or evaluators you can add rate limiters to your model(s) that will space out the frequency with which requests are made to the model provider API, so that you don't hit the model provider rate limits. + + dict: + ... + `, + typescript` + ... + `, + +]} +/> + +See the `langchain` documentation for more on how to configure rate limiters: [Python](https://python.langchain.com/docs/how_to/chat_model_rate_limiting/), [JS](). + +## Limiting `max_concurrency` + +## Related + +- See [here](../../how_to_guides/evaluation/large_job) for more guidance on how to run large evaluation jobs efficiently. diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 550b599b..9300c994 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -30,7 +30,7 @@ Evaluate and improve your application. - [Evaluate aggregate experiment results](./how_to_guides/evaluation/summary) - [Evaluate intermediate steps](./how_to_guides/evaluation/evaluate_on_intermediate_steps) - [Return multiple metrics in one evaluator](./how_to_guides/evaluation/multiple_scores) -- [Return categorical and continuous metrics](./how_to_guides/evaluation/metric_type) +- [Return categorical vs numerical metrics](./how_to_guides/evaluation/metric_type) - [Check your evaluator setup](./how_to_guides/evaluation/check_evaluator) ### Configure the data @@ -42,7 +42,7 @@ Evaluate and improve your application. 
- [Evaluate with repetitions](./how_to_guides/evaluation/repetition) - [Run a large evaluation job](./how_to_guides/evaluation/large_job) -- [Handle rate limiting](./how_to_guides/evaluation/rate_limiting) +- [Handle model rate limits](./how_to_guides/evaluation/rate_limiting) ## Unit testing From bbcbe331b11c1b49f1b58b203a49e3f61fafc881 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Sun, 17 Nov 2024 13:20:05 -0800 Subject: [PATCH 15/29] wip --- .../evaluation/langchain_runnable.mdx | 14 ++-- .../evaluation/rate_limiting.mdx | 75 +++++++++++++++++-- docs/evaluation/how_to_guides/index.md | 4 +- 3 files changed, 78 insertions(+), 15 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx index bfab559e..78416a32 100644 --- a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx @@ -10,11 +10,13 @@ import { :::info -`langchain` refers to the [Python](https://python.langchain.com) and [TypeScript](https://js.langchain.com) OSS frameworks for building LLM applications. +`langchain` refers to the [Python](https://python.langchain.com) and [JS/TS](https://js.langchain.com) OSS frameworks for building LLM applications. ::: -First, install all the required packages +## Setup + +Let's define a simple chain to evaluate. First, install all the required packages: -Now define your chain +Now define a chain: -Then pass the chain to the `evaluate()` method. Note that the input variables of the chain must match the keys of the example inputs. In this case, the example inputs should have the form `{"text": "..."}`. +## Evaluate + +To evaluate our chain we can pass it directly to the `evaluate()` / `aevaluate()` method. Note that the input variables of the chain must match the keys of the example inputs. In this case, the example inputs should have the form `{"text": "..."}`. dict: + response = llm.invoke(...) + ... + + def evaluator(inputs: dict, outputs: dict, reference_outputs: dict) -> dict: + response = llm.invoke(...) ... - `, - typescript` - ... `, ]} /> -See the `langchain` documentation for more on how to configure rate limiters: [Python](https://python.langchain.com/docs/how_to/chat_model_rate_limiting/), [JS](). +See the [langchain](https://python.langchain.com/docs/how_to/chat_model_rate_limiting/) documentation for more on how to configure rate limiters. + +## Retrying with exponential backoff + +A very common way to deal with rate limit errors is retrying with exponential backoff. +Retrying with exponential backoff means repeatedly retrying failed requests with an (exponentially) increasing wait time between each retry. +This continues until either the request succeeds or a maximum number of requests is made. + +#### With `langchain` + +If you're using `langchain` components you can add retries to all model calls with the `.with_retry(...)` method: + + + +See the `langchain` [Python](https://python.langchain.com/api_reference/core/language_models/langchain_core.language_models.chat_models.BaseChatModel.html#langchain_core.language_models.chat_models.BaseChatModel.with_retry) and [JS](https://v03.api.js.langchain.com/classes/_langchain_core.language_models_chat_models.BaseChatModel.html#withRetry) API references for more. 
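To make that concrete, here is one way the pieces might fit together in an evaluation run. This is a sketch only: the model name, prompts, and dataset name are placeholders, and the same retrying model backs both the application and an LLM-as-judge evaluator.

```python
from langchain.chat_models import init_chat_model
from langsmith import evaluate

# Retry failed model calls with exponential backoff before giving up.
llm = init_chat_model("gpt-4o-mini").with_retry(
    wait_exponential_jitter=True,  # exponentially increasing wait between attempts
    stop_after_attempt=6,
)

def app(inputs: dict) -> dict:
    response = llm.invoke(f"Answer briefly: {inputs['question']}")
    return {"answer": response.content}

def helpful(inputs: dict, outputs: dict) -> bool:
    verdict = llm.invoke(
        f"Question: {inputs['question']}\nAnswer: {outputs['answer']}\n"
        "Is the answer helpful? Reply YES or NO."
    )
    return verdict.content.strip().upper().startswith("YES")

results = evaluate(app, data="my dataset", evaluators=[helpful])
```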
+ +#### Without `langchain` -## Limiting `max_concurrency` +If you're not using `langchain` you can use other libraries like `tenacity` (Python) or `backoff` (Python) to implement retries with exponential backoff, or you can implement it from scratch. +See some examples of how to do this in the [OpenAI docs](https://platform.openai.com/docs/guides/rate-limits#retrying-with-exponential-backoff). + +## Limiting max_concurrency + +Limiting the number of concurrent calls you're making to your application and evaluators is another way to decrease the frequency of model calls you're making, and in that way avoid rate limit errors. +`max_concurrency` can be set directly on the [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) / [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html) functions. + + ## Related diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 9300c994..7aa52605 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -8,7 +8,7 @@ For comprehensive descriptions of every class and function see the [API referenc ## Offline evaluation -Evaluate and improve your application. +Evaluate and improve your application before deploying it. ### Run an evaluation @@ -33,7 +33,7 @@ Evaluate and improve your application. - [Return categorical vs numerical metrics](./how_to_guides/evaluation/metric_type) - [Check your evaluator setup](./how_to_guides/evaluation/check_evaluator) -### Configure the data +### Configure the evaluation data - [Evaluate on a split / filtered view of a dataset](./how_to_guides/evaluation/dataset_subset) - [Evaluate on a specific dataset version](./how_to_guides/evaluation/dataset_version) From 5e6a4e0e0d2d82ac083e95bdafaabad15760d6c7 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 21 Nov 2024 17:06:40 -0500 Subject: [PATCH 16/29] pairwise --- docs/evaluation/concepts/index.mdx | 5 +- .../evaluation/evaluate_pairwise.mdx | 112 ++++++++++-------- docs/evaluation/how_to_guides/index.md | 1 + docs/evaluation/index.mdx | 31 ++--- 4 files changed, 78 insertions(+), 71 deletions(-) diff --git a/docs/evaluation/concepts/index.mdx b/docs/evaluation/concepts/index.mdx index 523498ea..ab0f1f68 100644 --- a/docs/evaluation/concepts/index.mdx +++ b/docs/evaluation/concepts/index.mdx @@ -1,4 +1,4 @@ -# Concepts +# Evaluation concepts The pace of AI application development is often rate-limited by high-quality evaluations because there is a paradox of choice. Developers often wonder how to engineer their prompt or which LLM best balances accuracy, latency, and cost. High quality evaluations can help you rapidly answer these types of questions with confidence. @@ -130,7 +130,8 @@ See documentation on our workflow to audit and manually correct evaluator scores ### Pairwise -Pairwise evaluators pick the better of two task outputs based upon some criteria. +Pairwise evaluators allow you to compare the outputs of two versions of an application. +Think [LMSYS Chatbot Arena](https://chat.lmsys.org/) - this is the same concept, but applied to AI applications more generally, not just models! This can use either a heuristic ("which response is longer"), an LLM (with a specific pairwise prompt), or human (asking them to manually annotate examples). 
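As a rough sketch of the heuristic flavor, the evaluator below prefers whichever of the two candidate outputs is shorter. It assumes each experiment's outputs expose an `answer` key, and the experiment names are placeholders.

```python
from langsmith.evaluation import evaluate_comparative

def prefer_shorter(inputs: dict, outputs: list[dict]) -> list[int]:
    # `outputs` holds one output dict per experiment being compared.
    a = outputs[0].get("answer", "")
    b = outputs[1].get("answer", "")
    if len(a) == len(b):
        return [1, 1]  # tie: both equally good
    return [1, 0] if len(a) < len(b) else [0, 1]

evaluate_comparative(
    ["experiment-a", "experiment-b"],  # names or IDs of existing experiments
    evaluators=[prefer_shorter],
)
```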
**When should you use pairwise evaluation?** Pairwise evaluation is helpful when it is difficult to directly score an LLM output, but easier to compare two outputs. diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx index 2200b950..ca7b7f87 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx @@ -11,19 +11,20 @@ import { # How to run pairwise evaluations -:::tip Recommended Reading -Before diving into this content, it might be helpful to read the following: +:::info Key concepts -- [How-to guide on running regular evals](./evaluate_llm_application) +- [Pairwise evaluations](../../concepts#pairwise) ::: -LangSmith supports evaluating **existing** experiments in a comparative manner. This allows you to use automatic evaluators (especially, LLM-based evaluators) to score the outputs from multiple experiments against each other, rather than being confined to evaluating outputs one at a time. Think [LMSYS Chatbot Arena](https://chat.lmsys.org/) - this is the same concept! To do this, use the `evaluate_comparative` / `evaluateComparative` function -with two existing experiments. +LangSmith supports evaluating **existing** experiments in a comparative manner. +This allows you to score the outputs from multiple experiments against each other, rather than being confined to evaluating outputs one at a time. +Think [LMSYS Chatbot Arena](https://chat.lmsys.org/) - this is the same concept! +To do this, use the [evaluate_comparative](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate_comparative.html) / `evaluateComparative` function with two existing experiments. If you haven't already created experiments to compare, check out our [quick start](https://docs.smith.langchain.com/#5-run-your-first-evaluation) or oue [how-to guide](https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_llm_application) to get started with evaluations. -## Use the `evaluate_comparative` function +## `evaluate_comparative` args :::note Pairwise evaluations currently require `langsmith` SDK Python version `>=0.1.55` or JS version `>=0.1.24`. @@ -48,19 +49,34 @@ Along with these, you can also pass in the following optional args: | `metadata` | Metadata to attach to your pairwise experiment. Defaults to None. | | `load_nested` / `loadNested` | Whether to load all child runs for the experiment. When False, only the root trace will be passed to your evaluator. Defaults to False. | -## Configure inputs and outputs for pairwise evaluators +## Define a pairwise evaluator -**Inputs:** A list of Runs and a single Example. This is exactly the same as a normal evaluator, except with a list of Runs instead of a single Run. The list of runs will have a length of two. You can access the inputs and outputs with -`runs[0].inputs`, `runs[0].outputs`, `runs[1].inputs`, `runs[1].outputs`, `example.inputs`, and `example.outputs`. +Pairwise evaluators are just functions with an expected signature. -**Output:** Your evaluator should return a dictionary with two keys: +### Evaluator args -- `key`, which represents the feedback key that will be logged -- `scores`, which is a mapping from run ID to score for that run. **We strongly encourage using 0 and 1 as the score values, where 1 is better.** You may also set both to 0 to represent "both equally bad" or both to 1 for "both equally good". 
+Custom evaluator functions must have specific argument names. They can take any subset of the following arguments: + +- `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset. +- `outputs: list[dict]`: A two-item list of the outputs produced by each experiment on the given inputs. +- `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available. +- `runs: list[langsmith.schemas.Run]`: A two-item list of the full Run objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run. +- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available). + +For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. + +### Evaluator output + +Custom evaluators are expected to return one of the following types: + +- `dict`: dictionary with keys: + - `key`, which represents the feedback key that will be logged + - `scores`, which is a mapping from run ID to score for that run. +- `list[int | float | bool]`: a two-item list of scores. The list is assumed to have the same order as the `runs` / `outputs` evaluator args. The evaluator function name is used for the feedback key. Note that you should choose a feedback key that is distinct from standard feedbacks on your run. We recommend prefixing pairwise feedback keys with `pairwise_` or `ranked_`. -## Compare two experiments with LLM-based pairwise evaluators +## Run a pairwise evaluation The following example uses [a prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) which asks the LLM to decide which is better between two AI assistant responses. It uses structured output to parse the AI's response: 0, 1, or 2. @@ -77,41 +93,34 @@ The prompt asks the LLM to decide which is better between two AI assistant respo list: + response = chain.invoke({ + "question": inputs["question"], + "answer_a": outputs[0].get("output", "N/A"), + "answer_b": outputs[1].get("output", "N/A"), }) - score = response["Preference"] - if score == 1: - scores[runs[0].id] = 1 - scores[runs[1].id] = 0 - elif score == 2: - scores[runs[0].id] = 0 - scores[runs[1].id] = 1 + preference = response["Preference"] + + if preference == 1: + scores = [1, 0] + elif preference == 2: + scores = [0, 1] else: - scores[runs[0].id] = 0 - scores[runs[1].id] = 0 - return {"key": "ranked_preference", "scores": scores} - - + scores = [0, 0] + return scores + evaluate_comparative( # Replace the following array with the names or IDs of your experiments ["my-experiment-name-1", "my-experiment-name-2"], - evaluators=[evaluate_pairwise], + evaluators=[ranked_preference], ) `, typescript({ @@ -122,22 +131,22 @@ The prompt asks the LLM to decide which is better between two AI assistant respo import { evaluateComparative } from "langsmith/evaluation"; import { wrapOpenAI } from "langsmith/wrappers"; import OpenAI from "openai"; - + const openai = wrapOpenAI(new OpenAI()); import { z } from "zod"; - + async function evaluatePairwise(runs: Run[], example: Example) { const scores: Record = {}; const [runA, runB] = runs; - + if (!runA || !runB) throw new Error("Expected at least two runs"); - + const payload = { question: example.inputs?.question, answer_a: runA?.outputs?.output ?? 
"N/A", answer_b: runB?.outputs?.output ?? "N/A", }; - + const output = await openai.chat.completions.create({ model: "gpt-4-turbo", messages: [ @@ -189,13 +198,13 @@ The prompt asks the LLM to decide which is better between two AI assistant respo }, ], }); - + const { Preference } = z .object({ Preference: z.number() }) .parse( JSON.parse(output.choices[0].message.tool_calls[0].function.arguments) ); - + if (Preference === 1) { scores[runA.id] = 1; scores[runB.id] = 0; @@ -206,15 +215,16 @@ The prompt asks the LLM to decide which is better between two AI assistant respo scores[runA.id] = 0; scores[runB.id] = 0; } - + return { key: "ranked_preference", scores }; } - + await evaluateComparative(["earnest-name-40", "reflecting-pump-91"], { evaluators: [evaluatePairwise], }); `, - ]} + +]} /> ## View pairwise experiments diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 7aa52605..307e1860 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -25,6 +25,7 @@ Evaluate and improve your application before deploying it. - [Define a custom evaluator](./how_to_guides/evaluation/custom_evaluator) - [Define an LLM-as-a-judge evaluator](./how_to_guides/evaluation/llm_as_judge) +- [Define a pairwise evaluator](./how_to_guides/evaluation/evaluate_pairwise) - [Use an off-the-shelf evaluator via the SDK (Python only)](./how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) - [Use an off-the-shelf evaluator via the UI](./how_to_guides/evaluation/builtin_evaluators) - [Evaluate aggregate experiment results](./how_to_guides/evaluation/summary) diff --git a/docs/evaluation/index.mdx b/docs/evaluation/index.mdx index 9c41ffba..f82cf5f0 100644 --- a/docs/evaluation/index.mdx +++ b/docs/evaluation/index.mdx @@ -8,24 +8,11 @@ import { CodeTabs, python, typescript, - PythonBlock, ShellBlock, - TypeScriptBlock, } from "@site/src/components/InstructionsWithCode"; -import { - LangChainInstallationCodeTabs, - LangChainQuickStartCodeTabs, - ConfigureEnvironmentCodeTabs, - RunTreeQuickStartCodeTabs, - ConfigureSDKEnvironmentCodeTabs, - PythonSDKTracingCode, - TypeScriptSDKTracingCode, -} from "@site/src/components/QuickStart"; -import { ClientInstallationCodeTabs } from "@site/src/components/ClientInstallation"; -import DocCardList from "@theme/DocCardList"; import { RegionalUrl } from "@site/src/components/RegionalUrls"; -# Evaluation Quick Start +# Evaluation quick start This quick start will get you up and running with our evaluation SDK and Experiments UI. @@ -82,12 +69,12 @@ export LANGCHAIN_API_KEY=`), # 3. Define the interface to your app def chatbot(inputs: dict) -> dict: - return {"answer": inputs["question"] + "is a good question. I don't know the answer."} + return {"answer": inputs["question"] + " is a good question. I don't know the answer."} # 4. Run an evaluation evaluate( chatbot, - data=dataset.name, + data=dataset, evaluators=[is_concise], experiment_prefix="my first experiment " ) @@ -127,5 +114,13 @@ groupId="client-language" ## 5. View Experiments UI -Click the link printed out by your evaluation run to access the LangSmith experiments UI, -and explore the results of your evaluation. +Click the link printed out by your evaluation run to access the LangSmith Experiments UI, and explore the results of your evaluation. + +![](./how_to_guides/evaluation/static/view_experiment.gif) + +## Next steps + +For conceptual explanations see the [Conceptual guide](./evaluation/concepts). 
+See the [How-to guides](./evaluation/how_to_guides) for answers to “How do I….?” format questions. +For end-to-end walkthroughs see [Tutorials](./evaluation/tutorials). +For comprehensive descriptions of every class and function see the [API reference](https://langsmith-sdk.readthedocs.io/en/latest/evaluation.html). From 7f78762d92c81f760a1bf94ac483dee39f51b430 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 22 Nov 2024 12:19:47 -0500 Subject: [PATCH 17/29] langgraph --- .../how_to_guides/evaluation/async.mdx | 10 +- .../evaluation/evaluate_llm_application.mdx | 13 +- .../evaluate_on_intermediate_steps.mdx | 4 + .../evaluation/langchain_runnable.mdx | 8 +- .../how_to_guides/evaluation/langgraph.mdx | 218 ++++++++++++++++++ .../how_to_guides/evaluation/large_job.mdx | 0 docs/evaluation/how_to_guides/index.md | 1 - 7 files changed, 245 insertions(+), 9 deletions(-) delete mode 100644 docs/evaluation/how_to_guides/evaluation/large_job.mdx diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index cc2c9c59..1786107b 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -6,6 +6,12 @@ import { # How to run an evaluation asynchronously +:::info Key concepts + +[Evaluations](../../concepts#applying-evaluations) | [Evaluators](../../concepts#evaluators) | [Datasets](../../concepts#datasets) | [Experiments](../../concepts#experiments) + +::: + We can run evaluations asynchronously via the SDK using [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html), which accepts all of the same arguments as [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) but expects the application function to be asynchronous. You can learn more about how to use the `evaluate()` function [here](../../how_to_guides/evaluation/evaluate_llm_application). @@ -13,12 +19,12 @@ You can learn more about how to use the `evaluate()` function [here](../../how_t :::info Python only This guide is only relevant when using the Python SDK. -In JS/TypeScript the `evaluate()` function is already async. +In JS/TS the `evaluate()` function is already async. You can see how to use it [here](../../how_to_guides/evaluation/evaluate_llm_application). ::: -## Using `aevaluate()` +## Use `aevaluate()` str: +"""Call to surf the web.""" # This is a placeholder, but don't tell the LLM that... +if "sf" in query.lower() or "san francisco" in query.lower(): +return "It's 60 degrees and foggy." +return "It's 90 degrees and sunny." 
+ +tools = [search] +tool_node = ToolNode(tools) +model = init_chat_model("claude-3-5-sonnet-latest").bind_tools(tools) + +# Define the function that determines whether to continue or not + +def should_continue(state: State) -> Literal["tools", END]: +messages = state['messages'] +last_message = messages[-1] # If the LLM makes a tool call, then we route to the "tools" node +if last_message.tool_calls: +return "tools" # Otherwise, we stop (reply to the user) +return END + +# Define the function that calls the model + +def call_model(state: State): +messages = state['messages'] +response = model.invoke(messages) # We return a list, because this will get added to the existing list +return {"messages": [response]} + +# Define a new graph + +workflow = StateGraph(State) + +# Define the two nodes we will cycle between + +workflow.add_node("agent", call_model) +workflow.add_node("tools", tool_node) + +# Set the entrypoint as 'agent' + +# This means that this node is the first one called + +workflow.add_edge(START, "agent") + +# We now add a conditional edge + +workflow.add_conditional_edges( # First, we define the start node. We use 'agent'. # This means these are the edges taken after the 'agent' node is called. +"agent", # Next, we pass in the function that will determine which node is called next. +should_continue, +) + +# We now add a normal edge from 'tools' to 'agent'. + +# This means that after 'tools' is called, 'agent' node is called next. + +workflow.add_edge("tools", 'agent') + +# Finally, we compile it! + +# This compiles it into a LangChain Runnable, + +# meaning you can use it as you would any other runnable. + +# Note that we're (optionally) passing the memory when compiling the graph + +app = workflow.compile() +`, + typescript` +// ToDo +`, + +]} +/> + +### Create a dataset + +Let's create a simple dataset of questions and expected responses: + + + +### Create an evaluator + +And a simple evaluator: + + bool: + instructions = ( + "Given an actual answer and an expected answer, determine whether" + " the actual answer contains all of the information in the" + " expected answer. Respond with 'CORRECT' if the actual answer" + " does contain all of the expected information and 'INCORRECT'" + " otherwise. Do not include anything else in your response." + ) + # Our graph outputs a State dictionary, which in this case means + # we'll have a 'messages' key and the final message should + # be our actual answer. 
+ actual_answer = outputs["messages"][-1].content + expected_answer = reference_outputs["answer"] + user_msg = ( + f"ACTUAL ANSWER: {actual_answer}" + f"\\n\\nEXPECTED ANSWER: {expected_answer}" + ) + response = await judge_llm.ainvoke( + [ + {"role": "system", "content": instructions}, + {"role": "user", "content": user_msg} + ] + ) + return response.content.upper() == "CORRECT" + `, + typescript` + // ToDo + `, + +]} +/> + +### Run evaluations + +Now we can run our evaluations + + + +## Evaluating individual nodes + +## Evaluating intermediate steps + +## Related + +- [`langgraph` evaluation docs](https://langchain-ai.github.io/langgraph/tutorials/#evaluation) diff --git a/docs/evaluation/how_to_guides/evaluation/large_job.mdx b/docs/evaluation/how_to_guides/evaluation/large_job.mdx deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 307e1860..5fe3ea1e 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -42,7 +42,6 @@ Evaluate and improve your application before deploying it. ### Configure an evaluation job - [Evaluate with repetitions](./how_to_guides/evaluation/repetition) -- [Run a large evaluation job](./how_to_guides/evaluation/large_job) - [Handle model rate limits](./how_to_guides/evaluation/rate_limiting) ## Unit testing From ad4c30d3641e04ebaed98884f3566ddefd836524 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 22 Nov 2024 13:16:04 -0500 Subject: [PATCH 18/29] wip --- .../how_to_guides/evaluation/langgraph.mdx | 254 ++++++++++++------ 1 file changed, 179 insertions(+), 75 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx index 29668e1f..a858e477 100644 --- a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx @@ -27,83 +27,71 @@ Lets construct a simple ReACT agent to start: groupId="client-language" tabs={[ python` -from typing import Annotated, Literal, TypedDict - -from langchain.chat_models import init_chat_model -from langchain_core.tools import tool -from langgraph.graph import END, START, StateGraph -from langgraph.prebuilt import ToolNode -from langgraph.graph.message import add_messages - -class State(TypedDict): # Messages have the type "list". The 'add_messages' function # in the annotation defines how this state key should be updated # (in this case, it appends messages to the list, rather than overwriting them) -messages: Annotated[list, add_messages] - -# Define the tools for the agent to use - -@tool -def search(query: str) -> str: -"""Call to surf the web.""" # This is a placeholder, but don't tell the LLM that... -if "sf" in query.lower() or "san francisco" in query.lower(): -return "It's 60 degrees and foggy." -return "It's 90 degrees and sunny." 
- -tools = [search] -tool_node = ToolNode(tools) -model = init_chat_model("claude-3-5-sonnet-latest").bind_tools(tools) - -# Define the function that determines whether to continue or not - -def should_continue(state: State) -> Literal["tools", END]: -messages = state['messages'] -last_message = messages[-1] # If the LLM makes a tool call, then we route to the "tools" node -if last_message.tool_calls: -return "tools" # Otherwise, we stop (reply to the user) -return END - -# Define the function that calls the model - -def call_model(state: State): -messages = state['messages'] -response = model.invoke(messages) # We return a list, because this will get added to the existing list -return {"messages": [response]} - -# Define a new graph - -workflow = StateGraph(State) - -# Define the two nodes we will cycle between - -workflow.add_node("agent", call_model) -workflow.add_node("tools", tool_node) - -# Set the entrypoint as 'agent' - -# This means that this node is the first one called - -workflow.add_edge(START, "agent") - -# We now add a conditional edge - -workflow.add_conditional_edges( # First, we define the start node. We use 'agent'. # This means these are the edges taken after the 'agent' node is called. -"agent", # Next, we pass in the function that will determine which node is called next. -should_continue, -) - -# We now add a normal edge from 'tools' to 'agent'. - -# This means that after 'tools' is called, 'agent' node is called next. - -workflow.add_edge("tools", 'agent') - -# Finally, we compile it! - -# This compiles it into a LangChain Runnable, + from typing import Annotated, Literal, TypedDict + + from langchain.chat_models import init_chat_model + from langchain_core.tools import tool + from langgraph.graph import END, START, StateGraph + from langgraph.prebuilt import ToolNode + from langgraph.graph.message import add_messages + + class State(TypedDict): # Messages have the type "list". The 'add_messages' function # in the annotation defines how this state key should be updated # (in this case, it appends messages to the list, rather than overwriting them) + messages: Annotated[list, add_messages] + + # Define the tools for the agent to use + @tool + def search(query: str) -> str: + """Call to surf the web.""" # This is a placeholder, but don't tell the LLM that... + if "sf" in query.lower() or "san francisco" in query.lower(): + return "It's 60 degrees and foggy." + return "It's 90 degrees and sunny." 
+ + tools = [search] + tool_node = ToolNode(tools) + model = init_chat_model("claude-3-5-sonnet-latest").bind_tools(tools) + + # Define the function that determines whether to continue or not + def should_continue(state: State) -> Literal["tools", END]: + messages = state['messages'] + last_message = messages[-1] # If the LLM makes a tool call, then we route to the "tools" node + if last_message.tool_calls: + return "tools" # Otherwise, we stop (reply to the user) + return END + + # Define the function that calls the model + + def call_model(state: State): + messages = state['messages'] + response = model.invoke(messages) # We return a list, because this will get added to the existing list + return {"messages": [response]} + + # Define a new graph + workflow = StateGraph(State) + + # Define the two nodes we will cycle between + workflow.add_node("agent", call_model) + workflow.add_node("tools", tool_node) + + # Set the entrypoint as 'agent' + # This means that this node is the first one called + workflow.add_edge(START, "agent") + + # We now add a conditional edge + workflow.add_conditional_edges( # First, we define the start node. We use 'agent'. # This means these are the edges taken after the 'agent' node is called. + "agent", # Next, we pass in the function that will determine which node is called next. + should_continue, + ) -# meaning you can use it as you would any other runnable. + # We now add a normal edge from 'tools' to 'agent'. + # This means that after 'tools' is called, 'agent' node is called next. + workflow.add_edge("tools", 'agent') -# Note that we're (optionally) passing the memory when compiling the graph + # Finally, we compile it! + # This compiles it into a LangChain Runnable, + # meaning you can use it as you would any other runnable. + # Note that we're (optionally) passing the memory when compiling the graph + app = workflow.compile() -app = workflow.compile() `, typescript` // ToDo @@ -193,7 +181,7 @@ And a simple evaluator: ### Run evaluations -Now we can run our evaluations +Now we can run our evaluations and explore the results: +Click to see a consolidated code snippet + str: + """Call to surf the web.""" # This is a placeholder, but don't tell the LLM that... + if "sf" in query.lower() or "san francisco" in query.lower(): + return "It's 60 degrees and foggy." + return "It's 90 degrees and sunny." + + tools = [search] + tool_node = ToolNode(tools) + model = init_chat_model("claude-3-5-sonnet-latest").bind_tools(tools) + + # Define the function that determines whether to continue or not + + def should_continue(state: State) -> Literal["tools", END]: + messages = state['messages'] + last_message = messages[-1] # If the LLM makes a tool call, then we route to the "tools" node + if last_message.tool_calls: + return "tools" # Otherwise, we stop (reply to the user) + return END + + # Define the function that calls the model + + def call_model(state: State): + messages = state['messages'] + response = model.invoke(messages) # We return a list, because this will get added to the existing list + return {"messages": [response]} + + # Define a new graph + workflow = StateGraph(State) + + # Define the two nodes we will cycle between + workflow.add_node("agent", call_model) + workflow.add_node("tools", tool_node) + + # Set the entrypoint as 'agent' + # This means that this node is the first one called + workflow.add_edge(START, "agent") + + # We now add a conditional edge + workflow.add_conditional_edges( # First, we define the start node. We use 'agent'. 
# This means these are the edges taken after the 'agent' node is called. + "agent", # Next, we pass in the function that will determine which node is called next. + should_continue, + ) + + # We now add a normal edge from 'tools' to 'agent'. + # This means that after 'tools' is called, 'agent' node is called next. + workflow.add_edge("tools", 'agent') + + # Finally, we compile it! + # This compiles it into a LangChain Runnable, + # meaning you can use it as you would any other runnable. + # Note that we're (optionally) passing the memory when compiling the graph + app = workflow.compile() + + questions = [ + "what's the weather in sf", + "whats the weather in san fran", + "whats the weather in tangier" + ] + answers = [ + "It's 60 degrees and foggy.", + "It's 60 degrees and foggy.", + "It's 90 degrees and sunny.", + ] + + # Create a dataset + ls_client = Client() + + dataset = ls_client.create_dataset( + "weather agent", + inputs=[{"question": q} for q in questions], + outputs=[{"answers": a} for a in answers], + ) + + # Define evaluators + + + # Run evaluation + + # Explore results + +`, + typescript` +// ToDo +`, + +]} +/> + + From db7c388386161f4fc3a539a946ca7b911398d3e8 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 22 Nov 2024 13:46:24 -0500 Subject: [PATCH 19/29] fix --- docs/evaluation/how_to_guides/evaluation/async.mdx | 1 - docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index 1786107b..92ab9f2c 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -80,4 +80,3 @@ list 5 concrete questions that should be investigated to determine if the idea i ## Related - [Run an evaluation (synchronously)](../../how_to_guides/evaluation/evaluate_llm_application) -- [Run a large evaluation job](../../how_to_guides/evaluation/large_job): Learn about the key `aevaluate()` parameters to configure when running large evaluation jobs. diff --git a/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx b/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx index 11f7fb50..e2f2f56b 100644 --- a/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx +++ b/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx @@ -102,4 +102,3 @@ Limiting the number of concurrent calls you're making to your application and ev ## Related -- See [here](../../how_to_guides/evaluation/large_job) for more guidance on how to run large evaluation jobs efficiently. From a080e258b0a9b8ee973185472dd22bc905531e55 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 22 Nov 2024 18:18:43 -0500 Subject: [PATCH 20/29] more --- .../how_to_guides/evaluation/langgraph.mdx | 150 +++++++++++++++++- .../evaluation/rate_limiting.mdx | 1 - 2 files changed, 145 insertions(+), 6 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx index a858e477..82ae0e95 100644 --- a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx @@ -181,7 +181,15 @@ And a simple evaluator: ### Run evaluations -Now we can run our evaluations and explore the results: +Now we can run our evaluations and explore the results. 
+We'll just need to wrap our graph function so that it can take inputs in the format they're stored on our example: + +:::note Evaluating with async nodes + +If all of your graph nodes are defined as sync functions then you can use `evaluate` or `aevaluate`. +If any of you nodes are defined as async, you'll need to use `aevaluate` + +::: dict: + return {"messages": [{"role": "user", "content": "inputs['question']"}]} + + # We use LCEL declarative syntax here. + # Remember that langgraph graphs are also langchain runnables. + target = example_to_state | app + experiment_results = aevaluate( - app, + target, data="weather agent", evaluators=[correct], max_concurrency=4, # optional @@ -205,10 +220,100 @@ Now we can run our evaluations and explore the results: ]} /> -## Evaluating individual nodes - ## Evaluating intermediate steps +Often it is valuable to evaluate not only the final output of an agent but also the intermediate steps it has taken. +What's nice about `langgraph` is that the output of a graph is a state object that often already carries information about the intermediate steps taken. +Usually we can evaluate whatever we're interested in just by looking at the messages in our state. +For example, we can look at the messages to assert that the model invoked the 'search' tool upon as a first step. + + bool: + tool_calls = outputs["messages"][1].tool_calls + return bool(tool_calls and tool_calls[0]["name"] == "search") + + experiment_results = aevaluate( + target, + data="weather agent", + evaluators=[correct, right_tool], + max_concurrency=4, # optional + experiment_prefix="claude-3.5-baseline", # optional + ) + `, + typescript` + // ToDo + `, + +]} +/> + +If we need access to information about intermediate steps that isn't in state, we can look at the Run object. This contains the full traces for all node inputs and outputs: + +:::tip Custom evaluators + +See more about what arguments you can pass to custom evaluators in this [how-to guide](../evaluation/custom_evaluator). + +::: + + dict: + # Get documents and answer + first_model_run = next(run for run in root_run.child_runs if run.name == "agent") + tool_calls = first_model_run.outputs["messages"][-1].tool_calls + right_tool = bool(tool_calls and tool_calls[0]["name"] == "search") + return {"key": "right_tool", "value": right_tool} + + experiment_results = aevaluate( + target, + data="weather agent", + evaluators=[correct, right_tool_from_run], + max_concurrency=4, # optional + experiment_prefix="claude-3.5-baseline", # optional + ) + `, + typescript` + // ToDo + `, + +]} +/> + +## Running and evaluating individual nodes + +Sometimes you want to evaluate a single node directly to save time and costs. `langgraph` makes it easy to do this. +In this case we can even continue using the evaluators we've been using. + + + ## Related - [`langgraph` evaluation docs](https://langchain-ai.github.io/langgraph/tutorials/#evaluation) @@ -313,10 +418,45 @@ Now we can run our evaluations and explore the results: # Define evaluators + async def correct(outputs: dict, reference_outputs: dict) -> bool: + instructions = ( + "Given an actual answer and an expected answer, determine whether" + " the actual answer contains all of the information in the" + " expected answer. Respond with 'CORRECT' if the actual answer" + " does contain all of the expected information and 'INCORRECT'" + " otherwise. Do not include anything else in your response." 
+ ) + # Our graph outputs a State dictionary, which in this case means + # we'll have a 'messages' key and the final message should + # be our actual answer. + actual_answer = outputs["messages"][-1].content + expected_answer = reference_outputs["answer"] + user_msg = ( + f"ACTUAL ANSWER: {actual_answer}" + f"\\n\\nEXPECTED ANSWER: {expected_answer}" + ) + response = await judge_llm.ainvoke( + [ + {"role": "system", "content": instructions}, + {"role": "user", "content": user_msg} + ] + ) + return response.content.upper() == "CORRECT" + + + def right_tool(outputs: dict) -> bool: + tool_calls = outputs["messages"][1].tool_calls + return bool(tool_calls and tool_calls[0]["name"] == "search") # Run evaluation - # Explore results + experiment_results = aevaluate( + target, + data="weather agent", + evaluators=[correct, right_tool], + max_concurrency=4, # optional + experiment_prefix="claude-3.5-baseline", # optional + ) `, typescript` diff --git a/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx b/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx index e2f2f56b..e5ea4318 100644 --- a/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx +++ b/docs/evaluation/how_to_guides/evaluation/rate_limiting.mdx @@ -101,4 +101,3 @@ Limiting the number of concurrent calls you're making to your application and ev /> ## Related - From 8ebd71e1d61849937f722d738481454573488f10 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 22 Nov 2024 19:20:29 -0500 Subject: [PATCH 21/29] cr --- .../evaluation/dataset_subset.mdx | 6 +++++ .../evaluation/run_evals_api_only.mdx | 22 +++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx b/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx index 547fbcab..ca51c10e 100644 --- a/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx +++ b/docs/evaluation/how_to_guides/evaluation/dataset_subset.mdx @@ -49,6 +49,8 @@ One common workflow is to fetch examples that have a certain metadata key-value ]} /> +For more advanced filtering capabilities see this [how-to guide](../datasets/manage_datasets_programmatically#list-examples-by-structured-filter). + ## Evaluate on a dataset split You can use the `list_examples` / `listExamples` method to evaluate on one or multiple splits of your dataset. The `splits` param takes a list of the splits you would like to evaluate. @@ -80,3 +82,7 @@ You can use the `list_examples` / `listExamples` method to evaluate on one or mu `, ]} /> + +## Related + +- More on [how to filter datasets](../datasets/manage_datasets_programmatically#list-examples-by-structured-filter) diff --git a/docs/evaluation/how_to_guides/evaluation/run_evals_api_only.mdx b/docs/evaluation/how_to_guides/evaluation/run_evals_api_only.mdx index b431fe98..d50578f1 100644 --- a/docs/evaluation/how_to_guides/evaluation/run_evals_api_only.mdx +++ b/docs/evaluation/how_to_guides/evaluation/run_evals_api_only.mdx @@ -40,7 +40,7 @@ from uuid import uuid4 client = Client() - Create a dataset +# Create a dataset examples = [ ("Shut up, idiot", "Toxic"), ("You're a wonderful person", "Not toxic"), @@ -63,8 +63,8 @@ client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) First, pull all of the examples you'd want to use in your experiment. ```python - Pick a dataset id. In this case, we are using the dataset we created above. 
- Spec: https://api.smith.langchain.com/redoc#tag/examples/operation/delete_example_api_v1_examples__example_id__delete +# Pick a dataset id. In this case, we are using the dataset we created above. +# Spec: https://api.smith.langchain.com/redoc#tag/examples/operation/delete_example_api_v1_examples__example_id__delete dataset_id = dataset.id params = { "dataset": dataset_id } @@ -152,9 +152,9 @@ def run_completion_on_example(example, model_name, experiment_id): We are going to run completions on all examples using two models: gpt-3.5-turbo and gpt-4o-mini. ```python - Create a new experiment using the /sessions endpoint - An experiment is a collection of runs with a reference to the dataset used - Spec: https://api.smith.langchain.com/redoc#tag/tracer-sessions/operation/create_tracer_session_api_v1_sessions_post +# Create a new experiment using the /sessions endpoint +# An experiment is a collection of runs with a reference to the dataset used +# Spec: https://api.smith.langchain.com/redoc#tag/tracer-sessions/operation/create_tracer_session_api_v1_sessions_post model_names = ("gpt-3.5-turbo", "gpt-4o-mini") experiment_ids = [] @@ -194,8 +194,8 @@ Next, we'll demonstrate how to run a pairwise experiment. In a pairwise experime For more information, check out [this guide](../evaluation/evaluate_pairwise). ```python - A comparative experiment allows you to provide a preferential ranking on the outputs of two or more experiments - Spec: https://api.smith.langchain.com/redoc#tag/datasets/operation/create_comparative_experiment_api_v1_datasets_comparative_post +# A comparative experiment allows you to provide a preferential ranking on the outputs of two or more experiments +# Spec: https://api.smith.langchain.com/redoc#tag/datasets/operation/create_comparative_experiment_api_v1_datasets_comparative_post resp = requests.post( "https://api.smith.langchain.com/api/v1/datasets/comparative", json={ @@ -213,9 +213,9 @@ resp = requests.post( comparative_experiment = resp.json() comparative_experiment_id = comparative_experiment["id"] - You can iterate over the runs in the experiments belonging to the comparative experiment and preferentially rank the outputs +# You can iterate over the runs in the experiments belonging to the comparative experiment and preferentially rank the outputs - Fetch the comparative experiment +# Fetch the comparative experiment resp = requests.get( f"https://api.smith.langchain.com/api/v1/datasets/{str(dataset_id)}/comparative", params={"id": comparative_experiment_id}, @@ -228,7 +228,7 @@ experiment_ids = [info["id"] for info in comparative_experiment["experiments_inf from collections import defaultdict example_id_to_runs_map = defaultdict(list) - Spec: https://api.smith.langchain.com/redoc#tag/run/operation/query_runs_api_v1_runs_query_post +# Spec: https://api.smith.langchain.com/redoc#tag/run/operation/query_runs_api_v1_runs_query_post runs = requests.post( f"https://api.smith.langchain.com/api/v1/runs/query", headers={"x-api-key": os.environ["LANGSMITH_API_KEY"]}, From 6c23581fc57b62971446999ce66e1bf3a96c803b Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 22 Nov 2024 21:17:41 -0500 Subject: [PATCH 22/29] fmt --- .../manage_datasets_programmatically.mdx | 6 ++--- .../how_to_guides/evaluation/async.mdx | 26 ++++++++----------- .../evaluation/builtin_evaluators.mdx | 0 .../evaluation/evaluate_llm_application.mdx | 4 +-- .../how_to_guides/evaluation/langgraph.mdx | 10 +++---- docs/evaluation/how_to_guides/index.md | 2 +- 6 files changed, 22 insertions(+), 26 deletions(-) 
delete mode 100644 docs/evaluation/how_to_guides/evaluation/builtin_evaluators.mdx diff --git a/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx b/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx index c6daa3be..08a9627a 100644 --- a/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx +++ b/docs/evaluation/how_to_guides/datasets/manage_datasets_programmatically.mdx @@ -382,9 +382,9 @@ Additionally, you can also chain multiple filters together using the `and` opera tabs={[ PythonBlock( `examples = client.list_examples( - dataset_name=dataset_name, - filter='and(not(has(metadata, \\'{"foo": "bar"}\\')), exists(metadata, "tenant_id"))' - )` + dataset_name=dataset_name, + filter='and(not(has(metadata, \\'{"foo": "bar"}\\')), exists(metadata, "tenant_id"))' +)` ), TypeScriptBlock( `const examples = await client.listExamples({datasetName: datasetName, filter: 'and(not(has(metadata, \\'{"foo": "bar"}\\')), exists(metadata, "tenant_id"))'});` diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index 92ab9f2c..6741c635 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -1,8 +1,4 @@ -import { - CodeTabs, - python, - typescript, -} from "@site/src/components/InstructionsWithCode"; +import { CodeTabs, python } from "@site/src/components/InstructionsWithCode"; # How to run an evaluation asynchronously @@ -29,7 +25,7 @@ You can see how to use it [here](../../how_to_guides/evaluation/evaluate_llm_app =0.1.145`"})` from langsmith import aevaluate, wrappers, Client from openai import AsyncOpenAI @@ -57,22 +53,22 @@ list 5 concrete questions that should be investigated to determine if the idea i return len(output["output"]) < 3 * len(inputs["idea"]) ls_client = Client() - # TODO - dataset = ... 
- results = aevaluate( + examples = ["universal basic income", "nuclear fusion", "hyperloop", "nuclear powered rockets"] + dataset = ls_client.create_dataset("research ideas") + ls_client.create_examples( + dataset_name=dataset.name, + inputs=[{"idea": e} for e in examples, + ) + + results = await aevaluate( researcher_app, data=dataset, evaluators=[concise], max_concurrency=2, # Optional, no max by default - experiment_prefix="gpt-4o-mini, baseline" # Optional, random by default + experiment_prefix="gpt-4o-mini-baseline" # Optional, random by default ) `, - typescript` - import type { EvaluationResult } from "langsmith/evaluation"; - import type { Run, Example } from "langsmith/schemas"; - - `, ]} /> diff --git a/docs/evaluation/how_to_guides/evaluation/builtin_evaluators.mdx b/docs/evaluation/how_to_guides/evaluation/builtin_evaluators.mdx deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index 822e381d..b2bf0361 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -12,7 +12,7 @@ import { :::info Key concepts -[Evaluations](../../concepts#applying-evaluations) | [Evaluators](../../concepts#evaluators) | [Datasets](../../concepts#datasets) | [Experiments](../../concepts#experiments) +[Evaluations](../../concepts#applying-evaluations) | [Evaluators](../../concepts#evaluators) | [Datasets](../../concepts#datasets) ::: @@ -232,7 +232,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i =0.1.145`"})` from langsmith import Client, evaluate, traceable, wrappers from openai import OpenAI diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx index 82ae0e95..4f6429fe 100644 --- a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx @@ -204,7 +204,7 @@ If any of you nodes are defined as async, you'll need to use `aevaluate` # Remember that langgraph graphs are also langchain runnables. target = example_to_state | app - experiment_results = aevaluate( + experiment_results = await aevaluate( target, data="weather agent", evaluators=[correct], @@ -236,7 +236,7 @@ For example, we can look at the messages to assert that the model invoked the 's tool_calls = outputs["messages"][1].tool_calls return bool(tool_calls and tool_calls[0]["name"] == "search") - experiment_results = aevaluate( + experiment_results = await aevaluate( target, data="weather agent", evaluators=[correct, right_tool], @@ -272,7 +272,7 @@ See more about what arguments you can pass to custom evaluators in this [how-to right_tool = bool(tool_calls and tool_calls[0]["name"] == "search") return {"key": "right_tool", "value": right_tool} - experiment_results = aevaluate( + experiment_results = await aevaluate( target, data="weather agent", evaluators=[correct, right_tool_from_run], @@ -299,7 +299,7 @@ In this case we can even continue using the evaluators we've been using. node_target = example_to_state | app.nodes["agent"] - node_experiment_results = aevaluate( + node_experiment_results = await aevaluate( node_target, data="weather agent", evaluators=[right_tool_from_run], @@ -450,7 +450,7 @@ In this case we can even continue using the evaluators we've been using. 
# Run evaluation - experiment_results = aevaluate( + experiment_results = await aevaluate( target, data="weather agent", evaluators=[correct, right_tool], diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 5fe3ea1e..698f00fd 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -19,7 +19,7 @@ Evaluate and improve your application before deploying it. - [Evaluate a `langgraph` graph](./how_to_guides/evaluation/langgraph) - [Run an evaluation of an existing experiment](./how_to_guides/evaluation/evaluate_existing_experiment) - [Run an evaluation via the REST API](./how_to_guides/evaluation/run_evals_api_only) -- [Run an evaluation from the prompt playground](./how_to_guides/evaluation/run_evaluation_from_prompt_playground) +- [Run an evaluation from the UI](./how_to_guides/evaluation/run_evaluation_from_prompt_playground) ### Define an evaluator From 0524905bd4bd679a1767754856c85a884f2adcdd Mon Sep 17 00:00:00 2001 From: Bagatur Date: Sat, 23 Nov 2024 04:36:56 -0500 Subject: [PATCH 23/29] fix --- .../evaluation/evaluate_llm_application.mdx | 4 +-- .../evaluation/evaluate_pairwise.mdx | 22 +++++++++++----- .../evaluation/langchain_runnable.mdx | 21 ++++++++++----- .../how_to_guides/evaluation/langgraph.mdx | 26 ------------------- 4 files changed, 32 insertions(+), 41 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index b2bf0361..7d368195 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -160,7 +160,7 @@ Since we have labels for this task, our evaluator can directly check if the actu =0.1.145`" })` def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool: return outputs["output"] == reference_outputs["label"] `, @@ -169,7 +169,7 @@ Since we have labels for this task, our evaluator can directly check if the actu import type { Run, Example } from "langsmith/schemas"; function correct(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.outputs === example.outputs?.output; + const score = rootRun.outputs?.output === example.outputs?.outputs; return { key: "correct", score }; } `, diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx index ca7b7f87..24f6c710 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx @@ -27,7 +27,7 @@ If you haven't already created experiments to compare, check out our [quick star ## `evaluate_comparative` args :::note -Pairwise evaluations currently require `langsmith` SDK Python version `>=0.1.55` or JS version `>=0.1.24`. +Pairwise evaluations require `langsmith` SDK Python version `>=0.1.145` or JS version `>=0.1.24`. ::: At its simplest, `evaluate_comparative` / `evaluateComparative` function takes the following arguments: @@ -57,21 +57,31 @@ Pairwise evaluators are just functions with an expected signature. Custom evaluator functions must have specific argument names. They can take any subset of the following arguments: +Python and JS/TS + +- `runs: list[langsmith.schemas.Run]`: A two-item list of the full Run objects generated by the two experiments on the given example. 
Use this if you need access to intermediate steps or metadata about each run. +- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available). + +Currently Python only + - `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset. - `outputs: list[dict]`: A two-item list of the outputs produced by each experiment on the given inputs. - `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available. -- `runs: list[langsmith.schemas.Run]`: A two-item list of the full Run objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run. -- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available). -For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. +For most Python use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. ### Evaluator output Custom evaluators are expected to return one of the following types: +Python and JS/TS + - `dict`: dictionary with keys: - `key`, which represents the feedback key that will be logged - `scores`, which is a mapping from run ID to score for that run. + +Currently Python only + - `list[int | float | bool]`: a two-item list of scores. The list is assumed to have the same order as the `runs` / `outputs` evaluator args. The evaluator function name is used for the feedback key. Note that you should choose a feedback key that is distinct from standard feedbacks on your run. We recommend prefixing pairwise feedback keys with `pairwise_` or `ranked_`. @@ -81,7 +91,7 @@ Note that you should choose a feedback key that is distinct from standard feedba The following example uses [a prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) which asks the LLM to decide which is better between two AI assistant responses. It uses structured output to parse the AI's response: 0, 1, or 2. -:::note Optional LangChain Usage +:::info Optional LangChain Usage In the Python example below, we are pulling [this structured prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) from the [LangChain Hub](../../../prompt_engineering/how_to_guides/prompts/langchain_hub) and using it with a LangChain LLM wrapper. The prompt asks the LLM to decide which is better between two AI assistant responses. It uses structured output to parse the AI's response: 0, 1, or 2. 
@@ -92,7 +102,7 @@ The prompt asks the LLM to decide which is better between two AI assistant respo =0.1.145`"})` from langchain import hub from langchain.chat_models import init_chat_model from langsmith import evaluate_comparative diff --git a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx index 9ca62d03..40566c2a 100644 --- a/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx @@ -6,14 +6,15 @@ import { # How to evaluate a `langchain` runnable -`langchain` [Runnable](https://python.langchain.com/docs/concepts/runnables/) objects (such as chat models, retrievers, chains, etc.) can be passed directly into `evaluate()` / `aevaluate()`. - -:::info +:::info Key concepts -`langchain` refers to the [Python](https://python.langchain.com) and [JS/TS](https://js.langchain.com) OSS frameworks for building LLM applications. +- `langchain`: [Python](https://python.langchain.com) and [JS/TS](https://js.langchain.com) +- Runnable: [Python](https://python.langchain.com/docs/concepts/runnables/) and [JS/TS](https://js.langchain.com/docs/concepts/runnables/) ::: +`langchain` [Runnable](https://python.langchain.com/docs/concepts/runnables/) objects (such as chat models, retrievers, chains, etc.) can be passed directly into `evaluate()` / `aevaluate()`. + ## Setup Let's define a simple chain to evaluate. First, install all the required packages: @@ -101,7 +102,7 @@ To evaluate our chain we can pass it directly to the `evaluate()` / `aevaluate() actual = outputs["output"] expected = reference_outputs["label"] - assert actual == expected + return actual == expected results = await aevaluate( chain, @@ -112,11 +113,17 @@ To evaluate our chain we can pass it directly to the `evaluate()` / `aevaluate() `, typescript` import { evaluate } from "langsmith/evaluation"; + import { Client } from "langsmith"; + + const langsmith = new Client(); + const dataset = await client.clonePublicDataset( + "https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d" + ) await evaluate(chain, { - data: datasetName, + data: dataset.name, evaluators: [correct], - experimentPrefix: "Toxic Queries", + experimentPrefix: "gpt-4o, ba }); `, diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx index 4f6429fe..26308cdb 100644 --- a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx @@ -93,10 +93,6 @@ Lets construct a simple ReACT agent to start: app = workflow.compile() `, - typescript` -// ToDo -`, - ]} /> @@ -129,10 +125,6 @@ Let's create a simple dataset of questions and expected responses: outputs=[{"answers": a} for a in answers], ) `, - typescript` - // ToDo - `, - ]} /> @@ -172,9 +164,6 @@ And a simple evaluator: ) return response.content.upper() == "CORRECT" `, - typescript` - // ToDo - `, ]} /> @@ -213,9 +202,6 @@ If any of you nodes are defined as async, you'll need to use `aevaluate` ) `, - typescript` - // ToDo - `, ]} /> @@ -244,9 +230,6 @@ For example, we can look at the messages to assert that the model invoked the 's experiment_prefix="claude-3.5-baseline", # optional ) `, - typescript` - // ToDo - `, ]} /> @@ -280,9 +263,6 @@ See more about what arguments you can pass to custom evaluators in this [how-to experiment_prefix="claude-3.5-baseline", # optional ) `, - typescript` - // ToDo - `, ]} /> @@ -307,9 
+287,6 @@ In this case we can even continue using the evaluators we've been using. experiment_prefix="claude-3.5-model-node", # optional ) `, - typescript` - // ToDo - `, ]} /> @@ -458,9 +435,6 @@ In this case we can even continue using the evaluators we've been using. experiment_prefix="claude-3.5-baseline", # optional ) -`, - typescript` -// ToDo `, ]} From 4e2da393d3d188b14e820a0dc43b5e88b63c7648 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Sat, 23 Nov 2024 13:29:44 -0500 Subject: [PATCH 24/29] try redirect --- .../how_to_guides/evaluation/async.mdx | 6 ++-- .../evaluation/custom_evaluator.mdx | 30 +++++++++++-------- .../evaluation/evaluate_llm_application.mdx | 2 +- .../evaluation/evaluate_pairwise.mdx | 5 ++-- .../how_to_guides/evaluation/langgraph.mdx | 1 + vercel.json | 4 +++ 6 files changed, 29 insertions(+), 19 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index 6741c635..bc021428 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -65,8 +65,9 @@ list 5 concrete questions that should be investigated to determine if the idea i researcher_app, data=dataset, evaluators=[concise], - max_concurrency=2, # Optional, no max by default - experiment_prefix="gpt-4o-mini-baseline" # Optional, random by default + # Optional, no max_concurrency by default but it is recommended to set one. + max_concurrency=2, + experiment_prefix="gpt-4o-mini-baseline" # Optional, random by default. ) `, @@ -76,3 +77,4 @@ list 5 concrete questions that should be investigated to determine if the idea i ## Related - [Run an evaluation (synchronously)](../../how_to_guides/evaluation/evaluate_llm_application) +- [Handle model rate limits](./how_to_guides/evaluation/rate_limiting) diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx index 8058c8c4..0db42b13 100644 --- a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx @@ -41,7 +41,7 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r import type { Run, Example } from "langsmith/schemas"; function correct(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.outputs === example.outputs?.output; + const score = rootRun.outputs?.output === example.outputs?.output; return { key: "correct", score }; } `, @@ -53,11 +53,16 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r Custom evaluator functions must have specific argument names. They can take any subset of the following arguments: +Python and JS/TS + +- `run: langsmith.schemas.Run`: The full Run object generated by the application on the given example. +- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available). + +Currently Python only + - `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset. - `outputs: dict`: A dictionary of the outputs generated by the application on the given `inputs`. - `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available. -- `run: langsmith.schemas.Run`: The full Run object generated by the application on the given example. 
-- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available). For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application. @@ -65,9 +70,14 @@ For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs` Custom evaluators are expected to return one of the following types: +Python and JS/TS + +- `dict`: dicts of the form `{"score" | "value": ..., "name": ...}` allow you to customize the metric type ("score" for numerical and "value" for categorical) and metric name. This if useful if, for example, you want to log an integer as a categorical metric. + +Currently Python only + - `int | float | bool`: this is interepreted as an continuous metric that can be averaged, sorted, etc. The function name is used as the name of the metric. - `str`: this is intepreted as a categorical metric. The function name is used as the name of the metric. -- `dict`: dicts of the form `{"score" | "value": ..., "name": ...}` allow you to customize the metric type ("score" for numerical and "value" for categorical) and metric name. This if useful if, for example, you want to log an integer as a categorical metric. - `list[dict]`: return multiple metrics using a single function. ## Additional examples @@ -81,14 +91,17 @@ Custom evaluators are expected to return one of the following types: # Assumes you've installed pydantic. from pydantic import BaseModel + # Compare actual and reference outputs def correct(outputs: dict, reference_outputs: dict) -> bool: """Check if the answer exactly matches the expected answer.""" return outputs["answer"] == reference_outputs["answer"] + # Just evaluate actual outputs def concision(outputs: dict) -> int: """Score how concise the answer is. 
1 is the most concise, 5 is the least concise.""" return min(len(outputs["answer"]) // 1000, 4) + 1 + # Use an LLM-as-a-judge oai_client = wrappers.wrap_openai(AsyncOpenAI()) async def valid_reasoning(inputs: dict, outputs: dict) -> bool: @@ -119,15 +132,6 @@ answer is logically valid and consistent with question and the answer.""" evaluators=[correct, concision, valid_reasoning] ) `, - typescript` - import type { EvaluationResult } from "langsmith/evaluation"; - import type { Run, Example } from "langsmith/schemas"; - - function correct(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.outputs === example.outputs?.output; - return { key: "correct", score }; - } - `, ]} /> diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx index 7d368195..fdefed61 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx @@ -345,7 +345,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i // Row-level evaluator function correct(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.outputs === example.outputs?.output; + const score = rootRun.outputs?.output === example.outputs?.outputs; return { key: "correct", score }; } diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx index 24f6c710..d68b48b7 100644 --- a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx +++ b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx @@ -93,10 +93,9 @@ which asks the LLM to decide which is better between two AI assistant responses. :::info Optional LangChain Usage -In the Python example below, we are pulling [this structured prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) from the [LangChain Hub](../../../prompt_engineering/how_to_guides/prompts/langchain_hub) and using it with a LangChain LLM wrapper. -The prompt asks the LLM to decide which is better between two AI assistant responses. It uses structured output to parse the AI's response: 0, 1, or 2. +In the Python example below, we are pulling [this structured prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) from the [LangChain Hub](../../../prompt_engineering/how_to_guides/prompts/langchain_hub) and using it with a LangChain chat model wrapper. -**Usage of LangChain is totally optional.** To illustrate this point, the TypeScript example below uses the OpenAI API directly. +**Usage of LangChain is totally optional.** To illustrate this point, the TypeScript example uses the OpenAI SDK directly. 
::: diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx index 26308cdb..ce7ae1ed 100644 --- a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx +++ b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx @@ -125,6 +125,7 @@ Let's create a simple dataset of questions and expected responses: outputs=[{"answers": a} for a in answers], ) `, + ]} /> diff --git a/vercel.json b/vercel.json index 4ceba045..ddf7eee3 100644 --- a/vercel.json +++ b/vercel.json @@ -185,6 +185,10 @@ { "source": "/tutorials/Developers/optimize_classifier", "destination": "/prompt_engineering/tutorials/optimize_classifier" + }, + { + "source": "evaluation/how_to_guides/evaluation/evaluate_llm_application#evaluate-on-a-particular-version-of-a-dataset", + "destination": "evaluation/how_to_guides/evaluation/dataset_version" } ], "builds": [ From f6507d08fbc3beff89aeb4a5f697be925a997351 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Sat, 23 Nov 2024 13:53:37 -0500 Subject: [PATCH 25/29] fix --- .../how_to_guides/evaluation/async.mdx | 2 +- .../evaluation/custom_evaluator.mdx | 2 +- .../how_to_guides/evaluation/llm_as_judge.mdx | 22 ++++++-------- .../how_to_guides/evaluation/metric_type.mdx | 2 +- .../evaluation/multiple_scores.mdx | 30 +++++++++++-------- .../how_to_guides/evaluation/summary.mdx | 2 +- docs/evaluation/how_to_guides/index.md | 3 +- 7 files changed, 31 insertions(+), 32 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx index bc021428..dfd7fc2e 100644 --- a/docs/evaluation/how_to_guides/evaluation/async.mdx +++ b/docs/evaluation/how_to_guides/evaluation/async.mdx @@ -77,4 +77,4 @@ list 5 concrete questions that should be investigated to determine if the idea i ## Related - [Run an evaluation (synchronously)](../../how_to_guides/evaluation/evaluate_llm_application) -- [Handle model rate limits](./how_to_guides/evaluation/rate_limiting) +- [Handle model rate limits](../../how_to_guides/evaluation/rate_limiting) diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx index 0db42b13..475cbf8f 100644 --- a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx @@ -85,7 +85,7 @@ Currently Python only =0.1.145`"})` from langsmith import evaluate, wrappers from openai import AsyncOpenAI # Assumes you've installed pydantic. diff --git a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx index 2f6d6655..c8a0b8f7 100644 --- a/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx +++ b/docs/evaluation/how_to_guides/evaluation/llm_as_judge.mdx @@ -23,8 +23,8 @@ For maximal control of evaluator logic, we can write a custom evaluator and run =0.1.145`"})` + from langsmith import evaluate, traceable, wrappers, Client from openai import OpenAI # Assumes you've installed pydantic from pydantic import BaseModel @@ -50,34 +50,30 @@ for the answer is logically valid and consistent with question and the answer.\\ messages=[{"role": "system", "content": instructions,}, {"role": "user", "content": msg}], response_format=Response ) - return response.choices[0].messages.parsed.reasoning_is_valid + return response.choices[0].message.parsed.reasoning_is_valid # Optionally add the 'traceable' decorator to trace the inputs/outputs of this function. 
@traceable def dummy_app(inputs: dict) -> dict: return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"} + ls_client = Client() + questions = ["how will the universe end", "are we alone"] + dataset = ls_client.create_dataset("big questions") + ls_client.create_examples(dataset_id=dataset.id, inputs=[{"question": q} for q in questions]) + results = evaluate( dummy_app, - data="dataset_name", + data=dataset, evaluators=[valid_reasoning] ) `, - typescript` - import type { EvaluationResult } from "langsmith/evaluation"; - import type { Run, Example } from "langsmith/schemas"; - - `, ]} /> See [here](../../how_to_guides/evaluation/custom_evaluator) for more on how to write a custom evaluator. -## Builtin evaluator via the UI - -See [here](../../how_to_guides/evaluation/builtin_evaluators) for how to use LangSmith's builtin evaluators. - ## Prebuilt evaluator via `langchain` See [here](../../how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) for how to use prebuilt evaluators from `langchain`. diff --git a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx index cd97786d..5c61b3a7 100644 --- a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx +++ b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx @@ -29,7 +29,7 @@ Here are some examples: =0.1.145`"})` def numerical_metric(inputs: dict, outputs: dict, reference_outputs: dict) -> float: # Evaluation logic... diff --git a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx index dec2f56e..069b3337 100644 --- a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx +++ b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx @@ -6,22 +6,26 @@ import { # How to return multiple scores in one evaluator -Sometimes it is useful for a [custom evaluator function](../../how_to_guides/evaluation/custom_evaluator) to return multiple metrics. +Sometimes it is useful for a [custom evaluator function](../../how_to_guides/evaluation/custom_evaluator) or [summary evaluator function](../../how_to_guides/evaluation/summary) to return multiple metrics. For example, if you have multiple metrics being generated by an LLM judge, you can save time and money by making a single LLM call that generates multiple metrics instead of making multiple LLM calls. -To return multiple scores, simply return a list of dictionaries/objects of the following form: +To return multiple scores using the Python SDK, simply return a list of dictionaries/objects of the following form: ```python -{ - [ - # 'key' is the metric name - # 'score' is the value of a numerical metric - {"key": string, "score": number}, - # 'value' is the value of a categorical metric - {"key": string, "value": string}, - ... # You may log as many as you wish - ] -} +[ + # 'key' is the metric name + # 'score' is the value of a numerical metric + {"key": string, "score": number}, + # 'value' is the value of a categorical metric + {"key": string, "value": string}, + ... # You may log as many as you wish +] +``` + +To do so with the JS/TS SDK, return an object with a 'results' key and then a list of the above form + +```js +{results: [{ key: string, score: number }, ...]}; ``` Each of these dictionaries can contain any or all of the [feedback fields](/reference/data_formats/feedback_data_format); check out the linked document for more information. 
@@ -31,7 +35,7 @@ Example: =0.1.145`"})` def multiple_scores(outputs: dict, reference_outputs: dict) -> list[dict]: # Replace with real evaluation logic. precision = 0.8 diff --git a/docs/evaluation/how_to_guides/evaluation/summary.mdx b/docs/evaluation/how_to_guides/evaluation/summary.mdx index a248905b..97fd68bf 100644 --- a/docs/evaluation/how_to_guides/evaluation/summary.mdx +++ b/docs/evaluation/how_to_guides/evaluation/summary.mdx @@ -4,7 +4,7 @@ import { typescript, } from "@site/src/components/InstructionsWithCode"; -# How to run an aggregate evaluation +# How to define a summary evaluator Some metrics can only be defined on the entire experiment level as opposed to the individual runs of the experiment. For example, you may want to compute the overall pass rate or f1 score of your evaluation target across all examples in the dataset. diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 698f00fd..9fa120a6 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -26,9 +26,8 @@ Evaluate and improve your application before deploying it. - [Define a custom evaluator](./how_to_guides/evaluation/custom_evaluator) - [Define an LLM-as-a-judge evaluator](./how_to_guides/evaluation/llm_as_judge) - [Define a pairwise evaluator](./how_to_guides/evaluation/evaluate_pairwise) +- [Define a summary evaluator](./how_to_guides/evaluation/summary) - [Use an off-the-shelf evaluator via the SDK (Python only)](./how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) -- [Use an off-the-shelf evaluator via the UI](./how_to_guides/evaluation/builtin_evaluators) -- [Evaluate aggregate experiment results](./how_to_guides/evaluation/summary) - [Evaluate intermediate steps](./how_to_guides/evaluation/evaluate_on_intermediate_steps) - [Return multiple metrics in one evaluator](./how_to_guides/evaluation/multiple_scores) - [Return categorical vs numerical metrics](./how_to_guides/evaluation/metric_type) From d8b3fc42be9cc587af1d873c2d070d09951961f6 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Sat, 23 Nov 2024 13:58:07 -0500 Subject: [PATCH 26/29] fix --- .../evaluation/custom_evaluator.mdx | 4 +-- .../how_to_guides/evaluation/metric_type.mdx | 25 +++++++------------ vercel.json | 4 +-- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx index 475cbf8f..658c770e 100644 --- a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx +++ b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx @@ -40,8 +40,8 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r import type { EvaluationResult } from "langsmith/evaluation"; import type { Run, Example } from "langsmith/schemas"; - function correct(rootRun: Run, example: Example): EvaluationResult { - const score = rootRun.outputs?.output === example.outputs?.output; + function correct(run: Run, example: Example): EvaluationResult { + const score = run.outputs?.output === example.outputs?.output; return { key: "correct", score }; } `, diff --git a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx index 5c61b3a7..371cdd92 100644 --- a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx +++ b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx @@ -6,22 +6,16 @@ import { # How to return categorical vs numerical metrics 
-:::info Key concepts - -- Metrics - -::: - LangSmith supports both categorical and numerical metrics, and you can return either when writing a [custom evaluator](../../how_to_guides/evaluation/custom_evaluator). For an evaluator result to be logged as a numerical metric, it must returned as: -- an `int`, `float`, or `bool` +- (Python only) an `int`, `float`, or `bool` - a dict of the form `{"key": "metric_name", "score": int | float | bool}` For an evaluator result to be logged as a categorical metric, it must be returned as: -- a `str` +- (Python only) a `str` - a dict of the form `{"key": "metric_name", "value": str | int | float | bool}` Here are some examples: @@ -58,15 +52,14 @@ Here are some examples: })` import type { Run, Example } from "langsmith/schemas"; - function multipleScores(rootRun: Run, example: Example) { + function numericalMetric(run: Run, example: Example) { + // Your evaluation logic here + return { key: "numerical_metric", score: 0.8}; + } + + function categoricalMetric(run: Run, example: Example) { // Your evaluation logic here - return { - results: [ - { key: "precision", score: 0.8 }, - { key: "recall", score: 0.9 }, - { key: "f1", score: 0.85 }, - ], - }; + return { key: "categorical_metric", value: "english"}; } `, diff --git a/vercel.json b/vercel.json index ddf7eee3..8ea82cb1 100644 --- a/vercel.json +++ b/vercel.json @@ -187,8 +187,8 @@ "destination": "/prompt_engineering/tutorials/optimize_classifier" }, { - "source": "evaluation/how_to_guides/evaluation/evaluate_llm_application#evaluate-on-a-particular-version-of-a-dataset", - "destination": "evaluation/how_to_guides/evaluation/dataset_version" + "source": "/evaluation/how_to_guides/evaluation/evaluate_llm_application#evaluate-on-a-particular-version-of-a-dataset", + "destination": "/evaluation/how_to_guides/evaluation/dataset_version" } ], "builds": [ From a6df8ea65151ec720322abc4421aa8f01df26e64 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Sat, 23 Nov 2024 14:05:18 -0500 Subject: [PATCH 27/29] fix --- docs/evaluation/concepts/index.mdx | 6 +++--- docs/evaluation/how_to_guides/datasets/version_datasets.mdx | 2 +- .../evaluation/how_to_guides/evaluation/check_evaluator.mdx | 0 docs/evaluation/how_to_guides/evaluation/metric_type.mdx | 4 ++++ .../evaluation/how_to_guides/evaluation/multiple_scores.mdx | 4 ++++ docs/evaluation/how_to_guides/index.md | 1 - docs/evaluation/tutorials/agents.mdx | 4 ++-- 7 files changed, 14 insertions(+), 7 deletions(-) delete mode 100644 docs/evaluation/how_to_guides/evaluation/check_evaluator.mdx diff --git a/docs/evaluation/concepts/index.mdx b/docs/evaluation/concepts/index.mdx index ab0f1f68..d068081c 100644 --- a/docs/evaluation/concepts/index.mdx +++ b/docs/evaluation/concepts/index.mdx @@ -225,7 +225,7 @@ LangSmith evaluations are kicked off using a single function, `evaluate`, which :::tip -See documentation on using `evaluate` [here](https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_llm_application#step-4-run-the-evaluation-and-view-the-results). +See documentation on using `evaluate` [here](https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_llm_application). 
::: @@ -236,7 +236,7 @@ One of the most common questions when evaluating AI applications is: how can I b :::tip - See the [video on `Repetitions` in our LangSmith Evaluation series](https://youtu.be/Pvz24JdzzF8) -- See our documentation on [`Repetitions`](https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_llm_application#evaluate-on-a-dataset-with-repetitions) +- See our documentation on [`Repetitions`](https://docs.smith.langchain.com/how_to_guides/evaluation/repetition) ::: @@ -434,7 +434,7 @@ Classification / Tagging applies a label to a given input (e.g., for toxicity de A central consideration for Classification / Tagging evaluation is whether you have a dataset with `reference` labels or not. If not, users frequently want to define an evaluator that uses criteria to apply label (e.g., toxicity, etc) to an input (e.g., text, user-question, etc). However, if ground truth class labels are provided, then the evaluation objective is focused on scoring a Classification / Tagging chain relative to the ground truth class label (e.g., using metrics such as precision, recall, etc). -If ground truth reference labels are provided, then it's common to simply define a [custom heuristic evaluator](https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators) to compare ground truth labels to the chain output. However, it is increasingly common given the emergence of LLMs simply use `LLM-as-judge` to perform the Classification / Tagging of an input based upon specified criteria (without a ground truth reference). +If ground truth reference labels are provided, then it's common to simply define a [custom heuristic evaluator](https://docs.smith.langchain.com/how_to_guides/evaluation/custom_evaluator) to compare ground truth labels to the chain output. However, it is increasingly common given the emergence of LLMs simply use `LLM-as-judge` to perform the Classification / Tagging of an input based upon specified criteria (without a ground truth reference). `Online` or `Offline` evaluation is feasible when using `LLM-as-judge` with the `Reference-free` prompt used. In particular, this is well suited to `Online` evaluation when a user wants to tag / classify application input (e.g., for toxicity, etc). diff --git a/docs/evaluation/how_to_guides/datasets/version_datasets.mdx b/docs/evaluation/how_to_guides/datasets/version_datasets.mdx index be7cc8bd..0f15f123 100644 --- a/docs/evaluation/how_to_guides/datasets/version_datasets.mdx +++ b/docs/evaluation/how_to_guides/datasets/version_datasets.mdx @@ -46,4 +46,4 @@ client.update_dataset_tag( ) ``` -To run an evaluation on a particular tagged version of a dataset, you can follow [this guide](../evaluation/evaluate_llm_application#evaluate-on-a-particular-version-of-a-dataset). +To run an evaluation on a particular tagged version of a dataset, you can follow [this guide](../evaluation/dataset_version). 
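The `version_datasets.mdx` hunk above points readers to the dataset-version guide for evaluating against a tagged dataset version. As a rough sketch of that pattern (the dataset name `"weather agent"`, the `"prod"` tag, and the toy target and evaluator are placeholders, not part of this patch series):

```python
from langsmith import Client, evaluate

ls_client = Client()

def correct(outputs: dict, reference_outputs: dict) -> bool:
    # Toy exact-match check against the reference output.
    return outputs["answer"] == reference_outputs["answer"]

results = evaluate(
    lambda inputs: {"answer": "It's 60 degrees and foggy."},  # stand-in for your application
    # `as_of` pins the examples to a tagged version of the dataset (e.g. "prod").
    data=ls_client.list_examples(dataset_name="weather agent", as_of="prod"),
    evaluators=[correct],
)
```

Because `data=` accepts any iterable of examples, the tag-based filter composes with `evaluate` without further changes.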
diff --git a/docs/evaluation/how_to_guides/evaluation/check_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/check_evaluator.mdx deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx index 371cdd92..0f366dc8 100644 --- a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx +++ b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx @@ -65,3 +65,7 @@ Here are some examples: ]} /> + +## Related + +- [Return multiple metrics in one evaluator](./how_to_guides/evaluation/multiple_scores) diff --git a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx index 069b3337..c1f21e94 100644 --- a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx +++ b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx @@ -72,3 +72,7 @@ Example: Rows from the resulting experiment will display each of the scores. ![](../evaluation/static/multiple_scores.png) + +## Related + +- [Return categorical vs numerical metrics](./how_to_guides/evaluation/metric_type) diff --git a/docs/evaluation/how_to_guides/index.md b/docs/evaluation/how_to_guides/index.md index 9fa120a6..7c3226d6 100644 --- a/docs/evaluation/how_to_guides/index.md +++ b/docs/evaluation/how_to_guides/index.md @@ -31,7 +31,6 @@ Evaluate and improve your application before deploying it. - [Evaluate intermediate steps](./how_to_guides/evaluation/evaluate_on_intermediate_steps) - [Return multiple metrics in one evaluator](./how_to_guides/evaluation/multiple_scores) - [Return categorical vs numerical metrics](./how_to_guides/evaluation/metric_type) -- [Check your evaluator setup](./how_to_guides/evaluation/check_evaluator) ### Configure the evaluation data diff --git a/docs/evaluation/tutorials/agents.mdx b/docs/evaluation/tutorials/agents.mdx index c819f52e..9efd0f73 100644 --- a/docs/evaluation/tutorials/agents.mdx +++ b/docs/evaluation/tutorials/agents.mdx @@ -460,7 +460,7 @@ See the full overview of single step evaluation in our [conceptual guide](https: ::: -We can check a specific tool call using [a custom evaluator](https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators): +We can check a specific tool call using [a custom evaluator](https://docs.smith.langchain.com/how_to_guides/evaluation/custom_evaluator): - Here, we just invoke the assistant, `assistant_runnable`, with a prompt and check if the resulting tool call is as expected. - Here, we are using a specialized agent where the tools are hard-coded (rather than passed with the dataset input). @@ -507,7 +507,7 @@ experiment_results = evaluate( ### Trajectory -We can check a trajectory of tool calls using [custom evaluators](https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators): +We can check a trajectory of tool calls using [custom evaluators](https://docs.smith.langchain.com/how_to_guides/evaluation/custom_evaluator): - Here, we just invoke the agent, `graph.invoke`, with a prompt. - Here, we are using a specialized agent where the tools are hard-coded (rather than passed with the dataset input). 
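The `agents.mdx` hunks above describe checking a trajectory of tool calls with a custom evaluator; a minimal sketch of one possible scorer follows. It assumes the graph output carries a `messages` list whose entries expose `tool_calls`, and that the reference output stores an ordered `expected_tools` list — those names are illustrative, not taken from this patch.

```python
def trajectory_subsequence(outputs: dict, reference_outputs: dict) -> float:
    """Score how much of the expected tool-call sequence appears, in order, in the actual run."""
    # Tool names the agent actually called, in order of appearance.
    actual_tools = [
        tool_call["name"]
        for message in outputs["messages"]
        for tool_call in (getattr(message, "tool_calls", None) or [])
    ]
    expected_tools = reference_outputs["expected_tools"]
    if not expected_tools:
        return 1.0
    # Greedily match expected tools as an in-order subsequence of the actual calls.
    remaining = iter(actual_tools)
    matched = sum(1 for tool in expected_tools if tool in remaining)
    return matched / len(expected_tools)
```

Returning a float keeps this a numerical metric, consistent with the metric-type conventions introduced earlier in this series.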
From ce3a19dbb60c91dd5ba3cb09ef4d2f4f5f0a18d9 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Sat, 23 Nov 2024 14:07:40 -0500 Subject: [PATCH 28/29] nit --- docs/evaluation/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/evaluation/index.mdx b/docs/evaluation/index.mdx index f82cf5f0..a88c782f 100644 --- a/docs/evaluation/index.mdx +++ b/docs/evaluation/index.mdx @@ -54,7 +54,7 @@ export LANGCHAIN_API_KEY=`), =0.1.145`"})` from langsmith import evaluate, Client # 1. Create and/or select your dataset From 1a194f07d8396db05336fb97786fac6236a2028f Mon Sep 17 00:00:00 2001 From: Bagatur Date: Sat, 23 Nov 2024 14:10:02 -0500 Subject: [PATCH 29/29] fix --- docs/evaluation/how_to_guides/evaluation/metric_type.mdx | 2 +- docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx index 0f366dc8..a3aa401a 100644 --- a/docs/evaluation/how_to_guides/evaluation/metric_type.mdx +++ b/docs/evaluation/how_to_guides/evaluation/metric_type.mdx @@ -68,4 +68,4 @@ Here are some examples: ## Related -- [Return multiple metrics in one evaluator](./how_to_guides/evaluation/multiple_scores) +- [Return multiple metrics in one evaluator](../../how_to_guides/evaluation/multiple_scores) diff --git a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx index c1f21e94..2a433002 100644 --- a/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx +++ b/docs/evaluation/how_to_guides/evaluation/multiple_scores.mdx @@ -75,4 +75,4 @@ Rows from the resulting experiment will display each of the scores. ## Related -- [Return categorical vs numerical metrics](./how_to_guides/evaluation/metric_type) +- [Return categorical vs numerical metrics](../../how_to_guides/evaluation/metric_type)
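As a closing illustration of the conventions that the `metric_type.mdx` and `multiple_scores.mdx` pages in this series settle on, a single evaluator can emit a numerical and a categorical metric in one pass — a minimal sketch with placeholder logic and metric names:

```python
def quality_metrics(outputs: dict, reference_outputs: dict) -> list[dict]:
    # Replace with real evaluation logic.
    exact_match = outputs["answer"] == reference_outputs["answer"]
    return [
        # Numerical metric: logged under "score".
        {"key": "exact_match", "score": int(exact_match)},
        # Categorical metric: logged under "value".
        {"key": "answer_language", "value": "english"},
    ]
```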