fix
baskaryan committed Nov 23, 2024
1 parent 6c23581 commit 0524905
Showing 4 changed files with 32 additions and 41 deletions.
@@ -160,7 +160,7 @@ Since we have labels for this task, our evaluator can directly check if the actu
<CodeTabs
groupId="client-language"
tabs={[
python`
python({ caption: "Requires `langsmith>=0.1.145`" })`
def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
return outputs["output"] == reference_outputs["label"]
`,
@@ -169,7 +169,7 @@ Since we have labels for this task, our evaluator can directly check if the actu
import type { Run, Example } from "langsmith/schemas";
import type { EvaluationResult } from "langsmith/evaluation";
function correct(rootRun: Run, example: Example): EvaluationResult {
const score = rootRun.outputs?.outputs === example.outputs?.output;
const score = rootRun.outputs?.output === example.outputs?.outputs;
return { key: "correct", score };
}
`,
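For context, a minimal sketch of how an evaluator with this signature can be plugged into an experiment. The target function and dataset name below are hypothetical stand-ins, and `langsmith>=0.1.145` is assumed:

```python
from langsmith import evaluate

def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    # Exact-match check against the labeled reference output.
    return outputs["output"] == reference_outputs["label"]

def target(inputs: dict) -> dict:
    # Hypothetical application under test; replace with your own app logic.
    return {"output": "Not toxic"}

results = evaluate(
    target,
    data="Toxic Queries",  # hypothetical dataset name
    evaluators=[correct],
    experiment_prefix="baseline",  # optional
)
```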
22 changes: 16 additions & 6 deletions docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx
@@ -27,7 +27,7 @@ If you haven't already created experiments to compare, check out our [quick star
## `evaluate_comparative` args

:::note
Pairwise evaluations currently require `langsmith` SDK Python version `>=0.1.55` or JS version `>=0.1.24`.
Pairwise evaluations require `langsmith` SDK Python version `>=0.1.145` or JS version `>=0.1.24`.
:::

At its simplest, the `evaluate_comparative` / `evaluateComparative` function takes the following arguments:
@@ -57,21 +57,31 @@ Pairwise evaluators are just functions with an expected signature.

Custom evaluator functions must have specific argument names. They can take any subset of the following arguments:

Python and JS/TS

- `runs: list[langsmith.schemas.Run]`: A two-item list of the full Run objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run.
- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metadata (if available).

Currently Python only

- `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset.
- `outputs: list[dict]`: A two-item list of the outputs produced by each experiment on the given inputs.
- `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available.
- `runs: list[langsmith.schemas.Run]`: A two-item list of the full Run objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run.
- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metadata (if available).

For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application.
For most Python use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `runs` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application.
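As an illustration, a sketch of a Python pairwise evaluator that uses only `outputs` and `reference_outputs` (the `"label"` key is an assumption about the dataset schema; the two-item list of scores it returns is the format described in the next section):

```python
def matches_reference(outputs: list[dict], reference_outputs: dict) -> list:
    # Made-up rule: each experiment scores 1 if its output equals the reference label.
    label = reference_outputs.get("label")
    return [int(o.get("output") == label) for o in outputs]
```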

### Evaluator output

Custom evaluators are expected to return one of the following types:

Python and JS/TS

- `dict`: dictionary with keys:
- `key`, which represents the feedback key that will be logged
- `scores`, which is a mapping from run ID to score for that run.

Currently Python only

- `list[int | float | bool]`: a two-item list of scores. The list is assumed to have the same order as the `runs` / `outputs` evaluator args. The evaluator function name is used for the feedback key.

Note that you should choose a feedback key that is distinct from the standard feedback keys on your runs. We recommend prefixing pairwise feedback keys with `pairwise_` or `ranked_`.
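To make the dict form concrete, here is a sketch that uses the `runs` argument to build the run-ID-to-score mapping, together with a hypothetical `evaluate_comparative` call that wires it up. The experiment names are placeholders and the scoring rule is made up:

```python
from langsmith import evaluate_comparative
from langsmith.schemas import Example, Run

def pairwise_matches_label(runs: list[Run], example: Example) -> dict:
    # Made-up rule: a run scores 1 if its output equals the example's reference label.
    label = (example.outputs or {}).get("label")
    scores = {run.id: int((run.outputs or {}).get("output") == label) for run in runs}
    # The `pairwise_` prefix keeps this feedback key distinct from standard feedback.
    return {"key": "pairwise_matches_label", "scores": scores}

results = evaluate_comparative(
    # Two existing experiment names (or IDs) to compare; placeholders here.
    ["toxic-queries-gpt-4o", "toxic-queries-claude-3.5"],
    evaluators=[pairwise_matches_label],
)
```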
@@ -81,7 +91,7 @@ Note that you should choose a feedback key that is distinct from standard feedba
The following example uses [a prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2)
which asks the LLM to decide which is better between two AI assistant responses. It uses structured output to parse the AI's response: 0, 1, or 2.

:::note Optional LangChain Usage
:::info Optional LangChain Usage

In the Python example below, we are pulling [this structured prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) from the [LangChain Hub](../../../prompt_engineering/how_to_guides/prompts/langchain_hub) and using it with a LangChain LLM wrapper.
The prompt asks the LLM to decide which is better between two AI assistant responses. It uses structured output to parse the AI's response: 0, 1, or 2.
@@ -92,7 +102,7 @@ The prompt asks the LLM to decide which is better between two AI assistant respo

<CodeTabs
tabs={[
python`
python({caption: "Requires `langsmith>=0.1.145`"})`
from langchain import hub
from langchain.chat_models import init_chat_model
from langsmith import evaluate_comparative
21 changes: 14 additions & 7 deletions docs/evaluation/how_to_guides/evaluation/langchain_runnable.mdx
@@ -6,14 +6,15 @@ import {

# How to evaluate a `langchain` runnable

`langchain` [Runnable](https://python.langchain.com/docs/concepts/runnables/) objects (such as chat models, retrievers, chains, etc.) can be passed directly into `evaluate()` / `aevaluate()`.

:::info
:::info Key concepts

`langchain` refers to the [Python](https://python.langchain.com) and [JS/TS](https://js.langchain.com) OSS frameworks for building LLM applications.
- `langchain`: [Python](https://python.langchain.com) and [JS/TS](https://js.langchain.com)
- Runnable: [Python](https://python.langchain.com/docs/concepts/runnables/) and [JS/TS](https://js.langchain.com/docs/concepts/runnables/)

:::

`langchain` [Runnable](https://python.langchain.com/docs/concepts/runnables/) objects (such as chat models, retrievers, chains, etc.) can be passed directly into `evaluate()` / `aevaluate()`.
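As a quick orientation before the full setup below, a condensed sketch of the idea. The chain, model name, and dataset name are assumptions for illustration rather than the exact setup used later on this page:

```python
from langchain.chat_models import init_chat_model
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langsmith import aevaluate

# A minimal Runnable chain: prompt -> chat model -> string output.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Label the user query as either 'Toxic' or 'Not toxic'."),
    ("user", "{text}"),
])
chain = prompt | init_chat_model("gpt-4o") | StrOutputParser()

def correct(outputs: dict, reference_outputs: dict) -> bool:
    # Non-dict chain outputs are wrapped under an "output" key by the SDK.
    return outputs["output"] == reference_outputs["label"]

# The Runnable itself is the target: it is invoked on each example's inputs.
results = await aevaluate(
    chain,
    data="Toxic Queries",  # assumed dataset name
    evaluators=[correct],
)
```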

## Setup

Let's define a simple chain to evaluate. First, install all the required packages:
@@ -101,7 +102,7 @@ To evaluate our chain we can pass it directly to the `evaluate()` / `aevaluate()
actual = outputs["output"]
expected = reference_outputs["label"]
assert actual == expected
return actual == expected
results = await aevaluate(
chain,
@@ -112,11 +113,17 @@ To evaluate our chain we can pass it directly to the `evaluate()` / `aevaluate()
`,
typescript`
import { evaluate } from "langsmith/evaluation";
import { Client } from "langsmith";
const langsmith = new Client();
const dataset = await langsmith.clonePublicDataset(
"https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d"
)
await evaluate(chain, {
data: datasetName,
data: dataset.name,
evaluators: [correct],
experimentPrefix: "Toxic Queries",
experimentPrefix: "gpt-4o, ba
});
`,

26 changes: 0 additions & 26 deletions docs/evaluation/how_to_guides/evaluation/langgraph.mdx
Expand Up @@ -93,10 +93,6 @@ Lets construct a simple ReACT agent to start:
app = workflow.compile()
`,
typescript`
// ToDo
`,

]}
/>

@@ -129,10 +125,6 @@ Let's create a simple dataset of questions and expected responses:
outputs=[{"answers": a} for a in answers],
)
`,
typescript`
// ToDo
`,

]}
/>

@@ -172,9 +164,6 @@ And a simple evaluator:
)
return response.content.upper() == "CORRECT"
`,
typescript`
// ToDo
`,

]}
/>
@@ -213,9 +202,6 @@ If any of your nodes are defined as async, you'll need to use `aevaluate`
)
`,
typescript`
// ToDo
`,

]}
/>
@@ -244,9 +230,6 @@ For example, we can look at the messages to assert that the model invoked the 's
experiment_prefix="claude-3.5-baseline", # optional
)
`,
typescript`
// ToDo
`,

]}
/>
@@ -280,9 +263,6 @@ See more about what arguments you can pass to custom evaluators in this [how-to
experiment_prefix="claude-3.5-baseline", # optional
)
`,
typescript`
// ToDo
`,

]}
/>
@@ -307,9 +287,6 @@ In this case we can even continue using the evaluators we've been using.
experiment_prefix="claude-3.5-model-node", # optional
)
`,
typescript`
// ToDo
`,

]}
/>
@@ -458,9 +435,6 @@ In this case we can even continue using the evaluators we've been using.
experiment_prefix="claude-3.5-baseline", # optional
)
`,
typescript`
// ToDo
`,

]}
