From 4e2da393d3d188b14e820a0dc43b5e88b63c7648 Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Sat, 23 Nov 2024 13:29:44 -0500
Subject: [PATCH] try redirect

---
 .../how_to_guides/evaluation/async.mdx        |  6 ++--
 .../evaluation/custom_evaluator.mdx           | 30 +++++++++++--------
 .../evaluation/evaluate_llm_application.mdx   |  2 +-
 .../evaluation/evaluate_pairwise.mdx          |  5 ++--
 .../how_to_guides/evaluation/langgraph.mdx    |  1 +
 vercel.json                                   |  4 +++
 6 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/docs/evaluation/how_to_guides/evaluation/async.mdx b/docs/evaluation/how_to_guides/evaluation/async.mdx
index 6741c635..bc021428 100644
--- a/docs/evaluation/how_to_guides/evaluation/async.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/async.mdx
@@ -65,8 +65,9 @@ list 5 concrete questions that should be investigated to determine if the idea i
         researcher_app,
         data=dataset,
         evaluators=[concise],
-        max_concurrency=2,  # Optional, no max by default
-        experiment_prefix="gpt-4o-mini-baseline"  # Optional, random by default
+        # Optional; no max_concurrency by default, but it is recommended to set one.
+        max_concurrency=2,
+        experiment_prefix="gpt-4o-mini-baseline"  # Optional, random by default.
     )
   `,

@@ -76,3 +77,4 @@ list 5 concrete questions that should be investigated to determine if the idea i
 ## Related

 - [Run an evaluation (synchronously)](../../how_to_guides/evaluation/evaluate_llm_application)
+- [Handle model rate limits](../../how_to_guides/evaluation/rate_limiting)
diff --git a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx
index 8058c8c4..0db42b13 100644
--- a/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/custom_evaluator.mdx
@@ -41,7 +41,7 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r
     import type { Run, Example } from "langsmith/schemas";

     function correct(rootRun: Run, example: Example): EvaluationResult {
-      const score = rootRun.outputs?.outputs === example.outputs?.output;
+      const score = rootRun.outputs?.output === example.outputs?.output;
       return { key: "correct", score };
     }
   `,
@@ -53,11 +53,16 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r

 Custom evaluator functions must have specific argument names. They can take any subset of the following arguments:

+Python and JS/TS
+
+- `run: langsmith.schemas.Run`: The full Run object generated by the application on the given example.
+- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metadata (if available).
+
+Currently Python only
+
 - `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset.
 - `outputs: dict`: A dictionary of the outputs generated by the application on the given `inputs`.
 - `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available.
-- `run: langsmith.schemas.Run`: The full Run object generated by the application on the given example.
-- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available).

 For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application.
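The argument-name split documented above is easiest to see in running code. Below is a minimal sketch (not part of the patch) of two equivalent evaluators: one using the `run`/`example` signature that both SDKs support, and one using the Python-only dict arguments. The `"answer"` key is an assumed dataset field used only for illustration.

```python
from langsmith.schemas import Example, Run


def correct_from_run(run: Run, example: Example) -> bool:
    """run/example signature: the form available in both the Python and JS/TS SDKs."""
    # run.outputs and example.outputs may be None, so guard before comparing.
    return (run.outputs or {}).get("answer") == (example.outputs or {}).get("answer")


def correct_from_dicts(outputs: dict, reference_outputs: dict) -> bool:
    """Python-only signature: the SDK passes plain dicts instead of schema objects."""
    return outputs.get("answer") == reference_outputs.get("answer")
```

Either function can be passed to `evaluate()` unchanged; only the argument names differ.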
@@ -65,9 +70,14 @@ For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`

 Custom evaluators are expected to return one of the following types:

+Python and JS/TS
+
+- `dict`: dicts of the form `{"score" | "value": ..., "name": ...}` allow you to customize the metric type ("score" for numerical and "value" for categorical) and metric name. This is useful if, for example, you want to log an integer as a categorical metric.
+
+Currently Python only
+
 - `int | float | bool`: this is interepreted as an continuous metric that can be averaged, sorted, etc. The function name is used as the name of the metric.
 - `str`: this is intepreted as a categorical metric. The function name is used as the name of the metric.
-- `dict`: dicts of the form `{"score" | "value": ..., "name": ...}` allow you to customize the metric type ("score" for numerical and "value" for categorical) and metric name. This if useful if, for example, you want to log an integer as a categorical metric.
 - `list[dict]`: return multiple metrics using a single function.

 ## Additional examples
@@ -81,14 +91,17 @@ Custom evaluators are expected to return one of the following types:
     # Assumes you've installed pydantic.
     from pydantic import BaseModel

+    # Compare actual and reference outputs
     def correct(outputs: dict, reference_outputs: dict) -> bool:
         """Check if the answer exactly matches the expected answer."""
         return outputs["answer"] == reference_outputs["answer"]

+    # Just evaluate actual outputs
     def concision(outputs: dict) -> int:
         """Score how concise the answer is. 1 is the most concise, 5 is the least concise."""
         return min(len(outputs["answer"]) // 1000, 4) + 1

+    # Use an LLM-as-a-judge
     oai_client = wrappers.wrap_openai(AsyncOpenAI())

     async def valid_reasoning(inputs: dict, outputs: dict) -> bool:
@@ -119,15 +132,6 @@ answer is logically valid and consistent with question and the answer."""
         evaluators=[correct, concision, valid_reasoning]
     )
   `,
-  typescript`
-    import type { EvaluationResult } from "langsmith/evaluation";
-    import type { Run, Example } from "langsmith/schemas";
-
-    function correct(rootRun: Run, example: Example): EvaluationResult {
-      const score = rootRun.outputs?.outputs === example.outputs?.output;
-      return { key: "correct", score };
-    }
-  `,
   ]}
 />

diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx
index 7d368195..fdefed61 100644
--- a/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/evaluate_llm_application.mdx
@@ -345,7 +345,7 @@ _If you've annotated your code for tracing, you can open the trace of each row i

     // Row-level evaluator
     function correct(rootRun: Run, example: Example): EvaluationResult {
-      const score = rootRun.outputs?.outputs === example.outputs?.output;
+      const score = rootRun.outputs?.output === example.outputs?.outputs;
       return { key: "correct", score };
     }

diff --git a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx
index 24f6c710..d68b48b7 100644
--- a/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/evaluate_pairwise.mdx
@@ -93,10 +93,9 @@ which asks the LLM to decide which is better between two AI assistant responses.
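To make the return-type rules above concrete, here is a minimal sketch (not part of the patch) of one evaluator that combines the `dict` form with the Python-only `list[dict]` form. The `"answer"` key and the metric names are illustrative assumptions.

```python
def answer_metrics(outputs: dict, reference_outputs: dict) -> list[dict]:
    """Return two metrics from a single evaluator."""
    exact = outputs.get("answer") == reference_outputs.get("answer")
    length_bucket = min(len(outputs.get("answer", "")) // 1000, 4) + 1
    return [
        # "score" logs a numeric metric under a custom name.
        {"score": int(exact), "name": "exact_match"},
        # "value" logs a categorical metric, even though the bucket is an integer.
        {"value": length_bucket, "name": "length_bucket"},
    ]
```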
 :::info Optional LangChain Usage

-In the Python example below, we are pulling [this structured prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) from the [LangChain Hub](../../../prompt_engineering/how_to_guides/prompts/langchain_hub) and using it with a LangChain LLM wrapper.
-The prompt asks the LLM to decide which is better between two AI assistant responses. It uses structured output to parse the AI's response: 0, 1, or 2.
+In the Python example below, we are pulling [this structured prompt](https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2) from the [LangChain Hub](../../../prompt_engineering/how_to_guides/prompts/langchain_hub) and using it with a LangChain chat model wrapper.

-**Usage of LangChain is totally optional.** To illustrate this point, the TypeScript example below uses the OpenAI API directly.
+**Usage of LangChain is totally optional.** To illustrate this point, the TypeScript example uses the OpenAI SDK directly.

 :::

diff --git a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx
index 26308cdb..ce7ae1ed 100644
--- a/docs/evaluation/how_to_guides/evaluation/langgraph.mdx
+++ b/docs/evaluation/how_to_guides/evaluation/langgraph.mdx
@@ -125,6 +125,7 @@ Let's create a simple dataset of questions and expected responses:
         outputs=[{"answers": a} for a in answers],
     )
   `,
+
   ]}
 />

diff --git a/vercel.json b/vercel.json
index 4ceba045..ddf7eee3 100644
--- a/vercel.json
+++ b/vercel.json
@@ -185,6 +185,10 @@
     {
       "source": "/tutorials/Developers/optimize_classifier",
       "destination": "/prompt_engineering/tutorials/optimize_classifier"
+    },
+    {
+      "source": "/evaluation/how_to_guides/evaluation/evaluate_llm_application#evaluate-on-a-particular-version-of-a-dataset",
+      "destination": "/evaluation/how_to_guides/evaluation/dataset_version"
     }
   ],
   "builds": [
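As the pairwise info box above notes, the judging step does not require LangChain. Below is a minimal sketch (not part of the patch) of that step using the OpenAI SDK directly, returning a structured 0/1/2 preference. The prompt wording and JSON shape are illustrative assumptions, not the hub prompt.

```python
import json

from openai import OpenAI

client = OpenAI()


def judge_preference(question: str, answer_a: str, answer_b: str) -> int:
    """Return 0 for a tie, 1 if answer A is better, 2 if answer B is better."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": (
                    "Decide which assistant answer is better. Reply with JSON "
                    'like {"preference": 0}, where 0 means tie, 1 means answer A, '
                    "and 2 means answer B."
                ),
            },
            {
                "role": "user",
                "content": f"Question: {question}\n\nAnswer A: {answer_a}\n\nAnswer B: {answer_b}",
            },
        ],
        # Constrain the reply to valid JSON so the 0/1/2 preference parses reliably.
        response_format={"type": "json_object"},
    )
    return json.loads(response.choices[0].message.content)["preference"]
```

In a real pairwise experiment this function would sit inside the evaluator that the rest of evaluate_pairwise.mdx wires up.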