From 9601489d102fe05bb4c579e371c25f81830bd2ff Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 4 Dec 2024 15:49:05 -0800 Subject: [PATCH] wip --- .../how_to_guides/dataset_version.mdx | 9 +- .../evaluate_on_intermediate_steps.mdx | 341 +++++++++++------- .../how_to_guides/evaluate_pairwise.mdx | 10 +- 3 files changed, 213 insertions(+), 147 deletions(-) diff --git a/docs/evaluation/how_to_guides/dataset_version.mdx b/docs/evaluation/how_to_guides/dataset_version.mdx index 235dfa04..791b879a 100644 --- a/docs/evaluation/how_to_guides/dataset_version.mdx +++ b/docs/evaluation/how_to_guides/dataset_version.mdx @@ -25,7 +25,7 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular from langsmith import Client ls_client = Client() - + # Assumes actual outputs have a 'class' key. # Assumes example outputs have a 'label' key. def correct(outputs: dict, reference_outputs: dict) -> bool: @@ -37,7 +37,7 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular # highlight-next-line data=ls_client.list_examples( # highlight-next-line - dataset_name="Toxic Queries", + dataset_name="Toxic Queries", # highlight-next-line as_of="latest", # specify version here # highlight-next-line @@ -47,7 +47,7 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular `, typescript` import { evaluate } from "langsmith/evaluation"; - + await evaluate((inputs) => labelText(inputs["input"]), { data: langsmith.listExamples({ datasetName: datasetName, @@ -56,7 +56,8 @@ Simply use `list_examples` / `listExamples` to fetch examples from a particular evaluators: [correctLabel], }); `, - ]} + +]} /> ## Related diff --git a/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx b/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx index a22f5df1..9b067025 100644 --- a/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx +++ b/docs/evaluation/how_to_guides/evaluate_on_intermediate_steps.mdx @@ -21,65 +21,108 @@ For example, for retrieval-augmented generation (RAG), you might want to In this guide, we will use a simple, fully-custom evaluator for evaluating criteria 1 and an LLM-based evaluator for evaluating criteria 2 to highlight both scenarios. -In order to evaluate the intermediate steps of your pipeline, your evaluator function should traverse and process the `root_run`/`rootRun` argument, which is a `Run` object that contains the intermediate steps of your pipeline. +In order to evaluate the intermediate steps of your pipeline, your evaluator function should traverse and process the `run`/`rootRun` argument, which is a `Run` object that contains the intermediate steps of your pipeline. ## 1. Define your LLM pipeline The below RAG pipeline consists of 1) generating a Wikipedia query given the input question, 2) retrieving relevant documents from Wikipedia, and 3) generating an answer given the retrieved documents. 
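To make the idea of traversing the run concrete before building the full example, here is a minimal, hypothetical sketch of an evaluator that walks `run.child_runs` to find a step by name. The step name and the check are illustrative; the evaluators defined in step 3 below follow the same pattern:

```python
from langsmith.schemas import Run


def retriever_returned_docs(run: Run) -> bool:
    """Illustrative: walk the trace tree and check that the retriever step returned documents."""

    def find_step(current: Run, name: str) -> Run | None:
        # Each Run exposes its nested steps via `child_runs`.
        if current.name == name:
            return current
        for child in current.child_runs or []:
            found = find_step(child, name)
            if found:
                return found
        return None

    retrieve_run = find_step(run, "retrieve")
    return bool(retrieve_run and retrieve_run.outputs)
```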
+First lets install all the dependencies for this example: + = 2: - return results\n -@traceable -def generate_answer(question, context): - messages = [ - {"role": "system", "content": f"Answer the user's question based ONLY on the content below:\\n\\n{context}"}, - {"role": "user", "content": question} - ] - result = openai.chat.completions.create(messages=messages, model="gpt-4o-mini", temperature=0) - return result.choices[0].message.content\n -@traceable -def rag_pipeline(question): - query = generate_wiki_search(question) - context = "\\n\\n".join([doc["page_content"] for doc in retrieve(query)]) - answer = generate_answer(question, context) - return answer`), + { + value: "python", + label: "Python", + language: "bash", + content: `pip install -U langsmith langchain langchain-openai wikipedia`, + }, + { + value: "typescript", + label: "TypeScript", + language: "bash", + content: `yarn add langsmith langchain @langchain/openai wikipedia`, + }, + ]} + groupId="client-language" +/> + +=0.2.0`' })` + import openai + import wikipedia as wp + + from langsmith import traceable, wrappers + + oai_client = wrappers.wrap_openai(openai.Client()) + + @traceable + def generate_wiki_search(question: str) -> str: + """Generate the query to search in wikipedia.""" + instructions = ( + "Generate a search query to pass into wikipedia to answer the user's question. " + "Return only the search query and nothing more. " + "This will passed in directly to the wikipedia search engine." + ) + messages = [ + {"role": "system", "content": instructions}, + {"role": "user", "content": question} + ] + result = oai_client.chat.completions.create( + messages=messages, + model="gpt-4o-mini", + temperature=0, + ) + return result.choices[0].message.content + + @traceable(run_type="retriever") + def retrieve(query: str) -> list: + """Get up to two search wikipedia results.""" + results = [] + for term in wp.search(query, results = 10): + try: + page = wp.page(term, auto_suggest=False) + results.append({ + "page_content": page.summary, + "type": "Document", + "metadata": {"url": page.url} + }) + except wp.DisambiguationError: + pass + if len(results) >= 2: + return results + + @traceable + def generate_answer(question: str, context: str) -> str: + """Answer the question based on the retrieved information.""" + instructions = f"Answer the user's question based ONLY on the content below:\\n\\n{context}" + messages = [ + {"role": "system", "content": instructions}, + {"role": "user", "content": question} + ] + result = oai_client.chat.completions.create( + messages=messages, + model="gpt-4o-mini", + temperature=0 + ) + return result.choices[0].message.content + + @traceable + def qa_pipeline(question: str) -> str: + """The full pipeline.""" + query = generate_wiki_search(question) + context = "\\n\\n".join([doc["page_content"] for doc in retrieve(query)]) + return generate_answer(question, context)`, typescript` import OpenAI from "openai"; import wiki from "wikipedia"; import { Client } from "langsmith"; import { traceable } from "langsmith/traceable"; import { wrapOpenAI } from "langsmith/wrappers"; - + const openai = wrapOpenAI(new OpenAI()); - + const generateWikiSearch = traceable( async (input: { question: string }) => { const messages = [ @@ -90,18 +133,18 @@ def rag_pipeline(question): }, { role: "user" as const, content: input.question }, ]; - + const chatCompletion = await openai.chat.completions.create({ model: "gpt-4o-mini", messages: messages, temperature: 0, }); - + return 
chatCompletion.choices[0].message.content ?? ""; }, { name: "generateWikiSearch" } ); - + const retrieve = traceable( async (input: { query: string; numDocuments: number }) => { const { results } = await wiki.search(input.query, { limit: 10 }); @@ -110,7 +153,7 @@ def rag_pipeline(question): type: "Document"; metadata: { url: string }; }> = []; - + for (const result of results) { if (finalResults.length >= input.numDocuments) { // Just return the top 2 pages for now @@ -124,12 +167,12 @@ def rag_pipeline(question): metadata: { url: page.fullurl }, }); } - + return finalResults; }, { name: "retrieve", run_type: "retriever" } ); - + const generateAnswer = traceable( async (input: { question: string; context: string }) => { const messages = [ @@ -139,7 +182,7 @@ def rag_pipeline(question): }, { role: "user" as const, content: input.question }, ]; - + const chatCompletion = await openai.chat.completions.create({ model: "gpt-4o-mini", messages: messages, @@ -149,7 +192,7 @@ def rag_pipeline(question): }, { name: "generateAnswer" } ); - + const ragPipeline = traceable( async ({ question }: { question: string }, numDocuments: number = 2) => { const query = await generateWikiSearch({ question }); @@ -162,8 +205,9 @@ def rag_pipeline(question): }, { name: "ragPipeline" } );`, - ]} - groupId="client-language" + +]} +groupId="client-language" /> This pipeline will produce a trace that looks something like: @@ -178,25 +222,23 @@ We are building a very simple dataset with a couple of examples to evaluate the python` from langsmith import Client - client = Client() - - examples = [ - ("What is LangChain?", "LangChain is an open-source framework for building applications using large language models."), - ("What is LangSmith?", "LangSmith is an observability and evaluation tool for LLM products, built by LangChain Inc.") - ] - + ls_client = Client() dataset_name = "Wikipedia RAG" - if not client.has_dataset(dataset_name=dataset_name): - dataset = client.create_dataset(dataset_name=dataset_name) - inputs, outputs = zip( - *[({"input": input}, {"expected": expected}) for input, expected in examples] + + if not ls_client.has_dataset(dataset_name=dataset_name): + dataset = ls_client.create_dataset(dataset_name=dataset_name) + ls_client.create_examples( + inputs=[ + {"question": "What is LangChain?"}, + {"question": "What is LangSmith?"}, + ], + dataset_id=dataset.id ) - client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) `, typescript` import { Client } from "langsmith"; const client = new Client(); - + const examples = [ [ "What is LangChain?", @@ -207,17 +249,18 @@ We are building a very simple dataset with a couple of examples to evaluate the "LangSmith is an observability and evaluation tool for LLM products, built by LangChain Inc.", ], ]; - + const datasetName = "Wikipedia RAG"; - + const inputs = examples.map(([input, _]) => ({ input })); const outputs = examples.map(([_, expected]) => ({ expected })); - + const dataset = await client.createDataset(datasetName); await client.createExamples({ datasetId: dataset.id, inputs, outputs }); `, - ]} - groupId="client-language" + +]} +groupId="client-language" /> ## 3. Define your custom evaluators @@ -225,61 +268,81 @@ We are building a very simple dataset with a couple of examples to evaluate the As mentioned above, we will define two evaluators: one that evaluates the relevance of the retrieved documents w.r.t the input query and another that evaluates the hallucination of the generated answer w.r.t the retrieved documents. 
 We will be using LangChain LLM wrappers, along with [`with_structured_output`](https://python.langchain.com/v0.1/docs/modules/model_io/chat/structured_output/) to define the evaluator for hallucination.
-The key here is that the evaluator function should traverse the `root_run` / `rootRun` argument to access the intermediate steps of the pipeline. The evaluator can then process the inputs and outputs of the intermediate steps to evaluate according to the desired criteria.
+The key here is that the evaluator function should traverse the `run` / `rootRun` argument to access the intermediate steps of the pipeline. The evaluator can then process the inputs and outputs of the intermediate steps to evaluate according to the desired criteria.
 
 <CodeTabs
   tabs={[
-def document_relevance(root_run: Run, example: Example) -> dict:
-    """
-    A very simple evaluator that checks to see if the input of the retrieval step exists
-    in the retrieved docs.
-    """
-    rag_pipeline_run = next(run for run in root_run.child_runs if run.name == "rag_pipeline")
-    retrieve_run = next(run for run in rag_pipeline_run.child_runs if run.name == "retrieve")
-    page_contents = "\\n\\n".join(doc["page_content"] for doc in retrieve_run.outputs["output"])
-    score = retrieve_run.inputs["query"] in page_contents
-    return {"key": "simple_document_relevance", "score": score}\n
-def hallucination(root_run: Run, example: Example) -> dict:
-    """
-    A simple evaluator that checks to see the answer is grounded in the documents
-    """
-    # Get documents and answer
-    rag_pipeline_run = next(run for run in root_run.child_runs if run.name == "rag_pipeline")
-    retrieve_run = next(run for run in rag_pipeline_run.child_runs if run.name == "retrieve")
-    page_contents = "\\n\\n".join(doc["page_content"] for doc in retrieve_run.outputs["output"])
-    generation = rag_pipeline_run.outputs["output"]\n
-    # Data model
-    class GradeHallucinations(BaseModel):
-        """Binary score for hallucination present in generation answer."""\n
-        binary_score: int = Field(description="Answer is grounded in the facts, 1 or 0")\n
-    # LLM with function call
-    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
-    structured_llm_grader = llm.with_structured_output(GradeHallucinations)\n
-    # Prompt
-    system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \\n
-     Give a binary score 1 or 0, where 1 means that the answer is grounded in / supported by the set of facts."""
-    hallucination_prompt = ChatPromptTemplate.from_messages(
-        [
-            ("system", system),
-            ("human", "Set of facts: \\n\\n {documents} \\n\\n LLM generation: {generation}"),
+    python({ caption: "Example uses `langchain` for convenience, this is not required." })`
+      from langchain.chat_models import init_chat_model
+      from langsmith.schemas import Run
+      from pydantic import BaseModel, Field
+
+      def document_relevance(run: Run) -> bool:
+          """Checks if retriever input exists in the retrieved docs."""
+          qa_pipeline_run = next(
+              r for r in run.child_runs if r.name == "qa_pipeline"
+          )
+          retrieve_run = next(
+              r for r in qa_pipeline_run.child_runs if r.name == "retrieve"
+          )
+          page_contents = "\\n\\n".join(
+              doc["page_content"] for doc in retrieve_run.outputs["output"]
+          )
+          return retrieve_run.inputs["query"] in page_contents
+
+
+      def no_hallucination(run: Run) -> bool:
+          """Check if the answer is grounded in the documents.
+
+          Return True if there is no hallucination, False otherwise.
+          """
+          # Get documents and answer
+          qa_pipeline_run = next(
+              r for r in run.child_runs if r.name == "qa_pipeline"
+          )
+          retrieve_run = next(
+              r for r in qa_pipeline_run.child_runs if r.name == "retrieve"
+          )
+          retrieved_content = "\\n\\n".join(
+              doc["page_content"] for doc in retrieve_run.outputs["output"]
+          )
+          generation = qa_pipeline_run.outputs["output"]
+
+          # Data model
+          class GradeHallucinations(BaseModel):
+              """Binary score for hallucination present in generation answer."""
+
+              is_grounded: bool = Field(..., description="True if the answer is grounded in the facts, False otherwise.")
+
+          # LLM with structured outputs
+          # For more see: https://python.langchain.com/docs/how_to/structured_output/
+          llm = init_chat_model("gpt-4o-mini", temperature=0)
+          structured_llm = llm.with_structured_output(
+              GradeHallucinations,
+              method="json_schema",
+              strict=True,
+          )
+
+          # Prompt
+          instructions = (
+              "You are a grader assessing whether an LLM generation is grounded in / "
+              "supported by a set of retrieved facts. Give a binary score: True if "
+              "the answer is grounded in / supported by the set of facts, False otherwise."
+          )
+          messages = [
+              {"role": "system", "content": instructions},
+              {"role": "user", "content": f"Set of facts: \\n\\n {retrieved_content} \\n\\n LLM generation: {generation}"},
         ]
-    )\n
-    hallucination_grader = hallucination_prompt | structured_llm_grader
-    score = hallucination_grader.invoke({"documents": page_contents, "generation": generation})
-    return {"key": "answer_hallucination", "score": int(score.binary_score)}`),
+
+          grade = structured_llm.invoke(messages)
+          return grade.is_grounded`,
     typescript`
       import { EvaluationResult } from "langsmith/evaluation";
       import { Run, Example } from "langsmith/schemas";
-
+
       import { ChatPromptTemplate } from "@langchain/core/prompts";
       import { ChatOpenAI } from "@langchain/openai";
       import { z } from "zod";
-
+
       function findNestedRun(run: Run, search: (run: Run) => boolean): Run | null {
         const queue: Run[] = [run];
         while (queue.length > 0) {
          const currentRun = queue.shift()!;
          if (search(currentRun)) return currentRun;
          queue.push(...currentRun.child_runs);
        }
        return null;
      }
-
+
      // A very simple evaluator that checks to see if the input of the retrieval step exists
      // in the retrieved docs.
function documentRelevance(rootRun: Run, example: Example): EvaluationResult { const retrieveRun = findNestedRun(rootRun, (run) => run.name === "retrieve"); const docs: Array<{ page_content: string }> | undefined = retrieveRun.outputs?.outputs; - + const pageContents = docs?.map((doc) => doc.page_content).join("\\n\\n"); const score = pageContents.includes(retrieveRun.inputs?.query); return { key: "simple_document_relevance", score }; } - + async function hallucination( rootRun: Run, example: Example ): Promise { const rag = findNestedRun(rootRun, (run) => run.name === "ragPipeline"); const retrieve = findNestedRun(rootRun, (run) => run.name === "retrieve"); - + const docs: Array<{ page_content: string }> | undefined = retrieve.outputs?.outputs; - + const documents = docs?.map((doc) => doc.page_content).join("\\n\\n"); - + const prompt = ChatPromptTemplate.fromMessages<{ documents: string; generation: string; @@ -330,7 +393,7 @@ def hallucination(root_run: Run, example: Example) -> dict: "Set of facts: \\n\\n {documents} \\n\\n LLM generation: {generation}", ], ]); - + const llm = new ChatOpenAI({ model: "gpt-4o-mini", temperature: 0, @@ -343,18 +406,19 @@ def hallucination(root_run: Run, example: Example) -> dict: }) .describe("Binary score for hallucination present in generation answer.") ); - + const grader = prompt.pipe(llm); const score = await grader.invoke({ documents, generation: rag.outputs?.outputs, }); - + return { key: "answer_hallucination", score: score.binary_score }; } `, - ]} - groupId="client-language" + +]} +groupId="client-language" /> ## 4. Evaluate the pipeline @@ -364,26 +428,29 @@ Finally, we'll run `evaluate` with the custom evaluators defined above. dict: + """Wrap the qa_pipeline so it can accept the Example.inputs dict as input.""" + return {"answer": qa_pipeline(inputs["question"])} + + experiment_results = ls_client.evaluate( + qa_wrapper, data=dataset_name, - evaluators=[document_relevance, hallucination], + evaluators=[document_relevance, no_hallucination], experiment_prefix="rag-wiki-oai" ) `, typescript` import { evaluate } from "langsmith/evaluation"; - + await evaluate((inputs) => ragPipeline({ question: inputs.input }), { data: datasetName, evaluators: [hallucination, documentRelevance], experimentPrefix: "rag-wiki-oai", }); `, - ]} - groupId="client-language" + +]} +groupId="client-language" /> The experiment will contain the results of the evaluation, including the scores and comments from the evaluators: diff --git a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx index 08398dcc..e828aee3 100644 --- a/docs/evaluation/how_to_guides/evaluate_pairwise.mdx +++ b/docs/evaluation/how_to_guides/evaluate_pairwise.mdx @@ -57,15 +57,13 @@ Pairwise evaluators are just functions with an expected signature. Custom evaluator functions must have specific argument names. They can take any subset of the following arguments: -Python and JS/TS - -- `runs: list[Run]`: A two-item list of the full [Run](/reference/data_formats/run_data_format) objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run. -- `example: Example`: The full dataset [Example](/reference/data_formats/example_data_format), including the example inputs, outputs (if available), and metdata (if available). - `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset. 
-- `outputs: list[dict]`: A two-item list of the outputs produced by each experiment on the given inputs.
+- `outputs: list[dict]`: A two-item list of the dict outputs produced by each experiment on the given inputs.
 - `reference_outputs` / `referenceOutputs: dict`: A dictionary of the reference outputs associated with the example, if available.
+- `runs: list[Run]`: A two-item list of the full [Run](/reference/data_formats/run_data_format) objects generated by the two experiments on the given example. Use this if you need access to intermediate steps or metadata about each run.
+- `example: Example`: The full dataset [Example](/reference/data_formats/example_data_format), including the example inputs, outputs (if available), and metadata (if available).
 
-For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application.
+For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs` / `referenceOutputs`. `runs` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application.
 
 ### Evaluator output
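For example, a pairwise evaluator that only needs `inputs` and `outputs` could look like the following sketch. The preference logic and the evaluator name are purely illustrative, and it returns one score per experiment in the same order as `outputs` (see the accepted output formats described in this section):

```python
def concise_preference(inputs: dict, outputs: list[dict]) -> list:
    """Illustrative pairwise evaluator: prefer the more concise of the two answers."""
    # `outputs` holds one output dict per experiment, in a fixed order.
    lengths = [len(str(o)) for o in outputs]
    return [1, 0] if lengths[0] <= lengths[1] else [0, 1]
```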