sdk v0.2 docs (#563)
baskaryan authored Dec 5, 2024
1 parent 69a9037 commit 18d95f5
Showing 30 changed files with 674 additions and 430 deletions.
24 changes: 24 additions & 0 deletions Makefile
@@ -0,0 +1,24 @@
install-vercel-deps:
	yum -y update
	yum install gcc bzip2-devel libffi-devel zlib-devel wget tar gzip rsync -y

PYTHON = .venv/bin/python

build-api-ref:
	git clone --depth=1 https://github.com/langchain-ai/langsmith-sdk.git
	python3 -m venv .venv
	. .venv/bin/activate
	$(PYTHON) -m pip install --upgrade pip
	$(PYTHON) -m pip install --upgrade uv
	cd langsmith-sdk && ../$(PYTHON) -m uv pip install -r python/docs/requirements.txt
	$(PYTHON) langsmith-sdk/python/docs/create_api_rst.py
	LC_ALL=C $(PYTHON) -m sphinx -T -E -b html -d langsmith-sdk/python/docs/_build/doctrees -c langsmith-sdk/python/docs langsmith-sdk/python/docs langsmith-sdk/python/docs/_build/html -j auto
	$(PYTHON) langsmith-sdk/python/docs/scripts/custom_formatter.py langsmith-sdk/python/docs/_build/html/


vercel-build: install-vercel-deps build-api-ref
	mkdir -p static/reference/python
	mv langsmith-sdk/python/docs/_build/html/* static/reference/python/
	rm -rf langsmith-sdk
	NODE_OPTIONS="--max-old-space-size=5000" yarn run docusaurus build

17 changes: 10 additions & 7 deletions docs/evaluation/how_to_guides/async.mdx
@@ -8,8 +8,8 @@ import { CodeTabs, python } from "@site/src/components/InstructionsWithCode";

:::

We can run evaluations asynchronously via the SDK using [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html),
which accepts all of the same arguments as [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) but expects the application function to be asynchronous.
We can run evaluations asynchronously via the SDK using [aevaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate),
which accepts all of the same arguments as [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate) but expects the application function to be asynchronous.
You can learn more about how to use the `evaluate()` function [here](./evaluate_llm_application).
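
As a rough sketch of the pattern (the target function, evaluator, and dataset name below are illustrative placeholders, not taken from this page):

```python
from langsmith import aevaluate

# Hypothetical async application: takes an example's inputs, returns outputs.
async def my_app(inputs: dict) -> dict:
    return {"answer": "Paris" if "capital" in inputs["question"] else "unsure"}

# Simple custom evaluator comparing actual and reference outputs.
def correct(outputs: dict, reference_outputs: dict) -> bool:
    return outputs["answer"] == reference_outputs["answer"]

# Run from an async context (e.g. a notebook cell or an asyncio.run() wrapper).
results = await aevaluate(
    my_app,  # must be an async callable when using aevaluate()
    data="my-dataset",  # placeholder dataset name
    evaluators=[correct],
    max_concurrency=2,  # optional, limits concurrent runs
)
```

Because `aevaluate()` returns a coroutine, it must be awaited from an async context such as a notebook cell or an `asyncio.run()` wrapper.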

:::info Python only
@@ -25,8 +25,8 @@ You can see how to use it [here](./evaluate_llm_application).
<CodeTabs
groupId="client-language"
tabs={[
python({caption: "Requires `langsmith>=0.1.145`"})`
from langsmith import aevaluate, wrappers, Client
python({caption: "Requires `langsmith>=0.2.0`"})`
from langsmith import wrappers, Client
from openai import AsyncOpenAI
# Optionally wrap the OpenAI client to trace all model calls.
@@ -61,12 +61,15 @@ list 5 concrete questions that should be investigated to determine if the idea i
inputs=[{"idea": e} for e in examples],
)
results = await aevaluate(
# Can equivalently use the 'aevaluate' function directly:
# from langsmith import aevaluate
# await aevaluate(...)
results = await ls_client.aevaluate(
researcher_app,
data=dataset,
evaluators=[concise],
# Optional, no max_concurrency by default but it is recommended to set one.
max_concurrency=2,
# Optional, add concurrency.
max_concurrency=2, # Optional, add concurrency.
experiment_prefix="gpt-4o-mini-baseline" # Optional, random by default.
)
`,
@@ -2,7 +2,7 @@
sidebar_position: 10
---

How to create few-shot evaluators
# How to create few-shot evaluators

Using LLM-as-a-Judge evaluators can be very helpful when you can't evaluate your system programmatically. However, improving/iterating on these prompts can add unnecessary
overhead to the development process of an LLM-based application - you now need to maintain both your application **and** your evaluators. To make this process easier, LangSmith allows
134 changes: 115 additions & 19 deletions docs/evaluation/how_to_guides/custom_evaluator.mdx
@@ -13,14 +13,14 @@ import {
:::

Custom evaluators are just functions that take a dataset example and the resulting application output, and return one or more metrics.
These functions can be passed directly into [evaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._runner.evaluate.html) / [aevaluate()](https://langsmith-sdk.readthedocs.io/en/latest/evaluation/langsmith.evaluation._arunner.aevaluate.html).
These functions can be passed directly into [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate).

## Basic example

<CodeTabs
groupId="client-language"
tabs={[
python({caption: "Requires `langsmith>=0.1.145`"})`
python({caption: "Requires `langsmith>=0.2.0`"})`
from langsmith import evaluate
def correct(outputs: dict, reference_outputs: dict) -> bool:
@@ -36,12 +36,14 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r
evaluators=[correct]
)
`,
typescript`
typescript({caption: "Requires `langsmith>=0.2.9`"})`
import type { EvaluationResult } from "langsmith/evaluation";
import type { Run, Example } from "langsmith/schemas";
function correct(run: Run, example: Example): EvaluationResult {
const score = run.outputs?.output === example.outputs?.output;
const correct = async ({ outputs, referenceOutputs }: {
outputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
}): Promise<EvaluationResult> => {
const score = outputs?.answer === referenceOutputs?.answer;
return { key: "correct", score };
}
`,
@@ -53,19 +55,16 @@ These functions can be passed directly into [evaluate()](https://langsmith-sdk.r

Custom evaluator functions must have specific argument names. They can take any subset of the following arguments:

Python and JS/TS

- `run: langsmith.schemas.Run`: The full Run object generated by the application on the given example.
- `example: langsmith.schemas.Example`: The full dataset Example, including the example inputs, outputs (if available), and metadata (if available).

Currently Python only

- `run: Run`: The full [Run](/reference/data_formats/run_data_format) object generated by the application on the given example.
- `example: Example`: The full dataset [Example](/reference/data_formats/example_data_format), including the example inputs, outputs (if available), and metadata (if available).
- `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset.
- `outputs: dict`: A dictionary of the outputs generated by the application on the given `inputs`.
- `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available.
- `reference_outputs/referenceOutputs: dict`: A dictionary of the reference outputs associated with the example, if available.

For most use cases you'll only need `inputs`, `outputs`, and `reference_outputs`. `run` and `example` are useful only if you need some extra trace or example metadata outside of the actual inputs and outputs of the application.

When using JS/TS these should all be passed in as part of a single object argument.
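
As a rough illustration (the evaluator names, output keys, and latency threshold below are invented for this sketch, not taken from the docs), evaluators using different argument subsets might look like:

```python
from langsmith.schemas import Example, Run

# Uses only the actual and reference outputs ("answer" is an assumed key).
def exact_match(outputs: dict, reference_outputs: dict) -> bool:
    return outputs.get("answer") == reference_outputs.get("answer")

# Uses only the application outputs.
def has_answer(outputs: dict) -> bool:
    return bool(outputs.get("answer"))

# Uses the full Run and Example objects for extra trace metadata.
def fast_enough(run: Run, example: Example) -> dict:
    # Assumes the run has completed, so end_time is populated.
    latency = (run.end_time - run.start_time).total_seconds()
    return {"key": "fast_enough", "score": latency < 5}
```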

## Evaluator output

Custom evaluators are expected to return one of the following types:
@@ -85,16 +84,17 @@ Currently Python only
<CodeTabs
groupId="client-language"
tabs={[
python({caption: "Requires `langsmith>=0.1.145`"})`
python({caption: "Requires `langsmith>=0.2.0`"})`
from langsmith import evaluate, wrappers
from langsmith.schemas import Run, Example
from openai import AsyncOpenAI
# Assumes you've installed pydantic.
from pydantic import BaseModel
# Compare actual and reference outputs
def correct(outputs: dict, reference_outputs: dict) -> bool:
# We can still pass in Run and Example objects if we'd like
def correct_old_signature(run: Run, example: Example) -> dict:
"""Check if the answer exactly matches the expected answer."""
return outputs["answer"] == reference_outputs["answer"]
return {"key": "correct", "score": run.outputs["answer"] == example.outputs["answer"]}
# Just evaluate actual outputs
def concision(outputs: dict) -> int:
@@ -129,9 +129,105 @@ answer is logically valid and consistent with question and the answer."""
results = evaluate(
dummy_app,
data="dataset_name",
evaluators=[correct, concision, valid_reasoning]
evaluators=[correct_old_signature, concision, valid_reasoning]
)
`,
typescript`
import { Client } from "langsmith";
import { evaluate } from "langsmith/evaluation";
import { Run, Example } from "langsmith/schemas";
import OpenAI from "openai";
// Type definitions
interface AppInputs {
question: string;
}
interface AppOutputs {
answer: string;
reasoning: string;
}
interface Response {
reasoning_is_valid: boolean;
}
// Old signature evaluator
function correctOldSignature(run: Run, example: Example) {
return {
key: "correct",
score: run.outputs?.["answer"] === example.outputs?.["answer"],
};
}
// Output-only evaluator
function concision({ outputs }: { outputs: AppOutputs }) {
return {
key: "concision",
score: Math.min(Math.floor(outputs.answer.length / 1000), 4) + 1,
};
}
// LLM-as-judge evaluator
const openai = new OpenAI();
async function validReasoning({
inputs,
outputs
}: {
inputs: AppInputs;
outputs: AppOutputs;
}) {
const instructions = \`\
Given the following question, answer, and reasoning, determine if the reasoning for the \
answer is logically valid and consistent with question and the answer.\`;
const msg = \`Question: \${inputs.question}\nAnswer: \${outputs.answer}\\nReasoning: \${outputs.reasoning}\`;
const response = await openai.chat.completions.create({
model: "gpt-4",
messages: [
{ role: "system", content: instructions },
{ role: "user", content: msg }
],
response_format: { type: "json_object" },
functions: [{
name: "parse_response",
parameters: {
type: "object",
properties: {
reasoning_is_valid: {
type: "boolean",
description: "Whether the reasoning is valid"
}
},
required: ["reasoning_is_valid"]
}
}]
});
const parsed = JSON.parse(response.choices[0].message.content ?? "{}") as Response;
return {
key: "valid_reasoning",
score: parsed.reasoning_is_valid ? 1 : 0
};
}
// Example application
function dummyApp(inputs: AppInputs): AppOutputs {
return {
answer: "hmm i'm not sure",
reasoning: "i didn't understand the question"
};
}
const results = await evaluate(dummyApp, {
data: "dataset_name",
evaluators: [correctOldSignature, concision, validReasoning],
client: new Client()
});
`

]}
/>
2 changes: 1 addition & 1 deletion docs/evaluation/how_to_guides/dataset_subset.mdx
@@ -85,4 +85,4 @@ You can use the `list_examples` / `listExamples` method to evaluate on one or mu

## Related

- More on [how to filter datasets](./manage_datasets_programmatically#list-examples-by-structured-filter)
- Learn more about how to fetch views of a dataset [here](./manage_datasets_programmatically#fetch-datasets)
46 changes: 32 additions & 14 deletions docs/evaluation/how_to_guides/dataset_version.mdx
@@ -13,35 +13,53 @@ Additionally, it might be helpful to read the [guide on fetching examples](./man

:::

You can take advantage of the fact that `evaluate` allows passing in an iterable of examples to evaluate on a particular version of a dataset.
Simply use `list_examples` / `listExamples` to fetch examples from a particular version tag using `as_of` / `asOf`.
## Using `list_examples`

You can take advantage of the fact that `evaluate` / `aevaluate` allows passing in an iterable of examples to evaluate on a particular version of a dataset.
Simply use `list_examples` / `listExamples` to fetch examples from a particular version tag using `as_of` / `asOf` and pass that in to the `data` argument.

<CodeTabs
groupId="client-language"
tabs={[
python`
from langsmith import evaluate
latest_data=client.list_examples(dataset_name=toxic_dataset_name, as_of="latest")
results = evaluate(
lambda inputs: label_text(inputs["text"]),
data=latest_data,
evaluators=[correct_label],
experiment_prefix="Toxic Queries",
from langsmith import Client
ls_client = Client()
# Assumes actual outputs have a 'class' key.
# Assumes example outputs have a 'label' key.
def correct(outputs: dict, reference_outputs: dict) -> bool:
return outputs["class"] == reference_outputs["label"]
results = ls_client.evaluate(
lambda inputs: {"class": "Not toxic"},
# Pass in filtered data here:
# highlight-next-line
data=ls_client.list_examples(
# highlight-next-line
dataset_name="Toxic Queries",
# highlight-next-line
as_of="latest", # specify version here
# highlight-next-line
),
evaluators=[correct],
)
`,
typescript`
import { evaluate } from "langsmith/evaluation";
await evaluate((inputs) => labelText(inputs["input"]), {
data: langsmith.listExamples({
datasetName: datasetName,
asOf: "latest",
}),
evaluators: [correctLabel],
experimentPrefix: "Toxic Queries",
});
`,
]}

]}
/>

## Related

- Learn more about how to fetch views of a dataset [here](./manage_datasets_programmatically#fetch-datasets)
