WIP: New eval quickstart (#575)
new Quickstart for evaluations
davidx33 authored Dec 18, 2024
2 parents c58d7e8 + aca2642 commit 78b2b30
Showing 3 changed files with 275 additions and 58 deletions.
3 changes: 2 additions & 1 deletion .prettierignore
@@ -1,4 +1,5 @@
node_modules
build
.docusaurus
docs/api
docs/evaluation
Binary file modified docs/evaluation/how_to_guides/static/view_experiment.gif
330 changes: 273 additions & 57 deletions docs/evaluation/index.mdx
@@ -16,21 +16,21 @@ import { RegionalUrl } from "@site/src/components/RegionalUrls";

This quick start will get you up and running with our evaluation SDK and Experiments UI.

## 1. Install Dependencies

<CodeTabs
  tabs={[
    {
      value: "python",
      label: "Python",
      language: "bash",
      content: `pip install -U langsmith openai pydantic`,
    },
    {
      value: "typescript",
      label: "TypeScript",
      language: "bash",
      content: `yarn add langsmith openai zod`,
    },
  ]}
  groupId="client-language"
/>
@@ -45,76 +45,292 @@ To create an API key head to the <RegionalUrl text='Settings page' suffix='/sett
<CodeTabs
  tabs={[
    ShellBlock(`export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY="<your-langchain-api-key>"
# OPENAI_API_KEY is only needed because this example calls OpenAI; LangSmith itself does not require it
export OPENAI_API_KEY="<your-openai-api-key>"`),
  ]}
  groupId="client-language"
/>
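
If you prefer to configure these variables inside your script rather than your shell, the Python equivalent is a few `os.environ` assignments (a minimal sketch; substitute your own keys):

```python
import os

# Same variables as the shell exports above.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "<your-langchain-api-key>"
# Only needed because this quickstart calls OpenAI.
os.environ["OPENAI_API_KEY"] = "<your-openai-api-key>"
```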

## 3. Import dependencies

<CodeTabs
  tabs={[
    {
      value: "python",
      label: "Python",
      content: `from langsmith import wrappers, Client
from pydantic import BaseModel, Field
from openai import OpenAI

client = Client()
openai_client = wrappers.wrap_openai(OpenAI())`,
    },
    {
      value: "typescript",
      label: "TypeScript",
      content: `import { Client } from "langsmith";
import OpenAI from "openai";
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
import type { EvaluationResult } from "langsmith/evaluation";
import { evaluate } from "langsmith/evaluation";

const client = new Client();
const openai = new OpenAI();`,
    },
  ]}
  groupId="client-language"
/>

## 4. Create a dataset

<CodeTabs
  tabs={[
    {
      value: "python",
      label: "Python",
      content: `# For other dataset creation methods, see:
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application

# Create inputs and reference outputs
examples = [
    (
        "Which country is Mount Kilimanjaro located in?",
        "Mount Kilimanjaro is located in Tanzania.",
    ),
    (
        "What is Earth's lowest point?",
        "Earth's lowest point is The Dead Sea.",
    ),
]
inputs = [{"question": input_prompt} for input_prompt, _ in examples]
outputs = [{"answer": output_answer} for _, output_answer in examples]

# Programmatically create a dataset in LangSmith
dataset = client.create_dataset(
    dataset_name="Sample dataset", description="A sample dataset in LangSmith."
)

# Add examples to the dataset
client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
`,
    },
    {
      value: "typescript",
      label: "TypeScript",
      content: `// For other dataset creation methods, see:
// https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
// https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application

// Create inputs and reference outputs
const examples: [string, string][] = [
  [
    "Which country is Mount Kilimanjaro located in?",
    "Mount Kilimanjaro is located in Tanzania.",
  ],
  [
    "What is Earth's lowest point?",
    "Earth's lowest point is The Dead Sea.",
  ],
];
const inputs = examples.map(([inputPrompt]) => ({
  question: inputPrompt,
}));
const outputs = examples.map(([, outputAnswer]) => ({
  answer: outputAnswer,
}));

// Programmatically create a dataset in LangSmith
const dataset = await client.createDataset("Sample dataset", {
  description: "A sample dataset in LangSmith.",
});

// Add examples to the dataset
await client.createExamples({
  inputs,
  outputs,
  datasetId: dataset.id,
});
`,
    },
  ]}
  groupId="client-language"
/>
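
To confirm the examples were uploaded, you can optionally read them back with `Client.list_examples` (shown in Python here; the dataset name matches the one created above):

```python
# Optional sanity check: list the examples stored in the dataset.
for example in client.list_examples(dataset_name="Sample dataset"):
    print(example.inputs, "->", example.outputs)
```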

## 5. Define what you're evaluating

<CodeTabs
  tabs={[
    {
      value: "python",
      label: "Python",
      content: `# Define the application logic you want to evaluate inside a target function
# The SDK will automatically send the inputs from the dataset to your target function
def target(inputs: dict) -> dict:
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            { "role": "system", "content": "Answer the following question accurately" },
            { "role": "user", "content": inputs["question"] },
        ],
    )
    return { "response": response.choices[0].message.content.strip() }
`,
    },
    {
      value: "typescript",
      label: "TypeScript",
      content: `// Define the application logic you want to evaluate inside a target function
// The SDK will automatically send the inputs from the dataset to your target function
async function target(inputs: string): Promise<{ response: string }> {
  const response = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    messages: [
      { role: "system", content: "Answer the following question accurately" },
      { role: "user", content: inputs },
    ],
  });
  return { response: response.choices[0].message.content?.trim() || "" };
}
`,
    },
  ]}
  groupId="client-language"
/>
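
Before evaluating the target against the whole dataset, it can help to call it once directly as a smoke test (optional; the question is one of the dataset inputs defined above):

```python
# Optional: invoke the target function on a single input.
result = target({"question": "Which country is Mount Kilimanjaro located in?"})
print(result)  # A dict of the form {"response": "..."}
```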

## 6. Define an evaluator

<CodeTabs
  tabs={[
    {
      value: "python",
      label: "Python",
      content: `# Define instructions for the LLM judge evaluator
instructions = """Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:
- False: No conceptual match and similarity
- True: Most or full conceptual match and similarity
- Key criteria: Concept should match, not exact wording.
"""

# Define output schema for the LLM judge
class Grade(BaseModel):
    score: bool = Field(
        description="Boolean that indicates whether the response is accurate relative to the reference answer"
    )

# Define LLM judge that grades the accuracy of the response relative to reference output
def accuracy(outputs: dict, reference_outputs: dict) -> bool:
    response = openai_client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            { "role": "system", "content": instructions },
            {
                "role": "user",
                "content": f"""Ground Truth answer: {reference_outputs["answer"]};
Student's Answer: {outputs["response"]}""",
            },
        ],
        response_format=Grade,
    )
    return response.choices[0].message.parsed.score`,
    },
    {
      value: "typescript",
      label: "TypeScript",
      content: `// Define instructions for the LLM judge evaluator
const instructions = \`Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:
- False: No conceptual match and similarity
- True: Most or full conceptual match and similarity
- Key criteria: Concept should match, not exact wording.
\`;

// Define context for the LLM judge evaluator
const context = \`Ground Truth answer: {reference}; Student's Answer: {prediction}\`;

// Define output schema for the LLM judge
const ResponseSchema = z.object({
  score: z
    .boolean()
    .describe(
      "Boolean that indicates whether the response is accurate relative to the reference answer"
    ),
});

// Define LLM judge that grades the accuracy of the response relative to reference output
async function accuracy({
  outputs,
  referenceOutputs,
}: {
  outputs?: Record<string, string>;
  referenceOutputs?: Record<string, string>;
}): Promise<EvaluationResult> {
  const response = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    messages: [
      { role: "system", content: instructions },
      {
        role: "user",
        content: context
          .replace("{prediction}", outputs?.response || "")
          .replace("{reference}", referenceOutputs?.answer || ""),
      },
    ],
    response_format: zodResponseFormat(ResponseSchema, "response"),
  });
  return {
    key: "accuracy",
    score: ResponseSchema.parse(JSON.parse(response.choices[0].message.content || "")).score,
  };
}`,
    },
  ]}
  groupId="client-language"
/>
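
You can also spot-check the judge by hand before running the full experiment. This optional sketch calls the Python `accuracy` function defined above on a made-up answer pair:

```python
# Optional: grade one hand-written response to make sure the judge behaves as expected.
grade = accuracy(
    outputs={"response": "Kilimanjaro is in Tanzania."},
    reference_outputs={"answer": "Mount Kilimanjaro is located in Tanzania."},
)
print(grade)  # Expect True, since the answers match conceptually
```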

## 7. Run and view results

<CodeTabs
  tabs={[
    {
      value: "python",
      label: "Python",
      content: `# After running the evaluation, a link will be provided to view the results in LangSmith
experiment_results = client.evaluate(
    target,
    data="Sample dataset",
    evaluators=[
        accuracy,
        # you can add multiple evaluators here
    ],
    experiment_prefix="first-eval-in-langsmith",
    max_concurrency=2,
)
`,
    },
    {
      value: "typescript",
      label: "TypeScript",
      content: `// After running the evaluation, a link will be provided to view the results in LangSmith
await evaluate(
  (exampleInput) => {
    return target(exampleInput.question);
  },
  {
    data: "Sample dataset",
    evaluators: [
      accuracy,
      // you can add multiple evaluators here
    ],
    experimentPrefix: "first-eval-in-langsmith",
    maxConcurrency: 2,
  }
);
`,
    },
  ]}
  groupId="client-language"
/>

Click the link printed out by your evaluation run to access the LangSmith Experiments UI, and explore the results of your evaluation.
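
If you also want to inspect the scores in code, the Python results object can typically be converted to a DataFrame; this assumes pandas is installed and that your SDK version exposes `to_pandas()`:

```python
# Optional: load the experiment results into a pandas DataFrame for local inspection.
df = experiment_results.to_pandas()
print(df.columns.tolist())
print(df.head())
```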
