Commit: update

bracesproul committed Apr 29, 2024
1 parent 9f7f9de commit 0c156d4

Showing 10 changed files with 642 additions and 395 deletions.
docs/evaluation/faq/evaluator-implementations.mdx (45 changes: 30 additions & 15 deletions)
@@ -57,10 +57,12 @@ Three QA evaluators you can load are: `"qa"`, `"context_qa"`, `"cot_qa"`. Based
<CodeTabs
tabs={[
PythonBlock(`from langsmith import Client
-from langsmith.evaluation import LangChainStringEvaluator, evaluate\n
+from langsmith.evaluation import LangChainStringEvaluator, evaluate
qa_evaluator = LangChainStringEvaluator("qa")
context_qa_evaluator = LangChainStringEvaluator("context_qa")
-cot_qa_evaluator = LangChainStringEvaluator("cot_qa")\n
+cot_qa_evaluator = LangChainStringEvaluator("cot_qa")
client = Client()
evaluate(
<your pipeline>,
@@ -79,7 +81,8 @@ out the reference docs for more information on the expected prompt format.
tabs={[
PythonBlock(`from langchain.chat_models import ChatAnthropic
from langchain_core.prompts.prompt import PromptTemplate
-from langsmith.evaluation import LangChainStringEvaluator\n
+from langsmith.evaluation import LangChainStringEvaluator
_PROMPT_TEMPLATE = """You are an expert professor specialized in grading students' answers to questions.
You are grading the following question:
{input}
@@ -89,11 +92,13 @@ You are grading the following predicted answer:
{prediction}
Respond with CORRECT or INCORRECT:
Grade:
"""\n
"""
PROMPT = PromptTemplate(
input_variables=["input", "reference", "prediction"], template=_PROMPT_TEMPLATE
)
-eval_llm = ChatAnthropic(temperature=0.0)\n
+eval_llm = ChatAnthropic(temperature=0.0)
qa_evaluator = LangChainStringEvaluator("qa", config={"llm": eval_llm, "prompt": PROMPT})
context_qa_evaluator = LangChainStringEvaluator("context_qa", config={"llm": eval_llm})
cot_qa_evaluator = LangChainStringEvaluator("cot_qa", config={"llm": eval_llm})
@@ -117,7 +122,8 @@ If you don't have ground truth reference labels, you can evaluate your run again
<CodeTabs
tabs={[
PythonBlock(`from langsmith import Client
-from langsmith.evaluation import LangChainStringEvaluator, evaluate\n
+from langsmith.evaluation import LangChainStringEvaluator, evaluate
criteria_evaluator = LangChainStringEvaluator(
"criteria",
config={
@@ -136,7 +142,8 @@ score_evaluator = LangChainStringEvaluator(
# If you want the score to be saved on a scale from 0 to 1
"normalize_by": 10,
}
-)\n
+)
client = Client()
evaluate(
<your pipeline>,
@@ -180,7 +187,8 @@ If you have ground truth reference labels, you can evaluate your run against cus
<CodeTabs
tabs={[
PythonBlock(`from langsmith import Client
-from langsmith.evaluation import LangChainStringEvaluator, evaluate\n
+from langsmith.evaluation import LangChainStringEvaluator, evaluate
labeled_criteria_evaluator = LangChainStringEvaluator(
"labeled_criteria",
config={
@@ -210,7 +218,8 @@ labeled_score_evaluator = LangChainStringEvaluator(
"reference": example.outputs["answer"],
"input": example.inputs["question"],
}
-)\n
+)
client = Client()
evaluate(
<your pipeline>,
@@ -236,7 +245,8 @@ Evaluating extraction and function calling applications often comes down to vali

<CodeTabs
tabs={[
-PythonBlock(`from langsmith.evaluation import LangChainStringEvaluator, evaluate\n
+PythonBlock(`from langsmith.evaluation import LangChainStringEvaluator, evaluate
json_validity_evaluator = LangChainStringEvaluator("json_validity")
json_equality_evaluator = LangChainStringEvaluator("json_equality")
json_edit_distance_evaluator = LangChainStringEvaluator("json_edit_distance")
@@ -252,7 +262,8 @@ json_schema_evaluator = LangChainStringEvaluator(
"required": ["name"]
}
}
-)\n
+)
evaluate(
<your pipeline>,
data="<dataset_name>",
@@ -277,7 +288,8 @@ To measure the similarity between a predicted string and a reference, you can us

<CodeTabs
tabs={[
-PythonBlock(`from langsmith.evaluation import LangChainStringEvaluator, evaluate\n
+PythonBlock(`from langsmith.evaluation import LangChainStringEvaluator, evaluate
string_distance_evaluator = LangChainStringEvaluator(
"string_distance",
config={"distance": "levenshtein", "normalize_score": True}
@@ -294,7 +306,8 @@ embedding_distance_evaluator = LangChainStringEvaluator(
exact_match_evaluator = LangChainStringEvaluator(
"exact_match",
config={"ignore_case": True, "ignore_punctuation": True}
-)\n
+)
evaluate(
<your pipeline>,
data="<dataset_name>",
@@ -315,14 +328,16 @@ The pattern is provided as a string in the example outputs of the dataset. The e

<CodeTabs
tabs={[
-PythonBlock(`from langsmith.evaluation import LangChainStringEvaluator, evaluate\n
+PythonBlock(`from langsmith.evaluation import LangChainStringEvaluator, evaluate
regex_evaluator = LangChainStringEvaluator(
"regex_match",
config={
# Optionally control which flags to use in the regex match
"flags": re.IGNORECASE
}
-)\n
+)
evaluate(
<your pipeline>,
data="<dataset_name>",
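Every snippet in this file follows the same shape: construct one or more `LangChainStringEvaluator` instances, then pass them to `evaluate` together with a target and a dataset. The following is a minimal runnable sketch of that shape, not part of the commit: `answer_question` and the dataset name are hypothetical stand-ins for the elided `<your pipeline>` and `<dataset_name>`, and `import re` is spelled out because the regex snippet above assumes it.

```python
import re

from langsmith import Client
from langsmith.evaluation import LangChainStringEvaluator, evaluate

def answer_question(inputs: dict) -> dict:
    # Hypothetical target standing in for <your pipeline>.
    return {"output": "The blue whale"}

# Two of the evaluators shown above: "qa" grades against the reference
# answer, "regex_match" checks the output against a stored pattern.
qa_evaluator = LangChainStringEvaluator("qa")
regex_evaluator = LangChainStringEvaluator(
    "regex_match",
    config={"flags": re.IGNORECASE},
)

client = Client()
evaluate(
    answer_question,
    data="Elementary Animal Questions",  # hypothetical dataset name
    evaluators=[qa_evaluator, regex_evaluator],
    client=client,
)
```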
docs/evaluation/faq/manage-datasets.mdx (134 changes: 85 additions & 49 deletions)
@@ -65,48 +65,64 @@ Note that you can add arbitrary metadata to each example, such as a note or a so

<CodeTabs
tabs={[
-PythonBlock(`from langsmith import Client\n
+PythonBlock(`from langsmith import Client
example_inputs = [
("What is the largest mammal?", "The blue whale"),
("What do mammals and birds have in common?", "They are both warm-blooded"),
("What are reptiles known for?", "Having scales"),
("What's the main characteristic of amphibians?", "They live both in water and on land"),
]\n
("What is the largest mammal?", "The blue whale"),
("What do mammals and birds have in common?", "They are both warm-blooded"),
("What are reptiles known for?", "Having scales"),
(
"What's the main characteristic of amphibians?",
"They live both in water and on land",
),
]
client = Client()
dataset_name = "Elementary Animal Questions"\n
dataset_name = "Elementary Animal Questions"
# Storing inputs in a dataset lets us
# run chains and LLMs over a shared set of examples.
dataset = client.create_dataset(
-dataset_name=dataset_name, description="Questions and answers about animal phylogenetics.",
+dataset_name=dataset_name,
+description="Questions and answers about animal phylogenetics.",
)
for input_prompt, output_answer in example_inputs:
client.create_example(
inputs={"question": input_prompt},
outputs={"answer": output_answer},
metadata={"source": "Wikipedia"},
dataset_id=dataset.id,
-)`),
-TypeScriptBlock(`import { Client } from "langsmith";\n
+)
+`),
+TypeScriptBlock(`import { Client } from "langsmith";
const client = new Client({
// apiUrl: "https://api.langchain.com", // Defaults to the LANGCHAIN_ENDPOINT env var
// apiKey: "my_api_key", // Defaults to the LANGCHAIN_API_KEY env var
/* callerOptions: {
maxConcurrency?: Infinity; // Maximum number of concurrent requests to make
maxRetries?: 6; // Maximum number of retries to make
}*/
-});\n
+});
const exampleInputs: [string, string][] = [
["What is the largest mammal?", "The blue whale"],
["What do mammals and birds have in common?", "They are both warm-blooded"],
["What are reptiles known for?", "Having scales"],
["What's the main characteristic of amphibians?", "They live both in water and on land"],
];\n
const datasetName = "Elementary Animal Questions";\n
[
"What's the main characteristic of amphibians?",
"They live both in water and on land",
],
];
const datasetName = "Elementary Animal Questions";
// Storing inputs in a dataset lets us
// run chains and LLMs over a shared set of examples.
const dataset = await client.createDataset(datasetName, {
description: "Questions and answers about animal phylogenetics",
-});\n
+});
for (const [inputPrompt, outputAnswer] of exampleInputs) {
await client.createExample(
{ question: inputPrompt },
@@ -116,7 +132,8 @@ for (const [inputPrompt, outputAnswer] of exampleInputs) {
metadata: { source: "Wikipedia" },
}
);
-}`),
+}
+`),
]}
groupId="client-language"
/>
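One note on the loop above: creating examples one at a time costs a network round trip each. The Python SDK also has a bulk `create_examples` method; here is a sketch under the assumption that `client`, `dataset`, and `example_inputs` are the ones defined in the Python snippet above and that your SDK version provides the helper.

```python
# Bulk alternative to the per-example loop above (same data, one call).
client.create_examples(
    inputs=[{"question": q} for q, _ in example_inputs],
    outputs=[{"answer": a} for _, a in example_inputs],
    dataset_id=dataset.id,
)
```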
@@ -127,24 +144,28 @@ To create datasets from existing runs, you can use the same approach. Below is a

<CodeTabs
tabs={[
-PythonBlock(`from langsmith import Client\n
+PythonBlock(`from langsmith import Client
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = "<YOUR-LANGSMITH-API-KEY>"
client = Client()
dataset_name = "Example Dataset"\n
dataset_name = "Example Dataset"
# Filter runs to add to the dataset
runs = client.list_runs(
project_name="my_project",
execution_order=1,
error=False,
-)\n
+)
dataset = client.create_dataset(dataset_name, description="An example dataset")
for run in runs:
client.create_example(
inputs=run.inputs,
outputs=run.outputs,
dataset_id=dataset.id,
-)`),
+)
+`),
TypeScriptBlock(`import { Client, Run } from "langsmith";
const client = new Client({
// apiUrl: "https://api.langchain.com", // Defaults to the LANGCHAIN_ENDPOINT env var
Expand All @@ -153,7 +174,8 @@ const client = new Client({
maxConcurrency?: Infinity; // Maximum number of concurrent requests to make
maxRetries?: 6; // Maximum number of retries to make
}*/
-});\n
+});
const datasetName = "Example Dataset";
// Filter runs to add to the dataset
const runs: Run[] = [];
@@ -163,11 +185,13 @@ for await (const run of client.listRuns({
error: false,
})) {
runs.push(run);
-}\n
+}
const dataset = await client.createDataset(datasetName, {
description: "An example dataset",
dataType: "kv",
-});\n
+});
for (const run of runs) {
await client.createExample(run.inputs, run.outputs ?? {}, {
datasetId: dataset.id,
@@ -187,13 +211,17 @@ First, ensure your CSV file is properly formatted with columns that represent yo
<CodeTabs
tabs={[
PythonBlock(`from langsmith import Client
-import os\n
+import os
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGCHAIN_API_KEY"] = "<YOUR-LANGSMITH-API-KEY>"\n
-client = Client()\n
+os.environ["LANGCHAIN_API_KEY"] = "<YOUR-LANGSMITH-API-KEY>"
+client = Client()
csv_file = 'path/to/your/csvfile.csv'
input_keys = ['column1', 'column2'] # replace with your input column names
-output_keys = ['output1', 'output2'] # replace with your output column names\n
+output_keys = ['output1', 'output2'] # replace with your output column names
dataset = client.upload_csv(
csv_file=csv_file,
input_keys=input_keys,
@@ -202,19 +230,23 @@ dataset = client.upload_csv(
description="Dataset created from a CSV file"
data_type="kv"
)`),
-TypeScriptBlock(`import { Client } from "langsmith";\n
-const client = new Client();\n
-const csvFile = 'path/to/your/csvfile.csv';
-const inputKeys = ['column1', 'column2']; // replace with your input column names
-const outputKeys = ['output1', 'output2']; // replace with your output column names\n
+TypeScriptBlock(`import { Client } from "langsmith";
+const client = new Client();
+const csvFile = "path/to/your/csvfile.csv";
+const inputKeys = ["column1", "column2"]; // replace with your input column names
+const outputKeys = ["output1", "output2"]; // replace with your output column names
const dataset = await client.uploadCsv({
-csvFile: csvFile,
-fileName: "My CSV Dataset",
-inputKeys: inputKeys,
-outputKeys: outputKeys,
-description: "Dataset created from a CSV file",
-dataType: "kv"
-});`),
+csvFile: csvFile,
+fileName: "My CSV Dataset",
+inputKeys: inputKeys,
+outputKeys: outputKeys,
+description: "Dataset created from a CSV file",
+dataType: "kv",
+});
+`),
]}
groupId="client-language"
/>
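A caution when copying the Python snippet above: as rendered, it is missing a comma after the `description` argument, so it will not parse. A corrected sketch follows; the file path and column names are placeholders to replace with your own, and the `name` argument is assumed from the collapsed part of the snippet.

```python
from langsmith import Client

client = Client()
dataset = client.upload_csv(
    csv_file="path/to/your/csvfile.csv",
    input_keys=["column1", "column2"],   # your input column names
    output_keys=["output1", "output2"],  # your output column names
    name="My CSV Dataset",               # assumed; collapsed in the diff above
    description="Dataset created from a CSV file",
    data_type="kv",
)
```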
@@ -254,8 +286,10 @@ You can programmatically fetch the datasets from LangSmith using the `list_datas

<CodeTabs
tabs={[
-PythonBlock(`datasets = client.list_datasets()`),
-TypeScriptBlock(`const datasets = await client.listDatasets();`),
+PythonBlock(`datasets = client.list_datasets()
+`),
+TypeScriptBlock(`const datasets = await client.listDatasets();
+`),
]}
groupId="client-language"
/>
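If you already know which dataset you want, a direct lookup can be simpler than listing; a hypothetical sketch using the Python client's `read_dataset`:

```python
# Hypothetical: fetch a single dataset by exact name instead of listing all.
dataset = client.read_dataset(dataset_name="Elementary Animal Questions")
```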
@@ -296,7 +330,8 @@ You can filter datasets by type. Below is an example querying for chat datasets.

<CodeTabs
tabs={[
-PythonBlock(`datasets = client.list_datasets(data_type="chat")`),
+PythonBlock(`datasets = client.list_datasets(data_type="chat")
+`),
TypeScriptBlock(
`const datasets = await client.listDatasets({dataType: "chat"});`
),
@@ -345,18 +380,19 @@ You can also list multiple examples all by ID.
<CodeTabs
tabs={[
PythonBlock(`example_ids = [
-'734fc6a0-c187-4266-9721-90b7a025751a',
-'d6b4c1b9-6160-4d63-9b61-b034c585074f',
-'4d31df4e-f9c3-4a6e-8b6c-65701c2fed13',
+"734fc6a0-c187-4266-9721-90b7a025751a",
+"d6b4c1b9-6160-4d63-9b61-b034c585074f",
+"4d31df4e-f9c3-4a6e-8b6c-65701c2fed13",
]
-examples = client.list_examples(example_ids=example_ids)`),
-TypeScriptBlock(`
-const exampleIds = [
+examples = client.list_examples(example_ids=example_ids)
+`),
+TypeScriptBlock(`const exampleIds = [
"734fc6a0-c187-4266-9721-90b7a025751a",
"d6b4c1b9-6160-4d63-9b61-b034c585074f",
"4d31df4e-f9c3-4a6e-8b6c-65701c2fed13",
];
-const examples = await client.listExamples({exampleIds: exampleIds});`),
+const examples = await client.listExamples({ exampleIds: exampleIds });
+`),
]}
groupId="client-language"
/>
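`list_examples` returns an iterator of `Example` objects; a short usage sketch continuing from the Python snippet above:

```python
# Continuing from the snippet above: inspect the fetched examples.
for example in client.list_examples(example_ids=example_ids):
    print(example.id, example.inputs, example.outputs)
```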