
# How to evaluate an existing experiment (Python only)

:::info
Evaluation of existing experiments is currently only supported in the Python SDK.
:::

If you have already run an experiment and want to add additional evaluation metrics, you
can apply any evaluators to the experiment using the `evaluate()` / `aevaluate()` methods.

```python
from langsmith import evaluate

def always_half(inputs: dict, outputs: dict) -> float:
    return 0.5

experiment_name = "my-experiment:abc"  # Replace with an actual experiment name or ID
evaluate(experiment_name, evaluators=[always_half])
```

Suppose you are evaluating a semantic router. You may first run an experiment:

```python
from langsmith import evaluate

def semantic_router(inputs: dict):
    return {"class": 1}

def accuracy(outputs: dict, reference_outputs: dict) -> bool:
    prediction = outputs["class"]
    expected = reference_outputs["label"]
    return prediction == expected

results = evaluate(
    semantic_router,
    data="Router Classification Dataset",
    evaluators=[accuracy],
)
experiment_name = results.experiment_name
```
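The example above assumes a dataset named "Router Classification Dataset" whose examples pair an input for the router with a reference `label` in the outputs. As a rough sketch of how such a dataset might be created (the input fields and label values here are illustrative, and your SDK version may expose a slightly different `create_examples` signature):

```python
from langsmith import Client

client = Client()

# Create the dataset that the experiment above runs against.
dataset = client.create_dataset("Router Classification Dataset")

# Each example pairs a router input with the expected class label.
client.create_examples(
    inputs=[{"text": "What's the weather today?"}, {"text": "Book me a flight"}],
    outputs=[{"label": 1}, {"label": 0}],
    dataset_id=dataset.id,
)
```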

Later, you realize you want to add precision and recall summary metrics. You can rerun `evaluate()`, this time passing the existing experiment in place of a target function,
which allows you to add both instance-level `evaluators` and aggregate `summary_evaluators`.

```python
from langsmith import evaluate

# Note that summary evaluators take lists of dicts as inputs instead of single dicts.
def precision_recall(outputs: list[dict], reference_outputs: list[dict]) -> list[dict]:
    true_positives = sum([ref["label"] == 1 and out["class"] == 1 for out, ref in zip(outputs, reference_outputs)])
    predicted_positives = len([out for out in outputs if out["class"] == 1])
    actual_positives = len([ref for ref in reference_outputs if ref["label"] == 1])
    return [
        {"score": true_positives / predicted_positives, "key": "precision"},
        {"score": true_positives / actual_positives, "key": "recall"},
    ]

evaluate(experiment_name, summary_evaluators=[precision_recall])
```

The precision and recall metrics will now be available in the LangSmith UI for the `experiment_name` experiment.
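Because the existing experiment is simply passed as the target of `evaluate()`, you can also attach instance-level and summary evaluators in a single call. A minimal sketch, reusing the `accuracy` and `precision_recall` evaluators defined above:

```python
from langsmith import evaluate

# Add per-example accuracy scores and aggregate precision/recall in one pass
# over the already-run experiment.
evaluate(
    experiment_name,
    evaluators=[accuracy],
    summary_evaluators=[precision_recall],
)
```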

There is also an asynchronous `aevaluate()` function that accepts the same arguments and can be used to evaluate existing experiments asynchronously.
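For example, a minimal async sketch, reusing `experiment_name` and `precision_recall` from above (this assumes your installed SDK version exports `aevaluate` at the top level and accepts an experiment name as its target, as `evaluate` does):

```python
import asyncio

from langsmith import aevaluate

async def main() -> None:
    # Same call shape as evaluate(), but awaited.
    await aevaluate(experiment_name, summary_evaluators=[precision_recall])

asyncio.run(main())
```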
