From b412da0417bab09dbfed74c98fa52d4426d82609 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Thu, 19 Dec 2024 15:46:18 -0800 Subject: [PATCH 1/2] draft --- docs/evaluation/how_to_guides/summary.mdx | 30 ++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/docs/evaluation/how_to_guides/summary.mdx b/docs/evaluation/how_to_guides/summary.mdx index 25116b7d..7e9adbe7 100644 --- a/docs/evaluation/how_to_guides/summary.mdx +++ b/docs/evaluation/how_to_guides/summary.mdx @@ -8,7 +8,9 @@ import { Some metrics can only be defined on the entire experiment level as opposed to the individual runs of the experiment. For example, you may want to compute the overall pass rate or f1 score of your evaluation target across all examples in the dataset. -These are called `summary_evaluators`. Instead of taking in a single `Run` and `Example`, these evaluators take a list of each. +These are called `summary_evaluators`. + +## Basic example Below, we'll implement a very simple summary evaluator that computes overall pass rate: @@ -47,7 +49,7 @@ You can then pass this evaluator to the `evaluate` method as follows: ls_client = Client() dataset = ls_client.clone_public_dataset( - "https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d + "https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d" ) def bad_classifier(inputs: dict) -> dict: @@ -73,7 +75,7 @@ You can then pass this evaluator to the `evaluate` method as follows: const client = new Client(); const datasetName = "Toxic queries"; const dataset = await client.clonePublicDataset( - "https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d, + "https://smith.langchain.com/public/3d6831e6-1680-4c88-94df-618c8e01fc55/d", { datasetName: datasetName } ); @@ -100,3 +102,25 @@ You can then pass this evaluator to the `evaluate` method as follows: In the LangSmith UI, you'll the summary evaluator's score displayed with the corresponding key. 
![](./static/summary_eval.png) + +## Summary evaluator args + +Summary evaluator functions must have specific argument names. They can take any subset of the following arguments: + +- `inputs: list[dict]`: A list of the inputs for each example in the dataset. +- `outputs: list[dict]`: A list of the dict outputs produced by the experiment on the given inputs. +- `reference_outputs/referenceOutputs: list[dict]`: A list of the reference outputs associated with each example, if available. +- `runs: list[Run]`: A list of the full [Run](/reference/data_formats/run_data_format) objects generated by the experiment on the given examples. Use this if you need access to intermediate steps or metadata about each run. +- `examples: list[Example]`: All of the dataset [Example](/reference/data_formats/example_data_format) objects, including the example inputs, outputs (if available), and metadata (if available). + +## Summary evaluator output + +Summary evaluators are expected to return one of the following types: + +Python and JS/TS + +- `dict`: dicts of the form `{"score": ..., "key": ...}` allow you to pass a numeric or boolean score and metric key. + +Currently Python only + +- `int | float | bool`: this is interpreted as a continuous metric that can be averaged, sorted, etc. The function name is used as the name of the metric. \ No newline at end of file From ac1d58af71aa4b04f81806049be399b6e4ad8f12 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Fri, 20 Dec 2024 11:40:09 -0800 Subject: [PATCH 2/2] change basic example --- docs/evaluation/how_to_guides/summary.mdx | 56 +++++++++++++++++++---- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/docs/evaluation/how_to_guides/summary.mdx b/docs/evaluation/how_to_guides/summary.mdx index 7e9adbe7..ad6272f3 100644 --- a/docs/evaluation/how_to_guides/summary.mdx +++ b/docs/evaluation/how_to_guides/summary.mdx @@ -12,28 +12,64 @@ These are called `summary_evaluators`.
## Basic example -Below, we'll implement a very simple summary evaluator that computes overall pass rate: +Here, we'll compute the f1-score, which is a combination of precision and recall. + +This sort of metric can only be computed over all of the examples in our experiment, so our evaluator takes in a list of outputs, and a list of reference_outputs. bool: - """Pass if >50% of all results are correct.""" - correct = sum([out["class"] == ref["label"] for out, ref in zip(outputs, reference_outputs)]) - return correct / len(outputs) > 0.5 + def f1_score_summary_evaluator(outputs: list[dict], reference_outputs: list[dict]) -> dict: + true_positives = 0 + false_positives = 0 + false_negatives = 0 + for output_dict, reference_output_dict in zip(outputs, reference_outputs): + output = output_dict["class"] + reference_output = reference_output_dict["class"] + if output == "Toxic" and reference_output == "Toxic": + true_positives += 1 + elif output == "Toxic" and reference_output == "Not toxic": + false_positives += 1 + elif output == "Not toxic" and reference_output == "Toxic": + false_negatives += 1 + + if true_positives == 0: + return {"key": "f1_score", "score": 0.0} + + precision = true_positives / (true_positives + false_positives) + recall = true_positives / (true_positives + false_negatives) + f1_score = 2 * (precision * recall) / (precision + recall) + return {"key": "f1_score", "score": f1_score} `, typescript` - function summaryEval({ outputs, referenceOutputs }: { outputs: Record[], referenceOutputs?: Record[]}) { - let correct = 0; + function f1ScoreSummaryEvaluator({ outputs, referenceOutputs }: { outputs: Record[], referenceOutputs: Record[] }) { + let truePositives = 0; + let falsePositives = 0; + let falseNegatives = 0; for (let i = 0; i < outputs.length; i++) { - if (outputs[i]["output"] === referenceOutputs[i]["label"]) { - correct += 1; + const output = outputs[i]["class"]; + const referenceOutput = referenceOutputs[i]["class"]; + + if (output === 
"Toxic" && referenceOutput === "Toxic") { + truePositives += 1; + } else if (output === "Toxic" && referenceOutput === "Not toxic") { + falsePositives += 1; + } else if (output === "Not toxic" && referenceOutput === "Toxic") { + falseNegatives += 1; } } - return { key: "pass", score: correct / outputs.length > 0.5 }; + if (truePositives === 0) { + return { key: "f1_score", score: 0.0 }; + } + + const precision = truePositives / (truePositives + falsePositives); + const recall = truePositives / (truePositives + falseNegatives); + const f1Score = 2 * (precision * recall) / (precision + recall); + + return { key: "f1_score", score: f1Score }; } `, ]}