From 3e336dc0619d96e373bf499dfd2f2f00593cad0e Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Sun, 5 Nov 2023 21:54:36 +0100 Subject: [PATCH 01/54] add endpoint --- athena/athena/endpoints.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/athena/athena/endpoints.py b/athena/athena/endpoints.py index 8e9d4dcc5..eac88656f 100644 --- a/athena/athena/endpoints.py +++ b/athena/athena/endpoints.py @@ -2,7 +2,7 @@ import inspect from fastapi import Depends, BackgroundTasks from pydantic import BaseModel, ValidationError -from typing import TypeVar, Callable, List, Union, Any, Coroutine, Type +from typing import TypeVar, Callable, Dict, List, Union, Any, Coroutine, Type from athena.app import app from athena.authenticate import authenticated @@ -358,4 +358,12 @@ def config_schema_provider(cls: Type[C]) -> Type[C]: async def wrapper(): return cls.schema() - return cls \ No newline at end of file + return cls + +def evaluation_provider(func: Union[ + Callable[[E, S, List[F], List[F]], Dict[int, Any]], + Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Dict[int, Any]]], + Callable[[E, S, List[F], List[F], C], Dict[int, Any]], + Callable[[E, S, List[F], List[F], C], Coroutine[Any, Any, Dict[int, Any]]] +]): + pass \ No newline at end of file From 2fb28f00182568feab5ca78605c10e13b61deeea Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Sun, 5 Nov 2023 22:09:19 +0100 Subject: [PATCH 02/54] add evaluation_provider --- athena/athena/endpoints.py | 64 ++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/athena/athena/endpoints.py b/athena/athena/endpoints.py index eac88656f..a762a37d0 100644 --- a/athena/athena/endpoints.py +++ b/athena/athena/endpoints.py @@ -2,7 +2,7 @@ import inspect from fastapi import Depends, BackgroundTasks from pydantic import BaseModel, ValidationError -from typing import TypeVar, Callable, Dict, List, Union, Any, Coroutine, Type +from typing import TypeVar, Callable, List, Union, Any, Coroutine, Type from athena.app import app from athena.authenticate import authenticated @@ -361,9 +361,61 @@ async def wrapper(): return cls def evaluation_provider(func: Union[ - Callable[[E, S, List[F], List[F]], Dict[int, Any]], - Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Dict[int, Any]]], - Callable[[E, S, List[F], List[F], C], Dict[int, Any]], - Callable[[E, S, List[F], List[F], C], Coroutine[Any, Any, Dict[int, Any]]] + Callable[[E, S, List[F], List[F]], Any], + Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Any]] ]): - pass \ No newline at end of file + """ + Provide evaluated feedback to the Assessment Module Manager. + + Note: The evaluation provider is usually called during the research and development phase (by the Playground). + Return arbitrary evaluation results. + + This decorator can be used with several types of functions: synchronous or asynchronous. + + Examples: + Below are some examples of possible functions that you can decorate with this decorator: + + Without using module config (both synchronous and asynchronous forms): + >>> @evaluation_provider + ... def sync_evaluate_feedback( + ... exercise: Exercise, submission: Submission, + ... true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback] + ... ) -> Any: + ... 
# evaluate predicted feedback here and return evaluation results + + >>> @feedback_provider + ... async def async_evaluate_feedback( + ... exercise: Exercise, submission: Submission, + ... true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback] + ... ) -> Any: + ... # evaluate predicted feedback here and return evaluation results + """ + exercise_type = inspect.signature(func).parameters["exercise"].annotation + submission_type = inspect.signature(func).parameters["submission"].annotation + feedback_type = inspect.signature(func).parameters["predicted_feedbacks"].annotation.__args__[0] + + @app.post("/evaluation", responses=module_responses) + @authenticated + @with_meta + async def wrapper( + exercise: exercise_type, + submission: submission_type, + true_feedbacks: List[feedback_type], + predicted_feedbacks: List[feedback_type], + ): + # Retrieve existing metadata for the exercise, submission and feedback + exercise.meta.update(get_stored_exercise_meta(exercise) or {}) + submission.meta.update(get_stored_submission_meta(submission) or {}) + for feedback in true_feedbacks: + feedback.meta.update(get_stored_feedback_meta(feedback) or {}) + for feedback in predicted_feedbacks: + feedback.meta.update(get_stored_feedback_meta(feedback) or {}) + + # Call the actual provider + if inspect.iscoroutinefunction(func): + evaluation = await func(exercise, submission, true_feedbacks, predicted_feedbacks) + else: + evaluation = func(exercise, submission, true_feedbacks, predicted_feedbacks) + + return evaluation + return wrapper \ No newline at end of file From 38f6055373b7fb24a34749b3fcd6f0e46c6fc273 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Sun, 5 Nov 2023 22:09:27 +0100 Subject: [PATCH 03/54] add new line --- athena/athena/endpoints.py | 1 + 1 file changed, 1 insertion(+) diff --git a/athena/athena/endpoints.py b/athena/athena/endpoints.py index a762a37d0..411edaf0a 100644 --- a/athena/athena/endpoints.py +++ b/athena/athena/endpoints.py @@ -360,6 +360,7 @@ async def wrapper(): return cls + def evaluation_provider(func: Union[ Callable[[E, S, List[F], List[F]], Any], Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Any]] From 92ca4ed74f8e77cbe3dc349fa691f74c2b7b8f28 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Sun, 5 Nov 2023 22:10:28 +0100 Subject: [PATCH 04/54] add evaluation_provider to export --- athena/athena/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/athena/athena/__init__.py b/athena/athena/__init__.py index 90fb46e62..8a67e5315 100644 --- a/athena/athena/__init__.py +++ b/athena/athena/__init__.py @@ -6,7 +6,7 @@ from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction from .metadata import emit_meta, get_meta from .experiment import get_experiment_environment -from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider # type: ignore +from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider, evaluation_provider # type: ignore @app.get("/") @@ -28,6 +28,7 @@ def run_module(): "feedback_consumer", "feedback_provider", "config_schema_provider", + "evaluation_provider", "emit_meta", "get_meta", "get_experiment_environment", From 433cd7fda70dbe38890797a33bed5b475632b572 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Sun, 5 Nov 2023 22:25:35 +0100 Subject: [PATCH 05/54] add example evaluation endpoint --- module_example/module_example/__main__.py | 30 +++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/module_example/module_example/__main__.py b/module_example/module_example/__main__.py index 7bdef91fe..182f160bb 100644 --- a/module_example/module_example/__main__.py +++ b/module_example/module_example/__main__.py @@ -1,10 +1,11 @@ """ Entry point for the module_example module. """ -from typing import List +import random +from typing import List, Any from pydantic import BaseModel, Field -from athena import app, config_schema_provider, submissions_consumer, submission_selector, feedback_consumer, feedback_provider, emit_meta +from athena import app, config_schema_provider, submissions_consumer, submission_selector, feedback_consumer, feedback_provider, evaluation_provider, emit_meta from athena.programming import Exercise, Submission, Feedback from athena.logger import logger from athena.storage import store_exercise, store_submissions, store_feedback @@ -139,5 +140,30 @@ def suggest_feedback(exercise: Exercise, submission: Submission, module_config: ] +# Only if it makes sense for a module (Optional) +@evaluation_provider +def evaluate_feedback(exercise: Exercise, submission: Submission, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]) -> Any: + logger.info( + "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks", + submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks) + ) + + # Do something with the true and predicted feedback and return the evaluation result + # Generate some example evaluation result + evaluation_results = [] + true_feedback_embeddings = [random.random() for _ in true_feedbacks] + predicted_feedback_embeddings = [random.random() for _ in predicted_feedbacks] + for feedback, embedding in 
zip(predicted_feedbacks, predicted_feedback_embeddings): + feedback_evaluation = { + "feedback_id": feedback.id, + "embedding": embedding, + "has_match": len([t for t in true_feedback_embeddings if abs(t - embedding) < 0.1]) > 0, + "correctness": random.random() + } + evaluation_results.append(feedback_evaluation) + + return evaluation_results + + if __name__ == "__main__": app.start() From 9b4e2c97367cc8bfb8fe1b35caa8c0ae45f29bda Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Sun, 5 Nov 2023 22:44:22 +0100 Subject: [PATCH 06/54] add playground ui --- .../view_mode/module_requests/index.tsx | 7 +- .../module_requests/request_evaluation.tsx | 172 ++++++++++++++++++ .../src/hooks/athena/request_evaluation.ts | 31 ++++ 3 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 playground/src/components/view_mode/module_requests/request_evaluation.tsx create mode 100644 playground/src/hooks/athena/request_evaluation.ts diff --git a/playground/src/components/view_mode/module_requests/index.tsx b/playground/src/components/view_mode/module_requests/index.tsx index dca29946d..dcda17bf2 100644 --- a/playground/src/components/view_mode/module_requests/index.tsx +++ b/playground/src/components/view_mode/module_requests/index.tsx @@ -3,12 +3,14 @@ import type { ModuleMeta } from "@/model/health_response"; import { useState } from "react"; import { ModuleProvider } from "@/hooks/module_context"; +import ModuleAndConfigSelect from "@/components/selectors/module_and_config_select"; import GetConfigSchema from "@/components/view_mode/module_requests/get_config_schema"; import SendSubmissions from "@/components/view_mode/module_requests/send_submissions"; +import SelectSubmission from "@/components/view_mode/module_requests/request_submission_selection"; import SendFeedbacks from "@/components/view_mode/module_requests/send_feedbacks"; import RequestFeedbackSuggestions from "@/components/view_mode/module_requests/request_feedback_suggestions"; -import 
SelectSubmission from "@/components/view_mode/module_requests/request_submission_selection"; -import ModuleAndConfigSelect from "@/components/selectors/module_and_config_select"; +import RequestEvaluation from "@/components/view_mode/module_requests/request_evaluation"; + export default function ModuleRequests() { const [moduleAndConfig, setModuleAndConfig] = useState<{ module: ModuleMeta; moduleConfig: any } | undefined>(undefined); @@ -34,6 +36,7 @@ export default function ModuleRequests() { + )} diff --git a/playground/src/components/view_mode/module_requests/request_evaluation.tsx b/playground/src/components/view_mode/module_requests/request_evaluation.tsx new file mode 100644 index 000000000..c98ac6d27 --- /dev/null +++ b/playground/src/components/view_mode/module_requests/request_evaluation.tsx @@ -0,0 +1,172 @@ +import type { Submission } from "@/model/submission"; +import type { Exercise } from "@/model/exercise"; +import type { Feedback } from "@/model/feedback"; +import type ModuleResponse from "@/model/module_response"; + +import { useEffect, useState } from "react"; + +import { useModule } from "@/hooks/module_context"; +import { useBaseInfo } from "@/hooks/base_info_context"; +import useRequestEvaluation from "@/hooks/athena/request_evaluation"; +import useFeedbacks from "@/hooks/playground/feedbacks"; + +import ExerciseSelect from "@/components/selectors/exercise_select"; +import SubmissionSelect from "@/components/selectors/submission_select"; +import ModuleResponseView from "@/components/module_response_view"; +import Disclosure from "@/components/disclosure"; +import ExerciseDetail from "@/components/details/exercise_detail"; +import SubmissionDetail from "@/components/details/submission_detail"; + +export default function RequestEvaluation() { + const { module } = useModule(); + const { dataMode } = useBaseInfo(); + + const [exercise, setExercise] = useState(undefined); + const [submission, setSubmission] = useState( + undefined + ); + + const 
[predictedFeedbacks, setPredictedFeedbacks] = useState([]); + + const { + data: trueFeedbacks, + isLoading: isLoadingTrueFeedbacks, + error: errorTrueFeedbacks, + } = useFeedbacks(exercise, submission); + + const { + data: response, + isLoading, + error, + mutate, + reset, + } = useRequestEvaluation(); + + useEffect(() => setExercise(undefined), [module, dataMode]); + + return ( +
+

+ Request Evaluation from Athena{" "} + (OPTIONAL) +

+

+ Evaluate a list of feedback suggestions during the research and + development phase. Compare the predicted feedback with the actual + feedback using the function annotated with{" "} + @evaluation_provider. Each module can implement custom + metrics to evaluate the feedback suggestions during evaluation and + respond with arbitrary evaluation results. +

+ { + setExercise(exercise); + reset(); + setSubmission(undefined); + setPredictedFeedbacks([]); + }} + disabled={isLoading} + /> + {exercise && ( + <> + { + setSubmission(submission); + setPredictedFeedbacks([]); + }} + disabled={isLoading} + /> +
+ + {submission && + (trueFeedbacks ? ( + +

+ The following feedbacks given by the tutor in the past. +

+ f.submission_id === submission.id + )} + /> +
+ ) : ( +
+ No true feedbacks available +
+ ))} + {submission && ( + +

+ Provide feedback as predicted feedbacks to + test the evaluation. +

+ f.submission_id === submission.id + )} + onFeedbacksChange={setPredictedFeedbacks} + /> +
+ )} + {isLoadingTrueFeedbacks && ( +
Loading feedbacks...
+ )} + {errorTrueFeedbacks && ( +
+ Failed to load feedbacks +
+ )} +
+ + )} + + +
+ ); +} diff --git a/playground/src/hooks/athena/request_evaluation.ts b/playground/src/hooks/athena/request_evaluation.ts new file mode 100644 index 000000000..d6ccc9f60 --- /dev/null +++ b/playground/src/hooks/athena/request_evaluation.ts @@ -0,0 +1,31 @@ +import type { Exercise } from "@/model/exercise"; +import type { Submission } from "@/model/submission"; +import type ModuleResponse from "@/model/module_response"; + +import { UseMutationOptions, useMutation } from "react-query"; +import { AthenaError, useAthenaFetcher } from "@/hooks/athena_fetcher"; +import { Feedback } from "@/model/feedback"; + +/** + * Requests an evaluation for an exercise and a submission given the true and predicted feedbacks from an Athena module. + * + * @example + * const { data, isLoading, error, mutate } = useRequestEvaluation(); + * mutate({ exercise, submission, trueFeedbacks, predictedFeedbacks }); + * + * @param options The react-query options. + */ +export default function useRequestEvaluation( + options: Omit< + UseMutationOptions, + "mutationFn" + > = {} +) { + const athenaFetcher = useAthenaFetcher(); + return useMutation({ + mutationFn: async ({ exercise, submission, trueFeedbacks, predictedFeedbacks }) => { + return await athenaFetcher("/evaluation", { exercise, submission, true_feedbacks: trueFeedbacks, predicted_feedbacks: predictedFeedbacks }); + }, + ...options, + }); +} From 5f39b8c6d2c28ed945a61f6a24ee33f7900f9688 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Mon, 6 Nov 2023 16:22:16 +0100 Subject: [PATCH 07/54] add automatic evaluation --- .../conduct_experiment/index.tsx | 17 +-- .../src/hooks/batch_module_experiment.ts | 108 +++++++++++++++++- playground/src/model/automatic_evaluation.ts | 3 + 3 files changed, 117 insertions(+), 11 deletions(-) create mode 100644 playground/src/model/automatic_evaluation.ts diff --git a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/index.tsx b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/index.tsx index f91cc7b3e..3c78e1ecb 100644 --- a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/index.tsx +++ b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/index.tsx @@ -65,6 +65,12 @@ export default function ConductExperiment({ data: data.manualRatings, }); } + if (data.automaticEvaluation) { + files.push({ + name: `${experiment.exerciseType}_automatic_evaluation_${moduleConfigurations[index].name}_${experiment.id}_run-${data.automaticEvaluation.runId}`, + data: data.automaticEvaluation, + }); + } } return files; }) @@ -102,14 +108,11 @@ export default function ConductExperiment({ return; } - if ( - !data.type || - (data.type !== "results" && data.type !== "manualRatings") - ) { - alert("No correct type found in the data i.e. 'results' or 'manualRatings'"); + if (!data.type || !["results", "manualRatings", "automaticEvaluation"].includes(data.type)) { + alert("No correct type found in the data i.e. 
'results', 'manualRatings', or 'automaticEvaluation'."); return; } - const type = data.type as "results" | "manualRatings"; + const type = data.type as "results" | "manualRatings" | "automaticEvaluation"; try { moduleViewRef.importData(data); @@ -209,7 +212,7 @@ export default function ConductExperiment({ // If all files have been read, sort and import if (filesProcessed === files.length) { - // Sort the array by 'type', 'results' first and then 'manualRatings' + // Sort the array by 'type', 'results' first and then 'manualRatings' or 'automaticEvaluation' const sortedData = fileDataArray.sort((a, b) => { if (a.type === "results" && b.type !== "results") { return -1; diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index ba99982c2..a2ff10fb0 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ b/playground/src/hooks/batch_module_experiment.ts @@ -1,5 +1,6 @@ import type { Feedback } from "@/model/feedback"; import type { ManualRating } from "@/model/manual_rating"; +import type { AutomaticEvaluation } from "@/model/automatic_evaluation"; import type { Experiment } from "@/components/view_mode/evaluation_mode/define_experiment"; import type { ModuleConfiguration } from "@/components/view_mode/evaluation_mode/configure_modules"; @@ -9,6 +10,7 @@ import { useSendFeedbacks } from "./athena/send_feedbacks"; import useRequestSubmissionSelection from "./athena/request_submission_selection"; import useRequestFeedbackSuggestions from "./athena/request_feedback_suggestions"; import useSendSubmissions from "./athena/send_submissions"; +import useRequestEvaluation from "./athena/request_evaluation"; import { useExperimentIdentifiersSetRunId } from "./experiment_identifiers_context"; export type ExperimentStep = @@ -50,6 +52,11 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC Map >(new Map()); + // Stores automatic evaluation of submissions + const 
[submissionsWithAutomaticEvaluation, setSubmissionsWithAutomaticEvaluation] = useState< + Map + >(new Map()); + const [processingStep, setProcessingStep] = useState< ExperimentStep | undefined >(undefined); @@ -95,6 +102,19 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC }, } : {} ), + ...( + submissionsWithAutomaticEvaluation.size > 0 ? { + automaticEvaluation: { + type: "automaticEvaluation", + runId: data.runId, + experimentId: experiment.id, + moduleConfigurationId: moduleConfiguration.id, + submissionsWithAutomaticEvaluation: Object.fromEntries( + submissionsWithAutomaticEvaluation + ), + }, + } : {} + ), }; }; @@ -108,6 +128,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC throw new Error("Invalid results data"); } + setProcessingStep(undefined); setData(() => ({ runId: importedData.runId, step: importedData.step, @@ -134,7 +155,22 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC ) )); return; + } else if (importedData.type === "automaticEvaluation") { + // Relies on the fact that the automatic evaluations have to be imported after the results + if (importedData.runId !== data.runId) { + throw new Error("Run ID does not match, have you imported the results first?"); + } + if (importedData.submissionsWithAutomaticEvaluation === undefined) { + throw new Error("Invalid automatic evaluation data"); + } + setSubmissionsWithAutomaticEvaluation(() => new Map( + Object.entries(importedData.submissionsWithAutomaticEvaluation).map( + ([key, value]) => [Number(key), value as any] + ) + )); + return; } + throw new Error("Unknown import data type"); }; @@ -158,6 +194,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC const sendFeedbacks = useSendFeedbacks(); const requestSubmissionSelection = useRequestSubmissionSelection(); const requestFeedbackSuggestions = useRequestFeedbackSuggestions(); + const requestEvaluation = 
useRequestEvaluation(); // 1. Send submissions to Athena const stepSendSubmissions = () => { @@ -338,10 +375,70 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC setData((prevState) => ({ ...prevState, - step: "finished", + step: "finished", // Automatic evaluation is done separately })); }; + // 4. Automatic evaluation (after results are 'finished') + const stepAutomaticEvaluation = async () => { + setProcessingStep("finished"); + + console.log("Running automatic evaluation..."); + + let remainingSubmissions = experiment.evaluationSubmissions.filter( + (submission) => !submissionsWithAutomaticEvaluation.has(submission.id) + ); + + let index = 0; + for (const submission of remainingSubmissions) { + console.log( + `Evaluating... (${index + 1}/${ + remainingSubmissions.length + })` + ); + + const predictedFeedbacks = data.submissionsWithFeedbackSuggestions.get( + submission.id + )?.suggestions ?? []; + + if (predictedFeedbacks.length === 0) { + // Skip if there are no predicted feedbacks + setSubmissionsWithAutomaticEvaluation((prevState) => { + const newMap = new Map(prevState); + newMap.set(submission.id, {}); + return newMap; + }); + continue; + } + + try { + const response = await requestEvaluation.mutateAsync({ + exercise: experiment.exercise, + submission, + trueFeedbacks: experiment.tutorFeedbacks.filter( + (feedback) => feedback.submission_id === submission.id + ), + predictedFeedbacks: predictedFeedbacks, + }); + if (!isMounted.current) { + return; + } + console.log(`Received evaluation for submission ${submission.id}:`, response.data); + + setSubmissionsWithAutomaticEvaluation((prevState) => { + const newMap = new Map(prevState); + newMap.set(submission.id, response.data); + return newMap; + }); + } catch (error) { + console.error( + `Error while evaluating submission ${submission.id}:`, + error + ); + } + } + }; + useEffect(() => { isMounted.current = true; return () => { @@ -375,10 +472,12 @@ export default function 
useBatchModuleExperiment(experiment: Experiment, moduleC processingStep !== "generatingFeedbackSuggestions" ) { stepGenerateFeedbackSuggestions(); + } else if ( + data.step === "finished" && + processingStep !== "finished" + ) { + stepAutomaticEvaluation(); } - // TODO: Add automatic evaluation step here - // Note: Evaluate tutor feedback more globally to not do it multiple times - // Note 2: Actually, I probably want to have it in parallel with the feedback suggestions for the interactive mode! }, [data.step]); return { @@ -394,6 +493,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC sendFeedbacks, requestSubmissionSelection, requestFeedbackSuggestions, + requestEvaluation, }, }; } diff --git a/playground/src/model/automatic_evaluation.ts b/playground/src/model/automatic_evaluation.ts new file mode 100644 index 000000000..fb55b9c86 --- /dev/null +++ b/playground/src/model/automatic_evaluation.ts @@ -0,0 +1,3 @@ +export type AutomaticEvaluation = { + [module: string]: any; +}; From 2667cab2b270305becc9f3b054e40a5bfb2af81d Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Mon, 6 Nov 2023 17:22:55 +0100 Subject: [PATCH 08/54] add automatic evaluation --- .../endpoints/health_endpoint.py | 5 +- .../module/list_modules.py | 1 + .../module/module.py | 1 + assessment_module_manager/modules.ini | 5 ++ .../module_requests/request_evaluation.tsx | 4 +- .../src/hooks/athena/request_evaluation.ts | 62 ++++++++++++++++--- playground/src/hooks/athena_fetcher.ts | 42 ++++++++----- .../src/hooks/batch_module_experiment.ts | 11 +++- playground/src/hooks/module_context.tsx | 2 +- playground/src/model/health_response.ts | 1 + 10 files changed, 103 insertions(+), 31 deletions(-) diff --git a/assessment_module_manager/assessment_module_manager/endpoints/health_endpoint.py b/assessment_module_manager/assessment_module_manager/endpoints/health_endpoint.py index e1a8d252f..85879b673 100644 --- a/assessment_module_manager/assessment_module_manager/endpoints/health_endpoint.py +++ b/assessment_module_manager/assessment_module_manager/endpoints/health_endpoint.py @@ -27,6 +27,7 @@ class HealthResponse(BaseModel): """ Response indicating whether the Assessment Module Manager is healthy, and whether all the modules are healthy (i.e. reachable). + Additional information about the modules is also provided. 
""" status: str = Field(const=True, default="ok", example="ok") modules: dict = Field( @@ -35,7 +36,8 @@ class HealthResponse(BaseModel): "module_example": { "url": "http://localhost:5001", "type": "programming", - "healthy": True + "healthy": True, + "supportsEvaluation": True } } ] @@ -56,6 +58,7 @@ async def get_health() -> HealthResponse: "url": module.url, "type": module.type, "healthy": await is_healthy(module), + "supportsEvaluation": module.supports_evaluation } for module in get_modules() } diff --git a/assessment_module_manager/assessment_module_manager/module/list_modules.py b/assessment_module_manager/assessment_module_manager/module/list_modules.py index b2862e489..e9f18f106 100644 --- a/assessment_module_manager/assessment_module_manager/module/list_modules.py +++ b/assessment_module_manager/assessment_module_manager/module/list_modules.py @@ -18,6 +18,7 @@ def list_modules() -> List[Module]: name=module, url=cast(AnyHttpUrl, modules_config[module]["url"]), type=ExerciseType(modules_config[module]["type"]), + supports_evaluation=modules_config[module].getboolean("supports_evaluation"), ) for module in modules_config.sections() ] diff --git a/assessment_module_manager/assessment_module_manager/module/module.py b/assessment_module_manager/assessment_module_manager/module/module.py index 35dbb6da1..65e99931f 100644 --- a/assessment_module_manager/assessment_module_manager/module/module.py +++ b/assessment_module_manager/assessment_module_manager/module/module.py @@ -8,3 +8,4 @@ class Module(BaseModel): name: str = Field(example="module_example") url: AnyHttpUrl = Field(example="http://localhost:5001") type: ExerciseType = Field(example=ExerciseType.text) + supports_evaluation: bool = Field(description="Whether the module supports evaluation", example=True) diff --git a/assessment_module_manager/modules.ini b/assessment_module_manager/modules.ini index 0dde7b074..3402183f0 100644 --- a/assessment_module_manager/modules.ini +++ 
b/assessment_module_manager/modules.ini @@ -1,19 +1,24 @@ [module_example] url = http://localhost:5001 type = programming +supports_evaluation = true [module_programming_llm] url = http://localhost:5002 type = programming +supports_evaluation = false [module_text_llm] url = http://localhost:5003 type = text +supports_evaluation = false [module_text_cofee] url = http://localhost:5004 type = text +supports_evaluation = false [module_programming_themisml] url = http://localhost:5005 type = programming +supports_evaluation = false diff --git a/playground/src/components/view_mode/module_requests/request_evaluation.tsx b/playground/src/components/view_mode/module_requests/request_evaluation.tsx index c98ac6d27..f004708d6 100644 --- a/playground/src/components/view_mode/module_requests/request_evaluation.tsx +++ b/playground/src/components/view_mode/module_requests/request_evaluation.tsx @@ -40,7 +40,7 @@ export default function RequestEvaluation() { error, mutate, reset, - } = useRequestEvaluation(); + } = useRequestEvaluation(undefined, true) // onlyUseContextModule = true for module requests only useEffect(() => setExercise(undefined), [module, dataMode]); @@ -130,7 +130,7 @@ export default function RequestEvaluation() { )} diff --git a/playground/src/hooks/athena/request_evaluation.ts b/playground/src/hooks/athena/request_evaluation.ts index d6ccc9f60..76227258f 100644 --- a/playground/src/hooks/athena/request_evaluation.ts +++ b/playground/src/hooks/athena/request_evaluation.ts @@ -5,26 +5,74 @@ import type ModuleResponse from "@/model/module_response"; import { UseMutationOptions, useMutation } from "react-query"; import { AthenaError, useAthenaFetcher } from "@/hooks/athena_fetcher"; import { Feedback } from "@/model/feedback"; +import { useModule } from "@/hooks/module_context"; +import useHealth from "@/hooks/health"; /** - * Requests an evaluation for an exercise and a submission given the true and predicted feedbacks from an Athena module. 
+ * Requests an evaluation for an exercise and a submission given the true and predicted feedbacks from healthy Athena modules. + * + * @param options The react-query options. + * @param onlyUseContextModule - If true, only the context module is used for the evaluation. Otherwise, all healthy modules are used. * * @example * const { data, isLoading, error, mutate } = useRequestEvaluation(); * mutate({ exercise, submission, trueFeedbacks, predictedFeedbacks }); - * - * @param options The react-query options. */ export default function useRequestEvaluation( options: Omit< - UseMutationOptions, + UseMutationOptions< + ModuleResponse[] | undefined, + AthenaError, + { + exercise: Exercise; + submission: Submission; + trueFeedbacks: Feedback[]; + predictedFeedbacks: Feedback[]; + } + >, "mutationFn" - > = {} + > = {}, + onlyUseContextModule = false ) { const athenaFetcher = useAthenaFetcher(); + const { module: contextModule } = useModule(); + const { data: health } = useHealth(); + return useMutation({ - mutationFn: async ({ exercise, submission, trueFeedbacks, predictedFeedbacks }) => { - return await athenaFetcher("/evaluation", { exercise, submission, true_feedbacks: trueFeedbacks, predicted_feedbacks: predictedFeedbacks }); + mutationFn: async ({ + exercise, + submission, + trueFeedbacks, + predictedFeedbacks, + }) => { + const modules = onlyUseContextModule + ? [contextModule] + : Object.values(health?.modules ?? 
{}).filter( + (module) => module.healthy && module.type === contextModule.type + ); + + const results = await Promise.allSettled( + modules.map((module) => + athenaFetcher( + "/evaluation", + { + exercise, + submission, + true_feedbacks: trueFeedbacks, + predicted_feedbacks: predictedFeedbacks, + }, + { module: module, moduleConfig: undefined } + ) + ) + ); + + return results.flatMap((result) => { + if (result.status === "fulfilled") { + return [result.value]; + } else { + return []; + } + }); }, ...options, }); diff --git a/playground/src/hooks/athena_fetcher.ts b/playground/src/hooks/athena_fetcher.ts index 1f23f20ba..69b9a2278 100644 --- a/playground/src/hooks/athena_fetcher.ts +++ b/playground/src/hooks/athena_fetcher.ts @@ -1,4 +1,5 @@ import type ModuleResponse from "@/model/module_response"; +import type { Module } from "@/hooks/module_context"; import baseUrl from "@/helpers/base_url"; import { useBaseInfo } from "@/hooks/base_info_context"; @@ -36,27 +37,34 @@ export class AthenaError extends Error { * @returns A function that can be used to fetch data from the module or that returns undefined if the module is not set. 
*/ export function useAthenaFetcher() { - const { module, moduleConfig } = useModule(); + const { module: contextModule, moduleConfig: contextModuleConfig } = useModule(); const { athenaUrl, athenaSecret } = useBaseInfo(); const { experimentId, moduleConfigurationId, runId } = useExperimentIdentifiers(); - const headers: { [key: string]: string } = {}; - if (moduleConfig) { - headers["X-Module-Config"] = JSON.stringify(moduleConfig); - } - if (experimentId) { - headers["X-Experiment-ID"] = experimentId; - } - if (moduleConfigurationId) { - headers["X-Module-Configuration-ID"] = moduleConfigurationId; - } - if (runId) { - headers["X-Run-ID"] = runId; - } - return ( - async (moduleRoute: string, body?: any) => { - const url = `${athenaUrl}/modules/${module.type}/${module.name}${moduleRoute}`; + async (moduleRoute: string, body?: any, overrideModule?: Module) => { + let targetModule = contextModule; + let targetModuleConfig = contextModuleConfig; + if (overrideModule) { + targetModule = overrideModule.module; + targetModuleConfig = overrideModule.moduleConfig; + } + + const headers: { [key: string]: string } = {}; + if (targetModuleConfig) { + headers["X-Module-Config"] = JSON.stringify(targetModuleConfig); + } + if (experimentId) { + headers["X-Experiment-ID"] = experimentId; + } + if (moduleConfigurationId) { + headers["X-Module-Configuration-ID"] = moduleConfigurationId; + } + if (runId) { + headers["X-Run-ID"] = runId; + } + + const url = `${athenaUrl}/modules/${targetModule.type}/${targetModule.name}${moduleRoute}`; const response = await fetch( `${baseUrl}/api/athena_request?${new URLSearchParams({ url: url, diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index a2ff10fb0..7c6f9f68d 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ b/playground/src/hooks/batch_module_experiment.ts @@ -412,7 +412,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC } try 
{ - const response = await requestEvaluation.mutateAsync({ + const responses = await requestEvaluation.mutateAsync({ exercise: experiment.exercise, submission, trueFeedbacks: experiment.tutorFeedbacks.filter( @@ -423,11 +423,16 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC if (!isMounted.current) { return; } - console.log(`Received evaluation for submission ${submission.id}:`, response.data); + + const data = Object.fromEntries( + responses.map((response) => [response.module_name, response.data]) + ); + + console.log(`Received evaluation for submission ${submission.id}:`, data); setSubmissionsWithAutomaticEvaluation((prevState) => { const newMap = new Map(prevState); - newMap.set(submission.id, response.data); + newMap.set(submission.id, data); return newMap; }); } catch (error) { diff --git a/playground/src/hooks/module_context.tsx b/playground/src/hooks/module_context.tsx index c2a96232e..f7aab7666 100644 --- a/playground/src/hooks/module_context.tsx +++ b/playground/src/hooks/module_context.tsx @@ -2,7 +2,7 @@ import type { ModuleMeta } from '@/model/health_response'; import { ReactNode, createContext, useContext, useReducer } from 'react'; -type Module = { +export type Module = { module: ModuleMeta; moduleConfig: any; }; diff --git a/playground/src/model/health_response.ts b/playground/src/model/health_response.ts index 7502b21d9..4ffe27de9 100644 --- a/playground/src/model/health_response.ts +++ b/playground/src/model/health_response.ts @@ -2,6 +2,7 @@ export type ModuleMeta = { name: string; type: string; healthy: boolean; + supportsEvaluation: boolean; }; export type HealthResponse = { From 7dbb316f1a7b003bff65c24c38092b9c5f0c2451 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 16:34:34 +0100 Subject: [PATCH 09/54] add UI changes --- .../batch_module_experiment.tsx | 64 ++++-- .../module_experiment_progress.tsx | 193 +++++++++++------- .../src/hooks/batch_module_experiment.ts | 22 +- 3 files changed, 177 insertions(+), 102 deletions(-) diff --git a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/batch_module_experiment.tsx b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/batch_module_experiment.tsx index d2e9fdb02..560d8b204 100644 --- a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/batch_module_experiment.tsx +++ b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/batch_module_experiment.tsx @@ -2,7 +2,12 @@ import type { Submission } from "@/model/submission"; import type { Experiment } from "../define_experiment"; import type { ExperimentStep } from "@/hooks/batch_module_experiment"; -import React, { useImperativeHandle, useState, ForwardedRef, useEffect } from "react"; +import React, { + useImperativeHandle, + useState, + ForwardedRef, + useEffect, +} from "react"; import Modal from "react-modal"; import { FullScreenHandle } from "react-full-screen"; @@ -14,6 +19,7 @@ import { ModuleConfiguration } from "../configure_modules"; import ModuleExperimentProgress from "./module_experiment_progress"; import SubmissionDetail from "@/components/details/submission_detail"; import ModuleConfigSelect from "@/components/selectors/module_config_select"; +import { twMerge } from "tailwind-merge"; type ConductBatchModuleExperimentProps = { experiment: Experiment; @@ -53,7 +59,10 @@ const ConductBatchModuleExperiment = React.forwardRef< ref: ForwardedRef ) => { const { data: health } = useHealth(); - const moduleExperiment = useBatchModuleExperiment(experiment, moduleConfiguration); + const moduleExperiment = useBatchModuleExperiment( + experiment, + moduleConfiguration + ); const [showProgress, setShowProgress] = 
useState(true); const [isConfigModalOpen, setConfigModalOpen] = useState(false); @@ -88,14 +97,6 @@ const ConductBatchModuleExperiment = React.forwardRef<

{moduleConfiguration.name}

- {moduleExperiment.continueAfterTraining && ( - - )}
) { return ( - + { @@ -86,10 +86,11 @@ export default function ModuleExperimentProgress({ className={twMerge( "flex items-center justify-center w-6 h-6 border rounded-full shrink-0", stepToIndex(data.step) >= 2 - ? stepToIndex(data.step) > 2 || moduleExperiment.continueAfterTraining - ? "text-green-500 border-green-500" - : "text-yellow-500 border-yellow-500" - : "text-gray-500 border-gray-500" + ? stepToIndex(data.step) > 2 || + moduleExperiment.continueAfterTraining + ? "text-green-500 border-green-500" + : "text-yellow-500 border-yellow-500" + : "text-gray-500 border-gray-500" )} > 2 @@ -98,10 +99,11 @@ export default function ModuleExperimentProgress({ className={twMerge( "flex flex-col", stepToIndex(data.step) >= 2 - ? stepToIndex(data.step) > 2 || moduleExperiment.continueAfterTraining - ? "text-green-500" - : "text-yellow-500" - : "text-gray-500" + ? stepToIndex(data.step) > 2 || + moduleExperiment.continueAfterTraining + ? "text-green-500" + : "text-yellow-500" + : "text-gray-500" )} > Sending Training Feedback @@ -129,70 +131,121 @@ export default function ModuleExperimentProgress({ )} {/* Generate Feedback Suggestions */} -
  • - 3 - ? "text-green-500 border-green-500" - : stepToIndex(data.step) === 3 - ? "text-yellow-500 border-yellow-500" - : "text-gray-500 border-gray-500" - )} - > - {experiment.trainingSubmissions ? 3 : 2} - -
    3 - ? "text-green-500" - : stepToIndex(data.step) === 3 - ? "text-yellow-500" - : "text-gray-500" - )} - > - Generating Feedback Suggestions - {moduleRequests.requestFeedbackSuggestions.isLoading && ( - - Generating feedback suggestions... ( - {data.submissionsWithFeedbackSuggestions.size + 1}/ - {experiment.evaluationSubmissions.length}) - - )} - {moduleRequests.requestFeedbackSuggestions.isError && ( - - {moduleRequests.requestFeedbackSuggestions.error.message} - - )} - {moduleRequests.requestFeedbackSuggestions.isSuccess && ( - - Generated feedback suggestions ( - {data.submissionsWithFeedbackSuggestions.size}/ - {experiment.evaluationSubmissions.length}) - - )} +
  • +
    + 3 + ? "text-green-500 border-green-500" + : stepToIndex(data.step) === 3 + ? "text-yellow-500 border-yellow-500" + : "text-gray-500 border-gray-500" + )} + > + {experiment.trainingSubmissions ? 3 : 2} + +
    3 + ? "text-green-500" + : stepToIndex(data.step) === 3 + ? "text-yellow-500" + : "text-gray-500" + )} + > + Generating Feedback Suggestions + {moduleRequests.requestFeedbackSuggestions.isLoading && ( + + Generating feedback suggestions... ( + {data.submissionsWithFeedbackSuggestions.size + 1}/ + {experiment.evaluationSubmissions.length}) + + )} + {moduleRequests.requestFeedbackSuggestions.isError && ( + + {moduleRequests.requestFeedbackSuggestions.error.message} + + )} + {moduleRequests.requestFeedbackSuggestions.isSuccess && ( + + Generated feedback suggestions ( + {data.submissionsWithFeedbackSuggestions.size}/ + {experiment.evaluationSubmissions.length}) + + )} +
    + {moduleExperiment.continueAfterTraining && ( + + )}
  • -
  • - - {experiment.trainingSubmissions ? 4 : 3} - -
    - Finished + + {/* Run Automatic Evaluation */} +
  • +
    + + {experiment.trainingSubmissions ? 4 : 3} + +
    + Run Automatic Evaluation + {moduleRequests.requestEvaluation.isLoading && ( + + Evaluating submissions... ( + {(submissionsWithAutomaticEvaluation?.size ?? 0) + 1}/ + {experiment.evaluationSubmissions.length}) + + )} + {moduleRequests.requestEvaluation.isError && ( + + {moduleRequests.requestEvaluation.error.message} + + )} + {moduleRequests.requestEvaluation.isSuccess && ( + + Evaluated submissions ( + {submissionsWithAutomaticEvaluation?.size ?? 0}/ + {experiment.evaluationSubmissions.length}) + + )} +
    + {moduleExperiment.continueWithAutomaticEvaluation && ( + + )}
  • ); diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index 7c6f9f68d..89480a30a 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ b/playground/src/hooks/batch_module_experiment.ts @@ -54,8 +54,8 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC // Stores automatic evaluation of submissions const [submissionsWithAutomaticEvaluation, setSubmissionsWithAutomaticEvaluation] = useState< - Map - >(new Map()); + Map | undefined + >(undefined); const [processingStep, setProcessingStep] = useState< ExperimentStep | undefined @@ -103,7 +103,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC } : {} ), ...( - submissionsWithAutomaticEvaluation.size > 0 ? { + submissionsWithAutomaticEvaluation && submissionsWithAutomaticEvaluation.size > 0 ? { automaticEvaluation: { type: "automaticEvaluation", runId: data.runId, @@ -189,6 +189,10 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC })); }) : undefined; + const continueWithAutomaticEvaluation = (data.step === "finished" && submissionsWithAutomaticEvaluation === undefined) ? 
(() => { + stepAutomaticEvaluation(); + }) : undefined; + // Module requests const sendSubmissions = useSendSubmissions(); const sendFeedbacks = useSendFeedbacks(); @@ -386,7 +390,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC console.log("Running automatic evaluation..."); let remainingSubmissions = experiment.evaluationSubmissions.filter( - (submission) => !submissionsWithAutomaticEvaluation.has(submission.id) + (submission) => !submissionsWithAutomaticEvaluation?.has(submission.id) ); let index = 0; @@ -477,20 +481,18 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC processingStep !== "generatingFeedbackSuggestions" ) { stepGenerateFeedbackSuggestions(); - } else if ( - data.step === "finished" && - processingStep !== "finished" - ) { - stepAutomaticEvaluation(); - } + } + // Automatic evaluation is triggered manually }, [data.step]); return { data, submissionsWithManualRatings, + submissionsWithAutomaticEvaluation, getManualRatingsSetter, startExperiment, continueAfterTraining, + continueWithAutomaticEvaluation, exportData, importData, moduleRequests: { From 5053a4aa54d3d8767e370f4d365d5d3f0ef00b85 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 16:57:53 +0100 Subject: [PATCH 10/54] fix color --- .../conduct_experiment/module_experiment_progress.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/module_experiment_progress.tsx b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/module_experiment_progress.tsx index 345c9e67a..75d0e9964 100644 --- a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/module_experiment_progress.tsx +++ b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/module_experiment_progress.tsx @@ -197,7 +197,7 @@ export default function ModuleExperimentProgress({ submissionsWithAutomaticEvaluation?.size === data.submissionsWithFeedbackSuggestions.size ? "text-green-500 border-green-500" - : stepToIndex(data.step) === 4 + : stepToIndex(data.step) === 4 && submissionsWithAutomaticEvaluation !== undefined ? "text-yellow-500 border-yellow-500" : "text-gray-500 border-gray-500" )} @@ -211,7 +211,7 @@ export default function ModuleExperimentProgress({ submissionsWithAutomaticEvaluation?.size === data.submissionsWithFeedbackSuggestions.size ? "text-green-500" - : stepToIndex(data.step) === 4 + : stepToIndex(data.step) === 4 && submissionsWithAutomaticEvaluation !== undefined ? "text-yellow-500" : "text-gray-500" )} From e9bcb263309ecfb9a299dbfa706f72f2fd466ffd Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 17:18:41 +0100 Subject: [PATCH 11/54] add evaluation model --- module_text_llm/.env.example | 6 ++++++ .../module_text_llm/helpers/models/__init__.py | 12 +++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/module_text_llm/.env.example b/module_text_llm/.env.example index 2ecf0a8f4..aedc5bdba 100644 --- a/module_text_llm/.env.example +++ b/module_text_llm/.env.example @@ -14,6 +14,12 @@ DATABASE_URL=sqlite:///../data/data.sqlite # See below for options, available models are also logged on startup LLM_DEFAULT_MODEL="azure_openai_gpt-35" +# Enable LLM-as-a-judge approach 0 = disabled, 1 = enabled +LLM_ENABLE_LLM_AS_A_JUDGE=1 +# Evaluation model to use for the LLM-as-a-judge approach [Only important if you want to use it in the /evaluate endpoint] +# See below for options, available models are also logged on startup +LLM_EVALUATION_MODEL="azure_openai_gpt-4" + # Standard OpenAI (Non-Azure) [leave blank if not used] # Model names prefixed with `openai_` followed by the model name, e.g. 
`openai_text-davinci-003` # A list of models can be found in `module_text_llm/helpers/models/openai.py` (openai_models) diff --git a/module_text_llm/module_text_llm/helpers/models/__init__.py b/module_text_llm/module_text_llm/helpers/models/__init__.py index 4d2fe5a65..144bcf923 100644 --- a/module_text_llm/module_text_llm/helpers/models/__init__.py +++ b/module_text_llm/module_text_llm/helpers/models/__init__.py @@ -1,10 +1,16 @@ import os -from typing import Type, Union, List +from typing import Type, Union, List, Optional +from langchain.base_language import BaseLanguageModel + from module_text_llm.helpers.models.model_config import ModelConfig DefaultModelConfig: Type[ModelConfig] default_model_name = os.environ.get("LLM_DEFAULT_MODEL") +evaluation_model_name = os.environ.get("LLM_EVALUATION_MODEL") + +# Model used during evaluation for judging the output (should be a more powerful model) +evaluation_model: Optional[BaseLanguageModel] = None types: List[Type[ModelConfig]] = [] try: @@ -12,6 +18,8 @@ types.append(openai_config.OpenAIModelConfig) if default_model_name in openai_config.available_models: DefaultModelConfig = openai_config.OpenAIModelConfig + if evaluation_model_name in openai_config.available_models: + evaluation_model = openai_config.available_models[evaluation_model_name] except AttributeError: pass @@ -20,6 +28,8 @@ types.append(replicate_config.ReplicateModelConfig) if default_model_name in replicate_config.available_models: DefaultModelConfig = replicate_config.ReplicateModelConfig + if evaluation_model_name in replicate_config.available_models: + evaluation_model = replicate_config.available_models[evaluation_model_name] except AttributeError: pass From cabed57f55d9decb2fbad1a7bbba038e295ab55d Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
@evaluation_provider
async def evaluate_feedback(
    exercise: Exercise, submission: Submission,
    true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback],
) -> Any:
    """Evaluate predicted feedback against tutor ("true") feedback for one submission.

    Called through the Athena `/evaluation` endpoint (usually by the Playground
    during research/development runs).

    Args:
        exercise: The text exercise the submission belongs to.
        submission: The submission that was assessed.
        true_feedbacks: Feedback given by human tutors (ground truth).
        predicted_feedbacks: Feedback suggestions produced by this module.

    Returns:
        A dict of named evaluation results. Currently the only entry is
        "llm-as-a-judge" (added only when enabled via the environment), holding
        the per-feedback accuracy estimation from ``generate_evaluation``.
    """
    logger.info(
        "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks",
        submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks)
    )

    evaluation = {}
    # .env.example documents LLM_ENABLE_LLM_AS_A_JUDGE as "0 = disabled,
    # 1 = enabled". A plain bool(os.environ.get(...)) is wrong for that
    # contract: bool("0") is True, so setting the variable to 0 would still
    # enable the judge. Compare against the literal "1" instead.
    if os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE", "0") == "1":
        evaluation["llm-as-a-judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks)

    return evaluation
+ + +async def generate_evaluation( + exercise: Exercise, + submission: Submission, + true_feedbacks: List[Feedback], + predicted_feedbacks: List[Feedback] +) -> Dict[int, dict]: + + if evaluation_model is None: + raise EnvironmentError("No evaluation model available, please set up LLM_EVALUATION_MODEL correctly" + "by setting it to one of the available models logged during startup.") + max_input_tokens = 3000 + + def feedback_to_dict(feedback: Feedback): + line_start, line_end = get_line_range_from_index_range( + feedback.index_start, feedback.index_end, submission.text) + return { + "id": feedback.id, + "title": feedback.title, + "description": feedback.description, + "line_start": line_start, + "line_end": line_end, + "credits": feedback.credits + } + + prompt_input = { + "submission": add_sentence_numbers(submission.text), + "true_feedbacks": json.dumps([feedback_to_dict(feedback) for feedback in true_feedbacks]), + "predicted_feedbacks": json.dumps([feedback_to_dict(feedback) for feedback in predicted_feedbacks]), + } + + chat_prompt = get_chat_prompt_with_formatting_instructions( + model=evaluation_model, + system_message=system_message, + human_message=human_message, + pydantic_object=Evaluation + ) + + # Check if the prompt is too long and omit features if necessary (in order of importance) + omittable_features = ["submission"] + prompt_input, should_run = check_prompt_length_and_omit_features_if_necessary( + prompt=chat_prompt, + prompt_input=prompt_input, + max_input_tokens=max_input_tokens, + omittable_features=omittable_features, + debug=False + ) + + if not should_run: + logger.warning("Evaluation input too long. Skipping.") + return {} + + result = await predict_and_parse( + model=evaluation_model, + chat_prompt=chat_prompt, + prompt_input=prompt_input, + pydantic_object=Evaluation, + tags=[ + f"exercise-{exercise.id}", + f"submission-{submission.id}", + "evaluation" + ] + ) + + if result is None: + logger.warning("Evaluation failed. 
def get_line_range_from_index_range(index_start: Optional[int], index_end: Optional[int], content: str) -> Tuple[Optional[int], Optional[int]]:
    """Map a character index range in ``content`` to a 1-based sentence-line range.

    Inverse companion of ``get_index_range_from_line_range``: sentences are the
    "lines" here, as produced by ``get_sentence_spans``.

    Args:
        index_start: Start character index of the range, or None.
        index_end: End character index of the range, or None.
        content: The submission text the indices refer to.

    Returns:
        ``(line_start, line_end)`` 1-based sentence numbers, or ``(None, None)``
        when both indices are None. Either element may be None when the
        corresponding index does not fall inside any sentence span.
    """
    if index_start is None and index_end is None:
        return None, None

    # Fill a missing bound from the other one. Explicit None checks are
    # required: `index_start or index_end` would wrongly replace a legitimate
    # start index of 0 (falsy) with the end index.
    if index_start is None:
        index_start = index_end if index_end is not None else 0
    if index_end is None:
        index_end = index_start

    if index_start > index_end:
        index_start, index_end = index_end, index_start

    sentence_spans = get_sentence_spans(content)

    line_start = None
    line_end = None

    # Sentence spans are ordered, so the first span containing index_end is
    # also the last one we need to look at.
    for line_number, (start_index, end_index) in enumerate(sentence_spans, start=1):
        if start_index <= index_start < end_index:
            line_start = line_number
        if start_index <= index_end <= end_index:
            line_end = line_number
            break

    return line_start, line_end
+ +# Score Criteria +Accept feedback that is useful to the tutor, meaning that it can be applied to the submission with minor or no modification. \ +Our goal is to reduce the workload of tutors and reduce their cognitive load. \ +Reject feedback that is not useful and would burden the tutor. + +Put the focus on the description of the feedback, the title is optional. \ +The `line_start` and `line_end` should make sense with respect to the submission but do not need to be exact. \ +Credits should make sense with respect to the feedback and the submission but also do not need to be exact. + +# Submission (with sentence numbers : ): +{submission} + +# Example (Human) Feedback: +{true_feedbacks} +""" + +human_message = """\ +### Model Output: +{predicted_feedbacks} +""" \ No newline at end of file From 2595d5c60daa86a50555984be9510e97c46e04e6 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 17:55:01 +0100 Subject: [PATCH 13/54] fix ui issue and some var naming --- module_text_llm/module_text_llm/__main__.py | 2 +- playground/src/hooks/batch_module_experiment.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index e3f7d7769..cc6c42baa 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -47,7 +47,7 @@ async def evaluate_feedback( evaluation = {} if bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")): - evaluation["llm-as-a-judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks) + evaluation["llm_as_a_judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks) return evaluation diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index 89480a30a..e2610fbb8 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ 
b/playground/src/hooks/batch_module_experiment.ts @@ -190,6 +190,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC }) : undefined; const continueWithAutomaticEvaluation = (data.step === "finished" && submissionsWithAutomaticEvaluation === undefined) ? (() => { + setSubmissionsWithAutomaticEvaluation((prevState) => new Map(prevState)); stepAutomaticEvaluation(); }) : undefined; From 27397579cddde0f3060c53d449075d9691d16a0f Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 18:54:25 +0100 Subject: [PATCH 14/54] fix line break --- playground/src/components/details/exercise_detail/common.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playground/src/components/details/exercise_detail/common.tsx b/playground/src/components/details/exercise_detail/common.tsx index 852b19a66..8c583a292 100644 --- a/playground/src/components/details/exercise_detail/common.tsx +++ b/playground/src/components/details/exercise_detail/common.tsx @@ -54,7 +54,7 @@ export default function CommonExerciseDetail({ Missing criterion title )} - Grading Criterion {criterion.id} + Grading Criterion {criterion.id} {criterion.structured_grading_instructions.map( From 6b383e3c3729745143aa8505397bca93681d66a5 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
def get_statistics(runs: List[Run]):
    """Aggregate timing and token usage over a collection of LangSmith runs.

    A run missing a field (e.g. no ``end_time`` because it is still open, or
    unrecorded token counts) is skipped for that field only; an empty input
    yields all-zero statistics.
    """
    response_time = 0
    prompt_tokens = 0
    completion_tokens = 0
    total_tokens = 0
    for run in runs:
        if run.end_time is not None:
            response_time += (run.end_time - run.start_time).total_seconds()
        if run.prompt_tokens is not None:
            prompt_tokens += run.prompt_tokens
        if run.completion_tokens is not None:
            completion_tokens += run.completion_tokens
        if run.total_tokens is not None:
            total_tokens += run.total_tokens
    return {
        "response_time": response_time,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": total_tokens,
    }
[[package]] name = "aiohttp" version = "3.8.6" description = "Async http client/server framework (asyncio)" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -112,6 +113,7 @@ speedups = ["Brotli", "aiodns", "cchardet"] name = "aiosignal" version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -126,6 +128,7 @@ frozenlist = ">=1.1.0" name = "anyio" version = "3.7.1" description = "High level compatibility layer for multiple asynchronous event loop implementations" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -146,6 +149,7 @@ trio = ["trio (<0.22)"] name = "astroid" version = "2.15.8" description = "An abstract syntax tree for Python with inference support." +category = "dev" optional = false python-versions = ">=3.7.2" files = [ @@ -161,6 +165,7 @@ wrapt = {version = ">=1.14,<2", markers = "python_version >= \"3.11\""} name = "async-timeout" version = "4.0.3" description = "Timeout context manager for asyncio programs" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -172,6 +177,7 @@ files = [ name = "athena" version = "1.0.0" description = "This is a helper module for easier development of Athena modules. It provides communication functionality with the Assessment Module manager, as well as helper functions for storage." +category = "main" optional = false python-versions = "3.11.*" files = [] @@ -193,6 +199,7 @@ url = "../athena" name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -211,6 +218,7 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte name = "certifi" version = "2023.7.22" description = "Python package for providing Mozilla's CA Bundle." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -222,6 +230,7 @@ files = [ name = "charset-normalizer" version = "3.3.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -321,6 +330,7 @@ files = [ name = "click" version = "8.1.7" description = "Composable command line interface toolkit" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -335,6 +345,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -346,6 +357,7 @@ files = [ name = "dataclasses-json" version = "0.6.1" description = "Easily serialize dataclasses to and from JSON." +category = "main" optional = false python-versions = ">=3.7,<4.0" files = [ @@ -361,6 +373,7 @@ typing-inspect = ">=0.4.0,<1" name = "dill" version = "0.3.7" description = "serialize all of Python" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -375,6 +388,7 @@ graph = ["objgraph (>=1.7.2)"] name = "dodgy" version = "0.2.1" description = "Dodgy: Searches for dodgy looking lines in Python code" +category = "dev" optional = false python-versions = "*" files = [ @@ -386,6 +400,7 @@ files = [ name = "fastapi" version = "0.96.1" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -407,6 +422,7 @@ test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==23.1.0)", "coverage[toml] (>=6 name = "flake8" version = "2.3.0" description = "the modular source code checker: pep8, pyflakes and co" +category = "dev" optional = false python-versions = "*" files = [ @@ -423,6 
+439,7 @@ pyflakes = ">=0.8.1" name = "flake8-polyfill" version = "1.0.2" description = "Polyfill package for Flake8 plugins" +category = "dev" optional = false python-versions = "*" files = [ @@ -437,6 +454,7 @@ flake8 = "*" name = "frozenlist" version = "1.4.0" description = "A list-like structure which implements collections.abc.MutableSequence" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -507,6 +525,7 @@ files = [ name = "gitdb" version = "4.0.11" description = "Git Object Database" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -521,6 +540,7 @@ smmap = ">=3.0.1,<6" name = "gitpython" version = "3.1.40" description = "GitPython is a Python library used to interact with Git repositories" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -538,6 +558,7 @@ test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre name = "greenlet" version = "3.0.1" description = "Lightweight in-process concurrent programming" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -608,6 +629,7 @@ test = ["objgraph", "psutil"] name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -619,6 +641,7 @@ files = [ name = "httpcore" version = "0.17.3" description = "A minimal low-level HTTP client." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -630,16 +653,17 @@ files = [ anyio = ">=3.0,<5.0" certifi = "*" h11 = ">=0.13,<0.15" -sniffio = "==1.*" +sniffio = ">=1.0.0,<2.0.0" [package.extras] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] +socks = ["socksio (>=1.0.0,<2.0.0)"] [[package]] name = "httpx" version = "0.24.1" description = "The next generation HTTP client." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -655,14 +679,15 @@ sniffio = "*" [package.extras] brotli = ["brotli", "brotlicffi"] -cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] +socks = ["socksio (>=1.0.0,<2.0.0)"] [[package]] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -674,6 +699,7 @@ files = [ name = "isort" version = "5.12.0" description = "A Python utility / library to sort Python imports." +category = "dev" optional = false python-versions = ">=3.8.0" files = [ @@ -691,6 +717,7 @@ requirements-deprecated-finder = ["pip-api", "pipreqs"] name = "joblib" version = "1.3.2" description = "Lightweight pipelining with Python functions" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -702,6 +729,7 @@ files = [ name = "jsonpatch" version = "1.33" description = "Apply JSON-Patches (RFC 6902)" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -716,6 +744,7 @@ jsonpointer = ">=1.9" name = "jsonpointer" version = "2.4" description = "Identify specific nodes in a JSON document (RFC 6901)" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -727,6 +756,7 @@ files = [ name = "langchain" version = "0.0.325" description = "Building applications with LLMs through composability" +category = "main" optional = false python-versions = ">=3.8.1,<4.0" files = [ @@ -764,13 +794,14 @@ text-helpers = ["chardet (>=5.1.0,<6.0.0)"] [[package]] name = "langsmith" -version = "0.0.52" +version = "0.0.60" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation 
Platform." +category = "main" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langsmith-0.0.52-py3-none-any.whl", hash = "sha256:d02a0ade5a53b36143084e57003ed38ccbdf5fc15a5a0eb14f8989ceaee0b807"}, - {file = "langsmith-0.0.52.tar.gz", hash = "sha256:1dc29082d257deea1859cb22c53d9481ca5c4a37f3af40c0f9d300fb8adc91db"}, + {file = "langsmith-0.0.60-py3-none-any.whl", hash = "sha256:94f9ef9898fa5fb5afed72538bb3ccca9a92a841b37654d699c732a76c623379"}, + {file = "langsmith-0.0.60.tar.gz", hash = "sha256:f63513398d8d4530e3aa552926924c8443ac9d21c3812f303fa20fa2c44a9a42"}, ] [package.dependencies] @@ -781,6 +812,7 @@ requests = ">=2,<3" name = "lazy-object-proxy" version = "1.9.0" description = "A fast and thorough lazy object proxy." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -826,6 +858,7 @@ files = [ name = "marshmallow" version = "3.20.1" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -846,6 +879,7 @@ tests = ["pytest", "pytz", "simplejson"] name = "mccabe" version = "0.7.0" description = "McCabe checker, plugin for flake8" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -857,6 +891,7 @@ files = [ name = "multidict" version = "6.0.4" description = "multidict implementation" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -940,6 +975,7 @@ files = [ name = "mypy" version = "1.6.1" description = "Optional static typing for Python" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -985,6 +1021,7 @@ reports = ["lxml"] name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
+category = "main" optional = false python-versions = ">=3.5" files = [ @@ -996,6 +1033,7 @@ files = [ name = "nltk" version = "3.8.1" description = "Natural Language Toolkit" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1021,6 +1059,7 @@ twitter = ["twython"] name = "numpy" version = "1.26.1" description = "Fundamental package for array computing in Python" +category = "main" optional = false python-versions = "<3.13,>=3.9" files = [ @@ -1062,6 +1101,7 @@ files = [ name = "openai" version = "0.27.10" description = "Python client library for the OpenAI API" +category = "main" optional = false python-versions = ">=3.7.1" files = [ @@ -1076,7 +1116,7 @@ tqdm = "*" [package.extras] datalib = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] -dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-mock"] +dev = ["black (>=21.6b0,<22.0)", "pytest (>=6.0.0,<7.0.0)", "pytest-asyncio", "pytest-mock"] embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"] wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] @@ -1084,6 +1124,7 @@ wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1 name = "packaging" version = "23.2" description = "Core utilities for Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1095,6 +1136,7 @@ files = [ name = "pep8" version = "1.7.1" description = "Python style guide checker" +category = "dev" optional = false python-versions = "*" files = [ @@ -1106,6 +1148,7 @@ files = [ name = "pep8-naming" version = "0.10.0" description = "Check PEP-8 naming conventions, plugin for flake8" +category = "dev" optional = false python-versions = "*" files = [ @@ -1120,6 +1163,7 @@ flake8-polyfill = ">=1.0.2,<2" name = "platformdirs" version = "3.11.0" 
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1135,6 +1179,7 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-co name = "prospector" version = "1.10.3" description = "Prospector is a tool to analyse Python code by aggregating the result of other tools." +category = "dev" optional = false python-versions = ">=3.7.2,<4.0" files = [ @@ -1174,6 +1219,7 @@ with-vulture = ["vulture (>=1.5)"] name = "psycopg2" version = "2.9.9" description = "psycopg2 - Python-PostgreSQL Database Adapter" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1181,8 +1227,6 @@ files = [ {file = "psycopg2-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:426f9f29bde126913a20a96ff8ce7d73fd8a216cfb323b1f04da402d452853c3"}, {file = "psycopg2-2.9.9-cp311-cp311-win32.whl", hash = "sha256:ade01303ccf7ae12c356a5e10911c9e1c51136003a9a1d92f7aa9d010fb98372"}, {file = "psycopg2-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:121081ea2e76729acfb0673ff33755e8703d45e926e416cb59bae3a86c6a4981"}, - {file = "psycopg2-2.9.9-cp312-cp312-win32.whl", hash = "sha256:d735786acc7dd25815e89cc4ad529a43af779db2e25aa7c626de864127e5a024"}, - {file = "psycopg2-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:a7653d00b732afb6fc597e29c50ad28087dcb4fbfb28e86092277a559ae4e693"}, {file = "psycopg2-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:5e0d98cade4f0e0304d7d6f25bbfbc5bd186e07b38eac65379309c4ca3193efa"}, {file = "psycopg2-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:7e2dacf8b009a1c1e843b5213a87f7c544b2b042476ed7755be813eaf4e8347a"}, {file = "psycopg2-2.9.9-cp38-cp38-win32.whl", hash = "sha256:ff432630e510709564c01dafdbe996cb552e0b9f3f065eb89bdce5bd31fabf4c"}, @@ -1196,6 +1240,7 @@ files = [ name = "pycodestyle" version = "2.11.1" description = "Python style guide checker" +category = "dev" optional = false python-versions 
= ">=3.8" files = [ @@ -1207,6 +1252,7 @@ files = [ name = "pydantic" version = "1.10.13" description = "Data validation and settings management using python type hints" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1259,6 +1305,7 @@ email = ["email-validator (>=1.0.3)"] name = "pydocstyle" version = "6.3.0" description = "Python docstring style checker" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1276,6 +1323,7 @@ toml = ["tomli (>=1.2.3)"] name = "pyflakes" version = "2.5.0" description = "passive checker of Python programs" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1287,6 +1335,7 @@ files = [ name = "pylint" version = "2.17.7" description = "python code static checker" +category = "dev" optional = false python-versions = ">=3.7.2" files = [ @@ -1311,6 +1360,7 @@ testutils = ["gitpython (>3)"] name = "pylint-celery" version = "0.3" description = "pylint-celery is a Pylint plugin to aid Pylint in recognising and understandingerrors caused when using the Celery library" +category = "dev" optional = false python-versions = "*" files = [ @@ -1326,6 +1376,7 @@ pylint-plugin-utils = ">=0.2.1" name = "pylint-django" version = "2.5.3" description = "A Pylint plugin to help Pylint understand the Django web framework" +category = "dev" optional = false python-versions = "*" files = [ @@ -1345,6 +1396,7 @@ with-django = ["Django"] name = "pylint-flask" version = "0.6" description = "pylint-flask is a Pylint plugin to aid Pylint in recognizing and understanding errors caused when using Flask" +category = "dev" optional = false python-versions = "*" files = [ @@ -1358,6 +1410,7 @@ pylint-plugin-utils = ">=0.2.1" name = "pylint-plugin-utils" version = "0.7" description = "Utilities and helpers for writing Pylint plugins" +category = "dev" optional = false python-versions = ">=3.6.2" files = [ @@ -1372,6 +1425,7 @@ pylint = ">=1.7" name = "python-dotenv" version = "1.0.0" description = 
"Read key-value pairs from a .env file and set them as environment variables" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1386,6 +1440,7 @@ cli = ["click (>=5.0)"] name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1394,7 +1449,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -1402,15 +1456,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = 
"PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -1427,7 +1474,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = 
"PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -1435,7 +1481,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -1445,6 +1490,7 @@ files = [ name = "regex" version = "2023.10.3" description = "Alternative 
regular expression module, to replace re." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1542,6 +1588,7 @@ files = [ name = "replicate" version = "0.11.0" description = "Python client for Replicate" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1561,6 +1608,7 @@ dev = ["black", "mypy", "pytest", "responses", "ruff"] name = "requests" version = "2.31.0" description = "Python HTTP for Humans." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1582,6 +1630,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requirements-detector" version = "1.2.2" description = "Python tool to find and list requirements of a Python project" +category = "dev" optional = false python-versions = ">=3.7,<4.0" files = [ @@ -1599,6 +1648,7 @@ toml = ">=0.10.2,<0.11.0" name = "semver" version = "3.0.2" description = "Python helper for Semantic Versioning (https://semver.org)" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1610,6 +1660,7 @@ files = [ name = "setoptconf-tmp" version = "0.3.1" description = "A module for retrieving program settings from various sources in a consistant method." +category = "dev" optional = false python-versions = "*" files = [ @@ -1624,6 +1675,7 @@ yaml = ["pyyaml"] name = "smmap" version = "5.0.1" description = "A pure Python implementation of a sliding window memory map manager" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1635,6 +1687,7 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1646,6 +1699,7 @@ files = [ name = "snowballstemmer" version = "2.2.0" description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." 
+category = "dev" optional = false python-versions = "*" files = [ @@ -1657,6 +1711,7 @@ files = [ name = "sqlalchemy" version = "2.0.22" description = "Database Abstraction Library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1744,6 +1799,7 @@ sqlcipher = ["sqlcipher3-binary"] name = "starlette" version = "0.27.0" description = "The little ASGI library that shines." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1761,6 +1817,7 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyam name = "tenacity" version = "8.2.3" description = "Retry code until it succeeds" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1775,6 +1832,7 @@ doc = ["reno", "sphinx", "tornado (>=4.5)"] name = "tiktoken" version = "0.4.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1820,6 +1878,7 @@ blobfile = ["blobfile (>=2)"] name = "toml" version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1831,6 +1890,7 @@ files = [ name = "tomlkit" version = "0.12.1" description = "Style preserving TOML library" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1842,6 +1902,7 @@ files = [ name = "tqdm" version = "4.66.1" description = "Fast, Extensible Progress Meter" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1862,6 +1923,7 @@ telegram = ["requests"] name = "typing-extensions" version = "4.8.0" description = "Backported and Experimental Type Hints for Python 3.8+" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1873,6 +1935,7 @@ files = [ name = "typing-inspect" version = "0.9.0" description = "Runtime inspection utilities for typing module." 
+category = "main" optional = false python-versions = "*" files = [ @@ -1888,6 +1951,7 @@ typing-extensions = ">=3.7.4" name = "urllib3" version = "2.0.7" description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1905,6 +1969,7 @@ zstd = ["zstandard (>=0.18.0)"] name = "uvicorn" version = "0.23.2" description = "The lightning-fast ASGI server." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1923,6 +1988,7 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", name = "wrapt" version = "1.15.0" description = "Module for decorators, wrappers and monkey patching." +category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -2007,6 +2073,7 @@ files = [ name = "yarl" version = "1.9.2" description = "Yet another URL library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2093,4 +2160,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "3.11.*" -content-hash = "680a5df064fcdd1cac69f7130fe0cc41571497de32b7797be0f88a0aa4e7d098" +content-hash = "844e1ad75ca9b73100279326d787a4621e504c69482e4348051b214e941fd49d" diff --git a/module_text_llm/pyproject.toml b/module_text_llm/pyproject.toml index e3d7ba38f..9610767ea 100644 --- a/module_text_llm/pyproject.toml +++ b/module_text_llm/pyproject.toml @@ -15,6 +15,7 @@ nltk = "^3.8.1" gitpython = "^3.1.37" replicate = "^0.11.0" tiktoken = "^0.4.0" +langsmith = "^0.0.60" [tool.poetry.scripts] module = "athena:run_module" diff --git a/playground/src/pages/api/athena_request.ts b/playground/src/pages/api/athena_request.ts index de94908fc..b3971db81 100644 --- a/playground/src/pages/api/athena_request.ts +++ b/playground/src/pages/api/athena_request.ts @@ -17,7 +17,20 @@ export default async function handler( const url = req.query.url; let response; const secret = 
req.headers["authorization"] as string; - const moduleConfig = req.headers["x-module-config"] as string | undefined; + const forwardHeaders = [ + "X-Module-Config", + "X-Experiment-ID", + "X-Module-Configuration-ID", + "X-Run-ID", + ] + + const headers = Object.fromEntries( + forwardHeaders.flatMap((header) => { + const value = req.headers[header.toLowerCase()] as string | undefined; + return value ? [[header, value]] : []; + }) + ) + if (!secret) { console.warn("No secret provided"); } @@ -27,7 +40,7 @@ export default async function handler( "Content-Type": "application/json", Accept: "application/json", "Authorization": secret, - ...(moduleConfig && { "X-Module-Config": moduleConfig }), + ...headers, }, method: req.method, ...(req.method === "POST" ? { body: JSON.stringify(req.body) } : {}), From e46df2ce5dae50a7d7ca6935115593d21c0867b9 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 20:04:14 +0100 Subject: [PATCH 16/54] inline statistics --- module_text_llm/module_text_llm/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index 0c9bfd29d..9b81dfedb 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -84,13 +84,13 @@ def get_statistics(runs: List[Run]): if suggestion_runs: evaluation["runs"]["suggestions"] = { "count": len(suggestion_runs), - "statistics": get_statistics(suggestion_runs), + **get_statistics(suggestion_runs), "runs": [json.loads(run.json()) for run in suggestion_runs] } if evaluation_runs: evaluation["runs"]["evaluation"] = { "count": len(evaluation_runs), - "statistics": get_statistics(evaluation_runs), + **get_statistics(evaluation_runs), "runs": [json.loads(run.json()) for run in evaluation_runs] } From 8d50922ad9f4a1cb5d1595fe7ff51bc9a0e472dc Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 20:46:36 +0100 Subject: [PATCH 17/54] add sgi evaluation --- module_text_llm/module_text_llm/__main__.py | 56 +++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index 9b81dfedb..75897b438 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -94,6 +94,62 @@ def get_statistics(runs: List[Run]): "runs": [json.loads(run.json()) for run in evaluation_runs] } + actual_feedback_count = len(true_feedbacks) + actual_feedback_with_grading_instructions = [] + suggestions_count = len(predicted_feedbacks) + suggestions_with_grading_instructions = [] + + # Init usage counts for SGIs + actual_sgi_usage = { + sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions + } + suggested_sgi_usage = { + sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions + } + + # Count SGIs in actual feedbacks + for feedback in true_feedbacks: + if feedback.structured_grading_instruction_id: + actual_feedback_with_grading_instructions.append(feedback) + actual_sgi_usage[feedback.structured_grading_instruction_id] += 1 + + # Count SGIs in suggested feedbacks + for feedback in predicted_feedbacks: + if feedback.structured_grading_instruction_id: + suggestions_with_grading_instructions.append(feedback) + suggested_sgi_usage[feedback.structured_grading_instruction_id] += 1 + + actual_feedback_with_grading_instructions_count = len(actual_feedback_with_grading_instructions) + suggestions_with_grading_instructions_count = len(suggestions_with_grading_instructions) + + # Match SGIs + matched_feedback = 0 + unmatched_feedback = actual_feedback_count - actual_feedback_with_grading_instructions_count + unmatched_suggestions = suggestions_count - suggestions_with_grading_instructions_count + + for feedback in 
actual_feedback_with_grading_instructions: + for index, suggestion in enumerate(suggestions_with_grading_instructions): + if feedback.structured_grading_instruction_id == suggestion.structured_grading_instruction_id: + matched_feedback += 1 + del suggestions_with_grading_instructions[index] + break + else: + unmatched_feedback += 1 + + unmatched_suggestions += len(suggestions_with_grading_instructions) + + evaluation["feedback_statistics"] = { + "actual_feedback_count": actual_feedback_count, + "suggestions_count": suggestions_count, + "actual_feedback_with_grading_instructions_count": actual_feedback_with_grading_instructions_count, + "suggestions_with_grading_instructions_count": suggestions_with_grading_instructions_count, + "actual_sgi_usage": actual_sgi_usage, + "suggested_sgi_usage": suggested_sgi_usage, + "matched_feedback": matched_feedback, + "unmatched_feedback": unmatched_feedback, + "unmatched_suggestions": unmatched_suggestions, + } + return evaluation if __name__ == "__main__": From a753b8a789d0bb1bb30b95a064a06911493afe38 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 21:04:54 +0100 Subject: [PATCH 18/54] refactor --- module_text_llm/module_text_llm/__main__.py | 117 +++--------------- module_text_llm/module_text_llm/evaluation.py | 117 ++++++++++++++++++ 2 files changed, 131 insertions(+), 103 deletions(-) create mode 100644 module_text_llm/module_text_llm/evaluation.py diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index 75897b438..430f7b7fb 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -4,14 +4,13 @@ import nltk import tiktoken -from langsmith import Client as LangsmithClient -from langsmith.schemas import Run -from athena import app, get_experiment_environment, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider +from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider from athena.text import Exercise, Submission, Feedback from athena.logger import logger from module_text_llm.config import Configuration +from module_text_llm.evaluation import get_feedback_statistics, get_llm_statistics from module_text_llm.generate_suggestions import generate_suggestions from module_text_llm.generate_evaluation import generate_evaluation @@ -44,112 +43,24 @@ async def evaluate_feedback( true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback], ) -> Any: logger.info( - "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks", - submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks) + "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks", + submission.id, exercise.id, len( + true_feedbacks), len(predicted_feedbacks) ) - + evaluation = {} + + # 1. 
LLM as a judge if bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")): evaluation["llm_as_a_judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks) - # Gather LLM token usage and response times + # 2. LangSmith runs, token usage, and respose times if bool(os.environ.get("LANGCHAIN_TRACING_V2")): - experiment = get_experiment_environment() - client = LangsmithClient() - project_name = os.environ.get("LANGCHAIN_PROJECT") - runs = list(client.list_runs( - project_name=project_name, - filter=f'and(has(tags, "run-{experiment.run_id}"), has(tags, "submission-{submission.id}"))' - )) - logger.info("evaluate_feedback: Found %d runs for submission %d of exercise %d.", len(runs), submission.id, exercise.id) - - def get_statistics(runs: List[Run]): - return { - "response_time": sum((run.end_time - run.start_time).total_seconds() for run in runs if run.end_time is not None), - "prompt_tokens": sum(run.prompt_tokens for run in runs if run.prompt_tokens is not None), - "completion_tokens": sum(run.completion_tokens for run in runs if run.completion_tokens is not None), - "total_tokens": sum(run.total_tokens for run in runs if run.total_tokens is not None), - } - - suggestion_runs = [] - evaluation_runs = [] - for run in runs: - if "evaluation" in (run.tags or []): - evaluation_runs.append(run) - else: - suggestion_runs.append(run) - - if suggestion_runs or evaluation_runs: - evaluation["runs"] = {} - if suggestion_runs: - evaluation["runs"]["suggestions"] = { - "count": len(suggestion_runs), - **get_statistics(suggestion_runs), - "runs": [json.loads(run.json()) for run in suggestion_runs] - } - if evaluation_runs: - evaluation["runs"]["evaluation"] = { - "count": len(evaluation_runs), - **get_statistics(evaluation_runs), - "runs": [json.loads(run.json()) for run in evaluation_runs] - } - - actual_feedback_count = len(true_feedbacks) - actual_feedback_with_grading_instructions = [] - suggestions_count = len(predicted_feedbacks) - 
suggestions_with_grading_instructions = [] - - # Init usage counts for SGIs - actual_sgi_usage = { - sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions - } - suggested_sgi_usage = { - sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions - } - - # Count SGIs in actual feedbacks - for feedback in true_feedbacks: - if feedback.structured_grading_instruction_id: - actual_feedback_with_grading_instructions.append(feedback) - actual_sgi_usage[feedback.structured_grading_instruction_id] += 1 - - # Count SGIs in suggested feedbacks - for feedback in predicted_feedbacks: - if feedback.structured_grading_instruction_id: - suggestions_with_grading_instructions.append(feedback) - suggested_sgi_usage[feedback.structured_grading_instruction_id] += 1 - - actual_feedback_with_grading_instructions_count = len(actual_feedback_with_grading_instructions) - suggestions_with_grading_instructions_count = len(suggestions_with_grading_instructions) - - # Match SGIs - matched_feedback = 0 - unmatched_feedback = actual_feedback_count - actual_feedback_with_grading_instructions_count - unmatched_suggestions = suggestions_count - suggestions_with_grading_instructions_count - - for feedback in actual_feedback_with_grading_instructions: - for index, suggestion in enumerate(suggestions_with_grading_instructions): - if feedback.structured_grading_instruction_id == suggestion.structured_grading_instruction_id: - matched_feedback += 1 - del suggestions_with_grading_instructions[index] - break - else: - unmatched_feedback += 1 - - unmatched_suggestions += len(suggestions_with_grading_instructions) - - evaluation["feedback_statistics"] = { - "actual_feedback_count": actual_feedback_count, - "suggestions_count": suggestions_count, - "actual_feedback_with_grading_instructions_count": actual_feedback_with_grading_instructions_count, - "suggestions_with_grading_instructions_count": 
suggestions_with_grading_instructions_count, - "actual_sgi_usage": actual_sgi_usage, - "suggested_sgi_usage": suggested_sgi_usage, - "matched_feedback": matched_feedback, - "unmatched_feedback": unmatched_feedback, - "unmatched_suggestions": unmatched_suggestions, - } - + evaluation["llm_statistics"] = get_llm_statistics(submission) + + # 3. Feedback statistics + evaluation["feedback_statistics"] = get_feedback_statistics(exercise, submission, true_feedbacks, predicted_feedbacks) + return evaluation if __name__ == "__main__": diff --git a/module_text_llm/module_text_llm/evaluation.py b/module_text_llm/module_text_llm/evaluation.py new file mode 100644 index 000000000..2d6989892 --- /dev/null +++ b/module_text_llm/module_text_llm/evaluation.py @@ -0,0 +1,117 @@ +import json +import os +from typing import List + +from langsmith import Client as LangSmithClient +from langsmith.schemas import Run + +from athena import get_experiment_environment +from athena.text import Exercise, Submission, Feedback + + +def get_llm_statistics(submission: Submission): + experiment = get_experiment_environment() + client = LangSmithClient() + project_name = os.environ.get("LANGCHAIN_PROJECT") + runs = list(client.list_runs( + project_name=project_name, + filter=f'and(has(tags, "run-{experiment.run_id}"), has(tags, "submission-{submission.id}"))' + )) + + def get_statistics(runs: List[Run]): + return { + "response_time": sum((run.end_time - run.start_time).total_seconds() for run in runs if run.end_time is not None), + "prompt_tokens": sum(run.prompt_tokens for run in runs if run.prompt_tokens is not None), + "completion_tokens": sum(run.completion_tokens for run in runs if run.completion_tokens is not None), + "total_tokens": sum(run.total_tokens for run in runs if run.total_tokens is not None), + } + + suggestion_runs = [] + evaluation_runs = [] + for run in runs: + if "evaluation" in (run.tags or []): + evaluation_runs.append(run) + else: + suggestion_runs.append(run) + + 
llm_statistics = {} + if suggestion_runs or evaluation_runs: + if suggestion_runs: + llm_statistics["suggestions"] = { + "count": len(suggestion_runs), + **get_statistics(suggestion_runs), + "runs": [json.loads(run.json()) for run in suggestion_runs] + } + if evaluation_runs: + llm_statistics["evaluation"] = { + "count": len(evaluation_runs), + **get_statistics(evaluation_runs), + "runs": [json.loads(run.json()) for run in evaluation_runs] + } + + return llm_statistics + + +def get_feedback_statistics(exercise: Exercise, submission: Submission, + true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]): + actual_feedback_count = len(true_feedbacks) + actual_feedback_with_grading_instructions = [] + suggestions_count = len(predicted_feedbacks) + suggestions_with_grading_instructions = [] + + # Init usage counts for SGIs + actual_sgi_usage = { + sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions + } + suggested_sgi_usage = { + sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions + } + + # Count SGIs in actual feedbacks + for feedback in true_feedbacks: + if feedback.structured_grading_instruction_id: + actual_feedback_with_grading_instructions.append(feedback) + actual_sgi_usage[feedback.structured_grading_instruction_id] += 1 + + # Count SGIs in suggested feedbacks + for feedback in predicted_feedbacks: + if feedback.structured_grading_instruction_id: + suggestions_with_grading_instructions.append(feedback) + suggested_sgi_usage[feedback.structured_grading_instruction_id] += 1 + + actual_feedback_with_grading_instructions_count = len( + actual_feedback_with_grading_instructions) + suggestions_with_grading_instructions_count = len( + suggestions_with_grading_instructions) + + # Match SGIs + matched_feedback = 0 + unmatched_feedback = actual_feedback_count - \ + actual_feedback_with_grading_instructions_count + unmatched_suggestions = 
suggestions_count - \ + suggestions_with_grading_instructions_count + + for feedback in actual_feedback_with_grading_instructions: + for index, suggestion in enumerate(suggestions_with_grading_instructions): + if feedback.structured_grading_instruction_id == suggestion.structured_grading_instruction_id: + matched_feedback += 1 + del suggestions_with_grading_instructions[index] + break + else: + unmatched_feedback += 1 + + unmatched_suggestions += len(suggestions_with_grading_instructions) + + feedback_statistics = { + "actual_feedback_count": actual_feedback_count, + "suggestions_count": suggestions_count, + "actual_feedback_with_grading_instructions_count": actual_feedback_with_grading_instructions_count, + "suggestions_with_grading_instructions_count": suggestions_with_grading_instructions_count, + "actual_sgi_usage": actual_sgi_usage, + "suggested_sgi_usage": suggested_sgi_usage, + "matched_feedback": matched_feedback, + "unmatched_feedback": unmatched_feedback, + "unmatched_suggestions": unmatched_suggestions, + } + + return feedback_statistics From fa0bde50196d9b3af4817aa06892dae25125df69 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 21:05:24 +0100 Subject: [PATCH 19/54] remove unused --- module_text_llm/module_text_llm/__main__.py | 2 +- module_text_llm/module_text_llm/evaluation.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index 430f7b7fb..1fbeb4cf8 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -59,7 +59,7 @@ async def evaluate_feedback( evaluation["llm_statistics"] = get_llm_statistics(submission) # 3. 
Feedback statistics - evaluation["feedback_statistics"] = get_feedback_statistics(exercise, submission, true_feedbacks, predicted_feedbacks) + evaluation["feedback_statistics"] = get_feedback_statistics(exercise, true_feedbacks, predicted_feedbacks) return evaluation diff --git a/module_text_llm/module_text_llm/evaluation.py b/module_text_llm/module_text_llm/evaluation.py index 2d6989892..055dc2c94 100644 --- a/module_text_llm/module_text_llm/evaluation.py +++ b/module_text_llm/module_text_llm/evaluation.py @@ -52,8 +52,7 @@ def get_statistics(runs: List[Run]): return llm_statistics -def get_feedback_statistics(exercise: Exercise, submission: Submission, - true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]): +def get_feedback_statistics(exercise: Exercise, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]): actual_feedback_count = len(true_feedbacks) actual_feedback_with_grading_instructions = [] suggestions_count = len(predicted_feedbacks) From a4d7d8cf5eb2206241472a0e084df4c4d6625cc0 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 21:36:17 +0100 Subject: [PATCH 20/54] update ini --- assessment_module_manager/modules.docker.ini | 7 ++++++- assessment_module_manager/modules.ini | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/assessment_module_manager/modules.docker.ini b/assessment_module_manager/modules.docker.ini index db9d22854..340efa797 100644 --- a/assessment_module_manager/modules.docker.ini +++ b/assessment_module_manager/modules.docker.ini @@ -1,19 +1,24 @@ [module_example] url = http://module-example:5001 type = programming +supports_evaluation = false [module_programming_llm] url = http://module-programming-llm:5002 type = programming +supports_evaluation = false [module_text_llm] url = http://module-text-llm:5003 type = text +supports_evaluation = true [module_text_cofee] url = http://module-text-cofee:5004 type = text +supports_evaluation = false [module_programming_themisml] url = http://module-programming-themisml:5005 -type = programming \ No newline at end of file +type = programming +supports_evaluation = false \ No newline at end of file diff --git a/assessment_module_manager/modules.ini b/assessment_module_manager/modules.ini index 3402183f0..70745eb78 100644 --- a/assessment_module_manager/modules.ini +++ b/assessment_module_manager/modules.ini @@ -1,7 +1,7 @@ [module_example] url = http://localhost:5001 type = programming -supports_evaluation = true +supports_evaluation = false [module_programming_llm] url = http://localhost:5002 @@ -11,7 +11,7 @@ supports_evaluation = false [module_text_llm] url = http://localhost:5003 type = text -supports_evaluation = false +supports_evaluation = true [module_text_cofee] url = http://localhost:5004 From fd4fdab3cf714298a5ccbd0dd333c4c6f9708c23 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 21:40:51 +0100 Subject: [PATCH 21/54] only use selected modules --- playground/src/hooks/athena/request_evaluation.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playground/src/hooks/athena/request_evaluation.ts b/playground/src/hooks/athena/request_evaluation.ts index 76227258f..620fb362b 100644 --- a/playground/src/hooks/athena/request_evaluation.ts +++ b/playground/src/hooks/athena/request_evaluation.ts @@ -48,7 +48,7 @@ export default function useRequestEvaluation( const modules = onlyUseContextModule ? [contextModule] : Object.values(health?.modules ?? {}).filter( - (module) => module.healthy && module.type === contextModule.type + (module) => module.healthy && module.type === contextModule.type && module.supportsEvaluation ); const results = await Promise.allSettled( From f2c173661b83234099347bdca9e57d3d31a1ba91 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 22:49:19 +0100 Subject: [PATCH 22/54] remove skip --- module_text_llm/module_text_llm/__main__.py | 2 +- playground/src/hooks/batch_module_experiment.ts | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index 1fbeb4cf8..49d069635 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -51,7 +51,7 @@ async def evaluate_feedback( evaluation = {} # 1. LLM as a judge - if bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")): + if len(predicted_feedbacks) > 0 and bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")): evaluation["llm_as_a_judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks) # 2. 
LangSmith runs, token usage, and respose times diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index e2610fbb8..778365ade 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ b/playground/src/hooks/batch_module_experiment.ts @@ -406,16 +406,6 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC submission.id )?.suggestions ?? []; - if (predictedFeedbacks.length === 0) { - // Skip if there are no predicted feedbacks - setSubmissionsWithAutomaticEvaluation((prevState) => { - const newMap = new Map(prevState); - newMap.set(submission.id, {}); - return newMap; - }); - continue; - } - try { const responses = await requestEvaluation.mutateAsync({ exercise: experiment.exercise, From 33f2a2b580fe7948754317b195172a469abf7328 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Sun, 5 Nov 2023 21:54:36 +0100 Subject: [PATCH 23/54] add endpoint --- athena/athena/endpoints.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/athena/athena/endpoints.py b/athena/athena/endpoints.py index 8e9d4dcc5..eac88656f 100644 --- a/athena/athena/endpoints.py +++ b/athena/athena/endpoints.py @@ -2,7 +2,7 @@ import inspect from fastapi import Depends, BackgroundTasks from pydantic import BaseModel, ValidationError -from typing import TypeVar, Callable, List, Union, Any, Coroutine, Type +from typing import TypeVar, Callable, Dict, List, Union, Any, Coroutine, Type from athena.app import app from athena.authenticate import authenticated @@ -358,4 +358,12 @@ def config_schema_provider(cls: Type[C]) -> Type[C]: async def wrapper(): return cls.schema() - return cls \ No newline at end of file + return cls + +def evaluation_provider(func: Union[ + Callable[[E, S, List[F], List[F]], Dict[int, Any]], + Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Dict[int, Any]]], + Callable[[E, S, List[F], List[F], C], Dict[int, Any]], + Callable[[E, S, List[F], 
List[F], C], Coroutine[Any, Any, Dict[int, Any]]] +]): + pass \ No newline at end of file From 368ebcca320cb5eee4d25724df52157354f2933f Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Sun, 5 Nov 2023 22:09:19 +0100 Subject: [PATCH 24/54] add evaluation_provider --- athena/athena/endpoints.py | 64 ++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/athena/athena/endpoints.py b/athena/athena/endpoints.py index eac88656f..a762a37d0 100644 --- a/athena/athena/endpoints.py +++ b/athena/athena/endpoints.py @@ -2,7 +2,7 @@ import inspect from fastapi import Depends, BackgroundTasks from pydantic import BaseModel, ValidationError -from typing import TypeVar, Callable, Dict, List, Union, Any, Coroutine, Type +from typing import TypeVar, Callable, List, Union, Any, Coroutine, Type from athena.app import app from athena.authenticate import authenticated @@ -361,9 +361,61 @@ async def wrapper(): return cls def evaluation_provider(func: Union[ - Callable[[E, S, List[F], List[F]], Dict[int, Any]], - Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Dict[int, Any]]], - Callable[[E, S, List[F], List[F], C], Dict[int, Any]], - Callable[[E, S, List[F], List[F], C], Coroutine[Any, Any, Dict[int, Any]]] + Callable[[E, S, List[F], List[F]], Any], + Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Any]] ]): - pass \ No newline at end of file + """ + Provide evaluated feedback to the Assessment Module Manager. + + Note: The evaluation provider is usually called during the research and development phase (by the Playground). + Return arbitrary evaluation results. + + This decorator can be used with several types of functions: synchronous or asynchronous. + + Examples: + Below are some examples of possible functions that you can decorate with this decorator: + + Without using module config (both synchronous and asynchronous forms): + >>> @evaluation_provider + ... def sync_evaluate_feedback( + ... 
exercise: Exercise, submission: Submission, + ... true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback] + ... ) -> Any: + ... # evaluate predicted feedback here and return evaluation results + + >>> @feedback_provider + ... async def async_evaluate_feedback( + ... exercise: Exercise, submission: Submission, + ... true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback] + ... ) -> Any: + ... # evaluate predicted feedback here and return evaluation results + """ + exercise_type = inspect.signature(func).parameters["exercise"].annotation + submission_type = inspect.signature(func).parameters["submission"].annotation + feedback_type = inspect.signature(func).parameters["predicted_feedbacks"].annotation.__args__[0] + + @app.post("/evaluation", responses=module_responses) + @authenticated + @with_meta + async def wrapper( + exercise: exercise_type, + submission: submission_type, + true_feedbacks: List[feedback_type], + predicted_feedbacks: List[feedback_type], + ): + # Retrieve existing metadata for the exercise, submission and feedback + exercise.meta.update(get_stored_exercise_meta(exercise) or {}) + submission.meta.update(get_stored_submission_meta(submission) or {}) + for feedback in true_feedbacks: + feedback.meta.update(get_stored_feedback_meta(feedback) or {}) + for feedback in predicted_feedbacks: + feedback.meta.update(get_stored_feedback_meta(feedback) or {}) + + # Call the actual provider + if inspect.iscoroutinefunction(func): + evaluation = await func(exercise, submission, true_feedbacks, predicted_feedbacks) + else: + evaluation = func(exercise, submission, true_feedbacks, predicted_feedbacks) + + return evaluation + return wrapper \ No newline at end of file From 5c681065ebd40f77745fe22cc5234ee1e2d42141 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Sun, 5 Nov 2023 22:09:27 +0100 Subject: [PATCH 25/54] add new line --- athena/athena/endpoints.py | 1 + 1 file changed, 1 insertion(+) diff --git a/athena/athena/endpoints.py b/athena/athena/endpoints.py index a762a37d0..411edaf0a 100644 --- a/athena/athena/endpoints.py +++ b/athena/athena/endpoints.py @@ -360,6 +360,7 @@ async def wrapper(): return cls + def evaluation_provider(func: Union[ Callable[[E, S, List[F], List[F]], Any], Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Any]] From 7afd65520a30e6eb1a42b782b8c0854d79c02240 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Sun, 5 Nov 2023 22:10:28 +0100 Subject: [PATCH 26/54] add evaluation_provider to export --- athena/athena/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/athena/athena/__init__.py b/athena/athena/__init__.py index 90fb46e62..8a67e5315 100644 --- a/athena/athena/__init__.py +++ b/athena/athena/__init__.py @@ -6,7 +6,7 @@ from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction from .metadata import emit_meta, get_meta from .experiment import get_experiment_environment -from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider # type: ignore +from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider, evaluation_provider # type: ignore @app.get("/") @@ -28,6 +28,7 @@ def run_module(): "feedback_consumer", "feedback_provider", "config_schema_provider", + "evaluation_provider", "emit_meta", "get_meta", "get_experiment_environment", From 192614433d139a1ae56a8ccb999e300c8516435a Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Sun, 5 Nov 2023 22:25:35 +0100 Subject: [PATCH 27/54] add example evaluation endpoint --- module_example/module_example/__main__.py | 30 +++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/module_example/module_example/__main__.py b/module_example/module_example/__main__.py index 7bdef91fe..182f160bb 100644 --- a/module_example/module_example/__main__.py +++ b/module_example/module_example/__main__.py @@ -1,10 +1,11 @@ """ Entry point for the module_example module. """ -from typing import List +import random +from typing import List, Any from pydantic import BaseModel, Field -from athena import app, config_schema_provider, submissions_consumer, submission_selector, feedback_consumer, feedback_provider, emit_meta +from athena import app, config_schema_provider, submissions_consumer, submission_selector, feedback_consumer, feedback_provider, evaluation_provider, emit_meta from athena.programming import Exercise, Submission, Feedback from athena.logger import logger from athena.storage import store_exercise, store_submissions, store_feedback @@ -139,5 +140,30 @@ def suggest_feedback(exercise: Exercise, submission: Submission, module_config: ] +# Only if it makes sense for a module (Optional) +@evaluation_provider +def evaluate_feedback(exercise: Exercise, submission: Submission, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]) -> Any: + logger.info( + "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks", + submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks) + ) + + # Do something with the true and predicted feedback and return the evaluation result + # Generate some example evaluation result + evaluation_results = [] + true_feedback_embeddings = [random.random() for _ in true_feedbacks] + predicted_feedback_embeddings = [random.random() for _ in predicted_feedbacks] + for feedback, embedding in 
zip(predicted_feedbacks, predicted_feedback_embeddings): + feedback_evaluation = { + "feedback_id": feedback.id, + "embedding": embedding, + "has_match": len([t for t in true_feedback_embeddings if abs(t - embedding) < 0.1]) > 0, + "correctness": random.random() + } + evaluation_results.append(feedback_evaluation) + + return evaluation_results + + if __name__ == "__main__": app.start() From db5e5180576299bdd8fc25dbcf75817b98d4d168 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Sun, 5 Nov 2023 22:44:22 +0100 Subject: [PATCH 28/54] add playground ui --- .../view_mode/module_requests/index.tsx | 7 +- .../module_requests/request_evaluation.tsx | 172 ++++++++++++++++++ .../src/hooks/athena/request_evaluation.ts | 31 ++++ 3 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 playground/src/components/view_mode/module_requests/request_evaluation.tsx create mode 100644 playground/src/hooks/athena/request_evaluation.ts diff --git a/playground/src/components/view_mode/module_requests/index.tsx b/playground/src/components/view_mode/module_requests/index.tsx index dca29946d..dcda17bf2 100644 --- a/playground/src/components/view_mode/module_requests/index.tsx +++ b/playground/src/components/view_mode/module_requests/index.tsx @@ -3,12 +3,14 @@ import type { ModuleMeta } from "@/model/health_response"; import { useState } from "react"; import { ModuleProvider } from "@/hooks/module_context"; +import ModuleAndConfigSelect from "@/components/selectors/module_and_config_select"; import GetConfigSchema from "@/components/view_mode/module_requests/get_config_schema"; import SendSubmissions from "@/components/view_mode/module_requests/send_submissions"; +import SelectSubmission from "@/components/view_mode/module_requests/request_submission_selection"; import SendFeedbacks from "@/components/view_mode/module_requests/send_feedbacks"; import RequestFeedbackSuggestions from "@/components/view_mode/module_requests/request_feedback_suggestions"; -import 
SelectSubmission from "@/components/view_mode/module_requests/request_submission_selection"; -import ModuleAndConfigSelect from "@/components/selectors/module_and_config_select"; +import RequestEvaluation from "@/components/view_mode/module_requests/request_evaluation"; + export default function ModuleRequests() { const [moduleAndConfig, setModuleAndConfig] = useState<{ module: ModuleMeta; moduleConfig: any } | undefined>(undefined); @@ -34,6 +36,7 @@ export default function ModuleRequests() { +
    )} diff --git a/playground/src/components/view_mode/module_requests/request_evaluation.tsx b/playground/src/components/view_mode/module_requests/request_evaluation.tsx new file mode 100644 index 000000000..c98ac6d27 --- /dev/null +++ b/playground/src/components/view_mode/module_requests/request_evaluation.tsx @@ -0,0 +1,172 @@ +import type { Submission } from "@/model/submission"; +import type { Exercise } from "@/model/exercise"; +import type { Feedback } from "@/model/feedback"; +import type ModuleResponse from "@/model/module_response"; + +import { useEffect, useState } from "react"; + +import { useModule } from "@/hooks/module_context"; +import { useBaseInfo } from "@/hooks/base_info_context"; +import useRequestEvaluation from "@/hooks/athena/request_evaluation"; +import useFeedbacks from "@/hooks/playground/feedbacks"; + +import ExerciseSelect from "@/components/selectors/exercise_select"; +import SubmissionSelect from "@/components/selectors/submission_select"; +import ModuleResponseView from "@/components/module_response_view"; +import Disclosure from "@/components/disclosure"; +import ExerciseDetail from "@/components/details/exercise_detail"; +import SubmissionDetail from "@/components/details/submission_detail"; + +export default function RequestEvaluation() { + const { module } = useModule(); + const { dataMode } = useBaseInfo(); + + const [exercise, setExercise] = useState(undefined); + const [submission, setSubmission] = useState( + undefined + ); + + const [predictedFeedbacks, setPredictedFeedbacks] = useState([]); + + const { + data: trueFeedbacks, + isLoading: isLoadingTrueFeedbacks, + error: errorTrueFeedbacks, + } = useFeedbacks(exercise, submission); + + const { + data: response, + isLoading, + error, + mutate, + reset, + } = useRequestEvaluation(); + + useEffect(() => setExercise(undefined), [module, dataMode]); + + return ( +
    +

    + Request Evaluation from Athena{" "} + (OPTIONAL) +

    +

    + Evaluate a list of feedback suggestions during the research and + development phase. Compare the predicted feedback with the actual + feedback using the function annotated with{" "} + @evaluation_provider. Each module can implement custom + metrics to evaluate the feedback suggestions during evaluation and + respond with arbitrary evaluation results. +

    + { + setExercise(exercise); + reset(); + setSubmission(undefined); + setPredictedFeedbacks([]); + }} + disabled={isLoading} + /> + {exercise && ( + <> + { + setSubmission(submission); + setPredictedFeedbacks([]); + }} + disabled={isLoading} + /> +
    + + {submission && + (trueFeedbacks ? ( + +

    + The following feedbacks given by the tutor in the past. +

    + f.submission_id === submission.id + )} + /> +
    + ) : ( +
    + No true feedbacks available +
    + ))} + {submission && ( + +

    + Provide feedback as predicted feedbacks to + test the evaluation. +

    + f.submission_id === submission.id + )} + onFeedbacksChange={setPredictedFeedbacks} + /> +
    + )} + {isLoadingTrueFeedbacks && ( +
    Loading feedbacks...
    + )} + {errorTrueFeedbacks && ( +
    + Failed to load feedbacks +
    + )} +
    + + )} + + +
    + ); +} diff --git a/playground/src/hooks/athena/request_evaluation.ts b/playground/src/hooks/athena/request_evaluation.ts new file mode 100644 index 000000000..d6ccc9f60 --- /dev/null +++ b/playground/src/hooks/athena/request_evaluation.ts @@ -0,0 +1,31 @@ +import type { Exercise } from "@/model/exercise"; +import type { Submission } from "@/model/submission"; +import type ModuleResponse from "@/model/module_response"; + +import { UseMutationOptions, useMutation } from "react-query"; +import { AthenaError, useAthenaFetcher } from "@/hooks/athena_fetcher"; +import { Feedback } from "@/model/feedback"; + +/** + * Requests an evaluation for an exercise and a submission given the true and predicted feedbacks from an Athena module. + * + * @example + * const { data, isLoading, error, mutate } = useRequestEvaluation(); + * mutate({ exercise, submission, trueFeedbacks, predictedFeedbacks }); + * + * @param options The react-query options. + */ +export default function useRequestEvaluation( + options: Omit< + UseMutationOptions, + "mutationFn" + > = {} +) { + const athenaFetcher = useAthenaFetcher(); + return useMutation({ + mutationFn: async ({ exercise, submission, trueFeedbacks, predictedFeedbacks }) => { + return await athenaFetcher("/evaluation", { exercise, submission, true_feedbacks: trueFeedbacks, predicted_feedbacks: predictedFeedbacks }); + }, + ...options, + }); +} From 2da43911793e73d9ac3bebec313ecf89d991ee81 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Mon, 6 Nov 2023 16:22:16 +0100 Subject: [PATCH 29/54] add automatic evaluation --- .../conduct_experiment/index.tsx | 17 +-- .../src/hooks/batch_module_experiment.ts | 108 +++++++++++++++++- playground/src/model/automatic_evaluation.ts | 3 + 3 files changed, 117 insertions(+), 11 deletions(-) create mode 100644 playground/src/model/automatic_evaluation.ts diff --git a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/index.tsx b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/index.tsx index f91cc7b3e..3c78e1ecb 100644 --- a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/index.tsx +++ b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/index.tsx @@ -65,6 +65,12 @@ export default function ConductExperiment({ data: data.manualRatings, }); } + if (data.automaticEvaluation) { + files.push({ + name: `${experiment.exerciseType}_automatic_evaluation_${moduleConfigurations[index].name}_${experiment.id}_run-${data.automaticEvaluation.runId}`, + data: data.automaticEvaluation, + }); + } } return files; }) @@ -102,14 +108,11 @@ export default function ConductExperiment({ return; } - if ( - !data.type || - (data.type !== "results" && data.type !== "manualRatings") - ) { - alert("No correct type found in the data i.e. 'results' or 'manualRatings'"); + if (!data.type || !["results", "manualRatings", "automaticEvaluation"].includes(data.type)) { + alert("No correct type found in the data i.e. 
'results', 'manualRatings', or 'automaticEvaluation'."); return; } - const type = data.type as "results" | "manualRatings"; + const type = data.type as "results" | "manualRatings" | "automaticEvaluation"; try { moduleViewRef.importData(data); @@ -209,7 +212,7 @@ export default function ConductExperiment({ // If all files have been read, sort and import if (filesProcessed === files.length) { - // Sort the array by 'type', 'results' first and then 'manualRatings' + // Sort the array by 'type', 'results' first and then 'manualRatings' or 'automaticEvaluation' const sortedData = fileDataArray.sort((a, b) => { if (a.type === "results" && b.type !== "results") { return -1; diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index ba99982c2..a2ff10fb0 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ b/playground/src/hooks/batch_module_experiment.ts @@ -1,5 +1,6 @@ import type { Feedback } from "@/model/feedback"; import type { ManualRating } from "@/model/manual_rating"; +import type { AutomaticEvaluation } from "@/model/automatic_evaluation"; import type { Experiment } from "@/components/view_mode/evaluation_mode/define_experiment"; import type { ModuleConfiguration } from "@/components/view_mode/evaluation_mode/configure_modules"; @@ -9,6 +10,7 @@ import { useSendFeedbacks } from "./athena/send_feedbacks"; import useRequestSubmissionSelection from "./athena/request_submission_selection"; import useRequestFeedbackSuggestions from "./athena/request_feedback_suggestions"; import useSendSubmissions from "./athena/send_submissions"; +import useRequestEvaluation from "./athena/request_evaluation"; import { useExperimentIdentifiersSetRunId } from "./experiment_identifiers_context"; export type ExperimentStep = @@ -50,6 +52,11 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC Map >(new Map()); + // Stores automatic evaluation of submissions + const 
[submissionsWithAutomaticEvaluation, setSubmissionsWithAutomaticEvaluation] = useState< + Map + >(new Map()); + const [processingStep, setProcessingStep] = useState< ExperimentStep | undefined >(undefined); @@ -95,6 +102,19 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC }, } : {} ), + ...( + submissionsWithAutomaticEvaluation.size > 0 ? { + automaticEvaluation: { + type: "automaticEvaluation", + runId: data.runId, + experimentId: experiment.id, + moduleConfigurationId: moduleConfiguration.id, + submissionsWithAutomaticEvaluation: Object.fromEntries( + submissionsWithAutomaticEvaluation + ), + }, + } : {} + ), }; }; @@ -108,6 +128,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC throw new Error("Invalid results data"); } + setProcessingStep(undefined); setData(() => ({ runId: importedData.runId, step: importedData.step, @@ -134,7 +155,22 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC ) )); return; + } else if (importedData.type === "automaticEvaluation") { + // Relies on the fact that the automatic evaluations have to be imported after the results + if (importedData.runId !== data.runId) { + throw new Error("Run ID does not match, have you imported the results first?"); + } + if (importedData.submissionsWithAutomaticEvaluation === undefined) { + throw new Error("Invalid automatic evaluation data"); + } + setSubmissionsWithAutomaticEvaluation(() => new Map( + Object.entries(importedData.submissionsWithAutomaticEvaluation).map( + ([key, value]) => [Number(key), value as any] + ) + )); + return; } + throw new Error("Unknown import data type"); }; @@ -158,6 +194,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC const sendFeedbacks = useSendFeedbacks(); const requestSubmissionSelection = useRequestSubmissionSelection(); const requestFeedbackSuggestions = useRequestFeedbackSuggestions(); + const requestEvaluation = 
useRequestEvaluation(); // 1. Send submissions to Athena const stepSendSubmissions = () => { @@ -338,10 +375,70 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC setData((prevState) => ({ ...prevState, - step: "finished", + step: "finished", // Automatic evaluation is done separately })); }; + // 4. Automatic evaluation (after results are 'finished') + const stepAutomaticEvaluation = async () => { + setProcessingStep("finished"); + + console.log("Running automatic evaluation..."); + + let remainingSubmissions = experiment.evaluationSubmissions.filter( + (submission) => !submissionsWithAutomaticEvaluation.has(submission.id) + ); + + let index = 0; + for (const submission of remainingSubmissions) { + console.log( + `Evaluating... (${index + 1}/${ + remainingSubmissions.length + })` + ); + + const predictedFeedbacks = data.submissionsWithFeedbackSuggestions.get( + submission.id + )?.suggestions ?? []; + + if (predictedFeedbacks.length === 0) { + // Skip if there are no predicted feedbacks + setSubmissionsWithAutomaticEvaluation((prevState) => { + const newMap = new Map(prevState); + newMap.set(submission.id, {}); + return newMap; + }); + continue; + } + + try { + const response = await requestEvaluation.mutateAsync({ + exercise: experiment.exercise, + submission, + trueFeedbacks: experiment.tutorFeedbacks.filter( + (feedback) => feedback.submission_id === submission.id + ), + predictedFeedbacks: predictedFeedbacks, + }); + if (!isMounted.current) { + return; + } + console.log(`Received evaluation for submission ${submission.id}:`, response.data); + + setSubmissionsWithAutomaticEvaluation((prevState) => { + const newMap = new Map(prevState); + newMap.set(submission.id, response.data); + return newMap; + }); + } catch (error) { + console.error( + `Error while evaluating submission ${submission.id}:`, + error + ); + } + } + }; + useEffect(() => { isMounted.current = true; return () => { @@ -375,10 +472,12 @@ export default function 
useBatchModuleExperiment(experiment: Experiment, moduleC processingStep !== "generatingFeedbackSuggestions" ) { stepGenerateFeedbackSuggestions(); + } else if ( + data.step === "finished" && + processingStep !== "finished" + ) { + stepAutomaticEvaluation(); } - // TODO: Add automatic evaluation step here - // Note: Evaluate tutor feedback more globally to not do it multiple times - // Note 2: Actually, I probably want to have it in parallel with the feedback suggestions for the interactive mode! }, [data.step]); return { @@ -394,6 +493,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC sendFeedbacks, requestSubmissionSelection, requestFeedbackSuggestions, + requestEvaluation, }, }; } diff --git a/playground/src/model/automatic_evaluation.ts b/playground/src/model/automatic_evaluation.ts new file mode 100644 index 000000000..fb55b9c86 --- /dev/null +++ b/playground/src/model/automatic_evaluation.ts @@ -0,0 +1,3 @@ +export type AutomaticEvaluation = { + [module: string]: any; +}; From a8589dc5a1a241b63a6680ffc44128102b7835b5 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Mon, 6 Nov 2023 17:22:55 +0100 Subject: [PATCH 30/54] add automatic evaluation --- .../endpoints/health_endpoint.py | 5 +- .../module/list_modules.py | 1 + .../module/module.py | 1 + assessment_module_manager/modules.ini | 5 ++ .../module_requests/request_evaluation.tsx | 4 +- .../src/hooks/athena/request_evaluation.ts | 62 ++++++++++++++++--- playground/src/hooks/athena_fetcher.ts | 42 ++++++++----- .../src/hooks/batch_module_experiment.ts | 11 +++- playground/src/hooks/module_context.tsx | 2 +- playground/src/model/health_response.ts | 1 + 10 files changed, 103 insertions(+), 31 deletions(-) diff --git a/assessment_module_manager/assessment_module_manager/endpoints/health_endpoint.py b/assessment_module_manager/assessment_module_manager/endpoints/health_endpoint.py index e1a8d252f..85879b673 100644 --- a/assessment_module_manager/assessment_module_manager/endpoints/health_endpoint.py +++ b/assessment_module_manager/assessment_module_manager/endpoints/health_endpoint.py @@ -27,6 +27,7 @@ class HealthResponse(BaseModel): """ Response indicating whether the Assessment Module Manager is healthy, and whether all the modules are healthy (i.e. reachable). + Additional information about the modules is also provided. 
""" status: str = Field(const=True, default="ok", example="ok") modules: dict = Field( @@ -35,7 +36,8 @@ class HealthResponse(BaseModel): "module_example": { "url": "http://localhost:5001", "type": "programming", - "healthy": True + "healthy": True, + "supportsEvaluation": True } } ] @@ -56,6 +58,7 @@ async def get_health() -> HealthResponse: "url": module.url, "type": module.type, "healthy": await is_healthy(module), + "supportsEvaluation": module.supports_evaluation } for module in get_modules() } diff --git a/assessment_module_manager/assessment_module_manager/module/list_modules.py b/assessment_module_manager/assessment_module_manager/module/list_modules.py index b2862e489..e9f18f106 100644 --- a/assessment_module_manager/assessment_module_manager/module/list_modules.py +++ b/assessment_module_manager/assessment_module_manager/module/list_modules.py @@ -18,6 +18,7 @@ def list_modules() -> List[Module]: name=module, url=cast(AnyHttpUrl, modules_config[module]["url"]), type=ExerciseType(modules_config[module]["type"]), + supports_evaluation=modules_config[module].getboolean("supports_evaluation"), ) for module in modules_config.sections() ] diff --git a/assessment_module_manager/assessment_module_manager/module/module.py b/assessment_module_manager/assessment_module_manager/module/module.py index 35dbb6da1..65e99931f 100644 --- a/assessment_module_manager/assessment_module_manager/module/module.py +++ b/assessment_module_manager/assessment_module_manager/module/module.py @@ -8,3 +8,4 @@ class Module(BaseModel): name: str = Field(example="module_example") url: AnyHttpUrl = Field(example="http://localhost:5001") type: ExerciseType = Field(example=ExerciseType.text) + supports_evaluation: bool = Field(description="Whether the module supports evaluation", example=True) diff --git a/assessment_module_manager/modules.ini b/assessment_module_manager/modules.ini index 0dde7b074..3402183f0 100644 --- a/assessment_module_manager/modules.ini +++ 
b/assessment_module_manager/modules.ini @@ -1,19 +1,24 @@ [module_example] url = http://localhost:5001 type = programming +supports_evaluation = true [module_programming_llm] url = http://localhost:5002 type = programming +supports_evaluation = false [module_text_llm] url = http://localhost:5003 type = text +supports_evaluation = false [module_text_cofee] url = http://localhost:5004 type = text +supports_evaluation = false [module_programming_themisml] url = http://localhost:5005 type = programming +supports_evaluation = false diff --git a/playground/src/components/view_mode/module_requests/request_evaluation.tsx b/playground/src/components/view_mode/module_requests/request_evaluation.tsx index c98ac6d27..f004708d6 100644 --- a/playground/src/components/view_mode/module_requests/request_evaluation.tsx +++ b/playground/src/components/view_mode/module_requests/request_evaluation.tsx @@ -40,7 +40,7 @@ export default function RequestEvaluation() { error, mutate, reset, - } = useRequestEvaluation(); + } = useRequestEvaluation(undefined, true) // onlyUseContextModule = true for module requests only useEffect(() => setExercise(undefined), [module, dataMode]); @@ -130,7 +130,7 @@ export default function RequestEvaluation() { )} diff --git a/playground/src/hooks/athena/request_evaluation.ts b/playground/src/hooks/athena/request_evaluation.ts index d6ccc9f60..76227258f 100644 --- a/playground/src/hooks/athena/request_evaluation.ts +++ b/playground/src/hooks/athena/request_evaluation.ts @@ -5,26 +5,74 @@ import type ModuleResponse from "@/model/module_response"; import { UseMutationOptions, useMutation } from "react-query"; import { AthenaError, useAthenaFetcher } from "@/hooks/athena_fetcher"; import { Feedback } from "@/model/feedback"; +import { useModule } from "@/hooks/module_context"; +import useHealth from "@/hooks/health"; /** - * Requests an evaluation for an exercise and a submission given the true and predicted feedbacks from an Athena module. 
+ * Requests an evaluation for an exercise and a submission given the true and predicted feedbacks from healthy Athena modules. + * + * @param options The react-query options. + * @param onlyUseContextModule - If true, only the context module is used for the evaluation. Otherwise, all healthy modules are used. * * @example * const { data, isLoading, error, mutate } = useRequestEvaluation(); * mutate({ exercise, submission, trueFeedbacks, predictedFeedbacks }); - * - * @param options The react-query options. */ export default function useRequestEvaluation( options: Omit< - UseMutationOptions, + UseMutationOptions< + ModuleResponse[] | undefined, + AthenaError, + { + exercise: Exercise; + submission: Submission; + trueFeedbacks: Feedback[]; + predictedFeedbacks: Feedback[]; + } + >, "mutationFn" - > = {} + > = {}, + onlyUseContextModule = false ) { const athenaFetcher = useAthenaFetcher(); + const { module: contextModule } = useModule(); + const { data: health } = useHealth(); + return useMutation({ - mutationFn: async ({ exercise, submission, trueFeedbacks, predictedFeedbacks }) => { - return await athenaFetcher("/evaluation", { exercise, submission, true_feedbacks: trueFeedbacks, predicted_feedbacks: predictedFeedbacks }); + mutationFn: async ({ + exercise, + submission, + trueFeedbacks, + predictedFeedbacks, + }) => { + const modules = onlyUseContextModule + ? [contextModule] + : Object.values(health?.modules ?? 
{}).filter( + (module) => module.healthy && module.type === contextModule.type + ); + + const results = await Promise.allSettled( + modules.map((module) => + athenaFetcher( + "/evaluation", + { + exercise, + submission, + true_feedbacks: trueFeedbacks, + predicted_feedbacks: predictedFeedbacks, + }, + { module: module, moduleConfig: undefined } + ) + ) + ); + + return results.flatMap((result) => { + if (result.status === "fulfilled") { + return [result.value]; + } else { + return []; + } + }); }, ...options, }); diff --git a/playground/src/hooks/athena_fetcher.ts b/playground/src/hooks/athena_fetcher.ts index 1f23f20ba..69b9a2278 100644 --- a/playground/src/hooks/athena_fetcher.ts +++ b/playground/src/hooks/athena_fetcher.ts @@ -1,4 +1,5 @@ import type ModuleResponse from "@/model/module_response"; +import type { Module } from "@/hooks/module_context"; import baseUrl from "@/helpers/base_url"; import { useBaseInfo } from "@/hooks/base_info_context"; @@ -36,27 +37,34 @@ export class AthenaError extends Error { * @returns A function that can be used to fetch data from the module or that returns undefined if the module is not set. 
*/ export function useAthenaFetcher() { - const { module, moduleConfig } = useModule(); + const { module: contextModule, moduleConfig: contextModuleConfig } = useModule(); const { athenaUrl, athenaSecret } = useBaseInfo(); const { experimentId, moduleConfigurationId, runId } = useExperimentIdentifiers(); - const headers: { [key: string]: string } = {}; - if (moduleConfig) { - headers["X-Module-Config"] = JSON.stringify(moduleConfig); - } - if (experimentId) { - headers["X-Experiment-ID"] = experimentId; - } - if (moduleConfigurationId) { - headers["X-Module-Configuration-ID"] = moduleConfigurationId; - } - if (runId) { - headers["X-Run-ID"] = runId; - } - return ( - async (moduleRoute: string, body?: any) => { - const url = `${athenaUrl}/modules/${module.type}/${module.name}${moduleRoute}`; + async (moduleRoute: string, body?: any, overrideModule?: Module) => { + let targetModule = contextModule; + let targetModuleConfig = contextModuleConfig; + if (overrideModule) { + targetModule = overrideModule.module; + targetModuleConfig = overrideModule.moduleConfig; + } + + const headers: { [key: string]: string } = {}; + if (targetModuleConfig) { + headers["X-Module-Config"] = JSON.stringify(targetModuleConfig); + } + if (experimentId) { + headers["X-Experiment-ID"] = experimentId; + } + if (moduleConfigurationId) { + headers["X-Module-Configuration-ID"] = moduleConfigurationId; + } + if (runId) { + headers["X-Run-ID"] = runId; + } + + const url = `${athenaUrl}/modules/${targetModule.type}/${targetModule.name}${moduleRoute}`; const response = await fetch( `${baseUrl}/api/athena_request?${new URLSearchParams({ url: url, diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index a2ff10fb0..7c6f9f68d 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ b/playground/src/hooks/batch_module_experiment.ts @@ -412,7 +412,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC } try 
{ - const response = await requestEvaluation.mutateAsync({ + const responses = await requestEvaluation.mutateAsync({ exercise: experiment.exercise, submission, trueFeedbacks: experiment.tutorFeedbacks.filter( @@ -423,11 +423,16 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC if (!isMounted.current) { return; } - console.log(`Received evaluation for submission ${submission.id}:`, response.data); + + const data = Object.fromEntries( + responses.map((response) => [response.module_name, response.data]) + ); + + console.log(`Received evaluation for submission ${submission.id}:`, data); setSubmissionsWithAutomaticEvaluation((prevState) => { const newMap = new Map(prevState); - newMap.set(submission.id, response.data); + newMap.set(submission.id, data); return newMap; }); } catch (error) { diff --git a/playground/src/hooks/module_context.tsx b/playground/src/hooks/module_context.tsx index c2a96232e..f7aab7666 100644 --- a/playground/src/hooks/module_context.tsx +++ b/playground/src/hooks/module_context.tsx @@ -2,7 +2,7 @@ import type { ModuleMeta } from '@/model/health_response'; import { ReactNode, createContext, useContext, useReducer } from 'react'; -type Module = { +export type Module = { module: ModuleMeta; moduleConfig: any; }; diff --git a/playground/src/model/health_response.ts b/playground/src/model/health_response.ts index 7502b21d9..4ffe27de9 100644 --- a/playground/src/model/health_response.ts +++ b/playground/src/model/health_response.ts @@ -2,6 +2,7 @@ export type ModuleMeta = { name: string; type: string; healthy: boolean; + supportsEvaluation: boolean; }; export type HealthResponse = { From 39c729dc879d3ab08f7596278ff11c387c1b5640 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 16:34:34 +0100 Subject: [PATCH 31/54] add UI changes --- .../batch_module_experiment.tsx | 64 ++++-- .../module_experiment_progress.tsx | 193 +++++++++++------- .../src/hooks/batch_module_experiment.ts | 22 +- 3 files changed, 177 insertions(+), 102 deletions(-) diff --git a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/batch_module_experiment.tsx b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/batch_module_experiment.tsx index d2e9fdb02..560d8b204 100644 --- a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/batch_module_experiment.tsx +++ b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/batch_module_experiment.tsx @@ -2,7 +2,12 @@ import type { Submission } from "@/model/submission"; import type { Experiment } from "../define_experiment"; import type { ExperimentStep } from "@/hooks/batch_module_experiment"; -import React, { useImperativeHandle, useState, ForwardedRef, useEffect } from "react"; +import React, { + useImperativeHandle, + useState, + ForwardedRef, + useEffect, +} from "react"; import Modal from "react-modal"; import { FullScreenHandle } from "react-full-screen"; @@ -14,6 +19,7 @@ import { ModuleConfiguration } from "../configure_modules"; import ModuleExperimentProgress from "./module_experiment_progress"; import SubmissionDetail from "@/components/details/submission_detail"; import ModuleConfigSelect from "@/components/selectors/module_config_select"; +import { twMerge } from "tailwind-merge"; type ConductBatchModuleExperimentProps = { experiment: Experiment; @@ -53,7 +59,10 @@ const ConductBatchModuleExperiment = React.forwardRef< ref: ForwardedRef ) => { const { data: health } = useHealth(); - const moduleExperiment = useBatchModuleExperiment(experiment, moduleConfiguration); + const moduleExperiment = useBatchModuleExperiment( + experiment, + moduleConfiguration + ); const [showProgress, setShowProgress] = 
useState(true); const [isConfigModalOpen, setConfigModalOpen] = useState(false); @@ -88,14 +97,6 @@ const ConductBatchModuleExperiment = React.forwardRef<

    {moduleConfiguration.name}

    - {moduleExperiment.continueAfterTraining && ( - - )}
    ) { return ( - + { @@ -86,10 +86,11 @@ export default function ModuleExperimentProgress({ className={twMerge( "flex items-center justify-center w-6 h-6 border rounded-full shrink-0", stepToIndex(data.step) >= 2 - ? stepToIndex(data.step) > 2 || moduleExperiment.continueAfterTraining - ? "text-green-500 border-green-500" - : "text-yellow-500 border-yellow-500" - : "text-gray-500 border-gray-500" + ? stepToIndex(data.step) > 2 || + moduleExperiment.continueAfterTraining + ? "text-green-500 border-green-500" + : "text-yellow-500 border-yellow-500" + : "text-gray-500 border-gray-500" )} > 2 @@ -98,10 +99,11 @@ export default function ModuleExperimentProgress({ className={twMerge( "flex flex-col", stepToIndex(data.step) >= 2 - ? stepToIndex(data.step) > 2 || moduleExperiment.continueAfterTraining - ? "text-green-500" - : "text-yellow-500" - : "text-gray-500" + ? stepToIndex(data.step) > 2 || + moduleExperiment.continueAfterTraining + ? "text-green-500" + : "text-yellow-500" + : "text-gray-500" )} > Sending Training Feedback @@ -129,70 +131,121 @@ export default function ModuleExperimentProgress({ )} {/* Generate Feedback Suggestions */} -
  • - 3 - ? "text-green-500 border-green-500" - : stepToIndex(data.step) === 3 - ? "text-yellow-500 border-yellow-500" - : "text-gray-500 border-gray-500" - )} - > - {experiment.trainingSubmissions ? 3 : 2} - -
    3 - ? "text-green-500" - : stepToIndex(data.step) === 3 - ? "text-yellow-500" - : "text-gray-500" - )} - > - Generating Feedback Suggestions - {moduleRequests.requestFeedbackSuggestions.isLoading && ( - - Generating feedback suggestions... ( - {data.submissionsWithFeedbackSuggestions.size + 1}/ - {experiment.evaluationSubmissions.length}) - - )} - {moduleRequests.requestFeedbackSuggestions.isError && ( - - {moduleRequests.requestFeedbackSuggestions.error.message} - - )} - {moduleRequests.requestFeedbackSuggestions.isSuccess && ( - - Generated feedback suggestions ( - {data.submissionsWithFeedbackSuggestions.size}/ - {experiment.evaluationSubmissions.length}) - - )} +
  • +
    + 3 + ? "text-green-500 border-green-500" + : stepToIndex(data.step) === 3 + ? "text-yellow-500 border-yellow-500" + : "text-gray-500 border-gray-500" + )} + > + {experiment.trainingSubmissions ? 3 : 2} + +
    3 + ? "text-green-500" + : stepToIndex(data.step) === 3 + ? "text-yellow-500" + : "text-gray-500" + )} + > + Generating Feedback Suggestions + {moduleRequests.requestFeedbackSuggestions.isLoading && ( + + Generating feedback suggestions... ( + {data.submissionsWithFeedbackSuggestions.size + 1}/ + {experiment.evaluationSubmissions.length}) + + )} + {moduleRequests.requestFeedbackSuggestions.isError && ( + + {moduleRequests.requestFeedbackSuggestions.error.message} + + )} + {moduleRequests.requestFeedbackSuggestions.isSuccess && ( + + Generated feedback suggestions ( + {data.submissionsWithFeedbackSuggestions.size}/ + {experiment.evaluationSubmissions.length}) + + )} +
    + {moduleExperiment.continueAfterTraining && ( + + )}
  • -
  • - - {experiment.trainingSubmissions ? 4 : 3} - -
    - Finished + + {/* Run Automatic Evaluation */} +
  • +
    + + {experiment.trainingSubmissions ? 4 : 3} + +
    + Run Automatic Evaluation + {moduleRequests.requestEvaluation.isLoading && ( + + Evaluating submissions... ( + {(submissionsWithAutomaticEvaluation?.size ?? 0) + 1}/ + {experiment.evaluationSubmissions.length}) + + )} + {moduleRequests.requestEvaluation.isError && ( + + {moduleRequests.requestEvaluation.error.message} + + )} + {moduleRequests.requestEvaluation.isSuccess && ( + + Evaluated submissions ( + {submissionsWithAutomaticEvaluation?.size ?? 0}/ + {experiment.evaluationSubmissions.length}) + + )} +
    + {moduleExperiment.continueWithAutomaticEvaluation && ( + + )}
  • ); diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index 7c6f9f68d..89480a30a 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ b/playground/src/hooks/batch_module_experiment.ts @@ -54,8 +54,8 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC // Stores automatic evaluation of submissions const [submissionsWithAutomaticEvaluation, setSubmissionsWithAutomaticEvaluation] = useState< - Map - >(new Map()); + Map | undefined + >(undefined); const [processingStep, setProcessingStep] = useState< ExperimentStep | undefined @@ -103,7 +103,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC } : {} ), ...( - submissionsWithAutomaticEvaluation.size > 0 ? { + submissionsWithAutomaticEvaluation && submissionsWithAutomaticEvaluation.size > 0 ? { automaticEvaluation: { type: "automaticEvaluation", runId: data.runId, @@ -189,6 +189,10 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC })); }) : undefined; + const continueWithAutomaticEvaluation = (data.step === "finished" && submissionsWithAutomaticEvaluation === undefined) ? 
(() => { + stepAutomaticEvaluation(); + }) : undefined; + // Module requests const sendSubmissions = useSendSubmissions(); const sendFeedbacks = useSendFeedbacks(); @@ -386,7 +390,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC console.log("Running automatic evaluation..."); let remainingSubmissions = experiment.evaluationSubmissions.filter( - (submission) => !submissionsWithAutomaticEvaluation.has(submission.id) + (submission) => !submissionsWithAutomaticEvaluation?.has(submission.id) ); let index = 0; @@ -477,20 +481,18 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC processingStep !== "generatingFeedbackSuggestions" ) { stepGenerateFeedbackSuggestions(); - } else if ( - data.step === "finished" && - processingStep !== "finished" - ) { - stepAutomaticEvaluation(); - } + } + // Automatic evaluation is triggered manually }, [data.step]); return { data, submissionsWithManualRatings, + submissionsWithAutomaticEvaluation, getManualRatingsSetter, startExperiment, continueAfterTraining, + continueWithAutomaticEvaluation, exportData, importData, moduleRequests: { From fdb073dc6f3d7cfaaf10ec6610851a41a70cdca9 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 16:57:53 +0100 Subject: [PATCH 32/54] fix color --- .../conduct_experiment/module_experiment_progress.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/module_experiment_progress.tsx b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/module_experiment_progress.tsx index 345c9e67a..75d0e9964 100644 --- a/playground/src/components/view_mode/evaluation_mode/conduct_experiment/module_experiment_progress.tsx +++ b/playground/src/components/view_mode/evaluation_mode/conduct_experiment/module_experiment_progress.tsx @@ -197,7 +197,7 @@ export default function ModuleExperimentProgress({ submissionsWithAutomaticEvaluation?.size === data.submissionsWithFeedbackSuggestions.size ? "text-green-500 border-green-500" - : stepToIndex(data.step) === 4 + : stepToIndex(data.step) === 4 && submissionsWithAutomaticEvaluation !== undefined ? "text-yellow-500 border-yellow-500" : "text-gray-500 border-gray-500" )} @@ -211,7 +211,7 @@ export default function ModuleExperimentProgress({ submissionsWithAutomaticEvaluation?.size === data.submissionsWithFeedbackSuggestions.size ? "text-green-500" - : stepToIndex(data.step) === 4 + : stepToIndex(data.step) === 4 && submissionsWithAutomaticEvaluation !== undefined ? "text-yellow-500" : "text-gray-500" )} From d0838f5aca3507235615dedc26ac5543c219ec4b Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 17:18:41 +0100 Subject: [PATCH 33/54] add evaluation model --- module_text_llm/.env.example | 6 ++++++ .../module_text_llm/helpers/models/__init__.py | 12 +++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/module_text_llm/.env.example b/module_text_llm/.env.example index 2ecf0a8f4..aedc5bdba 100644 --- a/module_text_llm/.env.example +++ b/module_text_llm/.env.example @@ -14,6 +14,12 @@ DATABASE_URL=sqlite:///../data/data.sqlite # See below for options, available models are also logged on startup LLM_DEFAULT_MODEL="azure_openai_gpt-35" +# Enable LLM-as-a-judge approach 0 = disabled, 1 = enabled +LLM_ENABLE_LLM_AS_A_JUDGE=1 +# Evaluation model to use for the LLM-as-a-judge approach [Only important if you want to use it in the /evaluate endpoint] +# See below for options, available models are also logged on startup +LLM_EVALUATION_MODEL="azure_openai_gpt-4" + # Standard OpenAI (Non-Azure) [leave blank if not used] # Model names prefixed with `openai_` followed by the model name, e.g. 
`openai_text-davinci-003` # A list of models can be found in `module_text_llm/helpers/models/openai.py` (openai_models) diff --git a/module_text_llm/module_text_llm/helpers/models/__init__.py b/module_text_llm/module_text_llm/helpers/models/__init__.py index 4d2fe5a65..144bcf923 100644 --- a/module_text_llm/module_text_llm/helpers/models/__init__.py +++ b/module_text_llm/module_text_llm/helpers/models/__init__.py @@ -1,10 +1,16 @@ import os -from typing import Type, Union, List +from typing import Type, Union, List, Optional +from langchain.base_language import BaseLanguageModel + from module_text_llm.helpers.models.model_config import ModelConfig DefaultModelConfig: Type[ModelConfig] default_model_name = os.environ.get("LLM_DEFAULT_MODEL") +evaluation_model_name = os.environ.get("LLM_EVALUATION_MODEL") + +# Model used during evaluation for judging the output (should be a more powerful model) +evaluation_model: Optional[BaseLanguageModel] = None types: List[Type[ModelConfig]] = [] try: @@ -12,6 +18,8 @@ types.append(openai_config.OpenAIModelConfig) if default_model_name in openai_config.available_models: DefaultModelConfig = openai_config.OpenAIModelConfig + if evaluation_model_name in openai_config.available_models: + evaluation_model = openai_config.available_models[evaluation_model_name] except AttributeError: pass @@ -20,6 +28,8 @@ types.append(replicate_config.ReplicateModelConfig) if default_model_name in replicate_config.available_models: DefaultModelConfig = replicate_config.ReplicateModelConfig + if evaluation_model_name in replicate_config.available_models: + evaluation_model = replicate_config.available_models[evaluation_model_name] except AttributeError: pass From 05608aa98c73480b48841dbd12a8f82df1fc8be9 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 17:30:33 +0100 Subject: [PATCH 34/54] add llm as a judge --- module_text_llm/module_text_llm/__main__.py | 22 ++++- .../module_text_llm/generate_evaluation.py | 95 +++++++++++++++++++ .../module_text_llm/helpers/utils.py | 25 +++++ .../prompts/generate_evaluation.py | 26 +++++ 4 files changed, 166 insertions(+), 2 deletions(-) create mode 100644 module_text_llm/module_text_llm/generate_evaluation.py create mode 100644 module_text_llm/module_text_llm/prompts/generate_evaluation.py diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index e9bf8d448..e3f7d7769 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -1,14 +1,16 @@ -from typing import List +import os +from typing import List, Any import nltk import tiktoken -from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider +from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider from athena.text import Exercise, Submission, Feedback from athena.logger import logger from module_text_llm.config import Configuration from module_text_llm.generate_suggestions import generate_suggestions +from module_text_llm.generate_evaluation import generate_evaluation @submissions_consumer @@ -33,6 +35,22 @@ async def suggest_feedback(exercise: Exercise, submission: Submission, module_co return await generate_suggestions(exercise, submission, module_config.approach, module_config.debug) +@evaluation_provider +async def evaluate_feedback( + exercise: Exercise, submission: Submission, + true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback], +) -> Any: + logger.info( + "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks", + submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks) + ) + + 
evaluation = {} + if bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")): + evaluation["llm-as-a-judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks) + + return evaluation + if __name__ == "__main__": nltk.download("punkt") tiktoken.get_encoding("cl100k_base") diff --git a/module_text_llm/module_text_llm/generate_evaluation.py b/module_text_llm/module_text_llm/generate_evaluation.py new file mode 100644 index 000000000..4c4a9969b --- /dev/null +++ b/module_text_llm/module_text_llm/generate_evaluation.py @@ -0,0 +1,95 @@ +from typing import List, Sequence, Dict, Literal +from pydantic import BaseModel, Field +import json + +from athena.text import Exercise, Submission, Feedback +from athena.logger import logger + +from module_text_llm.helpers.models import evaluation_model +from module_text_llm.helpers.llm_utils import ( + get_chat_prompt_with_formatting_instructions, + check_prompt_length_and_omit_features_if_necessary, + predict_and_parse +) +from module_text_llm.helpers.utils import add_sentence_numbers, get_line_range_from_index_range +from module_text_llm.prompts.generate_evaluation import system_message, human_message + + +class AccuracyMetric(BaseModel): + id: int = Field(..., description="Feedback ID") + reasoning: str = Field(..., description="Step-by-step critical reasoning of the labels") + acceptance_label: Literal["accepted", "rejected"] = Field(..., description="Estimated acceptance label") + level_of_needed_modification_label: Literal["no", "minor", "major"] = Field(..., description="Estimated level of needed modification") + +class Evaluation(BaseModel): + metrics: Sequence[AccuracyMetric] = Field(...) 
+ + +async def generate_evaluation( + exercise: Exercise, + submission: Submission, + true_feedbacks: List[Feedback], + predicted_feedbacks: List[Feedback] +) -> Dict[int, dict]: + + if evaluation_model is None: + raise EnvironmentError("No evaluation model available, please set up LLM_EVALUATION_MODEL correctly" + "by setting it to one of the available models logged during startup.") + max_input_tokens = 3000 + + def feedback_to_dict(feedback: Feedback): + line_start, line_end = get_line_range_from_index_range( + feedback.index_start, feedback.index_end, submission.text) + return { + "id": feedback.id, + "title": feedback.title, + "description": feedback.description, + "line_start": line_start, + "line_end": line_end, + "credits": feedback.credits + } + + prompt_input = { + "submission": add_sentence_numbers(submission.text), + "true_feedbacks": json.dumps([feedback_to_dict(feedback) for feedback in true_feedbacks]), + "predicted_feedbacks": json.dumps([feedback_to_dict(feedback) for feedback in predicted_feedbacks]), + } + + chat_prompt = get_chat_prompt_with_formatting_instructions( + model=evaluation_model, + system_message=system_message, + human_message=human_message, + pydantic_object=Evaluation + ) + + # Check if the prompt is too long and omit features if necessary (in order of importance) + omittable_features = ["submission"] + prompt_input, should_run = check_prompt_length_and_omit_features_if_necessary( + prompt=chat_prompt, + prompt_input=prompt_input, + max_input_tokens=max_input_tokens, + omittable_features=omittable_features, + debug=False + ) + + if not should_run: + logger.warning("Evaluation input too long. Skipping.") + return {} + + result = await predict_and_parse( + model=evaluation_model, + chat_prompt=chat_prompt, + prompt_input=prompt_input, + pydantic_object=Evaluation, + tags=[ + f"exercise-{exercise.id}", + f"submission-{submission.id}", + "evaluation" + ] + ) + + if result is None: + logger.warning("Evaluation failed. 
Skipping.") + return {} + + return { item.id: item.dict() for item in result.metrics } diff --git a/module_text_llm/module_text_llm/helpers/utils.py b/module_text_llm/module_text_llm/helpers/utils.py index 2ed05aec5..24cf41024 100644 --- a/module_text_llm/module_text_llm/helpers/utils.py +++ b/module_text_llm/module_text_llm/helpers/utils.py @@ -92,3 +92,28 @@ def get_index_range_from_line_range(line_start: Optional[int], line_end: Optiona line_end_index = min(max(int(line_end), 0), len(sentence_spans) - 1) return sentence_spans[line_start_index][0], sentence_spans[line_end_index][1] + + +def get_line_range_from_index_range(index_start: Optional[int], index_end: Optional[int], content: str) -> Tuple[Optional[int], Optional[int]]: + if index_start is None and index_end is None: + return None, None + + index_start = index_start or index_end or 0 + index_end = index_end or index_start or 0 + + if index_start > index_end: + index_start, index_end = index_end, index_start + + sentence_spans = get_sentence_spans(content) + + line_start = None + line_end = None + + for line_number, (start_index, end_index) in enumerate(sentence_spans, start=1): + if start_index <= index_start < end_index: + line_start = line_number + if start_index <= index_end <= end_index: + line_end = line_number + break + + return line_start, line_end \ No newline at end of file diff --git a/module_text_llm/module_text_llm/prompts/generate_evaluation.py b/module_text_llm/module_text_llm/prompts/generate_evaluation.py new file mode 100644 index 000000000..10daa84a4 --- /dev/null +++ b/module_text_llm/module_text_llm/prompts/generate_evaluation.py @@ -0,0 +1,26 @@ +system_message = """\ +You are now an evaluator for feedback accuracy generated by a machine-learning system. + +# Task +Your task is to estimate if a human tutor would accept or reject the feedback suggestion and how much modification is needed to make the feedback useful. 
+ +# Score Criteria +Accept feedback that is useful to the tutor, meaning that it can be applied to the submission with minor or no modification. \ +Our goal is to reduce the workload of tutors and reduce their cognitive load. \ +Reject feedback that is not useful and would burden the tutor. + +Put the focus on the description of the feedback, the title is optional. \ +The `line_start` and `line_end` should make sense with respect to the submission but do not need to be exact. \ +Credits should make sense with respect to the feedback and the submission but also do not need to be exact. + +# Submission (with sentence numbers : ): +{submission} + +# Example (Human) Feedback: +{true_feedbacks} +""" + +human_message = """\ +### Model Output: +{predicted_feedbacks} +""" \ No newline at end of file From c68ba0f96a67500b0bcacf7ba813ad09bd8f97dc Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 17:55:01 +0100 Subject: [PATCH 35/54] fix ui issue and some var naming --- module_text_llm/module_text_llm/__main__.py | 2 +- playground/src/hooks/batch_module_experiment.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index e3f7d7769..cc6c42baa 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -47,7 +47,7 @@ async def evaluate_feedback( evaluation = {} if bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")): - evaluation["llm-as-a-judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks) + evaluation["llm_as_a_judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks) return evaluation diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index 89480a30a..e2610fbb8 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ 
b/playground/src/hooks/batch_module_experiment.ts @@ -190,6 +190,7 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC }) : undefined; const continueWithAutomaticEvaluation = (data.step === "finished" && submissionsWithAutomaticEvaluation === undefined) ? (() => { + setSubmissionsWithAutomaticEvaluation((prevState) => new Map(prevState)); stepAutomaticEvaluation(); }) : undefined; From afb1892727bb9292c4862048195bda24b8d7b729 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 18:54:25 +0100 Subject: [PATCH 36/54] fix line break --- playground/src/components/details/exercise_detail/common.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playground/src/components/details/exercise_detail/common.tsx b/playground/src/components/details/exercise_detail/common.tsx index 852b19a66..8c583a292 100644 --- a/playground/src/components/details/exercise_detail/common.tsx +++ b/playground/src/components/details/exercise_detail/common.tsx @@ -54,7 +54,7 @@ export default function CommonExerciseDetail({ Missing criterion title )} - Grading Criterion {criterion.id} + Grading Criterion {criterion.id} {criterion.structured_grading_instructions.map( From 2a1d4b636ba65cbe504f6be4b1601bd1a4e6cc99 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 19:58:53 +0100 Subject: [PATCH 37/54] add langsmith logging --- module_text_llm/module_text_llm/__main__.py | 47 ++++++++- module_text_llm/poetry.lock | 111 ++++++++++++++++---- module_text_llm/pyproject.toml | 1 + playground/src/pages/api/athena_request.ts | 17 ++- 4 files changed, 151 insertions(+), 25 deletions(-) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index cc6c42baa..0c9bfd29d 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -1,10 +1,13 @@ +import json import os from typing import List, Any import nltk import tiktoken +from langsmith import Client as LangsmithClient +from langsmith.schemas import Run -from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider +from athena import app, get_experiment_environment, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider from athena.text import Exercise, Submission, Feedback from athena.logger import logger @@ -49,6 +52,48 @@ async def evaluate_feedback( if bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")): evaluation["llm_as_a_judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks) + # Gather LLM token usage and response times + if bool(os.environ.get("LANGCHAIN_TRACING_V2")): + experiment = get_experiment_environment() + client = LangsmithClient() + project_name = os.environ.get("LANGCHAIN_PROJECT") + runs = list(client.list_runs( + project_name=project_name, + filter=f'and(has(tags, "run-{experiment.run_id}"), has(tags, "submission-{submission.id}"))' + )) + logger.info("evaluate_feedback: Found %d runs for submission %d of exercise %d.", len(runs), submission.id, exercise.id) + + def get_statistics(runs: List[Run]): + return { + "response_time": sum((run.end_time - run.start_time).total_seconds() for run in 
runs if run.end_time is not None), + "prompt_tokens": sum(run.prompt_tokens for run in runs if run.prompt_tokens is not None), + "completion_tokens": sum(run.completion_tokens for run in runs if run.completion_tokens is not None), + "total_tokens": sum(run.total_tokens for run in runs if run.total_tokens is not None), + } + + suggestion_runs = [] + evaluation_runs = [] + for run in runs: + if "evaluation" in (run.tags or []): + evaluation_runs.append(run) + else: + suggestion_runs.append(run) + + if suggestion_runs or evaluation_runs: + evaluation["runs"] = {} + if suggestion_runs: + evaluation["runs"]["suggestions"] = { + "count": len(suggestion_runs), + "statistics": get_statistics(suggestion_runs), + "runs": [json.loads(run.json()) for run in suggestion_runs] + } + if evaluation_runs: + evaluation["runs"]["evaluation"] = { + "count": len(evaluation_runs), + "statistics": get_statistics(evaluation_runs), + "runs": [json.loads(run.json()) for run in evaluation_runs] + } + return evaluation if __name__ == "__main__": diff --git a/module_text_llm/poetry.lock b/module_text_llm/poetry.lock index 96c269625..28e23ad99 100644 --- a/module_text_llm/poetry.lock +++ b/module_text_llm/poetry.lock @@ -1,9 +1,10 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. 
[[package]] name = "aiohttp" version = "3.8.6" description = "Async http client/server framework (asyncio)" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -112,6 +113,7 @@ speedups = ["Brotli", "aiodns", "cchardet"] name = "aiosignal" version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -126,6 +128,7 @@ frozenlist = ">=1.1.0" name = "anyio" version = "3.7.1" description = "High level compatibility layer for multiple asynchronous event loop implementations" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -146,6 +149,7 @@ trio = ["trio (<0.22)"] name = "astroid" version = "2.15.8" description = "An abstract syntax tree for Python with inference support." +category = "dev" optional = false python-versions = ">=3.7.2" files = [ @@ -161,6 +165,7 @@ wrapt = {version = ">=1.14,<2", markers = "python_version >= \"3.11\""} name = "async-timeout" version = "4.0.3" description = "Timeout context manager for asyncio programs" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -172,6 +177,7 @@ files = [ name = "athena" version = "1.0.0" description = "This is a helper module for easier development of Athena modules. It provides communication functionality with the Assessment Module manager, as well as helper functions for storage." +category = "main" optional = false python-versions = "3.11.*" files = [] @@ -193,6 +199,7 @@ url = "../athena" name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -211,6 +218,7 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte name = "certifi" version = "2023.7.22" description = "Python package for providing Mozilla's CA Bundle." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -222,6 +230,7 @@ files = [ name = "charset-normalizer" version = "3.3.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -321,6 +330,7 @@ files = [ name = "click" version = "8.1.7" description = "Composable command line interface toolkit" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -335,6 +345,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -346,6 +357,7 @@ files = [ name = "dataclasses-json" version = "0.6.1" description = "Easily serialize dataclasses to and from JSON." +category = "main" optional = false python-versions = ">=3.7,<4.0" files = [ @@ -361,6 +373,7 @@ typing-inspect = ">=0.4.0,<1" name = "dill" version = "0.3.7" description = "serialize all of Python" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -375,6 +388,7 @@ graph = ["objgraph (>=1.7.2)"] name = "dodgy" version = "0.2.1" description = "Dodgy: Searches for dodgy looking lines in Python code" +category = "dev" optional = false python-versions = "*" files = [ @@ -386,6 +400,7 @@ files = [ name = "fastapi" version = "0.96.1" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -407,6 +422,7 @@ test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==23.1.0)", "coverage[toml] (>=6 name = "flake8" version = "2.3.0" description = "the modular source code checker: pep8, pyflakes and co" +category = "dev" optional = false python-versions = "*" files = [ @@ -423,6 
+439,7 @@ pyflakes = ">=0.8.1" name = "flake8-polyfill" version = "1.0.2" description = "Polyfill package for Flake8 plugins" +category = "dev" optional = false python-versions = "*" files = [ @@ -437,6 +454,7 @@ flake8 = "*" name = "frozenlist" version = "1.4.0" description = "A list-like structure which implements collections.abc.MutableSequence" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -507,6 +525,7 @@ files = [ name = "gitdb" version = "4.0.11" description = "Git Object Database" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -521,6 +540,7 @@ smmap = ">=3.0.1,<6" name = "gitpython" version = "3.1.40" description = "GitPython is a Python library used to interact with Git repositories" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -538,6 +558,7 @@ test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre name = "greenlet" version = "3.0.1" description = "Lightweight in-process concurrent programming" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -608,6 +629,7 @@ test = ["objgraph", "psutil"] name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -619,6 +641,7 @@ files = [ name = "httpcore" version = "0.17.3" description = "A minimal low-level HTTP client." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -630,16 +653,17 @@ files = [ anyio = ">=3.0,<5.0" certifi = "*" h11 = ">=0.13,<0.15" -sniffio = "==1.*" +sniffio = ">=1.0.0,<2.0.0" [package.extras] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] +socks = ["socksio (>=1.0.0,<2.0.0)"] [[package]] name = "httpx" version = "0.24.1" description = "The next generation HTTP client." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -655,14 +679,15 @@ sniffio = "*" [package.extras] brotli = ["brotli", "brotlicffi"] -cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] +socks = ["socksio (>=1.0.0,<2.0.0)"] [[package]] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -674,6 +699,7 @@ files = [ name = "isort" version = "5.12.0" description = "A Python utility / library to sort Python imports." +category = "dev" optional = false python-versions = ">=3.8.0" files = [ @@ -691,6 +717,7 @@ requirements-deprecated-finder = ["pip-api", "pipreqs"] name = "joblib" version = "1.3.2" description = "Lightweight pipelining with Python functions" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -702,6 +729,7 @@ files = [ name = "jsonpatch" version = "1.33" description = "Apply JSON-Patches (RFC 6902)" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -716,6 +744,7 @@ jsonpointer = ">=1.9" name = "jsonpointer" version = "2.4" description = "Identify specific nodes in a JSON document (RFC 6901)" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -727,6 +756,7 @@ files = [ name = "langchain" version = "0.0.325" description = "Building applications with LLMs through composability" +category = "main" optional = false python-versions = ">=3.8.1,<4.0" files = [ @@ -764,13 +794,14 @@ text-helpers = ["chardet (>=5.1.0,<6.0.0)"] [[package]] name = "langsmith" -version = "0.0.52" +version = "0.0.60" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation 
Platform." +category = "main" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langsmith-0.0.52-py3-none-any.whl", hash = "sha256:d02a0ade5a53b36143084e57003ed38ccbdf5fc15a5a0eb14f8989ceaee0b807"}, - {file = "langsmith-0.0.52.tar.gz", hash = "sha256:1dc29082d257deea1859cb22c53d9481ca5c4a37f3af40c0f9d300fb8adc91db"}, + {file = "langsmith-0.0.60-py3-none-any.whl", hash = "sha256:94f9ef9898fa5fb5afed72538bb3ccca9a92a841b37654d699c732a76c623379"}, + {file = "langsmith-0.0.60.tar.gz", hash = "sha256:f63513398d8d4530e3aa552926924c8443ac9d21c3812f303fa20fa2c44a9a42"}, ] [package.dependencies] @@ -781,6 +812,7 @@ requests = ">=2,<3" name = "lazy-object-proxy" version = "1.9.0" description = "A fast and thorough lazy object proxy." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -826,6 +858,7 @@ files = [ name = "marshmallow" version = "3.20.1" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -846,6 +879,7 @@ tests = ["pytest", "pytz", "simplejson"] name = "mccabe" version = "0.7.0" description = "McCabe checker, plugin for flake8" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -857,6 +891,7 @@ files = [ name = "multidict" version = "6.0.4" description = "multidict implementation" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -940,6 +975,7 @@ files = [ name = "mypy" version = "1.6.1" description = "Optional static typing for Python" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -985,6 +1021,7 @@ reports = ["lxml"] name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
+category = "main" optional = false python-versions = ">=3.5" files = [ @@ -996,6 +1033,7 @@ files = [ name = "nltk" version = "3.8.1" description = "Natural Language Toolkit" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1021,6 +1059,7 @@ twitter = ["twython"] name = "numpy" version = "1.26.1" description = "Fundamental package for array computing in Python" +category = "main" optional = false python-versions = "<3.13,>=3.9" files = [ @@ -1062,6 +1101,7 @@ files = [ name = "openai" version = "0.27.10" description = "Python client library for the OpenAI API" +category = "main" optional = false python-versions = ">=3.7.1" files = [ @@ -1076,7 +1116,7 @@ tqdm = "*" [package.extras] datalib = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] -dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-mock"] +dev = ["black (>=21.6b0,<22.0)", "pytest (>=6.0.0,<7.0.0)", "pytest-asyncio", "pytest-mock"] embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"] wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] @@ -1084,6 +1124,7 @@ wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1 name = "packaging" version = "23.2" description = "Core utilities for Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1095,6 +1136,7 @@ files = [ name = "pep8" version = "1.7.1" description = "Python style guide checker" +category = "dev" optional = false python-versions = "*" files = [ @@ -1106,6 +1148,7 @@ files = [ name = "pep8-naming" version = "0.10.0" description = "Check PEP-8 naming conventions, plugin for flake8" +category = "dev" optional = false python-versions = "*" files = [ @@ -1120,6 +1163,7 @@ flake8-polyfill = ">=1.0.2,<2" name = "platformdirs" version = "3.11.0" 
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1135,6 +1179,7 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-co name = "prospector" version = "1.10.3" description = "Prospector is a tool to analyse Python code by aggregating the result of other tools." +category = "dev" optional = false python-versions = ">=3.7.2,<4.0" files = [ @@ -1174,6 +1219,7 @@ with-vulture = ["vulture (>=1.5)"] name = "psycopg2" version = "2.9.9" description = "psycopg2 - Python-PostgreSQL Database Adapter" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1181,8 +1227,6 @@ files = [ {file = "psycopg2-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:426f9f29bde126913a20a96ff8ce7d73fd8a216cfb323b1f04da402d452853c3"}, {file = "psycopg2-2.9.9-cp311-cp311-win32.whl", hash = "sha256:ade01303ccf7ae12c356a5e10911c9e1c51136003a9a1d92f7aa9d010fb98372"}, {file = "psycopg2-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:121081ea2e76729acfb0673ff33755e8703d45e926e416cb59bae3a86c6a4981"}, - {file = "psycopg2-2.9.9-cp312-cp312-win32.whl", hash = "sha256:d735786acc7dd25815e89cc4ad529a43af779db2e25aa7c626de864127e5a024"}, - {file = "psycopg2-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:a7653d00b732afb6fc597e29c50ad28087dcb4fbfb28e86092277a559ae4e693"}, {file = "psycopg2-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:5e0d98cade4f0e0304d7d6f25bbfbc5bd186e07b38eac65379309c4ca3193efa"}, {file = "psycopg2-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:7e2dacf8b009a1c1e843b5213a87f7c544b2b042476ed7755be813eaf4e8347a"}, {file = "psycopg2-2.9.9-cp38-cp38-win32.whl", hash = "sha256:ff432630e510709564c01dafdbe996cb552e0b9f3f065eb89bdce5bd31fabf4c"}, @@ -1196,6 +1240,7 @@ files = [ name = "pycodestyle" version = "2.11.1" description = "Python style guide checker" +category = "dev" optional = false python-versions 
= ">=3.8" files = [ @@ -1207,6 +1252,7 @@ files = [ name = "pydantic" version = "1.10.13" description = "Data validation and settings management using python type hints" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1259,6 +1305,7 @@ email = ["email-validator (>=1.0.3)"] name = "pydocstyle" version = "6.3.0" description = "Python docstring style checker" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1276,6 +1323,7 @@ toml = ["tomli (>=1.2.3)"] name = "pyflakes" version = "2.5.0" description = "passive checker of Python programs" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1287,6 +1335,7 @@ files = [ name = "pylint" version = "2.17.7" description = "python code static checker" +category = "dev" optional = false python-versions = ">=3.7.2" files = [ @@ -1311,6 +1360,7 @@ testutils = ["gitpython (>3)"] name = "pylint-celery" version = "0.3" description = "pylint-celery is a Pylint plugin to aid Pylint in recognising and understandingerrors caused when using the Celery library" +category = "dev" optional = false python-versions = "*" files = [ @@ -1326,6 +1376,7 @@ pylint-plugin-utils = ">=0.2.1" name = "pylint-django" version = "2.5.3" description = "A Pylint plugin to help Pylint understand the Django web framework" +category = "dev" optional = false python-versions = "*" files = [ @@ -1345,6 +1396,7 @@ with-django = ["Django"] name = "pylint-flask" version = "0.6" description = "pylint-flask is a Pylint plugin to aid Pylint in recognizing and understanding errors caused when using Flask" +category = "dev" optional = false python-versions = "*" files = [ @@ -1358,6 +1410,7 @@ pylint-plugin-utils = ">=0.2.1" name = "pylint-plugin-utils" version = "0.7" description = "Utilities and helpers for writing Pylint plugins" +category = "dev" optional = false python-versions = ">=3.6.2" files = [ @@ -1372,6 +1425,7 @@ pylint = ">=1.7" name = "python-dotenv" version = "1.0.0" description = 
"Read key-value pairs from a .env file and set them as environment variables" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1386,6 +1440,7 @@ cli = ["click (>=5.0)"] name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1394,7 +1449,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -1402,15 +1456,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = 
"PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -1427,7 +1474,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = 
"PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -1435,7 +1481,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -1445,6 +1490,7 @@ files = [ name = "regex" version = "2023.10.3" description = "Alternative 
regular expression module, to replace re." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1542,6 +1588,7 @@ files = [ name = "replicate" version = "0.11.0" description = "Python client for Replicate" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1561,6 +1608,7 @@ dev = ["black", "mypy", "pytest", "responses", "ruff"] name = "requests" version = "2.31.0" description = "Python HTTP for Humans." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1582,6 +1630,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requirements-detector" version = "1.2.2" description = "Python tool to find and list requirements of a Python project" +category = "dev" optional = false python-versions = ">=3.7,<4.0" files = [ @@ -1599,6 +1648,7 @@ toml = ">=0.10.2,<0.11.0" name = "semver" version = "3.0.2" description = "Python helper for Semantic Versioning (https://semver.org)" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1610,6 +1660,7 @@ files = [ name = "setoptconf-tmp" version = "0.3.1" description = "A module for retrieving program settings from various sources in a consistant method." +category = "dev" optional = false python-versions = "*" files = [ @@ -1624,6 +1675,7 @@ yaml = ["pyyaml"] name = "smmap" version = "5.0.1" description = "A pure Python implementation of a sliding window memory map manager" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1635,6 +1687,7 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1646,6 +1699,7 @@ files = [ name = "snowballstemmer" version = "2.2.0" description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." 
+category = "dev" optional = false python-versions = "*" files = [ @@ -1657,6 +1711,7 @@ files = [ name = "sqlalchemy" version = "2.0.22" description = "Database Abstraction Library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1744,6 +1799,7 @@ sqlcipher = ["sqlcipher3-binary"] name = "starlette" version = "0.27.0" description = "The little ASGI library that shines." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1761,6 +1817,7 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyam name = "tenacity" version = "8.2.3" description = "Retry code until it succeeds" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1775,6 +1832,7 @@ doc = ["reno", "sphinx", "tornado (>=4.5)"] name = "tiktoken" version = "0.4.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1820,6 +1878,7 @@ blobfile = ["blobfile (>=2)"] name = "toml" version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1831,6 +1890,7 @@ files = [ name = "tomlkit" version = "0.12.1" description = "Style preserving TOML library" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1842,6 +1902,7 @@ files = [ name = "tqdm" version = "4.66.1" description = "Fast, Extensible Progress Meter" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1862,6 +1923,7 @@ telegram = ["requests"] name = "typing-extensions" version = "4.8.0" description = "Backported and Experimental Type Hints for Python 3.8+" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1873,6 +1935,7 @@ files = [ name = "typing-inspect" version = "0.9.0" description = "Runtime inspection utilities for typing module." 
+category = "main" optional = false python-versions = "*" files = [ @@ -1888,6 +1951,7 @@ typing-extensions = ">=3.7.4" name = "urllib3" version = "2.0.7" description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1905,6 +1969,7 @@ zstd = ["zstandard (>=0.18.0)"] name = "uvicorn" version = "0.23.2" description = "The lightning-fast ASGI server." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1923,6 +1988,7 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", name = "wrapt" version = "1.15.0" description = "Module for decorators, wrappers and monkey patching." +category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -2007,6 +2073,7 @@ files = [ name = "yarl" version = "1.9.2" description = "Yet another URL library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2093,4 +2160,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "3.11.*" -content-hash = "680a5df064fcdd1cac69f7130fe0cc41571497de32b7797be0f88a0aa4e7d098" +content-hash = "844e1ad75ca9b73100279326d787a4621e504c69482e4348051b214e941fd49d" diff --git a/module_text_llm/pyproject.toml b/module_text_llm/pyproject.toml index e3d7ba38f..9610767ea 100644 --- a/module_text_llm/pyproject.toml +++ b/module_text_llm/pyproject.toml @@ -15,6 +15,7 @@ nltk = "^3.8.1" gitpython = "^3.1.37" replicate = "^0.11.0" tiktoken = "^0.4.0" +langsmith = "^0.0.60" [tool.poetry.scripts] module = "athena:run_module" diff --git a/playground/src/pages/api/athena_request.ts b/playground/src/pages/api/athena_request.ts index de94908fc..b3971db81 100644 --- a/playground/src/pages/api/athena_request.ts +++ b/playground/src/pages/api/athena_request.ts @@ -17,7 +17,20 @@ export default async function handler( const url = req.query.url; let response; const secret = 
req.headers["authorization"] as string; - const moduleConfig = req.headers["x-module-config"] as string | undefined; + const forwardHeaders = [ + "X-Module-Config", + "X-Experiment-ID", + "X-Module-Configuration-ID", + "X-Run-ID", + ] + + const headers = Object.fromEntries( + forwardHeaders.flatMap((header) => { + const value = req.headers[header.toLowerCase()] as string | undefined; + return value ? [[header, value]] : []; + }) + ) + if (!secret) { console.warn("No secret provided"); } @@ -27,7 +40,7 @@ export default async function handler( "Content-Type": "application/json", Accept: "application/json", "Authorization": secret, - ...(moduleConfig && { "X-Module-Config": moduleConfig }), + ...headers, }, method: req.method, ...(req.method === "POST" ? { body: JSON.stringify(req.body) } : {}), From e845e1f76e6f0d03714c194f94c284be4a1d5e3c Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 20:04:14 +0100 Subject: [PATCH 38/54] inline statistics --- module_text_llm/module_text_llm/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index 0c9bfd29d..9b81dfedb 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -84,13 +84,13 @@ def get_statistics(runs: List[Run]): if suggestion_runs: evaluation["runs"]["suggestions"] = { "count": len(suggestion_runs), - "statistics": get_statistics(suggestion_runs), + **get_statistics(suggestion_runs), "runs": [json.loads(run.json()) for run in suggestion_runs] } if evaluation_runs: evaluation["runs"]["evaluation"] = { "count": len(evaluation_runs), - "statistics": get_statistics(evaluation_runs), + **get_statistics(evaluation_runs), "runs": [json.loads(run.json()) for run in evaluation_runs] } From d17e48609dc3f2d3137d27d23ba37ea495db9f52 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 20:46:36 +0100 Subject: [PATCH 39/54] add sgi evaluation --- module_text_llm/module_text_llm/__main__.py | 56 +++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index 9b81dfedb..75897b438 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -94,6 +94,62 @@ def get_statistics(runs: List[Run]): "runs": [json.loads(run.json()) for run in evaluation_runs] } + actual_feedback_count = len(true_feedbacks) + actual_feedback_with_grading_instructions = [] + suggestions_count = len(predicted_feedbacks) + suggestions_with_grading_instructions = [] + + # Init usage counts for SGIs + actual_sgi_usage = { + sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions + } + suggested_sgi_usage = { + sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions + } + + # Count SGIs in actual feedbacks + for feedback in true_feedbacks: + if feedback.structured_grading_instruction_id: + actual_feedback_with_grading_instructions.append(feedback) + actual_sgi_usage[feedback.structured_grading_instruction_id] += 1 + + # Count SGIs in suggested feedbacks + for feedback in predicted_feedbacks: + if feedback.structured_grading_instruction_id: + suggestions_with_grading_instructions.append(feedback) + suggested_sgi_usage[feedback.structured_grading_instruction_id] += 1 + + actual_feedback_with_grading_instructions_count = len(actual_feedback_with_grading_instructions) + suggestions_with_grading_instructions_count = len(suggestions_with_grading_instructions) + + # Match SGIs + matched_feedback = 0 + unmatched_feedback = actual_feedback_count - actual_feedback_with_grading_instructions_count + unmatched_suggestions = suggestions_count - suggestions_with_grading_instructions_count + + for feedback in 
actual_feedback_with_grading_instructions: + for index, suggestion in enumerate(suggestions_with_grading_instructions): + if feedback.structured_grading_instruction_id == suggestion.structured_grading_instruction_id: + matched_feedback += 1 + del suggestions_with_grading_instructions[index] + break + else: + unmatched_feedback += 1 + + unmatched_suggestions += len(suggestions_with_grading_instructions) + + evaluation["feedback_statistics"] = { + "actual_feedback_count": actual_feedback_count, + "suggestions_count": suggestions_count, + "actual_feedback_with_grading_instructions_count": actual_feedback_with_grading_instructions_count, + "suggestions_with_grading_instructions_count": suggestions_with_grading_instructions_count, + "actual_sgi_usage": actual_sgi_usage, + "suggested_sgi_usage": suggested_sgi_usage, + "matched_feedback": matched_feedback, + "unmatched_feedback": unmatched_feedback, + "unmatched_suggestions": unmatched_suggestions, + } + return evaluation if __name__ == "__main__": From 52664610226f0c4faa1f0aa5fce92135c403841d Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 21:04:54 +0100 Subject: [PATCH 40/54] refactor --- module_text_llm/module_text_llm/__main__.py | 117 +++--------------- module_text_llm/module_text_llm/evaluation.py | 117 ++++++++++++++++++ 2 files changed, 131 insertions(+), 103 deletions(-) create mode 100644 module_text_llm/module_text_llm/evaluation.py diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index 75897b438..430f7b7fb 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -4,14 +4,13 @@ import nltk import tiktoken -from langsmith import Client as LangsmithClient -from langsmith.schemas import Run -from athena import app, get_experiment_environment, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider +from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider from athena.text import Exercise, Submission, Feedback from athena.logger import logger from module_text_llm.config import Configuration +from module_text_llm.evaluation import get_feedback_statistics, get_llm_statistics from module_text_llm.generate_suggestions import generate_suggestions from module_text_llm.generate_evaluation import generate_evaluation @@ -44,112 +43,24 @@ async def evaluate_feedback( true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback], ) -> Any: logger.info( - "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks", - submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks) + "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks", + submission.id, exercise.id, len( + true_feedbacks), len(predicted_feedbacks) ) - + evaluation = {} + + # 1. 
LLM as a judge if bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")): evaluation["llm_as_a_judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks) - # Gather LLM token usage and response times + # 2. LangSmith runs, token usage, and respose times if bool(os.environ.get("LANGCHAIN_TRACING_V2")): - experiment = get_experiment_environment() - client = LangsmithClient() - project_name = os.environ.get("LANGCHAIN_PROJECT") - runs = list(client.list_runs( - project_name=project_name, - filter=f'and(has(tags, "run-{experiment.run_id}"), has(tags, "submission-{submission.id}"))' - )) - logger.info("evaluate_feedback: Found %d runs for submission %d of exercise %d.", len(runs), submission.id, exercise.id) - - def get_statistics(runs: List[Run]): - return { - "response_time": sum((run.end_time - run.start_time).total_seconds() for run in runs if run.end_time is not None), - "prompt_tokens": sum(run.prompt_tokens for run in runs if run.prompt_tokens is not None), - "completion_tokens": sum(run.completion_tokens for run in runs if run.completion_tokens is not None), - "total_tokens": sum(run.total_tokens for run in runs if run.total_tokens is not None), - } - - suggestion_runs = [] - evaluation_runs = [] - for run in runs: - if "evaluation" in (run.tags or []): - evaluation_runs.append(run) - else: - suggestion_runs.append(run) - - if suggestion_runs or evaluation_runs: - evaluation["runs"] = {} - if suggestion_runs: - evaluation["runs"]["suggestions"] = { - "count": len(suggestion_runs), - **get_statistics(suggestion_runs), - "runs": [json.loads(run.json()) for run in suggestion_runs] - } - if evaluation_runs: - evaluation["runs"]["evaluation"] = { - "count": len(evaluation_runs), - **get_statistics(evaluation_runs), - "runs": [json.loads(run.json()) for run in evaluation_runs] - } - - actual_feedback_count = len(true_feedbacks) - actual_feedback_with_grading_instructions = [] - suggestions_count = len(predicted_feedbacks) - 
suggestions_with_grading_instructions = [] - - # Init usage counts for SGIs - actual_sgi_usage = { - sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions - } - suggested_sgi_usage = { - sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions - } - - # Count SGIs in actual feedbacks - for feedback in true_feedbacks: - if feedback.structured_grading_instruction_id: - actual_feedback_with_grading_instructions.append(feedback) - actual_sgi_usage[feedback.structured_grading_instruction_id] += 1 - - # Count SGIs in suggested feedbacks - for feedback in predicted_feedbacks: - if feedback.structured_grading_instruction_id: - suggestions_with_grading_instructions.append(feedback) - suggested_sgi_usage[feedback.structured_grading_instruction_id] += 1 - - actual_feedback_with_grading_instructions_count = len(actual_feedback_with_grading_instructions) - suggestions_with_grading_instructions_count = len(suggestions_with_grading_instructions) - - # Match SGIs - matched_feedback = 0 - unmatched_feedback = actual_feedback_count - actual_feedback_with_grading_instructions_count - unmatched_suggestions = suggestions_count - suggestions_with_grading_instructions_count - - for feedback in actual_feedback_with_grading_instructions: - for index, suggestion in enumerate(suggestions_with_grading_instructions): - if feedback.structured_grading_instruction_id == suggestion.structured_grading_instruction_id: - matched_feedback += 1 - del suggestions_with_grading_instructions[index] - break - else: - unmatched_feedback += 1 - - unmatched_suggestions += len(suggestions_with_grading_instructions) - - evaluation["feedback_statistics"] = { - "actual_feedback_count": actual_feedback_count, - "suggestions_count": suggestions_count, - "actual_feedback_with_grading_instructions_count": actual_feedback_with_grading_instructions_count, - "suggestions_with_grading_instructions_count": 
suggestions_with_grading_instructions_count, - "actual_sgi_usage": actual_sgi_usage, - "suggested_sgi_usage": suggested_sgi_usage, - "matched_feedback": matched_feedback, - "unmatched_feedback": unmatched_feedback, - "unmatched_suggestions": unmatched_suggestions, - } - + evaluation["llm_statistics"] = get_llm_statistics(submission) + + # 3. Feedback statistics + evaluation["feedback_statistics"] = get_feedback_statistics(exercise, submission, true_feedbacks, predicted_feedbacks) + return evaluation if __name__ == "__main__": diff --git a/module_text_llm/module_text_llm/evaluation.py b/module_text_llm/module_text_llm/evaluation.py new file mode 100644 index 000000000..2d6989892 --- /dev/null +++ b/module_text_llm/module_text_llm/evaluation.py @@ -0,0 +1,117 @@ +import json +import os +from typing import List + +from langsmith import Client as LangSmithClient +from langsmith.schemas import Run + +from athena import get_experiment_environment +from athena.text import Exercise, Submission, Feedback + + +def get_llm_statistics(submission: Submission): + experiment = get_experiment_environment() + client = LangSmithClient() + project_name = os.environ.get("LANGCHAIN_PROJECT") + runs = list(client.list_runs( + project_name=project_name, + filter=f'and(has(tags, "run-{experiment.run_id}"), has(tags, "submission-{submission.id}"))' + )) + + def get_statistics(runs: List[Run]): + return { + "response_time": sum((run.end_time - run.start_time).total_seconds() for run in runs if run.end_time is not None), + "prompt_tokens": sum(run.prompt_tokens for run in runs if run.prompt_tokens is not None), + "completion_tokens": sum(run.completion_tokens for run in runs if run.completion_tokens is not None), + "total_tokens": sum(run.total_tokens for run in runs if run.total_tokens is not None), + } + + suggestion_runs = [] + evaluation_runs = [] + for run in runs: + if "evaluation" in (run.tags or []): + evaluation_runs.append(run) + else: + suggestion_runs.append(run) + + 
llm_statistics = {} + if suggestion_runs or evaluation_runs: + if suggestion_runs: + llm_statistics["suggestions"] = { + "count": len(suggestion_runs), + **get_statistics(suggestion_runs), + "runs": [json.loads(run.json()) for run in suggestion_runs] + } + if evaluation_runs: + llm_statistics["evaluation"] = { + "count": len(evaluation_runs), + **get_statistics(evaluation_runs), + "runs": [json.loads(run.json()) for run in evaluation_runs] + } + + return llm_statistics + + +def get_feedback_statistics(exercise: Exercise, submission: Submission, + true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]): + actual_feedback_count = len(true_feedbacks) + actual_feedback_with_grading_instructions = [] + suggestions_count = len(predicted_feedbacks) + suggestions_with_grading_instructions = [] + + # Init usage counts for SGIs + actual_sgi_usage = { + sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions + } + suggested_sgi_usage = { + sgi.id: 0 for criterion in exercise.grading_criteria or [] for sgi in criterion.structured_grading_instructions + } + + # Count SGIs in actual feedbacks + for feedback in true_feedbacks: + if feedback.structured_grading_instruction_id: + actual_feedback_with_grading_instructions.append(feedback) + actual_sgi_usage[feedback.structured_grading_instruction_id] += 1 + + # Count SGIs in suggested feedbacks + for feedback in predicted_feedbacks: + if feedback.structured_grading_instruction_id: + suggestions_with_grading_instructions.append(feedback) + suggested_sgi_usage[feedback.structured_grading_instruction_id] += 1 + + actual_feedback_with_grading_instructions_count = len( + actual_feedback_with_grading_instructions) + suggestions_with_grading_instructions_count = len( + suggestions_with_grading_instructions) + + # Match SGIs + matched_feedback = 0 + unmatched_feedback = actual_feedback_count - \ + actual_feedback_with_grading_instructions_count + unmatched_suggestions = 
suggestions_count - \ + suggestions_with_grading_instructions_count + + for feedback in actual_feedback_with_grading_instructions: + for index, suggestion in enumerate(suggestions_with_grading_instructions): + if feedback.structured_grading_instruction_id == suggestion.structured_grading_instruction_id: + matched_feedback += 1 + del suggestions_with_grading_instructions[index] + break + else: + unmatched_feedback += 1 + + unmatched_suggestions += len(suggestions_with_grading_instructions) + + feedback_statistics = { + "actual_feedback_count": actual_feedback_count, + "suggestions_count": suggestions_count, + "actual_feedback_with_grading_instructions_count": actual_feedback_with_grading_instructions_count, + "suggestions_with_grading_instructions_count": suggestions_with_grading_instructions_count, + "actual_sgi_usage": actual_sgi_usage, + "suggested_sgi_usage": suggested_sgi_usage, + "matched_feedback": matched_feedback, + "unmatched_feedback": unmatched_feedback, + "unmatched_suggestions": unmatched_suggestions, + } + + return feedback_statistics From 9f7494ad925aa405ad5b19f7c748c16df20f9e2b Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 21:05:24 +0100 Subject: [PATCH 41/54] remove unused --- module_text_llm/module_text_llm/__main__.py | 2 +- module_text_llm/module_text_llm/evaluation.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index 430f7b7fb..1fbeb4cf8 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -59,7 +59,7 @@ async def evaluate_feedback( evaluation["llm_statistics"] = get_llm_statistics(submission) # 3. 
Feedback statistics - evaluation["feedback_statistics"] = get_feedback_statistics(exercise, submission, true_feedbacks, predicted_feedbacks) + evaluation["feedback_statistics"] = get_feedback_statistics(exercise, true_feedbacks, predicted_feedbacks) return evaluation diff --git a/module_text_llm/module_text_llm/evaluation.py b/module_text_llm/module_text_llm/evaluation.py index 2d6989892..055dc2c94 100644 --- a/module_text_llm/module_text_llm/evaluation.py +++ b/module_text_llm/module_text_llm/evaluation.py @@ -52,8 +52,7 @@ def get_statistics(runs: List[Run]): return llm_statistics -def get_feedback_statistics(exercise: Exercise, submission: Submission, - true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]): +def get_feedback_statistics(exercise: Exercise, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]): actual_feedback_count = len(true_feedbacks) actual_feedback_with_grading_instructions = [] suggestions_count = len(predicted_feedbacks) From d44178b7f3c97e190d627a9b602e2f3b1b5e982c Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 21:36:17 +0100 Subject: [PATCH 42/54] update ini --- assessment_module_manager/modules.docker.ini | 7 ++++++- assessment_module_manager/modules.ini | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/assessment_module_manager/modules.docker.ini b/assessment_module_manager/modules.docker.ini index db9d22854..340efa797 100644 --- a/assessment_module_manager/modules.docker.ini +++ b/assessment_module_manager/modules.docker.ini @@ -1,19 +1,24 @@ [module_example] url = http://module-example:5001 type = programming +supports_evaluation = false [module_programming_llm] url = http://module-programming-llm:5002 type = programming +supports_evaluation = false [module_text_llm] url = http://module-text-llm:5003 type = text +supports_evaluation = true [module_text_cofee] url = http://module-text-cofee:5004 type = text +supports_evaluation = false [module_programming_themisml] url = http://module-programming-themisml:5005 -type = programming \ No newline at end of file +type = programming +supports_evaluation = false \ No newline at end of file diff --git a/assessment_module_manager/modules.ini b/assessment_module_manager/modules.ini index 3402183f0..70745eb78 100644 --- a/assessment_module_manager/modules.ini +++ b/assessment_module_manager/modules.ini @@ -1,7 +1,7 @@ [module_example] url = http://localhost:5001 type = programming -supports_evaluation = true +supports_evaluation = false [module_programming_llm] url = http://localhost:5002 @@ -11,7 +11,7 @@ supports_evaluation = false [module_text_llm] url = http://localhost:5003 type = text -supports_evaluation = false +supports_evaluation = true [module_text_cofee] url = http://localhost:5004 From 42f8210f37a3254da2701b9848438faf91f752c9 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Tue, 7 Nov 2023 21:40:51 +0100 Subject: [PATCH 43/54] only use selected modules --- playground/src/hooks/athena/request_evaluation.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playground/src/hooks/athena/request_evaluation.ts b/playground/src/hooks/athena/request_evaluation.ts index 76227258f..620fb362b 100644 --- a/playground/src/hooks/athena/request_evaluation.ts +++ b/playground/src/hooks/athena/request_evaluation.ts @@ -48,7 +48,7 @@ export default function useRequestEvaluation( const modules = onlyUseContextModule ? [contextModule] : Object.values(health?.modules ?? {}).filter( - (module) => module.healthy && module.type === contextModule.type + (module) => module.healthy && module.type === contextModule.type && module.supportsEvaluation ); const results = await Promise.allSettled( From cc5693dbcaa0916276a8ccc3ff65d1e2df3b7824 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 22:49:19 +0100 Subject: [PATCH 44/54] remove skip --- module_text_llm/module_text_llm/__main__.py | 2 +- playground/src/hooks/batch_module_experiment.ts | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/module_text_llm/module_text_llm/__main__.py b/module_text_llm/module_text_llm/__main__.py index 1fbeb4cf8..49d069635 100644 --- a/module_text_llm/module_text_llm/__main__.py +++ b/module_text_llm/module_text_llm/__main__.py @@ -51,7 +51,7 @@ async def evaluate_feedback( evaluation = {} # 1. LLM as a judge - if bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")): + if len(predicted_feedbacks) > 0 and bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")): evaluation["llm_as_a_judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks) # 2. 
LangSmith runs, token usage, and respose times diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index e2610fbb8..778365ade 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ b/playground/src/hooks/batch_module_experiment.ts @@ -406,16 +406,6 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC submission.id )?.suggestions ?? []; - if (predictedFeedbacks.length === 0) { - // Skip if there are no predicted feedbacks - setSubmissionsWithAutomaticEvaluation((prevState) => { - const newMap = new Map(prevState); - newMap.set(submission.id, {}); - return newMap; - }); - continue; - } - try { const responses = await requestEvaluation.mutateAsync({ exercise: experiment.exercise, From 9462ec56921eb2fe32a114b3c16c836b42119f17 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Tue, 7 Nov 2023 23:39:18 +0100 Subject: [PATCH 45/54] add retries --- playground/src/hooks/athena/request_evaluation.ts | 1 + playground/src/hooks/athena/request_feedback_suggestions.ts | 1 + playground/src/hooks/athena/request_submission_selection.ts | 1 + playground/src/hooks/athena/send_feedbacks.ts | 1 + playground/src/hooks/athena/send_submissions.ts | 1 + 5 files changed, 5 insertions(+) diff --git a/playground/src/hooks/athena/request_evaluation.ts b/playground/src/hooks/athena/request_evaluation.ts index 620fb362b..d1d9082de 100644 --- a/playground/src/hooks/athena/request_evaluation.ts +++ b/playground/src/hooks/athena/request_evaluation.ts @@ -74,6 +74,7 @@ export default function useRequestEvaluation( } }); }, + retry: 3, ...options, }); } diff --git a/playground/src/hooks/athena/request_feedback_suggestions.ts b/playground/src/hooks/athena/request_feedback_suggestions.ts index 5a58470dd..0040fd763 100644 --- a/playground/src/hooks/athena/request_feedback_suggestions.ts +++ b/playground/src/hooks/athena/request_feedback_suggestions.ts @@ -40,6 +40,7 @@ export default function 
useRequestFeedbackSuggestions( } return response; }, + retry: 3, ...options, }); } diff --git a/playground/src/hooks/athena/request_submission_selection.ts b/playground/src/hooks/athena/request_submission_selection.ts index 4190592b9..fad4f8ec7 100644 --- a/playground/src/hooks/athena/request_submission_selection.ts +++ b/playground/src/hooks/athena/request_submission_selection.ts @@ -26,6 +26,7 @@ export default function useRequestSubmissionSelection( const submissionIds = submissions.map((submission) => submission.id) return await athenaFetcher("/select_submission", { exercise, submission_ids: submissionIds }); }, + retry: 3, ...options, }); } diff --git a/playground/src/hooks/athena/send_feedbacks.ts b/playground/src/hooks/athena/send_feedbacks.ts index c06278838..509fc056e 100644 --- a/playground/src/hooks/athena/send_feedbacks.ts +++ b/playground/src/hooks/athena/send_feedbacks.ts @@ -26,6 +26,7 @@ export function useSendFeedbacks( mutationFn: async ({ exercise, submission, feedbacks }) => { return await athenaFetcher("/feedbacks", { exercise, submission, feedbacks }); }, + retry: 3, ...options, }); } \ No newline at end of file diff --git a/playground/src/hooks/athena/send_submissions.ts b/playground/src/hooks/athena/send_submissions.ts index 3b91414ce..12003035a 100644 --- a/playground/src/hooks/athena/send_submissions.ts +++ b/playground/src/hooks/athena/send_submissions.ts @@ -25,6 +25,7 @@ export default function useSendSubmissions( mutationFn: async ({ exercise, submissions }) => { return await athenaFetcher("/submissions", { exercise, submissions }); }, + retry: 3, ...options, }); } From 292d588dc2cf495e3003daca040a4d120aa4864f Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Wed, 8 Nov 2023 11:01:26 +0100 Subject: [PATCH 46/54] enable example module evaluation support for now --- assessment_module_manager/modules.docker.ini | 2 +- assessment_module_manager/modules.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assessment_module_manager/modules.docker.ini b/assessment_module_manager/modules.docker.ini index 340efa797..4c5f74e9b 100644 --- a/assessment_module_manager/modules.docker.ini +++ b/assessment_module_manager/modules.docker.ini @@ -1,7 +1,7 @@ [module_example] url = http://module-example:5001 type = programming -supports_evaluation = false +supports_evaluation = true [module_programming_llm] url = http://module-programming-llm:5002 diff --git a/assessment_module_manager/modules.ini b/assessment_module_manager/modules.ini index 70745eb78..73fd4c91b 100644 --- a/assessment_module_manager/modules.ini +++ b/assessment_module_manager/modules.ini @@ -1,7 +1,7 @@ [module_example] url = http://localhost:5001 type = programming -supports_evaluation = false +supports_evaluation = true [module_programming_llm] url = http://localhost:5002 From 55482931a242d697c7969e552c8481cb2574f46b Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Wed, 8 Nov 2023 23:25:21 +0100 Subject: [PATCH 47/54] fix filter --- module_text_llm/module_text_llm/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module_text_llm/module_text_llm/evaluation.py b/module_text_llm/module_text_llm/evaluation.py index 055dc2c94..fe5701937 100644 --- a/module_text_llm/module_text_llm/evaluation.py +++ b/module_text_llm/module_text_llm/evaluation.py @@ -15,7 +15,7 @@ def get_llm_statistics(submission: Submission): project_name = os.environ.get("LANGCHAIN_PROJECT") runs = list(client.list_runs( project_name=project_name, - filter=f'and(has(tags, "run-{experiment.run_id}"), has(tags, "submission-{submission.id}"))' + filter=f'has(tags, "submission-{submission.id}")' if experiment.run_id is None else f'and(has(tags, "run-{experiment.run_id}"), has(tags, "submission-{submission.id}"))' )) def get_statistics(runs: List[Run]): From 8de0ee696767c8bedf6e52d4c0ac13c812bb233f Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Thu, 9 Nov 2023 10:57:34 +0100 Subject: [PATCH 48/54] implement feedbacl --- athena/athena/endpoints.py | 4 +--- module_text_llm/module_text_llm/evaluation.py | 21 +++++++------------ 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/athena/athena/endpoints.py b/athena/athena/endpoints.py index 411edaf0a..72787eb31 100644 --- a/athena/athena/endpoints.py +++ b/athena/athena/endpoints.py @@ -407,9 +407,7 @@ async def wrapper( # Retrieve existing metadata for the exercise, submission and feedback exercise.meta.update(get_stored_exercise_meta(exercise) or {}) submission.meta.update(get_stored_submission_meta(submission) or {}) - for feedback in true_feedbacks: - feedback.meta.update(get_stored_feedback_meta(feedback) or {}) - for feedback in predicted_feedbacks: + for feedback in true_feedbacks + predicted_feedbacks: feedback.meta.update(get_stored_feedback_meta(feedback) or {}) # Call the actual provider diff --git 
a/module_text_llm/module_text_llm/evaluation.py b/module_text_llm/module_text_llm/evaluation.py index fe5701937..74d9e4db7 100644 --- a/module_text_llm/module_text_llm/evaluation.py +++ b/module_text_llm/module_text_llm/evaluation.py @@ -78,34 +78,29 @@ def get_feedback_statistics(exercise: Exercise, true_feedbacks: List[Feedback], suggestions_with_grading_instructions.append(feedback) suggested_sgi_usage[feedback.structured_grading_instruction_id] += 1 - actual_feedback_with_grading_instructions_count = len( - actual_feedback_with_grading_instructions) - suggestions_with_grading_instructions_count = len( - suggestions_with_grading_instructions) + unmatched_suggestions_with_grading_instructions = suggestions_with_grading_instructions.copy() # Match SGIs matched_feedback = 0 - unmatched_feedback = actual_feedback_count - \ - actual_feedback_with_grading_instructions_count - unmatched_suggestions = suggestions_count - \ - suggestions_with_grading_instructions_count + unmatched_feedback = actual_feedback_count - len(actual_feedback_with_grading_instructions) + unmatched_suggestions = suggestions_count - len(suggestions_with_grading_instructions) for feedback in actual_feedback_with_grading_instructions: - for index, suggestion in enumerate(suggestions_with_grading_instructions): + for index, suggestion in enumerate(unmatched_suggestions_with_grading_instructions): if feedback.structured_grading_instruction_id == suggestion.structured_grading_instruction_id: matched_feedback += 1 - del suggestions_with_grading_instructions[index] + del unmatched_suggestions_with_grading_instructions[index] break else: unmatched_feedback += 1 - unmatched_suggestions += len(suggestions_with_grading_instructions) + unmatched_suggestions += len(unmatched_suggestions_with_grading_instructions) feedback_statistics = { "actual_feedback_count": actual_feedback_count, "suggestions_count": suggestions_count, - "actual_feedback_with_grading_instructions_count": 
actual_feedback_with_grading_instructions_count, - "suggestions_with_grading_instructions_count": suggestions_with_grading_instructions_count, + "actual_feedback_with_grading_instructions_count": len(actual_feedback_with_grading_instructions), + "suggestions_with_grading_instructions_count":len(suggestions_with_grading_instructions), "actual_sgi_usage": actual_sgi_usage, "suggested_sgi_usage": suggested_sgi_usage, "matched_feedback": matched_feedback, From 0edea59c89771ea0884d7dc442cab79b4ec1ef06 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Thu, 9 Nov 2023 11:01:10 +0100 Subject: [PATCH 49/54] update retry --- playground/src/hooks/athena/request_evaluation.ts | 1 - .../src/hooks/athena/request_feedback_suggestions.ts | 1 - .../src/hooks/athena/request_submission_selection.ts | 1 - playground/src/hooks/athena/send_feedbacks.ts | 1 - playground/src/hooks/athena/send_submissions.ts | 1 - playground/src/hooks/batch_module_experiment.ts | 12 +++++++----- 6 files changed, 7 insertions(+), 10 deletions(-) diff --git a/playground/src/hooks/athena/request_evaluation.ts b/playground/src/hooks/athena/request_evaluation.ts index d1d9082de..620fb362b 100644 --- a/playground/src/hooks/athena/request_evaluation.ts +++ b/playground/src/hooks/athena/request_evaluation.ts @@ -74,7 +74,6 @@ export default function useRequestEvaluation( } }); }, - retry: 3, ...options, }); } diff --git a/playground/src/hooks/athena/request_feedback_suggestions.ts b/playground/src/hooks/athena/request_feedback_suggestions.ts index 0040fd763..5a58470dd 100644 --- a/playground/src/hooks/athena/request_feedback_suggestions.ts +++ b/playground/src/hooks/athena/request_feedback_suggestions.ts @@ -40,7 +40,6 @@ export default function useRequestFeedbackSuggestions( } return response; }, - retry: 3, ...options, }); } diff --git a/playground/src/hooks/athena/request_submission_selection.ts b/playground/src/hooks/athena/request_submission_selection.ts index fad4f8ec7..4190592b9 100644 --- 
a/playground/src/hooks/athena/request_submission_selection.ts +++ b/playground/src/hooks/athena/request_submission_selection.ts @@ -26,7 +26,6 @@ export default function useRequestSubmissionSelection( const submissionIds = submissions.map((submission) => submission.id) return await athenaFetcher("/select_submission", { exercise, submission_ids: submissionIds }); }, - retry: 3, ...options, }); } diff --git a/playground/src/hooks/athena/send_feedbacks.ts b/playground/src/hooks/athena/send_feedbacks.ts index 509fc056e..c06278838 100644 --- a/playground/src/hooks/athena/send_feedbacks.ts +++ b/playground/src/hooks/athena/send_feedbacks.ts @@ -26,7 +26,6 @@ export function useSendFeedbacks( mutationFn: async ({ exercise, submission, feedbacks }) => { return await athenaFetcher("/feedbacks", { exercise, submission, feedbacks }); }, - retry: 3, ...options, }); } \ No newline at end of file diff --git a/playground/src/hooks/athena/send_submissions.ts b/playground/src/hooks/athena/send_submissions.ts index 12003035a..3b91414ce 100644 --- a/playground/src/hooks/athena/send_submissions.ts +++ b/playground/src/hooks/athena/send_submissions.ts @@ -25,7 +25,6 @@ export default function useSendSubmissions( mutationFn: async ({ exercise, submissions }) => { return await athenaFetcher("/submissions", { exercise, submissions }); }, - retry: 3, ...options, }); } diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index 778365ade..e999631cc 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ b/playground/src/hooks/batch_module_experiment.ts @@ -195,11 +195,13 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC }) : undefined; // Module requests - const sendSubmissions = useSendSubmissions(); - const sendFeedbacks = useSendFeedbacks(); - const requestSubmissionSelection = useRequestSubmissionSelection(); - const requestFeedbackSuggestions = useRequestFeedbackSuggestions(); - const 
requestEvaluation = useRequestEvaluation(); + // By default useMutation does not retry, but we want to retry a few times to not get stuck + // If we still get stuck we can just `Export` -> `Cancel Experiment` -> `Import` again to continue for now + const sendSubmissions = useSendSubmissions({ retry: 3 }); + const sendFeedbacks = useSendFeedbacks({ retry: 3 }); + const requestSubmissionSelection = useRequestSubmissionSelection({ retry: 3 }); + const requestFeedbackSuggestions = useRequestFeedbackSuggestions({ retry: 3 }); + const requestEvaluation = useRequestEvaluation({ retry: 3 }); // 1. Send submissions to Athena const stepSendSubmissions = () => { From b4b529e6c0e72b7835db647ae7d907b03c9088a8 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Thu, 9 Nov 2023 11:26:00 +0100 Subject: [PATCH 50/54] validate grading instruction id --- .../generate_suggestions_by_file.py | 9 ++++++++- module_text_llm/module_text_llm/generate_suggestions.py | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/module_programming_llm/module_programming_llm/generate_suggestions_by_file.py b/module_programming_llm/module_programming_llm/generate_suggestions_by_file.py index 3d2238b31..60a43b6ad 100644 --- a/module_programming_llm/module_programming_llm/generate_suggestions_by_file.py +++ b/module_programming_llm/module_programming_llm/generate_suggestions_by_file.py @@ -227,12 +227,19 @@ async def generate_suggestions_by_file(exercise: Exercise, submission: Submissio ] ) + grading_instruction_ids = set( + grading_instruction.id + for criterion in exercise.grading_criteria or [] + for grading_instruction in criterion.structured_grading_instructions + ) + feedbacks: List[Feedback] = [] for prompt_input, result in zip(prompt_inputs, results): file_path = prompt_input["file_path"] if result is None: continue for feedback in result.feedbacks: + grading_instruction_id = feedback.grading_instruction_id if feedback.grading_instruction_id in grading_instruction_ids 
else None feedbacks.append(Feedback( exercise_id=exercise.id, submission_id=submission.id, @@ -242,7 +249,7 @@ async def generate_suggestions_by_file(exercise: Exercise, submission: Submissio line_start=feedback.line_start, line_end=feedback.line_end, credits=feedback.credits, - structured_grading_instruction_id=feedback.grading_instruction_id, + structured_grading_instruction_id=grading_instruction_id, meta={} )) diff --git a/module_text_llm/module_text_llm/generate_suggestions.py b/module_text_llm/module_text_llm/generate_suggestions.py index a279d98c8..d43563ad6 100644 --- a/module_text_llm/module_text_llm/generate_suggestions.py +++ b/module_text_llm/module_text_llm/generate_suggestions.py @@ -94,9 +94,16 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi if result is None: return [] + grading_instruction_ids = set( + grading_instruction.id + for criterion in exercise.grading_criteria or [] + for grading_instruction in criterion.structured_grading_instructions + ) + feedbacks = [] for feedback in result.feedbacks: index_start, index_end = get_index_range_from_line_range(feedback.line_start, feedback.line_end, submission.text) + grading_instruction_id = feedback.grading_instruction_id if feedback.grading_instruction_id in grading_instruction_ids else None feedbacks.append(Feedback( exercise_id=exercise.id, submission_id=submission.id, @@ -105,7 +112,7 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi index_start=index_start, index_end=index_end, credits=feedback.credits, - structured_grading_instruction_id=feedback.grading_instruction_id, + structured_grading_instruction_id=grading_instruction_id, meta={} )) From 331f3538e2afcf88e0a4c6dd2a6086fc63385ce5 Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Thu, 9 Nov 2023 11:28:16 +0100 Subject: [PATCH 51/54] add additional check --- module_text_llm/module_text_llm/evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module_text_llm/module_text_llm/evaluation.py b/module_text_llm/module_text_llm/evaluation.py index 74d9e4db7..f0af94e3e 100644 --- a/module_text_llm/module_text_llm/evaluation.py +++ b/module_text_llm/module_text_llm/evaluation.py @@ -68,13 +68,13 @@ def get_feedback_statistics(exercise: Exercise, true_feedbacks: List[Feedback], # Count SGIs in actual feedbacks for feedback in true_feedbacks: - if feedback.structured_grading_instruction_id: + if feedback.structured_grading_instruction_id and feedback.structured_grading_instruction_id in actual_sgi_usage: actual_feedback_with_grading_instructions.append(feedback) actual_sgi_usage[feedback.structured_grading_instruction_id] += 1 # Count SGIs in suggested feedbacks for feedback in predicted_feedbacks: - if feedback.structured_grading_instruction_id: + if feedback.structured_grading_instruction_id and feedback.structured_grading_instruction_id in suggested_sgi_usage: suggestions_with_grading_instructions.append(feedback) suggested_sgi_usage[feedback.structured_grading_instruction_id] += 1 From 4eac2951d107708be7a69e54e5fe3ef5959b033b Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Thu, 9 Nov 2023 11:39:11 +0100 Subject: [PATCH 52/54] fix index --- playground/src/hooks/batch_module_experiment.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/playground/src/hooks/batch_module_experiment.ts b/playground/src/hooks/batch_module_experiment.ts index e999631cc..8c256bb5c 100644 --- a/playground/src/hooks/batch_module_experiment.ts +++ b/playground/src/hooks/batch_module_experiment.ts @@ -396,10 +396,11 @@ export default function useBatchModuleExperiment(experiment: Experiment, moduleC (submission) => !submissionsWithAutomaticEvaluation?.has(submission.id) ); - let index = 0; + let num = 0; for (const submission of remainingSubmissions) { + num += 1; console.log( - `Evaluating... (${index + 1}/${ + `Evaluating... (${num}/${ remainingSubmissions.length })` ); From 0fe6e6d0ea0faac645bb85665bb40836aad1ed04 Mon Sep 17 00:00:00 2001 From: "Felix T.J. Dietrich" Date: Sat, 11 Nov 2023 12:29:17 +0100 Subject: [PATCH 53/54] add docs --- docs/module/structure.rst | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/docs/module/structure.rst b/docs/module/structure.rst index 31c206f55..126ab8b98 100644 --- a/docs/module/structure.rst +++ b/docs/module/structure.rst @@ -94,7 +94,7 @@ Example: ) ] -Provide Config Schema +Provide Config Schema (Optional) ~~~~~~~~~~~~~~~~~~~~~~ Get a schema for config options of the module as json schema. The config complying to the schema can then be provided in the header of a request `X-Module-Config` to override the default values. The module can decorate one pydantic model with ``@config_schema_provider`` to provide the schema and should have default values set for all fields as default configuration. The configuration class can be appended to the function signature of all other decorators to provide the configuration to the function. 
@@ -108,6 +108,37 @@ Example: debug: bool = Field(False, description="Whether the module is in debug mode.") ... +Provide Evaluation (Optional) +~~~~~~~~~~~~~~~~~~ +Get an arbitrary evaluation for a submission with historical ``true_feedback`` and feedback suggestions ``predicted_feedback``. The Playground would usually call this when conducting an evaluation during an experiment. The module will receive the request at the function annotated with ``@evaluation_provider``. + +If you want to have the ``/evaluation`` endpoint available during the Playground evaluation mode, you need to set ``supports_evaluation = true`` in the ``modules.ini`` and ``modules.docker.ini`` files. + +Example: + .. code-block:: python + + from athena import * + + @evaluation_provider + def evaluate_feedback(exercise: Exercise, submission: Submission, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]) -> Any: + # Do something with the true and predicted feedback and return the evaluation result + ... + # Example: Generate some example evaluation result + evaluation_results = [] + true_feedback_embeddings = [random.random() for _ in true_feedbacks] + predicted_feedback_embeddings = [random.random() for _ in predicted_feedbacks] + for feedback, embedding in zip(predicted_feedbacks, predicted_feedback_embeddings): + feedback_evaluation = { + "feedback_id": feedback.id, + "embedding": embedding, + "has_match": len([t for t in true_feedback_embeddings if abs(t - embedding) < 0.1]) > 0, + "correctness": random.random() + } + evaluation_results.append(feedback_evaluation) + ... + # Return arbitrary evaluation results + return evaluation_results + Environment Variables --------------------- You should provide at least the following environment variables for your module to work properly: From 2b7f2122e3c16299d3ff7e86d88bed7c8665b3fc Mon Sep 17 00:00:00 2001 From: "Felix T.J. 
Dietrich" Date: Sun, 12 Nov 2023 19:32:10 +0100 Subject: [PATCH 54/54] fix text module --- module_text_llm/poetry.lock | 513 ++++++++++++++++----------------- module_text_llm/pyproject.toml | 2 +- 2 files changed, 256 insertions(+), 259 deletions(-) diff --git a/module_text_llm/poetry.lock b/module_text_llm/poetry.lock index 8db9814e0..f5206889c 100644 --- a/module_text_llm/poetry.lock +++ b/module_text_llm/poetry.lock @@ -228,102 +228,102 @@ files = [ [[package]] name = "charset-normalizer" -version = "3.3.1" +version = "3.3.2" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." category = "main" optional = false python-versions = ">=3.7.0" files = [ - {file = "charset-normalizer-3.3.1.tar.gz", hash = "sha256:d9137a876020661972ca6eec0766d81aef8a5627df628b664b234b73396e727e"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8aee051c89e13565c6bd366813c386939f8e928af93c29fda4af86d25b73d8f8"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:352a88c3df0d1fa886562384b86f9a9e27563d4704ee0e9d56ec6fcd270ea690"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:223b4d54561c01048f657fa6ce41461d5ad8ff128b9678cfe8b2ecd951e3f8a2"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f861d94c2a450b974b86093c6c027888627b8082f1299dfd5a4bae8e2292821"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1171ef1fc5ab4693c5d151ae0fdad7f7349920eabbaca6271f95969fa0756c2d"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28f512b9a33235545fbbdac6a330a510b63be278a50071a336afc1b78781b147"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:c0e842112fe3f1a4ffcf64b06dc4c61a88441c2f02f373367f7b4c1aa9be2ad5"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f9bc2ce123637a60ebe819f9fccc614da1bcc05798bbbaf2dd4ec91f3e08846"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f194cce575e59ffe442c10a360182a986535fd90b57f7debfaa5c845c409ecc3"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9a74041ba0bfa9bc9b9bb2cd3238a6ab3b7618e759b41bd15b5f6ad958d17605"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b578cbe580e3b41ad17b1c428f382c814b32a6ce90f2d8e39e2e635d49e498d1"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:6db3cfb9b4fcecb4390db154e75b49578c87a3b9979b40cdf90d7e4b945656e1"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:debb633f3f7856f95ad957d9b9c781f8e2c6303ef21724ec94bea2ce2fcbd056"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-win32.whl", hash = "sha256:87071618d3d8ec8b186d53cb6e66955ef2a0e4fa63ccd3709c0c90ac5a43520f"}, - {file = "charset_normalizer-3.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:e372d7dfd154009142631de2d316adad3cc1c36c32a38b16a4751ba78da2a397"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae4070f741f8d809075ef697877fd350ecf0b7c5837ed68738607ee0a2c572cf"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:58e875eb7016fd014c0eea46c6fa92b87b62c0cb31b9feae25cbbe62c919f54d"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dbd95e300367aa0827496fe75a1766d198d34385a58f97683fe6e07f89ca3e3c"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:de0b4caa1c8a21394e8ce971997614a17648f94e1cd0640fbd6b4d14cab13a72"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:985c7965f62f6f32bf432e2681173db41336a9c2611693247069288bcb0c7f8b"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a15c1fe6d26e83fd2e5972425a772cca158eae58b05d4a25a4e474c221053e2d"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae55d592b02c4349525b6ed8f74c692509e5adffa842e582c0f861751701a673"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be4d9c2770044a59715eb57c1144dedea7c5d5ae80c68fb9959515037cde2008"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:851cf693fb3aaef71031237cd68699dded198657ec1e76a76eb8be58c03a5d1f"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:31bbaba7218904d2eabecf4feec0d07469284e952a27400f23b6628439439fa7"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:871d045d6ccc181fd863a3cd66ee8e395523ebfbc57f85f91f035f50cee8e3d4"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:501adc5eb6cd5f40a6f77fbd90e5ab915c8fd6e8c614af2db5561e16c600d6f3"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f5fb672c396d826ca16a022ac04c9dce74e00a1c344f6ad1a0fdc1ba1f332213"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-win32.whl", hash = "sha256:bb06098d019766ca16fc915ecaa455c1f1cd594204e7f840cd6258237b5079a8"}, - {file = "charset_normalizer-3.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:8af5a8917b8af42295e86b64903156b4f110a30dca5f3b5aedea123fbd638bff"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = 
"sha256:7ae8e5142dcc7a49168f4055255dbcced01dc1714a90a21f87448dc8d90617d1"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5b70bab78accbc672f50e878a5b73ca692f45f5b5e25c8066d748c09405e6a55"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5ceca5876032362ae73b83347be8b5dbd2d1faf3358deb38c9c88776779b2e2f"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34d95638ff3613849f473afc33f65c401a89f3b9528d0d213c7037c398a51296"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9edbe6a5bf8b56a4a84533ba2b2f489d0046e755c29616ef8830f9e7d9cf5728"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f6a02a3c7950cafaadcd46a226ad9e12fc9744652cc69f9e5534f98b47f3bbcf"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10b8dd31e10f32410751b3430996f9807fc4d1587ca69772e2aa940a82ab571a"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edc0202099ea1d82844316604e17d2b175044f9bcb6b398aab781eba957224bd"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b891a2f68e09c5ef989007fac11476ed33c5c9994449a4e2c3386529d703dc8b"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:71ef3b9be10070360f289aea4838c784f8b851be3ba58cf796262b57775c2f14"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:55602981b2dbf8184c098bc10287e8c245e351cd4fdcad050bd7199d5a8bf514"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:46fb9970aa5eeca547d7aa0de5d4b124a288b42eaefac677bde805013c95725c"}, - {file = 
"charset_normalizer-3.3.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:520b7a142d2524f999447b3a0cf95115df81c4f33003c51a6ab637cbda9d0bf4"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-win32.whl", hash = "sha256:8ec8ef42c6cd5856a7613dcd1eaf21e5573b2185263d87d27c8edcae33b62a61"}, - {file = "charset_normalizer-3.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:baec8148d6b8bd5cee1ae138ba658c71f5b03e0d69d5907703e3e1df96db5e41"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63a6f59e2d01310f754c270e4a257426fe5a591dc487f1983b3bbe793cf6bac6"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d6bfc32a68bc0933819cfdfe45f9abc3cae3877e1d90aac7259d57e6e0f85b1"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f3100d86dcd03c03f7e9c3fdb23d92e32abbca07e7c13ebd7ddfbcb06f5991f"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39b70a6f88eebe239fa775190796d55a33cfb6d36b9ffdd37843f7c4c1b5dc67"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e12f8ee80aa35e746230a2af83e81bd6b52daa92a8afaef4fea4a2ce9b9f4fa"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b6cefa579e1237ce198619b76eaa148b71894fb0d6bcf9024460f9bf30fd228"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:61f1e3fb621f5420523abb71f5771a204b33c21d31e7d9d86881b2cffe92c47c"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4f6e2a839f83a6a76854d12dbebde50e4b1afa63e27761549d006fa53e9aa80e"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:1ec937546cad86d0dce5396748bf392bb7b62a9eeb8c66efac60e947697f0e58"}, - {file 
= "charset_normalizer-3.3.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:82ca51ff0fc5b641a2d4e1cc8c5ff108699b7a56d7f3ad6f6da9dbb6f0145b48"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:633968254f8d421e70f91c6ebe71ed0ab140220469cf87a9857e21c16687c034"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-win32.whl", hash = "sha256:c0c72d34e7de5604df0fde3644cc079feee5e55464967d10b24b1de268deceb9"}, - {file = "charset_normalizer-3.3.1-cp37-cp37m-win_amd64.whl", hash = "sha256:63accd11149c0f9a99e3bc095bbdb5a464862d77a7e309ad5938fbc8721235ae"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5a3580a4fdc4ac05f9e53c57f965e3594b2f99796231380adb2baaab96e22761"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2465aa50c9299d615d757c1c888bc6fef384b7c4aec81c05a0172b4400f98557"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cb7cd68814308aade9d0c93c5bd2ade9f9441666f8ba5aa9c2d4b389cb5e2a45"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91e43805ccafa0a91831f9cd5443aa34528c0c3f2cc48c4cb3d9a7721053874b"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:854cc74367180beb327ab9d00f964f6d91da06450b0855cbbb09187bcdb02de5"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c15070ebf11b8b7fd1bfff7217e9324963c82dbdf6182ff7050519e350e7ad9f"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c4c99f98fc3a1835af8179dcc9013f93594d0670e2fa80c83aa36346ee763d2"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3fb765362688821404ad6cf86772fc54993ec11577cd5a92ac44b4c2ba52155b"}, - {file = 
"charset_normalizer-3.3.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:dced27917823df984fe0c80a5c4ad75cf58df0fbfae890bc08004cd3888922a2"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a66bcdf19c1a523e41b8e9d53d0cedbfbac2e93c649a2e9502cb26c014d0980c"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:ecd26be9f112c4f96718290c10f4caea6cc798459a3a76636b817a0ed7874e42"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:3f70fd716855cd3b855316b226a1ac8bdb3caf4f7ea96edcccc6f484217c9597"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:17a866d61259c7de1bdadef418a37755050ddb4b922df8b356503234fff7932c"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-win32.whl", hash = "sha256:548eefad783ed787b38cb6f9a574bd8664468cc76d1538215d510a3cd41406cb"}, - {file = "charset_normalizer-3.3.1-cp38-cp38-win_amd64.whl", hash = "sha256:45f053a0ece92c734d874861ffe6e3cc92150e32136dd59ab1fb070575189c97"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bc791ec3fd0c4309a753f95bb6c749ef0d8ea3aea91f07ee1cf06b7b02118f2f"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0c8c61fb505c7dad1d251c284e712d4e0372cef3b067f7ddf82a7fa82e1e9a93"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2c092be3885a1b7899cd85ce24acedc1034199d6fca1483fa2c3a35c86e43041"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c2000c54c395d9e5e44c99dc7c20a64dc371f777faf8bae4919ad3e99ce5253e"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4cb50a0335382aac15c31b61d8531bc9bb657cfd848b1d7158009472189f3d62"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:c30187840d36d0ba2893bc3271a36a517a717f9fd383a98e2697ee890a37c273"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe81b35c33772e56f4b6cf62cf4aedc1762ef7162a31e6ac7fe5e40d0149eb67"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d0bf89afcbcf4d1bb2652f6580e5e55a840fdf87384f6063c4a4f0c95e378656"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:06cf46bdff72f58645434d467bf5228080801298fbba19fe268a01b4534467f5"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:3c66df3f41abee950d6638adc7eac4730a306b022570f71dd0bd6ba53503ab57"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd805513198304026bd379d1d516afbf6c3c13f4382134a2c526b8b854da1c2e"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:9505dc359edb6a330efcd2be825fdb73ee3e628d9010597aa1aee5aa63442e97"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:31445f38053476a0c4e6d12b047b08ced81e2c7c712e5a1ad97bc913256f91b2"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-win32.whl", hash = "sha256:bd28b31730f0e982ace8663d108e01199098432a30a4c410d06fe08fdb9e93f4"}, - {file = "charset_normalizer-3.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:555fe186da0068d3354cdf4bbcbc609b0ecae4d04c921cc13e209eece7720727"}, - {file = "charset_normalizer-3.3.1-py3-none-any.whl", hash = "sha256:800561453acdecedaac137bf09cd719c7a440b6800ec182f077bb8e7025fb708"}, + {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, + {file = 
"charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, + {file = 
"charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, + {file = 
"charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, + {file = 
"charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, + {file = 
"charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, + {file = 
"charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, + {file 
= "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, + {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] [[package]] @@ -355,14 +355,14 @@ files = [ [[package]] name = "dataclasses-json" -version = "0.6.1" +version = "0.6.2" description = "Easily serialize dataclasses to and from JSON." category = "main" optional = false python-versions = ">=3.7,<4.0" files = [ - {file = "dataclasses_json-0.6.1-py3-none-any.whl", hash = "sha256:1bd8418a61fe3d588bb0079214d7fb71d44937da40742b787256fd53b26b6c80"}, - {file = "dataclasses_json-0.6.1.tar.gz", hash = "sha256:a53c220c35134ce08211a1057fd0e5bf76dc5331627c6b241cacbc570a89faae"}, + {file = "dataclasses_json-0.6.2-py3-none-any.whl", hash = "sha256:71816ced3d0f55a2c5bc1a813ace1b8d4234e79a08744269a7cf84d6f7c06e99"}, + {file = "dataclasses_json-0.6.2.tar.gz", hash = "sha256:1b934c1bd63e775880946b8361a902d7de86e894bab8098eab27c010f95724d1"}, ] [package.dependencies] @@ -973,39 +973,39 @@ files = [ [[package]] name = "mypy" -version = "1.6.1" +version = "1.7.0" description = "Optional static typing for Python" category = "main" optional = false python-versions = ">=3.8" files = [ - {file = "mypy-1.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e5012e5cc2ac628177eaac0e83d622b2dd499e28253d4107a08ecc59ede3fc2c"}, - {file = "mypy-1.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:d8fbb68711905f8912e5af474ca8b78d077447d8f3918997fecbf26943ff3cbb"}, - {file = "mypy-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21a1ad938fee7d2d96ca666c77b7c494c3c5bd88dff792220e1afbebb2925b5e"}, - {file = "mypy-1.6.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b96ae2c1279d1065413965c607712006205a9ac541895004a1e0d4f281f2ff9f"}, - {file = "mypy-1.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:40b1844d2e8b232ed92e50a4bd11c48d2daa351f9deee6c194b83bf03e418b0c"}, - {file = "mypy-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:81af8adaa5e3099469e7623436881eff6b3b06db5ef75e6f5b6d4871263547e5"}, - {file = "mypy-1.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8c223fa57cb154c7eab5156856c231c3f5eace1e0bed9b32a24696b7ba3c3245"}, - {file = "mypy-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8032e00ce71c3ceb93eeba63963b864bf635a18f6c0c12da6c13c450eedb183"}, - {file = "mypy-1.6.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4c46b51de523817a0045b150ed11b56f9fff55f12b9edd0f3ed35b15a2809de0"}, - {file = "mypy-1.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:19f905bcfd9e167159b3d63ecd8cb5e696151c3e59a1742e79bc3bcb540c42c7"}, - {file = "mypy-1.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:82e469518d3e9a321912955cc702d418773a2fd1e91c651280a1bda10622f02f"}, - {file = "mypy-1.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d4473c22cc296425bbbce7e9429588e76e05bc7342da359d6520b6427bf76660"}, - {file = "mypy-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59a0d7d24dfb26729e0a068639a6ce3500e31d6655df8557156c51c1cb874ce7"}, - {file = "mypy-1.6.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cfd13d47b29ed3bbaafaff7d8b21e90d827631afda134836962011acb5904b71"}, - {file = "mypy-1.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:eb4f18589d196a4cbe5290b435d135dee96567e07c2b2d43b5c4621b6501531a"}, - {file = 
"mypy-1.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:41697773aa0bf53ff917aa077e2cde7aa50254f28750f9b88884acea38a16169"}, - {file = "mypy-1.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7274b0c57737bd3476d2229c6389b2ec9eefeb090bbaf77777e9d6b1b5a9d143"}, - {file = "mypy-1.6.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbaf4662e498c8c2e352da5f5bca5ab29d378895fa2d980630656178bd607c46"}, - {file = "mypy-1.6.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bb8ccb4724f7d8601938571bf3f24da0da791fe2db7be3d9e79849cb64e0ae85"}, - {file = "mypy-1.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:68351911e85145f582b5aa6cd9ad666c8958bcae897a1bfda8f4940472463c45"}, - {file = "mypy-1.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:49ae115da099dcc0922a7a895c1eec82c1518109ea5c162ed50e3b3594c71208"}, - {file = "mypy-1.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8b27958f8c76bed8edaa63da0739d76e4e9ad4ed325c814f9b3851425582a3cd"}, - {file = "mypy-1.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:925cd6a3b7b55dfba252b7c4561892311c5358c6b5a601847015a1ad4eb7d332"}, - {file = "mypy-1.6.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8f57e6b6927a49550da3d122f0cb983d400f843a8a82e65b3b380d3d7259468f"}, - {file = "mypy-1.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:a43ef1c8ddfdb9575691720b6352761f3f53d85f1b57d7745701041053deff30"}, - {file = "mypy-1.6.1-py3-none-any.whl", hash = "sha256:4cbe68ef919c28ea561165206a2dcb68591c50f3bcf777932323bc208d949cf1"}, - {file = "mypy-1.6.1.tar.gz", hash = "sha256:4d01c00d09a0be62a4ca3f933e315455bde83f37f892ba4b08ce92f3cf44bcc1"}, + {file = "mypy-1.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5da84d7bf257fd8f66b4f759a904fd2c5a765f70d8b52dde62b521972a0a2357"}, + {file = "mypy-1.7.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a3637c03f4025f6405737570d6cbfa4f1400eb3c649317634d273687a09ffc2f"}, + {file = 
"mypy-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b633f188fc5ae1b6edca39dae566974d7ef4e9aaaae00bc36efe1f855e5173ac"}, + {file = "mypy-1.7.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d6ed9a3997b90c6f891138e3f83fb8f475c74db4ccaa942a1c7bf99e83a989a1"}, + {file = "mypy-1.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:1fe46e96ae319df21359c8db77e1aecac8e5949da4773c0274c0ef3d8d1268a9"}, + {file = "mypy-1.7.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:df67fbeb666ee8828f675fee724cc2cbd2e4828cc3df56703e02fe6a421b7401"}, + {file = "mypy-1.7.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a79cdc12a02eb526d808a32a934c6fe6df07b05f3573d210e41808020aed8b5d"}, + {file = "mypy-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f65f385a6f43211effe8c682e8ec3f55d79391f70a201575def73d08db68ead1"}, + {file = "mypy-1.7.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0e81ffd120ee24959b449b647c4b2fbfcf8acf3465e082b8d58fd6c4c2b27e46"}, + {file = "mypy-1.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:f29386804c3577c83d76520abf18cfcd7d68264c7e431c5907d250ab502658ee"}, + {file = "mypy-1.7.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:87c076c174e2c7ef8ab416c4e252d94c08cd4980a10967754f91571070bf5fbe"}, + {file = "mypy-1.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6cb8d5f6d0fcd9e708bb190b224089e45902cacef6f6915481806b0c77f7786d"}, + {file = "mypy-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93e76c2256aa50d9c82a88e2f569232e9862c9982095f6d54e13509f01222fc"}, + {file = "mypy-1.7.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cddee95dea7990e2215576fae95f6b78a8c12f4c089d7e4367564704e99118d3"}, + {file = "mypy-1.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:d01921dbd691c4061a3e2ecdbfbfad029410c5c2b1ee88946bf45c62c6c91210"}, + {file = "mypy-1.7.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:185cff9b9a7fec1f9f7d8352dff8a4c713b2e3eea9c6c4b5ff7f0edf46b91e41"}, + {file = "mypy-1.7.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7a7b1e399c47b18feb6f8ad4a3eef3813e28c1e871ea7d4ea5d444b2ac03c418"}, + {file = "mypy-1.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc9fe455ad58a20ec68599139ed1113b21f977b536a91b42bef3ffed5cce7391"}, + {file = "mypy-1.7.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d0fa29919d2e720c8dbaf07d5578f93d7b313c3e9954c8ec05b6d83da592e5d9"}, + {file = "mypy-1.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:2b53655a295c1ed1af9e96b462a736bf083adba7b314ae775563e3fb4e6795f5"}, + {file = "mypy-1.7.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c1b06b4b109e342f7dccc9efda965fc3970a604db70f8560ddfdee7ef19afb05"}, + {file = "mypy-1.7.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bf7a2f0a6907f231d5e41adba1a82d7d88cf1f61a70335889412dec99feeb0f8"}, + {file = "mypy-1.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:551d4a0cdcbd1d2cccdcc7cb516bb4ae888794929f5b040bb51aae1846062901"}, + {file = "mypy-1.7.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:55d28d7963bef00c330cb6461db80b0b72afe2f3c4e2963c99517cf06454e665"}, + {file = "mypy-1.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:870bd1ffc8a5862e593185a4c169804f2744112b4a7c55b93eb50f48e7a77010"}, + {file = "mypy-1.7.0-py3-none-any.whl", hash = "sha256:96650d9a4c651bc2a4991cf46f100973f656d69edc7faf91844e87fe627f7e96"}, + {file = "mypy-1.7.0.tar.gz", hash = "sha256:1e280b5697202efa698372d2f39e9a6713a0395a756b1c6bd48995f8d72690dc"}, ] [package.dependencies] @@ -1015,6 +1015,7 @@ typing-extensions = ">=4.1.0" [package.extras] dmypy = ["psutil (>=4.0)"] install-types = ["pip"] +mypyc = ["setuptools (>=50)"] reports = ["lxml"] [[package]] @@ -1161,14 +1162,14 @@ flake8-polyfill = ">=1.0.2,<2" [[package]] name = "platformdirs" -version = "3.11.0" +version = "4.0.0" description = "A small Python package for determining 
appropriate platform-specific dirs, e.g. a \"user data dir\"." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.11.0-py3-none-any.whl", hash = "sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e"}, - {file = "platformdirs-3.11.0.tar.gz", hash = "sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3"}, + {file = "platformdirs-4.0.0-py3-none-any.whl", hash = "sha256:118c954d7e949b35437270383a3f2531e99dd93cf7ce4dc8340d3356d30f173b"}, + {file = "platformdirs-4.0.0.tar.gz", hash = "sha256:cb633b2bcf10c51af60beb0ab06d2f1d69064b43abf4c185ca6b28865f3f9731"}, ] [package.extras] @@ -1709,61 +1710,61 @@ files = [ [[package]] name = "sqlalchemy" -version = "2.0.22" +version = "2.0.23" description = "Database Abstraction Library" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "SQLAlchemy-2.0.22-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f146c61ae128ab43ea3a0955de1af7e1633942c2b2b4985ac51cc292daf33222"}, - {file = "SQLAlchemy-2.0.22-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:875de9414393e778b655a3d97d60465eb3fae7c919e88b70cc10b40b9f56042d"}, - {file = "SQLAlchemy-2.0.22-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:13790cb42f917c45c9c850b39b9941539ca8ee7917dacf099cc0b569f3d40da7"}, - {file = "SQLAlchemy-2.0.22-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e04ab55cf49daf1aeb8c622c54d23fa4bec91cb051a43cc24351ba97e1dd09f5"}, - {file = "SQLAlchemy-2.0.22-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a42c9fa3abcda0dcfad053e49c4f752eef71ecd8c155221e18b99d4224621176"}, - {file = "SQLAlchemy-2.0.22-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:14cd3bcbb853379fef2cd01e7c64a5d6f1d005406d877ed9509afb7a05ff40a5"}, - {file = "SQLAlchemy-2.0.22-cp310-cp310-win32.whl", hash = "sha256:d143c5a9dada696bcfdb96ba2de4a47d5a89168e71d05a076e88a01386872f97"}, - {file = 
"SQLAlchemy-2.0.22-cp310-cp310-win_amd64.whl", hash = "sha256:ccd87c25e4c8559e1b918d46b4fa90b37f459c9b4566f1dfbce0eb8122571547"}, - {file = "SQLAlchemy-2.0.22-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4f6ff392b27a743c1ad346d215655503cec64405d3b694228b3454878bf21590"}, - {file = "SQLAlchemy-2.0.22-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f776c2c30f0e5f4db45c3ee11a5f2a8d9de68e81eb73ec4237de1e32e04ae81c"}, - {file = "SQLAlchemy-2.0.22-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8f1792d20d2f4e875ce7a113f43c3561ad12b34ff796b84002a256f37ce9437"}, - {file = "SQLAlchemy-2.0.22-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d80eeb5189d7d4b1af519fc3f148fe7521b9dfce8f4d6a0820e8f5769b005051"}, - {file = "SQLAlchemy-2.0.22-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69fd9e41cf9368afa034e1c81f3570afb96f30fcd2eb1ef29cb4d9371c6eece2"}, - {file = "SQLAlchemy-2.0.22-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:54bcceaf4eebef07dadfde424f5c26b491e4a64e61761dea9459103ecd6ccc95"}, - {file = "SQLAlchemy-2.0.22-cp311-cp311-win32.whl", hash = "sha256:7ee7ccf47aa503033b6afd57efbac6b9e05180f492aeed9fcf70752556f95624"}, - {file = "SQLAlchemy-2.0.22-cp311-cp311-win_amd64.whl", hash = "sha256:b560f075c151900587ade06706b0c51d04b3277c111151997ea0813455378ae0"}, - {file = "SQLAlchemy-2.0.22-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:2c9bac865ee06d27a1533471405ad240a6f5d83195eca481f9fc4a71d8b87df8"}, - {file = "SQLAlchemy-2.0.22-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:625b72d77ac8ac23da3b1622e2da88c4aedaee14df47c8432bf8f6495e655de2"}, - {file = "SQLAlchemy-2.0.22-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b39a6e21110204a8c08d40ff56a73ba542ec60bab701c36ce721e7990df49fb9"}, - {file = "SQLAlchemy-2.0.22-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53a766cb0b468223cafdf63e2d37f14a4757476157927b09300c8c5832d88560"}, 
- {file = "SQLAlchemy-2.0.22-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0e1ce8ebd2e040357dde01a3fb7d30d9b5736b3e54a94002641dfd0aa12ae6ce"}, - {file = "SQLAlchemy-2.0.22-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:505f503763a767556fa4deae5194b2be056b64ecca72ac65224381a0acab7ebe"}, - {file = "SQLAlchemy-2.0.22-cp312-cp312-win32.whl", hash = "sha256:154a32f3c7b00de3d090bc60ec8006a78149e221f1182e3edcf0376016be9396"}, - {file = "SQLAlchemy-2.0.22-cp312-cp312-win_amd64.whl", hash = "sha256:129415f89744b05741c6f0b04a84525f37fbabe5dc3774f7edf100e7458c48cd"}, - {file = "SQLAlchemy-2.0.22-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3940677d341f2b685a999bffe7078697b5848a40b5f6952794ffcf3af150c301"}, - {file = "SQLAlchemy-2.0.22-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55914d45a631b81a8a2cb1a54f03eea265cf1783241ac55396ec6d735be14883"}, - {file = "SQLAlchemy-2.0.22-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2096d6b018d242a2bcc9e451618166f860bb0304f590d205173d317b69986c95"}, - {file = "SQLAlchemy-2.0.22-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:19c6986cf2fb4bc8e0e846f97f4135a8e753b57d2aaaa87c50f9acbe606bd1db"}, - {file = "SQLAlchemy-2.0.22-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6ac28bd6888fe3c81fbe97584eb0b96804bd7032d6100b9701255d9441373ec1"}, - {file = "SQLAlchemy-2.0.22-cp37-cp37m-win32.whl", hash = "sha256:cb9a758ad973e795267da334a92dd82bb7555cb36a0960dcabcf724d26299db8"}, - {file = "SQLAlchemy-2.0.22-cp37-cp37m-win_amd64.whl", hash = "sha256:40b1206a0d923e73aa54f0a6bd61419a96b914f1cd19900b6c8226899d9742ad"}, - {file = "SQLAlchemy-2.0.22-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3aa1472bf44f61dd27987cd051f1c893b7d3b17238bff8c23fceaef4f1133868"}, - {file = "SQLAlchemy-2.0.22-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:56a7e2bb639df9263bf6418231bc2a92a773f57886d371ddb7a869a24919face"}, - {file = 
"SQLAlchemy-2.0.22-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ccca778c0737a773a1ad86b68bda52a71ad5950b25e120b6eb1330f0df54c3d0"}, - {file = "SQLAlchemy-2.0.22-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c6c3e9350f9fb16de5b5e5fbf17b578811a52d71bb784cc5ff71acb7de2a7f9"}, - {file = "SQLAlchemy-2.0.22-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:564e9f9e4e6466273dbfab0e0a2e5fe819eec480c57b53a2cdee8e4fdae3ad5f"}, - {file = "SQLAlchemy-2.0.22-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:af66001d7b76a3fab0d5e4c1ec9339ac45748bc4a399cbc2baa48c1980d3c1f4"}, - {file = "SQLAlchemy-2.0.22-cp38-cp38-win32.whl", hash = "sha256:9e55dff5ec115316dd7a083cdc1a52de63693695aecf72bc53a8e1468ce429e5"}, - {file = "SQLAlchemy-2.0.22-cp38-cp38-win_amd64.whl", hash = "sha256:4e869a8ff7ee7a833b74868a0887e8462445ec462432d8cbeff5e85f475186da"}, - {file = "SQLAlchemy-2.0.22-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9886a72c8e6371280cb247c5d32c9c8fa141dc560124348762db8a8b236f8692"}, - {file = "SQLAlchemy-2.0.22-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a571bc8ac092a3175a1d994794a8e7a1f2f651e7c744de24a19b4f740fe95034"}, - {file = "SQLAlchemy-2.0.22-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8db5ba8b7da759b727faebc4289a9e6a51edadc7fc32207a30f7c6203a181592"}, - {file = "SQLAlchemy-2.0.22-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b0b3f2686c3f162123adba3cb8b626ed7e9b8433ab528e36ed270b4f70d1cdb"}, - {file = "SQLAlchemy-2.0.22-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0c1fea8c0abcb070ffe15311853abfda4e55bf7dc1d4889497b3403629f3bf00"}, - {file = "SQLAlchemy-2.0.22-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4bb062784f37b2d75fd9b074c8ec360ad5df71f933f927e9e95c50eb8e05323c"}, - {file = "SQLAlchemy-2.0.22-cp39-cp39-win32.whl", hash = "sha256:58a3aba1bfb32ae7af68da3f277ed91d9f57620cf7ce651db96636790a78b736"}, - {file = 
"SQLAlchemy-2.0.22-cp39-cp39-win_amd64.whl", hash = "sha256:92e512a6af769e4725fa5b25981ba790335d42c5977e94ded07db7d641490a85"}, - {file = "SQLAlchemy-2.0.22-py3-none-any.whl", hash = "sha256:3076740335e4aaadd7deb3fe6dcb96b3015f1613bd190a4e1634e1b99b02ec86"}, - {file = "SQLAlchemy-2.0.22.tar.gz", hash = "sha256:5434cc601aa17570d79e5377f5fd45ff92f9379e2abed0be5e8c2fba8d353d2b"}, + {file = "SQLAlchemy-2.0.23-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:638c2c0b6b4661a4fd264f6fb804eccd392745c5887f9317feb64bb7cb03b3ea"}, + {file = "SQLAlchemy-2.0.23-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e3b5036aa326dc2df50cba3c958e29b291a80f604b1afa4c8ce73e78e1c9f01d"}, + {file = "SQLAlchemy-2.0.23-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:787af80107fb691934a01889ca8f82a44adedbf5ef3d6ad7d0f0b9ac557e0c34"}, + {file = "SQLAlchemy-2.0.23-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c14eba45983d2f48f7546bb32b47937ee2cafae353646295f0e99f35b14286ab"}, + {file = "SQLAlchemy-2.0.23-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0666031df46b9badba9bed00092a1ffa3aa063a5e68fa244acd9f08070e936d3"}, + {file = "SQLAlchemy-2.0.23-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:89a01238fcb9a8af118eaad3ffcc5dedaacbd429dc6fdc43fe430d3a941ff965"}, + {file = "SQLAlchemy-2.0.23-cp310-cp310-win32.whl", hash = "sha256:cabafc7837b6cec61c0e1e5c6d14ef250b675fa9c3060ed8a7e38653bd732ff8"}, + {file = "SQLAlchemy-2.0.23-cp310-cp310-win_amd64.whl", hash = "sha256:87a3d6b53c39cd173990de2f5f4b83431d534a74f0e2f88bd16eabb5667e65c6"}, + {file = "SQLAlchemy-2.0.23-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d5578e6863eeb998980c212a39106ea139bdc0b3f73291b96e27c929c90cd8e1"}, + {file = "SQLAlchemy-2.0.23-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:62d9e964870ea5ade4bc870ac4004c456efe75fb50404c03c5fd61f8bc669a72"}, + {file = "SQLAlchemy-2.0.23-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c80c38bd2ea35b97cbf7c21aeb129dcbebbf344ee01a7141016ab7b851464f8e"}, + {file = "SQLAlchemy-2.0.23-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75eefe09e98043cff2fb8af9796e20747ae870c903dc61d41b0c2e55128f958d"}, + {file = "SQLAlchemy-2.0.23-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd45a5b6c68357578263d74daab6ff9439517f87da63442d244f9f23df56138d"}, + {file = "SQLAlchemy-2.0.23-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a86cb7063e2c9fb8e774f77fbf8475516d270a3e989da55fa05d08089d77f8c4"}, + {file = "SQLAlchemy-2.0.23-cp311-cp311-win32.whl", hash = "sha256:b41f5d65b54cdf4934ecede2f41b9c60c9f785620416e8e6c48349ab18643855"}, + {file = "SQLAlchemy-2.0.23-cp311-cp311-win_amd64.whl", hash = "sha256:9ca922f305d67605668e93991aaf2c12239c78207bca3b891cd51a4515c72e22"}, + {file = "SQLAlchemy-2.0.23-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d0f7fb0c7527c41fa6fcae2be537ac137f636a41b4c5a4c58914541e2f436b45"}, + {file = "SQLAlchemy-2.0.23-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7c424983ab447dab126c39d3ce3be5bee95700783204a72549c3dceffe0fc8f4"}, + {file = "SQLAlchemy-2.0.23-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f508ba8f89e0a5ecdfd3761f82dda2a3d7b678a626967608f4273e0dba8f07ac"}, + {file = "SQLAlchemy-2.0.23-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6463aa765cf02b9247e38b35853923edbf2f6fd1963df88706bc1d02410a5577"}, + {file = "SQLAlchemy-2.0.23-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e599a51acf3cc4d31d1a0cf248d8f8d863b6386d2b6782c5074427ebb7803bda"}, + {file = "SQLAlchemy-2.0.23-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fd54601ef9cc455a0c61e5245f690c8a3ad67ddb03d3b91c361d076def0b4c60"}, + {file = "SQLAlchemy-2.0.23-cp312-cp312-win32.whl", hash = "sha256:42d0b0290a8fb0165ea2c2781ae66e95cca6e27a2fbe1016ff8db3112ac1e846"}, + {file = "SQLAlchemy-2.0.23-cp312-cp312-win_amd64.whl", hash = 
"sha256:227135ef1e48165f37590b8bfc44ed7ff4c074bf04dc8d6f8e7f1c14a94aa6ca"}, + {file = "SQLAlchemy-2.0.23-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:14aebfe28b99f24f8a4c1346c48bc3d63705b1f919a24c27471136d2f219f02d"}, + {file = "SQLAlchemy-2.0.23-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e983fa42164577d073778d06d2cc5d020322425a509a08119bdcee70ad856bf"}, + {file = "SQLAlchemy-2.0.23-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e0dc9031baa46ad0dd5a269cb7a92a73284d1309228be1d5935dac8fb3cae24"}, + {file = "SQLAlchemy-2.0.23-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5f94aeb99f43729960638e7468d4688f6efccb837a858b34574e01143cf11f89"}, + {file = "SQLAlchemy-2.0.23-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:63bfc3acc970776036f6d1d0e65faa7473be9f3135d37a463c5eba5efcdb24c8"}, + {file = "SQLAlchemy-2.0.23-cp37-cp37m-win32.whl", hash = "sha256:f48ed89dd11c3c586f45e9eec1e437b355b3b6f6884ea4a4c3111a3358fd0c18"}, + {file = "SQLAlchemy-2.0.23-cp37-cp37m-win_amd64.whl", hash = "sha256:1e018aba8363adb0599e745af245306cb8c46b9ad0a6fc0a86745b6ff7d940fc"}, + {file = "SQLAlchemy-2.0.23-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:64ac935a90bc479fee77f9463f298943b0e60005fe5de2aa654d9cdef46c54df"}, + {file = "SQLAlchemy-2.0.23-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c4722f3bc3c1c2fcc3702dbe0016ba31148dd6efcd2a2fd33c1b4897c6a19693"}, + {file = "SQLAlchemy-2.0.23-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4af79c06825e2836de21439cb2a6ce22b2ca129bad74f359bddd173f39582bf5"}, + {file = "SQLAlchemy-2.0.23-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:683ef58ca8eea4747737a1c35c11372ffeb84578d3aab8f3e10b1d13d66f2bc4"}, + {file = "SQLAlchemy-2.0.23-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:d4041ad05b35f1f4da481f6b811b4af2f29e83af253bf37c3c4582b2c68934ab"}, + {file = "SQLAlchemy-2.0.23-cp38-cp38-musllinux_1_1_x86_64.whl", 
hash = "sha256:aeb397de65a0a62f14c257f36a726945a7f7bb60253462e8602d9b97b5cbe204"}, + {file = "SQLAlchemy-2.0.23-cp38-cp38-win32.whl", hash = "sha256:42ede90148b73fe4ab4a089f3126b2cfae8cfefc955c8174d697bb46210c8306"}, + {file = "SQLAlchemy-2.0.23-cp38-cp38-win_amd64.whl", hash = "sha256:964971b52daab357d2c0875825e36584d58f536e920f2968df8d581054eada4b"}, + {file = "SQLAlchemy-2.0.23-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:616fe7bcff0a05098f64b4478b78ec2dfa03225c23734d83d6c169eb41a93e55"}, + {file = "SQLAlchemy-2.0.23-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0e680527245895aba86afbd5bef6c316831c02aa988d1aad83c47ffe92655e74"}, + {file = "SQLAlchemy-2.0.23-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9585b646ffb048c0250acc7dad92536591ffe35dba624bb8fd9b471e25212a35"}, + {file = "SQLAlchemy-2.0.23-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4895a63e2c271ffc7a81ea424b94060f7b3b03b4ea0cd58ab5bb676ed02f4221"}, + {file = "SQLAlchemy-2.0.23-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc1d21576f958c42d9aec68eba5c1a7d715e5fc07825a629015fe8e3b0657fb0"}, + {file = "SQLAlchemy-2.0.23-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:967c0b71156f793e6662dd839da54f884631755275ed71f1539c95bbada9aaab"}, + {file = "SQLAlchemy-2.0.23-cp39-cp39-win32.whl", hash = "sha256:0a8c6aa506893e25a04233bc721c6b6cf844bafd7250535abb56cb6cc1368884"}, + {file = "SQLAlchemy-2.0.23-cp39-cp39-win_amd64.whl", hash = "sha256:f3420d00d2cb42432c1d0e44540ae83185ccbbc67a6054dcc8ab5387add6620b"}, + {file = "SQLAlchemy-2.0.23-py3-none-any.whl", hash = "sha256:31952bbc527d633b9479f5f81e8b9dfada00b91d6baba021a869095f1a97006d"}, + {file = "SQLAlchemy-2.0.23.tar.gz", hash = "sha256:c1bda93cbbe4aa2aa0aa8655c5aeda505cd219ff3e8da91d1d329e143e4aff69"}, ] [package.dependencies] @@ -1773,6 +1774,7 @@ typing-extensions = ">=4.2.0" [package.extras] aiomysql = ["aiomysql (>=0.2.0)", "greenlet (!=0.4.17)"] +aioodbc = ["aioodbc", "greenlet 
(!=0.4.17)"] aiosqlite = ["aiosqlite", "greenlet (!=0.4.17)", "typing-extensions (!=3.10.0.1)"] asyncio = ["greenlet (!=0.4.17)"] asyncmy = ["asyncmy (>=0.2.3,!=0.2.4,!=0.2.6)", "greenlet (!=0.4.17)"] @@ -1783,7 +1785,7 @@ mssql-pyodbc = ["pyodbc"] mypy = ["mypy (>=0.910)"] mysql = ["mysqlclient (>=1.4.0)"] mysql-connector = ["mysql-connector-python"] -oracle = ["cx-oracle (>=7)"] +oracle = ["cx-oracle (>=8)"] oracle-oracledb = ["oracledb (>=1.0.1)"] postgresql = ["psycopg2 (>=2.7)"] postgresql-asyncpg = ["asyncpg", "greenlet (!=0.4.17)"] @@ -1888,14 +1890,14 @@ files = [ [[package]] name = "tomlkit" -version = "0.12.1" +version = "0.12.2" description = "Style preserving TOML library" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "tomlkit-0.12.1-py3-none-any.whl", hash = "sha256:712cbd236609acc6a3e2e97253dfc52d4c2082982a88f61b640ecf0817eab899"}, - {file = "tomlkit-0.12.1.tar.gz", hash = "sha256:38e1ff8edb991273ec9f6181244a6a391ac30e9f5098e7535640ea6be97a7c86"}, + {file = "tomlkit-0.12.2-py3-none-any.whl", hash = "sha256:eeea7ac7563faeab0a1ed8fe12c2e5a51c61f933f2502f7e9db0241a65163ad0"}, + {file = "tomlkit-0.12.2.tar.gz", hash = "sha256:df32fab589a81f0d7dc525a4267b6d7a64ee99619cbd1eeb0fae32c1dd426977"}, ] [[package]] @@ -1986,87 +1988,82 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", [[package]] name = "wrapt" -version = "1.15.0" +version = "1.16.0" description = "Module for decorators, wrappers and monkey patching." 
category = "dev" optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" -files = [ - {file = "wrapt-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1"}, - {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29"}, - {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2"}, - {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:96e25c8603a155559231c19c0349245eeb4ac0096fe3c1d0be5c47e075bd4f46"}, - {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:40737a081d7497efea35ab9304b829b857f21558acfc7b3272f908d33b0d9d4c"}, - {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:f87ec75864c37c4c6cb908d282e1969e79763e0d9becdfe9fe5473b7bb1e5f09"}, - {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:1286eb30261894e4c70d124d44b7fd07825340869945c79d05bda53a40caa079"}, - {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:493d389a2b63c88ad56cdc35d0fa5752daac56ca755805b1b0c530f785767d5e"}, - {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:58d7a75d731e8c63614222bcb21dd992b4ab01a399f1f09dd82af17bbfc2368a"}, - {file = "wrapt-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:21f6d9a0d5b3a207cdf7acf8e58d7d13d463e639f0c7e01d82cdb671e6cb7923"}, - {file = "wrapt-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce42618f67741d4697684e501ef02f29e758a123aa2d669e2d964ff734ee00ee"}, - {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41d07d029dd4157ae27beab04d22b8e261eddfc6ecd64ff7000b10dc8b3a5727"}, - {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:54accd4b8bc202966bafafd16e69da9d5640ff92389d33d28555c5fd4f25ccb7"}, - {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fbfbca668dd15b744418265a9607baa970c347eefd0db6a518aaf0cfbd153c0"}, - {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:76e9c727a874b4856d11a32fb0b389afc61ce8aaf281ada613713ddeadd1cfec"}, - {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e20076a211cd6f9b44a6be58f7eeafa7ab5720eb796975d0c03f05b47d89eb90"}, - {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a74d56552ddbde46c246b5b89199cb3fd182f9c346c784e1a93e4dc3f5ec9975"}, - {file = "wrapt-1.15.0-cp310-cp310-win32.whl", hash = "sha256:26458da5653aa5b3d8dc8b24192f574a58984c749401f98fff994d41d3f08da1"}, - {file = "wrapt-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:75760a47c06b5974aa5e01949bf7e66d2af4d08cb8c1d6516af5e39595397f5e"}, - {file = "wrapt-1.15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ba1711cda2d30634a7e452fc79eabcadaffedf241ff206db2ee93dd2c89a60e7"}, - {file = "wrapt-1.15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:56374914b132c702aa9aa9959c550004b8847148f95e1b824772d453ac204a72"}, - {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a89ce3fd220ff144bd9d54da333ec0de0399b52c9ac3d2ce34b569cf1a5748fb"}, - {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bbe623731d03b186b3d6b0d6f51865bf598587c38d6f7b0be2e27414f7f214e"}, - {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3abbe948c3cbde2689370a262a8d04e32ec2dd4f27103669a45c6929bcdbfe7c"}, - {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b67b819628e3b748fd3c2192c15fb951f549d0f47c0449af0764d7647302fda3"}, - 
{file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7eebcdbe3677e58dd4c0e03b4f2cfa346ed4049687d839adad68cc38bb559c92"}, - {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:74934ebd71950e3db69960a7da29204f89624dde411afbfb3b4858c1409b1e98"}, - {file = "wrapt-1.15.0-cp311-cp311-win32.whl", hash = "sha256:bd84395aab8e4d36263cd1b9308cd504f6cf713b7d6d3ce25ea55670baec5416"}, - {file = "wrapt-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:a487f72a25904e2b4bbc0817ce7a8de94363bd7e79890510174da9d901c38705"}, - {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:4ff0d20f2e670800d3ed2b220d40984162089a6e2c9646fdb09b85e6f9a8fc29"}, - {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9ed6aa0726b9b60911f4aed8ec5b8dd7bf3491476015819f56473ffaef8959bd"}, - {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:896689fddba4f23ef7c718279e42f8834041a21342d95e56922e1c10c0cc7afb"}, - {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:75669d77bb2c071333417617a235324a1618dba66f82a750362eccbe5b61d248"}, - {file = "wrapt-1.15.0-cp35-cp35m-win32.whl", hash = "sha256:fbec11614dba0424ca72f4e8ba3c420dba07b4a7c206c8c8e4e73f2e98f4c559"}, - {file = "wrapt-1.15.0-cp35-cp35m-win_amd64.whl", hash = "sha256:fd69666217b62fa5d7c6aa88e507493a34dec4fa20c5bd925e4bc12fce586639"}, - {file = "wrapt-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b0724f05c396b0a4c36a3226c31648385deb6a65d8992644c12a4963c70326ba"}, - {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbeccb1aa40ab88cd29e6c7d8585582c99548f55f9b2581dfc5ba68c59a85752"}, - {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38adf7198f8f154502883242f9fe7333ab05a5b02de7d83aa2d88ea621f13364"}, - {file = 
"wrapt-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:578383d740457fa790fdf85e6d346fda1416a40549fe8db08e5e9bd281c6a475"}, - {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:a4cbb9ff5795cd66f0066bdf5947f170f5d63a9274f99bdbca02fd973adcf2a8"}, - {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:af5bd9ccb188f6a5fdda9f1f09d9f4c86cc8a539bd48a0bfdc97723970348418"}, - {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:b56d5519e470d3f2fe4aa7585f0632b060d532d0696c5bdfb5e8319e1d0f69a2"}, - {file = "wrapt-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:77d4c1b881076c3ba173484dfa53d3582c1c8ff1f914c6461ab70c8428b796c1"}, - {file = "wrapt-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:077ff0d1f9d9e4ce6476c1a924a3332452c1406e59d90a2cf24aeb29eeac9420"}, - {file = "wrapt-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5c5aa28df055697d7c37d2099a7bc09f559d5053c3349b1ad0c39000e611d317"}, - {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a8564f283394634a7a7054b7983e47dbf39c07712d7b177b37e03f2467a024e"}, - {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780c82a41dc493b62fc5884fb1d3a3b81106642c5c5c78d6a0d4cbe96d62ba7e"}, - {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e169e957c33576f47e21864cf3fc9ff47c223a4ebca8960079b8bd36cb014fd0"}, - {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b02f21c1e2074943312d03d243ac4388319f2456576b2c6023041c4d57cd7019"}, - {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f2e69b3ed24544b0d3dbe2c5c0ba5153ce50dcebb576fdc4696d52aa22db6034"}, - {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = 
"sha256:d787272ed958a05b2c86311d3a4135d3c2aeea4fc655705f074130aa57d71653"}, - {file = "wrapt-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:02fce1852f755f44f95af51f69d22e45080102e9d00258053b79367d07af39c0"}, - {file = "wrapt-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:abd52a09d03adf9c763d706df707c343293d5d106aea53483e0ec8d9e310ad5e"}, - {file = "wrapt-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdb4f085756c96a3af04e6eca7f08b1345e94b53af8921b25c72f096e704e145"}, - {file = "wrapt-1.15.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:230ae493696a371f1dbffaad3dafbb742a4d27a0afd2b1aecebe52b740167e7f"}, - {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63424c681923b9f3bfbc5e3205aafe790904053d42ddcc08542181a30a7a51bd"}, - {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6bcbfc99f55655c3d93feb7ef3800bd5bbe963a755687cbf1f490a71fb7794b"}, - {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99f4309f5145b93eca6e35ac1a988f0dc0a7ccf9ccdcd78d3c0adf57224e62f"}, - {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b130fe77361d6771ecf5a219d8e0817d61b236b7d8b37cc045172e574ed219e6"}, - {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:96177eb5645b1c6985f5c11d03fc2dbda9ad24ec0f3a46dcce91445747e15094"}, - {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5fe3e099cf07d0fb5a1e23d399e5d4d1ca3e6dfcbe5c8570ccff3e9208274f7"}, - {file = "wrapt-1.15.0-cp38-cp38-win32.whl", hash = "sha256:abd8f36c99512755b8456047b7be10372fca271bf1467a1caa88db991e7c421b"}, - {file = "wrapt-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:b06fa97478a5f478fb05e1980980a7cdf2712015493b44d0c87606c1513ed5b1"}, - {file = "wrapt-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:2e51de54d4fb8fb50d6ee8327f9828306a959ae394d3e01a1ba8b2f937747d86"}, - {file = "wrapt-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0970ddb69bba00670e58955f8019bec4a42d1785db3faa043c33d81de2bf843c"}, - {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76407ab327158c510f44ded207e2f76b657303e17cb7a572ffe2f5a8a48aa04d"}, - {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd525e0e52a5ff16653a3fc9e3dd827981917d34996600bbc34c05d048ca35cc"}, - {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d37ac69edc5614b90516807de32d08cb8e7b12260a285ee330955604ed9dd29"}, - {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:078e2a1a86544e644a68422f881c48b84fef6d18f8c7a957ffd3f2e0a74a0d4a"}, - {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2cf56d0e237280baed46f0b5316661da892565ff58309d4d2ed7dba763d984b8"}, - {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7dc0713bf81287a00516ef43137273b23ee414fe41a3c14be10dd95ed98a2df9"}, - {file = "wrapt-1.15.0-cp39-cp39-win32.whl", hash = "sha256:46ed616d5fb42f98630ed70c3529541408166c22cdfd4540b88d5f21006b0eff"}, - {file = "wrapt-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:eef4d64c650f33347c1f9266fa5ae001440b232ad9b98f1f43dfe7a79435c0a6"}, - {file = "wrapt-1.15.0-py3-none-any.whl", hash = "sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640"}, - {file = "wrapt-1.15.0.tar.gz", hash = "sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a"}, +python-versions = ">=3.6" +files = [ + {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"}, + {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"}, + {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"}, + {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"}, + {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"}, + {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"}, + 
{file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, + {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, + {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, + {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"}, + {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"}, + {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"}, + {file = 
"wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"}, + {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = "sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"}, + {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, + {file = 
"wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, + {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, + {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, + {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, + {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, ] [[package]] @@ -2160,4 +2157,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "3.11.*" -content-hash = "5e0f58a4637859d0c3589028a35641afbd9989e458b3dab5a9cefc9136705e77" +content-hash = "53eeaab1a23bfa1b616447c8734e6fa34ed64242daaea0f94de00414e45460c8" diff --git a/module_text_llm/pyproject.toml b/module_text_llm/pyproject.toml index d2bde08fb..b5b313aa4 100644 --- a/module_text_llm/pyproject.toml +++ b/module_text_llm/pyproject.toml @@ -15,7 +15,7 @@ nltk = "^3.8.1" gitpython = "^3.1.37" replicate = "^0.11.0" tiktoken = "^0.4.0" -langsmith = "^0.0.60" +langsmith = "^0.0.63" [tool.poetry.scripts] module = "athena:run_module"