Implement exercise export for expert evaluation
Allows exercises to be exported as JSON with either normal feedback or categorized feedback.
DominikRemo committed Dec 17, 2024
1 parent f6f0527 commit f980d52
Showing 5 changed files with 49 additions and 24 deletions.
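
The change supports two export modes: passing a `feedback_type_filter` such as `"Tutor"` to `group_exercise_data` produces a flat list of feedbacks per submission, while omitting the filter groups feedbacks into a dict keyed by feedback type. A sketch of both calls as the updated scripts use them, assuming `sampled_submissions` is the DataFrame those scripts load:

```python
import pandas as pd

from evaluation.service.json_service import group_exercise_data, exercises_to_json

sampled_submissions = pd.read_csv("../data/2_sampled_submissions.csv")

# Filtered export: each Submission.feedbacks is a flat List[Feedback]
# containing only tutor feedback.
tutor_exercises = group_exercise_data(sampled_submissions, "Tutor")
exercises_to_json(tutor_exercises, "../data/2_exercise_jsons")

# Categorized export: no filter, so each Submission.feedbacks becomes a
# Dict[str, List[Feedback]] keyed by the "feedback_type" column.
categorized_exercises = group_exercise_data(sampled_submissions)
exercises_to_json(categorized_exercises, "../data/3_submissions_with_categorized_feedback_jsons")
```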
7 changes: 4 additions & 3 deletions evaluation/model/model.py
@@ -1,5 +1,6 @@
from dataclasses import dataclass, asdict
-from typing import Optional, List
+from typing import Optional, List, Union, Dict


@dataclass
class Feedback:
@@ -32,9 +33,9 @@ class Submission:
    text: str
    language: str
    meta: dict
-    feedbacks: Optional[List[Feedback]]
+    feedbacks: Optional[Union[List[Feedback], Dict[str, List[Feedback]]]] = None

-    def __init__(self, id: int, text: str, language: str, feedbacks: List[Feedback]) -> None:
+    def __init__(self, id: int, text: str, language: str, feedbacks: Optional[Union[List[Feedback], Dict[str, List[Feedback]]]]) -> None:
        self.id = id
        self.text = text
        self.language = language
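Because `Submission.feedbacks` is now a union of two shapes, consumers have to branch on which one they received. A hypothetical helper (not part of this commit) illustrating the two cases:

```python
from evaluation.model.model import Submission

def count_feedbacks(submission: Submission) -> int:
    # Hypothetical helper: handle both shapes the widened field allows.
    if submission.feedbacks is None:
        return 0
    if isinstance(submission.feedbacks, dict):
        # Categorized: feedback_type -> list of feedbacks.
        return sum(len(fbs) for fbs in submission.feedbacks.values())
    # Flat list of feedbacks.
    return len(submission.feedbacks)
```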
16 changes: 1 addition & 15 deletions evaluation/scripts/2_sampling_submissions.py
@@ -31,19 +31,5 @@
# %%
from evaluation.service.json_service import group_exercise_data, exercises_to_json

-exercises = group_exercise_data(sampled_submissions)
+exercises = group_exercise_data(sampled_submissions, "Tutor")
exercises_to_json(exercises, "../data/2_exercise_jsons")

-# %% [markdown]
-# ## Upload the json exercises from data/2_exercise_jsons to the playground. In evaluation mode, generate feedback for each exercise and export the results. Make sure that the configuration names for feedback generation do not contain underscores '_'.
-#
-# The downloaded json files should have the following naming scheme:
-#
-# ```text_results_<Configuration name (e.g.: LLM)>_<...>```
-#
-# **Do not change the names of the downloaded files!**
-#
-# Save these files in the data/3_feedback_suggestions directory
-
-# %% [markdown]
-# # Continue in the next notebook: [3_feedback_generation.ipynb](3_feedback_generation.ipynb)
17 changes: 17 additions & 0 deletions evaluation/scripts/3_feedback_generation.py
@@ -18,6 +18,17 @@

sampled_submissions = pd.read_csv("../data/2_sampled_submissions.csv")

+# %% [markdown]
+# ### Upload the json files from data/2_exercise_jsons to the playground. In evaluation mode, generate feedback for each exercise and export the results. Make sure that the configuration names for feedback generation do not contain underscores '_'.
+#
+# The downloaded json files should have the following naming scheme:
+#
+# ```text_results_<Configuration name (e.g.: LLM)>_<...>```
+#
+# **Do not change the names of the downloaded files!**
+#
+# Save these files in the data/3_feedback_suggestions directory
+
# %%
from evaluation.service.json_service import read_result_files_to_dataframe, add_feedback_suggestions_to_data, fill_missing_feedback_with_tutor_feedback

@@ -28,6 +39,12 @@

sampled_submissions_with_feedback.to_csv("../data/3_sampled_submissions_with_feedback.csv", index=False)

+# %%
+from evaluation.service.json_service import group_exercise_data, exercises_to_json
+
+exercises = group_exercise_data(sampled_submissions)
+exercises_to_json(exercises, "../data/3_submissions_with_categorized_feedback_jsons")
+
# %%
# Group by 'exercise_id' and 'result_score' to calculate submission counts
grouped_data = (
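The repeated warning that configuration names must not contain underscores follows from the naming scheme above: if the loader recovers the configuration name by splitting the filename on `_`, an underscore inside the name shifts the fields. A hypothetical illustration of the failure mode (the actual parsing in `read_result_files_to_dataframe` may differ):

```python
def config_name_from_filename(filename: str) -> str:
    # Hypothetical parser for "text_results_<Configuration name>_<...>".
    return filename.split("_")[2]

print(config_name_from_filename("text_results_LLM_1.json"))     # "LLM" -- as intended
print(config_name_from_filename("text_results_my_llm_1.json"))  # "my"  -- truncated at the underscore
```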
13 changes: 13 additions & 0 deletions evaluation/scripts/4_expert_evaluation.py
@@ -14,3 +14,16 @@
# ---

# %%
+import pandas as pd
+
+sampled_submissions = pd.read_csv("../data/3_sampled_submissions_with_feedback.csv")
+
+# %% [markdown]
+# ### Import the json files from data/3_submissions_with_categorized_feedback_jsons into a new expert evaluation in the playground. Conduct the evaluation and download the results as well as the evaluation configuration.
+#
+# **Do not change the names of the downloaded files!**
+#
+# Save these files in the data/4_expert_evaluation directory
+
+# %%
+# TODO: Implement the expert evaluation import
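
The import itself is left as a TODO here; one possible shape for it, assuming the playground downloads land as JSON files in `data/4_expert_evaluation` (all names below are hypothetical, not part of this commit):

```python
import json
from pathlib import Path

# Hypothetical loader: collect every downloaded evaluation export.
evaluation_results = []
for path in Path("../data/4_expert_evaluation").glob("*.json"):
    with path.open() as file:
        evaluation_results.append(json.load(file))
```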
20 changes: 14 additions & 6 deletions evaluation/service/json_service.py
@@ -1,6 +1,6 @@
import json
import os
-from typing import List
+from typing import List, Dict, Union
import pandas as pd

from evaluation.model.model import Exercise, Feedback, Submission, GradingCriterion, StructuredGradingInstruction
@@ -26,7 +26,7 @@ def validate_columns(df: pd.DataFrame, required_columns: List[str]) -> None:
)


-def group_exercise_data(df: pd.DataFrame, feedback_type_filter: str = "Tutor") -> List[Exercise]:
+def group_exercise_data(df: pd.DataFrame, feedback_type_filter: str = None) -> List[Exercise]:
    """
    Groups exercises, submissions, grading instructions, and feedback of specified type into a structured format.
@@ -38,8 +38,10 @@ def group_exercise_data(df: pd.DataFrame, feedback_type_filter: str = "Tutor") -
        List[Exercise]: A list of Exercise objects.
    """

-    def process_feedbacks(submission_group: pd.DataFrame, exercise_id: int, submission_id: int) -> List[Feedback]:
+    def process_feedbacks(submission_group: pd.DataFrame, exercise_id: int, submission_id: int) -> Union[
+            List[Feedback], Dict[str, List[Feedback]]]:
        """Process feedbacks for a submission."""
+        categorized_feedbacks = {}
        feedbacks = []
        for feedback_id, feedback_group in submission_group.groupby("feedback_id"):
            feedback_details = feedback_group.iloc[0]
@@ -54,8 +56,13 @@ def process_feedbacks(submission_group: pd.DataFrame, exercise_id: int, submissi
                exercise_id=exercise_id,
                submission_id=submission_id
            )
-            feedbacks.append(feedback)
-        return feedbacks
+
+            if feedback_type_filter:  # Single feedback type
+                feedbacks.append(feedback)
+            else:  # Categorized feedback
+                categorized_feedbacks.setdefault(feedback_details["feedback_type"], []).append(feedback)
+
+        return feedbacks if feedback_type_filter else categorized_feedbacks

    def process_submissions(exercise_group: pd.DataFrame, exercise_id: int) -> List[Submission]:
        """Process submissions for an exercise."""
@@ -115,7 +122,8 @@ def process_grading_criteria(exercise_group: pd.DataFrame) -> List[GradingCriter
    ]
    validate_columns(df, required_columns)

-    df = df[df["feedback_type"] == feedback_type_filter]
+    if feedback_type_filter:
+        df = df[df["feedback_type"] == feedback_type_filter]

    exercises = []
    for exercise_id, exercise_group in df.groupby("exercise_id"):
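
The categorized branch relies on `dict.setdefault` to create the per-type list on first use and append to it afterwards. A standalone sketch of that grouping pattern, with plain strings standing in for `Feedback` objects:

```python
rows = [("Tutor", "feedback 1"), ("LLM", "feedback 2"), ("Tutor", "feedback 3")]

categorized = {}
for feedback_type, feedback in rows:
    # setdefault inserts an empty list the first time a key is seen,
    # then returns the stored list so the feedback can be appended.
    categorized.setdefault(feedback_type, []).append(feedback)

assert categorized == {"Tutor": ["feedback 1", "feedback 3"], "LLM": ["feedback 2"]}
```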
