diff --git a/evaluation/model/model.py b/evaluation/model/model.py
index 91e38b65..4fe9f214 100644
--- a/evaluation/model/model.py
+++ b/evaluation/model/model.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass, asdict
-from typing import Optional, List
+from typing import Optional, List, Union, Dict
+
 
 @dataclass
 class Feedback:
@@ -32,9 +33,9 @@ class Submission:
     text: str
     language: str
     meta: dict
-    feedbacks: Optional[List[Feedback]]
+    feedbacks: Optional[Union[List[Feedback], Dict[str, List[Feedback]]]] = None
 
-    def __init__(self, id: int, text: str, language: str, feedbacks: List[Feedback]) -> None:
+    def __init__(self, id: int, text: str, language: str, feedbacks: Optional[Union[List[Feedback], Dict[str, List[Feedback]]]]) -> None:
         self.id = id
         self.text = text
         self.language = language
diff --git a/evaluation/scripts/2_sampling_submissions.py b/evaluation/scripts/2_sampling_submissions.py
index 339e875c..004bbc3c 100644
--- a/evaluation/scripts/2_sampling_submissions.py
+++ b/evaluation/scripts/2_sampling_submissions.py
@@ -31,19 +31,5 @@
 # %%
 from evaluation.service.json_service import group_exercise_data, exercises_to_json
 
-exercises = group_exercise_data(sampled_submissions)
+exercises = group_exercise_data(sampled_submissions, "Tutor")
 exercises_to_json(exercises, "../data/2_exercise_jsons")
-
-# %% [markdown]
-# ## Upload the json exercises from data/2_exercise_jsons to the playground. In evaluation mode, generate feedback for each exercise and export the results. Make sure that the configuration names for feedback generation do not contain underscores '_'.
-#
-# The downloaded json files should have the following naming scheme:
-#
-# ```text_results_<Configuration name (e.g.: LLM)>_<...>```
-#
-# **Do not change the names of the downloaded files!**
-#
-# Save these files in the data/3_feedback_suggestions directory
-
-# %% [markdown]
-# # Continue in the next notebook: [3_feedback_generation.ipynb](3_feedback_generation.ipynb)
diff --git a/evaluation/scripts/3_feedback_generation.py b/evaluation/scripts/3_feedback_generation.py
index 383e863d..bd974ce0 100644
--- a/evaluation/scripts/3_feedback_generation.py
+++ b/evaluation/scripts/3_feedback_generation.py
@@ -18,6 +18,17 @@
 
 sampled_submissions = pd.read_csv("../data/2_sampled_submissions.csv")
 
+# %% [markdown]
+# ### Upload the json files from data/2_exercise_jsons to the playground. In evaluation mode, generate feedback for each exercise and export the results. Make sure that the configuration names for feedback generation do not contain underscores '_'.
+#
+# The downloaded json files should have the following naming scheme:
+#
+# ```text_results_<Configuration name (e.g.: LLM)>_<...>```
+#
+# **Do not change the names of the downloaded files!**
+#
+# Save these files in the data/3_feedback_suggestions directory
+
 # %%
 from evaluation.service.json_service import read_result_files_to_dataframe, add_feedback_suggestions_to_data, fill_missing_feedback_with_tutor_feedback
 
@@ -28,6 +39,12 @@
 
 sampled_submissions_with_feedback.to_csv("../data/3_sampled_submissions_with_feedback.csv", index=False)
 
+# %%
+from evaluation.service.json_service import group_exercise_data, exercises_to_json
+
+exercises = group_exercise_data(sampled_submissions)
+exercises_to_json(exercises, "../data/3_submissions_with_categorized_feedback_jsons")
+
 # %%
 # Group by 'exercise_id' and 'result_score' to calculate submission counts
 grouped_data = (
diff --git a/evaluation/scripts/4_expert_evaluation.py b/evaluation/scripts/4_expert_evaluation.py
index 8917640d..39aa02c6 100644
--- a/evaluation/scripts/4_expert_evaluation.py
+++ b/evaluation/scripts/4_expert_evaluation.py
@@ -14,3 +14,16 @@
 # ---
 
 # %%
+import pandas as pd
+
+sampled_submissions = pd.read_csv("../data/3_sampled_submissions_with_feedback.csv")
+
+# %% [markdown]
+# ### Import the json files from data/3_submissions_with_categorized_feedback_jsons into a new expert evaluation in the playground. Conduct the evaluation and download the results as well as the evaluation configuration.
+#
+# **Do not change the names of the downloaded files!**
+#
+# Save these files in the data/4_expert_evaluation directory
+
+# %%
+# TODO: Implement the expert evaluation import
diff --git a/evaluation/service/json_service.py b/evaluation/service/json_service.py
index aa92a586..9110e8c3 100644
--- a/evaluation/service/json_service.py
+++ b/evaluation/service/json_service.py
@@ -1,6 +1,6 @@
 import json
 import os
-from typing import List
+from typing import List, Dict, Union
 
 import pandas as pd
 from evaluation.model.model import Exercise, Feedback, Submission, GradingCriterion, StructuredGradingInstruction
@@ -26,7 +26,7 @@ def validate_columns(df: pd.DataFrame, required_columns: List[str]) -> None:
         )
 
 
-def group_exercise_data(df: pd.DataFrame, feedback_type_filter: str = "Tutor") -> List[Exercise]:
+def group_exercise_data(df: pd.DataFrame, feedback_type_filter: str = None) -> List[Exercise]:
     """
     Groups exercises, submissions, grading instructions, and feedback of specified type into a structured format.
 
@@ -38,8 +38,10 @@ def group_exercise_data(df: pd.DataFrame, feedback_type_filter: str = "Tutor") -
         List[Exercise]: A list of Exercise objects.
     """
 
-    def process_feedbacks(submission_group: pd.DataFrame, exercise_id: int, submission_id: int) -> List[Feedback]:
+    def process_feedbacks(submission_group: pd.DataFrame, exercise_id: int, submission_id: int) -> Union[
+        List[Feedback], Dict[str, List[Feedback]]]:
         """Process feedbacks for a submission."""
+        categorized_feedbacks = {}
         feedbacks = []
         for feedback_id, feedback_group in submission_group.groupby("feedback_id"):
             feedback_details = feedback_group.iloc[0]
@@ -54,8 +56,13 @@ def process_feedbacks(submission_group: pd.DataFrame, exercise_id: int, submissi
                 exercise_id=exercise_id,
                 submission_id=submission_id
             )
-            feedbacks.append(feedback)
-        return feedbacks
+
+            if feedback_type_filter:  # Single feedback type
+                feedbacks.append(feedback)
+            else:  # Categorized feedback
+                categorized_feedbacks.setdefault(feedback_details["feedback_type"], []).append(feedback)
+
+        return feedbacks if feedback_type_filter else categorized_feedbacks
 
     def process_submissions(exercise_group: pd.DataFrame, exercise_id: int) -> List[Submission]:
         """Process submissions for an exercise."""
@@ -115,7 +122,8 @@ def process_grading_criteria(exercise_group: pd.DataFrame) -> List[GradingCriter
     ]
     validate_columns(df, required_columns)
 
-    df = df[df["feedback_type"] == feedback_type_filter]
+    if feedback_type_filter:
+        df = df[df["feedback_type"] == feedback_type_filter]
 
     exercises = []
     for exercise_id, exercise_group in df.groupby("exercise_id"):