Commit
mvp multiple approaches and cot
Enea_Gore committed Oct 24, 2024
1 parent b336006 commit 4854b16
Showing 8 changed files with 662 additions and 351 deletions.
3 changes: 3 additions & 0 deletions llm_core/llm_core/models/__init__.py
@@ -6,6 +6,7 @@


DefaultModelConfig: Type[ModelConfig]
MiniModelConfig: ModelConfig
default_model_name = os.environ.get("LLM_DEFAULT_MODEL")
evaluation_model_name = os.environ.get("LLM_EVALUATION_MODEL")

@@ -18,6 +19,8 @@
types.append(openai_config.OpenAIModelConfig)
if default_model_name in openai_config.available_models:
DefaultModelConfig = openai_config.OpenAIModelConfig
if "openai_gpt-4o-mini" in openai_config.available_models:
MiniModelConfig = openai_config.OpenAIModelConfig(model_name="openai_gpt-4o-mini", max_tokens=3000, temperature=0, top_p=0.9, presence_penalty=0, frequency_penalty=0)
if evaluation_model_name in openai_config.available_models:
evaluation_model = openai_config.available_models[evaluation_model_name]
except AttributeError:
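Unlike DefaultModelConfig, which is bound to a class, MiniModelConfig is bound to a ready-made OpenAIModelConfig instance pinned to openai_gpt-4o-mini. A minimal usage sketch (illustrative only; it assumes the OpenAI provider is configured and the model is available, and get_model() mirrors how the config is consumed later in generate_cot_suggestions):

from llm_core.models import MiniModelConfig

# MiniModelConfig is already an instance, so it can be passed around directly
# wherever a model config is expected, or turned into a chat model:
chat_model = MiniModelConfig.get_model()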
7 changes: 4 additions & 3 deletions modules/text/module_text_llm/module_text_llm/__main__.py
@@ -4,7 +4,7 @@

import nltk
import tiktoken

from module_text_llm.approach_controller import generate
from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger
@@ -13,6 +13,7 @@
from module_text_llm.evaluation import get_feedback_statistics, get_llm_statistics
from module_text_llm.generate_suggestions import generate_suggestions
from module_text_llm.generate_evaluation import generate_evaluation
from module_text_llm.generate_cot_suggestions import generate_cot_suggestions


@submissions_consumer
@@ -30,12 +31,12 @@ def select_submission(exercise: Exercise, submissions: List[Submission]) -> Subm
def process_incoming_feedback(exercise: Exercise, submission: Submission, feedbacks: List[Feedback]):
logger.info("process_feedback: Received %d feedbacks for submission %d of exercise %d.", len(feedbacks), submission.id, exercise.id)


# Entry point that dispatches between the available feedback generation approaches
@feedback_provider
async def suggest_feedback(exercise: Exercise, submission: Submission, is_graded: bool, module_config: Configuration) -> List[Feedback]:
logger.info("suggest_feedback: %s suggestions for submission %d of exercise %d were requested",
"Graded" if is_graded else "Non-graded", submission.id, exercise.id)
return await generate(exercise, submission, module_config.approach, module_config.debug)


@evaluation_provider
19 changes: 19 additions & 0 deletions modules/text/module_text_llm/module_text_llm/approach_controller.py
@@ -0,0 +1,19 @@

from typing import List, Optional, Sequence
from pydantic import BaseModel, Field

from athena import emit_meta
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger
from module_text_llm.config import ApproachConfigUnion, BasicApproachConfig, ChainOfThoughtConfig


from module_text_llm.helpers.utils import add_sentence_numbers, get_index_range_from_line_range, format_grading_instructions
from module_text_llm.generate_suggestions import generate_suggestions
from module_text_llm.generate_cot_suggestions import generate_cot_suggestions

async def generate(exercise: Exercise, submission: Submission, config: ApproachConfigUnion, debug: bool) -> List[Feedback]:
if isinstance(config, BasicApproachConfig):
return await generate_suggestions(exercise, submission, config, debug)
if isinstance(config, ChainOfThoughtConfig):
return await generate_cot_suggestions(exercise, submission, config, debug)
raise ValueError(f"Unsupported approach config: {type(config).__name__}")
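A short usage sketch of the dispatcher (not part of the commit; exercise and submission stand for already-loaded Athena objects, the wrapper function names are hypothetical, and the default model configs are assumed to be available):

from typing import List

from athena.text import Exercise, Submission, Feedback
from module_text_llm.approach_controller import generate
from module_text_llm.config import BasicApproachConfig, ChainOfThoughtConfig


async def suggest_with_basic(exercise: Exercise, submission: Submission) -> List[Feedback]:
    # Single-prompt approach: one LLM call produces the assessment directly.
    return await generate(exercise, submission, BasicApproachConfig(), debug=False)


async def suggest_with_cot(exercise: Exercise, submission: Submission) -> List[Feedback]:
    # Chain-of-thought approach: an initial assessment is generated and then refined.
    return await generate(exercise, submission, ChainOfThoughtConfig(), debug=False)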
82 changes: 75 additions & 7 deletions modules/text/module_text_llm/module_text_llm/config.py
@@ -1,13 +1,23 @@
from pydantic import BaseModel, Field

from typing import Union
from athena import config_schema_provider
from llm_core.models import ModelConfigType, DefaultModelConfig
from llm_core.models import ModelConfigType, DefaultModelConfig, MiniModelConfig
from module_text_llm.prompts.generate_suggestions import (
system_message as generate_suggestions_system_message,
human_message as generate_suggestions_human_message
)
from enum import Enum
from pydantic import root_validator
from abc import ABC, abstractmethod
from module_text_llm.prompts.cot_suggestions import (
system_message as generate_cot_suggestions_system_message,
human_message as generate_cot_suggestions_human_message
)


from module_text_llm.prompts.refined_cot_suggestions import (
system_message as generate_refined_cot_suggestions_system_message,
human_message as generate_refined_cot_suggestions_human_message
)
class GenerateSuggestionsPrompt(BaseModel):
"""\
Features available: **{problem_statement}**, **{example_solution}**, **{grading_instructions}**, **{max_points}**, **{bonus_points}**, **{submission}**
@@ -19,15 +29,73 @@ class GenerateSuggestionsPrompt(BaseModel):
human_message: str = Field(default=generate_suggestions_human_message,
description="Message from a human. The input on which the AI is supposed to act.")

class CoTGenerateSuggestionsPrompt(BaseModel):
"""\
Features available: **{problem_statement}**, **{example_solution}**, **{grading_instructions}**, **{max_points}**, **{bonus_points}**, **{submission}**
_Note: **{problem_statement}**, **{example_solution}**, or **{grading_instructions}** might be omitted if the input is too long._\
"""
system_message: str = Field(default=generate_cot_suggestions_system_message,
description="Message for priming AI behavior and instructing it what to do.")
human_message: str = Field(default=generate_cot_suggestions_human_message,
description="Message from a human. The input on which the AI is supposed to act.")
second_system_message: str = Field(default=generate_refined_cot_suggestions_system_message,
description="Message for priming AI behavior and instructing it what to do.")
answer_message: str = Field(default=generate_refined_cot_suggestions_human_message,
description="Message from a human. The input on which the AI is supposed to act.")

class ApproachType(str, Enum):
basic = "BasicApproach"
chain_of_thought = "ChainOfThought"


class ApproachConfig(BaseModel, ABC):
max_input_tokens: int = Field(default=3000, description="Maximum number of tokens in the input prompt.")
model: ModelConfigType = Field(default=DefaultModelConfig()) # type: ignore

# @abstractmethod
# def get_prompt(self):
# """Abstract method to get the appropriate prompt configuration."""
# pass

class Config:
# Serialize enum fields (e.g. ApproachType) by value in the generated schema
use_enum_values = True

class BasicApproachConfig(ApproachConfig):
"""This approach uses a LLM with a single prompt to generate feedback in a single step."""
generate_suggestions_prompt: GenerateSuggestionsPrompt = Field(default=GenerateSuggestionsPrompt())

# def get_prompt(self):
# return self.generate_suggestions_prompt

class ChainOfThoughtConfig(ApproachConfig):
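"""Chain-of-thought approach: a first prompt produces an initial assessment with reasoning and improvement suggestions, which a second prompt refines into the final feedback."""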
model: ModelConfigType = Field(default=MiniModelConfig) # type: ignore
generate_suggestions_prompt: CoTGenerateSuggestionsPrompt = Field(default=CoTGenerateSuggestionsPrompt())

# def get_prompt(self):
# return self.generate_suggestions_prompt

# available_approaches = [BasicApproachConfig, ChainOfThoughtConfig]
ApproachConfigUnion = Union[BasicApproachConfig, ChainOfThoughtConfig]

# def approach_factory(approach_type: ApproachType) -> ApproachConfig:
# if approach_type == ApproachType.basic:
# return BasicApproachConfig()
# elif approach_type == ApproachType.chain_of_thought:
# return ChainOfThoughtConfig()
# else:
# raise ValueError(f"Unknown approach type: {approach_type}")

@config_schema_provider
class Configuration(BaseModel):
debug: bool = Field(default=False, description="Enable debug mode.")
approach: ApproachConfigUnion = Field(default_factory=BasicApproachConfig) # Default to BasicApproach
# approach_type: ApproachType = Field(default=ApproachType.basic, description="Type of approach to use.")

# @root_validator(pre=True)
# def populate_approach(cls, values):
# """Automatically instantiate the correct approach based on approach_type."""
# approach_type = values.get('approach_type', ApproachType.basic)
# values['approach'] = approach_factory(approach_type)
# return values
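A brief sketch of the resulting configuration objects (illustrative only; it exercises the defaults wired up above, does not go through Athena's schema machinery, and assumes the OpenAI provider is configured so that MiniModelConfig exists):

from module_text_llm.config import Configuration, BasicApproachConfig, ChainOfThoughtConfig

# The default configuration still uses the single-prompt approach.
config = Configuration()
assert isinstance(config.approach, BasicApproachConfig)

# The chain-of-thought approach defaults to the gpt-4o-mini MiniModelConfig
# and carries both prompt pairs needed for the two-step flow.
cot = ChainOfThoughtConfig()
assert cot.generate_suggestions_prompt.second_system_message
assert cot.generate_suggestions_prompt.answer_message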

159 changes: 159 additions & 0 deletions modules/text/module_text_llm/module_text_llm/generate_cot_suggestions.py
@@ -0,0 +1,159 @@
from typing import List, Optional, Sequence
from pydantic import BaseModel, Field

from athena import emit_meta
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger

from module_text_llm.config import ChainOfThoughtConfig
from llm_core.utils.llm_utils import (
get_chat_prompt_with_formatting_instructions,
check_prompt_length_and_omit_features_if_necessary,
num_tokens_from_prompt,
)
from llm_core.utils.predict_and_parse import predict_and_parse

from module_text_llm.helpers.utils import add_sentence_numbers, get_index_range_from_line_range, format_grading_instructions

class FeedbackModel(BaseModel):
title: str = Field(description="Very short title, i.e. feedback category or similar", example="Logic Error")
description: str = Field(description="Feedback description")
line_start: Optional[int] = Field(description="Referenced line number start, or empty if unreferenced")
line_end: Optional[int] = Field(description="Referenced line number end, or empty if unreferenced")
credits: float = Field(0.0, description="Number of points received/deducted")
grading_instruction_id: Optional[int] = Field(
description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
)

class Config:
title = "Feedback"


class AssessmentModel(BaseModel):
"""Collection of feedbacks making up an assessment"""

feedbacks: List[FeedbackModel] = Field(description="Assessment feedbacks")

class Config:
title = "Assessment"

class InitialAssessment(BaseModel):
title: str = Field(description="Very short title, i.e. feedback category or similar", example="Logic Error")
description: str = Field(description="Feedback description")
line_start: Optional[int] = Field(description="Referenced line number start, or empty if unreferenced")
line_end: Optional[int] = Field(description="Referenced line number end, or empty if unreferenced")
credits: float = Field(0.0, description="Number of points received/deducted")
reasoning: str = Field(description="Reasoning why the feedback was given")
improvement_suggestion: str = Field(description="Suggestion for improvement for the student")

class InitialAssessmentModel(BaseModel):
"""Collection of feedbacks making up an assessment"""

feedbacks: List[InitialAssessment] = Field(description="Assessment feedbacks")

async def generate_cot_suggestions(exercise: Exercise, submission: Submission, config: ChainOfThoughtConfig, debug: bool) -> List[Feedback]:
model = config.model.get_model() # type: ignore[attr-defined]

prompt_input = {
"max_points": exercise.max_points,
"bonus_points": exercise.bonus_points,
"grading_instructions": format_grading_instructions(exercise.grading_instructions, exercise.grading_criteria),
"problem_statement": exercise.problem_statement or "No problem statement.",
"example_solution": exercise.example_solution,
"submission": add_sentence_numbers(submission.text)
}

chat_prompt = get_chat_prompt_with_formatting_instructions(
model=model,
system_message=config.generate_suggestions_prompt.system_message,
human_message=config.generate_suggestions_prompt.human_message,
pydantic_object=InitialAssessmentModel
)



# Check if the prompt is too long and omit features if necessary (in order of importance)
omittable_features = ["example_solution", "problem_statement", "grading_instructions"]
prompt_input, should_run = check_prompt_length_and_omit_features_if_necessary(
prompt=chat_prompt,
prompt_input=prompt_input,
max_input_tokens=config.max_input_tokens,
omittable_features=omittable_features,
debug=debug
)

# Skip if the prompt is too long
if not should_run:
logger.warning("Input too long. Skipping.")
if debug:
emit_meta("prompt", chat_prompt.format(**prompt_input))
emit_meta("error", f"Input too long {num_tokens_from_prompt(chat_prompt, prompt_input)} > {config.max_input_tokens}")
return []

initial_result = await predict_and_parse(
model=model,
chat_prompt=chat_prompt,
prompt_input=prompt_input,
pydantic_object=InitialAssessmentModel,
tags=[
f"exercise-{exercise.id}",
f"submission-{submission.id}",
]
)

second_prompt_input = {
"answer": initial_result,
"submission": add_sentence_numbers(submission.text)
}

second_chat_prompt = get_chat_prompt_with_formatting_instructions(
model=model,
system_message=config.generate_suggestions_prompt.second_system_message,
human_message=config.generate_suggestions_prompt.answer_message,
pydantic_object=AssessmentModel)

result = await predict_and_parse(
model=model,
chat_prompt=second_chat_prompt,
prompt_input=second_prompt_input,
pydantic_object=AssessmentModel,
tags=[
f"exercise-{exercise.id}",
f"submission-{submission.id}",
]
)

if debug:
emit_meta("generate_suggestions", {
"prompt": chat_prompt.format(**prompt_input),
"result": result.dict() if result is not None else None
})


if result is None:
return []

grading_instruction_ids = set(
grading_instruction.id
for criterion in exercise.grading_criteria or []
for grading_instruction in criterion.structured_grading_instructions
)

feedbacks = []
for feedback in result.feedbacks:
index_start, index_end = get_index_range_from_line_range(feedback.line_start, feedback.line_end, submission.text)
grading_instruction_id = feedback.grading_instruction_id if feedback.grading_instruction_id in grading_instruction_ids else None
feedbacks.append(Feedback(
exercise_id=exercise.id,
submission_id=submission.id,
title=feedback.title,
description=feedback.description,
index_start=index_start,
index_end=index_end,
credits=feedback.credits,
structured_grading_instruction_id=grading_instruction_id,
meta={}
))

return feedbacks
28 changes: 28 additions & 0 deletions modules/text/module_text_llm/module_text_llm/prompts/cot_suggestions.py
@@ -0,0 +1,28 @@
system_message = """
You are a grading assistant at a prestigious university tasked with grading student submissions for text exercises.
Your goal is to be as helpful as possible to the student while providing constructive feedback without revealing the solution.
In order to successfully complete this task, you must:
1. Analyze the problem statement and the provided grading instructions to understand the requirements of the task.
2. The problem solution is an example of a solution that meets the requirements of the task. Analyze the solution to understand the logic and the approach used to solve the problem, keeping in mind that student solutions might diverge and still be correct.
3. Analyze the student's submission with regard to the problem statement, so that you can create chunks of the solution that relate to a part of the problem statement.
4. Use the information gathered from the previous steps to provide constructive feedback to the student, guiding them towards the correct solution without revealing it.
5. If you have additional comments, create an unreferenced feedback.
6. For each feedback, make sure that credits are given only on the basis of the grading instructions and the sample solution; the minimal student answer that satisfies them should receive the credits. If you have notes or additional comments, include them in a new feedback with 0 credits and no reference.
You are tasked with grading the following exercise. Your response is shown directly to the student, so address the student directly:
The maximal amount of points for this exercise is {max_points}.
# Problem Statement
{problem_statement}
# Sample Solution
{example_solution}
# Grading Instructions
{grading_instructions}
"""

human_message = """\
Student\'s submission to grade (with sentence numbers <number>: <sentence>):
\"\"\"
{submission}
\"\"\"\
"""
17 changes: 17 additions & 0 deletions modules/text/module_text_llm/module_text_llm/prompts/refined_cot_suggestions.py
@@ -0,0 +1,17 @@
system_message = """
You gave the following feedback on the first iteration: {answer}
In this step you need to refine your feedback.
Make sure to follow these steps to assess and improve your feedback:
It should follow the grading instructions and the sample solution; if it does not, consider improvements.
If you have your own additional improvements that are not present in the grading instructions, add them in a new feedback with 0 credits and no reference.
Remember that your response is seen directly by students and it should address them directly.
For each feedback where the student has room for improvement, think about how the student could improve their solution.
Once you have thought about how the student can improve the solution, formulate it in a way that guides the student towards the correct solution without revealing it directly.
Consider improvements to the feedback if any of these points is not satisfied."""

human_message = """\
Student\'s submission to grade (with sentence numbers <number>: <sentence>):
\"\"\"
{submission}
\"\"\"\
"""