Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preliminary AI Feedback for all Apollon UML Diagrams #335

Merged
merged 17 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"request": "launch",
"cwd": "${workspaceFolder}/modules/programming/module_programming_llm",
"module": "module_programming_llm",
"justMyCode": true
"justMyCode": false
FelixTJDietrich marked this conversation as resolved.
Show resolved Hide resolved
},
{
"name": "Module Programming ThemisML",
Expand Down Expand Up @@ -62,7 +62,7 @@
"type": "python",
"request": "launch",
"cwd": "${workspaceFolder}/modules/modeling/module_modeling_llm",
"module": "module_text_cofee"
"module": "module_modeling_llm"
}
]
}
Expand Down
2 changes: 1 addition & 1 deletion assessment_module_manager/modules.docker.ini
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,6 @@ supports_graded_feedback_requests = false
url = http://module-modeling-llm:5008
type = modeling
supports_evaluation = false
supports_non_graded_feedback_requests = false
supports_non_graded_feedback_requests = true
supports_graded_feedback_requests = true

2 changes: 1 addition & 1 deletion assessment_module_manager/modules.ini
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,6 @@ supports_graded_feedback_requests = false
url = http://localhost:5008
type = modeling
supports_evaluation = false
supports_non_graded_feedback_requests = false
supports_non_graded_feedback_requests = true
supports_graded_feedback_requests = true

10 changes: 5 additions & 5 deletions modules/modeling/module_modeling_llm/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ DATABASE_URL=sqlite:///../data/data.sqlite

# Default model to use
# See below for options, available models are also logged on startup
LLM_DEFAULT_MODEL="azure_openai_gpt-35"
LLM_DEFAULT_MODEL="azure_openai_gpt-4o"

# Enable LLM-as-a-judge approach: 0 = disabled, 1 = enabled
LLM_ENABLE_LLM_AS_A_JUDGE=1
Expand All @@ -23,13 +23,13 @@ LLM_EVALUATION_MODEL="azure_openai_gpt-4"
# Standard OpenAI (Non-Azure) [leave blank if not used]
# Model names prefixed with `openai_` followed by the model name, e.g. `openai_text-davinci-003`
# A list of models can be found in `module_text_llm/helpers/models/openai.py` (openai_models)
LLM_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Azure OpenAI [leave blank if not used]
# Model names prefixed with `azure_openai_` followed by the deployment id, e.g. `azure_openai_gpt-35`
LLM_AZURE_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
LLM_AZURE_OPENAI_API_BASE="https://ase-eu01.openai.azure.com/" # change base if needed
LLM_AZURE_OPENAI_API_VERSION="2023-07-01-preview" # change base if needed
AZURE_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
AZURE_OPENAI_ENDPOINT="https://ase-eu01.openai.azure.com/" # change base if needed
OPENAI_API_VERSION="2023-07-01-preview" # change version if needed

# Replicate [leave blank if not used]
# See https://replicate.com and adjust model config options in `module_text_llm/helpers/models/replicate.py`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def process_incoming_feedback(exercise: Exercise, submission: Submission, feedba
async def suggest_feedback(exercise: Exercise, submission: Submission, is_graded: bool, module_config: Configuration) -> List[Feedback]:
logger.info("suggest_feedback: Suggestions for submission %d of exercise %d were requested", submission.id,
exercise.id)
return await generate_suggestions(exercise, submission, module_config.approach, module_config.debug)
return await generate_suggestions(exercise, submission, is_graded, module_config.approach, module_config.debug)


if __name__ == "__main__":
Expand Down
19 changes: 13 additions & 6 deletions modules/modeling/module_modeling_llm/module_modeling_llm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
from athena import config_schema_provider
from module_modeling_llm.helpers.models import ModelConfigType, DefaultModelConfig
from module_modeling_llm.prompts.generate_suggestions import (
system_message as generate_suggestions_system_message,
human_message as generate_suggestions_human_message
graded_feedback_system_message as default_graded_feedback_system_message,
graded_feedback_human_message as default_graded_feedback_human_message,
filter_feedback_system_message as default_filter_feedback_system_message,
filter_feedback_human_message as default_filter_feedback_human_message
)


Expand All @@ -16,10 +18,15 @@ class GenerateSuggestionsPrompt(BaseModel):
_Note: **{problem_statement}**, **{example_solution}**, or **{grading_instructions}** might be omitted if the input
is too long._
"""
system_message: str = Field(default=generate_suggestions_system_message,
description="Message for priming AI behavior and instructing it what to do.")
human_message: str = Field(default=generate_suggestions_human_message,
description="Message from a human. The input on which the AI is supposed to act.")
graded_feedback_system_message: str = Field(default=default_graded_feedback_system_message,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can remove the graded naming now

description="Message for priming AI behavior and instructing it what to do.")
graded_feedback_human_message: str = Field(default=default_graded_feedback_human_message,
description="Message from a human. The input on which the AI is supposed to act.")
filter_feedback_system_message: str = Field(default=default_filter_feedback_system_message,
description="Message for priming AI behavior for filtering ungraded feedback.")
filter_feedback_human_message: str = Field(default=default_filter_feedback_human_message,
description="Message for instructing AI to filter ungraded feedback.")



class BasicApproachConfig(BaseModel):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,28 +1,23 @@
import json
from typing import List, Optional, Sequence

from module_modeling_llm.prompts.apollon_format import apollon_format_description
from pydantic import BaseModel, Field

from athena import emit_meta
from athena.logger import logger
from athena.modeling import Exercise, Submission, Feedback
from module_modeling_llm.config import BasicApproachConfig
from module_modeling_llm.helpers.llm_utils import (
get_chat_prompt_with_formatting_instructions,
check_prompt_length_and_omit_features_if_necessary,
num_tokens_from_prompt,
predict_and_parse
)
from module_modeling_llm.helpers.models.diagram_types import DiagramType
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from module_modeling_llm.helpers.llm_utils import predict_and_parse
from module_modeling_llm.helpers.serializers.diagram_model_serializer import DiagramModelSerializer
from module_modeling_llm.helpers.utils import format_grading_instructions, get_elements
from module_modeling_llm.prompts.submission_format.submission_format_remarks import get_submission_format_remarks
from module_modeling_llm.helpers.utils import format_grading_instructions


class FeedbackModel(BaseModel):
title: str = Field(description="Very short title, i.e. feedback category or similar", example="Logic Error")
description: str = Field(description="Feedback description")
element_ids: Optional[str] = Field(description="Referenced diagram element IDs, or empty if unreferenced")
element_names: Optional[List[str]] = Field(description="Referenced diagram element names, and relations (R<number>) or empty if unreferenced")
FelixTJDietrich marked this conversation as resolved.
Show resolved Hide resolved
credits: float = Field(0.0, description="Number of points received/deducted")
grading_instruction_id: Optional[int] = Field(
description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
Expand All @@ -35,25 +30,13 @@ class Config:
class AssessmentModel(BaseModel):
"""Collection of feedbacks making up an assessment"""

feedbacks: Sequence[FeedbackModel] = Field(description="Assessment feedbacks")
feedbacks: Sequence[FeedbackModel] = Field(description="Assessment feedbacks, make sure to include all grading instructions")

class Config:
title = "Assessment"


def filter_ids_for_model(ids: List[str], model: dict) -> List[str]:
"""
Filter a list of element ids based on whether a corresponding element is present in a given diagram model.
:param ids: List of ids that should be filtered
:param model: Diagram model in which elements with the given ids should be contained
:return The filtered list of IDs
"""
elements: list[dict] = get_elements(model)
model_ids: set[str] = {str(element.get("id")) for element in elements}
return list(filter(lambda id: id in model_ids, ids))


async def generate_suggestions(exercise: Exercise, submission: Submission, config: BasicApproachConfig, debug: bool) -> \
async def generate_suggestions(exercise: Exercise, submission: Submission, is_graded: bool, config: BasicApproachConfig, debug: bool) -> \
List[Feedback]:
"""
Generate feedback suggestions for modeling exercise submissions
Expand All @@ -66,55 +49,28 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
model = config.model.get_model() # type: ignore[attr-defined]

serialized_example_solution = None

if exercise.example_solution:
example_solution_diagram = json.loads(exercise.example_solution)
serialized_example_solution, _ = DiagramModelSerializer.serialize_model(example_solution_diagram)

submission_diagram = json.loads(submission.model)
submission_format_remarks = get_submission_format_remarks(submission_diagram.get("type"))

# Having the LLM reference IDs that a specific feedback item applies to seems to work a lot more reliable with
# shorter IDs, especially if they are prefixed with "id_". We therefore map the UUIDs used in Apollon diagrams to
# shortened IDs and have the diagram model serializer return a reverse mapping dictionary which allows us to map
# the shortened IDs back to the original ones.
serialized_submission, reverse_id_map = DiagramModelSerializer.serialize_model(submission_diagram)
serialized_submission, element_id_mapping = DiagramModelSerializer.serialize_model(submission_diagram)

prompt_input = {
"max_points": exercise.max_points,
"bonus_points": exercise.bonus_points,
"grading_instructions": format_grading_instructions(exercise.grading_instructions, exercise.grading_criteria),
"submission_format_remarks": submission_format_remarks,
"submission_format": submission_diagram.get("type"),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would imagine that you select a prompt for each diagram type here instead of passing just the type into a generalized prompt.

"problem_statement": exercise.problem_statement or "No problem statement.",
"example_solution": serialized_example_solution or "No example solution.",
"submission": serialized_submission
"submission": serialized_submission,
"uml_diagram_format": apollon_format_description,
LeonWehrhahn marked this conversation as resolved.
Show resolved Hide resolved
"format_instructions": PydanticOutputParser(pydantic_object=AssessmentModel).get_format_instructions()
}

chat_prompt = get_chat_prompt_with_formatting_instructions(
model=model,
system_message=config.generate_suggestions_prompt.system_message,
human_message=config.generate_suggestions_prompt.human_message,
pydantic_object=AssessmentModel
)

# Check if the prompt is too long and omit features if necessary (in order of importance)
omittable_features = ["example_solution", "problem_statement", "grading_instructions"]
prompt_input, should_run = check_prompt_length_and_omit_features_if_necessary(
prompt=chat_prompt,
prompt_input=prompt_input,
max_input_tokens=10000, # config.max_input_tokens,
omittable_features=omittable_features,
debug=debug
)

# Skip if the prompt is too long
if not should_run:
logger.warning("Input too long. Skipping.")
if debug:
emit_meta("prompt", chat_prompt.format(**prompt_input))
emit_meta("error",
f"Input too long {num_tokens_from_prompt(chat_prompt, prompt_input)} > {config.max_input_tokens}")
return []
chat_prompt = ChatPromptTemplate.from_messages([
("system", config.generate_suggestions_prompt.graded_feedback_system_message),
("human", config.generate_suggestions_prompt.graded_feedback_human_message)])

result = await predict_and_parse(
model=model,
Expand All @@ -136,6 +92,38 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
if result is None:
return []

# Check if is graded
if is_graded is False:
filter_chat_prompt = ChatPromptTemplate.from_messages([
("system", config.generate_suggestions_prompt.filter_feedback_system_message),
("human", config.generate_suggestions_prompt.filter_feedback_human_message)
])

filter_prompt_input = {
"original_feedback": result.dict(),
"format_instructions": PydanticOutputParser(pydantic_object=AssessmentModel).get_format_instructions()
}

result = await predict_and_parse(
model=model,
chat_prompt=filter_chat_prompt,
prompt_input=filter_prompt_input,
pydantic_object=AssessmentModel,
tags=[
f"exercise-{exercise.id}-filter",
f"submission-{submission.id}-filter",
]
)

if debug:
emit_meta("filter_feedback", {
"prompt": filter_chat_prompt.format(**filter_prompt_input),
"result": result.dict() if result is not None else None
})

if result is None:
return []

grading_instruction_ids = set(
grading_instruction.id
for criterion in exercise.grading_criteria or []
Expand All @@ -145,21 +133,20 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
feedbacks = []
for feedback in result.feedbacks:
grading_instruction_id = feedback.grading_instruction_id if feedback.grading_instruction_id in grading_instruction_ids else None
element_ids = list(
map(lambda element_id: reverse_id_map[
element_id.strip()
] if reverse_id_map else element_id.strip(), feedback.element_ids.split(","))
) if feedback.element_ids else []
element_ids = [element_id_mapping[element] for element in (feedback.element_names or [])]


feedbacks.append(Feedback(
exercise_id=exercise.id,
submission_id=submission.id,
title=feedback.title,
description=feedback.description,
element_ids=filter_ids_for_model(element_ids, submission_diagram),
element_ids=element_ids,
credits=feedback.credits,
structured_grading_instruction_id=grading_instruction_id,
meta={}
meta={},
id=None,
is_graded=is_graded
))

return feedbacks
return feedbacks
Loading
Loading