Merge pull request #174 from ls1intum/feature/automatic-evaluation
Add automatic evaluation with LLM-as-a-Judge, LangSmith export, and SGI evaluation
FelixTJDietrich authored Nov 12, 2023
2 parents ec3cb22 + fe10281 commit 89e7047
Showing 33 changed files with 1,304 additions and 401 deletions.
@@ -27,6 +27,7 @@ class HealthResponse(BaseModel):
    """
    Response indicating whether the Assessment Module Manager is healthy,
    and whether all the modules are healthy (i.e. reachable).
    Additional information about the modules is also provided.
    """
    status: str = Field(const=True, default="ok", example="ok")
    modules: dict = Field(
@@ -35,7 +36,8 @@ class HealthResponse(BaseModel):
                "module_example": {
                    "url": "http://localhost:5001",
                    "type": "programming",
                    "healthy": True
                    "healthy": True,
                    "supportsEvaluation": True
                }
            }
        ]
@@ -56,6 +58,7 @@ async def get_health() -> HealthResponse:
            "url": module.url,
            "type": module.type,
            "healthy": await is_healthy(module),
            "supportsEvaluation": module.supports_evaluation
        }
        for module in get_modules()
    }
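With this change, each module entry in the GET /health response also reports whether it supports evaluation. Roughly, with illustrative values taken from the example above:

{
  "status": "ok",
  "modules": {
    "module_example": {
      "url": "http://localhost:5001",
      "type": "programming",
      "healthy": true,
      "supportsEvaluation": true
    }
  }
}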
@@ -20,6 +20,7 @@ def list_modules() -> List[Module]:
            name=module,
            url=cast(AnyHttpUrl, os.environ.get(f"{module.upper()}_URL", modules_config[module]["url"])),
            type=ExerciseType(modules_config[module]["type"]),
            supports_evaluation=modules_config[module].getboolean("supports_evaluation"),
        )
        for module in modules_config.sections()
    ]
@@ -8,3 +8,4 @@ class Module(BaseModel):
    name: str = Field(example="module_example")
    url: AnyHttpUrl = Field(example="http://localhost:5001")
    type: ExerciseType = Field(example=ExerciseType.text)
    supports_evaluation: bool = Field(description="Whether the module supports evaluation", example=True)
7 changes: 6 additions & 1 deletion assessment_module_manager/modules.docker.ini
@@ -1,19 +1,24 @@
[module_example]
url = http://module-example:5001
type = programming
supports_evaluation = true

[module_programming_llm]
url = http://module-programming-llm:5002
type = programming
supports_evaluation = false

[module_text_llm]
url = http://module-text-llm:5003
type = text
supports_evaluation = true

[module_text_cofee]
url = http://module-text-cofee:5004
type = text
supports_evaluation = false

[module_programming_themisml]
url = http://module-programming-themisml:5005
type = programming
supports_evaluation = false
5 changes: 5 additions & 0 deletions assessment_module_manager/modules.ini
@@ -1,19 +1,24 @@
[module_example]
url = http://localhost:5001
type = programming
supports_evaluation = true

[module_programming_llm]
url = http://localhost:5002
type = programming
supports_evaluation = false

[module_text_llm]
url = http://localhost:5003
type = text
supports_evaluation = true

[module_text_cofee]
url = http://localhost:5004
type = text
supports_evaluation = false

[module_programming_themisml]
url = http://localhost:5005
type = programming
supports_evaluation = false
3 changes: 2 additions & 1 deletion athena/athena/__init__.py
@@ -6,7 +6,7 @@
from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction
from .metadata import emit_meta, get_meta
from .experiment import get_experiment_environment
from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider # type: ignore
from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider, evaluation_provider # type: ignore


@app.get("/")
@@ -28,6 +28,7 @@ def run_module():
    "feedback_consumer",
    "feedback_provider",
    "config_schema_provider",
    "evaluation_provider",
    "emit_meta",
    "get_meta",
    "get_experiment_environment",
61 changes: 60 additions & 1 deletion athena/athena/endpoints.py
@@ -358,4 +358,63 @@ def config_schema_provider(cls: Type[C]) -> Type[C]:
    async def wrapper():
        return cls.schema()

    return cls


def evaluation_provider(func: Union[
    Callable[[E, S, List[F], List[F]], Any],
    Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Any]]
]):
    """
    Provide evaluated feedback to the Assessment Module Manager.

    Note: The evaluation provider is usually called during the research and development phase (by the Playground).

    Return arbitrary evaluation results.

    This decorator can be used with several types of functions: synchronous or asynchronous.

    Examples:
        Below are some examples of possible functions that you can decorate with this decorator:

        Without using module config (both synchronous and asynchronous forms):
        >>> @evaluation_provider
        ... def sync_evaluate_feedback(
        ...     exercise: Exercise, submission: Submission,
        ...     true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]
        ... ) -> Any:
        ...     # evaluate predicted feedback here and return evaluation results

        >>> @evaluation_provider
        ... async def async_evaluate_feedback(
        ...     exercise: Exercise, submission: Submission,
        ...     true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]
        ... ) -> Any:
        ...     # evaluate predicted feedback here and return evaluation results
    """
    exercise_type = inspect.signature(func).parameters["exercise"].annotation
    submission_type = inspect.signature(func).parameters["submission"].annotation
    feedback_type = inspect.signature(func).parameters["predicted_feedbacks"].annotation.__args__[0]

    @app.post("/evaluation", responses=module_responses)
    @authenticated
    @with_meta
    async def wrapper(
        exercise: exercise_type,
        submission: submission_type,
        true_feedbacks: List[feedback_type],
        predicted_feedbacks: List[feedback_type],
    ):
        # Retrieve existing metadata for the exercise, submission and feedback
        exercise.meta.update(get_stored_exercise_meta(exercise) or {})
        submission.meta.update(get_stored_submission_meta(submission) or {})
        for feedback in true_feedbacks + predicted_feedbacks:
            feedback.meta.update(get_stored_feedback_meta(feedback) or {})

        # Call the actual provider
        if inspect.iscoroutinefunction(func):
            evaluation = await func(exercise, submission, true_feedbacks, predicted_feedbacks)
        else:
            evaluation = func(exercise, submission, true_feedbacks, predicted_feedbacks)

        return evaluation
    return wrapper
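
For reference, a rough sketch of how a client such as the Playground might call the endpoint registered above. The "Authorization" header name and the placeholder payloads are assumptions, not taken from this diff; real requests carry full Exercise, Submission, and Feedback objects matching the module's schemas.

import requests

# Placeholder payloads (hypothetical); FastAPI binds the four body parameters by name.
exercise = {"id": 1}
submission = {"id": 1}
true_feedbacks = []        # historical (human) feedback
predicted_feedbacks = []   # feedback suggestions to evaluate

response = requests.post(
    "http://localhost:5003/evaluation",            # e.g. module_text_llm, see modules.ini
    headers={"Authorization": "<module secret>"},  # assumed header checked by @authenticated
    json={
        "exercise": exercise,
        "submission": submission,
        "true_feedbacks": true_feedbacks,
        "predicted_feedbacks": predicted_feedbacks,
    },
    timeout=60,
)
evaluation = response.json()  # arbitrary evaluation results returned by the provider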
33 changes: 32 additions & 1 deletion docs/module/structure.rst
@@ -94,7 +94,7 @@ Example:
)
]
Provide Config Schema
Provide Config Schema (Optional)
~~~~~~~~~~~~~~~~~~~~~~
Get a schema for the module's config options as a JSON schema. A config complying with the schema can then be provided in the ``X-Module-Config`` request header to override the default values. The module can decorate one pydantic model with ``@config_schema_provider`` to provide the schema and should have default values set for all fields as the default configuration. The configuration class can be appended to the function signature of all other decorators to provide the configuration to the function.

Expand All @@ -108,6 +108,37 @@ Example:
debug: bool = Field(False, description="Whether the module is in debug mode.")
...
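
For illustration, a minimal sketch of how a config class might be combined with another provider; the ``BasicConfiguration`` class and its fields are hypothetical and only meant as an example:

.. code-block:: python

    from typing import List

    from pydantic import BaseModel, Field

    from athena import config_schema_provider, feedback_provider
    from athena.text import Exercise, Submission, Feedback


    @config_schema_provider
    class BasicConfiguration(BaseModel):
        debug: bool = Field(False, description="Whether the module is in debug mode.")
        max_suggestions: int = Field(3, description="Maximum number of feedback suggestions to return.")


    @feedback_provider
    async def suggest_feedback(exercise: Exercise, submission: Submission, module_config: BasicConfiguration) -> List[Feedback]:
        # module_config carries the defaults, overridden per request via the X-Module-Config header
        if module_config.debug:
            ...
        return []
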
Provide Evaluation (Optional)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Get an arbitrary evaluation for a submission with historical ``true_feedbacks`` (human feedback) and feedback suggestions ``predicted_feedbacks``. The Playground usually calls this endpoint when conducting an evaluation during an experiment. The module receives the request at the function annotated with ``@evaluation_provider``.

If you want to have the ``/evaluation`` endpoint available during the Playground evaluation mode, you need to set ``supports_evaluation = true`` in the ``modules.ini`` and ``modules.docker.ini`` files.

Example:
    .. code-block:: python

        import random
        from typing import List, Any

        from athena import *

        @evaluation_provider
        def evaluate_feedback(exercise: Exercise, submission: Submission, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]) -> Any:
            # Do something with the true and predicted feedback and return the evaluation result
            ...
            # Example: Generate some example evaluation result
            evaluation_results = []
            true_feedback_embeddings = [random.random() for _ in true_feedbacks]
            predicted_feedback_embeddings = [random.random() for _ in predicted_feedbacks]
            for feedback, embedding in zip(predicted_feedbacks, predicted_feedback_embeddings):
                feedback_evaluation = {
                    "feedback_id": feedback.id,
                    "embedding": embedding,
                    "has_match": len([t for t in true_feedback_embeddings if abs(t - embedding) < 0.1]) > 0,
                    "correctness": random.random()
                }
                evaluation_results.append(feedback_evaluation)
            ...
            # Return arbitrary evaluation results
            return evaluation_results
Environment Variables
---------------------
You should provide at least the following environment variables for your module to work properly:
30 changes: 28 additions & 2 deletions module_example/module_example/__main__.py
@@ -1,10 +1,11 @@
"""
Entry point for the module_example module.
"""
from typing import List
import random
from typing import List, Any
from pydantic import BaseModel, Field

from athena import app, config_schema_provider, submissions_consumer, submission_selector, feedback_consumer, feedback_provider, emit_meta
from athena import app, config_schema_provider, submissions_consumer, submission_selector, feedback_consumer, feedback_provider, evaluation_provider, emit_meta
from athena.programming import Exercise, Submission, Feedback
from athena.logger import logger
from athena.storage import store_exercise, store_submissions, store_feedback
@@ -139,5 +140,30 @@ def suggest_feedback(exercise: Exercise, submission: Submission, module_config:
]


# Only if it makes sense for a module (Optional)
@evaluation_provider
def evaluate_feedback(exercise: Exercise, submission: Submission, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]) -> Any:
    logger.info(
        "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks",
        submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks)
    )

    # Do something with the true and predicted feedback and return the evaluation result
    # Generate some example evaluation result
    evaluation_results = []
    true_feedback_embeddings = [random.random() for _ in true_feedbacks]
    predicted_feedback_embeddings = [random.random() for _ in predicted_feedbacks]
    for feedback, embedding in zip(predicted_feedbacks, predicted_feedback_embeddings):
        feedback_evaluation = {
            "feedback_id": feedback.id,
            "embedding": embedding,
            "has_match": len([t for t in true_feedback_embeddings if abs(t - embedding) < 0.1]) > 0,
            "correctness": random.random()
        }
        evaluation_results.append(feedback_evaluation)

    return evaluation_results


if __name__ == "__main__":
    app.start()
@@ -227,12 +227,19 @@ async def generate_suggestions_by_file(exercise: Exercise, submission: Submissio
]
)

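    # Collect the exercise's structured grading instruction (SGI) ids so that a suggested
    # SGI id that does not exist in the exercise (e.g. hallucinated by the LLM) is replaced with None below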
    grading_instruction_ids = set(
        grading_instruction.id
        for criterion in exercise.grading_criteria or []
        for grading_instruction in criterion.structured_grading_instructions
    )

    feedbacks: List[Feedback] = []
    for prompt_input, result in zip(prompt_inputs, results):
        file_path = prompt_input["file_path"]
        if result is None:
            continue
        for feedback in result.feedbacks:
            grading_instruction_id = feedback.grading_instruction_id if feedback.grading_instruction_id in grading_instruction_ids else None
            feedbacks.append(Feedback(
                exercise_id=exercise.id,
                submission_id=submission.id,
@@ -242,7 +249,7 @@ async def generate_suggestions_by_file(exercise: Exercise, submission: Submissio
                line_start=feedback.line_start,
                line_end=feedback.line_end,
                credits=feedback.credits,
                structured_grading_instruction_id=feedback.grading_instruction_id,
                structured_grading_instruction_id=grading_instruction_id,
                meta={}
            ))
6 changes: 6 additions & 0 deletions module_text_llm/.env.example
@@ -14,6 +14,12 @@ DATABASE_URL=sqlite:///../data/data.sqlite
# See below for options, available models are also logged on startup
LLM_DEFAULT_MODEL="azure_openai_gpt-35"

# Enable the LLM-as-a-judge approach: 0 = disabled, 1 = enabled
LLM_ENABLE_LLM_AS_A_JUDGE=1
# Evaluation model to use for the LLM-as-a-judge approach [only relevant if you want to use the /evaluation endpoint]
# See below for options, available models are also logged on startup
LLM_EVALUATION_MODEL="azure_openai_gpt-4"

# Standard OpenAI (Non-Azure) [leave blank if not used]
# Model names prefixed with `openai_` followed by the model name, e.g. `openai_text-davinci-003`
# A list of models can be found in `module_text_llm/helpers/models/openai.py` (openai_models)
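For context, a hedged sketch of one way a module might read these flags; this is not the code added in this commit. Note that bool(os.environ.get(...)) treats any non-empty value, including "0", as enabled, so an explicit comparison is needed if 0 should actually disable the feature.

import os

def is_enabled(name: str) -> bool:
    # Only "1"/"true"/"yes" count as enabled; avoids the bool("0") == True pitfall
    return os.environ.get(name, "").strip().lower() in ("1", "true", "yes")

llm_as_a_judge_enabled = is_enabled("LLM_ENABLE_LLM_AS_A_JUDGE")
evaluation_model = os.environ.get("LLM_EVALUATION_MODEL", "azure_openai_gpt-4")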
34 changes: 32 additions & 2 deletions module_text_llm/module_text_llm/__main__.py
@@ -1,14 +1,18 @@
from typing import List
import json
import os
from typing import List, Any

import nltk
import tiktoken

from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider
from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger

from module_text_llm.config import Configuration
from module_text_llm.evaluation import get_feedback_statistics, get_llm_statistics
from module_text_llm.generate_suggestions import generate_suggestions
from module_text_llm.generate_evaluation import generate_evaluation


@submissions_consumer
@@ -33,6 +37,32 @@ async def suggest_feedback(exercise: Exercise, submission: Submission, module_co
    return await generate_suggestions(exercise, submission, module_config.approach, module_config.debug)


@evaluation_provider
async def evaluate_feedback(
        exercise: Exercise, submission: Submission,
        true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback],
) -> Any:
    logger.info(
        "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks",
        submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks)
    )

    evaluation = {}

    # 1. LLM as a judge
    if len(predicted_feedbacks) > 0 and bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")):
        evaluation["llm_as_a_judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks)

    # 2. LangSmith runs, token usage, and response times
    if bool(os.environ.get("LANGCHAIN_TRACING_V2")):
        evaluation["llm_statistics"] = get_llm_statistics(submission)

    # 3. Feedback statistics
    evaluation["feedback_statistics"] = get_feedback_statistics(exercise, true_feedbacks, predicted_feedbacks)

    return evaluation

if __name__ == "__main__":
    nltk.download("punkt")
    tiktoken.get_encoding("cl100k_base")
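For orientation, the evaluation returned above is a plain dict keyed by evaluation type; a hedged illustration of its top-level shape, with the nested values left as placeholders produced by generate_evaluation, get_llm_statistics, and get_feedback_statistics:

example_evaluation = {
    "llm_as_a_judge": ...,        # result of generate_evaluation (LLM-as-a-judge)
    "llm_statistics": ...,        # result of get_llm_statistics (LangSmith runs, token usage, response times)
    "feedback_statistics": ...,   # result of get_feedback_statistics (statistics over true and predicted feedback)
}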
