Merge pull request #174 from ls1intum/feature/automatic-evaluation
Add automatic evaluation with LLM-as-a-Judge, LangSmith export, and SGI evaluation
FelixTJDietrich authored Nov 12, 2023
2 parents ec3cb22 + fe10281 commit 89e7047
Showing 33 changed files with 1,304 additions and 401 deletions.
@@ -27,6 +27,7 @@ class HealthResponse(BaseModel):
    """
    Response indicating whether the Assessment Module Manager is healthy,
    and whether all the modules are healthy (i.e. reachable).
    Additional information about the modules is also provided.
    """
    status: str = Field(const=True, default="ok", example="ok")
    modules: dict = Field(
@@ -35,7 +36,8 @@ class HealthResponse(BaseModel):
                "module_example": {
                    "url": "http://localhost:5001",
                    "type": "programming",
                    "healthy": True
                    "healthy": True,
                    "supportsEvaluation": True
                }
            }
        ]
@@ -56,6 +58,7 @@ async def get_health() -> HealthResponse:
            "url": module.url,
            "type": module.type,
            "healthy": await is_healthy(module),
            "supportsEvaluation": module.supports_evaluation
        }
        for module in get_modules()
    }
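With this change, each module entry in the GET /health response also reports whether it supports evaluation. Roughly, with illustrative values taken from the example above:

{
  "status": "ok",
  "modules": {
    "module_example": {
      "url": "http://localhost:5001",
      "type": "programming",
      "healthy": true,
      "supportsEvaluation": true
    }
  }
}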
@@ -20,6 +20,7 @@ def list_modules() -> List[Module]:
            name=module,
            url=cast(AnyHttpUrl, os.environ.get(f"{module.upper()}_URL", modules_config[module]["url"])),
            type=ExerciseType(modules_config[module]["type"]),
            supports_evaluation=modules_config[module].getboolean("supports_evaluation"),
        )
        for module in modules_config.sections()
    ]
@@ -8,3 +8,4 @@ class Module(BaseModel):
    name: str = Field(example="module_example")
    url: AnyHttpUrl = Field(example="http://localhost:5001")
    type: ExerciseType = Field(example=ExerciseType.text)
    supports_evaluation: bool = Field(description="Whether the module supports evaluation", example=True)
7 changes: 6 additions & 1 deletion assessment_module_manager/modules.docker.ini
@@ -1,19 +1,24 @@
[module_example]
url = http://module-example:5001
type = programming
supports_evaluation = true

[module_programming_llm]
url = http://module-programming-llm:5002
type = programming
supports_evaluation = false

[module_text_llm]
url = http://module-text-llm:5003
type = text
supports_evaluation = true

[module_text_cofee]
url = http://module-text-cofee:5004
type = text
supports_evaluation = false

[module_programming_themisml]
url = http://module-programming-themisml:5005
type = programming
supports_evaluation = false
5 changes: 5 additions & 0 deletions assessment_module_manager/modules.ini
@@ -1,19 +1,24 @@
[module_example]
url = http://localhost:5001
type = programming
supports_evaluation = true

[module_programming_llm]
url = http://localhost:5002
type = programming
supports_evaluation = false

[module_text_llm]
url = http://localhost:5003
type = text
supports_evaluation = true

[module_text_cofee]
url = http://localhost:5004
type = text
supports_evaluation = false

[module_programming_themisml]
url = http://localhost:5005
type = programming
supports_evaluation = false
3 changes: 2 additions & 1 deletion athena/athena/__init__.py
@@ -6,7 +6,7 @@
from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction
from .metadata import emit_meta, get_meta
from .experiment import get_experiment_environment
from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider # type: ignore
from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider, evaluation_provider # type: ignore


@app.get("/")
@@ -28,6 +28,7 @@ def run_module():
    "feedback_consumer",
    "feedback_provider",
    "config_schema_provider",
    "evaluation_provider",
    "emit_meta",
    "get_meta",
    "get_experiment_environment",
61 changes: 60 additions & 1 deletion athena/athena/endpoints.py
@@ -358,4 +358,63 @@ def config_schema_provider(cls: Type[C]) -> Type[C]:
    async def wrapper():
        return cls.schema()

    return cls


def evaluation_provider(func: Union[
    Callable[[E, S, List[F], List[F]], Any],
    Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Any]]
]):
    """
    Provide evaluated feedback to the Assessment Module Manager.

    Note: The evaluation provider is usually called during the research and development phase (by the Playground).

    Return arbitrary evaluation results.

    This decorator can be used with several types of functions: synchronous or asynchronous.

    Examples:
        Below are some examples of possible functions that you can decorate with this decorator:

        Without using module config (both synchronous and asynchronous forms):
        >>> @evaluation_provider
        ... def sync_evaluate_feedback(
        ...     exercise: Exercise, submission: Submission,
        ...     true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]
        ... ) -> Any:
        ...     # evaluate predicted feedback here and return evaluation results

        >>> @evaluation_provider
        ... async def async_evaluate_feedback(
        ...     exercise: Exercise, submission: Submission,
        ...     true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]
        ... ) -> Any:
        ...     # evaluate predicted feedback here and return evaluation results
    """
    exercise_type = inspect.signature(func).parameters["exercise"].annotation
    submission_type = inspect.signature(func).parameters["submission"].annotation
    feedback_type = inspect.signature(func).parameters["predicted_feedbacks"].annotation.__args__[0]

    @app.post("/evaluation", responses=module_responses)
    @authenticated
    @with_meta
    async def wrapper(
        exercise: exercise_type,
        submission: submission_type,
        true_feedbacks: List[feedback_type],
        predicted_feedbacks: List[feedback_type],
    ):
        # Retrieve existing metadata for the exercise, submission and feedback
        exercise.meta.update(get_stored_exercise_meta(exercise) or {})
        submission.meta.update(get_stored_submission_meta(submission) or {})
        for feedback in true_feedbacks + predicted_feedbacks:
            feedback.meta.update(get_stored_feedback_meta(feedback) or {})

        # Call the actual provider
        if inspect.iscoroutinefunction(func):
            evaluation = await func(exercise, submission, true_feedbacks, predicted_feedbacks)
        else:
            evaluation = func(exercise, submission, true_feedbacks, predicted_feedbacks)

        return evaluation
    return wrapper
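
For reference, a rough sketch of how a client such as the Playground might call the endpoint registered above. The "Authorization" header name and the placeholder payloads are assumptions, not taken from this diff; real requests carry full Exercise, Submission, and Feedback objects matching the module's schemas.

import requests

# Placeholder payloads (hypothetical); FastAPI binds the four body parameters by name.
exercise = {"id": 1}
submission = {"id": 1}
true_feedbacks = []        # historical (human) feedback
predicted_feedbacks = []   # feedback suggestions to evaluate

response = requests.post(
    "http://localhost:5003/evaluation",            # e.g. module_text_llm, see modules.ini
    headers={"Authorization": "<module secret>"},  # assumed header checked by @authenticated
    json={
        "exercise": exercise,
        "submission": submission,
        "true_feedbacks": true_feedbacks,
        "predicted_feedbacks": predicted_feedbacks,
    },
    timeout=60,
)
evaluation = response.json()  # arbitrary evaluation results returned by the provider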
33 changes: 32 additions & 1 deletion docs/module/structure.rst
@@ -94,7 +94,7 @@ Example:
)
]
Provide Config Schema
Provide Config Schema (Optional)
~~~~~~~~~~~~~~~~~~~~~~
Get a schema for the module's config options as a JSON schema. A config complying with the schema can then be provided in the ``X-Module-Config`` request header to override the default values. The module can decorate one pydantic model with ``@config_schema_provider`` to provide the schema and should have default values set for all fields as the default configuration. The configuration class can be appended to the function signature of all other decorators to provide the configuration to the function.

Expand All @@ -108,6 +108,37 @@ Example:
debug: bool = Field(False, description="Whether the module is in debug mode.")
...
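
For illustration, a minimal sketch of how a config class might be combined with another provider; the ``BasicConfiguration`` class and its fields are hypothetical and only meant as an example:

.. code-block:: python

    from typing import List

    from pydantic import BaseModel, Field

    from athena import config_schema_provider, feedback_provider
    from athena.text import Exercise, Submission, Feedback


    @config_schema_provider
    class BasicConfiguration(BaseModel):
        debug: bool = Field(False, description="Whether the module is in debug mode.")
        max_suggestions: int = Field(3, description="Maximum number of feedback suggestions to return.")


    @feedback_provider
    async def suggest_feedback(exercise: Exercise, submission: Submission, module_config: BasicConfiguration) -> List[Feedback]:
        # module_config carries the defaults, overridden per request via the X-Module-Config header
        if module_config.debug:
            ...
        return []
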
Provide Evaluation (Optional)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Get an arbitrary evaluation for a submission with historical ``true_feedbacks`` (human feedback) and feedback suggestions ``predicted_feedbacks``. The Playground usually calls this endpoint when conducting an evaluation during an experiment. The module receives the request at the function annotated with ``@evaluation_provider``.

If you want to have the ``/evaluation`` endpoint available during the Playground evaluation mode, you need to set ``supports_evaluation = true`` in the ``modules.ini`` and ``modules.docker.ini`` files.

Example:
    .. code-block:: python

        import random
        from typing import List, Any

        from athena import *

        @evaluation_provider
        def evaluate_feedback(exercise: Exercise, submission: Submission, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]) -> Any:
            # Do something with the true and predicted feedback and return the evaluation result
            ...
            # Example: Generate some example evaluation result
            evaluation_results = []
            true_feedback_embeddings = [random.random() for _ in true_feedbacks]
            predicted_feedback_embeddings = [random.random() for _ in predicted_feedbacks]
            for feedback, embedding in zip(predicted_feedbacks, predicted_feedback_embeddings):
                feedback_evaluation = {
                    "feedback_id": feedback.id,
                    "embedding": embedding,
                    "has_match": len([t for t in true_feedback_embeddings if abs(t - embedding) < 0.1]) > 0,
                    "correctness": random.random()
                }
                evaluation_results.append(feedback_evaluation)
            ...
            # Return arbitrary evaluation results
            return evaluation_results
Environment Variables
---------------------
You should provide at least the following environment variables for your module to work properly:
30 changes: 28 additions & 2 deletions module_example/module_example/__main__.py
@@ -1,10 +1,11 @@
"""
Entry point for the module_example module.
"""
from typing import List
import random
from typing import List, Any
from pydantic import BaseModel, Field

from athena import app, config_schema_provider, submissions_consumer, submission_selector, feedback_consumer, feedback_provider, emit_meta
from athena import app, config_schema_provider, submissions_consumer, submission_selector, feedback_consumer, feedback_provider, evaluation_provider, emit_meta
from athena.programming import Exercise, Submission, Feedback
from athena.logger import logger
from athena.storage import store_exercise, store_submissions, store_feedback
@@ -139,5 +140,30 @@ def suggest_feedback(exercise: Exercise, submission: Submission, module_config:
]


# Only if it makes sense for a module (Optional)
@evaluation_provider
def evaluate_feedback(exercise: Exercise, submission: Submission, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]) -> Any:
    logger.info(
        "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks",
        submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks)
    )

    # Do something with the true and predicted feedback and return the evaluation result
    # Generate some example evaluation result
    evaluation_results = []
    true_feedback_embeddings = [random.random() for _ in true_feedbacks]
    predicted_feedback_embeddings = [random.random() for _ in predicted_feedbacks]
    for feedback, embedding in zip(predicted_feedbacks, predicted_feedback_embeddings):
        feedback_evaluation = {
            "feedback_id": feedback.id,
            "embedding": embedding,
            "has_match": len([t for t in true_feedback_embeddings if abs(t - embedding) < 0.1]) > 0,
            "correctness": random.random()
        }
        evaluation_results.append(feedback_evaluation)

    return evaluation_results


if __name__ == "__main__":
    app.start()
@@ -227,12 +227,19 @@ async def generate_suggestions_by_file(exercise: Exercise, submission: Submissio
]
)

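    # Collect the exercise's structured grading instruction (SGI) ids so that a suggested
    # SGI id that does not exist in the exercise (e.g. hallucinated by the LLM) is replaced with None below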
    grading_instruction_ids = set(
        grading_instruction.id
        for criterion in exercise.grading_criteria or []
        for grading_instruction in criterion.structured_grading_instructions
    )

    feedbacks: List[Feedback] = []
    for prompt_input, result in zip(prompt_inputs, results):
        file_path = prompt_input["file_path"]
        if result is None:
            continue
        for feedback in result.feedbacks:
            grading_instruction_id = feedback.grading_instruction_id if feedback.grading_instruction_id in grading_instruction_ids else None
            feedbacks.append(Feedback(
                exercise_id=exercise.id,
                submission_id=submission.id,
@@ -242,7 +249,7 @@ async def generate_suggestions_by_file(exercise: Exercise, submission: Submissio
                line_start=feedback.line_start,
                line_end=feedback.line_end,
                credits=feedback.credits,
                structured_grading_instruction_id=feedback.grading_instruction_id,
                structured_grading_instruction_id=grading_instruction_id,
                meta={}
            ))
6 changes: 6 additions & 0 deletions module_text_llm/.env.example
@@ -14,6 +14,12 @@ DATABASE_URL=sqlite:///../data/data.sqlite
# See below for options, available models are also logged on startup
LLM_DEFAULT_MODEL="azure_openai_gpt-35"

# Enable the LLM-as-a-judge approach: 0 = disabled, 1 = enabled
LLM_ENABLE_LLM_AS_A_JUDGE=1
# Evaluation model to use for the LLM-as-a-judge approach [only relevant if you want to use the /evaluation endpoint]
# See below for options, available models are also logged on startup
LLM_EVALUATION_MODEL="azure_openai_gpt-4"

# Standard OpenAI (Non-Azure) [leave blank if not used]
# Model names prefixed with `openai_` followed by the model name, e.g. `openai_text-davinci-003`
# A list of models can be found in `module_text_llm/helpers/models/openai.py` (openai_models)
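For context, a hedged sketch of one way a module might read these flags; this is not the code added in this commit. Note that bool(os.environ.get(...)) treats any non-empty value, including "0", as enabled, so an explicit comparison is needed if 0 should actually disable the feature.

import os

def is_enabled(name: str) -> bool:
    # Only "1"/"true"/"yes" count as enabled; avoids the bool("0") == True pitfall
    return os.environ.get(name, "").strip().lower() in ("1", "true", "yes")

llm_as_a_judge_enabled = is_enabled("LLM_ENABLE_LLM_AS_A_JUDGE")
evaluation_model = os.environ.get("LLM_EVALUATION_MODEL", "azure_openai_gpt-4")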
34 changes: 32 additions & 2 deletions module_text_llm/module_text_llm/__main__.py
@@ -1,14 +1,18 @@
from typing import List
import json
import os
from typing import List, Any

import nltk
import tiktoken

from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider
from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger

from module_text_llm.config import Configuration
from module_text_llm.evaluation import get_feedback_statistics, get_llm_statistics
from module_text_llm.generate_suggestions import generate_suggestions
from module_text_llm.generate_evaluation import generate_evaluation


@submissions_consumer
@@ -33,6 +37,32 @@ async def suggest_feedback(exercise: Exercise, submission: Submission, module_co
    return await generate_suggestions(exercise, submission, module_config.approach, module_config.debug)


@evaluation_provider
async def evaluate_feedback(
        exercise: Exercise, submission: Submission,
        true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback],
) -> Any:
    logger.info(
        "evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks",
        submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks)
    )

    evaluation = {}

    # 1. LLM as a judge
    if len(predicted_feedbacks) > 0 and bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")):
        evaluation["llm_as_a_judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks)

    # 2. LangSmith runs, token usage, and response times
    if bool(os.environ.get("LANGCHAIN_TRACING_V2")):
        evaluation["llm_statistics"] = get_llm_statistics(submission)

    # 3. Feedback statistics
    evaluation["feedback_statistics"] = get_feedback_statistics(exercise, true_feedbacks, predicted_feedbacks)

    return evaluation

if __name__ == "__main__":
    nltk.download("punkt")
    tiktoken.get_encoding("cl100k_base")
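For orientation, the evaluation returned above is a plain dict keyed by evaluation type; a hedged illustration of its top-level shape, with the nested values left as placeholders produced by generate_evaluation, get_llm_statistics, and get_feedback_statistics:

example_evaluation = {
    "llm_as_a_judge": ...,        # result of generate_evaluation (LLM-as-a-judge)
    "llm_statistics": ...,        # result of get_llm_statistics (LangSmith runs, token usage, response times)
    "feedback_statistics": ...,   # result of get_feedback_statistics (statistics over true and predicted feedback)
}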
