From fd92e60698ab437cf19291a432698d4e68a6fcd5 Mon Sep 17 00:00:00 2001 From: Michael Dyer <59163924+MichaelOwenDyer@users.noreply.github.com> Date: Tue, 27 Aug 2024 13:38:44 +0200 Subject: [PATCH] Fix competency extraction feature (#145) --- app/domain/__init__.py | 3 + .../chat/chat_pipeline_execution_dto.py | 7 +- ...tency_extraction_pipeline_execution_dto.py | 22 +++++ app/domain/data/competency_dto.py | 27 ++++++ .../ingestion_pipeline_execution_dto.py | 9 +- app/domain/pipeline_execution_dto.py | 12 ++- ...competency_extraction_status_update_dto.py | 6 ++ app/llm/external/openai_chat.py | 1 - app/pipeline/__init__.py | 2 +- app/pipeline/chat/course_chat_pipeline.py | 3 +- .../competency_extraction_pipeline.py | 96 +++++++++++++++++++ app/pipeline/prompts/competency_extraction.py | 44 +++++++++ app/web/routers/pipelines.py | 41 ++++++++ app/web/status/status_update.py | 56 +++++++---- 14 files changed, 292 insertions(+), 37 deletions(-) create mode 100644 app/domain/competency_extraction_pipeline_execution_dto.py create mode 100644 app/domain/status/competency_extraction_status_update_dto.py create mode 100644 app/pipeline/competency_extraction_pipeline.py create mode 100644 app/pipeline/prompts/competency_extraction.py diff --git a/app/domain/__init__.py b/app/domain/__init__.py index 2f56f3f3..b32ca726 100644 --- a/app/domain/__init__.py +++ b/app/domain/__init__.py @@ -3,6 +3,9 @@ from .pipeline_execution_settings_dto import PipelineExecutionSettingsDTO from .chat.chat_pipeline_execution_dto import ChatPipelineExecutionDTO from .chat.chat_pipeline_execution_base_data_dto import ChatPipelineExecutionBaseDataDTO +from .competency_extraction_pipeline_execution_dto import ( + CompetencyExtractionPipelineExecutionDTO, +) from app.domain.chat.exercise_chat.exercise_chat_pipeline_execution_dto import ( ExerciseChatPipelineExecutionDTO, ) diff --git a/app/domain/chat/chat_pipeline_execution_dto.py b/app/domain/chat/chat_pipeline_execution_dto.py index 31fa7593..e3e63284 100644 --- a/app/domain/chat/chat_pipeline_execution_dto.py +++ b/app/domain/chat/chat_pipeline_execution_dto.py @@ -2,16 +2,11 @@ from pydantic import Field -from app.domain import PipelineExecutionDTO, PipelineExecutionSettingsDTO +from app.domain import PipelineExecutionDTO from app.domain.pyris_message import PyrisMessage from app.domain.data.user_dto import UserDTO -from app.domain.status.stage_dto import StageDTO class ChatPipelineExecutionDTO(PipelineExecutionDTO): chat_history: List[PyrisMessage] = Field(alias="chatHistory", default=[]) user: Optional[UserDTO] - settings: Optional[PipelineExecutionSettingsDTO] - initial_stages: Optional[List[StageDTO]] = Field( - default=None, alias="initialStages" - ) diff --git a/app/domain/competency_extraction_pipeline_execution_dto.py b/app/domain/competency_extraction_pipeline_execution_dto.py new file mode 100644 index 00000000..05a88167 --- /dev/null +++ b/app/domain/competency_extraction_pipeline_execution_dto.py @@ -0,0 +1,22 @@ +from typing import List + +from pydantic import Field, BaseModel + +from . import PipelineExecutionDTO +from .data.competency_dto import CompetencyTaxonomy, Competency + + +class CompetencyExtractionPipelineExecutionDTO(BaseModel): + execution: PipelineExecutionDTO + course_description: str = Field(alias="courseDescription") + current_competencies: list[Competency] = Field( + alias="currentCompetencies", default=[] + ) + taxonomy_options: List[CompetencyTaxonomy] = Field( + alias="taxonomyOptions", default=[] + ) + max_n: int = Field( + alias="maxN", + description="Maximum number of competencies to extract from the course description", + default=10, + ) diff --git a/app/domain/data/competency_dto.py b/app/domain/data/competency_dto.py index 0e2c697c..9561d0c1 100644 --- a/app/domain/data/competency_dto.py +++ b/app/domain/data/competency_dto.py @@ -3,6 +3,7 @@ from typing import Optional from pydantic import BaseModel, Field +from pydantic.v1 import validator class CompetencyTaxonomy(str, Enum): @@ -21,3 +22,29 @@ class CompetencyDTO(BaseModel): taxonomy: Optional[CompetencyTaxonomy] = None soft_due_date: Optional[datetime] = Field(default=None, alias="softDueDate") optional: Optional[bool] = None + + +class Competency(BaseModel): + title: str = Field( + description="Title of the competency that contains no more than 4 words", + ) + description: str = Field( + description="Description of the competency as plain string. DO NOT RETURN A LIST OF STRINGS." + ) + taxonomy: CompetencyTaxonomy = Field( + description="Selected taxonomy based on bloom's taxonomy" + ) + + @validator("title") + def validate_title(cls, field): + """Validate the subject of the competency.""" + if len(field.split()) > 4: + raise ValueError("Title must contain no more than 4 words") + return field + + @validator("taxonomy") + def validate_selected_taxonomy(cls, field): + """Validate the selected taxonomy.""" + if field not in CompetencyTaxonomy.__members__: + raise ValueError(f"Invalid taxonomy: {field}") + return field diff --git a/app/domain/ingestion/ingestion_pipeline_execution_dto.py b/app/domain/ingestion/ingestion_pipeline_execution_dto.py index 393767e8..e8a9882f 100644 --- a/app/domain/ingestion/ingestion_pipeline_execution_dto.py +++ b/app/domain/ingestion/ingestion_pipeline_execution_dto.py @@ -1,17 +1,12 @@ -from typing import List, Optional +from typing import List from pydantic import Field -from app.domain import PipelineExecutionDTO, PipelineExecutionSettingsDTO +from app.domain import PipelineExecutionDTO from app.domain.data.lecture_unit_dto import LectureUnitDTO -from app.domain.status.stage_dto import StageDTO class IngestionPipelineExecutionDto(PipelineExecutionDTO): lecture_units: List[LectureUnitDTO] = Field( ..., alias="pyrisLectureUnitWebhookDTOS" ) - settings: Optional[PipelineExecutionSettingsDTO] - initial_stages: Optional[List[StageDTO]] = Field( - default=None, alias="initialStages" - ) diff --git a/app/domain/pipeline_execution_dto.py b/app/domain/pipeline_execution_dto.py index 86299d40..fb447369 100644 --- a/app/domain/pipeline_execution_dto.py +++ b/app/domain/pipeline_execution_dto.py @@ -1,8 +1,16 @@ -from pydantic import BaseModel +from typing import Optional + +from pydantic import BaseModel, Field + +from app.domain.pipeline_execution_settings_dto import PipelineExecutionSettingsDTO +from app.domain.status.stage_dto import StageDTO class PipelineExecutionDTO(BaseModel): - pass + settings: Optional[PipelineExecutionSettingsDTO] + initial_stages: Optional[list[StageDTO]] = Field( + default=None, alias="initialStages" + ) class Config: populate_by_name = True diff --git a/app/domain/status/competency_extraction_status_update_dto.py b/app/domain/status/competency_extraction_status_update_dto.py new file mode 100644 index 00000000..e71f2bdf --- /dev/null +++ b/app/domain/status/competency_extraction_status_update_dto.py @@ -0,0 +1,6 @@ +from app.domain.data.competency_dto import Competency +from app.domain.status.status_update_dto import StatusUpdateDTO + + +class CompetencyExtractionStatusUpdateDTO(StatusUpdateDTO): + result: list[Competency] = [] diff --git a/app/llm/external/openai_chat.py b/app/llm/external/openai_chat.py index a05c49ac..27e2d080 100644 --- a/app/llm/external/openai_chat.py +++ b/app/llm/external/openai_chat.py @@ -7,7 +7,6 @@ from openai import OpenAI from openai.lib.azure import AzureOpenAI from openai.types.chat import ChatCompletionMessage, ChatCompletionMessageParam -from openai.types.chat.completion_create_params import ResponseFormat from openai.types.shared_params import ResponseFormatJSONObject from ...common.message_converters import map_str_to_role, map_role_to_str diff --git a/app/pipeline/__init__.py b/app/pipeline/__init__.py index 13980f8d..c9faeebb 100644 --- a/app/pipeline/__init__.py +++ b/app/pipeline/__init__.py @@ -1 +1 @@ -from ..pipeline.pipeline import Pipeline +from app.pipeline.pipeline import Pipeline diff --git a/app/pipeline/chat/course_chat_pipeline.py b/app/pipeline/chat/course_chat_pipeline.py index 42a046b0..17aca74a 100644 --- a/app/pipeline/chat/course_chat_pipeline.py +++ b/app/pipeline/chat/course_chat_pipeline.py @@ -266,7 +266,8 @@ def get_competency_list() -> list: def lecture_content_retrieval() -> str: """ Retrieve content from indexed lecture slides. - This will run a RAG retrieval based on the chat history on the indexed lecture slides and return the most relevant paragraphs. + This will run a RAG retrieval based on the chat history on the indexed lecture slides and return the most + relevant paragraphs. Use this if you think it can be useful to answer the student's question, or if the student explicitly asks a question about the lecture content or slides. Only use this once. diff --git a/app/pipeline/competency_extraction_pipeline.py b/app/pipeline/competency_extraction_pipeline.py new file mode 100644 index 00000000..da224ffe --- /dev/null +++ b/app/pipeline/competency_extraction_pipeline.py @@ -0,0 +1,96 @@ +import logging +from typing import Optional + +from langchain.output_parsers import PydanticOutputParser +from langchain_core.prompts import ( + ChatPromptTemplate, +) + +from app.domain import ( + CompetencyExtractionPipelineExecutionDTO, + PyrisMessage, + IrisMessageRole, +) +from app.domain.data.text_message_content_dto import TextMessageContentDTO +from app.domain.data.competency_dto import Competency +from app.llm import CapabilityRequestHandler, RequirementList, CompletionArguments +from app.pipeline import Pipeline +from app.web.status.status_update import CompetencyExtractionCallback +from app.pipeline.prompts.competency_extraction import system_prompt + +logger = logging.getLogger(__name__) + + +class CompetencyExtractionPipeline(Pipeline): + callback: CompetencyExtractionCallback + request_handler: CapabilityRequestHandler + output_parser: PydanticOutputParser + + def __init__(self, callback: Optional[CompetencyExtractionCallback] = None): + super().__init__( + implementation_id="competency_extraction_pipeline_reference_impl" + ) + self.callback = callback + self.request_handler = CapabilityRequestHandler(requirements=RequirementList()) + self.output_parser = PydanticOutputParser(pydantic_object=Competency) + + def __call__( + self, + dto: CompetencyExtractionPipelineExecutionDTO, + prompt: Optional[ChatPromptTemplate] = None, + **kwargs, + ): + if not dto.course_description: + raise ValueError("Course description is required") + if not dto.taxonomy_options: + raise ValueError("Taxonomy options are required") + if not dto.max_n: + raise ValueError("Non-zero max_n is required") + + taxonomy_options = ", ".join(dto.taxonomy_options) + current_competencies = "\n\n".join( + [c.model_dump_json(indent=4) for c in dto.current_competencies] + ) + if current_competencies: + current_competencies = ( + f"\nHere are the current competencies in the course:\n{current_competencies}\n" + f"Do not repeat these competencies.\n" + ) + + prompt = system_prompt.format( + taxonomy_list=taxonomy_options, + course_description=dto.course_description, + max_n=dto.max_n, + current_competencies=current_competencies, + ) + prompt = PyrisMessage( + sender=IrisMessageRole.SYSTEM, + contents=[TextMessageContentDTO(text_content=prompt)], + ) + + response = self.request_handler.chat( + [prompt], CompletionArguments(temperature=0.4) + ) + response = response.contents[0].text_content + + generated_competencies: list[Competency] = [] + + # Find all competencies in the response up to the max_n + competencies = response.split("\n\n")[: dto.max_n] + for i, competency in enumerate(competencies): + logger.debug(f"Processing competency {i + 1}: {competency}") + if "{" not in competency or "}" not in competency: + logger.debug("Skipping competency without JSON") + continue + # Get the competency JSON object + start = competency.index("{") + end = competency.index("}") + 1 + competency = competency[start:end] + try: + competency = self.output_parser.parse(competency) + except Exception as e: + logger.debug(f"Error parsing competency: {e}") + continue + logger.debug(f"Generated competency: {competency}") + generated_competencies.append(competency) + self.callback.done(final_result=generated_competencies) diff --git a/app/pipeline/prompts/competency_extraction.py b/app/pipeline/prompts/competency_extraction.py new file mode 100644 index 00000000..4d87b6d4 --- /dev/null +++ b/app/pipeline/prompts/competency_extraction.py @@ -0,0 +1,44 @@ +system_prompt = """ +You are an expert in all topics of computer science and its practical applications. +Your task consists of three parts: +1. Read the provided curriculum description a university course. +2. Extract all learning goals ("competencies") from the course description. + +Each competency must contain the following fields: + +- title: +The title of the competency, which is a specific topic or skill. This should be a short phrase of at most 4 words. + +- description: +A detailed description of the competency in 2 to 5 bullet points. +Each bullet point illustrates a specific skill or concept of the competency. +Each bullet point is a complete sentence containing at most 15 words. +Each bullet point is on a new line and starts with "- ". + +- taxonomy: +The classification of the competency within Bloom's taxonomy. +You must choose from these options in Bloom's taxonomy: {taxonomy_list} + +All competencies must meet the following requirements: + +- is mentioned in the course description. +- corresponds to exactly one subject or skill covered in the course description. +- is assigned to exactly one level of Bloom's taxonomy. +- is small and fine-grained. Large topics should be broken down into smaller competencies. +- does not overlap with other competencies: each competency is unique. Expanding on a previous competency is allowed. + +Here is the provided course description: {course_description} + +Here is a template competency in JSON format: + +{{ + "title": "Competency Title", + "description": "- You understand this.\n- You are proficient in doing that.\n- You know how to do this.", + "taxonomy": "ANALYZE" +}} + +{current_competencies} + +Respond with 0 to {max_n} competencies extracted from the course description, +each in JSON format, split by two newlines. +""" diff --git a/app/web/routers/pipelines.py b/app/web/routers/pipelines.py index 7ac9d3da..f92f0d68 100644 --- a/app/web/routers/pipelines.py +++ b/app/web/routers/pipelines.py @@ -9,14 +9,17 @@ from app.domain import ( ExerciseChatPipelineExecutionDTO, CourseChatPipelineExecutionDTO, + CompetencyExtractionPipelineExecutionDTO, ) from app.web.status.status_update import ( ExerciseChatStatusCallback, CourseChatStatusCallback, + CompetencyExtractionCallback, ) from app.pipeline.chat.course_chat_pipeline import CourseChatPipeline from app.pipeline.chat.exercise_chat_pipeline import ExerciseChatPipeline from app.dependencies import TokenValidator +from app.pipeline.competency_extraction_pipeline import CompetencyExtractionPipeline router = APIRouter(prefix="/api/v1/pipelines", tags=["pipelines"]) logger = logging.getLogger(__name__) @@ -86,6 +89,44 @@ def run_course_chat_pipeline(variant: str, dto: CourseChatPipelineExecutionDTO): thread.start() +def run_competency_extraction_pipeline_worker( + dto: CompetencyExtractionPipelineExecutionDTO, _variant: str +): + try: + callback = CompetencyExtractionCallback( + run_id=dto.execution.settings.authentication_token, + base_url=dto.execution.settings.artemis_base_url, + initial_stages=dto.execution.initial_stages, + ) + pipeline = CompetencyExtractionPipeline(callback=callback) + except Exception as e: + logger.error(f"Error preparing competency extraction pipeline: {e}") + logger.error(traceback.format_exc()) + capture_exception(e) + return + + try: + pipeline(dto=dto) + except Exception as e: + logger.error(f"Error running competency extraction pipeline: {e}") + logger.error(traceback.format_exc()) + callback.error("Fatal error.", exception=e) + + +@router.post( + "/competency-extraction/{variant}/run", + status_code=status.HTTP_202_ACCEPTED, + dependencies=[Depends(TokenValidator())], +) +def run_competency_extraction_pipeline( + variant: str, dto: CompetencyExtractionPipelineExecutionDTO +): + thread = Thread( + target=run_competency_extraction_pipeline_worker, args=(dto, variant) + ) + thread.start() + + @router.get("/{feature}") def get_pipeline(feature: str): """ diff --git a/app/web/status/status_update.py b/app/web/status/status_update.py index 533047ca..1f497f75 100644 --- a/app/web/status/status_update.py +++ b/app/web/status/status_update.py @@ -5,6 +5,9 @@ import requests from abc import ABC +from ...domain.status.competency_extraction_status_update_dto import ( + CompetencyExtractionStatusUpdateDTO, +) from ...domain.chat.course_chat.course_chat_status_update_dto import ( CourseChatStatusUpdateDTO, ) @@ -101,24 +104,19 @@ def done( If there is a next stage, set the current stage to the next stage. """ - if self.stage.state == StageStateEnum.IN_PROGRESS: - self.stage.state = StageStateEnum.DONE - self.stage.message = message - self.status.result = final_result - if hasattr(self.status, "suggestions"): - self.status.suggestions = suggestions - next_stage = self.get_next_stage() - if next_stage is not None: - self.stage = next_stage - if next_stage_message: - self.stage.message = next_stage_message - if start_next_stage: - self.stage.state = StageStateEnum.IN_PROGRESS - self.on_status_update() - else: - raise ValueError( - "Invalid state transition to done. current state is ", self.stage.state - ) + self.stage.state = StageStateEnum.DONE + self.stage.message = message + self.status.result = final_result + if hasattr(self.status, "suggestions"): + self.status.suggestions = suggestions + next_stage = self.get_next_stage() + if next_stage is not None: + self.stage = next_stage + if next_stage_message: + self.stage.message = next_stage_message + if start_next_stage: + self.stage.state = StageStateEnum.IN_PROGRESS + self.on_status_update() def error(self, message: str, exception=None): """ @@ -128,7 +126,6 @@ def error(self, message: str, exception=None): self.stage.state = StageStateEnum.ERROR self.stage.message = message self.status.result = None - self.stage.suggestions = None # Set all subsequent stages to SKIPPED if an error occurs rest_of_index = ( self.current_stage_index + 1 @@ -219,3 +216,24 @@ def __init__( status = ExerciseChatStatusUpdateDTO(stages=stages) stage = stages[current_stage_index] super().__init__(url, run_id, status, stage, current_stage_index) + + +class CompetencyExtractionCallback(StatusCallback): + def __init__( + self, + run_id: str, + base_url: str, + initial_stages: List[StageDTO], + ): + url = f"{base_url}/api/public/pyris/pipelines/competency-extraction/runs/{run_id}/status" + stages = initial_stages or [] + stages.append( + StageDTO( + weight=10, + state=StageStateEnum.NOT_STARTED, + name="Generating Competencies", + ) + ) + status = CompetencyExtractionStatusUpdateDTO(stages=stages) + stage = stages[-1] + super().__init__(url, run_id, status, stage, len(stages) - 1)