diff --git a/.env.example b/.env.example
new file mode 100644
index 00000000..a0f0c9bc
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,4 @@
+WEAVIATE_HOST=
+WEAVIATE_PORT=
+WEAVIATE_CLUSTER_URL=
+WEAVIATE_AUTH_KEY=
diff --git a/app/domain/data/lecture_unit_dto.py b/app/domain/data/lecture_unit_dto.py
index 3e7b4d74..8b123c1c 100644
--- a/app/domain/data/lecture_unit_dto.py
+++ b/app/domain/data/lecture_unit_dto.py
@@ -1,12 +1,18 @@
-from datetime import datetime
-from typing import Optional
-
 from pydantic import BaseModel, Field
 
 
 class LectureUnitDTO(BaseModel):
-    id: int
+    """
+    A lecture unit together with its PDF file and the lecture and course it
+    belongs to. Fields are populated through their camelCase aliases.
+    """
+
+    to_update: bool = Field(alias="toUpdate")
+    pdf_file_base64: str = Field(alias="pdfFile")
+    lecture_unit_id: int = Field(alias="lectureUnitId")
+    lecture_unit_name: str = Field(alias="lectureUnitName")
     lecture_id: int = Field(alias="lectureId")
-    release_date: Optional[datetime] = Field(alias="releaseDate", default=None)
-    name: Optional[str] = None
-    attachment_version: int = Field(alias="attachmentVersion")
+    lecture_name: str = Field(alias="lectureName")
+    course_id: int = Field(alias="courseId")
+    course_name: str = Field(alias="courseName")
+    course_description: str = Field(alias="courseDescription")
diff --git a/app/ingestion/__init__.py b/app/ingestion/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/app/ingestion/abstract_ingestion.py b/app/ingestion/abstract_ingestion.py
new file mode 100644
index 00000000..d78244f0
--- /dev/null
+++ b/app/ingestion/abstract_ingestion.py
@@ -0,0 +1,29 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict
+
+
+class AbstractIngestion(ABC):
+    """
+    Abstract class for ingesting repositories into a database.
+    """
+
+    @abstractmethod
+    def chunk_data(self, path: str) -> List[Dict[str, str]]:
+        """
+        Abstract method to chunk code files in the root directory.
+        """
+        pass
+
+    @abstractmethod
+    def ingest(self, path: str) -> bool:
+        """
+        Abstract method to ingest repositories into the database.
+        """
+        pass
+
+    @abstractmethod
+    def update(self, path: str):
+        """
+        Abstract method to update a repository in the database.
+        """
+        pass
diff --git a/app/pipeline/chat/tutor_chat_pipeline.py b/app/pipeline/chat/tutor_chat_pipeline.py
index 5f36b1b8..1117fb38 100644
--- a/app/pipeline/chat/tutor_chat_pipeline.py
+++ b/app/pipeline/chat/tutor_chat_pipeline.py
@@ -182,7 +182,7 @@ def _add_student_repository_to_prompt(
         for file in selected_files:
             if file in student_repository:
                 self.prompt += SystemMessagePromptTemplate.from_template(
-                    f"For reference, we have access to the student's '{file}' file:"
+                    f"For reference, we have access to the student's '{file}' file: "
                 )
                 self.prompt += HumanMessagePromptTemplate.from_template(
                     student_repository[file].replace("{", "{{").replace("}", "}}")
diff --git a/app/retrieval/__init__.py b/app/retrieval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/app/retrieval/abstract_retrieval.py b/app/retrieval/abstract_retrieval.py
new file mode 100644
index 00000000..8682d963
--- /dev/null
+++ b/app/retrieval/abstract_retrieval.py
@@ -0,0 +1,15 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+
+class AbstractRetrieval(ABC):
+    """
+    Abstract class for retrieving data from a database.
+    """
+
+    @abstractmethod
+    def retrieve(self, path: str, hybrid_factor: float, result_limit: int) -> List[str]:
+        """
+        Abstract method to retrieve data from the database.
+        """
+        pass
diff --git a/app/retrieval/lecture_retrieval.py b/app/retrieval/lecture_retrieval.py
new file mode 100644
index 00000000..11797930
--- /dev/null
+++ b/app/retrieval/lecture_retrieval.py
@@ -0,0 +1,55 @@
+from typing import List, Optional
+
+from weaviate import WeaviateClient
+from weaviate.classes.query import Filter
+
+from app.retrieval.abstract_retrieval import AbstractRetrieval
+from app.vector_database.lecture_schema import init_lecture_schema, LectureSchema
+
+
+class LectureRetrieval(AbstractRetrieval):
+    """
+    Class for retrieving lecture data from the database.
+    """
+
+    def __init__(self, client: WeaviateClient):
+        # Reuses the lecture collection, creating it if it does not exist yet.
+        self.collection = init_lecture_schema(client)
+
+    def retrieve(
+        self,
+        user_message: str,
+        hybrid_factor: float,
+        result_limit: int,
+        lecture_id: Optional[int] = None,
+        message_vector: Optional[List[float]] = None,
+    ) -> List[str]:
+        """
+        Run a hybrid search for user_message over the lecture collection.
+
+        hybrid_factor is passed as the hybrid alpha, result_limit caps the
+        number of hits, lecture_id (if given) restricts the search to one
+        lecture, and message_vector is passed as the query vector.
+
+        NOTE(review): the raw query response is returned, not a List[str];
+        the annotation mirrors AbstractRetrieval.retrieve - reconcile.
+        """
+        # Compare against None so that a lecture ID of 0 still filters.
+        lecture_filter = (
+            Filter.by_property(LectureSchema.LECTURE_ID.value).equal(lecture_id)
+            if lecture_id is not None
+            else None
+        )
+        response = self.collection.query.hybrid(
+            query=user_message,
+            filters=lecture_filter,
+            alpha=hybrid_factor,
+            vector=message_vector,
+            return_properties=[
+                LectureSchema.PAGE_TEXT_CONTENT.value,
+                LectureSchema.PAGE_IMAGE_DESCRIPTION.value,
+                LectureSchema.COURSE_NAME.value,
+            ],
+            limit=result_limit,
+        )
+        return response
diff --git a/app/retrieval/repositories_retrieval.py b/app/retrieval/repositories_retrieval.py
new file mode 100644
index 00000000..37920fac
--- /dev/null
+++ b/app/retrieval/repositories_retrieval.py
@@ -0,0 +1,60 @@
+from typing import List, Optional
+
+from weaviate import WeaviateClient
+from weaviate.classes.query import Filter
+
+from app.retrieval.abstract_retrieval import AbstractRetrieval
+from app.vector_database.repository_schema import (
+    init_repository_schema,
+    RepositorySchema,
+)
+
+
+class RepositoryRetrieval(AbstractRetrieval):
+    """
+    Class for retrieving repository code from the vector database.
+    """
+
+    def __init__(self, client: WeaviateClient):
+        # Reuses the repository collection, creating it if it does not exist yet.
+        self.collection = init_repository_schema(client)
+
+    def retrieve(
+        self,
+        user_message: str,
+        result_limit: int,
+        repository_id: Optional[int] = None,
+    ) -> List[str]:
+        """
+        Run a near-text search for user_message over the repository collection.
+
+        result_limit caps the number of hits and repository_id (if given)
+        restricts the search to one repository.
+
+        NOTE(review): the raw query response is returned, not a List[str], and
+        the signature differs from AbstractRetrieval.retrieve - reconcile.
+        NOTE(review): the collection is created with Vectorizer.none();
+        near_text presumably needs a server-side vectorizer - confirm.
+        """
+        # Compare against None so that a repository ID of 0 still filters.
+        repository_filter = (
+            Filter.by_property(RepositorySchema.REPOSITORY_ID.value).equal(
+                repository_id
+            )
+            if repository_id is not None
+            else None
+        )
+        response = self.collection.query.near_text(
+            # The client's keyword for the search text is `query`.
+            query=user_message,
+            filters=repository_filter,
+            return_properties=[
+                RepositorySchema.REPOSITORY_ID.value,
+                RepositorySchema.COURSE_ID.value,
+                RepositorySchema.CONTENT.value,
+                RepositorySchema.EXERCISE_ID.value,
+                RepositorySchema.FILEPATH.value,
+            ],
+            limit=result_limit,
+        )
+        return response
diff --git a/app/vector_database/__init__.py b/app/vector_database/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/app/vector_database/database.py b/app/vector_database/database.py
new file mode 100644
index 00000000..f670c372
--- /dev/null
+++ b/app/vector_database/database.py
@@ -0,0 +1,55 @@
+import logging
+import os
+
+import weaviate
+import weaviate.classes as wvc
+
+from .lecture_schema import init_lecture_schema
+from .repository_schema import init_repository_schema
+
+logger = logging.getLogger(__name__)
+
+
+class VectorDatabase:
+    """
+    Class to interact with the Weaviate vector database
+    """
+
+    def __init__(self):
+        # Connection settings are read from the environment; see .env.example.
+        self.client = weaviate.connect_to_wcs(
+            cluster_url=os.getenv("WEAVIATE_CLUSTER_URL"),
+            auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WEAVIATE_AUTH_KEY")),
+        )
+        self.repositories = init_repository_schema(self.client)
+        self.lectures = init_lecture_schema(self.client)
+
+    def __del__(self):
+        # __del__ also runs when __init__ raised before self.client was set.
+        client = getattr(self, "client", None)
+        if client is not None:
+            client.close()
+
+    def delete_collection(self, collection_name):
+        """
+        Delete a collection from the database, if it exists
+        """
+        if not self.client.collections.exists(collection_name):
+            return
+        # delete() reports nothing through its return value, so the outcome
+        # is checked by asking whether the collection is still there.
+        self.client.collections.delete(collection_name)
+        if self.client.collections.exists(collection_name):
+            logger.error(f"Collection {collection_name} failed to delete")
+        else:
+            logger.info(f"Collection {collection_name} deleted")
+
+    def delete_object(self, collection_name, property_name, object_property):
+        """
+        Delete every object in the collection whose property_name equals
+        object_property
+        """
+        collection = self.client.collections.get(collection_name)
+        collection.data.delete_many(
+            where=wvc.query.Filter.by_property(property_name).equal(object_property)
+        )
diff --git a/app/vector_database/lecture_schema.py b/app/vector_database/lecture_schema.py
new file mode 100644
index 00000000..22616f1c
--- /dev/null
+++ b/app/vector_database/lecture_schema.py
@@ -0,0 +1,97 @@
+from enum import Enum
+
+from weaviate.classes.config import Property
+from weaviate import WeaviateClient
+from weaviate.collections import Collection
+from weaviate.collections.classes.config import Configure, VectorDistances, DataType
+
+
+class LectureSchema(Enum):
+    """
+    Schema for the lecture slides
+    """
+
+    COLLECTION_NAME = "LectureSlides"
+    COURSE_NAME = "course_name"
+    COURSE_DESCRIPTION = "course_description"
+    COURSE_ID = "course_id"
+    LECTURE_ID = "lecture_id"
+    LECTURE_NAME = "lecture_name"
+    LECTURE_UNIT_ID = "lecture_unit_id"
+    LECTURE_UNIT_NAME = "lecture_unit_name"
+    PAGE_TEXT_CONTENT = "page_text_content"
+    PAGE_IMAGE_DESCRIPTION = "page_image_explanation"
+    PAGE_BASE64 = "page_base64"
+    PAGE_NUMBER = "page_number"
+
+
+def init_lecture_schema(client: WeaviateClient) -> Collection:
+    """
+    Initialize the schema for the lecture slides
+    """
+    if client.collections.exists(LectureSchema.COLLECTION_NAME.value):
+        return client.collections.get(LectureSchema.COLLECTION_NAME.value)
+    return client.collections.create(
+        name=LectureSchema.COLLECTION_NAME.value,
+        vectorizer_config=Configure.Vectorizer.none(),
+        vector_index_config=Configure.VectorIndex.hnsw(
+            distance_metric=VectorDistances.COSINE
+        ),
+        properties=[
+            Property(
+                name=LectureSchema.COURSE_ID.value,
+                description="The ID of the course",
+                data_type=DataType.INT,
+            ),
+            Property(
+                name=LectureSchema.COURSE_NAME.value,
+                description="The name of the course",
+                data_type=DataType.TEXT,
+            ),
+            Property(
+                name=LectureSchema.COURSE_DESCRIPTION.value,
+                description="The description of the COURSE",
+                data_type=DataType.TEXT,
+            ),
+            Property(
+                name=LectureSchema.LECTURE_ID.value,
+                description="The ID of the lecture",
+                data_type=DataType.INT,
+            ),
+            Property(
+                name=LectureSchema.LECTURE_NAME.value,
+                description="The name of the lecture",
+                data_type=DataType.TEXT,
+            ),
+            Property(
+                name=LectureSchema.LECTURE_UNIT_ID.value,
+                description="The ID of the lecture unit",
+                data_type=DataType.INT,
+            ),
+            Property(
+                name=LectureSchema.LECTURE_UNIT_NAME.value,
+                description="The name of the lecture unit",
+                data_type=DataType.TEXT,
+            ),
+            Property(
+                name=LectureSchema.PAGE_TEXT_CONTENT.value,
+                description="The original text content from the slide",
+                data_type=DataType.TEXT,
+            ),
+            Property(
+                name=LectureSchema.PAGE_IMAGE_DESCRIPTION.value,
+                description="The description of the slide if the slide contains an image",
+                data_type=DataType.TEXT,
+            ),
+            Property(
+                name=LectureSchema.PAGE_BASE64.value,
+                description="The base64 encoded image of the slide if the slide contains an image",
+                data_type=DataType.TEXT,
+            ),
+            Property(
+                name=LectureSchema.PAGE_NUMBER.value,
+                description="The page number of the slide",
+                data_type=DataType.INT,
+            ),
+        ],
+    )
diff --git a/app/vector_database/repository_schema.py b/app/vector_database/repository_schema.py
new file mode 100644
index 00000000..cb288713
--- /dev/null
+++ b/app/vector_database/repository_schema.py
@@ -0,0 +1,60 @@
+from enum import Enum
+from weaviate.classes.config import Property
+from weaviate import WeaviateClient
+from weaviate.collections import Collection
+from weaviate.collections.classes.config import Configure, VectorDistances, DataType
+
+
+class RepositorySchema(Enum):
+    """
+    Schema for the student repository
+    """
+
+    COLLECTION_NAME = "StudentRepository"
+    CONTENT = "content"
+    COURSE_ID = "course_id"
+    EXERCISE_ID = "exercise_id"
+    REPOSITORY_ID = "repository_id"
+    FILEPATH = "filepath"
+
+
+def init_repository_schema(client: WeaviateClient) -> Collection:
+    """
+    Initialize the schema for the student repository
+    """
+    if client.collections.exists(RepositorySchema.COLLECTION_NAME.value):
+        return client.collections.get(RepositorySchema.COLLECTION_NAME.value)
+    return client.collections.create(
+        name=RepositorySchema.COLLECTION_NAME.value,
+        vectorizer_config=Configure.Vectorizer.none(),
+        vector_index_config=Configure.VectorIndex.hnsw(
+            distance_metric=VectorDistances.COSINE
+        ),
+        properties=[
+            Property(
+                name=RepositorySchema.CONTENT.value,
+                description="The content of this chunk of code",
+                data_type=DataType.TEXT,
+            ),
+            Property(
+                name=RepositorySchema.COURSE_ID.value,
+                description="The ID of the course",
+                data_type=DataType.INT,
+            ),
+            Property(
+                name=RepositorySchema.EXERCISE_ID.value,
+                description="The ID of the exercise",
+                data_type=DataType.INT,
+            ),
+            Property(
+                name=RepositorySchema.REPOSITORY_ID.value,
+                description="The ID of the repository",
+                data_type=DataType.INT,
+            ),
+            Property(
+                name=RepositorySchema.FILEPATH.value,
+                description="The filepath of the code",
+                data_type=DataType.TEXT,
+            ),
+        ],
+    )
diff --git a/requirements.txt b/requirements.txt
index d86942af..78c7b582 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,8 @@ ollama==0.1.9
 openai==1.25.2
 pre-commit==3.7.0
 pydantic==2.7.1
+PyMuPDF==1.23.22
 PyYAML==6.0.1
+requests~=2.31.0
 uvicorn==0.29.0
-requests~=2.31.0
\ No newline at end of file
+weaviate-client==4.5.4