ls1intum · Hialus · Feb 14, 2024 · Feb 14, 2024 · Feb 15, 2024 · Feb 15, 2024
@@ -0,0 +1,2 @@
+WEAVIATE_HOST=
+WEAVIATE_PORT=
@@ -0,0 +1,29 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict
+
+
+class AbstractIngestion(ABC):
+    """
+    Interface that every ingestion pipeline has to implement.
+
+    Concrete subclasses provide chunking, ingestion and update logic
+    for one kind of source material.
+    """
+
+    @abstractmethod
+    def chunk_data(self, path: str) -> List[Dict[str, str]]:
+        """
+        Split the material found at ``path`` into chunks.
+
+        :param path: location of the material to chunk
+        :return: one dictionary per chunk
+        """
+
+    @abstractmethod
+    def ingest(self, path: str) -> bool:
+        """
+        Ingest the material found at ``path`` into the database.
+
+        :param path: location of the material to ingest
+        :return: a boolean status flag
+        """
+
+    @abstractmethod
+    def update(self, path: str):
+        """
+        Update material that already exists in the database.
+
+        :param path: location of the material to update
+        """
@@ -0,0 +1,73 @@
+import base64
+from typing import Dict
+import fitz
+import weaviate
+from app.vector_database.lectureschema import init_lecture_schema, LectureSchema
+from ..Ingestion.abstract_ingestion import AbstractIngestion
+from app.llm import BasicRequestHandler
+
+
+class LectureIngestion(AbstractIngestion):
+    """
+    Ingest lecture PDFs into the weaviate database, one object per page.
+    """
+
+    def __init__(self, client: weaviate.WeaviateClient):
+        # The value returned by init_lecture_schema is used as the target
+        # collection for all batch inserts below.
+        self.collection = init_lecture_schema(client)
+
+    def chunk_data(self, lecture_path: str):
+        """
+        Split the PDF at ``lecture_path`` into one property dictionary per page.
+
+        Pages that contain at least one image are additionally rendered to a
+        PNG and stored Base64-encoded; for all other pages the Base64 field
+        is an empty string.
+
+        :param lecture_path: path of the PDF file to read
+        :return: list of dictionaries keyed by the ``LectureSchema`` constants
+        """
+        data = []
+        # Use the document as a context manager so the file handle is
+        # released even if a page fails to load or render.
+        with fitz.open(lecture_path) as doc:
+            for page_num in range(doc.page_count):
+                page = doc.load_page(page_num)
+                img_base64 = ""
+                if page.get_images(full=True):
+                    # Render the whole page (not just the embedded image)
+                    # to PNG bytes and encode them as a Base64 string.
+                    img_bytes = page.get_pixmap().tobytes("png")
+                    img_base64 = base64.b64encode(img_bytes).decode("utf-8")
+                data.append(
+                    {
+                        LectureSchema.PAGE_TEXT_CONTENT: page.get_text(),
+                        # No image description is generated yet.
+                        LectureSchema.PAGE_IMAGE_DESCRIPTION: "",
+                        # Page numbers are stored 1-based.
+                        LectureSchema.PAGE_NUMBER: page_num + 1,
+                        LectureSchema.LECTURE_NAME: lecture_path,
+                        LectureSchema.PAGE_BASE64: img_base64,
+                    }
+                )
+        return data
+
+    def ingest(self, lecture_path, embedding_model: BasicRequestHandler = None) -> bool:
+        """
+        Chunk the lecture at ``lecture_path`` and store every page in weaviate.
+
+        :param lecture_path: path of the PDF file to ingest
+        :param embedding_model: handler whose ``embed`` method turns the page
+            text into a vector; required although it defaults to ``None``
+        :return: ``True`` once all pages have been handed to the batch
+        :raises ValueError: if no embedding model is supplied
+        """
+        if embedding_model is None:
+            raise ValueError("An embedding model is required to ingest a lecture.")
+        chunks = self.chunk_data(lecture_path)
+        with self.collection.batch.dynamic() as batch:
+            for chunk in chunks:
+                # Each chunk is the property dictionary itself, so the
+                # fields are read directly from it.
+                embed_chunk = embedding_model.embed(
+                    chunk[LectureSchema.PAGE_TEXT_CONTENT]
+                    + "\n"
+                    + chunk[LectureSchema.PAGE_IMAGE_DESCRIPTION]
+                )
+                batch.add_object(properties=chunk, vector=embed_chunk)
+        return True
+
+    def update(self, lecture: Dict[str, str]):
+        """
+        Update a lecture in the weaviate database.
+
+        Currently a no-op: no update logic has been implemented yet.
+        """
+        pass
@@ -0,0 +1,92 @@
+import os
+from abc import ABC
+
+import weaviate
+from langchain.text_splitter import (
+    Language,
+    RecursiveCharacterTextSplitter,
+)
+
+from app.llm import BasicRequestHandler
+from app.llm.langchain.iris_langchain_embedding_model import IrisLangchainEmbeddingModel
+from app.vector_database.repository_schema import (
+    init_repository_schema,
+    RepositorySchema,
+)
+from ..Ingestion.abstract_ingestion import AbstractIngestion
+
+# Chunk size passed to the text splitter for every repository file
+# (presumably measured in characters - confirm against the splitter's length function).
+CHUNKSIZE = 512
+# Overlap between consecutive chunks, passed to the splitter as chunk_overlap.
+OVERLAP = 51
+
+
+def split_code(code: str, language: Language, chunk_size: int, chunk_overlap: int):
+    """
+    Split ``code`` into documents using a language-aware recursive splitter.
+
+    :param code: source text to split
+    :param language: language whose separators the splitter should use
+    :param chunk_size: chunk size handed to the splitter
+    :param chunk_overlap: overlap between consecutive chunks
+    :return: the documents created by the splitter for ``code``
+    """
+    splitter_options = {
+        "language": language,
+        "chunk_size": chunk_size,
+        "chunk_overlap": chunk_overlap,
+    }
+    splitter = RecursiveCharacterTextSplitter.from_language(**splitter_options)
+    documents = splitter.create_documents([code])
+    return documents
+
+
+class RepositoryIngestion(AbstractIngestion, ABC):
+    """
+    Ingest the repositories into the weaviate database
+    """
+
+    def __init__(self, client: weaviate.WeaviateClient):
+        # The value returned by init_repository_schema is used as the target
+        # collection for all batch inserts below.
+        self.collection = init_repository_schema(client)
+        self.request_handler = BasicRequestHandler("gpt35")
+        self.iris_embedding_model = IrisLangchainEmbeddingModel(self.request_handler)
+
+    def chunk_data(self, path: str) -> list[dict[str, str]]:
+        """
+        Implementation of the abstract ``chunk_data`` hook.
+
+        Delegates to :meth:`chunk_files` so that the class fulfils the
+        ``AbstractIngestion`` interface and can be instantiated.
+        """
+        return self.chunk_files(path)
+
+    def chunk_files(self, path: str):
+        """
+        Chunk all ``.java`` files below ``path``.
+
+        :param path: root directory that is walked recursively
+        :return: list with one property dictionary per chunk, keyed by the
+            ``RepositorySchema`` constants
+        """
+        files_contents = []
+        for directory_path, _subdirs, files in os.walk(path):
+            for filename in files:
+                if filename.endswith(".java"):
+                    file_path = os.path.join(directory_path, filename)
+                    # Explicit encoding so the result does not depend on the
+                    # platform's default locale.
+                    with open(file_path, "r", encoding="utf-8") as file:
+                        code = file.read()
+                    files_contents.append(
+                        {
+                            # NOTE(review): only the bare file name is stored,
+                            # so equally named files in different directories
+                            # cannot be told apart - confirm this is intended.
+                            RepositorySchema.FILEPATH: filename,
+                            RepositorySchema.CONTENT: code,
+                        }
+                    )
+        # Collect the chunks in a separate list: appending to the list that is
+        # being iterated would make the loop re-split its own output forever.
+        chunked_contents = []
+        for file in files_contents:
+            chunks = split_code(
+                file[RepositorySchema.CONTENT], Language.JAVA, CHUNKSIZE, OVERLAP
+            )
+            for chunk in chunks:
+                chunked_contents.append(
+                    {
+                        RepositorySchema.CONTENT: chunk.page_content,
+                        # Placeholder values until the real ids are available.
+                        RepositorySchema.COURSE_ID: "tbd",
+                        RepositorySchema.EXERCISE_ID: "tbd",
+                        RepositorySchema.REPOSITORY_ID: "tbd",
+                        RepositorySchema.FILEPATH: file[RepositorySchema.FILEPATH],
+                    }
+                )
+        return chunked_contents
+
+    def ingest(self, repo_path: str) -> bool:
+        """
+        Chunk the repository at ``repo_path`` and store every chunk in weaviate.
+
+        :param repo_path: root directory of the repository
+        :return: ``True`` once all chunks have been handed to the batch
+        """
+        chunks = self.chunk_files(repo_path)
+        with self.collection.batch.dynamic() as batch:
+            for chunk in chunks:
+                # Each chunk is the property dictionary itself, so the
+                # content is read directly from it.
+                embed_chunk = self.iris_embedding_model.embed_query(
+                    chunk[RepositorySchema.CONTENT]
+                )
+                batch.add_object(properties=chunk, vector=embed_chunk)
+        return True
+
+    def update(self, repository: dict[str, str]):  # this is most likely not necessary
+        """
+        Update the repository in the weaviate database.
+
+        Currently a no-op.
+        """
+        pass
@@ -0,0 +1,15 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+
+class AbstractRetrieval(ABC):
+    """
+    Interface that every retrieval strategy has to implement.
+    """
+
+    @abstractmethod
+    def retrieve(self, path: str, hybrid_factor: float) -> List[str]:
+        """
+        Retrieve matching entries from the database.
+
+        :param path: first argument of the lookup, as declared by this interface
+        :param hybrid_factor: weighting factor for the lookup
+        :return: list of retrieved entries
+        """
@@ -0,0 +1,38 @@
+from abc import ABC
+from typing import List
+
+import weaviate
+import weaviate.classes as wvc
+
+from app.vector_database.lectureschema import init_lecture_schema, LectureSchema
+from ..Retrieval.abstract_retrieval import AbstractRetrieval
+
+
+class LectureRetrieval(AbstractRetrieval, ABC):
+    """
+    Class for retrieving lecture data from the database.
+    """
+
+    def __init__(self, client: weaviate.WeaviateClient):
+        # The value returned by init_lecture_schema is the collection that
+        # all queries below run against.
+        self.collection = init_lecture_schema(client)
+
+    def retrieve(
+        self,
+        user_message: str,
+        hybrid_factor: float,
+        lecture_id: int = None,
+        embedding_vector: List[float] = None,
+    ) -> List[dict]:
+        """
+        Run a hybrid query for ``user_message`` and return the best matches.
+
+        :param user_message: query text
+        :param hybrid_factor: passed to the query as ``alpha``
+        :param lecture_id: if given, restrict results to this lecture
+        :param embedding_vector: optional vector passed to the query
+        :return: the properties of at most three matching objects
+        """
+        # Compare against None explicitly so that an id of 0 still filters.
+        lecture_filter = None
+        if lecture_id is not None:
+            lecture_filter = wvc.query.Filter.by_property(
+                LectureSchema.LECTURE_ID
+            ).equal(lecture_id)
+        response = self.collection.query.hybrid(
+            query=user_message,
+            limit=3,
+            filters=lecture_filter,
+            alpha=hybrid_factor,
+            vector=embedding_vector,
+        )
+        return [obj.properties for obj in response.objects]
@@ -0,0 +1,44 @@
+import json
+from typing import List
+
+import weaviate
+
+from ...vector_database.repository_schema import (
+    RepositorySchema,
+    init_repository_schema,
+)
+
+from ..Retrieval.abstract_retrieval import AbstractRetrieval
+
+import weaviate.classes as wvc
+
+
+class RepositoryRetrieval(AbstractRetrieval):
+    """
+    Class for retrieving repository data from the database.
+    """
+
+    def __init__(self, client: weaviate.WeaviateClient):
+        # The value returned by init_repository_schema is the collection that
+        # all queries below run against.
+        self.collection = init_repository_schema(client)
+
+    def retrieve(self, user_message: str, repository_id: int = None) -> List[dict]:
+        """
+        Run a near-text query for ``user_message`` and return the best matches.
+
+        :param user_message: query text
+        :param repository_id: if given, restrict results to this repository
+        :return: the properties of at most five matching objects
+        """
+        # Compare against None explicitly so that an id of 0 still filters.
+        repository_filter = None
+        if repository_id is not None:
+            repository_filter = wvc.query.Filter.by_property(
+                RepositorySchema.REPOSITORY_ID
+            ).equal(repository_id)
+        # The client expects the search text under the ``query`` keyword.
+        response = self.collection.query.near_text(
+            query=user_message,
+            filters=repository_filter,
+            return_properties=[
+                RepositorySchema.REPOSITORY_ID,
+                RepositorySchema.COURSE_ID,
+                RepositorySchema.CONTENT,
+                RepositorySchema.EXERCISE_ID,
+                RepositorySchema.FILEPATH,
+            ],
+            limit=5,
+        )
+        # Return plain property dictionaries instead of the client's
+        # response object, matching the lecture retrieval.
+        return [obj.properties for obj in response.objects]
@@ -0,0 +1,22 @@
+import requests
+import tempfile
+
+# Chunk size (8 KiB) requested per iteration when streaming the download.
+DOWNLOAD_BUFFER_SIZE = 8 * 1024
+
+
+def download_lecture_pdf(
+    base_url: str, unit_id: int, timeout: float = 30
+) -> tempfile.NamedTemporaryFile:
+    """
+    Download a single lecture unit from Artemis
+
+    The PDF is streamed into a named temporary file that is returned still
+    open and rewound to its start. The caller owns the file: closing it
+    deletes it from disk.
+
+    :param base_url: base URL of the Artemis instance
+    :param unit_id: id of the lecture unit to download
+    :param timeout: seconds to wait for the server before giving up
+    :return: the open temporary file containing the PDF
+    :raises ConnectionError: if the server does not answer with status 200
+    """
+    artemis_url = f"{base_url}/api/v1/public/pyris/data/lecture-units/{unit_id}/pdf"
+    # The context manager releases the streamed connection when done.
+    with requests.get(artemis_url, stream=True, timeout=timeout) as response:
+        if response.status_code != 200:
+            raise ConnectionError(
+                f"Failed to download the file. Status code: {response.status_code}, URL: {artemis_url}"
+            )
+
+        # Deliberately not a ``with`` block: leaving it would close and
+        # delete the file before the caller could ever read it.
+        temp_file = tempfile.NamedTemporaryFile()
+        try:
+            for chunk in response.iter_content(chunk_size=DOWNLOAD_BUFFER_SIZE):
+                if chunk:
+                    temp_file.write(chunk)
+            # Make the content visible to readers that open the file by name
+            # and to readers that use the returned handle.
+            temp_file.flush()
+            temp_file.seek(0)
+        except BaseException:
+            # Do not leak the temporary file when the download fails.
+            temp_file.close()
+            raise
+        return temp_file
@@ -1,17 +1,13 @@
-from typing import List, Any
-
+from typing import List
 from langchain_core.embeddings import Embeddings
-
 from ...llm import RequestHandler
 
 
 class IrisLangchainEmbeddingModel(Embeddings):
     """Custom langchain embedding for our own request handler"""
 
-    request_handler: RequestHandler
-
-    def __init__(self, request_handler: RequestHandler, **kwargs: Any) -> None:
-        super().__init__(request_handler=request_handler, **kwargs)
+    def __init__(self, request_handler: RequestHandler) -> None:
+        self.request_handler = request_handler
 
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
         return [self.embed_query(text) for text in texts]