Add content_service, data ingester and vector repository subsystems

ls1intum · Feb 22, 2024 · b4cb05d · b4cb05d
1 parent 2c0793a
commit b4cb05d
Show file tree

Hide file tree

Showing 17 changed files with 280 additions and 81 deletions.
diff --git a/app/content_service/Ingestion/__init__.py b/app/content_service/Ingestion/__init__.py
diff --git a/app/content_service/Ingestion/abstract_ingestion.py b/app/content_service/Ingestion/abstract_ingestion.py
@@ -0,0 +1,30 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict
+from langchain.text_splitter import Language
+
+
+class AbstractIngestion(ABC):
+    """
+    Abstract class for ingesting repositories into a database.
+    """
+
+    @abstractmethod
+    def chunk_files(self, path: str) -> List[Dict[str, str]]:
+        """
+        Abstract method to chunk code files in the root directory.
+        """
+        pass
+
+    @abstractmethod
+    def ingest(self, path: str)-> bool:
+        """
+        Abstract method to ingest repositories into the database.
+        """
+        pass
+
+    @abstractmethod
+    def update(self, path: str):
+        """
+        Abstract method to update a repository in the database.
+        """
+        pass
diff --git a/app/content_service/Ingestion/lectures_ingestion.py b/app/content_service/Ingestion/lectures_ingestion.py
@@ -0,0 +1,28 @@
+from typing import List, Dict
+import weaviate
+
+from app.vector_repository.lecture_schema import init_schema
+from content_service.Ingestion.abstract_ingestion import AbstractIngestion
+
+
+class LectureIngestion(AbstractIngestion):  # Inherits from the abstract class
+
+    def __init__(self, client: weaviate.WeaviateClient):
+        self.collection = init_schema(client)
+
+    def chunk_files(self, path: str):
+        # Implement chunking logic here or raise NotImplementedError if not applicable
+        pass
+    def ingest(self, lecture_path)-> bool:
+        """
+        Ingest the lectures into the weaviate database
+        """
+        # Implement ingestion logic here
+        pass
+
+    def update(self, lecture: Dict[str, str]):
+        """
+        Update a lecture in the weaviate database
+        """
+        # Implement update logic here or raise NotImplementedError if not applicable
+        pass
diff --git a/app/content_service/Ingestion/repository_ingestion.py b/app/content_service/Ingestion/repository_ingestion.py
@@ -0,0 +1,81 @@
+
+import os
+import weaviate
+from app.data.repository_schema import init_schema, RepositoryChunk
+from langchain.text_splitter import (
+    Language,
+    RecursiveCharacterTextSplitter,
+)
+from app.llm.langchain.iris_langchain_embedding_model import IrisLangchainEmbeddingModel
+from app.llm import BasicRequestHandler
+from data.Ingestion.abstract_ingestion import AbstractIngestion
+
+CHUNKSIZE = 512
+OVERLAP = 51
+
+
+def split_code(code: str, language: Language, chunk_size: int, chunk_overlap: int):
+    """
+    Split the code into chunks of 1500 characters with an overlap of 100 characters
+    """
+    python_splitter = RecursiveCharacterTextSplitter.from_language(
+        language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    return python_splitter.create_documents(code)
+
+
+def chunk_files(path: str):
+    """
+    Chunk the code files in the root directory
+    """
+    files_contents = []
+    for directory_path, subdir, files in os.walk(path):
+        for filename in files:
+            if filename.endswith('.java'):
+                file_path = os.path.join(directory_path, filename)
+                with open(file_path, 'r') as file:
+                    code = file.read()
+                files_contents.append({RepositoryChunk.FILEPATH: filename, RepositoryChunk.CONTENT: code})
+    for file in files_contents:
+        chunks = split_code(file[RepositoryChunk.CONTENT], Language.JAVA, CHUNKSIZE, OVERLAP)
+        for chunk in chunks:
+            files_contents.append(
+                {
+                    RepositoryChunk.CONTENT: chunk.page_content,
+                    RepositoryChunk.COURSE_ID: "tbd",
+                    RepositoryChunk.EXERCISE_ID: "tbd",
+                    RepositoryChunk.REPOSITORY_ID: "tbd",
+                    RepositoryChunk.FILEPATH: file[RepositoryChunk.FILEPATH]
+                }
+            )
+    return files_contents
+
+
+class RepositoryIngestion(AbstractIngestion):
+    """
+    Ingest the repositories into the weaviate database
+    """
+    def __init__(self, client: weaviate.WeaviateClient):
+        self.collection = init_schema(client)
+        self.request_handler = BasicRequestHandler("gpt35")
+        self.iris_embedding_model = IrisLangchainEmbeddingModel(self.request_handler)
+
+    def ingest(self, repo_path) -> bool:
+        """
+        Ingest the repositories into the weaviate database
+        """
+        chunks = chunk_files(self, repo_path)
+        with self.collection.batch.dynamic() as batch:
+            for chunk in enumerate(chunks):
+                embed_chunk = self.iris_embedding_model.embed_query(chunk[RepositoryChunk.CONTENT])
+                batch.add_object(
+                    properties=chunk,
+                    vector=embed_chunk
+                )
+        return True
+
+    def update(self, repository: dict[str, str]):  # this is most likely not necessary
+        """
+        Update the repository in the weaviate database
+        """
+        pass
diff --git a/app/content_service/Retrieval/__init__.py b/app/content_service/Retrieval/__init__.py
diff --git a/app/content_service/Retrieval/abstract_retrieval.py b/app/content_service/Retrieval/abstract_retrieval.py
@@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict
+
+
+class AbstractRetrieval(ABC):
+    """
+    Abstract class for ingesting repositories into a database.
+    """
+
+    @abstractmethod
+    def retrieve(self, path: str) -> List[str]:
+        """
+        Abstract method to ingest repositories into the database.
+        """
+        pass
+
+    @abstractmethod
+    def get_collection(self, path: str):
+        """
+        Abstract method to update a repository in the database.
+        """
+        pass
diff --git a/app/data/lecture/lectures.py → ...nt_service/Retrieval/lecture_retrieval.py b/app/data/lecture/lectures.py → ...nt_service/Retrieval/lecture_retrieval.py
@@ -1,17 +1,23 @@
 import json
+from typing import List
+
 import weaviate
 import weaviate.classes as wvc
 
-from lecture_schema import init_schema, LectureSlideChunk
+from app.vector_repository.lecture_schema import init_schema, LectureSlideChunk
+from content_service.Retrieval.abstract_retrieval import AbstractRetrieval
+
 
-class Lectures:
+
+class LectureRetrieval(AbstractRetrieval):
+    """
+    Class for ingesting repositories into a database.
+    """
 
     def __init__(self, client: weaviate.WeaviateClient):
         self.collection = init_schema(client)
 
-    def ingest(self, lectures):
-        pass
-    def retrieve(self, user_message: str, lecture_id: int = None):
+    def retrieve(self, user_message: str, lecture_id: int = None) -> List[str]:
         response = self.collection.query.near_text(
             near_text=user_message,
             filters=(
@@ -29,3 +35,6 @@ def retrieve(self, user_message: str, lecture_id: int = None):
         )
         print(json.dumps(response, indent=2))
         return response
+
+    def get_collection(self, path: str):
+        pass
diff --git a/app/content_service/Retrieval/repositories_retrieval.py b/app/content_service/Retrieval/repositories_retrieval.py
@@ -0,0 +1,36 @@
+import json
+from typing import List
+
+from vector_repository.repository_schema import RepositoryChunk
+
+from content_service.Retrieval.abstract_retrieval import AbstractRetrieval
+
+import weaviate.classes as wvc
+
+
+class RepositoryRetrieval(AbstractRetrieval):
+    """
+    Class for Retrieving vector_repository for from the database.
+    """
+
+    def retrieve(self, user_message: str, repository_id: int = None) -> List[str]:
+        response = self.collection.query.near_text(
+            near_text=user_message,
+            filters=(
+                wvc.query.Filter.by_property(RepositoryChunk.LECTURE_ID).equal(
+                    repository_id
+                )
+                if repository_id
+                else None
+            ),
+            return_properties=[
+                RepositoryChunk.REPOSITORY_NAME,
+                RepositoryChunk.REPOSITORY_DESCRIPTION,
+            ],
+            limit=5,
+        )
+        print(json.dumps(response, indent=2))
+        return response
+
+    def get_collection(self, path: str):
+        pass
diff --git a/app/content_service/__init__.py b/app/content_service/__init__.py
diff --git a/app/data/repository/repositories.py b/app/data/repository/repositories.py
diff --git a/app/data_ingestion/__init__.py b/app/data_ingestion/__init__.py
diff --git a/app/data_ingestion/download_ingest_lecture.py b/app/data_ingestion/download_ingest_lecture.py
@@ -0,0 +1,31 @@
+import zipfile
+import requests
+import tempfile
+import os
+
+DOWNLOAD_BUFFER_SIZE = 8 * 1024
+
+
+# TODO: Get correct parameters here
+def download_lecture_pdf(base_url: str, course_id: int, lecture_id: int, lecture_unit_id: int) -> tempfile.NamedTemporaryFile:
+    """
+    Download a single lecture unit from Artemis
+    """
+    # Send a GET request to the URL TODO: Validate Artemis URL
+    artemis_url = f"{base_url}/iris/lecture-slides/{course_id}/{lecture_id}/{lecture_unit_id}"
+    response = requests.get(artemis_url, stream=True)
+    if response.status_code != 200:
+        print(f"Failed to download the file. Status code: {response.status_code}")
+        raise ConnectionError
+
+    # Place the PDF into a temporary file
+    temp_file = tempfile.NamedTemporaryFile()
+    for chunk in response.iter_content(chunk_size=DOWNLOAD_BUFFER_SIZE):
+        if chunk:  # filter out keep-alive new chunks
+            temp_file.write(chunk)
+
+    # Return the path to the temporary file.
+    # File should delete itself when it goes out of scope at the call site
+    return temp_file
+
+#CALL THE RIGHT PIPELINE FOR INGESTION OF LECTURE PDF THAT HAS IMAGE INTERPRETATION.
diff --git a/app/data_ingestion/download_ingest_repository.py b/app/data_ingestion/download_ingest_repository.py
@@ -0,0 +1,36 @@
+import os
+import tempfile
+import zipfile
+
+import requests
+DOWNLOAD_BUFFER_SIZE = 8 * 1024
+
+
+def download_repository_zip(url) -> tempfile.NamedTemporaryFile:
+    """
+    Downloads a zip file from a given URL and saves it to the specified path.
+
+    :param url: The URL of the zip file to download.
+    :param save_path: The path (including the file name) where the zip file will be saved.
+    """
+    response = requests.get(url, stream=True)
+    if response.status_code == 200:
+        # Open the file in binary write mode and write the content of the response
+        temp_file = tempfile.NamedTemporaryFile()
+        for chunk in response.iter_content(chunk_size=DOWNLOAD_BUFFER_SIZE):
+            if chunk:  # filter out keep-alive new chunks
+                temp_file.write(chunk)
+        # Return the path to the temporary file.
+        # File should delete itself when it goes out of scope at the call site
+        return temp_file
+
+
+def unzip(zip_file_path: str, directory_to: str):
+    """
+    Extracts the zip file to the specified directory.
+    """
+    # Open the zip file in read mode and extract all contents
+    with zipfile.ZipFile(zip_file_path) as zip_ref:
+        zip_ref.extractall(directory_to)
+
+#CALL THE RIGHT PIPELINE FOR INGESTION OF CODE (CHUNK THE CODE THEN GET A DESCRIPTION OF THE CODE, THEN EMBED IT AND SAVE IT IN THE DB)
diff --git a/app/vector_repository/__init__.py b/app/vector_repository/__init__.py
diff --git a/app/data/db.py → app/vector_repository/db.py b/app/data/db.py → app/vector_repository/db.py
diff --git a/app/data/lecture/lecture_schema.py → app/vector_repository/lecture_schema.py b/app/data/lecture/lecture_schema.py → app/vector_repository/lecture_schema.py
@@ -29,7 +29,7 @@ def init_schema(client: WeaviateClient) -> Collection:
     return client.collections.create(
         name=COLLECTION_NAME,
         vectorizer_config=wvc.config.Configure.Vectorizer.none(),  # We do not want to vectorize the text automatically
-        # HNSW is preferred over FLAT for large amounts of data, which is the case here
+        # HNSW is preferred over FLAT for large amounts of vector_repository, which is the case here
         vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
             distance_metric=wvc.config.VectorDistances.COSINE  # select preferred distance metric
         ),