Skip to content

Commit

Permalink
Use cloud cluster for weaviate for now for the hackathon
Browse files Browse the repository at this point in the history
Postpone the ingestion methods for the lectures until we know the format of the lectures;
first basic implementation of the ingest and retrieve methods for the code.
  • Loading branch information
yassinsws committed Feb 21, 2024
1 parent 128ea40 commit 70ed83f
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 71 deletions.
14 changes: 10 additions & 4 deletions app/data/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

class VectorDatabase:
def __init__(self):
weaviate_host = os.getenv("WEAVIATE_HOST")
"""weaviate_host = os.getenv("WEAVIATE_HOST")
weaviate_port = os.getenv("WEAVIATE_PORT")
assert weaviate_host, "WEAVIATE_HOST environment variable must be set"
assert weaviate_port, "WEAVIATE_PORT environment variable must be set"
Expand All @@ -16,10 +16,16 @@ def __init__(self):
), "WEAVIATE_PORT environment variable must be an integer"
self._client = weaviate.connect_to_local(
host=weaviate_host, port=int(weaviate_port)
)"""
# Connect to the Weaviate Cloud Service until we set up a proper docker for this project
client = weaviate.connect_to_wcs(
cluster_url=os.getenv("https://try-repository-pipeline-99b1nlo4.weaviate.network"), # Replace with your WCS URL
auth_credentials=weaviate.auth.AuthApiKey(os.getenv("2IPqwB6mwGMIs92UJ3StB0Wovj0MquBxs9Ql")) # Replace with your WCS key
)
self.repositories = Repositories(self._client)
self.lectures = Lectures(self._client)
print(client.is_ready())
self.repositories = Repositories(self.client)
self.lectures = Lectures(self.client)

def __del__(self):
    """Close the connection to Weaviate when the object is garbage-collected."""
    # Bug fix: the code closed self.client, but the connection is stored on
    # self._client. The getattr guard also keeps __del__ from raising
    # AttributeError when __init__ failed before the client was assigned.
    client = getattr(self, "_client", None)
    if client is not None:
        client.close()
60 changes: 1 addition & 59 deletions app/data/lecture/lectures.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,8 @@
import json
import os
import time

import fitz # PyMuPDF
import openai
import weaviate
from unstructured.cleaners.core import clean
import weaviate.classes as wvc

from data.lecture.lecture_schema import init_schema, COLLECTION_NAME, LectureSlideChunk


def chunk_files(subdirectory_path, subdirectory):
    """Extract and clean the text of every page of every PDF in the directory.

    Returns one LectureSlideChunk-shaped dict per page; only the page content
    and the filename are filled in at this stage — the course/lecture metadata
    fields are left empty.
    """
    chunks = []
    for filename in os.listdir(subdirectory_path):
        if not filename.endswith(".pdf"):
            continue  # skip anything that is not a PDF
        file_path = os.path.join(subdirectory_path, filename)
        # fitz (PyMuPDF) documents are iterable over their pages
        with fitz.open(file_path) as doc:
            for page in doc:
                page_text = clean(
                    page.get_text(), bullets=True, extra_whitespace=True
                )
                chunks.append(
                    {
                        LectureSlideChunk.PAGE_CONTENT: page_text,
                        LectureSlideChunk.COURSE_ID: "",
                        LectureSlideChunk.LECTURE_ID: "",
                        LectureSlideChunk.LECTURE_NAME: "",
                        LectureSlideChunk.LECTURE_UNIT_ID: "",
                        LectureSlideChunk.LECTURE_UNIT_NAME: "",
                        LectureSlideChunk.FILENAME: file_path,
                        LectureSlideChunk.PAGE_NUMBER: "",
                    }
                )
    return chunks

from lecture_schema import init_schema, LectureSlideChunk

class Lectures:

Expand All @@ -45,30 +11,6 @@ def __init__(self, client: weaviate.WeaviateClient):

def ingest(self, lectures):
    """Ingest lecture slides into the Weaviate collection (not yet implemented)."""
    pass

def search(self, query, k=3, filter=None):
    """Search the lecture collection for `query` (not yet implemented).

    :param k: number of results to return
    :param filter: optional filter to restrict the search
    """
    pass

def batch_import(self, directory_path, subdirectory):
    """Chunk every PDF under directory_path and import the chunks into Weaviate.

    Retries each object up to 5 times with exponential backoff when the
    embedding provider rate-limits us.

    :raises RuntimeError: if an object still fails after all retries.
    """
    data = chunk_files(directory_path, subdirectory)
    with self.collection.batch.dynamic() as batch:
        # The index from enumerate() was unused — iterate the data directly.
        for properties in data:
            # The original success-flag/break bookkeeping collapses to a
            # plain for/else retry loop with identical behavior.
            for attempt in range(5):  # max 5 retries
                try:
                    # NOTE(review): add_data_object(properties, COLLECTION_NAME)
                    # is the weaviate v3 batch signature, but batch.dynamic()
                    # above is the v4 API (which exposes add_object) — confirm
                    # the client version in use.
                    batch.add_data_object(properties, COLLECTION_NAME)
                    break  # imported successfully
                except openai.error.RateLimitError:
                    time.sleep(2**attempt)  # wait 2^attempt seconds before retrying
                    print("Retrying import...")
            else:
                # for/else: every retry raised — give up on this import.
                raise RuntimeError("Failed to create embeddings.")

def retrieve(self, user_message: str, lecture_id: int = None):
response = self.collection.query.near_text(
near_text=user_message,
Expand Down
67 changes: 59 additions & 8 deletions app/data/repository/repositories.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,72 @@
import os
import weaviate

from data.repository.repository_schema import init_schema
from repository_schema import init_schema, RepositoryChunk
from langchain.text_splitter import (
Language,
RecursiveCharacterTextSplitter,
)


class Repositories:

def __init__(self, client: weaviate.WeaviateClient):
    """Initialize (or fetch) the repository collection schema on the given client."""
    self.collection = init_schema(client)

def retrieve(self, question:str):
    """Retrieve repository chunks relevant to `question` (not yet implemented)."""
    pass
def split_code(self, code: list[str], language: Language):
    """Split source files into chunks of 1500 characters with 100-character overlap.

    :param code: list of file contents to split
    :param language: language whose syntax guides the split points
    :return: list of langchain Document chunks
    """
    # Annotation fix: `[str]` is a one-element list literal, not a type;
    # the intended annotation is list[str].
    # Renamed from python_splitter: the splitter is built for whatever
    # language is passed in, not only Python.
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=language, chunk_size=1500, chunk_overlap=100
    )
    return splitter.create_documents(code)

def ingest(self, repositories: dict[str, str]):
    """Ingest the given repositories into the collection (not yet implemented)."""
    pass
def chunk_files(self, files: list[dict[str, str]]):
    """Chunk the content of each repository file into RepositoryChunk dicts.

    Each chunk inherits the course/exercise/repository/filepath metadata of
    the file it came from.

    :param files: RepositoryChunk-shaped dicts holding whole-file contents
    :return: list of RepositoryChunk-shaped dicts, one per chunk
    """
    # (Removed the commented-out os.walk prototype that predated the
    # dict-based input format; annotation fixed from the invalid
    # `[dict[str, str]]` literal to list[dict[str, str]].)
    files_contents = []
    for file in files:
        # All repositories handled here are Java exercises for now.
        # NOTE(review): split_code expects a *list* of strings, but
        # file[RepositoryChunk.CONTENT] looks like a single string —
        # confirm the intended call shape.
        chunks = self.split_code(file[RepositoryChunk.CONTENT], Language.JAVA)
        for chunk in chunks:
            files_contents.append(
                {
                    RepositoryChunk.CONTENT: chunk,
                    RepositoryChunk.COURSE_ID: file[RepositoryChunk.COURSE_ID],
                    RepositoryChunk.EXERCISE_ID: file[RepositoryChunk.EXERCISE_ID],
                    RepositoryChunk.REPOSITORY_ID: file[RepositoryChunk.REPOSITORY_ID],
                    RepositoryChunk.FILEPATH: file[RepositoryChunk.FILEPATH],
                }
            )
    return files_contents

def search(self, query, k=3, filter=None):
    """Search the repository collection for `query` (not yet implemented).

    :param k: number of results to return
    :param filter: optional filter to restrict the search
    """
    pass
def retrieve(self, query_vector: list[float], limit: int = 3):
    """Retrieve the chunks most similar to the query vector.

    :param query_vector: embedding of the user query
    :param limit: number of chunks to return (default 3, the previous
        hard-coded value, so existing callers are unaffected)
    :return: the raw Weaviate query response
    """
    response = self.collection.query.near_vector(
        near_vector=query_vector,
        limit=limit,
        # return_metadata=wvc.query.MetadataQuery()
    )
    return response

def ingest(self, repositories: list[dict[str, str]]):
    """Chunk the given repository files, embed each chunk, and batch-import them."""
    # Bug fix: chunk_files is a bound method, so the original
    # self.chunk_files(self, repositories) passed `self` as the `files`
    # argument and `repositories` as an unexpected extra positional.
    chunks = self.chunk_files(repositories)
    with self.collection.batch.dynamic() as batch:
        # Bug fix: the original iterated enumerate(chunks), making each
        # `chunk` an (index, dict) tuple instead of the chunk dict itself.
        for chunk in chunks:
            # embed_chunk = llm.embed(chunk[RepositoryChunk.CONTENT])  # Embed the chunk content
            embed_chunk = [0.0, 0.0, 0.0]  # Placeholder for the embedding
            batch.add_object(
                properties=chunk,
                vector=embed_chunk,
            )

def create_tree_structure(self):
    """Build a tree representation of the repository files (not yet implemented)."""
    pass

0 comments on commit 70ed83f

Please sign in to comment.