Skip to content

Commit

Permalink
Use cloud cluster for weaviate for now for the hackathon
Browse files Browse the repository at this point in the history
Postpone the ingestion methods for the lectures until we know the format of the lectures;
first basic implementation of the ingest and retrieve methods for the code.
  • Loading branch information
yassinsws committed Feb 21, 2024
1 parent 128ea40 commit 70ed83f
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 71 deletions.
14 changes: 10 additions & 4 deletions app/data/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

class VectorDatabase:
def __init__(self):
weaviate_host = os.getenv("WEAVIATE_HOST")
"""weaviate_host = os.getenv("WEAVIATE_HOST")
weaviate_port = os.getenv("WEAVIATE_PORT")
assert weaviate_host, "WEAVIATE_HOST environment variable must be set"
assert weaviate_port, "WEAVIATE_PORT environment variable must be set"
Expand All @@ -16,10 +16,16 @@ def __init__(self):
), "WEAVIATE_PORT environment variable must be an integer"
self._client = weaviate.connect_to_local(
host=weaviate_host, port=int(weaviate_port)
)"""
# Connect to the Weaviate Cloud Service until we set up a proper docker for this project
client = weaviate.connect_to_wcs(
cluster_url=os.getenv("https://try-repository-pipeline-99b1nlo4.weaviate.network"), # Replace with your WCS URL
auth_credentials=weaviate.auth.AuthApiKey(os.getenv("2IPqwB6mwGMIs92UJ3StB0Wovj0MquBxs9Ql")) # Replace with your WCS key
)
self.repositories = Repositories(self._client)
self.lectures = Lectures(self._client)
print(client.is_ready())
self.repositories = Repositories(self.client)
self.lectures = Lectures(self.client)

def __del__(self):
    """Close the connection to Weaviate when the object is garbage-collected."""
    # Bug fix: the code closed self.client, but the connection is stored on
    # self._client. The getattr guard also keeps __del__ from raising
    # AttributeError when __init__ failed before the client was assigned.
    client = getattr(self, "_client", None)
    if client is not None:
        client.close()
60 changes: 1 addition & 59 deletions app/data/lecture/lectures.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,8 @@
import json
import os
import time

import fitz # PyMuPDF
import openai
import weaviate
from unstructured.cleaners.core import clean
import weaviate.classes as wvc

from data.lecture.lecture_schema import init_schema, COLLECTION_NAME, LectureSlideChunk


def chunk_files(subdirectory_path, subdirectory):
    """Extract and clean the text of every page of every PDF in the directory.

    Returns one LectureSlideChunk-shaped dict per page; only the page content
    and the filename are filled in at this stage — the course/lecture metadata
    fields are left empty.
    """
    chunks = []
    for filename in os.listdir(subdirectory_path):
        if not filename.endswith(".pdf"):
            continue  # skip anything that is not a PDF
        file_path = os.path.join(subdirectory_path, filename)
        # fitz (PyMuPDF) documents are iterable over their pages
        with fitz.open(file_path) as doc:
            for page in doc:
                page_text = clean(
                    page.get_text(), bullets=True, extra_whitespace=True
                )
                chunks.append(
                    {
                        LectureSlideChunk.PAGE_CONTENT: page_text,
                        LectureSlideChunk.COURSE_ID: "",
                        LectureSlideChunk.LECTURE_ID: "",
                        LectureSlideChunk.LECTURE_NAME: "",
                        LectureSlideChunk.LECTURE_UNIT_ID: "",
                        LectureSlideChunk.LECTURE_UNIT_NAME: "",
                        LectureSlideChunk.FILENAME: file_path,
                        LectureSlideChunk.PAGE_NUMBER: "",
                    }
                )
    return chunks

from lecture_schema import init_schema, LectureSlideChunk

class Lectures:

Expand All @@ -45,30 +11,6 @@ def __init__(self, client: weaviate.WeaviateClient):

def ingest(self, lectures):
    """Ingest lecture slides into the Weaviate collection (not yet implemented)."""
    pass

def search(self, query, k=3, filter=None):
    """Search the lecture collection for `query` (not yet implemented).

    :param k: number of results to return
    :param filter: optional filter to restrict the search
    """
    pass

def batch_import(self, directory_path, subdirectory):
    """Chunk every PDF under directory_path and import the chunks into Weaviate.

    Retries each object up to 5 times with exponential backoff when the
    embedding provider rate-limits us.

    :raises RuntimeError: if an object still fails after all retries.
    """
    data = chunk_files(directory_path, subdirectory)
    with self.collection.batch.dynamic() as batch:
        # The index from enumerate() was unused — iterate the data directly.
        for properties in data:
            # The original success-flag/break bookkeeping collapses to a
            # plain for/else retry loop with identical behavior.
            for attempt in range(5):  # max 5 retries
                try:
                    # NOTE(review): add_data_object(properties, COLLECTION_NAME)
                    # is the weaviate v3 batch signature, but batch.dynamic()
                    # above is the v4 API (which exposes add_object) — confirm
                    # the client version in use.
                    batch.add_data_object(properties, COLLECTION_NAME)
                    break  # imported successfully
                except openai.error.RateLimitError:
                    time.sleep(2**attempt)  # wait 2^attempt seconds before retrying
                    print("Retrying import...")
            else:
                # for/else: every retry raised — give up on this import.
                raise RuntimeError("Failed to create embeddings.")

def retrieve(self, user_message: str, lecture_id: int = None):
response = self.collection.query.near_text(
near_text=user_message,
Expand Down
67 changes: 59 additions & 8 deletions app/data/repository/repositories.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,72 @@
import os
import weaviate

from data.repository.repository_schema import init_schema
from repository_schema import init_schema, RepositoryChunk
from langchain.text_splitter import (
Language,
RecursiveCharacterTextSplitter,
)


class Repositories:

def __init__(self, client: weaviate.WeaviateClient):
    """Initialize (or fetch) the repository collection schema on the given client."""
    self.collection = init_schema(client)

def retrieve(self, question:str):
    """Retrieve repository chunks relevant to `question` (not yet implemented)."""
    pass
def split_code(self, code: list[str], language: Language):
    """Split source files into chunks of 1500 characters with 100-character overlap.

    :param code: list of file contents to split
    :param language: language whose syntax guides the split points
    :return: list of langchain Document chunks
    """
    # Annotation fix: `[str]` is a one-element list literal, not a type;
    # the intended annotation is list[str].
    # Renamed from python_splitter: the splitter is built for whatever
    # language is passed in, not only Python.
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=language, chunk_size=1500, chunk_overlap=100
    )
    return splitter.create_documents(code)

def ingest(self, repositories: dict[str, str]):
    """Ingest the given repositories into the collection (not yet implemented)."""
    pass
def chunk_files(self, files: list[dict[str, str]]):
    """Chunk the content of each repository file into RepositoryChunk dicts.

    Each chunk inherits the course/exercise/repository/filepath metadata of
    the file it came from.

    :param files: RepositoryChunk-shaped dicts holding whole-file contents
    :return: list of RepositoryChunk-shaped dicts, one per chunk
    """
    # (Removed the commented-out os.walk prototype that predated the
    # dict-based input format; annotation fixed from the invalid
    # `[dict[str, str]]` literal to list[dict[str, str]].)
    files_contents = []
    for file in files:
        # All repositories handled here are Java exercises for now.
        # NOTE(review): split_code expects a *list* of strings, but
        # file[RepositoryChunk.CONTENT] looks like a single string —
        # confirm the intended call shape.
        chunks = self.split_code(file[RepositoryChunk.CONTENT], Language.JAVA)
        for chunk in chunks:
            files_contents.append(
                {
                    RepositoryChunk.CONTENT: chunk,
                    RepositoryChunk.COURSE_ID: file[RepositoryChunk.COURSE_ID],
                    RepositoryChunk.EXERCISE_ID: file[RepositoryChunk.EXERCISE_ID],
                    RepositoryChunk.REPOSITORY_ID: file[RepositoryChunk.REPOSITORY_ID],
                    RepositoryChunk.FILEPATH: file[RepositoryChunk.FILEPATH],
                }
            )
    return files_contents

def search(self, query, k=3, filter=None):
    """Search the repository collection for `query` (not yet implemented).

    :param k: number of results to return
    :param filter: optional filter to restrict the search
    """
    pass
def retrieve(self, query_vector: list[float], limit: int = 3):
    """Retrieve the chunks most similar to the query vector.

    :param query_vector: embedding of the user query
    :param limit: number of chunks to return (default 3, the previous
        hard-coded value, so existing callers are unaffected)
    :return: the raw Weaviate query response
    """
    response = self.collection.query.near_vector(
        near_vector=query_vector,
        limit=limit,
        # return_metadata=wvc.query.MetadataQuery()
    )
    return response

def ingest(self, repositories: list[dict[str, str]]):
    """Chunk the given repository files, embed each chunk, and batch-import them."""
    # Bug fix: chunk_files is a bound method, so the original
    # self.chunk_files(self, repositories) passed `self` as the `files`
    # argument and `repositories` as an unexpected extra positional.
    chunks = self.chunk_files(repositories)
    with self.collection.batch.dynamic() as batch:
        # Bug fix: the original iterated enumerate(chunks), making each
        # `chunk` an (index, dict) tuple instead of the chunk dict itself.
        for chunk in chunks:
            # embed_chunk = llm.embed(chunk[RepositoryChunk.CONTENT])  # Embed the chunk content
            embed_chunk = [0.0, 0.0, 0.0]  # Placeholder for the embedding
            batch.add_object(
                properties=chunk,
                vector=embed_chunk,
            )

def create_tree_structure(self):
    """Build a tree representation of the repository files (not yet implemented)."""
    pass

0 comments on commit 70ed83f

Please sign in to comment.