-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add content_service, data ingester and vector repository subsystems
- Loading branch information
Showing
17 changed files
with
280 additions
and
81 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from abc import ABC, abstractmethod | ||
from typing import List, Dict | ||
from langchain.text_splitter import Language | ||
|
||
|
||
class AbstractIngestion(ABC): | ||
""" | ||
Abstract class for ingesting repositories into a database. | ||
""" | ||
|
||
@abstractmethod | ||
def chunk_files(self, path: str) -> List[Dict[str, str]]: | ||
""" | ||
Abstract method to chunk code files in the root directory. | ||
""" | ||
pass | ||
|
||
@abstractmethod | ||
def ingest(self, path: str)-> bool: | ||
""" | ||
Abstract method to ingest repositories into the database. | ||
""" | ||
pass | ||
|
||
@abstractmethod | ||
def update(self, path: str): | ||
""" | ||
Abstract method to update a repository in the database. | ||
""" | ||
pass |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from typing import List, Dict | ||
import weaviate | ||
|
||
from app.vector_repository.lecture_schema import init_schema | ||
from content_service.Ingestion.abstract_ingestion import AbstractIngestion | ||
|
||
|
||
class LectureIngestion(AbstractIngestion): # Inherits from the abstract class | ||
|
||
def __init__(self, client: weaviate.WeaviateClient): | ||
self.collection = init_schema(client) | ||
|
||
def chunk_files(self, path: str): | ||
# Implement chunking logic here or raise NotImplementedError if not applicable | ||
pass | ||
def ingest(self, lecture_path)-> bool: | ||
""" | ||
Ingest the lectures into the weaviate database | ||
""" | ||
# Implement ingestion logic here | ||
pass | ||
|
||
def update(self, lecture: Dict[str, str]): | ||
""" | ||
Update a lecture in the weaviate database | ||
""" | ||
# Implement update logic here or raise NotImplementedError if not applicable | ||
pass |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
|
||
import os | ||
import weaviate | ||
from app.data.repository_schema import init_schema, RepositoryChunk | ||
from langchain.text_splitter import ( | ||
Language, | ||
RecursiveCharacterTextSplitter, | ||
) | ||
from app.llm.langchain.iris_langchain_embedding_model import IrisLangchainEmbeddingModel | ||
from app.llm import BasicRequestHandler | ||
from data.Ingestion.abstract_ingestion import AbstractIngestion | ||
|
||
CHUNKSIZE = 512 | ||
OVERLAP = 51 | ||
|
||
|
||
def split_code(code: str, language: Language, chunk_size: int, chunk_overlap: int): | ||
""" | ||
Split the code into chunks of 1500 characters with an overlap of 100 characters | ||
""" | ||
python_splitter = RecursiveCharacterTextSplitter.from_language( | ||
language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap | ||
) | ||
return python_splitter.create_documents(code) | ||
|
||
|
||
def chunk_files(path: str): | ||
""" | ||
Chunk the code files in the root directory | ||
""" | ||
files_contents = [] | ||
for directory_path, subdir, files in os.walk(path): | ||
for filename in files: | ||
if filename.endswith('.java'): | ||
file_path = os.path.join(directory_path, filename) | ||
with open(file_path, 'r') as file: | ||
code = file.read() | ||
files_contents.append({RepositoryChunk.FILEPATH: filename, RepositoryChunk.CONTENT: code}) | ||
for file in files_contents: | ||
chunks = split_code(file[RepositoryChunk.CONTENT], Language.JAVA, CHUNKSIZE, OVERLAP) | ||
for chunk in chunks: | ||
files_contents.append( | ||
{ | ||
RepositoryChunk.CONTENT: chunk.page_content, | ||
RepositoryChunk.COURSE_ID: "tbd", | ||
RepositoryChunk.EXERCISE_ID: "tbd", | ||
RepositoryChunk.REPOSITORY_ID: "tbd", | ||
RepositoryChunk.FILEPATH: file[RepositoryChunk.FILEPATH] | ||
} | ||
) | ||
return files_contents | ||
|
||
|
||
class RepositoryIngestion(AbstractIngestion): | ||
""" | ||
Ingest the repositories into the weaviate database | ||
""" | ||
def __init__(self, client: weaviate.WeaviateClient): | ||
self.collection = init_schema(client) | ||
self.request_handler = BasicRequestHandler("gpt35") | ||
self.iris_embedding_model = IrisLangchainEmbeddingModel(self.request_handler) | ||
|
||
def ingest(self, repo_path) -> bool: | ||
""" | ||
Ingest the repositories into the weaviate database | ||
""" | ||
chunks = chunk_files(self, repo_path) | ||
with self.collection.batch.dynamic() as batch: | ||
for chunk in enumerate(chunks): | ||
embed_chunk = self.iris_embedding_model.embed_query(chunk[RepositoryChunk.CONTENT]) | ||
batch.add_object( | ||
properties=chunk, | ||
vector=embed_chunk | ||
) | ||
return True | ||
|
||
def update(self, repository: dict[str, str]): # this is most likely not necessary | ||
""" | ||
Update the repository in the weaviate database | ||
""" | ||
pass |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from abc import ABC, abstractmethod | ||
from typing import List, Dict | ||
|
||
|
||
class AbstractRetrieval(ABC): | ||
""" | ||
Abstract class for ingesting repositories into a database. | ||
""" | ||
|
||
@abstractmethod | ||
def retrieve(self, path: str) -> List[str]: | ||
""" | ||
Abstract method to ingest repositories into the database. | ||
""" | ||
pass | ||
|
||
@abstractmethod | ||
def get_collection(self, path: str): | ||
""" | ||
Abstract method to update a repository in the database. | ||
""" | ||
pass |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import json | ||
from typing import List | ||
|
||
from vector_repository.repository_schema import RepositoryChunk | ||
|
||
from content_service.Retrieval.abstract_retrieval import AbstractRetrieval | ||
|
||
import weaviate.classes as wvc | ||
|
||
|
||
class RepositoryRetrieval(AbstractRetrieval): | ||
""" | ||
Class for Retrieving vector_repository for from the database. | ||
""" | ||
|
||
def retrieve(self, user_message: str, repository_id: int = None) -> List[str]: | ||
response = self.collection.query.near_text( | ||
near_text=user_message, | ||
filters=( | ||
wvc.query.Filter.by_property(RepositoryChunk.LECTURE_ID).equal( | ||
repository_id | ||
) | ||
if repository_id | ||
else None | ||
), | ||
return_properties=[ | ||
RepositoryChunk.REPOSITORY_NAME, | ||
RepositoryChunk.REPOSITORY_DESCRIPTION, | ||
], | ||
limit=5, | ||
) | ||
print(json.dumps(response, indent=2)) | ||
return response | ||
|
||
def get_collection(self, path: str): | ||
pass |
Empty file.
This file was deleted.
Oops, something went wrong.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import zipfile | ||
import requests | ||
import tempfile | ||
import os | ||
|
||
DOWNLOAD_BUFFER_SIZE = 8 * 1024 | ||
|
||
|
||
# TODO: Get correct parameters here | ||
def download_lecture_pdf(base_url: str, course_id: int, lecture_id: int, lecture_unit_id: int) -> tempfile.NamedTemporaryFile: | ||
""" | ||
Download a single lecture unit from Artemis | ||
""" | ||
# Send a GET request to the URL TODO: Validate Artemis URL | ||
artemis_url = f"{base_url}/iris/lecture-slides/{course_id}/{lecture_id}/{lecture_unit_id}" | ||
response = requests.get(artemis_url, stream=True) | ||
if response.status_code != 200: | ||
print(f"Failed to download the file. Status code: {response.status_code}") | ||
raise ConnectionError | ||
|
||
# Place the PDF into a temporary file | ||
temp_file = tempfile.NamedTemporaryFile() | ||
for chunk in response.iter_content(chunk_size=DOWNLOAD_BUFFER_SIZE): | ||
if chunk: # filter out keep-alive new chunks | ||
temp_file.write(chunk) | ||
|
||
# Return the path to the temporary file. | ||
# File should delete itself when it goes out of scope at the call site | ||
return temp_file | ||
|
||
#CALL THE RIGHT PIPELINE FOR INGESTION OF LECTURE PDF THAT HAS IMAGE INTERPRETATION. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import os | ||
import tempfile | ||
import zipfile | ||
|
||
import requests | ||
DOWNLOAD_BUFFER_SIZE = 8 * 1024 | ||
|
||
|
||
def download_repository_zip(url) -> tempfile.NamedTemporaryFile: | ||
""" | ||
Downloads a zip file from a given URL and saves it to the specified path. | ||
:param url: The URL of the zip file to download. | ||
:param save_path: The path (including the file name) where the zip file will be saved. | ||
""" | ||
response = requests.get(url, stream=True) | ||
if response.status_code == 200: | ||
# Open the file in binary write mode and write the content of the response | ||
temp_file = tempfile.NamedTemporaryFile() | ||
for chunk in response.iter_content(chunk_size=DOWNLOAD_BUFFER_SIZE): | ||
if chunk: # filter out keep-alive new chunks | ||
temp_file.write(chunk) | ||
# Return the path to the temporary file. | ||
# File should delete itself when it goes out of scope at the call site | ||
return temp_file | ||
|
||
|
||
def unzip(zip_file_path: str, directory_to: str): | ||
""" | ||
Extracts the zip file to the specified directory. | ||
""" | ||
# Open the zip file in read mode and extract all contents | ||
with zipfile.ZipFile(zip_file_path) as zip_ref: | ||
zip_ref.extractall(directory_to) | ||
|
||
#CALL THE RIGHT PIPELINE FOR INGESTION OF CODE (CHUNK THE CODE THEN GET A DESCRIPTION OF THE CODE, THEN EMBED IT AND SAVE IT IN THE DB) |
Empty file.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.