diff --git a/app/content_service/Ingestion/__init__.py b/app/content_service/Ingestion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/content_service/Ingestion/abstract_ingestion.py b/app/content_service/Ingestion/abstract_ingestion.py new file mode 100644 index 00000000..89ba4f8f --- /dev/null +++ b/app/content_service/Ingestion/abstract_ingestion.py @@ -0,0 +1,30 @@ +from abc import ABC, abstractmethod +from typing import List, Dict +from langchain.text_splitter import Language + + +class AbstractIngestion(ABC): + """ + Abstract class for ingesting repositories into a database. + """ + + @abstractmethod + def chunk_files(self, path: str) -> List[Dict[str, str]]: + """ + Abstract method to chunk code files in the root directory. + """ + pass + + @abstractmethod + def ingest(self, path: str)-> bool: + """ + Abstract method to ingest repositories into the database. + """ + pass + + @abstractmethod + def update(self, path: str): + """ + Abstract method to update a repository in the database. + """ + pass \ No newline at end of file diff --git a/app/content_service/Ingestion/lectures_ingestion.py b/app/content_service/Ingestion/lectures_ingestion.py new file mode 100644 index 00000000..00c91c1c --- /dev/null +++ b/app/content_service/Ingestion/lectures_ingestion.py @@ -0,0 +1,28 @@ +from typing import List, Dict +import weaviate + +from app.vector_repository.lecture_schema import init_schema +from content_service.Ingestion.abstract_ingestion import AbstractIngestion + + +class LectureIngestion(AbstractIngestion): # Inherits from the abstract class + + def __init__(self, client: weaviate.WeaviateClient): + self.collection = init_schema(client) + + def chunk_files(self, path: str): + # Implement chunking logic here or raise NotImplementedError if not applicable + pass + def ingest(self, lecture_path)-> bool: + """ + Ingest the lectures into the weaviate database + """ + # Implement ingestion logic here + pass + + def update(self, lecture: Dict[str, str]): + """ + Update a lecture in the weaviate database + """ + # Implement update logic here or raise NotImplementedError if not applicable + pass \ No newline at end of file diff --git a/app/content_service/Ingestion/repository_ingestion.py b/app/content_service/Ingestion/repository_ingestion.py new file mode 100644 index 00000000..5dd85c3f --- /dev/null +++ b/app/content_service/Ingestion/repository_ingestion.py @@ -0,0 +1,81 @@ + +import os +import weaviate +from app.data.repository_schema import init_schema, RepositoryChunk +from langchain.text_splitter import ( + Language, + RecursiveCharacterTextSplitter, +) +from app.llm.langchain.iris_langchain_embedding_model import IrisLangchainEmbeddingModel +from app.llm import BasicRequestHandler +from data.Ingestion.abstract_ingestion import AbstractIngestion + +CHUNKSIZE = 512 +OVERLAP = 51 + + +def split_code(code: str, language: Language, chunk_size: int, chunk_overlap: int): + """ + Split the code into chunks of 1500 characters with an overlap of 100 characters + """ + python_splitter = RecursiveCharacterTextSplitter.from_language( + language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + return python_splitter.create_documents(code) + + +def chunk_files(path: str): + """ + Chunk the code files in the root directory + """ + files_contents = [] + for directory_path, subdir, files in os.walk(path): + for filename in files: + if filename.endswith('.java'): + file_path = os.path.join(directory_path, filename) + with open(file_path, 'r') as file: + code = file.read() + files_contents.append({RepositoryChunk.FILEPATH: filename, RepositoryChunk.CONTENT: code}) + for file in files_contents: + chunks = split_code(file[RepositoryChunk.CONTENT], Language.JAVA, CHUNKSIZE, OVERLAP) + for chunk in chunks: + files_contents.append( + { + RepositoryChunk.CONTENT: chunk.page_content, + RepositoryChunk.COURSE_ID: "tbd", + RepositoryChunk.EXERCISE_ID: "tbd", + RepositoryChunk.REPOSITORY_ID: "tbd", + RepositoryChunk.FILEPATH: file[RepositoryChunk.FILEPATH] + } + ) + return files_contents + + +class RepositoryIngestion(AbstractIngestion): + """ + Ingest the repositories into the weaviate database + """ + def __init__(self, client: weaviate.WeaviateClient): + self.collection = init_schema(client) + self.request_handler = BasicRequestHandler("gpt35") + self.iris_embedding_model = IrisLangchainEmbeddingModel(self.request_handler) + + def ingest(self, repo_path) -> bool: + """ + Ingest the repositories into the weaviate database + """ + chunks = chunk_files(self, repo_path) + with self.collection.batch.dynamic() as batch: + for chunk in enumerate(chunks): + embed_chunk = self.iris_embedding_model.embed_query(chunk[RepositoryChunk.CONTENT]) + batch.add_object( + properties=chunk, + vector=embed_chunk + ) + return True + + def update(self, repository: dict[str, str]): # this is most likely not necessary + """ + Update the repository in the weaviate database + """ + pass diff --git a/app/content_service/Retrieval/__init__.py b/app/content_service/Retrieval/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/content_service/Retrieval/abstract_retrieval.py b/app/content_service/Retrieval/abstract_retrieval.py new file mode 100644 index 00000000..78ff5a8d --- /dev/null +++ b/app/content_service/Retrieval/abstract_retrieval.py @@ -0,0 +1,22 @@ +from abc import ABC, abstractmethod +from typing import List, Dict + + +class AbstractRetrieval(ABC): + """ + Abstract class for ingesting repositories into a database. + """ + + @abstractmethod + def retrieve(self, path: str) -> List[str]: + """ + Abstract method to ingest repositories into the database. + """ + pass + + @abstractmethod + def get_collection(self, path: str): + """ + Abstract method to update a repository in the database. + """ + pass diff --git a/app/data/lecture/lectures.py b/app/content_service/Retrieval/lecture_retrieval.py similarity index 68% rename from app/data/lecture/lectures.py rename to app/content_service/Retrieval/lecture_retrieval.py index 316d382a..ed2a950e 100644 --- a/app/data/lecture/lectures.py +++ b/app/content_service/Retrieval/lecture_retrieval.py @@ -1,17 +1,23 @@ import json +from typing import List + import weaviate import weaviate.classes as wvc -from lecture_schema import init_schema, LectureSlideChunk +from app.vector_repository.lecture_schema import init_schema, LectureSlideChunk +from content_service.Retrieval.abstract_retrieval import AbstractRetrieval + -class Lectures: + +class LectureRetrieval(AbstractRetrieval): + """ + Class for ingesting repositories into a database. + """ def __init__(self, client: weaviate.WeaviateClient): self.collection = init_schema(client) - def ingest(self, lectures): - pass - def retrieve(self, user_message: str, lecture_id: int = None): + def retrieve(self, user_message: str, lecture_id: int = None) -> List[str]: response = self.collection.query.near_text( near_text=user_message, filters=( @@ -29,3 +35,6 @@ def retrieve(self, user_message: str, lecture_id: int = None): ) print(json.dumps(response, indent=2)) return response + + def get_collection(self, path: str): + pass \ No newline at end of file diff --git a/app/content_service/Retrieval/repositories_retrieval.py b/app/content_service/Retrieval/repositories_retrieval.py new file mode 100644 index 00000000..ad4cc165 --- /dev/null +++ b/app/content_service/Retrieval/repositories_retrieval.py @@ -0,0 +1,36 @@ +import json +from typing import List + +from vector_repository.repository_schema import RepositoryChunk + +from content_service.Retrieval.abstract_retrieval import AbstractRetrieval + +import weaviate.classes as wvc + + +class RepositoryRetrieval(AbstractRetrieval): + """ + Class for Retrieving vector_repository for from the database. + """ + + def retrieve(self, user_message: str, repository_id: int = None) -> List[str]: + response = self.collection.query.near_text( + near_text=user_message, + filters=( + wvc.query.Filter.by_property(RepositoryChunk.LECTURE_ID).equal( + repository_id + ) + if repository_id + else None + ), + return_properties=[ + RepositoryChunk.REPOSITORY_NAME, + RepositoryChunk.REPOSITORY_DESCRIPTION, + ], + limit=5, + ) + print(json.dumps(response, indent=2)) + return response + + def get_collection(self, path: str): + pass diff --git a/app/content_service/__init__.py b/app/content_service/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/data/repository/repositories.py b/app/data/repository/repositories.py deleted file mode 100644 index b080672d..00000000 --- a/app/data/repository/repositories.py +++ /dev/null @@ -1,74 +0,0 @@ -import os -import weaviate -from repository_schema import init_schema, RepositoryChunk -from langchain.text_splitter import ( - Language, - RecursiveCharacterTextSplitter, -) - - -class Repositories: - - def __init__(self, client: weaviate.WeaviateClient): - self.collection = init_schema(client) - - def split_code(self, code: str, language: Language, chunk_size: int, chunk_overlap: int): - """ - Split the code into chunks of 1500 characters with an overlap of 100 characters - """ - python_splitter = RecursiveCharacterTextSplitter.from_language( - language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap - ) - return python_splitter.create_documents(code) - - def chunk_files(self, files: [dict[str, str]]): - """ - Chunk the code files in the root directory - """ - files_contents = [] - # for directory_path, subdir, files in os.walk(root_directory_path): - # for filename in files: - # if filename.endswith('.py'): - # file_path = os.path.join(directory_path, filename) - # with open(file_path, 'r') as file: - # code = file.read() - for file in files: - chunks = self.split_code(file[RepositoryChunk.CONTENT], Language.JAVA, 1500, 100) - for chunk in chunks: - files_contents.append( - { - RepositoryChunk.CONTENT: chunk, - RepositoryChunk.COURSE_ID: file[RepositoryChunk.COURSE_ID], - RepositoryChunk.EXERCISE_ID: file[RepositoryChunk.EXERCISE_ID], - RepositoryChunk.REPOSITORY_ID: file[RepositoryChunk.REPOSITORY_ID], - RepositoryChunk.FILEPATH: file[RepositoryChunk.FILEPATH] - } - ) - return files_contents - - def retrieve(self, query_vector: list[float]): - """ - Retrieve the top 3 most similar chunks to the query vector - """ - response = self.collection.query.near_vector( - near_vector=query_vector, - limit=3, # Return the top 3 most similar chunks - # return_metadata=wvc.query.MetadataQuery() - ) - return response - - def ingest(self, repositories: [dict[str, str]]): - chunks = self.chunk_files(self, repositories) - with self.collection.batch.dynamic() as batch: - for chunk in enumerate(chunks): - # embed_chunk = llm.embed(chunk[RepositoryChunk.CONTENT]) # Embed the chunk content - embed_chunk = [0.0, 0.0, 0.0] # Placeholder for the embedding - batch.add_object( - properties=chunk, - vector=embed_chunk - ) - def update(self, repository: dict[str, str]):# this is most likely not necessary - pass - - def create_tree_structure(self): - pass diff --git a/app/data_ingestion/__init__.py b/app/data_ingestion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/data_ingestion/download_ingest_lecture.py b/app/data_ingestion/download_ingest_lecture.py new file mode 100644 index 00000000..9906b5ed --- /dev/null +++ b/app/data_ingestion/download_ingest_lecture.py @@ -0,0 +1,31 @@ +import zipfile +import requests +import tempfile +import os + +DOWNLOAD_BUFFER_SIZE = 8 * 1024 + + +# TODO: Get correct parameters here +def download_lecture_pdf(base_url: str, course_id: int, lecture_id: int, lecture_unit_id: int) -> tempfile.NamedTemporaryFile: + """ + Download a single lecture unit from Artemis + """ + # Send a GET request to the URL TODO: Validate Artemis URL + artemis_url = f"{base_url}/iris/lecture-slides/{course_id}/{lecture_id}/{lecture_unit_id}" + response = requests.get(artemis_url, stream=True) + if response.status_code != 200: + print(f"Failed to download the file. Status code: {response.status_code}") + raise ConnectionError + + # Place the PDF into a temporary file + temp_file = tempfile.NamedTemporaryFile() + for chunk in response.iter_content(chunk_size=DOWNLOAD_BUFFER_SIZE): + if chunk: # filter out keep-alive new chunks + temp_file.write(chunk) + + # Return the path to the temporary file. + # File should delete itself when it goes out of scope at the call site + return temp_file + +#CALL THE RIGHT PIPELINE FOR INGESTION OF LECTURE PDF THAT HAS IMAGE INTERPRETATION. \ No newline at end of file diff --git a/app/data_ingestion/download_ingest_repository.py b/app/data_ingestion/download_ingest_repository.py new file mode 100644 index 00000000..f3dbb4ab --- /dev/null +++ b/app/data_ingestion/download_ingest_repository.py @@ -0,0 +1,36 @@ +import os +import tempfile +import zipfile + +import requests +DOWNLOAD_BUFFER_SIZE = 8 * 1024 + + +def download_repository_zip(url) -> tempfile.NamedTemporaryFile: + """ + Downloads a zip file from a given URL and saves it to the specified path. + + :param url: The URL of the zip file to download. + :param save_path: The path (including the file name) where the zip file will be saved. + """ + response = requests.get(url, stream=True) + if response.status_code == 200: + # Open the file in binary write mode and write the content of the response + temp_file = tempfile.NamedTemporaryFile() + for chunk in response.iter_content(chunk_size=DOWNLOAD_BUFFER_SIZE): + if chunk: # filter out keep-alive new chunks + temp_file.write(chunk) + # Return the path to the temporary file. + # File should delete itself when it goes out of scope at the call site + return temp_file + + +def unzip(zip_file_path: str, directory_to: str): + """ + Extracts the zip file to the specified directory. + """ + # Open the zip file in read mode and extract all contents + with zipfile.ZipFile(zip_file_path) as zip_ref: + zip_ref.extractall(directory_to) + +#CALL THE RIGHT PIPELINE FOR INGESTION OF CODE (CHUNK THE CODE THEN GET A DESCRIPTION OF THE CODE, THEN EMBED IT AND SAVE IT IN THE DB) \ No newline at end of file diff --git a/app/vector_repository/__init__.py b/app/vector_repository/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/data/db.py b/app/vector_repository/db.py similarity index 100% rename from app/data/db.py rename to app/vector_repository/db.py diff --git a/app/data/lecture/lecture_schema.py b/app/vector_repository/lecture_schema.py similarity index 97% rename from app/data/lecture/lecture_schema.py rename to app/vector_repository/lecture_schema.py index c4f92a8c..63fa611b 100644 --- a/app/data/lecture/lecture_schema.py +++ b/app/vector_repository/lecture_schema.py @@ -29,7 +29,7 @@ def init_schema(client: WeaviateClient) -> Collection: return client.collections.create( name=COLLECTION_NAME, vectorizer_config=wvc.config.Configure.Vectorizer.none(), # We do not want to vectorize the text automatically - # HNSW is preferred over FLAT for large amounts of data, which is the case here + # HNSW is preferred over FLAT for large amounts of vector_repository, which is the case here vector_index_config=wvc.config.Configure.VectorIndex.hnsw( distance_metric=wvc.config.VectorDistances.COSINE # select preferred distance metric ), diff --git a/app/data/repository/repository_schema.py b/app/vector_repository/repository_schema.py similarity index 95% rename from app/data/repository/repository_schema.py rename to app/vector_repository/repository_schema.py index 7a1e8e9a..8cb9ba91 100644 --- a/app/data/repository/repository_schema.py +++ b/app/vector_repository/repository_schema.py @@ -20,7 +20,7 @@ def init_schema(client: WeaviateClient) -> Collection: return client.collections.create( name=COLLECTION_NAME, vectorizer_config=wvc.config.Configure.Vectorizer.none(), # We do not want to vectorize the text automatically - # HNSW is preferred over FLAT for large amounts of data, which is the case here + # HNSW is preferred over FLAT for large amounts of vector_repository, which is the case here vector_index_config=wvc.config.Configure.VectorIndex.hnsw( distance_metric=wvc.config.VectorDistances.COSINE # select preferred distance metric ),