Skip to content

Commit

Permalink
Add content_service, data ingester and vector repository subsystems
Browse files Browse the repository at this point in the history
  • Loading branch information
yassinsws committed Feb 22, 2024
1 parent 2c0793a commit b4cb05d
Show file tree
Hide file tree
Showing 17 changed files with 280 additions and 81 deletions.
Empty file.
30 changes: 30 additions & 0 deletions app/content_service/Ingestion/abstract_ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from abc import ABC, abstractmethod
from typing import List, Dict
from langchain.text_splitter import Language


class AbstractIngestion(ABC):
"""
Abstract class for ingesting repositories into a database.
"""

@abstractmethod
def chunk_files(self, path: str) -> List[Dict[str, str]]:
"""
Abstract method to chunk code files in the root directory.
"""
pass

@abstractmethod
def ingest(self, path: str)-> bool:
"""
Abstract method to ingest repositories into the database.
"""
pass

@abstractmethod
def update(self, path: str):
"""
Abstract method to update a repository in the database.
"""
pass
28 changes: 28 additions & 0 deletions app/content_service/Ingestion/lectures_ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from typing import List, Dict
import weaviate

from app.vector_repository.lecture_schema import init_schema
from content_service.Ingestion.abstract_ingestion import AbstractIngestion


class LectureIngestion(AbstractIngestion): # Inherits from the abstract class

def __init__(self, client: weaviate.WeaviateClient):
self.collection = init_schema(client)

def chunk_files(self, path: str):
# Implement chunking logic here or raise NotImplementedError if not applicable
pass
def ingest(self, lecture_path)-> bool:
"""
Ingest the lectures into the weaviate database
"""
# Implement ingestion logic here
pass

def update(self, lecture: Dict[str, str]):
"""
Update a lecture in the weaviate database
"""
# Implement update logic here or raise NotImplementedError if not applicable
pass
81 changes: 81 additions & 0 deletions app/content_service/Ingestion/repository_ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@

import os
import weaviate
from app.data.repository_schema import init_schema, RepositoryChunk
from langchain.text_splitter import (
Language,
RecursiveCharacterTextSplitter,
)
from app.llm.langchain.iris_langchain_embedding_model import IrisLangchainEmbeddingModel
from app.llm import BasicRequestHandler
from data.Ingestion.abstract_ingestion import AbstractIngestion

CHUNKSIZE = 512
OVERLAP = 51


def split_code(code: str, language: Language, chunk_size: int, chunk_overlap: int):
"""
Split the code into chunks of 1500 characters with an overlap of 100 characters
"""
python_splitter = RecursiveCharacterTextSplitter.from_language(
language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
return python_splitter.create_documents(code)


def chunk_files(path: str):
"""
Chunk the code files in the root directory
"""
files_contents = []
for directory_path, subdir, files in os.walk(path):
for filename in files:
if filename.endswith('.java'):
file_path = os.path.join(directory_path, filename)
with open(file_path, 'r') as file:
code = file.read()
files_contents.append({RepositoryChunk.FILEPATH: filename, RepositoryChunk.CONTENT: code})
for file in files_contents:
chunks = split_code(file[RepositoryChunk.CONTENT], Language.JAVA, CHUNKSIZE, OVERLAP)
for chunk in chunks:
files_contents.append(
{
RepositoryChunk.CONTENT: chunk.page_content,
RepositoryChunk.COURSE_ID: "tbd",
RepositoryChunk.EXERCISE_ID: "tbd",
RepositoryChunk.REPOSITORY_ID: "tbd",
RepositoryChunk.FILEPATH: file[RepositoryChunk.FILEPATH]
}
)
return files_contents


class RepositoryIngestion(AbstractIngestion):
"""
Ingest the repositories into the weaviate database
"""
def __init__(self, client: weaviate.WeaviateClient):
self.collection = init_schema(client)
self.request_handler = BasicRequestHandler("gpt35")
self.iris_embedding_model = IrisLangchainEmbeddingModel(self.request_handler)

def ingest(self, repo_path) -> bool:
"""
Ingest the repositories into the weaviate database
"""
chunks = chunk_files(self, repo_path)
with self.collection.batch.dynamic() as batch:
for chunk in enumerate(chunks):
embed_chunk = self.iris_embedding_model.embed_query(chunk[RepositoryChunk.CONTENT])
batch.add_object(
properties=chunk,
vector=embed_chunk
)
return True

def update(self, repository: dict[str, str]): # this is most likely not necessary
"""
Update the repository in the weaviate database
"""
pass
Empty file.
22 changes: 22 additions & 0 deletions app/content_service/Retrieval/abstract_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from abc import ABC, abstractmethod
from typing import List, Dict


class AbstractRetrieval(ABC):
"""
Abstract class for ingesting repositories into a database.
"""

@abstractmethod
def retrieve(self, path: str) -> List[str]:
"""
Abstract method to ingest repositories into the database.
"""
pass

@abstractmethod
def get_collection(self, path: str):
"""
Abstract method to update a repository in the database.
"""
pass
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
import json
from typing import List

import weaviate
import weaviate.classes as wvc

from lecture_schema import init_schema, LectureSlideChunk
from app.vector_repository.lecture_schema import init_schema, LectureSlideChunk
from content_service.Retrieval.abstract_retrieval import AbstractRetrieval


class Lectures:

class LectureRetrieval(AbstractRetrieval):
"""
Class for ingesting repositories into a database.
"""

def __init__(self, client: weaviate.WeaviateClient):
self.collection = init_schema(client)

def ingest(self, lectures):
pass
def retrieve(self, user_message: str, lecture_id: int = None):
def retrieve(self, user_message: str, lecture_id: int = None) -> List[str]:
response = self.collection.query.near_text(
near_text=user_message,
filters=(
Expand All @@ -29,3 +35,6 @@ def retrieve(self, user_message: str, lecture_id: int = None):
)
print(json.dumps(response, indent=2))
return response

def get_collection(self, path: str):
pass
36 changes: 36 additions & 0 deletions app/content_service/Retrieval/repositories_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import json
from typing import List

from vector_repository.repository_schema import RepositoryChunk

from content_service.Retrieval.abstract_retrieval import AbstractRetrieval

import weaviate.classes as wvc


class RepositoryRetrieval(AbstractRetrieval):
"""
Class for Retrieving vector_repository for from the database.
"""

def retrieve(self, user_message: str, repository_id: int = None) -> List[str]:
response = self.collection.query.near_text(
near_text=user_message,
filters=(
wvc.query.Filter.by_property(RepositoryChunk.LECTURE_ID).equal(
repository_id
)
if repository_id
else None
),
return_properties=[
RepositoryChunk.REPOSITORY_NAME,
RepositoryChunk.REPOSITORY_DESCRIPTION,
],
limit=5,
)
print(json.dumps(response, indent=2))
return response

def get_collection(self, path: str):
pass
Empty file added app/content_service/__init__.py
Empty file.
74 changes: 0 additions & 74 deletions app/data/repository/repositories.py

This file was deleted.

Empty file added app/data_ingestion/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions app/data_ingestion/download_ingest_lecture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import zipfile
import requests
import tempfile
import os

DOWNLOAD_BUFFER_SIZE = 8 * 1024


# TODO: Get correct parameters here
def download_lecture_pdf(base_url: str, course_id: int, lecture_id: int, lecture_unit_id: int) -> tempfile.NamedTemporaryFile:
"""
Download a single lecture unit from Artemis
"""
# Send a GET request to the URL TODO: Validate Artemis URL
artemis_url = f"{base_url}/iris/lecture-slides/{course_id}/{lecture_id}/{lecture_unit_id}"
response = requests.get(artemis_url, stream=True)
if response.status_code != 200:
print(f"Failed to download the file. Status code: {response.status_code}")
raise ConnectionError

# Place the PDF into a temporary file
temp_file = tempfile.NamedTemporaryFile()
for chunk in response.iter_content(chunk_size=DOWNLOAD_BUFFER_SIZE):
if chunk: # filter out keep-alive new chunks
temp_file.write(chunk)

# Return the path to the temporary file.
# File should delete itself when it goes out of scope at the call site
return temp_file

#CALL THE RIGHT PIPELINE FOR INGESTION OF LECTURE PDF THAT HAS IMAGE INTERPRETATION.
36 changes: 36 additions & 0 deletions app/data_ingestion/download_ingest_repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import tempfile
import zipfile

import requests
DOWNLOAD_BUFFER_SIZE = 8 * 1024


def download_repository_zip(url) -> tempfile.NamedTemporaryFile:
"""
Downloads a zip file from a given URL and saves it to the specified path.
:param url: The URL of the zip file to download.
:param save_path: The path (including the file name) where the zip file will be saved.
"""
response = requests.get(url, stream=True)
if response.status_code == 200:
# Open the file in binary write mode and write the content of the response
temp_file = tempfile.NamedTemporaryFile()
for chunk in response.iter_content(chunk_size=DOWNLOAD_BUFFER_SIZE):
if chunk: # filter out keep-alive new chunks
temp_file.write(chunk)
# Return the path to the temporary file.
# File should delete itself when it goes out of scope at the call site
return temp_file


def unzip(zip_file_path: str, directory_to: str):
"""
Extracts the zip file to the specified directory.
"""
# Open the zip file in read mode and extract all contents
with zipfile.ZipFile(zip_file_path) as zip_ref:
zip_ref.extractall(directory_to)

#CALL THE RIGHT PIPELINE FOR INGESTION OF CODE (CHUNK THE CODE THEN GET A DESCRIPTION OF THE CODE, THEN EMBED IT AND SAVE IT IN THE DB)
Empty file.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def init_schema(client: WeaviateClient) -> Collection:
return client.collections.create(
name=COLLECTION_NAME,
vectorizer_config=wvc.config.Configure.Vectorizer.none(), # We do not want to vectorize the text automatically
# HNSW is preferred over FLAT for large amounts of data, which is the case here
# HNSW is preferred over FLAT for large amounts of vector_repository, which is the case here
vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
distance_metric=wvc.config.VectorDistances.COSINE # select preferred distance metric
),
Expand Down
Loading

0 comments on commit b4cb05d

Please sign in to comment.