Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LLM: Add image recognition and image generation support #77

Closed
wants to merge 35 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
403f118
Add new pipeline DTOs
MichaelOwenDyer Feb 14, 2024
e7c74f2
Apply autoformatter
MichaelOwenDyer Feb 14, 2024
26e86ac
Have DTOs extend BaseModel
MichaelOwenDyer Feb 15, 2024
6997315
Add data package
MichaelOwenDyer Feb 15, 2024
128ea40
update retrieval interface and requirements
yassinsws Feb 19, 2024
0818109
Merge branch 'main' into feature/datastore
yassinsws Feb 20, 2024
70ed83f
Use cloud cluster for weaviate for now for the hackathon
yassinsws Feb 21, 2024
7acf809
Merge remote-tracking branch 'origin/feature/datastore' into feature/…
yassinsws Feb 21, 2024
2c0793a
fix splitting function.
yassinsws Feb 21, 2024
b4cb05d
Add content_service, data ingester and vector repository subsystems
yassinsws Feb 22, 2024
2f3882f
Merge branch 'main' into feature/datastore
yassinsws Feb 22, 2024
05490f2
fix lintin
yassinsws Feb 22, 2024
e08aac6
Merge remote-tracking branch 'origin/feature/datastore' into feature/…
yassinsws Feb 22, 2024
a29a44b
add a return statement to unzip
yassinsws Feb 22, 2024
0c96395
Add image recognition for Ollama, GPT4V and image generation for Dall-E
Hialus Mar 6, 2024
e9874b9
Solved requirements problem ( removed olama for now as weaviate needs…
yassinsws Mar 17, 2024
3a186c9
fixed requirements file
yassinsws Mar 17, 2024
379550b
fixed message interpretation function in the llm class
yassinsws Mar 17, 2024
0f6e576
Added detail parameter to image_interpretation model
yassinsws Mar 18, 2024
a4186c3
renamed pyris_image to iris_image
yassinsws Mar 18, 2024
9f2848e
Update app/content_service/Ingestion/lectures_ingestion.py
yassinsws Mar 18, 2024
224a701
Update app/content_service/Retrieval/abstract_retrieval.py
yassinsws Mar 18, 2024
93a2f44
Update app/content_service/Ingestion/repository_ingestion.py
yassinsws Mar 18, 2024
bca6377
Update app/content_service/Ingestion/lectures_ingestion.py
yassinsws Mar 18, 2024
6e9525d
Update app/content_service/Retrieval/lecture_retrieval.py
yassinsws Mar 18, 2024
bc97236
erase old lecture download files
yassinsws Mar 18, 2024
b0291b1
Add a function to get lectures from Artemis
yassinsws Mar 18, 2024
7211386
Update app/content_service/get_lecture_from_artemis.py
yassinsws Mar 24, 2024
738e7a0
refractor tutor pipeline
yassinsws Mar 24, 2024
303f6d4
Lecture content first draft ready for review
yassinsws Mar 25, 2024
1dfd03a
Black
yassinsws Mar 25, 2024
f3b3c93
Black prompt and update database link
yassinsws Mar 29, 2024
34bc567
Lecture chat pipeline works just fine
yassinsws Mar 31, 2024
67d6bd0
Black
yassinsws Mar 31, 2024
2fe64ab
flake8
yassinsws Mar 31, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
WEAVIATE_HOST=
WEAVIATE_PORT=
Empty file.
29 changes: 29 additions & 0 deletions app/content_service/Ingestion/abstract_ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from abc import ABC, abstractmethod
from typing import List, Dict


class AbstractIngestion(ABC):
"""
Abstract class for ingesting repositories into a database.
"""

@abstractmethod
def chunk_data(self, path: str) -> List[Dict[str, str]]:
"""
Abstract method to chunk code files in the root directory.
"""
pass

@abstractmethod
def ingest(self, path: str) -> bool:
"""
Abstract method to ingest repositories into the database.
"""
pass

@abstractmethod
def update(self, path: str):
"""
Abstract method to update a repository in the database.
"""
pass
73 changes: 73 additions & 0 deletions app/content_service/Ingestion/lectures_ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import base64
from typing import Dict
import fitz
import weaviate
from app.vector_database.lectureschema import init_lecture_schema, LectureSchema
from ..Ingestion.abstract_ingestion import AbstractIngestion
from app.llm import BasicRequestHandler


class LectureIngestion(AbstractIngestion): # Inherits from the abstract class

def __init__(self, client: weaviate.WeaviateClient):
self.collection = init_lecture_schema(client)

def chunk_data(self, lecture_path: str):
doc = fitz.open(lecture_path) # Explicitly annotate as an Iterable of fitz.Page
data = []
for page_num in range(doc.page_count):
page = doc.load_page(page_num)
# Check if the page has images
if page.get_images(full=True):
# Render the page to an image (pixmap)
pix = page.get_pixmap()
# Convert the pixmap to bytes
img_bytes = pix.tobytes("png")
# Encode the bytes to Base64 and then decode to a string
img_base64 = base64.b64encode(img_bytes).decode("utf-8")
page_content = page.get_text()
data.append(
{
LectureSchema.PAGE_TEXT_CONTENT: page_content,
LectureSchema.PAGE_IMAGE_DESCRIPTION: "", # image_interpretation,
LectureSchema.PAGE_NUMBER: page_num + 1,
LectureSchema.LECTURE_NAME: lecture_path,
LectureSchema.PAGE_BASE64: img_base64,
}
)

else:
page_content = page.get_text()
data.append(
{
LectureSchema.PAGE_TEXT_CONTENT: page_content,
LectureSchema.PAGE_IMAGE_DESCRIPTION: "",
LectureSchema.PAGE_NUMBER: page_num + 1,
LectureSchema.LECTURE_NAME: lecture_path,
LectureSchema.PAGE_BASE64: "",
}
)
return data

def ingest(self, lecture_path, embedding_model: BasicRequestHandler = None) -> bool:
"""
Ingest the repositories into the weaviate database
"""
chunks = self.chunk_data(lecture_path)
with self.collection.batch.dynamic() as batch:
for index, chunk in enumerate(chunks):
# embed the
embed_chunk = embedding_model.embed(
chunk[1][LectureSchema.PAGE_TEXT_CONTENT]
+ "\n"
+ chunk[1][LectureSchema.PAGE_IMAGE_DESCRIPTION]
)
batch.add_object(properties=chunk, vector=embed_chunk)
return True

def update(self, lecture: Dict[str, str]):
"""
Update a lecture in the weaviate database
"""
# Implement update logic here or raise NotImplementedError if not applicable
pass
92 changes: 92 additions & 0 deletions app/content_service/Ingestion/repository_ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import os
from abc import ABC

import weaviate
from langchain.text_splitter import (
Language,
RecursiveCharacterTextSplitter,
)

from app.llm import BasicRequestHandler
from app.llm.langchain.iris_langchain_embedding_model import IrisLangchainEmbeddingModel
from app.vector_database.repository_schema import (
init_repository_schema,
RepositorySchema,
)
from ..Ingestion.abstract_ingestion import AbstractIngestion

CHUNKSIZE = 512
OVERLAP = 51


def split_code(code: str, language: Language, chunk_size: int, chunk_overlap: int):
"""
Split the code into chunks of 1500 characters with an overlap of 100 characters
"""
python_splitter = RecursiveCharacterTextSplitter.from_language(
language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
return python_splitter.create_documents([code])


class RepositoryIngestion(AbstractIngestion, ABC):
"""
Ingest the repositories into the weaviate database
"""

def __init__(self, client: weaviate.WeaviateClient):
self.collection = init_repository_schema(client)
self.request_handler = BasicRequestHandler("gpt35")
self.iris_embedding_model = IrisLangchainEmbeddingModel(self.request_handler)

def chunk_files(self, path: str):
"""
Chunk the code files in the root directory
"""
files_contents = []
for directory_path, subdir, files in os.walk(path):
for filename in files:
if filename.endswith(".java"):
file_path = os.path.join(directory_path, filename)
with open(file_path, "r") as file:
code = file.read()
files_contents.append(
{
RepositorySchema.FILEPATH: filename,
RepositorySchema.CONTENT: code,
}
)
for file in files_contents:
chunks = split_code(
file[RepositorySchema.CONTENT], Language.JAVA, CHUNKSIZE, OVERLAP
)
for chunk in chunks:
files_contents.append(
{
RepositorySchema.CONTENT: chunk.page_content,
RepositorySchema.COURSE_ID: "tbd",
RepositorySchema.EXERCISE_ID: "tbd",
RepositorySchema.REPOSITORY_ID: "tbd",
RepositorySchema.FILEPATH: file[RepositorySchema.FILEPATH],
}
)
return files_contents

def ingest(self, repo_path: str) -> bool:
"""
Ingest the repositories into the weaviate database
"""
chunks = self.chunk_files(repo_path)
with self.collection.batch.dynamic() as batch:
for index, chunk in enumerate(chunks):
embed_chunk = self.iris_embedding_model.embed_query(
chunk[1][RepositorySchema.CONTENT]
)
batch.add_object(properties=chunk, vector=embed_chunk)
return True

def update(self, repository: dict[str, str]): # this is most likely not necessary
"""
Update the repository in the weaviate database
"""
pass
Empty file.
15 changes: 15 additions & 0 deletions app/content_service/Retrieval/abstract_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from abc import ABC, abstractmethod
from typing import List


class AbstractRetrieval(ABC):
"""
Abstract class for retrieving data from a database.
"""

@abstractmethod
def retrieve(self, path: str, hybrid_factor: float) -> List[str]:
"""
Abstract method to ingest repositories into the database.
"""
pass
38 changes: 38 additions & 0 deletions app/content_service/Retrieval/lecture_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from abc import ABC
from typing import List

import weaviate
import weaviate.classes as wvc

from app.vector_database.lectureschema import init_lecture_schema, LectureSchema
from ..Retrieval.abstract_retrieval import AbstractRetrieval


class LectureRetrieval(AbstractRetrieval, ABC):
"""
Class for retrieving lecture data from the database.
"""

def __init__(self, client: weaviate.WeaviateClient):
self.collection = init_lecture_schema(client)

def retrieve(
self,
user_message: str,
hybrid_factor: float,
lecture_id: int = None,
embedding_vector: [float] = None,
) -> List[dict]:
response = self.collection.query.hybrid(
query=user_message,
limit=3,
filters=(
wvc.query.Filter.by_property(LectureSchema.LECTURE_ID).equal(lecture_id)
if lecture_id
else None
),
alpha=hybrid_factor,
vector=embedding_vector,
)
relevant_chunks = [obj.properties for obj in response.objects]
return relevant_chunks
44 changes: 44 additions & 0 deletions app/content_service/Retrieval/repositories_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import json
from typing import List

import weaviate

from ...vector_database.repository_schema import (
RepositorySchema,
init_repository_schema,
)

from ..Retrieval.abstract_retrieval import AbstractRetrieval

import weaviate.classes as wvc


class RepositoryRetrieval(AbstractRetrieval):
"""
Class for Retrieving vector_database for from the database.
"""

def __init__(self, client: weaviate.WeaviateClient):
self.collection = init_repository_schema(client)

def retrieve(self, user_message: str, repository_id: int = None) -> List[str]:
response = self.collection.query.near_text(
near_text=user_message,
filters=(
wvc.query.Filter.by_property(RepositorySchema.REPOSITORY_ID).equal(
repository_id
)
if repository_id
else None
),
return_properties=[
RepositorySchema.REPOSITORY_ID,
RepositorySchema.COURSE_ID,
RepositorySchema.CONTENT,
RepositorySchema.EXERCISE_ID,
RepositorySchema.FILEPATH,
],
limit=5,
)
print(json.dumps(response, indent=2))
return response
Empty file.
22 changes: 22 additions & 0 deletions app/content_service/get_lecture_from_artemis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import requests
import tempfile

DOWNLOAD_BUFFER_SIZE = 8 * 1024


def download_lecture_pdf(base_url: str, unit_id: int) -> tempfile.NamedTemporaryFile:
"""
Download a single lecture unit from Artemis
"""
artemis_url = f"{base_url}/api/v1/public/pyris/data/lecture-units/{unit_id}/pdf"
response = requests.get(artemis_url, stream=True)
if response.status_code != 200:
raise ConnectionError(
f"Failed to download the file. Status code: {response.status_code}, URL: {artemis_url}"
)

with tempfile.NamedTemporaryFile() as temp_file:
for chunk in response.iter_content(chunk_size=DOWNLOAD_BUFFER_SIZE):
if chunk:
temp_file.write(chunk)
return temp_file
10 changes: 3 additions & 7 deletions app/llm/langchain/iris_langchain_embedding_model.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
from typing import List, Any

from typing import List
from langchain_core.embeddings import Embeddings

from ...llm import RequestHandler


class IrisLangchainEmbeddingModel(Embeddings):
"""Custom langchain embedding for our own request handler"""

request_handler: RequestHandler

def __init__(self, request_handler: RequestHandler, **kwargs: Any) -> None:
super().__init__(request_handler=request_handler, **kwargs)
def __init__(self, request_handler: RequestHandler) -> None:
self.request_handler = request_handler

def embed_documents(self, texts: List[str]) -> List[List[float]]:
return [self.embed_query(text) for text in texts]
Expand Down
Loading
Loading