
Commit

Solved requirements problem (removed ollama for now, as weaviate needs an httpx version >= 0.26 while ollama needs a version >= 0.25.2 and < 0.26).

Finished ingestion and retrieval classes for the lectures.
Added hybrid search instead of normal semantic search (see the sketch below).
yassinsws committed Mar 17, 2024
2 parents a29a44b + d671a29 commit e9874b9
Showing 113 changed files with 2,224 additions and 480 deletions.
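The hybrid search mentioned in the commit message is not visible in the files shown below (the retrieval class sits in the part of the diff that is not rendered here). Purely as an illustration, a Weaviate hybrid query (which blends BM25 keyword scoring with vector similarity) might look like the following sketch; the collection name, property names, `alpha` value, and query text are assumptions, not taken from this commit.

```python
import weaviate

# Minimal sketch of a hybrid query with the Weaviate v4 Python client.
# Collection and property names are hypothetical; adjust them to the real schema.
client = weaviate.connect_to_local()
try:
    lectures = client.collections.get("LectureSlides")  # assumed collection name
    response = lectures.query.hybrid(
        query="What is dynamic programming?",  # keyword (BM25) side of the query
        alpha=0.75,  # 0 = pure keyword search, 1 = pure vector search
        limit=3,
        # If the collection stores manually supplied vectors (as the ingestion
        # below does), a precomputed query vector can be passed via `vector=`.
    )
    for obj in response.objects:
        print(obj.properties)
finally:
    client.close()
```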
15 changes: 15 additions & 0 deletions .github/labeler.yml
@@ -1,3 +1,18 @@
"component:LLM":
- changed-files:
- any-glob-to-any-file: app/llm/**
"component:Pipeline":
- changed-files:
- any-glob-to-any-file: app/pipeline/**
"component:FastAPI":
- changed-files:
- any-glob-to-any-file: app/web/**
"component:Domain":
- changed-files:
- any-glob-to-any-file: app/domain/**
"component:Docker":
- changed-files:
- any-glob-to-any-file: docker/**
"component:CI/CD":
- changed-files:
- any-glob-to-any-file: .github/**
71 changes: 71 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,71 @@
name: Build

on:
  pull_request:
    paths-ignore:
      - 'README.md'
      - 'LICENSE'
      - '.github/**'
      - '!.github/workflows/build.yml'
  push:
    branches:
      - main
    tags: '[0-9]+.[0-9]+.[0-9]+'
    paths-ignore:
      - 'README.md'
      - 'LICENSE'
      - '.github/**'
      - '!.github/workflows/build.yml'
  release:
    types:
      - created

jobs:
  docker:
    name: Build and Push Docker Image
    if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == 'ls1intum/Pyris' }}
    runs-on: ubuntu-latest
    steps:
      - name: Compute Tag
        uses: actions/github-script@v6
        id: compute-tag
        with:
          result-encoding: string
          script: |
            if (context.eventName === "pull_request") {
              return "pr-" + context.issue.number;
            }
            if (context.eventName === "release") {
              return "latest";
            }
            if (context.eventName === "push") {
              if (context.ref.startsWith("refs/tags/")) {
                return context.ref.slice(10);
              }
              if (context.ref === "refs/heads/main") {
                return "latest";
              }
            }
            return "FALSE";
      - uses: actions/checkout@v3
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      # Build and Push to GitHub Container Registry
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v2
        if: ${{ steps.compute-tag.outputs.result != 'FALSE' }}
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and Push to GitHub Container Registry
        uses: docker/build-push-action@v4
        if: ${{ steps.compute-tag.outputs.result != 'FALSE' }}
        with:
          platforms: amd64, arm64
          file: ./Dockerfile
          context: .
          tags: ghcr.io/ls1intum/pyris:${{ steps.compute-tag.outputs.result }}
          push: true
10 changes: 10 additions & 0 deletions .gitignore
@@ -1,3 +1,13 @@
#######################
# Custom rules #
#######################
application.local.yml
llm_config.local.yml


########################
# Auto-generated rules #
########################
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -10,4 +10,4 @@
    rev: v2.0.0
    hooks:
      - id: flake8
        language_version: python3.12
        language_version: python3.12
17 changes: 17 additions & 0 deletions Dockerfile
@@ -0,0 +1,17 @@
# Dockerfile to build a container image for a Python 3.12 FastAPI application
FROM python:3.12-slim

# Set the working directory in the container
WORKDIR /app

# Copy the dependencies file to the working directory
COPY requirements.txt .

# Install any dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the content of the local src directory to the working directory
COPY app/ ./app

# Specify the command to run on container start
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
17 changes: 16 additions & 1 deletion README.MD
@@ -1 +1,16 @@
# Pyris V2
# Pyris V2
## With local environment

### Setup
- Check python version: `python --version` (should be 3.12)
- Install packages: `pip install -r requirements.txt`

### Run server
- Run server:
```bash
APPLICATION_YML_PATH=<path-to-your-application-yml-file> LLM_CONFIG_PATH=<path-to-your-llm-config-yml> uvicorn app.main:app --reload
```
- Access API docs: http://localhost:8000/docs
## With docker
TBD
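Returning to the local-environment section of the README above: since the commit message pins httpx as a dependency, a minimal smoke test against the running server could look like this sketch (the /docs URL is the one listed in the README):

```python
import httpx

# Quick check that the locally started uvicorn server is reachable.
response = httpx.get("http://localhost:8000/docs")
print(response.status_code)  # expect 200 once the server is up
```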
6 changes: 5 additions & 1 deletion app/common/__init__.py
@@ -1 +1,5 @@
from common.singleton import Singleton
from ..common.singleton import Singleton
from ..common.message_converters import (
    convert_iris_message_to_langchain_message,
    convert_langchain_message_to_iris_message,
)
45 changes: 45 additions & 0 deletions app/common/custom_exceptions.py
@@ -0,0 +1,45 @@
from fastapi import HTTPException, status


class RequiresAuthenticationException(HTTPException):
    def __init__(self):
        super().__init__(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail={
                "type": "not_authenticated",
                "errorMessage": "Requires authentication",
            },
        )


class PermissionDeniedException(HTTPException):
    def __init__(self):
        super().__init__(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={
                "type": "not_authorized",
                "errorMessage": "Permission denied",
            },
        )


class PipelineInvocationError(HTTPException):
    def __init__(self):
        super().__init__(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail={
                "type": "bad_request",
                "errorMessage": "Cannot invoke pipeline",
            },
        )


class PipelineNotFoundException(HTTPException):
    def __init__(self):
        super().__init__(
            status_code=status.HTTP_404_NOT_FOUND,
            detail={
                "type": "pipeline_not_found",
                "errorMessage": "Pipeline not found",
            },
        )
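A short sketch of how these exceptions might be raised from a FastAPI dependency; the `validate_token` dependency and the hard-coded token set are hypothetical and only illustrate the intended use:

```python
from fastapi import Depends, FastAPI, Header

from app.common.custom_exceptions import (
    PermissionDeniedException,
    RequiresAuthenticationException,
)

app = FastAPI()

# Hypothetical set of accepted tokens; the real app reads them from its configuration.
VALID_TOKENS = {"secret-token"}


def validate_token(authorization: str = Header(default=None)) -> None:
    """Reject requests with a missing or unknown Authorization header."""
    if authorization is None:
        raise RequiresAuthenticationException()
    if authorization not in VALID_TOKENS:
        raise PermissionDeniedException()


@app.get("/health", dependencies=[Depends(validate_token)])
def health() -> dict:
    return {"status": "ok"}
```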
28 changes: 28 additions & 0 deletions app/common/message_converters.py
@@ -0,0 +1,28 @@
from langchain_core.messages import BaseMessage
from ..domain.iris_message import IrisMessage, IrisMessageRole


def convert_iris_message_to_langchain_message(iris_message: IrisMessage) -> BaseMessage:
    match iris_message.role:
        case IrisMessageRole.USER:
            role = "human"
        case IrisMessageRole.ASSISTANT:
            role = "ai"
        case IrisMessageRole.SYSTEM:
            role = "system"
        case _:
            raise ValueError(f"Unknown message role: {iris_message.role}")
    return BaseMessage(content=iris_message.text, type=role)


def convert_langchain_message_to_iris_message(base_message: BaseMessage) -> IrisMessage:
    match base_message.type:
        case "human":
            role = IrisMessageRole.USER
        case "ai":
            role = IrisMessageRole.ASSISTANT
        case "system":
            role = IrisMessageRole.SYSTEM
        case _:
            raise ValueError(f"Unknown message type: {base_message.type}")
    return IrisMessage(text=base_message.content, role=role)
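A quick round-trip example, assuming the `IrisMessage(text=..., role=...)` constructor used above:

```python
from app.common.message_converters import (
    convert_iris_message_to_langchain_message,
    convert_langchain_message_to_iris_message,
)
from app.domain.iris_message import IrisMessage, IrisMessageRole

# Convert an Iris message to a LangChain BaseMessage and back again.
iris_msg = IrisMessage(text="Explain recursion, please.", role=IrisMessageRole.USER)
langchain_msg = convert_iris_message_to_langchain_message(iris_msg)
assert langchain_msg.type == "human"

round_tripped = convert_langchain_message_to_iris_message(langchain_msg)
assert round_tripped.role == IrisMessageRole.USER
assert round_tripped.text == iris_msg.text
```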
36 changes: 36 additions & 0 deletions app/config.py
@@ -0,0 +1,36 @@
import os
from pathlib import Path
from pydantic import BaseModel
import yaml


class APIKeyConfig(BaseModel):
    token: str


class Settings(BaseModel):
    api_keys: list[APIKeyConfig]

    @classmethod
    def get_settings(cls):
        """Get the settings from the configuration file."""
        file_path_env = os.environ.get("APPLICATION_YML_PATH")
        if not file_path_env:
            raise EnvironmentError(
                "APPLICATION_YML_PATH environment variable is not set."
            )

        file_path = Path(file_path_env)
        try:
            with open(file_path, "r") as file:
                settings_file = yaml.safe_load(file)
                return cls.parse_obj(settings_file)
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"Configuration file not found at {file_path}."
            ) from e
        except yaml.YAMLError as e:
            raise yaml.YAMLError(f"Error parsing YAML file at {file_path}.") from e


settings = Settings.get_settings()
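For reference, a sketch of the YAML shape that `APPLICATION_YML_PATH` (for example the `application.local.yml` listed in `.gitignore`) has to point to. The models are redeclared here only so the snippet stands alone; the token value is a placeholder:

```python
import yaml
from pydantic import BaseModel


class APIKeyConfig(BaseModel):
    token: str


class Settings(BaseModel):
    api_keys: list[APIKeyConfig]


# Inline YAML mirroring what application.local.yml is expected to contain.
EXAMPLE_YAML = """
api_keys:
  - token: "replace-me-with-a-real-token"
"""

settings = Settings.parse_obj(yaml.safe_load(EXAMPLE_YAML))
print(settings.api_keys[0].token)
```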
2 changes: 1 addition & 1 deletion app/content_service/Ingestion/abstract_ingestion.py
@@ -8,7 +8,7 @@ class AbstractIngestion(ABC):
"""

@abstractmethod
def chunk_files(self, path: str) -> List[Dict[str, str]]:
def chunk_data(self, path: str) -> List[Dict[str, str]]:
"""
Abstract method to chunk code files in the root directory.
"""
67 changes: 57 additions & 10 deletions app/content_service/Ingestion/lectures_ingestion.py
@@ -1,25 +1,72 @@
import base64
from typing import Dict
import fitz
import weaviate

from app.vector_repository.lecture_schema import init_schema
from app.vector_database.lectureschema import init_lecture_schema, LectureSchema
from content_service.Ingestion.abstract_ingestion import AbstractIngestion
from app.llm import BasicRequestHandler


class LectureIngestion(AbstractIngestion): # Inherits from the abstract class

    def __init__(self, client: weaviate.WeaviateClient):
        self.collection = init_schema(client)
        self.collection = init_lecture_schema(client)

    def chunk_files(self, path: str):
        # Implement chunking logic here or raise NotImplementedError if not applicable
        pass
    def chunk_data(self, lecture_path: str):
        doc = fitz.open(lecture_path)  # Open the lecture PDF with PyMuPDF
        data = []
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            # Check if the page has images
            if page.get_images(full=True):
                # Render the page to an image (pixmap)
                pix = page.get_pixmap()
                # Convert the pixmap to bytes
                img_bytes = pix.tobytes("png")
                # Encode the bytes to Base64 and then decode to a string
                img_base64 = base64.b64encode(img_bytes).decode("utf-8")
                # image_interpretation = llm.interpret_image(img_base64, last_page_content)
                last_page_content = page.get_text()
                data.append(
                    {
                        LectureSchema.PAGE_TEXT_CONTENT: last_page_content,
                        LectureSchema.PAGE_IMAGE_DESCRIPTION: "",  # image_interpretation,
                        LectureSchema.PAGE_NUMBER: page_num + 1,
                        LectureSchema.LECTURE_NAME: lecture_path,
                        LectureSchema.PAGE_BASE64: img_base64,
                    }
                )

    def ingest(self, lecture_path) -> bool:
            else:
                last_page_content = page.get_text()
                data.append(
                    {
                        LectureSchema.PAGE_TEXT_CONTENT: last_page_content,
                        LectureSchema.PAGE_IMAGE_DESCRIPTION: "",
                        LectureSchema.PAGE_NUMBER: page_num + 1,
                        LectureSchema.LECTURE_NAME: lecture_path,
                        LectureSchema.PAGE_BASE64: "",
                    }
                )
        return data

    def ingest(self, lecture_path, embedding_model: BasicRequestHandler = None) -> bool:
        """
        Ingest the lectures into the weaviate database
        Ingest the repositories into the weaviate database
        """
        # Implement ingestion logic here
        pass
        chunks = self.chunk_data(lecture_path)
        with self.collection.batch.dynamic() as batch:
            for chunk in chunks:
                # Embed the concatenated page text and image description
                embed_chunk = embedding_model.embed(
                    chunk[LectureSchema.PAGE_TEXT_CONTENT]
                    + "\n"
                    + chunk[LectureSchema.PAGE_IMAGE_DESCRIPTION]
                )
                batch.add_object(properties=chunk, vector=embed_chunk)
        return True

    def update(self, lecture: Dict[str, str]):
        """
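For context, a hypothetical usage sketch of the ingestion class above. The way `BasicRequestHandler` is constructed (with a model identifier string) and the local Weaviate connection are assumptions that do not appear in this diff:

```python
import weaviate

from app.content_service.Ingestion.lectures_ingestion import LectureIngestion
from app.llm import BasicRequestHandler

# Hypothetical wiring: connect to a local Weaviate instance, pick an embedding
# model through BasicRequestHandler, and ingest a single lecture PDF.
client = weaviate.connect_to_local()
try:
    embedding_model = BasicRequestHandler("embedding-model")  # assumed constructor argument
    ingestion = LectureIngestion(client)
    ingestion.ingest("lectures/intro_to_algorithms.pdf", embedding_model=embedding_model)
finally:
    client.close()
```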
