
Commit

Solved requirements problem (removed ollama for now, as weaviate needs an httpx version >= 0.26 while ollama needs a version >= 0.25.2 and < 0.26).

Finished ingestion and retrieval classes for the lectures.
Added hybrid search instead of normal semantic search (see the sketch below).
yassinsws committed Mar 17, 2024
2 parents a29a44b + d671a29 commit e9874b9
Showing 113 changed files with 2,224 additions and 480 deletions.
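The hybrid search mentioned in the commit message is not visible in the files shown below (the retrieval class sits in the part of the diff that is not rendered here). Purely as an illustration, a Weaviate hybrid query (which blends BM25 keyword scoring with vector similarity) might look like the following sketch; the collection name, property names, `alpha` value, and query text are assumptions, not taken from this commit.

```python
import weaviate

# Minimal sketch of a hybrid query with the Weaviate v4 Python client.
# Collection and property names are hypothetical; adjust them to the real schema.
client = weaviate.connect_to_local()
try:
    lectures = client.collections.get("LectureSlides")  # assumed collection name
    response = lectures.query.hybrid(
        query="What is dynamic programming?",  # keyword (BM25) side of the query
        alpha=0.75,  # 0 = pure keyword search, 1 = pure vector search
        limit=3,
        # If the collection stores manually supplied vectors (as the ingestion
        # below does), a precomputed query vector can be passed via `vector=`.
    )
    for obj in response.objects:
        print(obj.properties)
finally:
    client.close()
```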
15 changes: 15 additions & 0 deletions .github/labeler.yml
@@ -1,3 +1,18 @@
"component:LLM":
- changed-files:
- any-glob-to-any-file: app/llm/**
"component:Pipeline":
- changed-files:
- any-glob-to-any-file: app/pipeline/**
"component:FastAPI":
- changed-files:
- any-glob-to-any-file: app/web/**
"component:Domain":
- changed-files:
- any-glob-to-any-file: app/domain/**
"component:Docker":
- changed-files:
- any-glob-to-any-file: docker/**
"component:CI/CD":
- changed-files:
- any-glob-to-any-file: .github/**
71 changes: 71 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,71 @@
name: Build

on:
  pull_request:
    paths-ignore:
      - 'README.md'
      - 'LICENSE'
      - '.github/**'
      - '!.github/workflows/build.yml'
  push:
    branches:
      - main
    tags: '[0-9]+.[0-9]+.[0-9]+'
    paths-ignore:
      - 'README.md'
      - 'LICENSE'
      - '.github/**'
      - '!.github/workflows/build.yml'
  release:
    types:
      - created

jobs:
  docker:
    name: Build and Push Docker Image
    if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == 'ls1intum/Pyris' }}
    runs-on: ubuntu-latest
    steps:
      - name: Compute Tag
        uses: actions/github-script@v6
        id: compute-tag
        with:
          result-encoding: string
          script: |
            if (context.eventName === "pull_request") {
              return "pr-" + context.issue.number;
            }
            if (context.eventName === "release") {
              return "latest";
            }
            if (context.eventName === "push") {
              if (context.ref.startsWith("refs/tags/")) {
                return context.ref.slice(10);
              }
              if (context.ref === "refs/heads/main") {
                return "latest";
              }
            }
            return "FALSE";
      - uses: actions/checkout@v3
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      # Build and Push to GitHub Container Registry
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v2
        if: ${{ steps.compute-tag.outputs.result != 'FALSE' }}
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and Push to GitHub Container Registry
        uses: docker/build-push-action@v4
        if: ${{ steps.compute-tag.outputs.result != 'FALSE' }}
        with:
          platforms: amd64, arm64
          file: ./Dockerfile
          context: .
          tags: ghcr.io/ls1intum/pyris:${{ steps.compute-tag.outputs.result }}
          push: true
10 changes: 10 additions & 0 deletions .gitignore
@@ -1,3 +1,13 @@
#######################
# Custom rules #
#######################
application.local.yml
llm_config.local.yml


########################
# Auto-generated rules #
########################
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -10,4 +10,4 @@
    rev: v2.0.0
    hooks:
      - id: flake8
        language_version: python3.12
        language_version: python3.12
17 changes: 17 additions & 0 deletions Dockerfile
@@ -0,0 +1,17 @@
# Dockerfile to build a container image for a Python 3.12 FastAPI application
FROM python:3.12-slim

# Set the working directory in the container
WORKDIR /app

# Copy the dependencies file to the working directory
COPY requirements.txt .

# Install any dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the content of the local src directory to the working directory
COPY app/ ./app

# Specify the command to run on container start
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
17 changes: 16 additions & 1 deletion README.MD
@@ -1 +1,16 @@
# Pyris V2
# Pyris V2
## With local environment

### Setup
- Check python version: `python --version` (should be 3.12)
- Install packages: `pip install -r requirements.txt`

### Run server
- Run server:
```bash
APPLICATION_YML_PATH=<path-to-your-application-yml-file> LLM_CONFIG_PATH=<path-to-your-llm-config-yml> uvicorn app.main:app --reload
```
- Access API docs: http://localhost:8000/docs
## With docker
TBD
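Returning to the local-environment section of the README above: since the commit message pins httpx as a dependency, a minimal smoke test against the running server could look like this sketch (the /docs URL is the one listed in the README):

```python
import httpx

# Quick check that the locally started uvicorn server is reachable.
response = httpx.get("http://localhost:8000/docs")
print(response.status_code)  # expect 200 once the server is up
```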
6 changes: 5 additions & 1 deletion app/common/__init__.py
@@ -1 +1,5 @@
from common.singleton import Singleton
from ..common.singleton import Singleton
from ..common.message_converters import (
    convert_iris_message_to_langchain_message,
    convert_langchain_message_to_iris_message,
)
45 changes: 45 additions & 0 deletions app/common/custom_exceptions.py
@@ -0,0 +1,45 @@
from fastapi import HTTPException, status


class RequiresAuthenticationException(HTTPException):
    def __init__(self):
        super().__init__(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail={
                "type": "not_authenticated",
                "errorMessage": "Requires authentication",
            },
        )


class PermissionDeniedException(HTTPException):
    def __init__(self):
        super().__init__(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={
                "type": "not_authorized",
                "errorMessage": "Permission denied",
            },
        )


class PipelineInvocationError(HTTPException):
    def __init__(self):
        super().__init__(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail={
                "type": "bad_request",
                "errorMessage": "Cannot invoke pipeline",
            },
        )


class PipelineNotFoundException(HTTPException):
    def __init__(self):
        super().__init__(
            status_code=status.HTTP_404_NOT_FOUND,
            detail={
                "type": "pipeline_not_found",
                "errorMessage": "Pipeline not found",
            },
        )
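A short sketch of how these exceptions might be raised from a FastAPI dependency; the `validate_token` dependency and the hard-coded token set are hypothetical and only illustrate the intended use:

```python
from fastapi import Depends, FastAPI, Header

from app.common.custom_exceptions import (
    PermissionDeniedException,
    RequiresAuthenticationException,
)

app = FastAPI()

# Hypothetical set of accepted tokens; the real app reads them from its configuration.
VALID_TOKENS = {"secret-token"}


def validate_token(authorization: str = Header(default=None)) -> None:
    """Reject requests with a missing or unknown Authorization header."""
    if authorization is None:
        raise RequiresAuthenticationException()
    if authorization not in VALID_TOKENS:
        raise PermissionDeniedException()


@app.get("/health", dependencies=[Depends(validate_token)])
def health() -> dict:
    return {"status": "ok"}
```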
28 changes: 28 additions & 0 deletions app/common/message_converters.py
@@ -0,0 +1,28 @@
from langchain_core.messages import BaseMessage
from ..domain.iris_message import IrisMessage, IrisMessageRole


def convert_iris_message_to_langchain_message(iris_message: IrisMessage) -> BaseMessage:
    match iris_message.role:
        case IrisMessageRole.USER:
            role = "human"
        case IrisMessageRole.ASSISTANT:
            role = "ai"
        case IrisMessageRole.SYSTEM:
            role = "system"
        case _:
            raise ValueError(f"Unknown message role: {iris_message.role}")
    return BaseMessage(content=iris_message.text, type=role)


def convert_langchain_message_to_iris_message(base_message: BaseMessage) -> IrisMessage:
    match base_message.type:
        case "human":
            role = IrisMessageRole.USER
        case "ai":
            role = IrisMessageRole.ASSISTANT
        case "system":
            role = IrisMessageRole.SYSTEM
        case _:
            raise ValueError(f"Unknown message type: {base_message.type}")
    return IrisMessage(text=base_message.content, role=role)
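A quick round-trip example, assuming the `IrisMessage(text=..., role=...)` constructor used above:

```python
from app.common.message_converters import (
    convert_iris_message_to_langchain_message,
    convert_langchain_message_to_iris_message,
)
from app.domain.iris_message import IrisMessage, IrisMessageRole

# Convert an Iris message to a LangChain BaseMessage and back again.
iris_msg = IrisMessage(text="Explain recursion, please.", role=IrisMessageRole.USER)
langchain_msg = convert_iris_message_to_langchain_message(iris_msg)
assert langchain_msg.type == "human"

round_tripped = convert_langchain_message_to_iris_message(langchain_msg)
assert round_tripped.role == IrisMessageRole.USER
assert round_tripped.text == iris_msg.text
```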
36 changes: 36 additions & 0 deletions app/config.py
@@ -0,0 +1,36 @@
import os
from pathlib import Path
from pydantic import BaseModel
import yaml


class APIKeyConfig(BaseModel):
    token: str


class Settings(BaseModel):
    api_keys: list[APIKeyConfig]

    @classmethod
    def get_settings(cls):
        """Get the settings from the configuration file."""
        file_path_env = os.environ.get("APPLICATION_YML_PATH")
        if not file_path_env:
            raise EnvironmentError(
                "APPLICATION_YML_PATH environment variable is not set."
            )

        file_path = Path(file_path_env)
        try:
            with open(file_path, "r") as file:
                settings_file = yaml.safe_load(file)
                return cls.parse_obj(settings_file)
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"Configuration file not found at {file_path}."
            ) from e
        except yaml.YAMLError as e:
            raise yaml.YAMLError(f"Error parsing YAML file at {file_path}.") from e


settings = Settings.get_settings()
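For reference, a sketch of the YAML shape that `APPLICATION_YML_PATH` (for example the `application.local.yml` listed in `.gitignore`) has to point to. The models are redeclared here only so the snippet stands alone; the token value is a placeholder:

```python
import yaml
from pydantic import BaseModel


class APIKeyConfig(BaseModel):
    token: str


class Settings(BaseModel):
    api_keys: list[APIKeyConfig]


# Inline YAML mirroring what application.local.yml is expected to contain.
EXAMPLE_YAML = """
api_keys:
  - token: "replace-me-with-a-real-token"
"""

settings = Settings.parse_obj(yaml.safe_load(EXAMPLE_YAML))
print(settings.api_keys[0].token)
```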
2 changes: 1 addition & 1 deletion app/content_service/Ingestion/abstract_ingestion.py
@@ -8,7 +8,7 @@ class AbstractIngestion(ABC):
"""

@abstractmethod
def chunk_files(self, path: str) -> List[Dict[str, str]]:
def chunk_data(self, path: str) -> List[Dict[str, str]]:
"""
Abstract method to chunk code files in the root directory.
"""
67 changes: 57 additions & 10 deletions app/content_service/Ingestion/lectures_ingestion.py
@@ -1,25 +1,72 @@
import base64
from typing import Dict
import fitz
import weaviate

from app.vector_repository.lecture_schema import init_schema
from app.vector_database.lectureschema import init_lecture_schema, LectureSchema
from content_service.Ingestion.abstract_ingestion import AbstractIngestion
from app.llm import BasicRequestHandler


class LectureIngestion(AbstractIngestion): # Inherits from the abstract class

    def __init__(self, client: weaviate.WeaviateClient):
        self.collection = init_schema(client)
        self.collection = init_lecture_schema(client)

    def chunk_files(self, path: str):
        # Implement chunking logic here or raise NotImplementedError if not applicable
        pass
    def chunk_data(self, lecture_path: str):
        doc = fitz.open(lecture_path)  # Open the lecture PDF with PyMuPDF
        data = []
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            # Check if the page has images
            if page.get_images(full=True):
                # Render the page to an image (pixmap)
                pix = page.get_pixmap()
                # Convert the pixmap to bytes
                img_bytes = pix.tobytes("png")
                # Encode the bytes to Base64 and then decode to a string
                img_base64 = base64.b64encode(img_bytes).decode("utf-8")
                # image_interpretation = llm.interpret_image(img_base64, last_page_content)
                last_page_content = page.get_text()
                data.append(
                    {
                        LectureSchema.PAGE_TEXT_CONTENT: last_page_content,
                        LectureSchema.PAGE_IMAGE_DESCRIPTION: "",  # image_interpretation,
                        LectureSchema.PAGE_NUMBER: page_num + 1,
                        LectureSchema.LECTURE_NAME: lecture_path,
                        LectureSchema.PAGE_BASE64: img_base64,
                    }
                )

    def ingest(self, lecture_path) -> bool:
            else:
                last_page_content = page.get_text()
                data.append(
                    {
                        LectureSchema.PAGE_TEXT_CONTENT: last_page_content,
                        LectureSchema.PAGE_IMAGE_DESCRIPTION: "",
                        LectureSchema.PAGE_NUMBER: page_num + 1,
                        LectureSchema.LECTURE_NAME: lecture_path,
                        LectureSchema.PAGE_BASE64: "",
                    }
                )
        return data

    def ingest(self, lecture_path, embedding_model: BasicRequestHandler = None) -> bool:
        """
        Ingest the lectures into the weaviate database
        Ingest the repositories into the weaviate database
        """
        # Implement ingestion logic here
        pass
        chunks = self.chunk_data(lecture_path)
        with self.collection.batch.dynamic() as batch:
            for chunk in chunks:
                # Embed the concatenated page text and image description
                embed_chunk = embedding_model.embed(
                    chunk[LectureSchema.PAGE_TEXT_CONTENT]
                    + "\n"
                    + chunk[LectureSchema.PAGE_IMAGE_DESCRIPTION]
                )
                batch.add_object(properties=chunk, vector=embed_chunk)
        return True

    def update(self, lecture: Dict[str, str]):
        """
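For context, a hypothetical usage sketch of the ingestion class above. The way `BasicRequestHandler` is constructed (with a model identifier string) and the local Weaviate connection are assumptions that do not appear in this diff:

```python
import weaviate

from app.content_service.Ingestion.lectures_ingestion import LectureIngestion
from app.llm import BasicRequestHandler

# Hypothetical wiring: connect to a local Weaviate instance, pick an embedding
# model through BasicRequestHandler, and ingest a single lecture PDF.
client = weaviate.connect_to_local()
try:
    embedding_model = BasicRequestHandler("embedding-model")  # assumed constructor argument
    ingestion = LectureIngestion(client)
    ingestion.ingest("lectures/intro_to_algorithms.pdf", embedding_model=embedding_model)
finally:
    client.close()
```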
