Merge remote-tracking branch 'origin' into feat/make_modular

QuivrHQ · Dec 26, 2024 · 099780d · 099780d
2 parents eea6cfd + 7b7fb40
commit 099780d
Show file tree

Hide file tree

Showing 79 changed files with 1,207 additions and 390 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -9,8 +9,11 @@ env:
 
 jobs:
   test:
-    name: Run tests
+    name: Run tests on Python ${{ matrix.python-version }}
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.11", "3.12"]
     steps:
       - name: 👀 Checkout code
         uses: actions/checkout@v2
@@ -26,7 +29,7 @@ jobs:
       - name: 😭 Install system dependencies
         run: |
           sudo apt-get update && sudo apt-get install -y \
-            netcat \
+            netcat-traditional \
             unzip \
             libgeos-dev \
             libcurl4-openssl-dev \
@@ -46,6 +49,14 @@ jobs:
             libpq-dev \
             pandoc
 
+      - name: 🔽 Install the latest version of rye
+        uses: eifinger/setup-rye@v4
+        with:
+          enable-cache: true
+
+      - name: 📌 Pin Python version
+        run: rye pin ${{ matrix.python-version }}
+
       - name: 🔽 Download and Install NATS Server
         run: |
           curl -L https://github.com/nats-io/nats-server/releases/download/v2.10.22/nats-server-v2.10.22-linux-amd64.zip -o nats-server.zip
@@ -69,12 +80,7 @@ jobs:
             exit 1
           fi
 
-      - name: 🔨 Install the latest version of rye
-        uses: eifinger/setup-rye@v4
-        with:
-          enable-cache: true
-
-      - name: 🔄 Sync dependencies
+      - name: 🔨 Sync dependencies
         run: |
           UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock
 

diff --git a/.github/workflows/build-and-deploy.yml b/.github/workflows/build-and-deploy.yml
@@ -1,4 +1,4 @@
-name: Deploy to Amazon ECS
+name: Build Docker image and push ECR
 
 on:
   push:
@@ -9,7 +9,6 @@ on:
 env:
   AWS_REGION: eu-west-1
   ECR_REPOSITORY: quivrhq/megaparse
-  ECS_SERVICE: megaparse-service
   ECS_CLUSTER: megaparse
   ECS_TASK_DEFINITION: .aws/task_definition.json
   CONTAINER_NAME: megaparse
@@ -19,7 +18,7 @@ permissions:
 
 jobs:
   deploy:
-    name: Deploy
+    name: build docker
     runs-on: ubuntu-latest
     environment: production
     outputs:

diff --git a/.github/workflows/build-gpu.yml b/.github/workflows/build-gpu.yml
@@ -0,0 +1,58 @@
+# Builds the GPU image from Dockerfile.gpu and pushes it to Amazon ECR Public,
+# tagged with the commit SHA and with 'latest'.
+name: Build docker GPU and push ECR
+
+on:
+  push:
+    tags:
+      - "v*"
+    branches: [main]
+
+# NOTE(review): no step below references AWS_REGION, ECS_CLUSTER,
+# ECS_TASK_DEFINITION or CONTAINER_NAME explicitly, and AWS_REGION differs from
+# the us-east-1 region passed to the credentials step - confirm they are needed.
+env:
+  AWS_REGION: eu-west-1
+  ECR_REPOSITORY: quivrhq/megaparse-gpu
+  ECS_CLUSTER: megaparse
+  ECS_TASK_DEFINITION: .aws/task_definition.json
+  CONTAINER_NAME: megaparse
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+    name: Build docker-gpu
+    runs-on: ubuntu-latest
+    environment: production
+    outputs:
+      # Fully qualified reference (registry/repository:sha) of the pushed image.
+      imageoutput: ${{ steps.build-image.outputs.imageoutput }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          # Amazon ECR Public logins go through us-east-1, which is why this
+          # does not reuse env.AWS_REGION.
+          aws-region: us-east-1
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v2
+        with:
+          registry-type: public
+
+      - name: Build, tag, and push image to Amazon ECR
+        id: build-image
+        env:
+          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+          IMAGE_TAG: ${{ github.sha }}
+        run: |
+          # Every expansion is quoted so an empty or unusual value fails
+          # loudly instead of being word-split by the shell.
+          image="$ECR_REGISTRY/$ECR_REPOSITORY"
+
+          # Build a docker container and push it to ECR
+          docker build -t "$image:$IMAGE_TAG" -f Dockerfile.gpu .
+          docker push "$image:$IMAGE_TAG"
+
+          # Tag the image as 'latest' and push
+          docker tag "$image:$IMAGE_TAG" "$image:latest"
+          docker push "$image:latest"
+
+          echo "imageoutput=$image:$IMAGE_TAG" >> "$GITHUB_OUTPUT"
diff --git a/.github/workflows/test-build-docker.yml b/.github/workflows/test-build-docker.yml
@@ -0,0 +1,33 @@
+# Checks on every pull request targeting main that both Dockerfiles still
+# build. Images are built only, never pushed.
+on:
+  pull_request:
+    branches:
+      - main
+
+name: Test build docker
+jobs:
+  build-docker:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        dockerfile: [Dockerfile, Dockerfile.gpu]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # NOTE(review): the build step sets no `platforms`, so emulation looks
+      # unused here - confirm before removing.
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+        with:
+          platforms: all
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build Docker image with caching
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ${{ matrix.dockerfile }}
+          push: false
+          tags: quivrhq/megaparse:${{ matrix.dockerfile }}
+          # One cache scope per Dockerfile: with the shared default scope the
+          # two matrix jobs would overwrite each other's layer cache.
+          cache-from: type=gha,scope=${{ matrix.dockerfile }}
+          cache-to: type=gha,mode=max,scope=${{ matrix.dockerfile }}
diff --git a/.gitignore b/.gitignore
@@ -6,7 +6,7 @@ dist/**
 megaparse.egg-info/
 *.pyc
 build/*
-ENV 
+ENV
 venv
 */evaluations/*
 */cdp/*
@@ -16,6 +16,9 @@ venv
 *.DS_Store
 .tool-versions
 megaparse/sdk/examples/only_pdfs/*
-benchmark/auto/*
-benchmark/hi_res/*
 
+**/profile/
+**/prof/
+.ropeproject/
+benchmark/hi_res/*
+benchmark/auto/*
diff --git a/.release-please-manifest.json b/.release-please-manifest.json
@@ -1,4 +1,4 @@
 {
-  "libs/megaparse": "0.0.48",
-  "libs/megaparse_sdk": "0.1.7"
+  "libs/megaparse": "0.0.52",
+  "libs/megaparse_sdk": "0.1.10"
 }
diff --git a/Dockerfile b/Dockerfile
@@ -34,10 +34,8 @@ COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megapar
 RUN pip install uv
 RUN uv pip install --no-cache --system -r requirements.lock
 
-RUN playwright install --with-deps && \
-    python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
-    python -c "import nltk;nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')" && \
-    python -c "from unstructured.partition.model_init import initialize; initialize()"
+RUN playwright install --with-deps
+# Pre-download the NLTK data the removed step used to fetch. The previous
+# `python3 - -m ...` form read the program from (empty) stdin and did nothing.
+RUN python3 -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
 
 COPY . .
 

diff --git a/Dockerfile.gpu b/Dockerfile.gpu
@@ -0,0 +1,57 @@
+# GPU image: CUDA/cuDNN base with Python 3.11 from the deadsnakes PPA, the
+# locked dependencies, and the megaparse / megaparse_sdk packages.
+FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu20.04
+
+WORKDIR /app
+
+ENV UV_COMPILE_BYTECODE=1
+ENV UV_NO_CACHE=1
+# Keep apt from prompting during the image build.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y software-properties-common && \
+    add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get update && apt-get install -y \
+    python3.11 \
+    python3.11-dev \
+    libgeos-dev \
+    libcurl4-openssl-dev \
+    libssl-dev \
+    binutils \
+    curl \
+    git \
+    autoconf \
+    automake \
+    libtool \
+    python3-pip \
+    build-essential \
+    wget \
+    gcc \
+    # Additional dependencies for document handling
+    libmagic-dev \
+    poppler-utils \
+    tesseract-ocr \
+    libreoffice \
+    libpq-dev \
+    pandoc && \
+    rm -rf /var/lib/apt/lists/* && apt-get clean
+
+# Make `python3` resolve to the 3.11 interpreter installed above.
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
+    update-alternatives --set python3 /usr/bin/python3.11
+
+# Copy only the dependency manifests first so the install layer below is
+# reused when just the source code changes.
+COPY requirements.lock pyproject.toml README.md ./
+COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/
+COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/
+
+# Install uv with its standalone installer and expose it on PATH.
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:$PATH"
+RUN uv pip install --no-cache --system -r requirements.lock
+
+RUN playwright install --with-deps
+
+# Pre-download NLTK data through the stand-alone downloader. The previous
+# `python3 - -m ...` form read the program from (empty) stdin and did nothing.
+RUN python3 -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
+
+# FIXME: causes runtime link issues with onnxruntime_pybind_state.cc:507 unstructured
+# RUN python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
+
+COPY . .
+
+RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk
diff --git a/README.md b/README.md
@@ -25,6 +25,8 @@ https://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a75
 
 ## Installation
 
+Requires Python >= 3.11.
+
 ```bash
 pip install megaparse
 ```
@@ -71,24 +73,6 @@ megaparse.save("./test.md")
 ```
 **Note**: The models supported by MegaParse Vision are the multimodal ones, such as Claude 3.5, Claude 4, GPT-4o and GPT-4.
 
-### (Optional) Use LlamaParse for Improved Results
-
-1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key.
-
-2. Change the parser to LlamaParser
-
-```python
-from megaparse import MegaParse
-from langchain_openai import ChatOpenAI
-from megaparse.parser.llama_parser import LlamaParser
-
-parser = LlamaParser(api_key = os.getenv("LLAMA_CLOUD_API_KEY"))
-megaparse = MegaParse(parser)
-response = megaparse.load("./test.pdf")
-print(response)
-megaparse.save("./test.md") #saves the last processed doc in md format
-```
-
 ## Use as an API
 There is a MakeFile for you, simply use :
 ```make dev```

diff --git a/benchmark/process_time.py b/benchmark/process_single_doc.py
@@ -1,18 +1,20 @@
 import asyncio
-import os
 import time
+from pathlib import Path
 
 import numpy as np
-from megaparse.sdk import MegaParseSDK
+from megaparse import MegaParse
+from megaparse.parser.unstructured_parser import UnstructuredParser
+from megaparse_sdk.schema.parser_config import StrategyEnum
 
+N_TRY = 1
 
-async def process_file(megaparse: MegaParseSDK, file_path):
+
+async def process_file(megaparse: MegaParse, file_path: str | Path):
     try:
         t0 = time.perf_counter()
-        response = await megaparse.file.upload(
+        _ = await megaparse.aload(
             file_path=file_path,
-            method="unstructured",  # type: ignore  # unstructured, llama_parser, megaparse_vision
-            strategy="auto",
         )
         total = time.perf_counter() - t0
         return total
@@ -21,16 +23,12 @@ async def process_file(megaparse: MegaParseSDK, file_path):
         return None
 
 
-async def test_process_folder(folder_path, api_key):
-    import os
-
-    list_process_time = []
-    files = os.listdir(folder_path)
+async def test_process_file(file: str | Path):
+    parser = UnstructuredParser(strategy=StrategyEnum.HI_RES)
+    megaparse = MegaParse(parser=parser)
     task = []
-
-    megaparse = MegaParseSDK(api_key)
-    for file in files:
-        task.append(process_file(megaparse, os.path.join(folder_path, file)))
+    for _ in range(N_TRY):
+        task.append(process_file(megaparse, file))
     list_process_time = await asyncio.gather(*task)
 
     n_errors = sum([t is None for t in list_process_time])
@@ -46,7 +44,5 @@ async def test_process_folder(folder_path, api_key):
 
 
 if __name__ == "__main__":
-    api_key = os.getenv("MEGAPARSE_API_KEY")
-    # folder_path = "megaparse/sdk/examples/only_pdfs"
-    folder_path = "/Users/amine/data/quivr/only_pdfs/"
-    asyncio.run(test_process_folder(folder_path, api_key))
+    folder_path = "/Users/amine/data/quivr/parsing/scanned/machine.pdf"
+    asyncio.run(test_process_file(folder_path))