Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin' into feat/make_modular
Browse files Browse the repository at this point in the history
  • Loading branch information
chloedia committed Dec 26, 2024
2 parents eea6cfd + 7b7fb40 commit 099780d
Show file tree
Hide file tree
Showing 79 changed files with 1,207 additions and 390 deletions.
22 changes: 14 additions & 8 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@ env:

jobs:
test:
name: Run tests
name: Run tests on Python ${{ matrix.python-version }}
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.11", "3.12"]
steps:
- name: 👀 Checkout code
uses: actions/checkout@v2
Expand All @@ -26,7 +29,7 @@ jobs:
- name: 😭 Install system dependencies
run: |
sudo apt-get update && sudo apt-get install -y \
netcat \
netcat-traditional \
unzip \
libgeos-dev \
libcurl4-openssl-dev \
Expand All @@ -46,6 +49,14 @@ jobs:
libpq-dev \
pandoc
- name: 🔽 Install the latest version of rye
uses: eifinger/setup-rye@v4
with:
enable-cache: true

- name: 📌 Pin Python version
run: rye pin ${{ matrix.python-version }}

- name: 🔽 Download and Install NATS Server
run: |
curl -L https://github.com/nats-io/nats-server/releases/download/v2.10.22/nats-server-v2.10.22-linux-amd64.zip -o nats-server.zip
Expand All @@ -69,12 +80,7 @@ jobs:
exit 1
fi
- name: 🔨 Install the latest version of rye
uses: eifinger/setup-rye@v4
with:
enable-cache: true

- name: 🔄 Sync dependencies
- name: 🔨 Sync dependencies
run: |
UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock
Expand Down
5 changes: 2 additions & 3 deletions .github/workflows/build-and-deploy.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Deploy to Amazon ECS
name: Build Docker image and push ECR

on:
push:
Expand All @@ -9,7 +9,6 @@ on:
env:
AWS_REGION: eu-west-1
ECR_REPOSITORY: quivrhq/megaparse
ECS_SERVICE: megaparse-service
ECS_CLUSTER: megaparse
ECS_TASK_DEFINITION: .aws/task_definition.json
CONTAINER_NAME: megaparse
Expand All @@ -19,7 +18,7 @@ permissions:

jobs:
deploy:
name: Deploy
name: build docker
runs-on: ubuntu-latest
environment: production
outputs:
Expand Down
58 changes: 58 additions & 0 deletions .github/workflows/build-gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Builds the GPU Docker image (Dockerfile.gpu) and pushes it to Amazon ECR,
# tagged with both the commit SHA and `latest`.
name: Build docker GPU and push ECR

# Runs on pushes to main and on pushes of tags starting with "v".
on:
  push:
    tags:
      - "v*"
    branches: [main]

env:
  AWS_REGION: eu-west-1
  ECR_REPOSITORY: quivrhq/megaparse-gpu
  ECS_CLUSTER: megaparse
  ECS_TASK_DEFINITION: .aws/task_definition.json
  CONTAINER_NAME: megaparse

permissions:
  contents: read

jobs:
  deploy:
    name: Build docker-gpu
    runs-on: ubuntu-latest
    environment: production
    outputs:
      # Full image reference (registry/repository:sha) written by the build step.
      imageoutput: ${{ steps.build-image.outputs.imageoutput }}

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # NOTE(review): hard-coded and different from env.AWS_REGION (eu-west-1).
          # Presumably intentional because the login below targets the public
          # registry, which authenticates through us-east-1 - confirm.
          aws-region: us-east-1

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1
        with:
          registry-type: public

      - name: Build, tag, and push image to Amazon ECR
        id: build-image
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          IMAGE_TAG: ${{ github.sha }}
        run: |
          # Build a docker container and push it to ECR
          docker build -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" -f Dockerfile.gpu .
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
          # Tag the image as 'latest' and push
          docker tag "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" "$ECR_REGISTRY/$ECR_REPOSITORY:latest"
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:latest"
          # Expose the pushed image reference as this step's `imageoutput` output.
          echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"
33 changes: 33 additions & 0 deletions .github/workflows/test-build-docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Pull-request check: verifies that each Dockerfile still builds.
# Images are built but never pushed.
name: Test build docker

on:
  pull_request:
    branches:
      - main

jobs:
  build-docker:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # One job per Dockerfile (CPU image and GPU image).
        dockerfile: [Dockerfile, Dockerfile.gpu]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
        with:
          platforms: all

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build Docker image with caching
        uses: docker/build-push-action@v4
        with:
          context: .
          file: ${{ matrix.dockerfile }}
          # Build only; nothing is published from pull requests.
          push: false
          # The Dockerfile name doubles as the image tag.
          tags: quivrhq/megaparse:${{ matrix.dockerfile }}
          # Reuse and populate the GitHub Actions layer cache.
          cache-from: type=gha
          cache-to: type=gha,mode=max
9 changes: 6 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dist/**
megaparse.egg-info/
*.pyc
build/*
ENV
ENV
venv
*/evaluations/*
*/cdp/*
Expand All @@ -16,6 +16,9 @@ venv
*.DS_Store
.tool-versions
megaparse/sdk/examples/only_pdfs/*
benchmark/auto/*
benchmark/hi_res/*

**/profile/
**/prof/
.ropeproject/
benchmark/hi_res/*
benchmark/auto/*
4 changes: 2 additions & 2 deletions .release-please-manifest.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"libs/megaparse": "0.0.48",
"libs/megaparse_sdk": "0.1.7"
"libs/megaparse": "0.0.52",
"libs/megaparse_sdk": "0.1.10"
}
6 changes: 2 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,8 @@ COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megapar
RUN pip install uv
RUN uv pip install --no-cache --system -r requirements.lock

RUN playwright install --with-deps && \
python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
python -c "import nltk;nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')" && \
python -c "from unstructured.partition.model_init import initialize; initialize()"
RUN playwright install --with-deps
# Download the NLTK data at build time. `python3 -m` runs the downloader module;
# a bare `-` makes Python read its program from stdin instead, so nothing is fetched.
RUN python3 -m nltk.downloader all

COPY . .

Expand Down
57 changes: 57 additions & 0 deletions Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# GPU image for MegaParse, based on the CUDA 12.6.3 + cuDNN devel image (Ubuntu 20.04).
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu20.04

WORKDIR /app

# uv configuration for the build, plus non-interactive apt so installs never prompt.
ENV UV_COMPILE_BYTECODE=1
ENV UV_NO_CACHE=1
ENV DEBIAN_FRONTEND=noninteractive

# Install runtime dependencies
# (python3.11 is installed after adding the deadsnakes PPA.)
RUN apt-get update && apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update && apt-get install -y \
    python3.11 \
    python3.11-dev \
    libgeos-dev \
    libcurl4-openssl-dev \
    libssl-dev \
    binutils \
    curl \
    git \
    autoconf \
    automake \
    libtool \
    python3-pip \
    build-essential \
    wget \
    gcc \
    # Additional dependencies for document handling
    libmagic-dev \
    poppler-utils \
    tesseract-ocr \
    libreoffice \
    libpq-dev \
    pandoc && \
    rm -rf /var/lib/apt/lists/* && apt-get clean

# Make `python3` resolve to the freshly installed Python 3.11.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
    update-alternatives --set python3 /usr/bin/python3.11

# Copy only the dependency manifests first so the dependency layer below can be
# cached independently of source changes.
COPY requirements.lock pyproject.toml README.md ./
COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/
COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/

# Install uv via its installer script and put its install directory on PATH.
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
RUN uv pip install --no-cache --system -r requirements.lock

RUN playwright install --with-deps
# `python3 -m` runs the NLTK downloader module; a bare `-` would make Python read
# its program from stdin instead, so no data would be downloaded.
RUN python3 -m nltk.downloader all

# FIXME: causes runtime link issues with onnxruntime_pybind_state.cc:507 unstructured
# RUN python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
# RUN python3 -c "import nltk; nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')"

COPY . .

# Install the two workspace packages from the copied sources.
RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk
20 changes: 2 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ https://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a75

## Installation

Requires Python >= 3.11.

```bash
pip install megaparse
```
Expand Down Expand Up @@ -71,24 +73,6 @@ megaparse.save("./test.md")
```
**Note**: The models supported by MegaParse Vision are multimodal ones such as Claude 3.5, Claude 4, GPT-4o and GPT-4.

### (Optional) Use LlamaParse for Improved Results

1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key.

2. Change the parser to LlamaParser

```python
from megaparse import MegaParse
from langchain_openai import ChatOpenAI
from megaparse.parser.llama_parser import LlamaParser

parser = LlamaParser(api_key = os.getenv("LLAMA_CLOUD_API_KEY"))
megaparse = MegaParse(parser)
response = megaparse.load("./test.pdf")
print(response)
megaparse.save("./test.md") #saves the last processed doc in md format
```

## Use as an API
A Makefile is provided; simply run:
```make dev```
Expand Down
34 changes: 15 additions & 19 deletions benchmark/process_time.py → benchmark/process_single_doc.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
import asyncio
import os
import time
from pathlib import Path

import numpy as np
from megaparse.sdk import MegaParseSDK
from megaparse import MegaParse
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.schema.parser_config import StrategyEnum

N_TRY = 1

async def process_file(megaparse: MegaParseSDK, file_path):

async def process_file(megaparse: MegaParse, file_path: str | Path):
try:
t0 = time.perf_counter()
response = await megaparse.file.upload(
_ = await megaparse.aload(
file_path=file_path,
method="unstructured", # type: ignore # unstructured, llama_parser, megaparse_vision
strategy="auto",
)
total = time.perf_counter() - t0
return total
Expand All @@ -21,16 +23,12 @@ async def process_file(megaparse: MegaParseSDK, file_path):
return None


async def test_process_folder(folder_path, api_key):
import os

list_process_time = []
files = os.listdir(folder_path)
async def test_process_file(file: str | Path):
parser = UnstructuredParser(strategy=StrategyEnum.HI_RES)
megaparse = MegaParse(parser=parser)
task = []

megaparse = MegaParseSDK(api_key)
for file in files:
task.append(process_file(megaparse, os.path.join(folder_path, file)))
for _ in range(N_TRY):
task.append(process_file(megaparse, file))
list_process_time = await asyncio.gather(*task)

n_errors = sum([t is None for t in list_process_time])
Expand All @@ -46,7 +44,5 @@ async def test_process_folder(folder_path, api_key):


if __name__ == "__main__":
api_key = os.getenv("MEGAPARSE_API_KEY")
# folder_path = "megaparse/sdk/examples/only_pdfs"
folder_path = "/Users/amine/data/quivr/only_pdfs/"
asyncio.run(test_process_folder(folder_path, api_key))
folder_path = "/Users/amine/data/quivr/parsing/scanned/machine.pdf"
asyncio.run(test_process_file(folder_path))
Loading

0 comments on commit 099780d

Please sign in to comment.