Skip to content

Commit

Permalink
annotator
Browse files Browse the repository at this point in the history
  • Loading branch information
pvrijen committed Aug 29, 2024
1 parent 21b7285 commit 67617b5
Show file tree
Hide file tree
Showing 36 changed files with 2,133 additions and 0 deletions.
10 changes: 10 additions & 0 deletions annotator/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Files ignored when building docker images:
# https://docs.docker.com/engine/reference/builder/#dockerignore-file

# Python virtual environments
.venv
venv
env

# Docker build files
.dockerignore
Dockerfile

# frontend/
.trash

# MongoDB data directory bind-mounted by the compose files. It sits inside
# the build context, so without this entry it is sent to the Docker daemon
# on every build.
mongodb-data

7 changes: 7 additions & 0 deletions annotator/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Scratch directories
**/.trash

# Local MongoDB data directory (bind-mounted by the compose files)
/mongodb-data

/test

# Extracted resource directories
/resources/company
/resources/impaakt
/resources/topics

# Editor settings and Vim swap files.
# Note: an ignore rule does not untrack files that are already committed;
# those must be removed with `git rm --cached`.
/.vscode
*.swp
17 changes: 17 additions & 0 deletions annotator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Impaakt API

## Deployment steps:
- clone repository
- chmod 777 ./backend/app/run.sh
<!-- - unzip ./resources/impaakt.zip -->
- docker compose -f "compose.dev.yml" up --build -d

## Usage
[http://localhost:8001/docs](http://localhost:8001/docs)

Create an Impaakt job including a list of candidate sources:
- Each source must include a URL.
- Text can optionally be included for each source in the request. For sources without text, the system will attempt to crawl the URL and extract text from HTML or PDF documents.
- Impaakt ranking is enabled by default but optional. Only sources with text will be processed.
- Named entity recognition (NER) is enabled by default but optional. A NER-list can be included in the request for each source. For sources without a NER-list, the system will attempt to extract entities. Only sources with text will be processed.
- Company classification is enabled by default but optional. Only sources with a NER-list will be processed.
41 changes: 41 additions & 0 deletions annotator/common-services.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Shared base service definitions, intended to be reused from other Compose
# files through `extends`.
version: "3.9"

services:
  # MongoDB; data is persisted on the host in ./mongodb-data.
  mongodb:
    container_name: mongodb
    # NOTE(review): `latest` is unpinned, so the server version can change
    # between pulls - consider pinning a major version.
    image: mongo:latest
    command: --wiredTigerCacheSizeGB 2
    ports:
      - "27017:27017"
    volumes:
      - ./mongodb-data:/data/db
    env_file:
      - ./images/env
    restart: always

  # Redis; no host port is published.
  redis:
    container_name: redis
    image: redis:alpine
    restart: always

  # Flower (Celery monitoring UI), published on port 5555.
  flower:
    container_name: flower
    image: mher/flower:latest
    command: celery flower
    env_file:
      - ./images/env
    ports:
      - "5555:5555"
    restart: always

  # Base definition for the application containers. Application code and
  # resources are bind-mounted rather than copied into the image.
  worker:
    build:
      context: .
      dockerfile: ./images/pytorch/Dockerfile
    volumes:
      - ./service/app:/app
      - ./resources:/resources
    depends_on:
      - mongodb
      - redis
    env_file:
      - ./images/env
74 changes: 74 additions & 0 deletions annotator/compose.dev.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Development stack: every service extends a base definition from
# common-services.yml and adds debug ports and auto-reload commands.
version: '3.9'

services:
  mongodb:
    extends:
      file: common-services.yml
      service: mongodb
    deploy:
      resources:
        reservations:
          cpus: '0.1'
          memory: '3g'

  redis:
    extends:
      file: common-services.yml
      service: redis

  flower:
    extends:
      file: common-services.yml
      service: flower

  # HTTP API served by uvicorn under debugpy, with auto-reload.
  api:
    extends:
      file: common-services.yml
      service: worker
    ports:
      - "8001:8001"
      - "5678:5678"  # debug
    env_file:
      - ./images/env
    # command: /app/run.sh
    command: python -m debugpy --listen 0.0.0.0:5678 -m uvicorn app.main:app --host 0.0.0.0 --port 8001 --reload
    tty: true

  # Ingress worker: consumes the `default` and `ingress` queues.
  ingress:
    extends:
      file: common-services.yml
      service: worker
    ports:
      - "5671:5671"  # debug
    env_file:
      - ./images/env
    # environment:
    #   - LOAD_TEXT=TRUE
    # watchmedo restarts the worker whenever a *.py file under /app changes.
    command: watchmedo auto-restart -d "/app" --recursive -p '*.py' -- python -m debugpy --listen 0.0.0.0:5671 -m celery -A app.main.celery worker --loglevel=info -Q default,ingress --hostname=ingress@%h --concurrency=10

  # Inference worker: consumes the `infer` queue with a solo pool.
  # The GPU reservation below is currently commented out.
  infer:
    extends:
      file: common-services.yml
      service: worker
    ports:
      - "5672:5672"  # debug
    env_file:
      - ./images/env
    command: watchmedo auto-restart -d "/app" --recursive -p '*.py' -- python -m debugpy --listen 0.0.0.0:5672 -m celery -A app.main.celery worker --loglevel=info -Q infer --hostname=infer@%h --pool=solo

    # ulimits:
    #   stack: 67108864
    #   memlock: -1
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: 1
    #           capabilities: [gpu]
    #   replicas: 1
    deploy:
      replicas: 1

# volumes:
#   mongodb-data:
#     ./mongodb-data:mongodb-data:
Binary file added annotator/images/.env.swp
Binary file not shown.
23 changes: 23 additions & 0 deletions annotator/images/env
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Environment shared by the containers through `env_file`.
# Entries are written as KEY=VALUE with no spaces around `=`, because not
# every env-file parser accepts or trims whitespace there.

# MongoDB
MONGODB_URL=mongodb://mongodb:27017/
MONGODB_DATABASE=app

# Celery
CELERY_BROKER_URL=redis://redis:6379
CELERY_RESULT_BACKEND=redis://redis:6379
FLOWER_PORT=5555

# API
PYTHONPATH=.
SOURCE_BATCH_SIZE=20
TEXT_TOKEN_MIN=50
TEXT_TOKEN_MAX=600
PDF_PAGES_MAX=20
TEXT_CRAWL_CONCURRENCY=10
# TEXT_CRAWL_CONCURRENCY=1
NODE_OPTIONS="--max-old-space-size=5120"
TEXT_BATCH_SIZE=200
IMPAAKT_BATCH_SIZE=20
ENTITY_BATCH_SIZE=20
COMPANY_BATCH_SIZE=20
TOPIC_BATCH_SIZE=20
46 changes: 46 additions & 0 deletions annotator/images/pytorch/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Image for the API and Celery worker containers, based on the official
# PyTorch image.
# FROM pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime
# NOTE(review): `latest` makes builds non-reproducible - consider pinning.
FROM pytorch/pytorch:latest

ENV PIP_CACHE_DIR=/var/cache/buildkit/pip
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

# Create user
ARG USERNAME=impaakt
ARG USER_UID=1000
ARG USER_GID=$USER_UID
RUN groupadd --gid $USER_GID $USERNAME \
    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME

# Set timezone.
# CONTAINER_TIMEZONE must be declared as a build argument; without a
# declaration it expands to an empty string, so /etc/localtime would point
# at the zoneinfo directory and /etc/timezone would be empty.
# Override with: --build-arg CONTAINER_TIMEZONE=Europe/Amsterdam
ARG CONTAINER_TIMEZONE=Etc/UTC
RUN ln -snf "/usr/share/zoneinfo/${CONTAINER_TIMEZONE}" /etc/localtime \
    && echo "${CONTAINER_TIMEZONE}" > /etc/timezone

# Enable BuildKit cache
RUN mkdir -p $PIP_CACHE_DIR
# Keep downloaded .deb files so the apt cache mount below is effective.
RUN rm -f /etc/apt/apt.conf.d/docker-clean

# System libraries (browser/media dependencies). The apt archive cache lives
# in a BuildKit cache mount so it speeds up rebuilds without ending up in an
# image layer.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt-get update \
    && apt-get install -yqq --no-install-recommends \
        build-essential \
        gstreamer1.0-libav \
        libnss3-tools \
        libatk-bridge2.0-0 \
        libcups2-dev \
        libxkbcommon-x11-0 \
        libxcomposite-dev \
        libxrandr2 \
        libgbm-dev \
        libgtk-3-0 \
    && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies. The pip cache mount belongs on this step, which is
# the one that actually downloads wheels.
COPY ./images/requirements.txt /requirements.txt
RUN --mount=type=cache,target=$PIP_CACHE_DIR \
    pip install -r /requirements.txt

WORKDIR /app

# Everything below runs as the unprivileged user, so the NLTK data and the
# Playwright browser are installed for that user.
USER $USERNAME

RUN python -m nltk.downloader punkt
RUN playwright install chromium
42 changes: 42 additions & 0 deletions annotator/images/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
celery==5.2.7
# SQLAlchemy==1.4.46
watchfiles==0.18.1
fastapi==0.79.1
psycopg2-binary==2.9.5
# alembic==1.9.2
# atomicwrites==1.4.1
# attrs==22.2.0
# bcrypt==4.0.1
# certifi==2022.12.7
# cffi==1.15.1
# email-validator==1.3.1
# passlib==1.7.4
# python-jose==3.3.0
# python-multipart==0.0.5
gunicorn==20.1.0
uvicorn==0.20.0
# Jinja2==3.1.2
kombu==5.2.4
# tenacity==8.1.0
beautifulsoup4==4.12.0
httpx==0.23.3
playwright==1.23.1
playwright-stealth==1.0.5
# PyMuPDF==1.21.1
PyMuPDF==1.24.9
debugpy==1.6.7
imbalanced-learn==0.10
transformers==4.28.1
nltk==3.8.1
pandas==2.0.0
redis==4.5.4
asgiref==3.7.2
# Disabled: the PyPI "asyncio" distribution is an obsolete backport that is
# only relevant for Python 3.3; asyncio is part of the standard library.
# asyncio==3.4.3
scikit-learn==1.1.2
# scikit-learn==1.0.2
# motor==3.3.1
# beanie==1.22.5
pymongo==4.6.0
bunnet==1.2.0
watchdog==3.0.0
# NOTE(review): unpinned, unlike every other entry - pin once a known-good
# version is confirmed.
colored-traceback
Binary file added annotator/resources/company.zip
Binary file not shown.
Binary file added annotator/resources/impaakt.zip
Binary file not shown.
Binary file added annotator/resources/topics.z01
Binary file not shown.
Binary file added annotator/resources/topics.z02
Binary file not shown.
Binary file added annotator/resources/topics.z03
Binary file not shown.
Binary file added annotator/resources/topics.z04
Binary file not shown.
Binary file added annotator/resources/topics.zip
Binary file not shown.
Empty file.
Empty file.
6 changes: 6 additions & 0 deletions annotator/service/app/app/api/api_v1/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from fastapi import APIRouter

from .endpoints import job

# Aggregate router for API v1.
api_router = APIRouter()

# Register the job endpoints under the "/jobs" prefix, tagged "job".
api_router.include_router(
    router=job.router,
    prefix="/jobs",
    tags=["job"],
)
Empty file.
Loading

0 comments on commit 67617b5

Please sign in to comment.