From 16f41e3941eefe372eb4234472e9b44917a7a755 Mon Sep 17 00:00:00 2001 From: Lawrence Angrave Date: Wed, 25 Oct 2023 21:03:10 -0500 Subject: [PATCH] Update Dockerfile --- Dockerfile | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4848690..1715fa9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,23 +1,26 @@ -FROM python:3-slim +FROM --platform=linux/amd64 python:3.11-slim-bookworm +# FROM python:3.11.2 # Install OS dependencies RUN apt-get -qq update && \ - apt-get -qq install --no-install-recommends vim-tiny netcat curl git wget ffmpeg build-essential libsm6 libxext6 libxrender-dev automake libtool pkg-config libsdl-pango-dev libicu-dev libcairo2-dev bc libleptonica-dev && \ + apt-get -qq install --no-install-recommends vim-tiny netcat-openbsd curl git wget ffmpeg build-essential libsm6 libxext6 libxrender-dev automake libtool pkg-config libsdl-pango-dev libicu-dev libcairo2-dev bc libleptonica-dev && \ apt-get -qq clean autoclean && \ apt-get -qq autoremove && \ rm -rf /var/lib/apt/lists/* # Build stuff for tesseract # Based on https://medium.com/quantrium-tech/installing-tesseract-4-on-ubuntu-18-04-b6fcd0cbd78f -RUN curl -L https://github.com/tesseract-ocr/tesseract/archive/refs/tags/4.1.1.tar.gz | tar xvz +RUN curl -L https://github.com/tesseract-ocr/tesseract/archive/refs/tags/4.1.3.tar.gz | tar xvz -ARG MAX_THREADS="" +#RUN curl -L https://github.com/tesseract-ocr/tesseract/archive/refs/tags/4.1.1.tar.gz | tar xvz -WORKDIR /tesseract-4.1.1 +ARG MAX_THREADS="4" + +WORKDIR /tesseract-4.1.3 RUN ./autogen.sh && ./configure && make -j ${MAX_THREADS} && make -j ${MAX_THREADS} install && ldconfig -# Slow! The above line takes 435 seconds on my laptop +# Slow! The above line takes 435 seconds on my laptop (1590.8s on a M1 cross compiling to amd64) RUN make -j ${MAX_THREADS} training && make -j ${MAX_THREADS} training-install -# The above line takes 59 seconds on my laptop +# The above line takes 59 seconds on my laptop. 127.3s on my M1 laptop cross compiling RUN curl -L -o tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata RUN curl -L -o tessdata/osd.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata @@ -30,6 +33,7 @@ ENV OMP_THREAD_LIMIT=1 WORKDIR /usr/app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt +# Slow. The above line took 5055.6s on my M1 16GB laptop (cross compiling, 6GB Ram+8cores for docker VM; maybe it was swapping...) # Additional dependencies for brown corpus/stopwords, wordnet RUN python -m nltk.downloader brown stopwords