From 46ccee5e25f4201b259571344fb27aa261165f49 Mon Sep 17 00:00:00 2001 From: luke Date: Tue, 18 Feb 2025 17:02:21 -0500 Subject: [PATCH 1/8] updateing deps to resolve CVEs in cryptography, transformers --- requirements/base.txt | 73 +++++++++++------------ requirements/deps/constraints.txt | 2 +- requirements/extra-pdf-image.txt | 96 +++++++++++++++---------------- requirements/huggingface.txt | 54 ++++++++--------- unstructured/__version__.py | 2 +- 5 files changed, 114 insertions(+), 113 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 9ec5c2d332..1b6194274f 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -2,15 +2,15 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./base.in +# pip-compile base.in # anyio==4.8.0 # via httpx backoff==2.2.1 - # via -r ./base.in -beautifulsoup4==4.12.3 - # via -r ./base.in -certifi==2024.12.14 + # via -r base.in +beautifulsoup4==4.13.3 + # via -r base.in +certifi==2025.1.31 # via # httpcore # httpx @@ -19,7 +19,7 @@ certifi==2024.12.14 cffi==1.17.1 # via cryptography chardet==5.2.0 - # via -r ./base.in + # via -r base.in charset-normalizer==3.4.1 # via # requests @@ -28,24 +28,24 @@ click==8.1.8 # via # nltk # python-oxmsg -cryptography==44.0.0 +cryptography==44.0.1 # via unstructured-client dataclasses-json==0.6.7 # via - # -r ./base.in + # -r base.in # unstructured-client -deepdiff==8.1.1 +deepdiff==8.2.0 # via unstructured-client emoji==2.14.1 - # via -r ./base.in + # via -r base.in exceptiongroup==1.2.2 # via anyio filetype==1.2.0 - # via -r ./base.in + # via -r base.in h11==0.14.0 # via httpcore html5lib==1.1 - # via -r ./base.in + # via -r base.in httpcore==1.0.7 # via httpx httpx==0.28.1 @@ -61,10 +61,10 @@ joblib==1.4.2 jsonpath-python==1.0.6 # via unstructured-client langdetect==1.0.9 - # via -r ./base.in -lxml==5.3.0 - # via -r ./base.in -marshmallow==3.26.0 + # via -r base.in +lxml==5.3.1 + # via -r base.in 
+marshmallow==3.26.1 # via # dataclasses-json # unstructured-client @@ -75,38 +75,38 @@ mypy-extensions==1.0.0 nest-asyncio==1.6.0 # via unstructured-client nltk==3.9.1 - # via -r ./base.in + # via -r base.in numpy==1.26.4 - # via -r ./base.in + # via -r base.in olefile==0.47 # via python-oxmsg -orderly-set==5.2.3 +orderly-set==5.3.0 # via deepdiff packaging==24.2 # via # marshmallow # unstructured-client -psutil==6.1.1 - # via -r ./base.in +psutil==7.0.0 + # via -r base.in pycparser==2.22 # via cffi -pypdf==5.2.0 +pypdf==5.3.0 # via unstructured-client python-dateutil==2.9.0.post0 # via unstructured-client -python-iso639==2025.1.28 - # via -r ./base.in +python-iso639==2025.2.18 + # via -r base.in python-magic==0.4.27 - # via -r ./base.in -python-oxmsg==0.0.1 - # via -r ./base.in -rapidfuzz==3.11.0 - # via -r ./base.in + # via -r base.in +python-oxmsg==0.0.2 + # via -r base.in +rapidfuzz==3.12.1 + # via -r base.in regex==2024.11.6 # via nltk requests==2.32.3 # via - # -r ./base.in + # -r base.in # requests-toolbelt # unstructured-client requests-toolbelt==1.0.0 @@ -123,12 +123,13 @@ soupsieve==2.6 # via beautifulsoup4 tqdm==4.67.1 # via - # -r ./base.in + # -r base.in # nltk typing-extensions==4.12.2 # via - # -r ./base.in + # -r base.in # anyio + # beautifulsoup4 # pypdf # python-oxmsg # typing-inspect @@ -139,14 +140,14 @@ typing-inspect==0.9.0 # unstructured-client unstructured-client==0.25.9 # via - # -c ././deps/constraints.txt - # -r ./base.in + # -c ./deps/constraints.txt + # -r base.in urllib3==1.26.20 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # requests # unstructured-client webencodings==0.5.1 # via html5lib wrapt==1.17.2 - # via -r ./base.in + # via -r base.in diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 296dd366b5..7f1cdc889b 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -8,7 +8,7 @@ weaviate-client>=3.26.7,<4.0.0 # TODO: Constriant due to 
multiple versions being installed during pip-compile grpcio>=1.65.5 # TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py) -tokenizers>=0.19,<0.20 +tokenizers>=0.21,<0.22 # TODO: Constaint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets # updated or we drop support for 3.9 urllib3<1.27 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index f30252303d..b7fe995f4d 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -2,49 +2,49 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-pdf-image.in +# pip-compile extra-pdf-image.in # antlr4-python3-runtime==4.9.3 # via omegaconf cachetools==5.5.1 # via google-auth -certifi==2024.12.14 +certifi==2025.1.31 # via - # -c ./base.txt + # -c base.txt # requests cffi==1.17.1 # via - # -c ./base.txt + # -c base.txt # cryptography charset-normalizer==3.4.1 # via - # -c ./base.txt + # -c base.txt # pdfminer-six # requests coloredlogs==15.0.1 # via onnxruntime contourpy==1.3.0 # via matplotlib -cryptography==44.0.0 +cryptography==44.0.1 # via - # -c ./base.txt + # -c base.txt # pdfminer-six cycler==0.12.1 # via matplotlib deprecated==1.2.18 # via pikepdf effdet==0.4.1 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in filelock==3.17.0 # via # huggingface-hub # torch # transformers -flatbuffers==25.1.24 +flatbuffers==25.2.10 # via onnxruntime -fonttools==4.55.8 +fonttools==4.56.0 # via matplotlib -fsspec==2024.12.0 +fsspec==2025.2.0 # via # huggingface-hub # torch @@ -54,20 +54,20 @@ google-auth==2.38.0 # via # google-api-core # google-cloud-vision -google-cloud-vision==3.9.0 - # via -r ./extra-pdf-image.in -googleapis-common-protos==1.66.0 +google-cloud-vision==3.10.0 + # via -r extra-pdf-image.in +googleapis-common-protos==1.67.0 # via # google-api-core # grpcio-status 
grpcio==1.70.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.70.0 # via google-api-core -huggingface-hub==0.28.0 +huggingface-hub==0.28.1 # via # timm # tokenizers @@ -77,7 +77,7 @@ humanfriendly==10.0 # via coloredlogs idna==3.10 # via - # -c ./base.txt + # -c base.txt # requests importlib-resources==6.5.2 # via matplotlib @@ -85,9 +85,9 @@ jinja2==3.1.5 # via torch kiwisolver==1.4.7 # via matplotlib -lxml==5.3.0 +lxml==5.3.1 # via - # -c ./base.txt + # -c base.txt # pikepdf markupsafe==3.0.2 # via jinja2 @@ -101,7 +101,7 @@ networkx==3.2.1 # via torch numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # contourpy # matplotlib # onnx @@ -117,7 +117,7 @@ omegaconf==2.3.0 # via effdet onnx==1.17.0 # via - # -r ./extra-pdf-image.in + # -r extra-pdf-image.in # unstructured-inference onnxruntime==1.19.2 # via unstructured-inference @@ -125,7 +125,7 @@ opencv-python==4.11.0.86 # via unstructured-inference packaging==24.2 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # matplotlib # onnxruntime @@ -135,15 +135,15 @@ packaging==24.2 pandas==2.2.3 # via unstructured-inference pdf2image==1.17.0 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in pdfminer-six==20240706 # via - # -r ./extra-pdf-image.in + # -r extra-pdf-image.in # unstructured-inference pi-heif==0.21.0 - # via -r ./extra-pdf-image.in -pikepdf==9.5.1 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in +pikepdf==9.5.2 + # via -r extra-pdf-image.in pillow==11.1.0 # via # matplotlib @@ -175,24 +175,24 @@ pycocotools==2.0.8 # via effdet pycparser==2.22 # via - # -c ./base.txt + # -c base.txt # cffi pyparsing==3.2.1 # via matplotlib -pypdf==5.2.0 +pypdf==5.3.0 # via - # -c ./base.txt - # -r ./extra-pdf-image.in + # -c base.txt + # -r extra-pdf-image.in pypdfium2==4.30.1 # via unstructured-inference python-dateutil==2.9.0.post0 # via - # -c ./base.txt + # -c base.txt # matplotlib # pandas 
python-multipart==0.0.20 # via unstructured-inference -pytz==2024.2 +pytz==2025.1 # via pandas pyyaml==6.0.2 # via @@ -200,17 +200,17 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.11.0 +rapidfuzz==3.12.1 # via - # -c ./base.txt + # -c base.txt # unstructured-inference regex==2024.11.6 # via - # -c ./base.txt + # -c base.txt # transformers requests==2.32.3 # via - # -c ./base.txt + # -c base.txt # google-api-core # huggingface-hub # transformers @@ -224,7 +224,7 @@ scipy==1.13.1 # via unstructured-inference six==1.17.0 # via - # -c ./base.txt + # -c base.txt # python-dateutil sympy==1.13.1 # via @@ -234,9 +234,9 @@ timm==1.0.14 # via # effdet # unstructured-inference -tokenizers==0.19.1 +tokenizers==0.21.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # transformers torch==2.6.0 # via @@ -250,31 +250,31 @@ torchvision==0.21.0 # timm tqdm==4.67.1 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # transformers -transformers==4.44.2 +transformers==4.49.0 # via unstructured-inference typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # pypdf # torch tzdata==2025.1 # via pandas unstructured-inference==0.8.7 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in unstructured-pytesseract==0.3.13 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in urllib3==1.26.20 # via - # -c ././deps/constraints.txt - # -c ./base.txt + # -c ./deps/constraints.txt + # -c base.txt # requests wrapt==1.17.2 # via - # -c ./base.txt + # -c base.txt # deprecated zipp==3.21.0 # via importlib-resources diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index bd78de5cb2..60c0788db3 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -2,47 +2,47 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./huggingface.in +# pip-compile huggingface.in # -certifi==2024.12.14 +certifi==2025.1.31 # via - # -c 
./base.txt + # -c base.txt # requests charset-normalizer==3.4.1 # via - # -c ./base.txt + # -c base.txt # requests click==8.1.8 # via - # -c ./base.txt + # -c base.txt # sacremoses filelock==3.17.0 # via # huggingface-hub # torch # transformers -fsspec==2024.12.0 +fsspec==2025.2.0 # via # huggingface-hub # torch -huggingface-hub==0.28.0 +huggingface-hub==0.28.1 # via # tokenizers # transformers idna==3.10 # via - # -c ./base.txt + # -c base.txt # requests jinja2==3.1.5 # via torch joblib==1.4.2 # via - # -c ./base.txt + # -c base.txt # sacremoses langdetect==1.0.9 # via - # -c ./base.txt - # -r ./huggingface.in + # -c base.txt + # -r huggingface.in markupsafe==3.0.2 # via jinja2 mpmath==1.3.0 @@ -51,11 +51,11 @@ networkx==3.2.1 # via torch numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # transformers packaging==24.2 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # transformers pyyaml==6.0.2 @@ -64,47 +64,47 @@ pyyaml==6.0.2 # transformers regex==2024.11.6 # via - # -c ./base.txt + # -c base.txt # sacremoses # transformers requests==2.32.3 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # transformers sacremoses==0.1.1 - # via -r ./huggingface.in + # via -r huggingface.in safetensors==0.5.2 # via transformers sentencepiece==0.2.0 - # via -r ./huggingface.in + # via -r huggingface.in six==1.17.0 # via - # -c ./base.txt + # -c base.txt # langdetect sympy==1.13.1 # via torch -tokenizers==0.19.1 +tokenizers==0.21.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # transformers torch==2.6.0 - # via -r ./huggingface.in + # via -r huggingface.in tqdm==4.67.1 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # sacremoses # transformers -transformers==4.44.2 - # via -r ./huggingface.in +transformers==4.49.0 + # via -r huggingface.in typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # torch urllib3==1.26.20 # via - # -c ././deps/constraints.txt - # -c ./base.txt + # -c 
./deps/constraints.txt + # -c base.txt # requests diff --git a/unstructured/__version__.py b/unstructured/__version__.py index fb8bd1ff84..5bfa2b15b3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.21" # pragma: no cover +__version__ = "0.16.21-dev1" # pragma: no cover From da473caa2d28defef6dca5aaf05ae510f03035c6 Mon Sep 17 00:00:00 2001 From: luke Date: Tue, 18 Feb 2025 17:03:54 -0500 Subject: [PATCH 2/8] resolve cve in label studio sdk --- requirements/test.txt | 120 +++++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 59 deletions(-) diff --git a/requirements/test.txt b/requirements/test.txt index 554846fff2..13c7eb8ef9 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -2,13 +2,13 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./test.in +# pip-compile test.in # annotated-types==0.7.0 # via pydantic anyio==4.8.0 # via - # -c ./base.txt + # -c base.txt # httpx appdirs==1.4.4 # via label-studio-sdk @@ -19,29 +19,29 @@ attrs==25.1.0 # jsonschema # referencing autoflake==2.3.1 - # via -r ./test.in + # via -r test.in black==25.1.0 # via - # -r ./test.in + # -r test.in # datamodel-code-generator -certifi==2024.12.14 +certifi==2025.1.31 # via - # -c ./base.txt + # -c base.txt # httpcore # httpx # requests charset-normalizer==3.4.1 # via - # -c ./base.txt + # -c base.txt # requests click==8.1.8 # via - # -c ./base.txt + # -c base.txt # black # nltk -coverage[toml]==7.6.10 +coverage[toml]==7.6.12 # via - # -r ./test.in + # -r test.in # pytest-cov datamodel-code-generator==0.26.1 # via label-studio-sdk @@ -51,40 +51,40 @@ email-validator==2.2.0 # via pydantic exceptiongroup==1.2.2 # via - # -c ./base.txt + # -c base.txt # anyio # pytest -faker==35.0.0 +faker==36.1.1 # via jsf -flake8==7.1.1 +flake8==7.1.2 # via - # -r ./test.in + # -r test.in # flake8-print flake8-print==5.0.0 - # via -r ./test.in + # via -r 
test.in freezegun==1.5.1 - # via -r ./test.in + # via -r test.in genson==1.3.0 # via datamodel-code-generator grpcio==1.70.0 # via - # -c ././deps/constraints.txt - # -r ./test.in + # -c ./deps/constraints.txt + # -r test.in h11==0.14.0 # via - # -c ./base.txt + # -c base.txt # httpcore httpcore==1.0.7 # via - # -c ./base.txt + # -c base.txt # httpx httpx==0.28.1 # via - # -c ./base.txt + # -c base.txt # label-studio-sdk idna==3.10 # via - # -c ./base.txt + # -c base.txt # anyio # email-validator # httpx @@ -102,7 +102,7 @@ jinja2==3.1.5 # via datamodel-code-generator joblib==1.4.2 # via - # -c ./base.txt + # -c base.txt # nltk jsf==0.11.2 # via label-studio-sdk @@ -112,13 +112,13 @@ jsonschema==4.23.0 # label-studio-sdk jsonschema-specifications==2024.10.1 # via jsonschema -label-studio-sdk==1.0.8 - # via -r ./test.in +label-studio-sdk==1.0.10 + # via -r test.in liccheck==0.9.2 - # via -r ./test.in -lxml==5.3.0 + # via -r test.in +lxml==5.3.1 # via - # -c ./base.txt + # -c base.txt # label-studio-sdk markupsafe==3.0.2 # via jinja2 @@ -126,25 +126,25 @@ mccabe==0.7.0 # via flake8 multidict==6.1.0 # via yarl -mypy==1.14.1 - # via -r ./test.in +mypy==1.15.0 + # via -r test.in mypy-extensions==1.0.0 # via - # -c ./base.txt + # -c base.txt # black # mypy nltk==3.9.1 # via - # -c ./base.txt + # -c base.txt # label-studio-sdk numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # label-studio-sdk # pandas packaging==24.2 # via - # -c ./base.txt + # -c base.txt # black # datamodel-code-generator # pytest @@ -166,12 +166,14 @@ pycodestyle==2.12.1 # flake8-print pydantic[email]==2.10.6 # via - # -r ./test.in + # -r test.in # datamodel-code-generator # jsf # label-studio-sdk pydantic-core==2.27.2 - # via pydantic + # via + # label-studio-sdk + # pydantic pyflakes==3.2.0 # via # autoflake @@ -181,16 +183,15 @@ pytest==8.3.4 # pytest-cov # pytest-mock pytest-cov==6.0.0 - # via -r ./test.in + # via -r test.in pytest-mock==3.14.0 - # via -r ./test.in + # via -r test.in 
python-dateutil==2.9.0.post0 # via - # -c ./base.txt - # faker + # -c base.txt # freezegun # pandas -pytz==2024.2 +pytz==2025.1 # via pandas pyyaml==6.0.2 # via @@ -202,11 +203,11 @@ referencing==0.36.2 # jsonschema-specifications regex==2024.11.6 # via - # -c ./base.txt + # -c base.txt # nltk requests==2.32.3 # via - # -c ./base.txt + # -c base.txt # label-studio-sdk # requests-mock # smart-open @@ -218,19 +219,19 @@ rpds-py==0.22.3 # referencing rstr==3.2.2 # via jsf -ruff==0.9.3 - # via -r ./test.in +ruff==0.9.6 + # via -r test.in semantic-version==2.10.0 # via liccheck six==1.17.0 # via - # -c ./base.txt + # -c base.txt # python-dateutil smart-open[http]==7.1.0 # via jsf sniffio==1.3.1 # via - # -c ./base.txt + # -c base.txt # anyio toml==0.10.2 # via @@ -245,24 +246,23 @@ tomli==2.2.1 # pytest tqdm==4.67.1 # via - # -c ./base.txt + # -c base.txt # nltk types-click==7.1.8 - # via -r ./test.in + # via -r test.in types-markdown==3.7.0.20241204 - # via -r ./test.in + # via -r test.in types-requests==2.31.0.6 - # via -r ./test.in + # via -r test.in types-tabulate==0.9.0.20241207 - # via -r ./test.in + # via -r test.in types-urllib3==1.26.25.14 # via types-requests typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # anyio # black - # faker # jsf # label-studio-sdk # multidict @@ -271,20 +271,22 @@ typing-extensions==4.12.2 # pydantic-core # referencing tzdata==2025.1 - # via pandas + # via + # faker + # pandas ujson==5.10.0 # via label-studio-sdk urllib3==1.26.20 # via - # -c ././deps/constraints.txt - # -c ./base.txt + # -c ./deps/constraints.txt + # -c base.txt # requests # vcrpy vcrpy==7.0.0 - # via -r ./test.in + # via -r test.in wrapt==1.17.2 # via - # -c ./base.txt + # -c base.txt # smart-open # vcrpy xmljson==0.2.1 From 779f25d5c8c4b0444f543de042e99ef4a099f223 Mon Sep 17 00:00:00 2001 From: luke Date: Tue, 18 Feb 2025 17:05:25 -0500 Subject: [PATCH 3/8] just bumping deps, removed the -dev tag --- unstructured/__version__.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5bfa2b15b3..fb8bd1ff84 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.21-dev1" # pragma: no cover +__version__ = "0.16.21" # pragma: no cover From c5c9fa1c5923bb7f73d669c58907c52d046a36de Mon Sep 17 00:00:00 2001 From: luke Date: Tue, 18 Feb 2025 17:11:46 -0500 Subject: [PATCH 4/8] pinning lxml to resolve multiple versions getting installed --- requirements/deps/constraints.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 7f1cdc889b..3026c417b1 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -18,3 +18,5 @@ botocore<1.34.132 importlib-metadata>=8.5.0 # (austin): Versions below this have a different interface for passing parameters unstructured-client>=0.23.0,<0.26.0 +# (luke): conflicting versions installed. pinned to 5.3.0 for now. 
+lxml==5.3.0 From 5cf660ffac29b14b77a20106a4ec4d34d350436b Mon Sep 17 00:00:00 2001 From: luke Date: Tue, 18 Feb 2025 17:16:58 -0500 Subject: [PATCH 5/8] re-running all the pip-compiles after updating the constraints --- requirements/Makefile | 2 +- requirements/base.txt | 6 ++-- requirements/dev.txt | 26 +++++++-------- requirements/extra-csv.txt | 12 +++---- requirements/extra-docx.txt | 9 +++--- requirements/extra-epub.txt | 4 +-- requirements/extra-markdown.txt | 6 ++-- requirements/extra-odt.txt | 11 ++++--- requirements/extra-paddleocr.txt | 54 ++++++++++++++++---------------- requirements/extra-pandoc.txt | 4 +-- requirements/extra-pdf-image.txt | 3 +- requirements/extra-pptx.txt | 8 +++-- requirements/extra-xlsx.txt | 18 +++++------ requirements/test.txt | 3 +- 14 files changed, 87 insertions(+), 79 deletions(-) diff --git a/requirements/Makefile b/requirements/Makefile index 9e6b685fcb..acb046152d 100644 --- a/requirements/Makefile +++ b/requirements/Makefile @@ -27,4 +27,4 @@ clean: clean-base .PHONY: clean-base clean-base: - rm $(BASE_REQUIREMENTSTXT) \ No newline at end of file + rm $(BASE_REQUIREMENTSTXT) diff --git a/requirements/base.txt b/requirements/base.txt index 1b6194274f..20ca10834f 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -62,8 +62,10 @@ jsonpath-python==1.0.6 # via unstructured-client langdetect==1.0.9 # via -r base.in -lxml==5.3.1 - # via -r base.in +lxml==5.3.0 + # via + # -c ./deps/constraints.txt + # -r base.in marshmallow==3.26.1 # via # dataclasses-json diff --git a/requirements/dev.txt b/requirements/dev.txt index a5ebd99214..52558def48 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./dev.in +# pip-compile dev.in # build==1.2.2.post1 # via pip-tools @@ -10,48 +10,48 @@ cfgv==3.4.0 # via pre-commit click==8.1.8 # via - # -c ./base.txt - # -c ./test.txt + # -c base.txt + # -c 
test.txt # pip-tools distlib==0.3.9 # via virtualenv filelock==3.17.0 # via virtualenv -identify==2.6.6 +identify==2.6.7 # via pre-commit importlib-metadata==8.6.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # build nodeenv==1.9.1 # via pre-commit packaging==24.2 # via - # -c ./base.txt - # -c ./test.txt + # -c base.txt + # -c test.txt # build pip-tools==7.4.1 - # via -r ./dev.in + # via -r dev.in platformdirs==4.3.6 # via - # -c ./test.txt + # -c test.txt # virtualenv pre-commit==4.1.0 - # via -r ./dev.in + # via -r dev.in pyproject-hooks==1.2.0 # via # build # pip-tools pyyaml==6.0.2 # via - # -c ./test.txt + # -c test.txt # pre-commit tomli==2.2.1 # via - # -c ./test.txt + # -c test.txt # build # pip-tools -virtualenv==20.29.1 +virtualenv==20.29.2 # via pre-commit wheel==0.45.1 # via pip-tools diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index d4d50645e8..33f6d3cb16 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -2,23 +2,23 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-csv.in +# pip-compile extra-csv.in # numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # pandas pandas==2.2.3 - # via -r ./extra-csv.in + # via -r extra-csv.in python-dateutil==2.9.0.post0 # via - # -c ./base.txt + # -c base.txt # pandas -pytz==2024.2 +pytz==2025.1 # via pandas six==1.17.0 # via - # -c ./base.txt + # -c base.txt # python-dateutil tzdata==2025.1 # via pandas diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 01e7e2e24b..8f813f9856 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -2,15 +2,16 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-docx.in +# pip-compile extra-docx.in # lxml==5.3.0 # via - # -c ./base.txt + # -c ./deps/constraints.txt + # -c base.txt # python-docx python-docx==1.1.2 - # via -r 
./extra-docx.in + # via -r extra-docx.in typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # python-docx diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index 460408c418..b8571eb4a5 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-epub.in +# pip-compile extra-epub.in # pypandoc==1.15 - # via -r ./extra-epub.in + # via -r extra-epub.in diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 9d0a14da55..2ec0670ae7 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -2,13 +2,13 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-markdown.in +# pip-compile extra-markdown.in # importlib-metadata==8.6.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # markdown markdown==3.7 - # via -r ./extra-markdown.in + # via -r extra-markdown.in zipp==3.21.0 # via importlib-metadata diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 362c53ed74..afedd7e169 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -2,17 +2,18 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-odt.in +# pip-compile extra-odt.in # lxml==5.3.0 # via - # -c ./base.txt + # -c ./deps/constraints.txt + # -c base.txt # python-docx pypandoc==1.15 - # via -r ./extra-odt.in + # via -r extra-odt.in python-docx==1.1.2 - # via -r ./extra-odt.in + # via -r extra-odt.in typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # python-docx diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index dcb1e21087..1a76fa76e6 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -2,53 +2,53 @@ # This file is 
autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-paddleocr.in +# pip-compile extra-paddleocr.in # anyio==4.8.0 # via - # -c ./base.txt + # -c base.txt # httpx astor==0.8.1 # via paddlepaddle -certifi==2024.12.14 +certifi==2025.1.31 # via - # -c ./base.txt + # -c base.txt # httpcore # httpx # requests charset-normalizer==3.4.1 # via - # -c ./base.txt + # -c base.txt # requests contourpy==1.3.0 # via matplotlib cycler==0.12.1 # via matplotlib -cython==3.0.11 +cython==3.0.12 # via unstructured-paddleocr decorator==5.1.1 # via paddlepaddle exceptiongroup==1.2.2 # via - # -c ./base.txt + # -c base.txt # anyio -fonttools==4.55.8 +fonttools==4.56.0 # via matplotlib h11==0.14.0 # via - # -c ./base.txt + # -c base.txt # httpcore httpcore==1.0.7 # via - # -c ./base.txt + # -c base.txt # httpx httpx==0.28.1 # via - # -c ./base.txt + # -c base.txt # paddlepaddle idna==3.10 # via - # -c ./base.txt + # -c base.txt # anyio # httpx # requests @@ -72,7 +72,7 @@ networkx==3.2.1 # scikit-image numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # contourpy # imageio # imgaug @@ -96,12 +96,12 @@ opt-einsum==3.3.0 # via paddlepaddle packaging==24.2 # via - # -c ./base.txt + # -c base.txt # lazy-loader # matplotlib # scikit-image paddlepaddle==3.0.0b1 - # via -r ./extra-paddleocr.in + # via -r extra-paddleocr.in pdf2image==1.17.0 # via unstructured-paddleocr pillow==11.1.0 @@ -121,17 +121,17 @@ pyparsing==3.2.1 # via matplotlib python-dateutil==2.9.0.post0 # via - # -c ./base.txt + # -c base.txt # matplotlib pyyaml==6.0.2 # via unstructured-paddleocr -rapidfuzz==3.11.0 +rapidfuzz==3.12.1 # via - # -c ./base.txt + # -c base.txt # unstructured-paddleocr requests==2.32.3 # via - # -c ./base.txt + # -c base.txt # unstructured-paddleocr scikit-image==0.24.0 # via @@ -141,36 +141,36 @@ scipy==1.13.1 # via # imgaug # scikit-image -shapely==2.0.6 +shapely==2.0.7 # via # imgaug # unstructured-paddleocr six==1.17.0 # via - # -c ./base.txt + # -c 
base.txt # imgaug # python-dateutil sniffio==1.3.1 # via - # -c ./base.txt + # -c base.txt # anyio tifffile==2024.8.30 # via scikit-image tqdm==4.67.1 # via - # -c ./base.txt + # -c base.txt # unstructured-paddleocr typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # anyio # paddlepaddle unstructured-paddleocr==2.8.1.0 - # via -r ./extra-paddleocr.in + # via -r extra-paddleocr.in urllib3==1.26.20 # via - # -c ././deps/constraints.txt - # -c ./base.txt + # -c ./deps/constraints.txt + # -c base.txt # requests zipp==3.21.0 # via importlib-resources diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index dd397c3845..8dbc066d25 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-pandoc.in +# pip-compile extra-pandoc.in # pypandoc==1.15 - # via -r ./extra-pandoc.in + # via -r extra-pandoc.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index b7fe995f4d..7b1864a65e 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -85,8 +85,9 @@ jinja2==3.1.5 # via torch kiwisolver==1.4.7 # via matplotlib -lxml==5.3.1 +lxml==5.3.0 # via + # -c ./deps/constraints.txt # -c base.txt # pikepdf markupsafe==3.0.2 diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 83ff09f015..e1270714da 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -2,14 +2,16 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-pptx.in +# pip-compile extra-pptx.in # lxml==5.3.0 - # via python-pptx + # via + # -c ./deps/constraints.txt + # python-pptx pillow==11.1.0 # via python-pptx python-pptx==1.0.2 - # via -r ./extra-pptx.in + # via -r extra-pptx.in typing-extensions==4.12.2 # via python-pptx xlsxwriter==3.2.2 diff --git 
a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index b0c6cadbf7..59f84a420a 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -2,31 +2,31 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-xlsx.in +# pip-compile extra-xlsx.in # et-xmlfile==2.0.0 # via openpyxl networkx==3.2.1 - # via -r ./extra-xlsx.in + # via -r extra-xlsx.in numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # pandas openpyxl==3.1.5 - # via -r ./extra-xlsx.in + # via -r extra-xlsx.in pandas==2.2.3 - # via -r ./extra-xlsx.in + # via -r extra-xlsx.in python-dateutil==2.9.0.post0 # via - # -c ./base.txt + # -c base.txt # pandas -pytz==2024.2 +pytz==2025.1 # via pandas six==1.17.0 # via - # -c ./base.txt + # -c base.txt # python-dateutil tzdata==2025.1 # via pandas xlrd==2.0.1 - # via -r ./extra-xlsx.in + # via -r extra-xlsx.in diff --git a/requirements/test.txt b/requirements/test.txt index 13c7eb8ef9..b2cc8600d5 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -116,8 +116,9 @@ label-studio-sdk==1.0.10 # via -r test.in liccheck==0.9.2 # via -r test.in -lxml==5.3.1 +lxml==5.3.0 # via + # -c ./deps/constraints.txt # -c base.txt # label-studio-sdk markupsafe==3.0.2 From 34fad482e8ea947345fb119c4b0854b6c1c4089e Mon Sep 17 00:00:00 2001 From: luke Date: Tue, 18 Feb 2025 18:21:21 -0500 Subject: [PATCH 6/8] minor version bump to resolve cves --- CHANGELOG.md | 10 ++++++++++ unstructured/__version__.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c4896e5df..549389ee31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.16.22 + +### Enhancements + +### Features + +### Fixes + +- **Fix open CVEs and bump dependencies** + ## 0.16.21 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index fb8bd1ff84..268ff7e15e 100644 --- a/unstructured/__version__.py +++ 
b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.21" # pragma: no cover +__version__ = "0.16.22" # pragma: no cover From bcbf403d43d38a679033dc839d41b6d9371c0cb6 Mon Sep 17 00:00:00 2001 From: luke Date: Tue, 18 Feb 2025 19:23:54 -0500 Subject: [PATCH 7/8] removed the constraint pinning lxml, nothing seems to require 5.3.0 --- requirements/base.txt | 6 ++---- requirements/deps/constraints.txt | 2 -- requirements/extra-docx.txt | 3 +-- requirements/extra-odt.txt | 3 +-- requirements/extra-pdf-image.txt | 3 +-- requirements/extra-pptx.txt | 6 ++---- requirements/test.txt | 3 +-- 7 files changed, 8 insertions(+), 18 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 20ca10834f..1b6194274f 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -62,10 +62,8 @@ jsonpath-python==1.0.6 # via unstructured-client langdetect==1.0.9 # via -r base.in -lxml==5.3.0 - # via - # -c ./deps/constraints.txt - # -r base.in +lxml==5.3.1 + # via -r base.in marshmallow==3.26.1 # via # dataclasses-json diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 3026c417b1..7f1cdc889b 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -18,5 +18,3 @@ botocore<1.34.132 importlib-metadata>=8.5.0 # (austin): Versions below this have a different interface for passing parameters unstructured-client>=0.23.0,<0.26.0 -# (luke): conflicting versions installed. pinned to 5.3.0 for now. 
-lxml==5.3.0 diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 8f813f9856..3b7e4b8d02 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -4,9 +4,8 @@ # # pip-compile extra-docx.in # -lxml==5.3.0 +lxml==5.3.1 # via - # -c ./deps/constraints.txt # -c base.txt # python-docx python-docx==1.1.2 diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index afedd7e169..2c413a4968 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -4,9 +4,8 @@ # # pip-compile extra-odt.in # -lxml==5.3.0 +lxml==5.3.1 # via - # -c ./deps/constraints.txt # -c base.txt # python-docx pypandoc==1.15 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 7b1864a65e..b7fe995f4d 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -85,9 +85,8 @@ jinja2==3.1.5 # via torch kiwisolver==1.4.7 # via matplotlib -lxml==5.3.0 +lxml==5.3.1 # via - # -c ./deps/constraints.txt # -c base.txt # pikepdf markupsafe==3.0.2 diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index e1270714da..3fd6f46483 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -4,10 +4,8 @@ # # pip-compile extra-pptx.in # -lxml==5.3.0 - # via - # -c ./deps/constraints.txt - # python-pptx +lxml==5.3.1 + # via python-pptx pillow==11.1.0 # via python-pptx python-pptx==1.0.2 diff --git a/requirements/test.txt b/requirements/test.txt index b2cc8600d5..13c7eb8ef9 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -116,9 +116,8 @@ label-studio-sdk==1.0.10 # via -r test.in liccheck==0.9.2 # via -r test.in -lxml==5.3.0 +lxml==5.3.1 # via - # -c ./deps/constraints.txt # -c base.txt # label-studio-sdk markupsafe==3.0.2 From ff7b09b991e4836dfd0eb1e01c8e00a766c5d1e7 Mon Sep 17 00:00:00 2001 From: luke Date: Wed, 19 Feb 2025 10:57:59 -0500 Subject: [PATCH 8/8] this fixes the translation issue in the test --- 
unstructured/cleaners/translate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unstructured/cleaners/translate.py b/unstructured/cleaners/translate.py index 0e38106d3c..32fa500acc 100644 --- a/unstructured/cleaners/translate.py +++ b/unstructured/cleaners/translate.py @@ -52,6 +52,7 @@ def translate_text(text: str, source_lang: Optional[str] = None, target_lang: st return text model_name = _get_opus_mt_model_name(_source_lang, target_lang) + print(f"Using model: {model_name}") try: tokenizer = MarianTokenizer.from_pretrained(model_name) @@ -79,7 +80,7 @@ def _translate_text(text, model, tokenizer): with warnings.catch_warnings(): warnings.simplefilter("ignore") translated = model.generate( - **tokenizer([text], return_tensors="pt", padding="max_length", max_length=512), + **tokenizer([text], return_tensors="pt", padding=True, truncation=True), ) return [tokenizer.decode(t, max_new_tokens=512, skip_special_tokens=True) for t in translated][ 0