diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c6ea752b..02b7475c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -161,6 +161,12 @@ jobs: restore-keys: | ${{ runner.os }}-pip- + - name: Cache NLTK data + uses: actions/cache@v3 + with: + path: ~/nltk_data + key: nltk-${{ runner.os }} + - name: Install Dependencies run: source ./setup_dev_env.sh diff --git a/packages/ragbits-document-search/pyproject.toml b/packages/ragbits-document-search/pyproject.toml index 54df4b4c..b21990b4 100644 --- a/packages/ragbits-document-search/pyproject.toml +++ b/packages/ragbits-document-search/pyproject.toml @@ -31,7 +31,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries :: Python Modules", ] -dependencies = ["unstructured>=0.15.13", "unstructured-client>=0.26.0", "pdf2image>=1.17.0", "ragbits-core==0.5.1"] +dependencies = ["unstructured>=0.16.9", "unstructured-client>=0.26.0", "pdf2image>=1.17.0", "ragbits-core==0.5.1"] [project.urls] "Homepage" = "https://github.com/deepsense-ai/ragbits" diff --git a/uv.lock b/uv.lock index 1ad2a33b..26655e9f 100644 --- a/uv.lock +++ b/uv.lock @@ -1461,6 +1461,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/34/e8b383f35b77c402d28563d2b8f83159319b509bc5f760b15d60b0abf165/hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c", size = 32611 }, ] +[[package]] +name = "html5lib" +version = "1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, + { name = "webencodings" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ac/b6/b55c3f49042f1df3dcd422b7f224f939892ee94f22abcf503a9b7339eaf2/html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f", size = 272215 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/dd/a834df6482147d48e225a49515aabc28974ad5a4ca3215c18a882565b028/html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d", size = 112173 }, +] + [[package]] name = "httpcore" version = "1.0.6" @@ -3926,7 +3939,7 @@ requires-dist = [ { name = "pdf2image", specifier = ">=1.17.0" }, { name = "ragbits-core", editable = "packages/ragbits-core" }, { name = "ray", marker = "extra == 'distributed'", specifier = ">=2.39.0" }, - { name = "unstructured", specifier = ">=0.15.13" }, + { name = "unstructured", specifier = ">=0.16.9" }, { name = "unstructured-client", specifier = ">=0.26.0" }, ] @@ -4857,15 +4870,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/ff/c87e0622b1dadea79d2fb0b25ade9ed98954c9033722eb707053d310d4f3/sympy-1.13.3-py3-none-any.whl", hash = "sha256:54612cf55a62755ee71824ce692986f23c88ffa77207b30c1368eda4a7060f73", size = 6189483 }, ] -[[package]] -name = "tabulate" -version = "0.9.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252 }, -] - [[package]] name = "tblib" version = "3.0.0" @@ -5176,7 +5180,7 @@ wheels = [ [[package]] name = "unstructured" -version = "0.15.13" +version = "0.16.11" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "backoff" }, @@ -5185,6 +5189,7 @@ dependencies = [ { name = "dataclasses-json" }, { name = "emoji" }, { name = "filetype" }, + { name = "html5lib" }, { name = "langdetect" }, { name = "lxml" }, { name = "nltk" }, @@ -5195,15 +5200,14 @@ dependencies = [ { name = "python-oxmsg" }, { name = "rapidfuzz" }, { name = "requests" }, - { name = "tabulate" }, { name = "tqdm" }, { name = "typing-extensions" }, { name = "unstructured-client" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/46/40/d88c658898474e40a4b262abd953040c13f1fc684b863458ed069c40254a/unstructured-0.15.13.tar.gz", hash = "sha256:3d62573d8f0caca9211ce5e7f2705d8c7ab67e4427bd18259e218a19bbb21c25", size = 1859358 } +sdist = { url = "https://files.pythonhosted.org/packages/08/c3/5bd80074a63c972f0f9adac5276955f6cbcceb71ac05ae72cc524ad93813/unstructured-0.16.11.tar.gz", hash = "sha256:33ebf68aae11ce33c8a96335296557b5abd8ba96eaba3e5a1554c0b9eee40bb5", size = 1664314 } wheels = [ - { url = "https://files.pythonhosted.org/packages/e4/40/76e2b7c798c1a3ab29a8061bd9bb014e167259d0a8ba64720b6e43b89c29/unstructured-0.15.13-py3-none-any.whl", hash = "sha256:6885add1bb1e4e428cf76f160f4a6f5aed6e707b3b22e69c716efb19a4957dbf", size = 2120992 }, + { url = "https://files.pythonhosted.org/packages/59/67/71d1e61e8127dd9ab66117d3c5ebfc6f87c1d00bf13ff3bcc837feed6e09/unstructured-0.16.11-py3-none-any.whl", hash = "sha256:a92d5bc2c2b7bb23369641fb7a7f0daba1775639199306ce4cd83ca564a03763", size = 1748042 }, ] [[package]] @@ -5425,6 +5429,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/33/12020ba99beaff91682b28dc0bbf0345bbc3244a4afbae7644e4fa348f23/webcolors-24.8.0-py3-none-any.whl", hash = "sha256:fc4c3b59358ada164552084a8ebee637c221e4059267d0f8325b3b560f6c7f0a", size = 15027 }, ] +[[package]] +name = "webencodings" +version = "0.5.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/02/ae6ceac1baeda530866a85075641cec12989bd8d31af6d5ab4a3e8c92f47/webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923", size = 9721 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774 }, +] + [[package]] name = "websocket-client" version = "1.8.0"