From f7f4d0f57dc9792f86fca433e5dc12a0ddca4408 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 16 Jan 2024 15:58:53 +0100 Subject: [PATCH 1/8] reorganize integrations --- .../unstructured/{fileconverter => }/LICENSE | 0 .../{fileconverter => }/README.md | 0 .../{fileconverter => }/pyproject.toml | 26 +++++++++--------- .../converters/unstructured}/__init__.py | 2 +- .../converters/unstructured/converter.py} | 0 .../tests/samples/sample_pdf.pdf | Bin .../test_converter.py} | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) rename integrations/unstructured/{fileconverter => }/LICENSE (100%) rename integrations/unstructured/{fileconverter => }/README.md (100%) rename integrations/unstructured/{fileconverter => }/pyproject.toml (81%) rename integrations/unstructured/{fileconverter/src/unstructured_fileconverter_haystack => src/haystack_integrations/components/converters/unstructured}/__init__.py (63%) rename integrations/unstructured/{fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py => src/haystack_integrations/components/converters/unstructured/converter.py} (100%) rename integrations/unstructured/{fileconverter => }/tests/samples/sample_pdf.pdf (100%) rename integrations/unstructured/{fileconverter/tests/test_fileconverter.py => tests/test_converter.py} (97%) diff --git a/integrations/unstructured/fileconverter/LICENSE b/integrations/unstructured/LICENSE similarity index 100% rename from integrations/unstructured/fileconverter/LICENSE rename to integrations/unstructured/LICENSE diff --git a/integrations/unstructured/fileconverter/README.md b/integrations/unstructured/README.md similarity index 100% rename from integrations/unstructured/fileconverter/README.md rename to integrations/unstructured/README.md diff --git a/integrations/unstructured/fileconverter/pyproject.toml b/integrations/unstructured/pyproject.toml similarity index 81% rename from integrations/unstructured/fileconverter/pyproject.toml rename to 
integrations/unstructured/pyproject.toml index 97d3e068c..bdd40f4aa 100644 --- a/integrations/unstructured/fileconverter/pyproject.toml +++ b/integrations/unstructured/pyproject.toml @@ -24,23 +24,25 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - # we distribute the preview version of Haystack 2.0 under the package "haystack-ai" "haystack-ai", "unstructured<0.11.4", # FIXME: investigate why 0.11.4 broke the tests ] [project.urls] -Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured/fileconverter#readme" +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured#readme" Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" -Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured/fileconverter" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] [tool.hatch.version] source = "vcs" -tag-pattern = 'integrations\/unstructured-fileconverter-v(?P<version>.*)' +tag-pattern = 'integrations\/unstructured-v(?P<version>.*)' [tool.hatch.version.raw-options] root = "../../.." 
-git_describe_command = 'git describe --tags --match="integrations/unstructured-fileconverter-v[0-9]*"' +git_describe_command = 'git describe --tags --match="integrations/unstructured-v[0-9]*"' [tool.hatch.envs.default] dependencies = [ @@ -71,7 +73,7 @@ dependencies = [ "ruff>=0.0.243", ] [tool.hatch.envs.lint.scripts] -typing = "mypy --install-types --non-interactive {args:src/unstructured_fileconverter_haystack tests}" +typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" style = [ "ruff {args:.}", "black --check --diff {args:.}", @@ -140,25 +142,22 @@ unfixable = [ ] [tool.ruff.isort] -known-first-party = ["unstructured_fileconverter_haystack"] +known-first-party = ["src"] [tool.ruff.flake8-tidy-imports] -ban-relative-imports = "all" +ban-relative-imports = "parents" [tool.ruff.per-file-ignores] # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] [tool.coverage.run] -source_pkgs = ["unstructured_fileconverter_haystack", "tests"] +source_pkgs = ["src", "tests"] branch = true parallel = true -omit = [ - "src/unstructured_fileconverter/__about__.py", -] [tool.coverage.paths] -unstructured_fileconverter_haystack = ["src/unstructured_fileconverter_haystack", "*/unstructured-fileconverter-haystack/src/unstructured_fileconverter_haystack"] +unstructured_fileconverter_haystack = ["src/haystack_integrations", "*/unstructured-fileconverter-haystack/src"] tests = ["tests", "*/unstructured-fileconverter-haystack/tests"] [tool.coverage.report] @@ -178,6 +177,7 @@ markers = [ [[tool.mypy.overrides]] module = [ "haystack.*", + "haystack_integrations.*", "pytest.*" ] ignore_missing_imports = true diff --git a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/__init__.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/__init__.py similarity index 63% rename from 
integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/__init__.py rename to integrations/unstructured/src/haystack_integrations/components/converters/unstructured/__init__.py index bcce95bea..26f14134b 100644 --- a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/__init__.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/__init__.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -from unstructured_fileconverter_haystack.fileconverter import UnstructuredFileConverter +from .converter import UnstructuredFileConverter __all__ = ["UnstructuredFileConverter"] diff --git a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py similarity index 100% rename from integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py rename to integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py diff --git a/integrations/unstructured/fileconverter/tests/samples/sample_pdf.pdf b/integrations/unstructured/tests/samples/sample_pdf.pdf similarity index 100% rename from integrations/unstructured/fileconverter/tests/samples/sample_pdf.pdf rename to integrations/unstructured/tests/samples/sample_pdf.pdf diff --git a/integrations/unstructured/fileconverter/tests/test_fileconverter.py b/integrations/unstructured/tests/test_converter.py similarity index 97% rename from integrations/unstructured/fileconverter/tests/test_fileconverter.py rename to integrations/unstructured/tests/test_converter.py index a9c724cba..2ffc442d2 100644 --- a/integrations/unstructured/fileconverter/tests/test_fileconverter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -5,7 +5,7 @@ import pytest -from 
unstructured_fileconverter_haystack import UnstructuredFileConverter +from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter @pytest.fixture From e8e2d08e4add10fd642191fbb9029626697fc26d Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 16 Jan 2024 16:54:11 +0100 Subject: [PATCH 2/8] some other changes, including the workflow --- ...ructured_fileconverter.yml => unstructured.yml} | 14 ++++++++------ integrations/unstructured/pyproject.toml | 2 +- integrations/unstructured/tests/test_converter.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) rename .github/workflows/{unstructured_fileconverter.yml => unstructured.yml} (80%) diff --git a/.github/workflows/unstructured_fileconverter.yml b/.github/workflows/unstructured.yml similarity index 80% rename from .github/workflows/unstructured_fileconverter.yml rename to .github/workflows/unstructured.yml index ee70510e9..a160eb597 100644 --- a/.github/workflows/unstructured_fileconverter.yml +++ b/.github/workflows/unstructured.yml @@ -1,17 +1,21 @@ # This workflow comes from https://github.com/ofek/hatch-mypyc # https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml -name: Test / unstructured / fileconverter +name: Test / unstructured on: schedule: - cron: "0 0 * * *" pull_request: paths: - - "integrations/unstructured/fileconverter/**" - - ".github/workflows/unstructured_fileconverter.yml" + - "integrations/unstructured/**" + - ".github/workflows/unstructured.yml" + +defaults: + run: + working-directory: integrations/unstructured concurrency: - group: unstructured_fileconverter-${{ github.head_ref }} + group: unstructured_-${{ github.head_ref }} cancel-in-progress: true env: @@ -50,10 +54,8 @@ jobs: run: pip install --upgrade hatch - name: Lint - working-directory: integrations/unstructured/fileconverter if: matrix.python-version == '3.9' run: hatch run lint:all - name: Run tests - working-directory: 
integrations/unstructured/fileconverter run: hatch run cov diff --git a/integrations/unstructured/pyproject.toml b/integrations/unstructured/pyproject.toml index bdd40f4aa..e199b3c3e 100644 --- a/integrations/unstructured/pyproject.toml +++ b/integrations/unstructured/pyproject.toml @@ -41,7 +41,7 @@ source = "vcs" tag-pattern = 'integrations\/unstructured-v(?P<version>.*)' [tool.hatch.version.raw-options] -root = "../../.." +root = "../.." git_describe_command = 'git describe --tags --match="integrations/unstructured-v[0-9]*"' [tool.hatch.envs.default] diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index 2ffc442d2..5646bb565 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -43,7 +43,7 @@ def test_to_dict(self): converter_dict = converter.to_dict() assert converter_dict == { - "type": "unstructured_fileconverter_haystack.fileconverter.UnstructuredFileConverter", + "type": "haystack_integrations.components.converters.unstructured.converter.UnstructuredFileConverter", "init_parameters": { "api_url": "https://api.unstructured.io/general/v0/general", "document_creation_mode": "one-doc-per-file", From f4043a6372c1bc6f1272264150269737b81a1773 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 16 Jan 2024 17:11:20 +0100 Subject: [PATCH 3/8] fix fmt --- .../components/converters/unstructured/converter.py | 1 + integrations/unstructured/tests/test_converter.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index d94cb49c4..92348e6cd 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ 
b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -9,6 +9,7 @@ from haystack import Document, component, default_to_dict from tqdm import tqdm + from unstructured.documents.elements import Element # type: ignore[import] from unstructured.partition.api import partition_via_api # type: ignore[import] diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index 5646bb565..b0473df25 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -4,7 +4,6 @@ from pathlib import Path import pytest - from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter From 1cbab142073815477e0565097a41dc537d55fd61 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 16 Jan 2024 17:24:43 +0100 Subject: [PATCH 4/8] retry --- .github/workflows/unstructured.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/unstructured.yml b/.github/workflows/unstructured.yml index a160eb597..72f1f235b 100644 --- a/.github/workflows/unstructured.yml +++ b/.github/workflows/unstructured.yml @@ -8,11 +8,7 @@ on: pull_request: paths: - "integrations/unstructured/**" - - ".github/workflows/unstructured.yml" - -defaults: - run: - working-directory: integrations/unstructured + - ".github/workflows/unstructured.yml" concurrency: group: unstructured_-${{ github.head_ref }} @@ -54,8 +50,10 @@ jobs: run: pip install --upgrade hatch - name: Lint + working-directory: integrations/ollama if: matrix.python-version == '3.9' run: hatch run lint:all - name: Run tests + working-directory: integrations/ollama run: hatch run cov From cc6117ceb68e5e5b5ddf3e970a77ecb7aa6e1abb Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 16 Jan 2024 17:37:26 +0100 Subject: [PATCH 5/8] fix workflow --- .github/workflows/unstructured.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/.github/workflows/unstructured.yml b/.github/workflows/unstructured.yml index 72f1f235b..6338b06e8 100644 --- a/.github/workflows/unstructured.yml +++ b/.github/workflows/unstructured.yml @@ -11,7 +11,7 @@ on: - ".github/workflows/unstructured.yml" concurrency: - group: unstructured_-${{ github.head_ref }} + group: unstructured-${{ github.head_ref }} cancel-in-progress: true env: @@ -50,10 +50,10 @@ jobs: run: pip install --upgrade hatch - name: Lint - working-directory: integrations/ollama + working-directory: integrations/unstructured if: matrix.python-version == '3.9' run: hatch run lint:all - name: Run tests - working-directory: integrations/ollama + working-directory: integrations/unstructured run: hatch run cov From f798dba23438e4d6cd87905ca11b68de7d579e66 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 16 Jan 2024 17:42:24 +0100 Subject: [PATCH 6/8] try to fix coverage error --- integrations/unstructured/tests/__init__.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 integrations/unstructured/tests/__init__.py diff --git a/integrations/unstructured/tests/__init__.py b/integrations/unstructured/tests/__init__.py new file mode 100644 index 000000000..49fd5f144 --- /dev/null +++ b/integrations/unstructured/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file From 70a0e2cd830ff10bf662319d441fc4eccefea7a6 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 16 Jan 2024 17:51:11 +0100 Subject: [PATCH 7/8] fix fmt again --- integrations/unstructured/tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/unstructured/tests/__init__.py b/integrations/unstructured/tests/__init__.py index 49fd5f144..e873bc332 100644 --- a/integrations/unstructured/tests/__init__.py +++ b/integrations/unstructured/tests/__init__.py @@ -1,3 +1,3 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # -# 
SPDX-License-Identifier: Apache-2.0 \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 From 594c2cb0206cc0e3363a81e461e1c65c2fd26ff3 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Wed, 17 Jan 2024 08:25:49 +0100 Subject: [PATCH 8/8] standardize README --- integrations/unstructured/README.md | 89 +++++++---------------------- 1 file changed, 20 insertions(+), 69 deletions(-) diff --git a/integrations/unstructured/README.md b/integrations/unstructured/README.md index 274c01c0f..db74c5306 100644 --- a/integrations/unstructured/README.md +++ b/integrations/unstructured/README.md @@ -1,86 +1,37 @@ -# Unstructured FileConverter for Haystack +# unstructured-fileconverter-haystack - +[![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) -Component for the Haystack (2.x) LLM framework to easily convert files and directories into Documents using the Unstructured API. +----- -**[Unstructured](https://unstructured-io.github.io/unstructured/index.html)** provides a series of tools to do **ETL for LLMs**. This component calls the Unstructured API that simply extracts text and other information from a vast range of file formats. -**[Supported file types](https://unstructured-io.github.io/unstructured/api.html#supported-file-types)**. +**Table of Contents** -**[Haystack](https://github.com/deepset-ai/haystack)** is an **orchestration framework** to build customizable, production-ready **LLM applications**. -Once your files are converted into Documents, you can start building RAG, question answering, semantic search applications and more. 
- -- [Installation](#installation) -- [Usage](#usage) -- [Configuration](#configuration) +- [unstructured-fileconverter-haystack](#unstructured-fileconverter-haystack) + - [Installation](#installation) + - [License](#license) + - [Testing](#testing) ## Installation -```bash +```console pip install unstructured-fileconverter-haystack ``` -### Hosted API -If you plan to use the hosted version of the Unstructured API, you just need the **(free) Unsctructured API key**. You can get it by signing up [here](https://unstructured.io/api-key). - -### Local API (Docker) -If you want to run your own local instance of the Unstructured API, you need Docker and you can find instructions [here](https://unstructured-io.github.io/unstructured/api.html#using-docker-images). - -In short, this should work: -```bash -docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0 -``` - -## Usage +## License -### In isolation -```python -import os -from unstructured_fileconverter_haystack import UnstructuredFileConverter +`unstructured-fileconverter-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. 
-os.environ["UNSTRUCTURED_API_KEY"] = "YOUR-API-KEY" +## Testing -converter = UnstructuredFileConverter() - -documents = converter.run(paths = ["a/file/path.pdf", "a/directory/path"])["documents"] - -``` +To run tests, first start a Docker container running the Unstructured API: -### In a Haystack Pipeline -```python -import os -from haystack import Pipeline -from haystack.components.writers import DocumentWriter -from haystack.document_stores import MemoryDocumentStore -from unstructured_fileconverter_haystack import UnstructuredFileConverter - -os.environ["UNSTRUCTURED_API_KEY"] = "YOUR-API-KEY" - -document_store = MemoryDocumentStore() - -indexing = Pipeline() -indexing.add_component("converter", UnstructuredFileConverter()) -indexing.add_component("writer", DocumentWriter(document_store)) -indexing.connect("converter", "writer") - -indexing.run({"converter": {"paths": ["a/file/path.pdf", "a/directory/path"]}}) +```console +docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0 ``` -## Configuration - -### Initialization parameters -- `api_url`: URL of the Unstructured API. Defaults to the hosted version. If you run the API locally, you should specify this parameter. -- `api_key`: API key for the Unstructured API (https://unstructured.io/#get-api-key). - If you run the API locally, it is not needed. - If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY. -- `document_creation_mode`: How to create Haystack Documents from the elements returned by Unstructured. - - `"one-doc-per-file"`: One Haystack Document per file. All elements are concatenated into one text field. - - `"one-doc-per-page"`: One Haystack Document per page. All elements on a page are concatenated into one text field. - - `"one-doc-per-element"`: One Haystack Document per element. 
Each element is converted to a Haystack Document - - `separator`: Separator between elements when concatenating them into one text field. -- `unstructured_kwargs`: Additional keyword arguments that are passed to the Unstructured API. They can be helpful to improve or speed up the conversion. See https://unstructured-io.github.io/unstructured/api.html#parameters. - -### `run` method -The method `run` just expects a list of paths (files or directories) in the `paths` parameter. +Then run tests: -If `paths` contains a directory, all files in the first level of the directory are converted. Subdirectories are ignored. +```console +hatch run test +``` \ No newline at end of file