diff --git a/.github/workflows/optimum.yml b/.github/workflows/optimum.yml index f5f59ec89..3b0d137da 100644 --- a/.github/workflows/optimum.yml +++ b/.github/workflows/optimum.yml @@ -52,9 +52,9 @@ jobs: if: matrix.python-version == '3.9' && runner.os == 'Linux' run: hatch run lint:all - # - name: Generate docs - # if: matrix.python-version == '3.9' && runner.os == 'Linux' - # run: hatch run docs + - name: Generate docs + if: matrix.python-version == '3.9' && runner.os == 'Linux' + run: hatch run docs - name: Run tests run: hatch run cov diff --git a/README.md b/README.md index b502f3f66..2db267f6b 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [nvidia-haystack](integrations/nvidia/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/nvidia-haystack.svg?color=orange)](https://pypi.org/project/nvidia-haystack) | [![Test / nvidia](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml) | | [ollama-haystack](integrations/ollama/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/ollama-haystack) | [![Test / ollama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml) | | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | +| [optimum-haystack](integrations/optimum/) | Embedder | [![PyPI - 
Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) | [![Test / optimum](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml) | | [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) | | [pgvector-haystack](integrations/pgvector/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack) | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml) | | [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) | diff --git a/integrations/optimum/README.md b/integrations/optimum/README.md index 1438f6e92..d620b1bb3 100644 --- a/integrations/optimum/README.md +++ b/integrations/optimum/README.md @@ -1,30 +1,24 @@ # optimum -[![PyPI - Version](https://img.shields.io/pypi/v/optimum.svg)](https://pypi.org/project/optimum-haystack) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/optimum.svg)](https://pypi.org/project/optimum-haystack) +[![PyPI - 
Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) ------ +--- Component to embed strings and Documents using models loaded with the HuggingFace Optimum library. This component is designed to seamlessly inference models using the high speed ONNX runtime. **Table of Contents** -- [Installation](#installation) -- [License](#license) +- [optimum](#optimum) + - [Installation](#installation) + - [License](#license) ## Installation -To use the ONNX runtime for CPU, use the CPU version: ```console -pip install optimum-haystack[cpu] +pip install optimum-haystack ``` -For using the GPU runtimes: -```console -pip install optimum-haystack[gpu] -``` - - ## License `optimum-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/optimum/example/example.py b/integrations/optimum/example/example.py new file mode 100644 index 000000000..0d86ce99b --- /dev/null +++ b/integrations/optimum/example/example.py @@ -0,0 +1,34 @@ +# This example requires GPU support to execute. 
+ +from haystack import Pipeline + +from haystack_integrations.components.embedders.optimum import ( + OptimumTextEmbedder, + OptimumEmbedderPooling, + OptimumEmbedderOptimizationConfig, + OptimumEmbedderOptimizationMode, +) + +pipeline = Pipeline() +embedder = OptimumTextEmbedder( + model="intfloat/e5-base-v2", + normalize_embeddings=True, + onnx_execution_provider="CUDAExecutionProvider", + optimizer_settings=OptimumEmbedderOptimizationConfig( + mode=OptimumEmbedderOptimizationMode.O4, + for_gpu=True, + ), + working_dir="/tmp/optimum", + pooling_mode=OptimumEmbedderPooling.MEAN, +) +pipeline.add_component("embedder", embedder) + +results = pipeline.run( + { + "embedder": { + "text": "Ex profunditate antiquae doctrinae, Ad caelos supra semper, Hoc incantamentum evoco, draco apparet, Incantamentum iam transactum est" + }, + } +) + +print(results["embedder"]["embedding"]) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py index 5a4447570..17e553b83 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py @@ -7,7 +7,7 @@ class OptimumEmbedderOptimizationMode(Enum): """ - [ONXX Optimization Modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization.html) + [ONNX Optimization modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization) support by the Optimum Embedders. 
""" diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py index 783e4ac5e..27f533430 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py @@ -70,41 +70,42 @@ def __init__( The [execution provider](https://onnxruntime.ai/docs/execution-providers/) to use for ONNX models. - Note: Using the TensorRT execution provider - TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model - optimization and nodes fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime - provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We - recommend setting these two provider options using the model_kwargs parameter, when using the TensorRT - execution provider. The usage is as follows: - ```python - embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", - onnx_execution_provider="TensorrtExecutionProvider", - model_kwargs={ - "provider_options": { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": "tmp/trt_cache", - } - }, - ) - ``` + Note: Using the TensorRT execution provider + TensorRT requires to build its inference engine ahead of inference, + which takes some time due to the model optimization and nodes fusion. + To avoid rebuilding the engine every time the model is loaded, ONNX + Runtime provides a pair of options to save the engine: `trt_engine_cache_enable` + and `trt_engine_cache_path`. We recommend setting these two provider + options using the `model_kwargs` parameter, when using the TensorRT execution provider. 
+ The usage is as follows: + ```python + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + onnx_execution_provider="TensorrtExecutionProvider", + model_kwargs={ + "provider_options": { + "trt_engine_cache_enable": True, + "trt_engine_cache_path": "tmp/trt_cache", + } + }, + ) + ``` :param pooling_mode: The pooling mode to use. When `None`, pooling mode will be inferred from the model config. :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model. In case of duplication, these kwargs override `model`, `onnx_execution_provider` and `token` initialization parameters. - :param working_dir: - The directory to use for storing intermediate files - generated during model optimization/quantization. - - Required for optimization and quantization. - :param optimizer_settings: - Configuration for Optimum Embedder Optimization. - If `None`, no additional optimization is be applied. - :param quantizer_settings: - Configuration for Optimum Embedder Quantization. - If `None`, no quantization is be applied. + :param working_dir: + The directory to use for storing intermediate files + generated during model optimization/quantization. Required + for optimization and quantization. + :param optimizer_settings: + Configuration for Optimum Embedder Optimization. + If `None`, no additional optimization is applied. + :param quantizer_settings: + Configuration for Optimum Embedder Quantization. + If `None`, no quantization is applied. :param batch_size: Number of Documents to encode at once. :param progress_bar: @@ -199,6 +200,10 @@ def run(self, documents: List[Document]): A list of Documents to embed. :returns: The updated Documents with their embeddings. + :raises RuntimeError: + If the component was not initialized. + :raises TypeError: + If the input is not a list of Documents. """ if not self._initialized: msg = "The embedding model has not been loaded. Please call warm_up() before running." 
diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py index 976a7bfa8..e3cffe183 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py @@ -46,57 +46,58 @@ def __init__( quantizer_settings: Optional[OptimumEmbedderQuantizationConfig] = None, ): """ - Create a OptimumTextEmbedder component. + Create an OptimumTextEmbedder component. :param model: - A string representing the model id on HF Hub. - :param token: - The HuggingFace token to use as HTTP bearer authorization. - :param prefix: - A string to add to the beginning of each text. - :param suffix: - A string to add to the end of each text. - :param normalize_embeddings: - Whether to normalize the embeddings to unit length. - :param onnx_execution_provider: - The [execution provider](https://onnxruntime.ai/docs/execution-providers/) - to use for ONNX models. - - Note: Using the TensorRT execution provider - TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model - optimization and nodes fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime - provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We - recommend setting these two provider options using the model_kwargs parameter, when using the TensorRT - execution provider. 
The usage is as follows: - ```python - embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", - onnx_execution_provider="TensorrtExecutionProvider", - model_kwargs={ - "provider_options": { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": "tmp/trt_cache", - } - }, - ) - ``` - :param pooling_mode: - The pooling mode to use. When `None`, pooling mode will be inferred from the model config. - :param model_kwargs: - Dictionary containing additional keyword arguments to pass to the model. - In case of duplication, these kwargs override `model`, `onnx_execution_provider` - and `token` initialization parameters. - :param working_dir: - The directory to use for storing intermediate files - generated during model optimization/quantization. - - Required for optimization and quantization. - :param optimizer_settings: - Configuration for Optimum Embedder Optimization. - If `None`, no additional optimization is applied. - :param quantizer_settings: - Configuration for Optimum Embedder Quantization. - If `None`, no quantization is applied. + A string representing the model id on HF Hub. + :param token: + The HuggingFace token to use as HTTP bearer authorization. + :param prefix: + A string to add to the beginning of each text. + :param suffix: + A string to add to the end of each text. + :param normalize_embeddings: + Whether to normalize the embeddings to unit length. + :param onnx_execution_provider: + The [execution provider](https://onnxruntime.ai/docs/execution-providers/) + to use for ONNX models. + + Note: Using the TensorRT execution provider + TensorRT requires to build its inference engine ahead of inference, + which takes some time due to the model optimization and nodes fusion. + To avoid rebuilding the engine every time the model is loaded, ONNX + Runtime provides a pair of options to save the engine: `trt_engine_cache_enable` + and `trt_engine_cache_path`. 
We recommend setting these two provider + options using the `model_kwargs` parameter, when using the TensorRT execution provider. + The usage is as follows: + ```python + embedder = OptimumTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + onnx_execution_provider="TensorrtExecutionProvider", + model_kwargs={ + "provider_options": { + "trt_engine_cache_enable": True, + "trt_engine_cache_path": "tmp/trt_cache", + } + }, + ) + ``` + :param pooling_mode: + The pooling mode to use. When `None`, pooling mode will be inferred from the model config. + :param model_kwargs: + Dictionary containing additional keyword arguments to pass to the model. + In case of duplication, these kwargs override `model`, `onnx_execution_provider` + and `token` initialization parameters. + :param working_dir: + The directory to use for storing intermediate files + generated during model optimization/quantization. Required + for optimization and quantization. + :param optimizer_settings: + Configuration for Optimum Embedder Optimization. + If `None`, no additional optimization is applied. + :param quantizer_settings: + Configuration for Optimum Embedder Quantization. + If `None`, no quantization is applied. """ params = _EmbedderParams( model=model, @@ -161,6 +162,10 @@ def run(self, text: str): The text to embed. :returns: The embeddings of the text. + :raises RuntimeError: + If the component was not initialized. + :raises TypeError: + If the input is not a string. """ if not self._initialized: msg = "The embedding model has not been loaded. Please call warm_up() before running." 
diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py index 41aa24d64..2c8bbd967 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py @@ -3,7 +3,7 @@ class OptimumEmbedderPooling(Enum): """ - Pooling Modes support by the Optimum Embedders. + Pooling modes supported by the Optimum Embedders. """ #: Perform CLS Pooling on the output of the embedding model diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py index 2e68081b5..d45369544 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py @@ -7,7 +7,7 @@ class OptimumEmbedderQuantizationMode(Enum): """ - [Dynamic Quantization Modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization) + [Dynamic Quantization modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization) support by the Optimum Embedders. """