From 18a16ab98e9d1cb615425898821e417beab7dd02 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:49:29 +0530 Subject: [PATCH 01/10] Add Optimum Embedders --- integrations/optimum/LICENSE.txt | 73 +++++ integrations/optimum/README.md | 30 ++ integrations/optimum/pyproject.toml | 201 +++++++++++++ .../components/embedders/__init__.py | 8 + .../embedders/optimum_document_embedder.py | 269 ++++++++++++++++++ .../embedders/optimum_text_embedder.py | 200 +++++++++++++ integrations/optimum/tests/__init__.py | 3 + .../tests/test_optimum_document_embedder.py | 233 +++++++++++++++ .../tests/test_optimum_text_embedder.py | 139 +++++++++ 9 files changed, 1156 insertions(+) create mode 100644 integrations/optimum/LICENSE.txt create mode 100644 integrations/optimum/README.md create mode 100644 integrations/optimum/pyproject.toml create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/__init__.py create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py create mode 100644 integrations/optimum/tests/__init__.py create mode 100644 integrations/optimum/tests/test_optimum_document_embedder.py create mode 100644 integrations/optimum/tests/test_optimum_text_embedder.py diff --git a/integrations/optimum/LICENSE.txt b/integrations/optimum/LICENSE.txt new file mode 100644 index 000000000..137069b82 --- /dev/null +++ b/integrations/optimum/LICENSE.txt @@ -0,0 +1,73 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + + You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/integrations/optimum/README.md b/integrations/optimum/README.md
new file mode 100644
index 000000000..1438f6e92
--- /dev/null
+++ b/integrations/optimum/README.md
@@ -0,0 +1,30 @@
+# optimum-haystack
+
+[![PyPI - Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack)
+
+-----
+
+Component to embed strings and Documents using models loaded with the HuggingFace Optimum library. This component is designed for seamless inference of models using the high-speed ONNX Runtime.
+
+**Table of Contents**
+
+- [Installation](#installation)
+- [License](#license)
+
+## Installation
+
+To run models on the ONNX Runtime for CPU, install the CPU version:
+```console
+pip install optimum-haystack[cpu]
+```
+
+To use the GPU runtimes, install the GPU version:
+```console
+pip install optimum-haystack[gpu]
+```
+
+
+## License
+
+`optimum-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
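For reference, a minimal end-to-end sketch of the document embedder this patch adds, mirroring the usage example in its docstring below (the printed values are illustrative):

```python
from haystack.dataclasses import Document
from haystack_integrations.components.embedders import OptimumDocumentEmbedder

# warm_up() converts the model to ONNX on the fly before the first call to run()
embedder = OptimumDocumentEmbedder(model="BAAI/bge-small-en-v1.5")
embedder.warm_up()

# run() returns {"documents": [...]} with the embedding stored on each Document
result = embedder.run([Document(content="I love pizza!")])
print(result["documents"][0].embedding[:2])  # e.g. [0.01702..., -0.02325...]
```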
diff --git a/integrations/optimum/pyproject.toml b/integrations/optimum/pyproject.toml
new file mode 100644
index 000000000..186a75e22
--- /dev/null
+++ b/integrations/optimum/pyproject.toml
@@ -0,0 +1,201 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "optimum-haystack"
+dynamic = ["version"]
+description = "Component to embed strings and Documents using models loaded with the HuggingFace Optimum library. This component is designed for seamless inference of models using the high-speed ONNX Runtime."
+readme = "README.md"
+requires-python = ">=3.8"
+license = "Apache-2.0"
+keywords = []
+authors = [
+  { name = "deepset GmbH", email = "info@deepset.ai" },
+  { name = "Ashwin Mathur", email = "" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = [
+  "haystack-ai",
+  "transformers[sentencepiece]==4.36.2",
+  "sentence-transformers>=2.2.0",
+]
+
+[project.optional-dependencies]
+cpu = [
+  "optimum[onnxruntime]==1.15.0",
+]
+gpu = [
+  "optimum[onnxruntime-gpu]==1.15.0",
+]
+
+[project.urls]
+Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/optimum#readme"
+Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
+Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/optimum"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/haystack_integrations"]
+
+[tool.hatch.version]
+source = "vcs"
+tag-pattern = 'integrations\/optimum-v(?P<version>.*)'
+
+[tool.hatch.version.raw-options]
+root = "../.."
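+# For example (illustrative), the tag "integrations/optimum-v0.1.0" resolves to version "0.1.0"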
+git_describe_command = 'git describe --tags --match="integrations/optimum-v[0-9]*"' + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = [ + "- coverage combine", + "coverage report", +] +cov = [ + "test-cov", + "cov-report", +] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11", "3.12"] + + +[tool.hatch.envs.lint] +detached = true +dependencies = [ + "black>=23.1.0", + "mypy>=1.0.0", + "ruff>=0.0.243", +] + +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" +style = [ + "ruff {args:.}", + "black --check --diff {args:.}", +] +fmt = [ + "black {args:.}", + "ruff --fix {args:.}", + "style", +] +all = [ + "style", + "typing", +] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.ruff.lint.isort] +known-first-party = ["src"] + +[tool.black] +target-version = ["py37"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py37" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] +# Examples can print their output +"examples/**" = ["T201"] +"tests/**" = ["T201"] + +[tool.coverage.run] +source_pkgs = ["optimum", "tests"] +branch = true +parallel = true + +[tool.coverage.paths] +optimum = ["src/haystack_integrations", "*/optimum/src/haystack_integrations"] +tests = ["tests", "*/optimum/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[[tool.mypy.overrides]] +module = [ + "haystack.*", + "haystack_integrations.*", + "pytest.*", + "numpy.*", + "optimum.*", + "torch.*", + "transformers.*", + "huggingface_hub.*" +] +ignore_missing_imports = true + +[tool.pytest.ini_options] +addopts = ["--strict-markers", "-vv"] +markers = [ + "integration: integration tests", + "unit: unit tests", + "embedders: embedders tests", +] +log_cli = true diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/__init__.py b/integrations/optimum/src/haystack_integrations/components/embedders/__init__.py new file mode 100644 index 000000000..4e5ac1535 --- /dev/null +++ b/integrations/optimum/src/haystack_integrations/components/embedders/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from .optimum_document_embedder import OptimumDocumentEmbedder +from .optimum_text_embedder import OptimumTextEmbedder + +__all__ = ["OptimumDocumentEmbedder", "OptimumTextEmbedder"] diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py new 
file mode 100644
index 000000000..1aaa65bb4
--- /dev/null
+++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py
@@ -0,0 +1,269 @@
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import torch
+from haystack import Document, component, default_from_dict, default_to_dict
+from haystack.utils import Secret, deserialize_secrets_inplace
+from haystack.utils.hf import HFModelType, check_valid_model, deserialize_hf_model_kwargs, serialize_hf_model_kwargs
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+
+@component
+class OptimumDocumentEmbedder:
+    """
+    A component for computing Document embeddings using models loaded with the HuggingFace Optimum library.
+    This component is designed for seamless inference of models using the high-speed ONNX Runtime.
+
+    The embedding of each Document is stored in the `embedding` field of the Document.
+
+    Usage example:
+    ```python
+    from haystack.dataclasses import Document
+    from haystack_integrations.components.embedders import OptimumDocumentEmbedder
+
+    doc = Document(content="I love pizza!")
+
+    document_embedder = OptimumDocumentEmbedder(model="BAAI/bge-small-en-v1.5")
+    document_embedder.warm_up()
+
+    result = document_embedder.run([doc])
+    print(result["documents"][0].embedding)
+
+    # [0.017020374536514282, -0.023255806416273117, ...]
+    ```
+
+    Key Features and Compatibility:
+    - **Primary Compatibility**: Designed to work seamlessly with any embedding model present on the Hugging Face
+      Hub.
+    - **Conversion to ONNX**: The models are converted to ONNX using the HuggingFace Optimum library. This is
+      performed in real-time, during the warm-up step.
+    - **Accelerated Inference on GPU**: Supports using different execution providers such as CUDA and TensorRT, to
+      accelerate ONNX Runtime inference on GPUs.
+      Simply pass the execution provider as the `onnx_execution_provider` parameter. Additional parameters can be
+      passed to the model using the `model_kwargs` parameter.
+      For more details, refer to the HuggingFace documentation:
+      https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu.
+    """
+
+    def __init__(
+        self,
+        model: str = "BAAI/bge-small-en-v1.5",
+        token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False),  # noqa: B008
+        prefix: str = "",
+        suffix: str = "",
+        normalize_embeddings: bool = True,
+        onnx_execution_provider: str = "CPUExecutionProvider",
+        model_kwargs: Optional[Dict[str, Any]] = None,
+        batch_size: int = 32,
+        progress_bar: bool = True,
+        meta_fields_to_embed: Optional[List[str]] = None,
+        embedding_separator: str = "\n",
+    ):
+        """
+        Create an OptimumDocumentEmbedder component.
+
+        :param model: A string representing the model id on HF Hub. Default is "BAAI/bge-small-en-v1.5".
+        :param token: The HuggingFace token to use as HTTP bearer authorization.
+        :param prefix: A string to add to the beginning of each text.
+        :param suffix: A string to add to the end of each text.
+        :param normalize_embeddings: Whether to normalize the embeddings to unit length.
+        :param onnx_execution_provider: The execution provider to use for ONNX models. Defaults to
+            "CPUExecutionProvider".
+        :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model.
+            In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token`
+            initialization parameters.
+        :param batch_size: Number of Documents to encode at once.
+        :param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production
+            deployments to keep the logs clean.
+        :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document text.
+        :param embedding_separator: Separator used to concatenate the meta fields to the Document text.
+        """
+        check_valid_model(model, HFModelType.EMBEDDING, token)
+        self.model = model
+
+        self.token = token
+        token = token.resolve_value() if token else None
+
+        self.prefix = prefix
+        self.suffix = suffix
+        self.normalize_embeddings = normalize_embeddings
+        self.onnx_execution_provider = onnx_execution_provider
+        self.batch_size = batch_size
+        self.progress_bar = progress_bar
+        self.meta_fields_to_embed = meta_fields_to_embed or []
+        self.embedding_separator = embedding_separator
+
+        model_kwargs = model_kwargs or {}
+
+        # Check if the model_kwargs contain the parameters, otherwise, populate them with values from init parameters
+        model_kwargs.setdefault("model_id", model)
+        model_kwargs.setdefault("provider", onnx_execution_provider)
+        model_kwargs.setdefault("use_auth_token", token)
+
+        self.model_kwargs = model_kwargs
+        self.embedding_model = None
+        self.tokenizer = None
+
+    def warm_up(self):
+        """
+        Convert the model to ONNX.
+
+        The model is cached if the "TensorrtExecutionProvider" is used, since it takes a while to build the TensorRT
+        engine.
+        """
+        if self.embedding_model is None:
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model)
+
+            if self.onnx_execution_provider == "TensorrtExecutionProvider":
+                # Cache engine for TensorRT
+                provider_options = {
+                    "trt_engine_cache_enable": True,
+                    "trt_engine_cache_path": f"tmp/trt_cache_{self.model}",
+                }
+                self.embedding_model = ORTModelForFeatureExtraction.from_pretrained(
+                    **self.model_kwargs, use_cache=False, provider_options=provider_options
+                )
+            else:
+                # export=True converts the model to ONNX on the fly
+                self.embedding_model = ORTModelForFeatureExtraction.from_pretrained(**self.model_kwargs, export=True)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        serialization_dict = default_to_dict(
+            self,
+            model=self.model,
+            prefix=self.prefix,
+            suffix=self.suffix,
+            normalize_embeddings=self.normalize_embeddings,
+            onnx_execution_provider=self.onnx_execution_provider,
+            batch_size=self.batch_size,
+            progress_bar=self.progress_bar,
+            meta_fields_to_embed=self.meta_fields_to_embed,
+            embedding_separator=self.embedding_separator,
+            model_kwargs=self.model_kwargs,
+            token=self.token.to_dict() if self.token else None,
+        )
+
+        model_kwargs = serialization_dict["init_parameters"]["model_kwargs"]
+        model_kwargs.pop("token", None)
+
+        serialize_hf_model_kwargs(model_kwargs)
+        return serialization_dict
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "OptimumDocumentEmbedder":
+        """
+        Deserialize this component from a dictionary.
+        """
+        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
+        deserialize_hf_model_kwargs(data["init_parameters"]["model_kwargs"])
+        return default_from_dict(cls, data)
+
+    def mean_pooling(self, model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        """
+        Perform mean pooling on the output of the embedding model.
+
+        :param model_output: The output of the embedding model.
+        :param attention_mask: The attention mask of the tokenized text.
+        :return: The embeddings of the text after mean pooling.
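+
+        Illustrative example: for token embeddings [[1.0, 3.0], [3.0, 5.0]] and an attention mask of [1, 1],
+        the pooled embedding is [2.0, 4.0]; with mask [1, 0], only the first token counts, giving [1.0, 3.0].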
+ """ + # First element of model_output contains all token embeddings + token_embeddings = model_output[0] + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) + return sum_embeddings / sum_mask + + def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: + """ + Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. + """ + texts_to_embed = [] + for doc in documents: + meta_values_to_embed = [ + str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None + ] + + text_to_embed = ( + self.prefix + self.embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self.suffix + ) + + texts_to_embed.append(text_to_embed) + return texts_to_embed + + def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> List[List[float]]: + """ + Embed a list of texts in batches. + """ + # Determine device for tokenizer output + device = ( + "cuda" + if self.onnx_execution_provider + in ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"] + else "cpu" + ) + + # Sorting by length + length_sorted_idx = np.argsort([-len(sen) for sen in texts_to_embed]) + sentences_sorted = [texts_to_embed[idx] for idx in length_sorted_idx] + + all_embeddings = [] + for i in tqdm( + range(0, len(sentences_sorted), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" + ): + batch = sentences_sorted[i : i + batch_size] + encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device) # type: ignore + + # Compute token embeddings + with torch.no_grad(): + model_output = self.embedding_model(**encoded_input) # type: ignore + + # Perform mean pooling + sentence_embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"].cpu()) + + all_embeddings.extend(sentence_embeddings.tolist()) + + # Reorder embeddings according to original order + all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + + # Normalize all embeddings + if self.normalize_embeddings: + all_embeddings = torch.nn.functional.normalize(torch.tensor(all_embeddings), p=2, dim=1).tolist() + + return all_embeddings + + @component.output_types(documents=List[Document]) + def run(self, documents: List[Document]): + """ + Embed a list of Documents. + The embedding of each Document is stored in the `embedding` field of the Document. + + :param documents: A list of Documents to embed. + :return: A dictionary containing the updated Documents with their embeddings. + """ + if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): + msg = ( + "OptimumDocumentEmbedder expects a list of Documents as input." + " In case you want to embed a string, please use the OptimumTextEmbedder." + ) + raise TypeError(msg) + + if not (self.embedding_model and self.tokenizer): + msg = "The embedding model has not been loaded. Please call warm_up() before running." 
+            raise RuntimeError(msg)
+
+        # Return empty list if no documents
+        if not documents:
+            return {"documents": []}
+
+        texts_to_embed = self._prepare_texts_to_embed(documents=documents)
+
+        embeddings = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size)
+
+        for doc, emb in zip(documents, embeddings):
+            doc.embedding = emb
+
+        return {"documents": documents}
diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py
new file mode 100644
index 000000000..8b6a3b48e
--- /dev/null
+++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py
@@ -0,0 +1,200 @@
+from typing import Any, Dict, List, Optional
+
+import torch
+from haystack import component, default_from_dict, default_to_dict
+from haystack.utils import Secret, deserialize_secrets_inplace
+from haystack.utils.hf import HFModelType, check_valid_model, deserialize_hf_model_kwargs, serialize_hf_model_kwargs
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+from transformers import AutoTokenizer
+
+
+@component
+class OptimumTextEmbedder:
+    """
+    A component to embed text using models loaded with the HuggingFace Optimum library.
+    This component is designed for seamless inference of models using the high-speed ONNX Runtime.
+
+    Usage example:
+    ```python
+    from haystack_integrations.components.embedders import OptimumTextEmbedder
+
+    text_to_embed = "I love pizza!"
+
+    text_embedder = OptimumTextEmbedder(model="BAAI/bge-small-en-v1.5")
+    text_embedder.warm_up()
+
+    print(text_embedder.run(text_to_embed))
+
+    # {'embedding': [-0.07804739475250244, 0.1498992145061493, ...]}
+    ```
+
+    Key Features and Compatibility:
+    - **Primary Compatibility**: Designed to work seamlessly with any embedding model present on the Hugging Face
+      Hub.
+    - **Conversion to ONNX**: The models are converted to ONNX using the HuggingFace Optimum library. This is
+      performed in real-time, during the warm-up step.
+    - **Accelerated Inference on GPU**: Supports using different execution providers such as CUDA and TensorRT, to
+      accelerate ONNX Runtime inference on GPUs.
+      Simply pass the execution provider as the `onnx_execution_provider` parameter. Additional parameters can be
+      passed to the model using the `model_kwargs` parameter.
+      For more details, refer to the HuggingFace documentation:
+      https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu.
+    """
+
+    def __init__(
+        self,
+        model: str = "BAAI/bge-small-en-v1.5",
+        token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False),  # noqa: B008
+        prefix: str = "",
+        suffix: str = "",
+        normalize_embeddings: bool = True,
+        onnx_execution_provider: str = "CPUExecutionProvider",
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Create an OptimumTextEmbedder component.
+
+        :param model: A string representing the model id on HF Hub. Default is "BAAI/bge-small-en-v1.5".
+        :param token: The HuggingFace token to use as HTTP bearer authorization.
+        :param prefix: A string to add to the beginning of each text.
+        :param suffix: A string to add to the end of each text.
+        :param normalize_embeddings: Whether to normalize the embeddings to unit length.
+        :param onnx_execution_provider: The execution provider to use for ONNX models. Defaults to
+            "CPUExecutionProvider".
+        :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model.
+ In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token` initialization + parameters. + """ + check_valid_model(model, HFModelType.EMBEDDING, token) + self.model = model + + self.token = token + token = token.resolve_value() if token else None + + self.prefix = prefix + self.suffix = suffix + self.normalize_embeddings = normalize_embeddings + self.onnx_execution_provider = onnx_execution_provider + + model_kwargs = model_kwargs or {} + + # Check if the model_kwargs contain the parameters, otherwise, populate them with values from init parameters + model_kwargs.setdefault("model_id", model) + model_kwargs.setdefault("provider", onnx_execution_provider) + model_kwargs.setdefault("use_auth_token", token) + + self.model_kwargs = model_kwargs + self.embedding_model = None + self.tokenizer = None + + def warm_up(self): + """ + Convert the model to ONNX. + + The model is cached if the "TensorrtExecutionProvider" is used, since it takes a while to build the TensorRT + engine. + """ + if self.embedding_model is None: + self.tokenizer = AutoTokenizer.from_pretrained(self.model) + + if self.onnx_execution_provider == "TensorrtExecutionProvider": + # Cache engine for TensorRT + provider_options = { + "trt_engine_cache_enable": True, + "trt_engine_cache_path": f"tmp/trt_cache_{self.model}", + } + self.embedding_model = ORTModelForFeatureExtraction.from_pretrained( + **self.model_kwargs, use_cache=False, provider_options=provider_options + ) + else: + # export=True converts the model to ONNX on the fly + self.embedding_model = ORTModelForFeatureExtraction.from_pretrained(**self.model_kwargs, export=True) + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + serialization_dict = default_to_dict( + self, + model=self.model, + prefix=self.prefix, + suffix=self.suffix, + normalize_embeddings=self.normalize_embeddings, + onnx_execution_provider=self.onnx_execution_provider, + model_kwargs=self.model_kwargs, + token=self.token.to_dict() if self.token else None, + ) + + model_kwargs = serialization_dict["init_parameters"]["model_kwargs"] + model_kwargs.pop("token", None) + + serialize_hf_model_kwargs(model_kwargs) + return serialization_dict + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "OptimumTextEmbedder": + """ + Deserialize this component from a dictionary. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["token"]) + deserialize_hf_model_kwargs(data["init_parameters"]["model_kwargs"]) + return default_from_dict(cls, data) + + def mean_pooling(self, model_output: torch.tensor, attention_mask: torch.tensor) -> torch.tensor: + """ + Perform Mean Pooling on the output of the Embedding model. + + :param model_output: The output of the embedding model. + :param attention_mask: The attention mask of the tokenized text. + :return: The embeddings of the text after mean pooling. + """ + # First element of model_output contains all token embeddings + token_embeddings = model_output[0] + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) + return sum_embeddings / sum_mask + + @component.output_types(embedding=List[float]) + def run(self, text: str): + """Embed a string. + + :param text: The text to embed. + :return: The embeddings of the text. 
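+
+        Illustrative example: `run(text="hello")` returns `{"embedding": [...]}`, where the list length equals
+        the model's embedding dimension (384 for the default "BAAI/bge-small-en-v1.5").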
+ """ + if not isinstance(text, str): + msg = ( + "OptimumTextEmbedder expects a string as an input. " + "In case you want to embed a list of Documents, please use the OptimumDocumentEmbedder." + ) + raise TypeError(msg) + + if not (self.embedding_model and self.tokenizer): + msg = "The embedding model has not been loaded. Please call warm_up() before running." + raise RuntimeError(msg) + + text_to_embed = self.prefix + text + self.suffix + + # Determine device for tokenizer output + device = ( + "cuda" + if self.onnx_execution_provider + in ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"] + else "cpu" + ) + + encoded_input = self.tokenizer([text_to_embed], padding=True, truncation=True, return_tensors="pt").to(device) + + # Compute token embeddings + with torch.no_grad(): + model_output = self.embedding_model(**encoded_input) + + # Perform mean pooling + sentence_embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"].cpu()) + + # Normalize Embeddings + if self.normalize_embeddings: + sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1) + + embedding = sentence_embeddings.tolist()[0] + + return {"embedding": embedding} diff --git a/integrations/optimum/tests/__init__.py b/integrations/optimum/tests/__init__.py new file mode 100644 index 000000000..6b5e14dc1 --- /dev/null +++ b/integrations/optimum/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/optimum/tests/test_optimum_document_embedder.py b/integrations/optimum/tests/test_optimum_document_embedder.py new file mode 100644 index 000000000..b0d26d392 --- /dev/null +++ b/integrations/optimum/tests/test_optimum_document_embedder.py @@ -0,0 +1,233 @@ +from unittest.mock import MagicMock, patch + +import pytest +from haystack.dataclasses import Document +from haystack.utils.auth import Secret +from haystack_integrations.components.embedders import OptimumDocumentEmbedder +from huggingface_hub.utils import RepositoryNotFoundError + + +@pytest.fixture +def mock_check_valid_model(): + with patch( + "haystack_integrations.components.embedders.optimum_document_embedder.check_valid_model", + MagicMock(return_value=None), + ) as mock: + yield mock + + +class TestOptimumDocumentEmbedder: + def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG002 + monkeypatch.setenv("HF_API_TOKEN", "fake-api-token") + embedder = OptimumDocumentEmbedder() + + assert embedder.model == "BAAI/bge-small-en-v1.5" + assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) + assert embedder.prefix == "" + assert embedder.suffix == "" + assert embedder.normalize_embeddings is True + assert embedder.onnx_execution_provider == "CPUExecutionProvider" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.meta_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + assert embedder.model_kwargs == { + "model_id": "BAAI/bge-small-en-v1.5", + "provider": "CPUExecutionProvider", + "use_auth_token": "fake-api-token", + } + + def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + token=Secret.from_token("fake-api-token"), + prefix="prefix", + suffix="suffix", + batch_size=64, + progress_bar=False, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + normalize_embeddings=False, + 
onnx_execution_provider="CUDAExecutionProvider", + model_kwargs={"trust_remote_code": True}, + ) + + assert embedder.model == "sentence-transformers/all-mpnet-base-v2" + assert embedder.token == Secret.from_token("fake-api-token") + assert embedder.prefix == "prefix" + assert embedder.suffix == "suffix" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.meta_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + assert embedder.normalize_embeddings is False + assert embedder.onnx_execution_provider == "CUDAExecutionProvider" + assert embedder.model_kwargs == { + "trust_remote_code": True, + "model_id": "sentence-transformers/all-mpnet-base-v2", + "provider": "CUDAExecutionProvider", + "use_auth_token": "fake-api-token", + } + + def test_initialize_with_invalid_model(self, mock_check_valid_model): + mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") + with pytest.raises(RepositoryNotFoundError): + OptimumDocumentEmbedder(model="invalid_model_id") + + def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 + component = OptimumDocumentEmbedder() + data = component.to_dict() + + assert data == { + "type": "haystack_integrations.components.embedders.optimum_document_embedder.OptimumDocumentEmbedder", + "init_parameters": { + "model": "BAAI/bge-small-en-v1.5", + "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, + "prefix": "", + "suffix": "", + "batch_size": 32, + "progress_bar": True, + "meta_fields_to_embed": [], + "embedding_separator": "\n", + "normalize_embeddings": True, + "onnx_execution_provider": "CPUExecutionProvider", + "model_kwargs": { + "model_id": "BAAI/bge-small-en-v1.5", + "provider": "CPUExecutionProvider", + "use_auth_token": None, + }, + }, + } + + def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # noqa: ARG002 + component = OptimumDocumentEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + token=Secret.from_env_var("ENV_VAR", strict=False), + prefix="prefix", + suffix="suffix", + batch_size=64, + progress_bar=False, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + normalize_embeddings=False, + onnx_execution_provider="CUDAExecutionProvider", + model_kwargs={"trust_remote_code": True}, + ) + data = component.to_dict() + + assert data == { + "type": "haystack_integrations.components.embedders.optimum_document_embedder.OptimumDocumentEmbedder", + "init_parameters": { + "model": "sentence-transformers/all-mpnet-base-v2", + "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "meta_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + "normalize_embeddings": False, + "onnx_execution_provider": "CUDAExecutionProvider", + "model_kwargs": { + "trust_remote_code": True, + "model_id": "sentence-transformers/all-mpnet-base-v2", + "provider": "CUDAExecutionProvider", + "use_auth_token": None, + }, + }, + } + + def test_prepare_texts_to_embed_w_metadata(self, mock_check_valid_model): # noqa: ARG002 + documents = [ + Document(content=f"document number {i}: content", meta={"meta_field": f"meta_value {i}"}) for i in range(5) + ] + + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + meta_fields_to_embed=["meta_field"], + embedding_separator=" | ", + ) + + prepared_texts = embedder._prepare_texts_to_embed(documents) + + assert prepared_texts 
== [ + "meta_value 0 | document number 0: content", + "meta_value 1 | document number 1: content", + "meta_value 2 | document number 2: content", + "meta_value 3 | document number 3: content", + "meta_value 4 | document number 4: content", + ] + + def test_prepare_texts_to_embed_w_suffix(self, mock_check_valid_model): # noqa: ARG002 + documents = [Document(content=f"document number {i}") for i in range(5)] + + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + prefix="my_prefix ", + suffix=" my_suffix", + ) + + prepared_texts = embedder._prepare_texts_to_embed(documents) + + assert prepared_texts == [ + "my_prefix document number 0 my_suffix", + "my_prefix document number 1 my_suffix", + "my_prefix document number 2 my_suffix", + "my_prefix document number 3 my_suffix", + "my_prefix document number 4 my_suffix", + ] + + def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 + embedder = OptimumDocumentEmbedder( + model="BAAI/bge-small-en-v1.5", + ) + embedder.warm_up() + # wrong formats + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="OptimumDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=string_input) + + with pytest.raises(TypeError, match="OptimumDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=list_integers_input) + + def test_run_on_empty_list(self, mock_check_valid_model): # noqa: ARG002 + embedder = OptimumDocumentEmbedder( + model="BAAI/bge-small-en-v1.5", + ) + embedder.warm_up() + empty_list_input = [] + result = embedder.run(documents=empty_list_input) + + assert result["documents"] is not None + assert not result["documents"] # empty list + + @pytest.mark.integration + def test_run(self): + docs = [ + Document(content="I love cheese", meta={"topic": "Cuisine"}), + Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), + ] + + embedder = OptimumDocumentEmbedder( + model="BAAI/bge-small-en-v1.5", + prefix="prefix ", + suffix=" suffix", + meta_fields_to_embed=["topic"], + embedding_separator=" | ", + batch_size=1, + ) + embedder.warm_up() + + result = embedder.run(documents=docs) + + documents_with_embeddings = result["documents"] + + assert isinstance(documents_with_embeddings, list) + assert len(documents_with_embeddings) == len(docs) + for doc in documents_with_embeddings: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert len(doc.embedding) == 384 + assert all(isinstance(x, float) for x in doc.embedding) diff --git a/integrations/optimum/tests/test_optimum_text_embedder.py b/integrations/optimum/tests/test_optimum_text_embedder.py new file mode 100644 index 000000000..79aaff6b8 --- /dev/null +++ b/integrations/optimum/tests/test_optimum_text_embedder.py @@ -0,0 +1,139 @@ +from unittest.mock import MagicMock, patch + +import pytest +from haystack.utils.auth import Secret +from haystack_integrations.components.embedders import OptimumTextEmbedder +from huggingface_hub.utils import RepositoryNotFoundError + + +@pytest.fixture +def mock_check_valid_model(): + with patch( + "haystack_integrations.components.embedders.optimum_text_embedder.check_valid_model", + MagicMock(return_value=None), + ) as mock: + yield mock + + +class TestOptimumTextEmbedder: + def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG002 + monkeypatch.setenv("HF_API_TOKEN", "fake-api-token") + embedder = OptimumTextEmbedder() + + assert 
embedder.model == "BAAI/bge-small-en-v1.5" + assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) + assert embedder.prefix == "" + assert embedder.suffix == "" + assert embedder.normalize_embeddings is True + assert embedder.onnx_execution_provider == "CPUExecutionProvider" + assert embedder.model_kwargs == { + "model_id": "BAAI/bge-small-en-v1.5", + "provider": "CPUExecutionProvider", + "use_auth_token": "fake-api-token", + } + + def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 + embedder = OptimumTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + token=Secret.from_token("fake-api-token"), + prefix="prefix", + suffix="suffix", + normalize_embeddings=False, + onnx_execution_provider="CUDAExecutionProvider", + model_kwargs={"trust_remote_code": True}, + ) + + assert embedder.model == "sentence-transformers/all-mpnet-base-v2" + assert embedder.token == Secret.from_token("fake-api-token") + assert embedder.prefix == "prefix" + assert embedder.suffix == "suffix" + assert embedder.normalize_embeddings is False + assert embedder.onnx_execution_provider == "CUDAExecutionProvider" + assert embedder.model_kwargs == { + "trust_remote_code": True, + "model_id": "sentence-transformers/all-mpnet-base-v2", + "provider": "CUDAExecutionProvider", + "use_auth_token": "fake-api-token", + } + + def test_initialize_with_invalid_model(self, mock_check_valid_model): + mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") + with pytest.raises(RepositoryNotFoundError): + OptimumTextEmbedder(model="invalid_model_id") + + def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 + component = OptimumTextEmbedder() + data = component.to_dict() + + assert data == { + "type": "haystack_integrations.components.embedders.optimum_text_embedder.OptimumTextEmbedder", + "init_parameters": { + "model": "BAAI/bge-small-en-v1.5", + "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, + "prefix": "", + "suffix": "", + "normalize_embeddings": True, + "onnx_execution_provider": "CPUExecutionProvider", + "model_kwargs": { + "model_id": "BAAI/bge-small-en-v1.5", + "provider": "CPUExecutionProvider", + "use_auth_token": None, + }, + }, + } + + def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # noqa: ARG002 + component = OptimumTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + token=Secret.from_env_var("ENV_VAR", strict=False), + prefix="prefix", + suffix="suffix", + normalize_embeddings=False, + onnx_execution_provider="CUDAExecutionProvider", + model_kwargs={"trust_remote_code": True}, + ) + data = component.to_dict() + + assert data == { + "type": "haystack_integrations.components.embedders.optimum_text_embedder.OptimumTextEmbedder", + "init_parameters": { + "model": "sentence-transformers/all-mpnet-base-v2", + "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, + "prefix": "prefix", + "suffix": "suffix", + "normalize_embeddings": False, + "onnx_execution_provider": "CUDAExecutionProvider", + "model_kwargs": { + "trust_remote_code": True, + "model_id": "sentence-transformers/all-mpnet-base-v2", + "provider": "CUDAExecutionProvider", + "use_auth_token": None, + }, + }, + } + + def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 + embedder = OptimumTextEmbedder( + model="BAAI/bge-small-en-v1.5", + token=Secret.from_token("fake-api-token"), + ) + embedder.warm_up() + + list_integers_input = [1, 2, 3] + + with 
pytest.raises(TypeError, match="OptimumTextEmbedder expects a string as an input"): + embedder.run(text=list_integers_input) + + @pytest.mark.integration + def test_run(self): + embedder = OptimumTextEmbedder( + model="BAAI/bge-small-en-v1.5", + prefix="prefix ", + suffix=" suffix", + ) + embedder.warm_up() + + result = embedder.run(text="The food was delicious") + + assert len(result["embedding"]) == 384 + assert all(isinstance(x, float) for x in result["embedding"]) From 7112573dcec6eef0b6ba60353049028e0aadef3b Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 8 Feb 2024 20:25:33 +0530 Subject: [PATCH 02/10] Add CI workflow --- .github/labeler.yml | 5 ++++ .github/workflows/optimum.yml | 56 +++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 .github/workflows/optimum.yml diff --git a/.github/labeler.yml b/.github/labeler.yml index 4d060772c..fd142c9ba 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -69,6 +69,11 @@ integration:opensearch: - any-glob-to-any-file: "integrations/opensearch/**/*" - any-glob-to-any-file: ".github/workflows/opensearch.yml" +integration:optimum: + - changed-files: + - any-glob-to-any-file: "integrations/optimum/**/*" + - any-glob-to-any-file: ".github/workflows/optimum.yml" + integration:pgvector: - changed-files: - any-glob-to-any-file: "integrations/pgvector/**/*" diff --git a/.github/workflows/optimum.yml b/.github/workflows/optimum.yml new file mode 100644 index 000000000..01d589b07 --- /dev/null +++ b/.github/workflows/optimum.yml @@ -0,0 +1,56 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / optimum + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/optimum/**" + - ".github/workflows/optimum.yml" + +defaults: + run: + working-directory: integrations/optimum + +concurrency: + group: optimum-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ["3.9", "3.10"] + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + working-directory: . 
+ run: git config --system core.longpaths true + + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + + - name: Lint + if: matrix.python-version == '3.9' && runner.os == 'Linux' + run: hatch run lint:all + + - name: Run tests + run: hatch run cov From aa1d04d40525017c3c7b7c58901a9db22c473841 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 8 Feb 2024 20:56:44 +0530 Subject: [PATCH 03/10] Update dependencies --- integrations/optimum/pyproject.toml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/integrations/optimum/pyproject.toml b/integrations/optimum/pyproject.toml index 186a75e22..616dfc0e5 100644 --- a/integrations/optimum/pyproject.toml +++ b/integrations/optimum/pyproject.toml @@ -28,14 +28,7 @@ dependencies = [ "haystack-ai", "transformers[sentencepiece]==4.36.2", "sentence-transformers>=2.2.0", -] - -[project.optional-dependencies] -cpu = [ - "optimum[onnxruntime]==1.15.0", -] -gpu = [ - "optimum[onnxruntime-gpu]==1.15.0", + "optimum[onnxruntime]==1.15.0" ] [project.urls] From fe7fa36dce768f29d81448b1934db44af15c6b3e Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Mon, 19 Feb 2024 19:52:05 +0530 Subject: [PATCH 04/10] Add embedding backend --- integrations/optimum/pyproject.toml | 4 +- .../components/embedders/backends/__init__.py | 0 .../embedders/backends/optimum_backend.py | 116 ++++++++++++++++++ .../embedders/optimum_document_embedder.py | 106 +++------------- .../embedders/optimum_text_embedder.py | 79 +++--------- .../tests/test_optimum_document_embedder.py | 32 ++--- .../tests/test_optimum_text_embedder.py | 26 ++-- 7 files changed, 183 insertions(+), 180 deletions(-) create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/backends/__init__.py create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py diff --git a/integrations/optimum/pyproject.toml b/integrations/optimum/pyproject.toml index 616dfc0e5..91c5eb13c 100644 --- a/integrations/optimum/pyproject.toml +++ b/integrations/optimum/pyproject.toml @@ -26,9 +26,9 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "transformers[sentencepiece]==4.36.2", + "transformers[sentencepiece]", "sentence-transformers>=2.2.0", - "optimum[onnxruntime]==1.15.0" + "optimum[onnxruntime]" ] [project.urls] diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/backends/__init__.py b/integrations/optimum/src/haystack_integrations/components/embedders/backends/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py b/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py new file mode 100644 index 000000000..515d92938 --- /dev/null +++ b/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py @@ -0,0 +1,116 @@ +from typing import Any, ClassVar, Dict, List, Optional, Union + +import numpy as np +import torch +from haystack.utils.auth import Secret +from optimum.onnxruntime import ORTModelForFeatureExtraction +from tqdm import tqdm +from transformers import AutoTokenizer + + +class _OptimumEmbeddingBackendFactory: + """ + Factory class to create 
instances of Optimum embedding backends.
+    """
+
+    _instances: ClassVar[Dict[str, "_OptimumEmbeddingBackend"]] = {}
+
+    @staticmethod
+    def get_embedding_backend(
+        model: str, token: Optional[Secret] = None, model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        embedding_backend_id = f"{model}{token}"
+
+        if embedding_backend_id in _OptimumEmbeddingBackendFactory._instances:
+            return _OptimumEmbeddingBackendFactory._instances[embedding_backend_id]
+        embedding_backend = _OptimumEmbeddingBackend(model=model, token=token, model_kwargs=model_kwargs)
+        _OptimumEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
+        return embedding_backend
+
+
+class _OptimumEmbeddingBackend:
+    """
+    Class to manage Optimum embeddings.
+    """
+
+    def __init__(self, model: str, token: Optional[Secret] = None, model_kwargs: Optional[Dict[str, Any]] = None):
+        # export=True converts the model to ONNX on the fly
+        self.model = ORTModelForFeatureExtraction.from_pretrained(**(model_kwargs or {}), export=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(model, token=token.resolve_value() if token else None)
+
+    def mean_pooling(self, model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        """
+        Perform mean pooling on the output of the embedding model.
+
+        :param model_output: The output of the embedding model.
+        :param attention_mask: The attention mask of the tokenized text.
+        :return: The embeddings of the text after mean pooling.
+        """
+        # First element of model_output contains all token embeddings
+        token_embeddings = model_output[0]
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        return sum_embeddings / sum_mask
+
+    def embed(
+        self,
+        texts_to_embed: Union[str, List[str]],
+        normalize_embeddings: bool,
+        progress_bar: bool = False,
+        batch_size: int = 1,
+    ) -> Union[List[List[float]], List[float]]:
+        """
+        Embed a text or list of texts using the Optimum model.
+
+        :param texts_to_embed: The text or list of texts to embed.
+        :param normalize_embeddings: Whether to normalize the embeddings to unit length.
+        :param progress_bar: Whether to show a progress bar or not, defaults to False.
+        :param batch_size: Batch size to use, defaults to 1.
+        :return: A single embedding if the input is a single string. A list of embeddings if the input is a list of
+            strings.
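+
+        Illustrative example:
+            backend.embed("hello", normalize_embeddings=True)             # -> List[float]
+            backend.embed(["hello", "world"], normalize_embeddings=True)  # -> List[List[float]]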
+ """ + if isinstance(texts_to_embed, str): + texts = [texts_to_embed] + else: + texts = texts_to_embed + + # Determine device for tokenizer output + device = self.model.device + + # Sorting by length + length_sorted_idx = np.argsort([-len(sen) for sen in texts]) + sentences_sorted = [texts[idx] for idx in length_sorted_idx] + + all_embeddings = [] + for i in tqdm( + range(0, len(sentences_sorted), batch_size), disable=not progress_bar, desc="Calculating embeddings" + ): + batch = sentences_sorted[i : i + batch_size] + encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device) + + # Only pass required inputs otherwise onnxruntime can raise an error + inputs_to_remove = set(encoded_input.keys()).difference(self.model.inputs_names) + for key in inputs_to_remove: + encoded_input.pop(key) + + # Compute token embeddings + model_output = self.model(**encoded_input) + + # Perform mean pooling + sentence_embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"].to(device)) + + all_embeddings.extend(sentence_embeddings.tolist()) + + # Reorder embeddings according to original order + all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + + # Normalize all embeddings + if normalize_embeddings: + all_embeddings = torch.nn.functional.normalize(torch.tensor(all_embeddings), p=2, dim=1).tolist() + + if isinstance(texts_to_embed, str): + # Return the embedding if only one text was passed + all_embeddings = all_embeddings[0] + + return all_embeddings diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py index 1aaa65bb4..9e3ae2e61 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py @@ -1,13 +1,11 @@ from typing import Any, Dict, List, Optional -import numpy as np -import torch from haystack import Document, component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace from haystack.utils.hf import HFModelType, check_valid_model, deserialize_hf_model_kwargs, serialize_hf_model_kwargs -from optimum.onnxruntime import ORTModelForFeatureExtraction -from tqdm import tqdm -from transformers import AutoTokenizer +from haystack_integrations.components.embedders.backends.optimum_backend import ( + _OptimumEmbeddingBackendFactory, +) class OptimumDocumentEmbedder: @@ -24,7 +22,7 @@ class OptimumDocumentEmbedder: doc = Document(content="I love pizza!") - document_embedder = OptimumDocumentEmbedder(model="BAAI/bge-small-en-v1.5") + document_embedder = OptimumDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2") document_embedder.warm_up() result = document_embedder.run([doc]) @@ -48,7 +46,7 @@ class OptimumDocumentEmbedder: def __init__( self, - model: str = "BAAI/bge-small-en-v1.5", + model: str = "sentence-transformers/all-mpnet-base-v2", token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008 prefix: str = "", suffix: str = "", @@ -63,13 +61,14 @@ def __init__( """ Create a OptimumDocumentEmbedder component. - :param model: A string representing the model id on HF Hub. Default is "BAAI/bge-small-en-v1.5". + :param model: A string representing the model id on HF Hub. Defaults to + "sentence-transformers/all-mpnet-base-v2". 
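The `embed` body above sorts texts longest-first before batching, so each batch pads to a similar length, and then restores the original order with a second `argsort` (the argsort of a permutation is its inverse). A small standalone check of that reordering trick:

```python
import numpy as np

texts = ["aaaa", "b", "ccc", "dd"]

# Sort longest-first, exactly as in embed() above
length_sorted_idx = np.argsort([-len(t) for t in texts])
sentences_sorted = [texts[i] for i in length_sorted_idx]

# Pretend each "embedding" is just the text itself, produced in sorted order
results_sorted = list(sentences_sorted)

# argsort of the permutation is its inverse, restoring the original order
restored = [results_sorted[i] for i in np.argsort(length_sorted_idx)]
assert restored == texts
```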
:param token: The HuggingFace token to use as HTTP bearer authorization. :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. :param normalize_embeddings: Whether to normalize the embeddings to unit length. :param onnx_execution_provider: The execution provider to use for ONNX models. Defaults to - "CPUExecutionProvider". + "CPUExecutionProvider". See https://onnxruntime.ai/docs/execution-providers/ for possible providers. :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model. In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token` initialization parameters. @@ -107,26 +106,12 @@ def __init__( def warm_up(self): """ - Convert the model to ONNX. - - The model is cached if the "TensorrtExecutionProvider" is used, since it takes a while to to build the TensorRT - engine. + Load the embedding backend. """ - if self.embedding_model is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model) - - if self.onnx_execution_provider == "TensorrtExecutionProvider": - # Cache engine for TensorRT - provider_options = { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": f"tmp/trt_cache_{self.model}", - } - self.embedding_model = ORTModelForFeatureExtraction.from_pretrained( - **self.model_kwargs, use_cache=False, provider_options=provider_options - ) - else: - # export=True converts the model to ONNX on the fly - self.embedding_model = ORTModelForFeatureExtraction.from_pretrained(**self.model_kwargs, export=True) + if not hasattr(self, "embedding_backend"): + self.embedding_backend = _OptimumEmbeddingBackendFactory.get_embedding_backend( + model=self.model, token=self.token, model_kwargs=self.model_kwargs + ) def to_dict(self) -> Dict[str, Any]: """ @@ -162,21 +147,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "OptimumDocumentEmbedder": deserialize_hf_model_kwargs(data["init_parameters"]["model_kwargs"]) return default_from_dict(cls, data) - def mean_pooling(self, model_output: torch.tensor, attention_mask: torch.tensor) -> torch.tensor: - """ - Perform Mean Pooling on the output of the Embedding model. - - :param model_output: The output of the embedding model. - :param attention_mask: The attention mask of the tokenized text. - :return: The embeddings of the text after mean pooling. - """ - # First element of model_output contains all token embeddings - token_embeddings = model_output[0] - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) - return sum_embeddings / sum_mask - def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: """ Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. @@ -194,47 +164,6 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: texts_to_embed.append(text_to_embed) return texts_to_embed - def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> List[List[float]]: - """ - Embed a list of texts in batches. 
- """ - # Determine device for tokenizer output - device = ( - "cuda" - if self.onnx_execution_provider - in ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"] - else "cpu" - ) - - # Sorting by length - length_sorted_idx = np.argsort([-len(sen) for sen in texts_to_embed]) - sentences_sorted = [texts_to_embed[idx] for idx in length_sorted_idx] - - all_embeddings = [] - for i in tqdm( - range(0, len(sentences_sorted), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" - ): - batch = sentences_sorted[i : i + batch_size] - encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device) # type: ignore - - # Compute token embeddings - with torch.no_grad(): - model_output = self.embedding_model(**encoded_input) # type: ignore - - # Perform mean pooling - sentence_embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"].cpu()) - - all_embeddings.extend(sentence_embeddings.tolist()) - - # Reorder embeddings according to original order - all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] - - # Normalize all embeddings - if self.normalize_embeddings: - all_embeddings = torch.nn.functional.normalize(torch.tensor(all_embeddings), p=2, dim=1).tolist() - - return all_embeddings - @component.output_types(documents=List[Document]) def run(self, documents: List[Document]): """ @@ -251,7 +180,7 @@ def run(self, documents: List[Document]): ) raise TypeError(msg) - if not (self.embedding_model and self.tokenizer): + if not hasattr(self, "embedding_backend"): msg = "The embedding model has not been loaded. Please call warm_up() before running." raise RuntimeError(msg) @@ -261,7 +190,12 @@ def run(self, documents: List[Document]): texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size) + embeddings = self.embedding_backend.embed( + texts_to_embed=texts_to_embed, + normalize_embeddings=self.normalize_embeddings, + progress_bar=self.progress_bar, + batch_size=self.batch_size, + ) for doc, emb in zip(documents, embeddings): doc.embedding = emb diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py index 8b6a3b48e..1531c85ef 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py @@ -1,11 +1,11 @@ from typing import Any, Dict, List, Optional -import torch from haystack import component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace from haystack.utils.hf import HFModelType, check_valid_model, deserialize_hf_model_kwargs, serialize_hf_model_kwargs -from optimum.onnxruntime import ORTModelForFeatureExtraction -from transformers import AutoTokenizer +from haystack_integrations.components.embedders.backends.optimum_backend import ( + _OptimumEmbeddingBackendFactory, +) class OptimumTextEmbedder: @@ -19,7 +19,7 @@ class OptimumTextEmbedder: text_to_embed = "I love pizza!" 
- text_embedder = OptimumTextEmbedder(model="BAAI/bge-small-en-v1.5") + text_embedder = OptimumTextEmbedder(model="sentence-transformers/all-mpnet-base-v2") text_embedder.warm_up() print(text_embedder.run(text_to_embed)) @@ -42,7 +42,7 @@ class OptimumTextEmbedder: def __init__( self, - model: str = "BAAI/bge-small-en-v1.5", + model: str = "sentence-transformers/all-mpnet-base-v2", token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008 prefix: str = "", suffix: str = "", @@ -53,13 +53,14 @@ def __init__( """ Create a OptimumTextEmbedder component. - :param model: A string representing the model id on HF Hub. Default is "BAAI/bge-small-en-v1.5". + :param model: A string representing the model id on HF Hub. Defaults to + "sentence-transformers/all-mpnet-base-v2". :param token: The HuggingFace token to use as HTTP bearer authorization. :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. :param normalize_embeddings: Whether to normalize the embeddings to unit length. :param onnx_execution_provider: The execution provider to use for ONNX models. Defaults to - "CPUExecutionProvider". + "CPUExecutionProvider". See https://onnxruntime.ai/docs/execution-providers/ for possible providers. :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model. In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token` initialization parameters. @@ -88,26 +89,12 @@ def __init__( def warm_up(self): """ - Convert the model to ONNX. - - The model is cached if the "TensorrtExecutionProvider" is used, since it takes a while to build the TensorRT - engine. + Load the embedding backend. """ - if self.embedding_model is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model) - - if self.onnx_execution_provider == "TensorrtExecutionProvider": - # Cache engine for TensorRT - provider_options = { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": f"tmp/trt_cache_{self.model}", - } - self.embedding_model = ORTModelForFeatureExtraction.from_pretrained( - **self.model_kwargs, use_cache=False, provider_options=provider_options - ) - else: - # export=True converts the model to ONNX on the fly - self.embedding_model = ORTModelForFeatureExtraction.from_pretrained(**self.model_kwargs, export=True) + if not hasattr(self, "embedding_backend"): + self.embedding_backend = _OptimumEmbeddingBackendFactory.get_embedding_backend( + model=self.model, token=self.token, model_kwargs=self.model_kwargs + ) def to_dict(self) -> Dict[str, Any]: """ @@ -139,21 +126,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "OptimumTextEmbedder": deserialize_hf_model_kwargs(data["init_parameters"]["model_kwargs"]) return default_from_dict(cls, data) - def mean_pooling(self, model_output: torch.tensor, attention_mask: torch.tensor) -> torch.tensor: - """ - Perform Mean Pooling on the output of the Embedding model. - - :param model_output: The output of the embedding model. - :param attention_mask: The attention mask of the tokenized text. - :return: The embeddings of the text after mean pooling. 
- """ - # First element of model_output contains all token embeddings - token_embeddings = model_output[0] - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) - return sum_embeddings / sum_mask - @component.output_types(embedding=List[float]) def run(self, text: str): """Embed a string. @@ -168,33 +140,14 @@ def run(self, text: str): ) raise TypeError(msg) - if not (self.embedding_model and self.tokenizer): + if not hasattr(self, "embedding_backend"): msg = "The embedding model has not been loaded. Please call warm_up() before running." raise RuntimeError(msg) text_to_embed = self.prefix + text + self.suffix - # Determine device for tokenizer output - device = ( - "cuda" - if self.onnx_execution_provider - in ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"] - else "cpu" + embedding = self.embedding_backend.embed( + texts_to_embed=text_to_embed, normalize_embeddings=self.normalize_embeddings ) - encoded_input = self.tokenizer([text_to_embed], padding=True, truncation=True, return_tensors="pt").to(device) - - # Compute token embeddings - with torch.no_grad(): - model_output = self.embedding_model(**encoded_input) - - # Perform mean pooling - sentence_embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"].cpu()) - - # Normalize Embeddings - if self.normalize_embeddings: - sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1) - - embedding = sentence_embeddings.tolist()[0] - return {"embedding": embedding} diff --git a/integrations/optimum/tests/test_optimum_document_embedder.py b/integrations/optimum/tests/test_optimum_document_embedder.py index b0d26d392..b7d285196 100644 --- a/integrations/optimum/tests/test_optimum_document_embedder.py +++ b/integrations/optimum/tests/test_optimum_document_embedder.py @@ -21,7 +21,7 @@ def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG00 monkeypatch.setenv("HF_API_TOKEN", "fake-api-token") embedder = OptimumDocumentEmbedder() - assert embedder.model == "BAAI/bge-small-en-v1.5" + assert embedder.model == "sentence-transformers/all-mpnet-base-v2" assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.prefix == "" assert embedder.suffix == "" @@ -32,14 +32,14 @@ def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG00 assert embedder.meta_fields_to_embed == [] assert embedder.embedding_separator == "\n" assert embedder.model_kwargs == { - "model_id": "BAAI/bge-small-en-v1.5", + "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", "use_auth_token": "fake-api-token", } def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", token=Secret.from_token("fake-api-token"), prefix="prefix", suffix="suffix", @@ -52,7 +52,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 model_kwargs={"trust_remote_code": True}, ) - assert embedder.model == "sentence-transformers/all-mpnet-base-v2" + assert embedder.model == "sentence-transformers/all-minilm-l6-v2" assert embedder.token == Secret.from_token("fake-api-token") assert embedder.prefix == "prefix" assert embedder.suffix == "suffix" @@ -64,7 +64,7 @@ def 
test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 assert embedder.onnx_execution_provider == "CUDAExecutionProvider" assert embedder.model_kwargs == { "trust_remote_code": True, - "model_id": "sentence-transformers/all-mpnet-base-v2", + "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", "use_auth_token": "fake-api-token", } @@ -81,7 +81,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 assert data == { "type": "haystack_integrations.components.embedders.optimum_document_embedder.OptimumDocumentEmbedder", "init_parameters": { - "model": "BAAI/bge-small-en-v1.5", + "model": "sentence-transformers/all-mpnet-base-v2", "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "prefix": "", "suffix": "", @@ -92,7 +92,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 "normalize_embeddings": True, "onnx_execution_provider": "CPUExecutionProvider", "model_kwargs": { - "model_id": "BAAI/bge-small-en-v1.5", + "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", "use_auth_token": None, }, @@ -101,7 +101,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # noqa: ARG002 component = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", token=Secret.from_env_var("ENV_VAR", strict=False), prefix="prefix", suffix="suffix", @@ -118,7 +118,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n assert data == { "type": "haystack_integrations.components.embedders.optimum_document_embedder.OptimumDocumentEmbedder", "init_parameters": { - "model": "sentence-transformers/all-mpnet-base-v2", + "model": "sentence-transformers/all-minilm-l6-v2", "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, "prefix": "prefix", "suffix": "suffix", @@ -130,7 +130,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n "onnx_execution_provider": "CUDAExecutionProvider", "model_kwargs": { "trust_remote_code": True, - "model_id": "sentence-transformers/all-mpnet-base-v2", + "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", "use_auth_token": None, }, @@ -143,7 +143,7 @@ def test_prepare_texts_to_embed_w_metadata(self, mock_check_valid_model): # noq ] embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", meta_fields_to_embed=["meta_field"], embedding_separator=" | ", ) @@ -162,7 +162,7 @@ def test_prepare_texts_to_embed_w_suffix(self, mock_check_valid_model): # noqa: documents = [Document(content=f"document number {i}") for i in range(5)] embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", prefix="my_prefix ", suffix=" my_suffix", ) @@ -179,7 +179,7 @@ def test_prepare_texts_to_embed_w_suffix(self, mock_check_valid_model): # noqa: def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumDocumentEmbedder( - model="BAAI/bge-small-en-v1.5", + model="sentence-transformers/all-mpnet-base-v2", ) embedder.warm_up() # wrong formats @@ -194,7 +194,7 @@ def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 def test_run_on_empty_list(self, mock_check_valid_model): # noqa: ARG002 embedder 
= OptimumDocumentEmbedder( - model="BAAI/bge-small-en-v1.5", + model="sentence-transformers/all-mpnet-base-v2", ) embedder.warm_up() empty_list_input = [] @@ -211,7 +211,7 @@ def test_run(self): ] embedder = OptimumDocumentEmbedder( - model="BAAI/bge-small-en-v1.5", + model="sentence-transformers/all-mpnet-base-v2", prefix="prefix ", suffix=" suffix", meta_fields_to_embed=["topic"], @@ -229,5 +229,5 @@ def test_run(self): for doc in documents_with_embeddings: assert isinstance(doc, Document) assert isinstance(doc.embedding, list) - assert len(doc.embedding) == 384 + assert len(doc.embedding) == 768 assert all(isinstance(x, float) for x in doc.embedding) diff --git a/integrations/optimum/tests/test_optimum_text_embedder.py b/integrations/optimum/tests/test_optimum_text_embedder.py index 79aaff6b8..2f5b60b45 100644 --- a/integrations/optimum/tests/test_optimum_text_embedder.py +++ b/integrations/optimum/tests/test_optimum_text_embedder.py @@ -20,21 +20,21 @@ def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG00 monkeypatch.setenv("HF_API_TOKEN", "fake-api-token") embedder = OptimumTextEmbedder() - assert embedder.model == "BAAI/bge-small-en-v1.5" + assert embedder.model == "sentence-transformers/all-mpnet-base-v2" assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.prefix == "" assert embedder.suffix == "" assert embedder.normalize_embeddings is True assert embedder.onnx_execution_provider == "CPUExecutionProvider" assert embedder.model_kwargs == { - "model_id": "BAAI/bge-small-en-v1.5", + "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", "use_auth_token": "fake-api-token", } def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumTextEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", token=Secret.from_token("fake-api-token"), prefix="prefix", suffix="suffix", @@ -43,7 +43,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 model_kwargs={"trust_remote_code": True}, ) - assert embedder.model == "sentence-transformers/all-mpnet-base-v2" + assert embedder.model == "sentence-transformers/all-minilm-l6-v2" assert embedder.token == Secret.from_token("fake-api-token") assert embedder.prefix == "prefix" assert embedder.suffix == "suffix" @@ -51,7 +51,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 assert embedder.onnx_execution_provider == "CUDAExecutionProvider" assert embedder.model_kwargs == { "trust_remote_code": True, - "model_id": "sentence-transformers/all-mpnet-base-v2", + "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", "use_auth_token": "fake-api-token", } @@ -68,14 +68,14 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 assert data == { "type": "haystack_integrations.components.embedders.optimum_text_embedder.OptimumTextEmbedder", "init_parameters": { - "model": "BAAI/bge-small-en-v1.5", + "model": "sentence-transformers/all-mpnet-base-v2", "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "prefix": "", "suffix": "", "normalize_embeddings": True, "onnx_execution_provider": "CPUExecutionProvider", "model_kwargs": { - "model_id": "BAAI/bge-small-en-v1.5", + "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", "use_auth_token": None, }, @@ -84,7 +84,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: 
ARG002 def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # noqa: ARG002 component = OptimumTextEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", token=Secret.from_env_var("ENV_VAR", strict=False), prefix="prefix", suffix="suffix", @@ -97,7 +97,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n assert data == { "type": "haystack_integrations.components.embedders.optimum_text_embedder.OptimumTextEmbedder", "init_parameters": { - "model": "sentence-transformers/all-mpnet-base-v2", + "model": "sentence-transformers/all-minilm-l6-v2", "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, "prefix": "prefix", "suffix": "suffix", @@ -105,7 +105,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n "onnx_execution_provider": "CUDAExecutionProvider", "model_kwargs": { "trust_remote_code": True, - "model_id": "sentence-transformers/all-mpnet-base-v2", + "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", "use_auth_token": None, }, @@ -114,7 +114,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumTextEmbedder( - model="BAAI/bge-small-en-v1.5", + model="sentence-transformers/all-mpnet-base-v2", token=Secret.from_token("fake-api-token"), ) embedder.warm_up() @@ -127,7 +127,7 @@ def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 @pytest.mark.integration def test_run(self): embedder = OptimumTextEmbedder( - model="BAAI/bge-small-en-v1.5", + model="sentence-transformers/all-mpnet-base-v2", prefix="prefix ", suffix=" suffix", ) @@ -135,5 +135,5 @@ def test_run(self): result = embedder.run(text="The food was delicious") - assert len(result["embedding"]) == 384 + assert len(result["embedding"]) == 768 assert all(isinstance(x, float) for x in result["embedding"]) From 9eb400fce14ea1ad4b0dcfbcd61aaecc18aed9f2 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Tue, 20 Feb 2024 12:54:09 +0530 Subject: [PATCH 05/10] Add pooling sub module; Update pyproject.toml with dependency info --- integrations/optimum/pyproject.toml | 5 + .../embedders/backends/optimum_backend.py | 27 +- .../embedders/optimum_document_embedder.py | 52 +++- .../embedders/optimum_text_embedder.py | 56 ++++- .../components/embedders/pooling.py | 231 ++++++++++++++++++ .../tests/test_optimum_document_embedder.py | 79 +++++- .../tests/test_optimum_text_embedder.py | 70 +++++- 7 files changed, 479 insertions(+), 41 deletions(-) create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/pooling.py diff --git a/integrations/optimum/pyproject.toml b/integrations/optimum/pyproject.toml index 91c5eb13c..6e1cae85c 100644 --- a/integrations/optimum/pyproject.toml +++ b/integrations/optimum/pyproject.toml @@ -27,6 +27,11 @@ classifiers = [ dependencies = [ "haystack-ai", "transformers[sentencepiece]", + # The main export function of Optimum into ONNX has hidden dependencies. + # It depends on either "sentence-transformers", "diffusers" or "timm", based + # on which model is loaded from HF Hub. 
+    # Ref: https://github.com/huggingface/optimum/blob/8651c0ca1cccf095458bc80329dec9df4601edb4/optimum/exporters/onnx/__main__.py#L164
+    # "sentence-transformers" has been added, since most embedding models use it
     "sentence-transformers>=2.2.0",
     "optimum[onnxruntime]"
 ]
 
 [project.urls]
diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py b/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py
index 515d92938..eca285a88 100644
--- a/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py
+++ b/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py
@@ -3,6 +3,7 @@
 import numpy as np
 import torch
 from haystack.utils.auth import Secret
+from haystack_integrations.components.embedders.pooling import Pooling, PoolingMode
 from optimum.onnxruntime import ORTModelForFeatureExtraction
 from tqdm import tqdm
 from transformers import AutoTokenizer
@@ -38,25 +39,11 @@ def __init__(self, model: str, token: Optional[Secret] = None, model_kwargs: Opt
         self.model = ORTModelForFeatureExtraction.from_pretrained(**model_kwargs, export=True)
         self.tokenizer = AutoTokenizer.from_pretrained(model, token=token)
 
-    def mean_pooling(self, model_output: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
-        """
-        Perform Mean Pooling on the output of the Embedding model.
-
-        :param model_output: The output of the embedding model.
-        :param attention_mask: The attention mask of the tokenized text.
-        :return: The embeddings of the text after mean pooling.
-        """
-        # First element of model_output contains all token embeddings
-        token_embeddings = model_output[0]
-        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
-        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-        return sum_embeddings / sum_mask
-
     def embed(
         self,
         texts_to_embed: Union[str, List[str]],
         normalize_embeddings: bool,
+        pooling_mode: PoolingMode = PoolingMode.MEAN,
         progress_bar: bool = False,
         batch_size: int = 1,
     ) -> Union[List[List[float]], List[float]]:
@@ -65,6 +52,7 @@ def embed(
 
         :param texts_to_embed: The text or list of texts to embed.
         :param normalize_embeddings: Whether to normalize the embeddings to unit length.
+        :param pooling_mode: The pooling mode to use. Defaults to PoolingMode.MEAN.
         :param progress_bar: Whether to show a progress bar or not, defaults to False.
         :param batch_size: Batch size to use, defaults to 1.
         :return: A single embedding if the input is a single string.
A list of embeddings if the input is a list of @@ -97,8 +85,13 @@ def embed( # Compute token embeddings model_output = self.model(**encoded_input) - # Perform mean pooling - sentence_embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"].to(device)) + # Pool Embeddings + pooling = Pooling( + pooling_mode=pooling_mode, + attention_mask=encoded_input["attention_mask"].to(device), + model_output=model_output, + ) + sentence_embeddings = pooling.pool_embeddings() all_embeddings.extend(sentence_embeddings.tolist()) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py index 9e3ae2e61..4dee89633 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from haystack import Document, component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace @@ -6,8 +6,10 @@ from haystack_integrations.components.embedders.backends.optimum_backend import ( _OptimumEmbeddingBackendFactory, ) +from haystack_integrations.components.embedders.pooling import HFPoolingMode, PoolingMode +@component class OptimumDocumentEmbedder: """ A component for computing Document embeddings using models loaded with the HuggingFace Optimum library. @@ -52,6 +54,7 @@ def __init__( suffix: str = "", normalize_embeddings: bool = True, onnx_execution_provider: str = "CPUExecutionProvider", + pooling_mode: Optional[Union[str, PoolingMode]] = None, model_kwargs: Optional[Dict[str, Any]] = None, batch_size: int = 32, progress_bar: bool = True, @@ -68,7 +71,40 @@ def __init__( :param suffix: A string to add to the end of each text. :param normalize_embeddings: Whether to normalize the embeddings to unit length. :param onnx_execution_provider: The execution provider to use for ONNX models. Defaults to - "CPUExecutionProvider". See https://onnxruntime.ai/docs/execution-providers/ for possible providers. + "CPUExecutionProvider". See https://onnxruntime.ai/docs/execution-providers/ for possible providers. + + Note: Using the TensorRT execution provider + TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model + optimization and nodes fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime + provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We + recommend setting these two provider options using the model_kwargs parameter, when using the TensorRT + execution provider. The usage is as follows: + ```python + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + onnx_execution_provider="TensorrtExecutionProvider", + model_kwargs={ + "provider_options": { + "trt_engine_cache_enable": True, + "trt_engine_cache_path": "tmp/trt_cache", + } + }, + ) + ``` + :param pooling_mode: The pooling mode to use. Defaults to None. When None, pooling mode will be inferred from + the model config. If not found, "mean" pooling will be used. + The supported pooling modes are: + - "cls": Perform CLS Pooling on the output of the embedding model. Uses the first token (CLS token) as text + representations. 
+            - "max": Perform Max Pooling on the output of the embedding model. Uses max in each dimension over all
+              the tokens.
+            - "mean": Perform Mean Pooling on the output of the embedding model.
+            - "mean_sqrt_len": Perform mean-pooling on the output of the embedding model, but divide by sqrt
+              (input_length).
+            - "weighted_mean": Perform Weighted (position) Mean Pooling on the output of the embedding model. See
+              https://arxiv.org/abs/2202.08904.
+            - "last_token": Perform Last Token Pooling on the output of the embedding model. See
+              https://arxiv.org/abs/2202.08904 & https://arxiv.org/abs/2201.10005.
         :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model.
             In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token`
             initialization parameters.
@@ -84,6 +120,15 @@ def __init__(
         self.token = token
         token = token.resolve_value() if token else None
 
+        if isinstance(pooling_mode, str):
+            pooling_mode = PoolingMode.from_str(pooling_mode)
+        # Infer the pooling mode from the model config if it was not provided
+        if pooling_mode is None:
+            pooling_mode = HFPoolingMode.get_pooling_mode(model, token)
+        # Fall back to "mean" if the mode was neither specified nor found in the model config
+        self.pooling_mode = pooling_mode or PoolingMode.MEAN
+
         self.prefix = prefix
         self.suffix = suffix
         self.normalize_embeddings = normalize_embeddings
@@ -124,6 +169,7 @@ def to_dict(self) -> Dict[str, Any]:
             suffix=self.suffix,
             normalize_embeddings=self.normalize_embeddings,
             onnx_execution_provider=self.onnx_execution_provider,
+            pooling_mode=self.pooling_mode.value,
             batch_size=self.batch_size,
             progress_bar=self.progress_bar,
             meta_fields_to_embed=self.meta_fields_to_embed,
@@ -143,6 +189,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "OptimumDocumentEmbedder":
         """
         Deserialize this component from a dictionary.
         """
+        data["init_parameters"]["pooling_mode"] = PoolingMode.from_str(data["init_parameters"]["pooling_mode"])
         deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
         deserialize_hf_model_kwargs(data["init_parameters"]["model_kwargs"])
         return default_from_dict(cls, data)
@@ -193,6 +240,7 @@ def run(self, documents: List[Document]):
         embeddings = self.embedding_backend.embed(
             texts_to_embed=texts_to_embed,
             normalize_embeddings=self.normalize_embeddings,
+            pooling_mode=self.pooling_mode,
             progress_bar=self.progress_bar,
             batch_size=self.batch_size,
         )
diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py
index 1531c85ef..828c86111 100644
--- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py
+++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from haystack import component, default_from_dict, default_to_dict
 from haystack.utils import Secret, deserialize_secrets_inplace
@@ -6,8 +6,10 @@
 from haystack_integrations.components.embedders.backends.optimum_backend import (
     _OptimumEmbeddingBackendFactory,
 )
+from haystack_integrations.components.embedders.pooling import HFPoolingMode, PoolingMode
 
 
+@component
 class OptimumTextEmbedder:
     """
     A component to embed text using models loaded with the HuggingFace Optimum library.
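The `pooling_mode` handling above resolves the mode with a three-step precedence: an explicit argument wins, otherwise the mode is inferred from the model's pooling config, otherwise mean pooling is used. A self-contained sketch of that precedence; `lookup_model_config` is a hypothetical stand-in for `HFPoolingMode.get_pooling_mode`, which returns `None` when no pooling config is found:

```python
from enum import Enum
from typing import Optional, Union


class PoolingMode(Enum):
    CLS = "cls"
    MEAN = "mean"
    MAX = "max"


def lookup_model_config(model: str) -> Optional[PoolingMode]:
    # Hypothetical stand-in: the real helper reads 1_Pooling/config.json
    # from the Hub and may return None.
    return None


def resolve_pooling_mode(model: str, pooling_mode: Optional[Union[str, PoolingMode]]) -> PoolingMode:
    if isinstance(pooling_mode, str):
        return PoolingMode(pooling_mode)           # 1. an explicit string wins
    if pooling_mode is None:
        pooling_mode = lookup_model_config(model)  # 2. else infer from the model config
    return pooling_mode or PoolingMode.MEAN        # 3. else fall back to mean


assert resolve_pooling_mode("some/model", "max") is PoolingMode.MAX
assert resolve_pooling_mode("some/model", None) is PoolingMode.MEAN
```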
@@ -48,6 +50,7 @@ def __init__( suffix: str = "", normalize_embeddings: bool = True, onnx_execution_provider: str = "CPUExecutionProvider", + pooling_mode: Optional[Union[str, PoolingMode]] = None, model_kwargs: Optional[Dict[str, Any]] = None, ): """ @@ -60,7 +63,40 @@ def __init__( :param suffix: A string to add to the end of each text. :param normalize_embeddings: Whether to normalize the embeddings to unit length. :param onnx_execution_provider: The execution provider to use for ONNX models. Defaults to - "CPUExecutionProvider". See https://onnxruntime.ai/docs/execution-providers/ for possible providers. + "CPUExecutionProvider". See https://onnxruntime.ai/docs/execution-providers/ for possible providers. + + Note: Using the TensorRT execution provider + TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model + optimization and nodes fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime + provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We + recommend setting these two provider options using the model_kwargs parameter, when using the TensorRT + execution provider. The usage is as follows: + ```python + embedder = OptimumTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + onnx_execution_provider="TensorrtExecutionProvider", + model_kwargs={ + "provider_options": { + "trt_engine_cache_enable": True, + "trt_engine_cache_path": "tmp/trt_cache", + } + }, + ) + ``` + :param pooling_mode: The pooling mode to use. Defaults to None. When None, pooling mode will be inferred from + the model config. If not found, "mean" pooling will be used. + The supported pooling modes are: + - "cls": Perform CLS Pooling on the output of the embedding model. Uses the first token (CLS token) as text + representations. + - "max": Perform Max Pooling on the output of the embedding model. Uses max in each dimension over all + the tokens. + - "mean": Perform Mean Pooling on the output of the embedding model. + - "mean_sqrt_len": Perform mean-pooling on the output of the embedding model, but divide by sqrt + (input_length). + - "weighted_mean": Perform Weighted (position) Mean Pooling on the output of the embedding model. See + https://arxiv.org/abs/2202.08904. + - "last_token": Perform Last Token Pooling on the output of the embedding model. See + https://arxiv.org/abs/2202.08904 & https://arxiv.org/abs/2201.10005. :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model. In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token` initialization parameters. 
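As a usage note for the provider parameters documented above, switching execution providers is just a constructor argument. A hedged sketch (it assumes a GPU-enabled ONNX Runtime build, e.g. `optimum[onnxruntime-gpu]`, is installed and that the model id is reachable on the Hub):

```python
from haystack_integrations.components.embedders import OptimumTextEmbedder

embedder = OptimumTextEmbedder(
    model="sentence-transformers/all-mpnet-base-v2",
    onnx_execution_provider="CUDAExecutionProvider",  # provider availability is checked when the model loads
    pooling_mode="mean",  # naming the mode explicitly skips the Hub config lookup
)
embedder.warm_up()  # exports the model to ONNX and loads it
print(embedder.run(text="I love pizza!")["embedding"][:4])
```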
@@ -71,6 +107,15 @@ def __init__(
         self.token = token
         token = token.resolve_value() if token else None
 
+        if isinstance(pooling_mode, str):
+            pooling_mode = PoolingMode.from_str(pooling_mode)
+        # Infer the pooling mode from the model config if it was not provided
+        if pooling_mode is None:
+            pooling_mode = HFPoolingMode.get_pooling_mode(model, token)
+        # Fall back to "mean" if the mode was neither specified nor found in the model config
+        self.pooling_mode = pooling_mode or PoolingMode.MEAN
+
         self.prefix = prefix
         self.suffix = suffix
         self.normalize_embeddings = normalize_embeddings
@@ -107,6 +152,7 @@ def to_dict(self) -> Dict[str, Any]:
             suffix=self.suffix,
             normalize_embeddings=self.normalize_embeddings,
             onnx_execution_provider=self.onnx_execution_provider,
+            pooling_mode=self.pooling_mode.value,
             model_kwargs=self.model_kwargs,
             token=self.token.to_dict() if self.token else None,
         )
@@ -122,13 +168,15 @@ def from_dict(cls, data: Dict[str, Any]) -> "OptimumTextEmbedder":
         """
         Deserialize this component from a dictionary.
         """
+        data["init_parameters"]["pooling_mode"] = PoolingMode.from_str(data["init_parameters"]["pooling_mode"])
         deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
         deserialize_hf_model_kwargs(data["init_parameters"]["model_kwargs"])
         return default_from_dict(cls, data)
 
     @component.output_types(embedding=List[float])
     def run(self, text: str):
-        """Embed a string.
+        """
+        Embed a string.
 
         :param text: The text to embed.
         :return: The embeddings of the text.
@@ -147,7 +195,7 @@ def run(self, text: str):
         text_to_embed = self.prefix + text + self.suffix
 
         embedding = self.embedding_backend.embed(
-            texts_to_embed=text_to_embed, normalize_embeddings=self.normalize_embeddings
+            texts_to_embed=text_to_embed, normalize_embeddings=self.normalize_embeddings, pooling_mode=self.pooling_mode
         )
 
         return {"embedding": embedding}
diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py b/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py
new file mode 100644
index 000000000..aa011b8ce
--- /dev/null
+++ b/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py
@@ -0,0 +1,231 @@
+import json
+from enum import Enum
+from typing import Optional
+
+import torch
+from haystack.utils import Secret
+from huggingface_hub import hf_hub_download
+
+
+class PoolingMode(Enum):
+    """
+    Pooling modes supported by the Optimum Embedders.
+    """
+
+    CLS = "cls"
+    MEAN = "mean"
+    MAX = "max"
+    MEAN_SQRT_LEN = "mean_sqrt_len"
+    WEIGHTED_MEAN = "weighted_mean"
+    LAST_TOKEN = "last_token"
+
+    def __str__(self):
+        return self.value
+
+    @classmethod
+    def from_str(cls, string: str) -> "PoolingMode":
+        """
+        Create a pooling mode from a string.
+
+        :param string:
+            The string to convert.
+        :returns:
+            The pooling mode.
+        """
+        enum_map = {e.value: e for e in PoolingMode}
+        pooling_mode = enum_map.get(string)
+        if pooling_mode is None:
+            msg = f"Unknown Pooling mode '{string}'. Supported modes are: {list(enum_map.keys())}"
+            raise ValueError(msg)
+        return pooling_mode
+
+
+class HFPoolingMode:
+    """
+    Gets the pooling mode of the Sentence Transformer model from the Hugging Face Hub.
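`PoolingMode.from_str` above is a guarded enum lookup: unknown names raise a `ValueError` listing the supported modes instead of a bare `KeyError`. For example (assuming the `pooling` module from this patch is importable):

```python
from haystack_integrations.components.embedders.pooling import PoolingMode

assert PoolingMode.from_str("mean") is PoolingMode.MEAN
assert str(PoolingMode.LAST_TOKEN) == "last_token"

try:
    PoolingMode.from_str("median")
except ValueError as err:
    print(err)  # Unknown Pooling mode 'median'. Supported modes are: ['cls', 'mean', ...]
```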
+    """
+
+    @staticmethod
+    def get_pooling_mode(model: str, token: Optional[Secret] = None) -> Optional[PoolingMode]:
+        try:
+            pooling_config_path = hf_hub_download(repo_id=model, token=token, filename="1_Pooling/config.json")
+
+            with open(pooling_config_path) as f:
+                pooling_config = json.load(f)
+
+            # Filter only those keys that start with "pooling_mode" and are True
+            true_pooling_modes = [
+                key for key, value in pooling_config.items() if key.startswith("pooling_mode") and value
+            ]
+
+            pooling_modes_map = {
+                "pooling_mode_cls_token": PoolingMode.CLS,
+                "pooling_mode_mean_tokens": PoolingMode.MEAN,
+                "pooling_mode_max_tokens": PoolingMode.MAX,
+                "pooling_mode_mean_sqrt_len_tokens": PoolingMode.MEAN_SQRT_LEN,
+                "pooling_mode_weightedmean_tokens": PoolingMode.WEIGHTED_MEAN,
+                "pooling_mode_last_token": PoolingMode.LAST_TOKEN,
+            }
+
+            # If exactly one True pooling mode is found, return it
+            if len(true_pooling_modes) == 1:
+                pooling_mode_from_config = true_pooling_modes[0]
+                pooling_mode = pooling_modes_map.get(pooling_mode_from_config)
+            # If no True pooling modes or more than one True pooling mode is found, return None
+            else:
+                pooling_mode = None
+            return pooling_mode
+        except Exception:
+            return None
+
+
+class Pooling:
+    """
+    Class to manage pooling of the embeddings.
+
+    :param pooling_mode: The pooling mode to use.
+    :param attention_mask: The attention mask of the tokenized text.
+    :param model_output: The output of the embedding model.
+    """
+
+    def __init__(self, pooling_mode: PoolingMode, attention_mask: torch.tensor, model_output: torch.tensor):
+        self.pooling_mode = pooling_mode
+        self.attention_mask = attention_mask
+        self.model_output = model_output
+
+    def _cls_pooling(self, token_embeddings: torch.tensor) -> torch.tensor:
+        """
+        Perform CLS Pooling on the output of the embedding model. Uses the first token (CLS token) as text
+        representations.
+
+        :param token_embeddings: The token embeddings from the embedding model.
+        :return: The embeddings of the text after CLS pooling.
+        """
+        embeddings = token_embeddings[:, 0]
+        return embeddings
+
+    def _max_pooling(self, token_embeddings: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
+        """
+        Perform Max Pooling on the output of the embedding model. Uses max in each dimension over all tokens.
+
+        :param token_embeddings: The token embeddings from the embedding model.
+        :param attention_mask: The attention mask of the tokenized text.
+        :return: The embeddings of the text after max pooling.
+        """
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        # Set padding tokens to large negative value
+        token_embeddings[input_mask_expanded == 0] = -1e9
+        embeddings = torch.max(token_embeddings, 1)[0]
+        return embeddings
+
+    def _mean_pooling(self, token_embeddings: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
+        """
+        Perform Mean Pooling on the output of the embedding model.
+
+        :param token_embeddings: The token embeddings from the embedding model.
+        :param attention_mask: The attention mask of the tokenized text.
+        :return: The embeddings of the text after mean pooling.
+        """
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        return sum_embeddings / sum_mask
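A toy check of the three simplest modes on a single padded "sentence", using the same masked arithmetic as `_mean_pooling` above (values chosen by hand so the expected outputs are obvious):

```python
import torch

# One "sentence" of 3 tokens, hidden size 2; the last token is padding
token_embeddings = torch.tensor([[[1.0, 2.0], [3.0, 4.0], [9.0, 9.0]]])
attention_mask = torch.tensor([[1, 1, 0]])

mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

cls = token_embeddings[:, 0]                                      # first token only
mean = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
masked = token_embeddings.clone()
masked[mask == 0] = -1e9                                          # padding can't win the max
mx = masked.max(1)[0]

print(cls)   # tensor([[1., 2.]])
print(mean)  # tensor([[2., 3.]])
print(mx)    # tensor([[3., 4.]])
```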
+
+    def _mean_sqrt_len_pooling(self, token_embeddings: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
+        """
+        Perform mean-pooling on the output of the embedding model, but divide by sqrt(input_length).
+
+        :param token_embeddings: The token embeddings from the embedding model.
+        :param attention_mask: The attention mask of the tokenized text.
+        :return: The embeddings of the text after mean-sqrt-length pooling.
+        """
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        return sum_embeddings / torch.sqrt(sum_mask)
+
+    def _weighted_mean_pooling(self, token_embeddings: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
+        """
+        Perform Weighted (position) Mean Pooling on the output of the embedding model.
+        See https://arxiv.org/abs/2202.08904.
+
+        :param token_embeddings: The token embeddings from the embedding model.
+        :param attention_mask: The attention mask of the tokenized text.
+        :return: The embeddings of the text after weighted mean pooling.
+        """
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        # token_embeddings shape: bs, seq, hidden_dim
+        weights = (
+            torch.arange(start=1, end=token_embeddings.shape[1] + 1)
+            .unsqueeze(0)
+            .unsqueeze(-1)
+            .expand(token_embeddings.size())
+            .float()
+            .to(token_embeddings.device)
+        )
+        input_mask_expanded = input_mask_expanded * weights
+        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        return sum_embeddings / sum_mask
+
+    def _last_token_pooling(self, token_embeddings: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
+        """
+        Perform Last Token Pooling on the output of the embedding model. See https://arxiv.org/abs/2202.08904 &
+        https://arxiv.org/abs/2201.10005.
+
+        :param token_embeddings: The token embeddings from the embedding model.
+        :param attention_mask: The attention mask of the tokenized text.
+        :return: The embeddings of the text after last token pooling.
+        """
+        bs, seq_len, hidden_dim = token_embeddings.shape
+        # attention_mask shape: (bs, seq_len)
+        # Get shape [bs] indices of the last token (i.e.
the last token for each batch item) + # argmin gives us the index of the first 0 in the attention mask; We get the last 1 index by subtracting 1 + # Any sequence where min == 1, we use the entire sequence length since argmin = 0 + values, indices = torch.min(attention_mask, 1, keepdim=False) + gather_indices = torch.where(values == 0, indices, seq_len) - 1 # Shape [bs] + # There are empty sequences, where the index would become -1 which will crash + gather_indices = torch.clamp(gather_indices, min=0) + + # Turn indices from shape [bs] --> [bs, 1, hidden_dim] + gather_indices = gather_indices.unsqueeze(-1).repeat(1, hidden_dim) + gather_indices = gather_indices.unsqueeze(1) + + # Gather along the 1st dim (seq_len) (bs, seq_len, hidden_dim -> bs, hidden_dim) + # Actually no need for the attention mask as we gather the last token where attn_mask = 1 + # but as we set some indices (which shouldn't be attended to) to 0 with clamp, we + # use the attention mask to ignore them again + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + embeddings = torch.gather(token_embeddings * input_mask_expanded, 1, gather_indices).squeeze(dim=1) + return embeddings + + def pool_embeddings(self) -> torch.tensor: + """ + Perform pooling on the output of the embedding model. + + :param pooling_mode: The pooling mode to use. + :param attention_mask: The attention mask of the tokenized text. + :param model_output: The output of the embedding model. + :return: The embeddings of the text after pooling. + """ + pooling_func_map = { + PoolingMode.CLS: self._cls_pooling, + PoolingMode.MEAN: self._mean_pooling, + PoolingMode.MAX: self._max_pooling, + PoolingMode.MEAN_SQRT_LEN: self._mean_sqrt_len_pooling, + PoolingMode.WEIGHTED_MEAN: self._weighted_mean_pooling, + PoolingMode.LAST_TOKEN: self._last_token_pooling, + } + self._pooling_function = pooling_func_map[self.pooling_mode] + + # First element of model_output contains all token embeddings + token_embeddings = self.model_output[0] + + embeddings = ( + self._pooling_function(token_embeddings, self.attention_mask) # type: ignore + if self._pooling_function != self._cls_pooling + else self._pooling_function(token_embeddings) # type: ignore + ) + + return embeddings diff --git a/integrations/optimum/tests/test_optimum_document_embedder.py b/integrations/optimum/tests/test_optimum_document_embedder.py index b7d285196..612a9ab0e 100644 --- a/integrations/optimum/tests/test_optimum_document_embedder.py +++ b/integrations/optimum/tests/test_optimum_document_embedder.py @@ -4,6 +4,7 @@ from haystack.dataclasses import Document from haystack.utils.auth import Secret from haystack_integrations.components.embedders import OptimumDocumentEmbedder +from haystack_integrations.components.embedders.pooling import PoolingMode from huggingface_hub.utils import RepositoryNotFoundError @@ -16,8 +17,17 @@ def mock_check_valid_model(): yield mock +@pytest.fixture +def mock_get_pooling_mode(): + with patch( + "haystack_integrations.components.embedders.optimum_text_embedder.HFPoolingMode.get_pooling_mode", + MagicMock(return_value=PoolingMode.MEAN), + ) as mock: + yield mock + + class TestOptimumDocumentEmbedder: - def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG002 + def test_init_default(self, monkeypatch, mock_check_valid_model, mock_get_pooling_mode): # noqa: ARG002 monkeypatch.setenv("HF_API_TOKEN", "fake-api-token") embedder = OptimumDocumentEmbedder() @@ -27,6 +37,7 @@ def test_init_default(self, monkeypatch, 
mock_check_valid_model): # noqa: ARG00 assert embedder.suffix == "" assert embedder.normalize_embeddings is True assert embedder.onnx_execution_provider == "CPUExecutionProvider" + assert embedder.pooling_mode == PoolingMode.MEAN assert embedder.batch_size == 32 assert embedder.progress_bar is True assert embedder.meta_fields_to_embed == [] @@ -48,6 +59,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 meta_fields_to_embed=["test_field"], embedding_separator=" | ", normalize_embeddings=False, + pooling_mode="max", onnx_execution_provider="CUDAExecutionProvider", model_kwargs={"trust_remote_code": True}, ) @@ -62,6 +74,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 assert embedder.embedding_separator == " | " assert embedder.normalize_embeddings is False assert embedder.onnx_execution_provider == "CUDAExecutionProvider" + assert embedder.pooling_mode == PoolingMode.MAX assert embedder.model_kwargs == { "trust_remote_code": True, "model_id": "sentence-transformers/all-minilm-l6-v2", @@ -69,12 +82,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 "use_auth_token": "fake-api-token", } - def test_initialize_with_invalid_model(self, mock_check_valid_model): - mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") - with pytest.raises(RepositoryNotFoundError): - OptimumDocumentEmbedder(model="invalid_model_id") - - def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 + def test_to_dict(self, mock_check_valid_model, mock_get_pooling_mode): # noqa: ARG002 component = OptimumDocumentEmbedder() data = component.to_dict() @@ -91,6 +99,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 "embedding_separator": "\n", "normalize_embeddings": True, "onnx_execution_provider": "CPUExecutionProvider", + "pooling_mode": "mean", "model_kwargs": { "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", @@ -99,7 +108,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 }, } - def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # noqa: ARG002 + def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model, mock_get_pooling_mode): # noqa: ARG002 component = OptimumDocumentEmbedder( model="sentence-transformers/all-minilm-l6-v2", token=Secret.from_env_var("ENV_VAR", strict=False), @@ -111,6 +120,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n embedding_separator=" | ", normalize_embeddings=False, onnx_execution_provider="CUDAExecutionProvider", + pooling_mode="max", model_kwargs={"trust_remote_code": True}, ) data = component.to_dict() @@ -128,6 +138,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n "embedding_separator": " | ", "normalize_embeddings": False, "onnx_execution_provider": "CUDAExecutionProvider", + "pooling_mode": "max", "model_kwargs": { "trust_remote_code": True, "model_id": "sentence-transformers/all-minilm-l6-v2", @@ -137,6 +148,52 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n }, } + def test_initialize_with_invalid_model(self, mock_check_valid_model): + mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") + with pytest.raises(RepositoryNotFoundError): + OptimumDocumentEmbedder(model="invalid_model_id") + + def test_initialize_with_invalid_pooling_mode(self, mock_check_valid_model): # noqa: ARG002 + mock_get_pooling_mode.side_effect = 
ValueError("Invalid pooling mode") + with pytest.raises(ValueError): + OptimumDocumentEmbedder( + model="sentence-transformers/all-mpnet-base-v2", pooling_mode="Invalid_pooling_mode" + ) + + def test_infer_pooling_mode_from_str(self): + """ + Test that the pooling mode is correctly inferred from a string. + The pooling mode is "mean" as per the model config. + """ + for pooling_mode in PoolingMode: + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/all-minilm-l6-v2", + pooling_mode=pooling_mode.value, + ) + + assert embedder.model == "sentence-transformers/all-minilm-l6-v2" + assert embedder.pooling_mode == pooling_mode + + @pytest.mark.integration + def test_default_pooling_mode_when_config_not_found(self, mock_check_valid_model): # noqa: ARG002 + embedder = OptimumDocumentEmbedder( + model="embedding_model_finetuned", + pooling_mode=None, + ) + + assert embedder.model == "embedding_model_finetuned" + assert embedder.pooling_mode == PoolingMode.MEAN + + @pytest.mark.integration + def test_infer_pooling_mode_from_hf(self): + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/all-minilm-l6-v2", + pooling_mode=None, + ) + + assert embedder.model == "sentence-transformers/all-minilm-l6-v2" + assert embedder.pooling_mode == PoolingMode.MEAN + def test_prepare_texts_to_embed_w_metadata(self, mock_check_valid_model): # noqa: ARG002 documents = [ Document(content=f"document number {i}: content", meta={"meta_field": f"meta_value {i}"}) for i in range(5) @@ -146,6 +203,7 @@ def test_prepare_texts_to_embed_w_metadata(self, mock_check_valid_model): # noq model="sentence-transformers/all-minilm-l6-v2", meta_fields_to_embed=["meta_field"], embedding_separator=" | ", + pooling_mode="mean", ) prepared_texts = embedder._prepare_texts_to_embed(documents) @@ -165,6 +223,7 @@ def test_prepare_texts_to_embed_w_suffix(self, mock_check_valid_model): # noqa: model="sentence-transformers/all-minilm-l6-v2", prefix="my_prefix ", suffix=" my_suffix", + pooling_mode="mean", ) prepared_texts = embedder._prepare_texts_to_embed(documents) @@ -178,9 +237,7 @@ def test_prepare_texts_to_embed_w_suffix(self, mock_check_valid_model): # noqa: ] def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 - embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", - ) + embedder = OptimumDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2", pooling_mode="mean") embedder.warm_up() # wrong formats string_input = "text" diff --git a/integrations/optimum/tests/test_optimum_text_embedder.py b/integrations/optimum/tests/test_optimum_text_embedder.py index 2f5b60b45..52124944c 100644 --- a/integrations/optimum/tests/test_optimum_text_embedder.py +++ b/integrations/optimum/tests/test_optimum_text_embedder.py @@ -3,6 +3,7 @@ import pytest from haystack.utils.auth import Secret from haystack_integrations.components.embedders import OptimumTextEmbedder +from haystack_integrations.components.embedders.pooling import PoolingMode from huggingface_hub.utils import RepositoryNotFoundError @@ -15,8 +16,17 @@ def mock_check_valid_model(): yield mock +@pytest.fixture +def mock_get_pooling_mode(): + with patch( + "haystack_integrations.components.embedders.optimum_text_embedder.HFPoolingMode.get_pooling_mode", + MagicMock(return_value=PoolingMode.MEAN), + ) as mock: + yield mock + + class TestOptimumTextEmbedder: - def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG002 + def test_init_default(self, monkeypatch, 
diff --git a/integrations/optimum/tests/test_optimum_text_embedder.py b/integrations/optimum/tests/test_optimum_text_embedder.py index 2f5b60b45..52124944c 100644 --- a/integrations/optimum/tests/test_optimum_text_embedder.py +++ b/integrations/optimum/tests/test_optimum_text_embedder.py @@ -3,6 +3,7 @@ import pytest from haystack.utils.auth import Secret from haystack_integrations.components.embedders import OptimumTextEmbedder +from haystack_integrations.components.embedders.pooling import PoolingMode from huggingface_hub.utils import RepositoryNotFoundError @@ -15,8 +16,17 @@ def mock_check_valid_model(): yield mock +@pytest.fixture +def mock_get_pooling_mode(): + with patch( + "haystack_integrations.components.embedders.optimum_text_embedder.HFPoolingMode.get_pooling_mode", + MagicMock(return_value=PoolingMode.MEAN), + ) as mock: + yield mock + + class TestOptimumTextEmbedder: - def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG002 + def test_init_default(self, monkeypatch, mock_check_valid_model, mock_get_pooling_mode): # noqa: ARG002 monkeypatch.setenv("HF_API_TOKEN", "fake-api-token") embedder = OptimumTextEmbedder() @@ -26,6 +36,7 @@ def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG00 assert embedder.suffix == "" assert embedder.normalize_embeddings is True assert embedder.onnx_execution_provider == "CPUExecutionProvider" + assert embedder.pooling_mode == PoolingMode.MEAN assert embedder.model_kwargs == { "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", @@ -39,6 +50,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 prefix="prefix", suffix="suffix", normalize_embeddings=False, + pooling_mode="max", onnx_execution_provider="CUDAExecutionProvider", model_kwargs={"trust_remote_code": True}, ) @@ -49,6 +61,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 assert embedder.suffix == "suffix" assert embedder.normalize_embeddings is False assert embedder.onnx_execution_provider == "CUDAExecutionProvider" + assert embedder.pooling_mode == PoolingMode.MAX assert embedder.model_kwargs == { "trust_remote_code": True, "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", "use_auth_token": "fake-api-token", } - def test_initialize_with_invalid_model(self, mock_check_valid_model): - mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") - with pytest.raises(RepositoryNotFoundError): - OptimumTextEmbedder(model="invalid_model_id") - - def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 + def test_to_dict(self, mock_check_valid_model, mock_get_pooling_mode): # noqa: ARG002 component = OptimumTextEmbedder() data = component.to_dict() @@ -74,6 +82,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 "suffix": "", "normalize_embeddings": True, "onnx_execution_provider": "CPUExecutionProvider", + "pooling_mode": "mean", "model_kwargs": { "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", @@ -90,6 +99,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n suffix="suffix", normalize_embeddings=False, onnx_execution_provider="CUDAExecutionProvider", + pooling_mode="max", model_kwargs={"trust_remote_code": True}, ) data = component.to_dict() @@ -103,6 +113,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n "suffix": "suffix", "normalize_embeddings": False, "onnx_execution_provider": "CUDAExecutionProvider", + "pooling_mode": "max", "model_kwargs": { "trust_remote_code": True, "model_id": "sentence-transformers/all-minilm-l6-v2", @@ -112,10 +123,55 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n }, } + def test_initialize_with_invalid_model(self, mock_check_valid_model): + mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") + with pytest.raises(RepositoryNotFoundError): + OptimumTextEmbedder(model="invalid_model_id", pooling_mode="max") + + def test_initialize_with_invalid_pooling_mode(self, mock_check_valid_model, mock_get_pooling_mode): # noqa: ARG002 + mock_get_pooling_mode.side_effect = ValueError("Invalid pooling mode") + with pytest.raises(ValueError): + OptimumTextEmbedder(model="sentence-transformers/all-mpnet-base-v2", pooling_mode="Invalid_pooling_mode") + + def test_infer_pooling_mode_from_str(self, mock_check_valid_model): # noqa: ARG002 + """ + Test that each supported pooling mode can be set from its string value. + """ + for pooling_mode in PoolingMode: + embedder = OptimumTextEmbedder( + model="sentence-transformers/all-minilm-l6-v2", + pooling_mode=pooling_mode.value, + ) + + assert embedder.model == "sentence-transformers/all-minilm-l6-v2" + assert embedder.pooling_mode == pooling_mode + + @pytest.mark.integration + def test_default_pooling_mode_when_config_not_found(self, mock_check_valid_model): # noqa: ARG002 + embedder = OptimumTextEmbedder( + model="embedding_model_finetuned", + pooling_mode=None, + ) + + assert embedder.model == "embedding_model_finetuned" + assert embedder.pooling_mode == PoolingMode.MEAN + + @pytest.mark.integration + def test_infer_pooling_mode_from_hf(self): + embedder = OptimumTextEmbedder( + model="sentence-transformers/all-minilm-l6-v2", + pooling_mode=None, + ) + + assert embedder.model == "sentence-transformers/all-minilm-l6-v2" + assert embedder.pooling_mode == PoolingMode.MEAN + def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumTextEmbedder( model="sentence-transformers/all-mpnet-base-v2", token=Secret.from_token("fake-api-token"), + pooling_mode="mean", ) embedder.warm_up()
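With pooling support in place on both components, the intended caller-facing flow looks roughly as follows. This is a sketch: the token value is a placeholder, and the `embedding` output key is assumed from the usual Haystack text-embedder convention rather than shown in the diff above.

```python
from haystack.utils import Secret
from haystack_integrations.components.embedders import OptimumTextEmbedder

# Pin the pooling mode explicitly; passing pooling_mode=None would instead
# infer it from the model's 1_Pooling/config.json on the Hub.
embedder = OptimumTextEmbedder(
    model="sentence-transformers/all-minilm-l6-v2",
    token=Secret.from_token("fake-api-token"),  # placeholder token
    pooling_mode="mean",
)
embedder.warm_up()  # exports the model to ONNX and loads the backend
result = embedder.run(text="What is the capital of France?")
embedding = result["embedding"]  # assumed: a List[float]
```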
From f0256b7450b54ed80d4dd24389594b105a8ae987 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Wed, 21 Feb 2024 14:17:07 +0530 Subject: [PATCH 06/10] Remove backend factory; Address review comments --- integrations/optimum/pyproject.toml | 3 +- .../components/embedders/backends/__init__.py | 0 .../{backends => }/optimum_backend.py | 62 +++---- .../embedders/optimum_document_embedder.py | 32 ++-- .../embedders/optimum_text_embedder.py | 32 ++-- .../components/embedders/pooling.py | 168 ++++-------------- .../optimum/tests/test_optimum_backend.py | 24 +++ .../tests/test_optimum_document_embedder.py | 12 +- .../tests/test_optimum_text_embedder.py | 12 +- 9 files changed, 128 insertions(+), 217 deletions(-) delete mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/backends/__init__.py rename integrations/optimum/src/haystack_integrations/components/embedders/{backends => }/optimum_backend.py (63%) create mode 100644 integrations/optimum/tests/test_optimum_backend.py diff --git a/integrations/optimum/pyproject.toml b/integrations/optimum/pyproject.toml index 6e1cae85c..2c654c6b4 100644 --- a/integrations/optimum/pyproject.toml +++ b/integrations/optimum/pyproject.toml @@ -185,7 +185,8 @@ module = [ "optimum.*", "torch.*", "transformers.*", - "huggingface_hub.*" + "huggingface_hub.*", + "sentence_transformers.*" ] ignore_missing_imports = true diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/backends/__init__.py b/integrations/optimum/src/haystack_integrations/components/embedders/backends/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_backend.py similarity index 63% rename from integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py rename to integrations/optimum/src/haystack_integrations/components/embedders/optimum_backend.py index eca285a88..55d0e6f20 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py +++ 
b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_backend.py @@ -1,4 +1,4 @@ -from typing import Any, ClassVar, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import numpy as np import torch @@ -9,32 +9,19 @@ from transformers import AutoTokenizer -class _OptimumEmbeddingBackendFactory: - """ - Factory class to create instances of Sentence Transformers embedding backends. - """ - - _instances: ClassVar[Dict[str, "_OptimumEmbeddingBackend"]] = {} - - @staticmethod - def get_embedding_backend( - model: str, token: Optional[Secret] = None, model_kwargs: Optional[Dict[str, Any]] = None - ): - embedding_backend_id = f"{model}{token}" - - if embedding_backend_id in _OptimumEmbeddingBackendFactory._instances: - return _OptimumEmbeddingBackendFactory._instances[embedding_backend_id] - embedding_backend = _OptimumEmbeddingBackend(model=model, token=token, model_kwargs=model_kwargs) - _OptimumEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend - return embedding_backend - - -class _OptimumEmbeddingBackend: +class OptimumEmbeddingBackend: """ Class to manage Optimum embeddings. """ - def __init__(self, model: str, token: Optional[Secret] = None, model_kwargs: Optional[Dict[str, Any]] = None): + def __init__(self, model: str, model_kwargs: Dict[str, Any], token: Optional[Secret] = None): + """ + Create an instance of OptimumEmbeddingBackend. + + :param model: A string representing the model id on HF Hub. + :param model_kwargs: Keyword arguments to pass to the model. + :param token: The HuggingFace token to use as HTTP bearer authorization. + """ # export=True converts the model to ONNX on the fly self.model = ORTModelForFeatureExtraction.from_pretrained(**model_kwargs, export=True) self.tokenizer = AutoTokenizer.from_pretrained(model, token=token) @@ -50,11 +37,11 @@ def embed( """ Embed text or list of texts using the Optimum model. - :param texts_to_embed: T + :param texts_to_embed: The text or list of texts to embed. :param normalize_embeddings: Whether to normalize the embeddings to unit length. - :param pooling_mode: The pooling mode to use. Defaults to PoolingMode.MEAN. - :param progress_bar: Whether to show a progress bar or not, defaults to False. - :param batch_size: Batch size to use, defaults to 1. + :param pooling_mode: The pooling mode to use. + :param progress_bar: Whether to show a progress bar or not. + :param batch_size: Batch size to use. :return: A single embedding if the input is a single string. A list of embeddings if the input is a list of strings. 
""" @@ -81,8 +68,6 @@ def embed( inputs_to_remove = set(encoded_input.keys()).difference(self.model.inputs_names) for key in inputs_to_remove: encoded_input.pop(key) - - # Compute token embeddings model_output = self.model(**encoded_input) # Pool Embeddings @@ -92,18 +77,23 @@ def embed( model_output=model_output, ) sentence_embeddings = pooling.pool_embeddings() + all_embeddings.append(sentence_embeddings) - all_embeddings.extend(sentence_embeddings.tolist()) - - # Reorder embeddings according to original order - all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + embeddings = torch.cat(all_embeddings, dim=0) # Normalize all embeddings if normalize_embeddings: - all_embeddings = torch.nn.functional.normalize(torch.tensor(all_embeddings), p=2, dim=1).tolist() + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + embeddings = embeddings.tolist() + + # Reorder embeddings according to original order + reordered_embeddings: List[List[float]] = [[]] * len(texts) + for embedding, idx in zip(embeddings, length_sorted_idx): + reordered_embeddings[idx] = embedding if isinstance(texts_to_embed, str): # Return the embedding if only one text was passed - all_embeddings = all_embeddings[0] + return reordered_embeddings[0] - return all_embeddings + return reordered_embeddings diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py index 4dee89633..f0310f872 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py @@ -3,9 +3,7 @@ from haystack import Document, component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace from haystack.utils.hf import HFModelType, check_valid_model, deserialize_hf_model_kwargs, serialize_hf_model_kwargs -from haystack_integrations.components.embedders.backends.optimum_backend import ( - _OptimumEmbeddingBackendFactory, -) +from haystack_integrations.components.embedders.optimum_backend import OptimumEmbeddingBackend from haystack_integrations.components.embedders.pooling import HFPoolingMode, PoolingMode @@ -64,14 +62,13 @@ def __init__( """ Create a OptimumDocumentEmbedder component. - :param model: A string representing the model id on HF Hub. Defaults to - "sentence-transformers/all-mpnet-base-v2". + :param model: A string representing the model id on HF Hub. :param token: The HuggingFace token to use as HTTP bearer authorization. :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. :param normalize_embeddings: Whether to normalize the embeddings to unit length. - :param onnx_execution_provider: The execution provider to use for ONNX models. Defaults to - "CPUExecutionProvider". See https://onnxruntime.ai/docs/execution-providers/ for possible providers. + :param onnx_execution_provider: The execution provider to use for ONNX models. See + https://onnxruntime.ai/docs/execution-providers/ for possible providers. Note: Using the TensorRT execution provider TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model @@ -91,8 +88,7 @@ def __init__( }, ) ``` - :param pooling_mode: The pooling mode to use. Defaults to None. 
When None, pooling mode will be inferred from - the model config. If not found, "mean" pooling will be used. + :param pooling_mode: The pooling mode to use. When None, pooling mode will be inferred from the model config. The supported pooling modes are: - "cls": Perform CLS Pooling on the output of the embedding model. Uses the first token (CLS token) as text representations. @@ -125,9 +121,14 @@ def __init__( # Infer pooling mode from model config if not provided, if pooling_mode is None: self.pooling_mode = HFPoolingMode.get_pooling_mode(model, token) - # Set default to "mean" if not found in model config and not specified by user + # Raise error if pooling mode is not found in model config and not specified by user if self.pooling_mode is None: - self.pooling_mode = PoolingMode.MEAN + modes = {e.value: e for e in PoolingMode} + msg = ( + f"Pooling mode not found in model config and not specified by user." + f" Supported modes are: {list(modes.keys())}" + ) + raise ValueError(msg) self.prefix = prefix self.suffix = suffix @@ -146,15 +147,14 @@ def __init__( model_kwargs.setdefault("use_auth_token", token) self.model_kwargs = model_kwargs - self.embedding_model = None - self.tokenizer = None + self.embedding_backend = None def warm_up(self): """ Load the embedding backend. """ - if not hasattr(self, "embedding_backend"): - self.embedding_backend = _OptimumEmbeddingBackendFactory.get_embedding_backend( + if self.embedding_backend is None: + self.embedding_backend = OptimumEmbeddingBackend( model=self.model, token=self.token, model_kwargs=self.model_kwargs ) @@ -227,7 +227,7 @@ def run(self, documents: List[Document]): ) raise TypeError(msg) - if not hasattr(self, "embedding_backend"): + if self.embedding_backend is None: msg = "The embedding model has not been loaded. Please call warm_up() before running." raise RuntimeError(msg) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py index 828c86111..8a33a9403 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py @@ -3,9 +3,7 @@ from haystack import component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace from haystack.utils.hf import HFModelType, check_valid_model, deserialize_hf_model_kwargs, serialize_hf_model_kwargs -from haystack_integrations.components.embedders.backends.optimum_backend import ( - _OptimumEmbeddingBackendFactory, -) +from haystack_integrations.components.embedders.optimum_backend import OptimumEmbeddingBackend from haystack_integrations.components.embedders.pooling import HFPoolingMode, PoolingMode @@ -56,14 +54,13 @@ def __init__( """ Create a OptimumTextEmbedder component. - :param model: A string representing the model id on HF Hub. Defaults to - "sentence-transformers/all-mpnet-base-v2". + :param model: A string representing the model id on HF Hub. :param token: The HuggingFace token to use as HTTP bearer authorization. :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. :param normalize_embeddings: Whether to normalize the embeddings to unit length. - :param onnx_execution_provider: The execution provider to use for ONNX models. Defaults to - "CPUExecutionProvider". 
See https://onnxruntime.ai/docs/execution-providers/ for possible providers. + :param onnx_execution_provider: The execution provider to use for ONNX models. See + https://onnxruntime.ai/docs/execution-providers/ for possible providers. Note: Using the TensorRT execution provider TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model @@ -83,8 +80,7 @@ def __init__( }, ) ``` - :param pooling_mode: The pooling mode to use. Defaults to None. When None, pooling mode will be inferred from - the model config. If not found, "mean" pooling will be used. + :param pooling_mode: The pooling mode to use. When None, pooling mode will be inferred from the model config. The supported pooling modes are: - "cls": Perform CLS Pooling on the output of the embedding model. Uses the first token (CLS token) as text representations. @@ -112,9 +108,14 @@ def __init__( # Infer pooling mode from model config if not provided, if pooling_mode is None: self.pooling_mode = HFPoolingMode.get_pooling_mode(model, token) - # Set default to "mean" if not found in model config and not specified by user + # Raise error if pooling mode is not found in model config and not specified by user if self.pooling_mode is None: - self.pooling_mode = PoolingMode.MEAN + modes = {e.value: e for e in PoolingMode} + msg = ( + f"Pooling mode not found in model config and not specified by user." + f" Supported modes are: {list(modes.keys())}" + ) + raise ValueError(msg) self.prefix = prefix self.suffix = suffix @@ -129,15 +130,14 @@ def __init__( model_kwargs.setdefault("use_auth_token", token) self.model_kwargs = model_kwargs - self.embedding_model = None - self.tokenizer = None + self.embedding_backend = None def warm_up(self): """ Load the embedding backend. """ - if not hasattr(self, "embedding_backend"): - self.embedding_backend = _OptimumEmbeddingBackendFactory.get_embedding_backend( + if self.embedding_backend is None: + self.embedding_backend = OptimumEmbeddingBackend( model=self.model, token=self.token, model_kwargs=self.model_kwargs ) @@ -188,7 +188,7 @@ def run(self, text: str): ) raise TypeError(msg) - if not hasattr(self, "embedding_backend"): + if self.embedding_backend is None: msg = "The embedding model has not been loaded. Please call warm_up() before running." 
raise RuntimeError(msg) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py b/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py index aa011b8ce..d56dcc495 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py @@ -5,6 +5,7 @@ import torch from haystack.utils import Secret from huggingface_hub import hf_hub_download +from sentence_transformers.models import Pooling as PoolingLayer class PoolingMode(Enum): @@ -40,13 +41,35 @@ def from_str(cls, string: str) -> "PoolingMode": return pooling_mode +POOLING_MODES_MAP = { + "pooling_mode_cls_token": PoolingMode.CLS, + "pooling_mode_mean_tokens": PoolingMode.MEAN, + "pooling_mode_max_tokens": PoolingMode.MAX, + "pooling_mode_mean_sqrt_len_tokens": PoolingMode.MEAN_SQRT_LEN, + "pooling_mode_weightedmean_tokens": PoolingMode.WEIGHTED_MEAN, + "pooling_mode_last_token": PoolingMode.LAST_TOKEN, +} + +INVERSE_POOLING_MODES_MAP = {mode: name for name, mode in POOLING_MODES_MAP.items()} + + class HFPoolingMode: """ - Gets the pooling mode of the Sentence Transformer model from the Hugging Face Hub. + Gets the pooling mode of the model from the Hugging Face Hub. """ @staticmethod def get_pooling_mode(model: str, token: Optional[Secret] = None) -> Optional[PoolingMode]: + """ + Gets the pooling mode of the model from the Hugging Face Hub. + + :param model: + The model to get the pooling mode for. + :param token: + The HuggingFace token to use as HTTP bearer authorization. + :returns: + The pooling mode. + """ try: pooling_config_path = hf_hub_download(repo_id=model, token=token, filename="1_Pooling/config.json") @@ -58,25 +81,17 @@ def get_pooling_mode(model: str, token: Optional[Secret] = None) -> Optional[Poo key for key, value in pooling_config.items() if key.startswith("pooling_mode") and value ] - pooling_modes_map = { - "pooling_mode_cls_token": PoolingMode.CLS, - "pooling_mode_mean_tokens": PoolingMode.MEAN, - "pooling_mode_max_tokens": PoolingMode.MAX, - "pooling_mode_mean_sqrt_len_tokens": PoolingMode.MEAN_SQRT_LEN, - "pooling_mode_weightedmean_tokens": PoolingMode.WEIGHTED_MEAN, - "pooling_mode_last_token": PoolingMode.LAST_TOKEN, - } - # If exactly one True pooling mode is found, return it if len(true_pooling_modes) == 1: pooling_mode_from_config = true_pooling_modes[0] - pooling_mode = pooling_modes_map.get(pooling_mode_from_config) + pooling_mode = POOLING_MODES_MAP.get(pooling_mode_from_config) # If no True pooling modes or more than one True pooling mode is found, return None else: pooling_mode = None return pooling_mode - except Exception: - return None + except Exception as e: + msg = f"An error occurred while inferring the pooling mode from the model config: {e}" + raise ValueError(msg) from e class Pooling: @@ -93,113 +108,6 @@ def __init__(self, pooling_mode: PoolingMode, attention_mask: torch.tensor, mode self.attention_mask = attention_mask self.model_output = model_output - def _cls_pooling(self, token_embeddings: torch.tensor) -> torch.tensor: - """ - Perform CLS Pooling on the output of the embedding model. Uses the first token (CLS token) as text - representations. - - :param model_output: The output of the embedding model. - :param attention_mask: The attention mask of the tokenized text. - :return: The embeddings of the text after mean pooling. 
- """ - embeddings = token_embeddings[:, 0] - return embeddings - - def _max_pooling(self, token_embeddings: torch.tensor, attention_mask: torch.tensor) -> torch.tensor: - """ - Perform Max Pooling on the output of the embedding model. Uses max in each dimension over all tokens. - - :param model_output: The output of the embedding model. - :param attention_mask: The attention mask of the tokenized text. - :return: The embeddings of the text after mean pooling. - """ - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - # Set padding tokens to large negative value - token_embeddings[input_mask_expanded == 0] = -1e9 - embeddings = torch.max(token_embeddings, 1)[0] - return embeddings - - def _mean_pooling(self, token_embeddings: torch.tensor, attention_mask: torch.tensor) -> torch.tensor: - """ - Perform Mean Pooling on the output of the embedding model. - - :param model_output: The output of the embedding model. - :param attention_mask: The attention mask of the tokenized text. - :return: The embeddings of the text after mean pooling. - """ - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) - return sum_embeddings / sum_mask - - def _mean_sqrt_len_pooling(self, token_embeddings: torch.tensor, attention_mask: torch.tensor) -> torch.tensor: - """ - Perform mean-pooling on the output of the embedding model, but divide by sqrt(input_length). - - :param model_output: The output of the embedding model. - :param attention_mask: The attention mask of the tokenized text. - :return: The embeddings of the text after mean pooling. - """ - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) - return sum_embeddings / torch.sqrt(sum_mask) - - def _weighted_mean_pooling(self, token_embeddings: torch.tensor, attention_mask: torch.tensor) -> torch.tensor: - """ - Perform Weighted (position) Mean Pooling on the output of the embedding model. - See https://arxiv.org/abs/2202.08904. - - :param model_output: The output of the embedding model. - :param attention_mask: The attention mask of the tokenized text. - :return: The embeddings of the text after mean pooling. - """ - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - # token_embeddings shape: bs, seq, hidden_dim - weights = ( - torch.arange(start=1, end=token_embeddings.shape[1] + 1) - .unsqueeze(0) - .unsqueeze(-1) - .expand(token_embeddings.size()) - .float() - .to(token_embeddings.device) - ) - input_mask_expanded = input_mask_expanded * weights - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) - return sum_embeddings / sum_mask - - def _last_token_pooling(self, token_embeddings: torch.tensor, attention_mask: torch.tensor) -> torch.tensor: - """ - Perform Last Token Pooling on the output of the embedding model. See https://arxiv.org/abs/2202.08904 & - https://arxiv.org/abs/2201.10005. - - :param model_output: The output of the embedding model. - :param attention_mask: The attention mask of the tokenized text. - :return: The embeddings of the text after mean pooling. 
- """ - bs, seq_len, hidden_dim = token_embeddings.shape - # attention_mask shape: (bs, seq_len) - # Get shape [bs] indices of the last token (i.e. the last token for each batch item) - # argmin gives us the index of the first 0 in the attention mask; We get the last 1 index by subtracting 1 - # Any sequence where min == 1, we use the entire sequence length since argmin = 0 - values, indices = torch.min(attention_mask, 1, keepdim=False) - gather_indices = torch.where(values == 0, indices, seq_len) - 1 # Shape [bs] - # There are empty sequences, where the index would become -1 which will crash - gather_indices = torch.clamp(gather_indices, min=0) - - # Turn indices from shape [bs] --> [bs, 1, hidden_dim] - gather_indices = gather_indices.unsqueeze(-1).repeat(1, hidden_dim) - gather_indices = gather_indices.unsqueeze(1) - - # Gather along the 1st dim (seq_len) (bs, seq_len, hidden_dim -> bs, hidden_dim) - # Actually no need for the attention mask as we gather the last token where attn_mask = 1 - # but as we set some indices (which shouldn't be attended to) to 0 with clamp, we - # use the attention mask to ignore them again - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - embeddings = torch.gather(token_embeddings * input_mask_expanded, 1, gather_indices).squeeze(dim=1) - return embeddings - def pool_embeddings(self) -> torch.tensor: """ Perform pooling on the output of the embedding model. @@ -210,22 +118,14 @@ def pool_embeddings(self) -> torch.tensor: :return: The embeddings of the text after pooling. """ pooling_func_map = { - PoolingMode.CLS: self._cls_pooling, - PoolingMode.MEAN: self._mean_pooling, - PoolingMode.MAX: self._max_pooling, - PoolingMode.MEAN_SQRT_LEN: self._mean_sqrt_len_pooling, - PoolingMode.WEIGHTED_MEAN: self._weighted_mean_pooling, - PoolingMode.LAST_TOKEN: self._last_token_pooling, + INVERSE_POOLING_MODES_MAP[self.pooling_mode]: True, } - self._pooling_function = pooling_func_map[self.pooling_mode] - # First element of model_output contains all token embeddings token_embeddings = self.model_output[0] - - embeddings = ( - self._pooling_function(token_embeddings, self.attention_mask) # type: ignore - if self._pooling_function != self._cls_pooling - else self._pooling_function(token_embeddings) # type: ignore - ) + word_embedding_dimension = token_embeddings.size(dim=2) + pooling = PoolingLayer(word_embedding_dimension=word_embedding_dimension, **pooling_func_map) + features = {"token_embeddings": token_embeddings, "attention_mask": self.attention_mask} + pooled_outputs = pooling.forward(features) + embeddings = pooled_outputs["sentence_embedding"] return embeddings diff --git a/integrations/optimum/tests/test_optimum_backend.py b/integrations/optimum/tests/test_optimum_backend.py new file mode 100644 index 000000000..be7ae8818 --- /dev/null +++ b/integrations/optimum/tests/test_optimum_backend.py @@ -0,0 +1,24 @@ +import pytest +from haystack_integrations.components.embedders.optimum_backend import OptimumEmbeddingBackend +from haystack_integrations.components.embedders.pooling import PoolingMode + + +@pytest.fixture +def backend(): + model = "sentence-transformers/all-mpnet-base-v2" + model_kwargs = {"model_id": model} + backend = OptimumEmbeddingBackend(model=model, model_kwargs=model_kwargs, token=None) + return backend + + +def test_embed_output_order(backend): + texts_to_embed = ["short text", "text that is longer than the other", "medium length text"] + embeddings = backend.embed(texts_to_embed, 
normalize_embeddings=False, pooling_mode=PoolingMode.MEAN) + + # Compute individual embeddings in order + expected_embeddings = [] + for text in texts_to_embed: + expected_embeddings.append(backend.embed(text, normalize_embeddings=False, pooling_mode=PoolingMode.MEAN)) + + # Assert that the embeddings are in the same order + assert embeddings == expected_embeddings diff --git a/integrations/optimum/tests/test_optimum_document_embedder.py b/integrations/optimum/tests/test_optimum_document_embedder.py index 612a9ab0e..f61fea1d3 100644 --- a/integrations/optimum/tests/test_optimum_document_embedder.py +++ b/integrations/optimum/tests/test_optimum_document_embedder.py @@ -176,13 +176,11 @@ def test_infer_pooling_mode_from_str(self): @pytest.mark.integration def test_default_pooling_mode_when_config_not_found(self, mock_check_valid_model): # noqa: ARG002 - embedder = OptimumDocumentEmbedder( - model="embedding_model_finetuned", - pooling_mode=None, - ) - - assert embedder.model == "embedding_model_finetuned" - assert embedder.pooling_mode == PoolingMode.MEAN + with pytest.raises(ValueError): + OptimumDocumentEmbedder( + model="embedding_model_finetuned", + pooling_mode=None, + ) @pytest.mark.integration def test_infer_pooling_mode_from_hf(self): diff --git a/integrations/optimum/tests/test_optimum_text_embedder.py b/integrations/optimum/tests/test_optimum_text_embedder.py index 52124944c..9932d1dbf 100644 --- a/integrations/optimum/tests/test_optimum_text_embedder.py +++ b/integrations/optimum/tests/test_optimum_text_embedder.py @@ -149,13 +149,11 @@ def test_infer_pooling_mode_from_str(self): @pytest.mark.integration def test_default_pooling_mode_when_config_not_found(self, mock_check_valid_model): # noqa: ARG002 - embedder = OptimumTextEmbedder( - model="embedding_model_finetuned", - pooling_mode=None, - ) - - assert embedder.model == "embedding_model_finetuned" - assert embedder.pooling_mode == PoolingMode.MEAN + with pytest.raises(ValueError): + OptimumTextEmbedder( + model="embedding_model_finetuned", + pooling_mode=None, + ) @pytest.mark.integration def test_infer_pooling_mode_from_hf(self): From de9bd7728c4e69c51a6953feaddf9b8ae48583d4 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Wed, 21 Feb 2024 15:04:02 +0530 Subject: [PATCH 07/10] Add API docs generation workflow --- .github/workflows/optimum.yml | 4 ++++ integrations/optimum/pydoc/config.yml | 32 +++++++++++++++++++++++++++ integrations/optimum/pyproject.toml | 2 ++ 3 files changed, 38 insertions(+) create mode 100644 integrations/optimum/pydoc/config.yml diff --git a/.github/workflows/optimum.yml b/.github/workflows/optimum.yml index 01d589b07..3b0d137da 100644 --- a/.github/workflows/optimum.yml +++ b/.github/workflows/optimum.yml @@ -52,5 +52,9 @@ jobs: if: matrix.python-version == '3.9' && runner.os == 'Linux' run: hatch run lint:all + - name: Generate docs + if: matrix.python-version == '3.9' && runner.os == 'Linux' + run: hatch run docs + - name: Run tests run: hatch run cov diff --git a/integrations/optimum/pydoc/config.yml b/integrations/optimum/pydoc/config.yml new file mode 100644 index 000000000..12d4d9c8f --- /dev/null +++ b/integrations/optimum/pydoc/config.yml @@ -0,0 +1,32 @@ +loaders: + - type: haystack_pydoc_tools.loaders.CustomPythonLoader + search_path: [../src] + modules: + [ + "haystack_integrations.components.embedders.optimum_backend", + "haystack_integrations.components.embedders.optimum_document_embedder", + 
"haystack_integrations.components.embedders.optimum_text_embedder", + "haystack_integrations.components.embedders.pooling", + ] + ignore_when_discovered: ["__init__"] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer + excerpt: Optimum integration for Haystack + category_slug: haystack-integrations + title: Optimum + slug: integrations-optimum + order: 190 + markdown: + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: _readme_optimum.md diff --git a/integrations/optimum/pyproject.toml b/integrations/optimum/pyproject.toml index 2c654c6b4..17bca2597 100644 --- a/integrations/optimum/pyproject.toml +++ b/integrations/optimum/pyproject.toml @@ -56,6 +56,7 @@ git_describe_command = 'git describe --tags --match="integrations/optimum-v[0-9] dependencies = [ "coverage[toml]>=6.5", "pytest", + "haystack-pydoc-tools" ] [tool.hatch.envs.default.scripts] test = "pytest {args:tests}" @@ -68,6 +69,7 @@ cov = [ "test-cov", "cov-report", ] +docs = ["pydoc-markdown pydoc/config.yml"] [[tool.hatch.envs.all.matrix]] python = ["3.8", "3.9", "3.10", "3.11", "3.12"] From 096b1d349699aefd49d7b292c3285ab1e02996c8 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Wed, 21 Feb 2024 16:44:35 +0530 Subject: [PATCH 08/10] Add additional tests --- .../components/embedders/pooling.py | 8 +++++- .../optimum/tests/test_optimum_backend.py | 26 ++++++++++++------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py b/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py index d56dcc495..8f0cddc22 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/pooling.py @@ -47,7 +47,7 @@ def from_str(cls, string: str) -> "PoolingMode": "pooling_mode_max_tokens": PoolingMode.MAX, "pooling_mode_mean_sqrt_len_tokens": PoolingMode.MEAN_SQRT_LEN, "pooling_mode_weightedmean_tokens": PoolingMode.WEIGHTED_MEAN, - "pooling_mode_last_token": PoolingMode.LAST_TOKEN, + "pooling_mode_lasttoken": PoolingMode.LAST_TOKEN, } INVERSE_POOLING_MODES_MAP = {mode: name for name, mode in POOLING_MODES_MAP.items()} @@ -120,6 +120,12 @@ def pool_embeddings(self) -> torch.tensor: pooling_func_map = { INVERSE_POOLING_MODES_MAP[self.pooling_mode]: True, } + # By default, sentence-transformers uses mean pooling + # If multiple pooling methods are specified, the output dimension of the embeddings is scaled by the number of + # pooling methods selected + if self.pooling_mode != PoolingMode.MEAN: + pooling_func_map[INVERSE_POOLING_MODES_MAP[PoolingMode.MEAN]] = False + # First element of model_output contains all token embeddings token_embeddings = self.model_output[0] word_embedding_dimension = token_embeddings.size(dim=2) diff --git a/integrations/optimum/tests/test_optimum_backend.py b/integrations/optimum/tests/test_optimum_backend.py index be7ae8818..8ef61fd37 100644 --- a/integrations/optimum/tests/test_optimum_backend.py +++ b/integrations/optimum/tests/test_optimum_backend.py @@ -11,14 +11,22 @@ def backend(): return backend -def test_embed_output_order(backend): - texts_to_embed = ["short text", "text that is longer than the 
other", "medium length text"] - embeddings = backend.embed(texts_to_embed, normalize_embeddings=False, pooling_mode=PoolingMode.MEAN) +class TestOptimumBackend: + def test_embed_output_order(self, backend): + texts_to_embed = ["short text", "text that is longer than the other", "medium length text"] + embeddings = backend.embed(texts_to_embed, normalize_embeddings=False, pooling_mode=PoolingMode.MEAN) - # Compute individual embeddings in order - expected_embeddings = [] - for text in texts_to_embed: - expected_embeddings.append(backend.embed(text, normalize_embeddings=False, pooling_mode=PoolingMode.MEAN)) + # Compute individual embeddings in order + expected_embeddings = [] + for text in texts_to_embed: + expected_embeddings.append(backend.embed(text, normalize_embeddings=False, pooling_mode=PoolingMode.MEAN)) - # Assert that the embeddings are in the same order - assert embeddings == expected_embeddings + # Assert that the embeddings are in the same order + assert embeddings == expected_embeddings + + def test_run_pooling_modes(self, backend): + for pooling_mode in PoolingMode: + embedding = backend.embed("test text", normalize_embeddings=False, pooling_mode=pooling_mode) + + assert len(embedding) == 768 + assert all(isinstance(x, float) for x in embedding) From 98bc9033ae28c149c7486ac3e6798606ed14d5c0 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:29:17 +0530 Subject: [PATCH 09/10] Update order to 185 for API docs --- integrations/optimum/pydoc/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/optimum/pydoc/config.yml b/integrations/optimum/pydoc/config.yml index 12d4d9c8f..4ee7f73bf 100644 --- a/integrations/optimum/pydoc/config.yml +++ b/integrations/optimum/pydoc/config.yml @@ -23,7 +23,7 @@ renderer: category_slug: haystack-integrations title: Optimum slug: integrations-optimum - order: 190 + order: 185 markdown: descriptive_class_title: false descriptive_module_title: true From ce4bb7699fc5a923a6bce503f716b3896eeee361 Mon Sep 17 00:00:00 2001 From: Ashwin Mathur <97467100+awinml@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:55:40 +0530 Subject: [PATCH 10/10] Update integrations/optimum/pydoc/config.yml Co-authored-by: Daria Fokina --- integrations/optimum/pydoc/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/optimum/pydoc/config.yml b/integrations/optimum/pydoc/config.yml index 4ee7f73bf..5fb353b5d 100644 --- a/integrations/optimum/pydoc/config.yml +++ b/integrations/optimum/pydoc/config.yml @@ -20,7 +20,7 @@ processors: renderer: type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer excerpt: Optimum integration for Haystack - category_slug: haystack-integrations + category_slug: integrations-api title: Optimum slug: integrations-optimum order: 185