From 1e98d90c96f67772bb3bbc2e772ee5583fe36ef9 Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 14:14:44 +0100 Subject: [PATCH 01/49] created project --- integrations/fastembed-haystack/LICENSE.txt | 73 +++++++ integrations/fastembed-haystack/README.md | 21 ++ .../fastembed-haystack/pyproject.toml | 84 ++++++++ .../embedders/fastembed/__about__.py | 4 + .../embedders/fastembed/__init__.py | 8 + .../fastembed/embedding_backend/__init__.py | 3 + .../embedding_backend/fastembed_backend.py | 39 ++++ .../fastembed/fastembed_text_embedder.py | 119 ++++++++++++ .../fastembed-haystack/tests/__init__.py | 3 + .../tests/test_fastembed_backend.py | 54 ++++++ .../tests/test_fastembed_text_embedder.py | 179 ++++++++++++++++++ 11 files changed, 587 insertions(+) create mode 100644 integrations/fastembed-haystack/LICENSE.txt create mode 100644 integrations/fastembed-haystack/README.md create mode 100644 integrations/fastembed-haystack/pyproject.toml create mode 100644 integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__about__.py create mode 100644 integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__init__.py create mode 100644 integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/__init__.py create mode 100644 integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py create mode 100644 integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py create mode 100644 integrations/fastembed-haystack/tests/__init__.py create mode 100644 integrations/fastembed-haystack/tests/test_fastembed_backend.py create mode 100644 integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py diff --git a/integrations/fastembed-haystack/LICENSE.txt b/integrations/fastembed-haystack/LICENSE.txt new file mode 100644 index 
000000000..137069b82 --- /dev/null +++ b/integrations/fastembed-haystack/LICENSE.txt @@ -0,0 +1,73 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed 
as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + + You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/integrations/fastembed-haystack/README.md b/integrations/fastembed-haystack/README.md new file mode 100644 index 000000000..90701af0f --- /dev/null +++ b/integrations/fastembed-haystack/README.md @@ -0,0 +1,21 @@ +# fastembed-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/fastembed-haystack.svg)](https://pypi.org/project/fastembed-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/fastembed-haystack.svg)](https://pypi.org/project/fastembed-haystack) + +----- + +**Table of Contents** + +- [Installation](#installation) +- [License](#license) + +## Installation + +```console +pip install fastembed-haystack +``` + +## License + +`fastembed-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. 
diff --git a/integrations/fastembed-haystack/pyproject.toml b/integrations/fastembed-haystack/pyproject.toml new file mode 100644 index 000000000..995d25a57 --- /dev/null +++ b/integrations/fastembed-haystack/pyproject.toml @@ -0,0 +1,84 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "fastembed-haystack" +dynamic = ["version"] +description = 'support for fastembed ' +readme = "README.md" +requires-python = ">=3.8" +license = "Apache-2.0" +keywords = [] +authors = [ + { name = "deepset GmbH", email = "info@deepset.ai" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ +"haystack-ai", +"fastembed>=0.2", +] + +[project.urls] +Documentation = "https://github.com/unknown/fastembed-haystack#readme" +Issues = "https://github.com/unknown/fastembed-haystack/issues" +Source = "https://github.com/unknown/fastembed-haystack" + +[tool.hatch.version] +path = "src/fastembed_haystack/__about__.py" + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = [ + "- coverage combine", + "coverage report", +] +cov = [ + "test-cov", + "cov-report", +] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11", "3.12"] + +[tool.hatch.envs.types] +dependencies = [ + "mypy>=1.0.0", +] +[tool.hatch.envs.types.scripts] +check = "mypy --install-types --non-interactive {args:src/fastembed_haystack tests}" + +[tool.coverage.run] +source_pkgs = ["fastembed_haystack", 
"tests"] +branch = true +parallel = true +omit = [ + "src/fastembed_haystack/__about__.py", +] + +[tool.coverage.paths] +fastembed_haystack = ["src/fastembed_haystack", "*/fastembed-haystack/src/fastembed_haystack"] +tests = ["tests", "*/fastembed-haystack/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__about__.py b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__about__.py new file mode 100644 index 000000000..dc11cd317 --- /dev/null +++ b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__about__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +__version__ = "0.0.1" diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__init__.py b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__init__.py new file mode 100644 index 000000000..35ad74603 --- /dev/null +++ b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +#from .document_embedder import FastembedDocumentEmbedder +from .fastembed_text_embedder import FastembedTextEmbedder + +#__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder"] +__all__ = ["FastembedTextEmbedder"] \ No newline at end of file diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/__init__.py b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/__init__.py new file mode 100644 index 000000000..e873bc332 --- /dev/null +++ 
b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py new file mode 100644 index 000000000..2f94d8338 --- /dev/null +++ b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -0,0 +1,39 @@ +from typing import ClassVar, Dict, List + +from fastembed import TextEmbedding + + +class _FastembedEmbeddingBackendFactory: + """ + Factory class to create instances of fastembed embedding backends. + """ + + _instances: ClassVar[Dict[str, "_FastembedEmbeddingBackendFactory"]] = {} + + @staticmethod + def get_embedding_backend( + model_name: str): + embedding_backend_id = f"{model_name}" + + if embedding_backend_id in _FastembedEmbeddingBackendFactory._instances: + return _FastembedEmbeddingBackendFactory._instances[embedding_backend_id] + + embedding_backend = _FastembedEmbeddingBackend( + model_name=model_name + ) + _FastembedEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend + return embedding_backend + +class _FastembedEmbeddingBackend: + """ + Class to manage fastembed embeddings. 
+ """ + + def __init__( + self, model_name: str + ): + self.model = TextEmbedding(model_name=model_name) + + def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]: + embeddings = list(self.model.embed(data, **kwargs)) + return embeddings \ No newline at end of file diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py new file mode 100644 index 000000000..4f5a4238f --- /dev/null +++ b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py @@ -0,0 +1,119 @@ +from typing import Any, Dict, List + +from haystack import component, default_from_dict, default_to_dict + +from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackendFactory + + +@component +class FastembedTextEmbedder: + """ + A component for embedding strings using fastembed embedding models. + + Usage example: + ```python + # To use this component, install the "fastembed" package. + # pip install fastembed + + from fastembed_haystack.text_embedder import FastembedTextEmbedder + + text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!" + instruction = ( + "Represent the Amazon comment for classifying the sentence as positive or negative" + ) + + text_embedder = FastembedTextEmbedder( + model="BAAI/bge-small-en-v1.5" + ) + + embedding = text_embedder.run(text) + ``` + """ # noqa: E501 + + def __init__( + self, + model: str = "BAAI/bge-small-en-v1.5", + batch_size: int = 256, + progress_bar: bool = True, + normalize_embeddings: bool = False, + ): + """ + Create a FastembedTextEmbedder component. + + :param model: Local path or name of the model in Hugging Face's model hub, + such as ``'hkunlp/instructor-base'``. 
+ :param device: Device (like 'cuda' / 'cpu') that should be used for computation. + If None, checks if a GPU can be used. + :param use_auth_token: The API token used to download private models from Hugging Face. + If this parameter is set to `True`, then the token generated when running + `transformers-cli login` (stored in ~/.huggingface) will be used. + :param instruction: The instruction string to be used while computing domain-specific embeddings. + The instruction follows the unified template of the form: + "Represent the 'domain' 'text_type' for 'task_objective'", where: + - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. + - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. + - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, + classify the sentence, etc. + Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases. + :param batch_size: Number of strings to encode at once. + :param progress_bar: If true, displays progress bar during embedding. + :param normalize_embeddings: If set to true, returned vectors will have the length of 1. + """ + + self.model_name = model + self.batch_size = batch_size + self.progress_bar = progress_bar + self.normalize_embeddings = normalize_embeddings + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict( + self, + model=self.model_name, + # device=self.device, + # use_auth_token=self.use_auth_token, + # instruction=self.instruction, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "FastembedTextEmbedder": + """ + Deserialize this component from a dictionary. 
+ """ + return default_from_dict(cls, data) + + def warm_up(self): + """ + Load the embedding backend. + """ + if not hasattr(self, "embedding_backend"): + self.embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( + model_name=self.model_name + ) + + @component.output_types(embedding=List[float]) + def run(self, text: str): + """Embed a string.""" + if not isinstance(text, str): + msg = ( + "FastembedTextEmbedder expects a string as input. " + "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder." + ) + raise TypeError(msg) + if not hasattr(self, "embedding_backend"): + msg = "The embedding model has not been loaded. Please call warm_up() before running." + raise RuntimeError(msg) + + text_to_embed = [text] + embedding = list(self.embedding_backend.embed( + text_to_embed, + batch_size=self.batch_size, + show_progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + )[0]) + return {"embedding": embedding} \ No newline at end of file diff --git a/integrations/fastembed-haystack/tests/__init__.py b/integrations/fastembed-haystack/tests/__init__.py new file mode 100644 index 000000000..6b5e14dc1 --- /dev/null +++ b/integrations/fastembed-haystack/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/fastembed-haystack/tests/test_fastembed_backend.py b/integrations/fastembed-haystack/tests/test_fastembed_backend.py new file mode 100644 index 000000000..b5a5199fe --- /dev/null +++ b/integrations/fastembed-haystack/tests/test_fastembed_backend.py @@ -0,0 +1,54 @@ +from unittest.mock import patch + +from haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend import ( + _FastembedEmbeddingBackendFactory, +) + + +@patch( + "haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend.TextEmbedding" +) +def 
test_factory_behavior(mock_instructor): # noqa: ARG001 + embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( + model_name="BAAI/bge-small-en-v1.5" + ) + same_embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend("BAAI/bge-small-en-v1.5") + another_embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( + model_name="BAAI/bge-base-en-v1.5" + ) + + assert same_embedding_backend is embedding_backend + assert another_embedding_backend is not embedding_backend + + # restore the factory state + _FastembedEmbeddingBackendFactory._instances = {} + + +@patch( + "haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend.TextEmbedding" +) +def test_model_initialization(mock_instructor): + _FastembedEmbeddingBackendFactory.get_embedding_backend( + model_name="BAAI/bge-small-en-v1.5", + ) + mock_instructor.assert_called_once_with( + model_name="BAAI/bge-small-en-v1.5", + ) + # restore the factory state + _FastembedEmbeddingBackendFactory._instances = {} + + +@patch( + "haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend.TextEmbedding" +) +def test_embedding_function_with_kwargs(mock_instructor): # noqa: ARG001 + embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( + model_name="BAAI/bge-small-en-v1.5" + ) + + data = ["sentence1", "sentence2"] + embedding_backend.embed(data=data) + + embedding_backend.model.embed.assert_called_once_with(data) + # restore the factory stateTrue + _FastembedEmbeddingBackendFactory._instances = {} \ No newline at end of file diff --git a/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py b/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py new file mode 100644 index 000000000..265a91696 --- /dev/null +++ b/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py @@ -0,0 +1,179 @@ +from unittest.mock import MagicMock, patch + +import numpy as 
np +import pytest +from haystack_integrations.components.embedders.fastembed.fastembed_text_embedder import FastembedTextEmbedder + + +class TestFastembedTextEmbedder: + def test_init_default(self): + """ + Test default initialization parameters for FastembedTextEmbedder. + """ + embedder = FastembedTextEmbedder(model="BAAI/bge-small-en-v1.5") + assert embedder.model_name == "BAAI/bge-small-en-v1.5" + assert embedder.batch_size == 256 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + + def test_init_with_parameters(self): + """ + Test custom initialization parameters for FastembedTextEmbedder. + """ + embedder = FastembedTextEmbedder( + model="BAAI/bge-small-en-v1.5", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + ) + assert embedder.model_name == "BAAI/bge-small-en-v1.5" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + + def test_to_dict(self): + """ + Test serialization of FastembedTextEmbedder to a dictionary, using default initialization parameters. + """ + embedder = FastembedTextEmbedder(model="BAAI/bge-small-en-v1.5") + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder.FastembedTextEmbedder", # noqa + "init_parameters": { + "model": "BAAI/bge-small-en-v1.5", + "batch_size": 256, + "progress_bar": True, + "normalize_embeddings": False, + }, + } + + def test_to_dict_with_custom_init_parameters(self): + """ + Test serialization of FastembedTextEmbedder to a dictionary, using custom initialization parameters. 
+ """ + embedder = FastembedTextEmbedder( + model="BAAI/bge-small-en-v1.5", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + ) + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder.FastembedTextEmbedder", # noqa + "init_parameters": { + "model": "BAAI/bge-small-en-v1.5", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + }, + } + + def test_from_dict(self): + """ + Test deserialization of FastembedTextEmbedder from a dictionary, using default initialization parameters. + """ + embedder_dict = { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder.FastembedTextEmbedder", # noqa + "init_parameters": { + "model": "BAAI/bge-small-en-v1.5", + "batch_size": 256, + "progress_bar": True, + "normalize_embeddings": False, + }, + } + embedder = FastembedTextEmbedder.from_dict(embedder_dict) + assert embedder.model_name == "BAAI/bge-small-en-v1.5" + assert embedder.batch_size == 256 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + + def test_from_dict_with_custom_init_parameters(self): + """ + Test deserialization of FastembedTextEmbedder from a dictionary, using custom initialization parameters. 
+ """ + embedder_dict = { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder.FastembedTextEmbedder", # noqa + "init_parameters": { + "model": "BAAI/bge-small-en-v1.5", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + }, + } + embedder = FastembedTextEmbedder.from_dict(embedder_dict) + assert embedder.model_name == "BAAI/bge-small-en-v1.5" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + + @patch( + "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder._FastembedEmbeddingBackendFactory" + ) + def test_warmup(self, mocked_factory): + """ + Test for checking embedder instances after warm-up. + """ + embedder = FastembedTextEmbedder(model="BAAI/bge-small-en-v1.5") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once_with( + model_name="BAAI/bge-small-en-v1.5" + ) + + @patch( + "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder._FastembedEmbeddingBackendFactory" + ) + def test_warmup_does_not_reload(self, mocked_factory): + """ + Test for checking backend instances after multiple warm-ups. + """ + embedder = FastembedTextEmbedder(model="BAAI/bge-small-en-v1.5") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once() + + def test_embed(self): + """ + Test for checking output dimensions and embedding dimensions. 
+ """ + embedder = FastembedTextEmbedder(model="BAAI/bge-base-en-v1.5") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 + + text = "Good text to embed" + + result = embedder.run(text=text) + embedding = result["embedding"] + + assert isinstance(embedding, list) + assert all(isinstance(emb, float) for emb in embedding) + + def test_run_wrong_incorrect_format(self): + """ + Test for checking incorrect input format when creating embedding. + """ + embedder = FastembedTextEmbedder(model="BAAI/bge-base-en-v1.5") + embedder.embedding_backend = MagicMock() + + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="FastembedTextEmbedder expects a string as input"): + embedder.run(text=list_integers_input) + + @pytest.mark.integration + def test_run(self): + embedder = FastembedTextEmbedder( + model="BAAI/bge-small-en-v1.5", + ) + embedder.warm_up() + # embedder.embedding_backend = MagicMock() + + text = "Parton energy loss in QCD matter" + + result = embedder.run(text=text) + embedding = result["embedding"] + + assert isinstance(embedding, list) + assert len(embedding) == 384 + assert all(isinstance(emb.item(), float) for emb in embedding) \ No newline at end of file From ab89dc82bf0e921b03fe9e868edac89209ac3af9 Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 14:29:58 +0100 Subject: [PATCH 02/49] added parallel param --- .../fastembed/fastembed_text_embedder.py | 33 +++++++------------ 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py index 4f5a4238f..eae85161c 100644 --- a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py +++ 
b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py @@ -18,9 +18,6 @@ class FastembedTextEmbedder: from fastembed_haystack.text_embedder import FastembedTextEmbedder text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!" - instruction = ( - "Represent the Amazon comment for classifying the sentence as positive or negative" - ) text_embedder = FastembedTextEmbedder( model="BAAI/bge-small-en-v1.5" @@ -34,34 +31,27 @@ def __init__( self, model: str = "BAAI/bge-small-en-v1.5", batch_size: int = 256, + parallel: int = None, progress_bar: bool = True, normalize_embeddings: bool = False, ): """ Create a FastembedTextEmbedder component. - :param model: Local path or name of the model in Hugging Face's model hub, - such as ``'hkunlp/instructor-base'``. - :param device: Device (like 'cuda' / 'cpu') that should be used for computation. - If None, checks if a GPU can be used. - :param use_auth_token: The API token used to download private models from Hugging Face. - If this parameter is set to `True`, then the token generated when running - `transformers-cli login` (stored in ~/.huggingface) will be used. - :param instruction: The instruction string to be used while computing domain-specific embeddings. - The instruction follows the unified template of the form: - "Represent the 'domain' 'text_type' for 'task_objective'", where: - - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. - - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. - - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, - classify the sentence, etc. - Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases. 
+ :param model: Local path or name of the model in Fastembed's model hub, + such as ``'BAAI/bge-small-en-v1.5'``. :param batch_size: Number of strings to encode at once. + :param parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. :param progress_bar: If true, displays progress bar during embedding. :param normalize_embeddings: If set to true, returned vectors will have the length of 1. """ self.model_name = model self.batch_size = batch_size + self.parallel = parallel self.progress_bar = progress_bar self.normalize_embeddings = normalize_embeddings @@ -72,10 +62,8 @@ def to_dict(self) -> Dict[str, Any]: return default_to_dict( self, model=self.model_name, - # device=self.device, - # use_auth_token=self.use_auth_token, - # instruction=self.instruction, batch_size=self.batch_size, + parallel=self.parallel, progress_bar=self.progress_bar, normalize_embeddings=self.normalize_embeddings, ) @@ -102,7 +90,7 @@ def run(self, text: str): if not isinstance(text, str): msg = ( "FastembedTextEmbedder expects a string as input. " - "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder." + "In case you want to embed a list of Documents, please use the FastembedDocumentEmbedder." 
) raise TypeError(msg) if not hasattr(self, "embedding_backend"): @@ -113,6 +101,7 @@ def run(self, text: str): embedding = list(self.embedding_backend.embed( text_to_embed, batch_size=self.batch_size, + parallel=self.parallel, show_progress_bar=self.progress_bar, normalize_embeddings=self.normalize_embeddings, )[0]) From 4dae714ff880288be10d013d42751a7c80629b52 Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 14:30:06 +0100 Subject: [PATCH 03/49] updated test --- .../tests/test_fastembed_text_embedder.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py b/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py index 265a91696..245a5e033 100644 --- a/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py +++ b/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py @@ -13,6 +13,7 @@ def test_init_default(self): embedder = FastembedTextEmbedder(model="BAAI/bge-small-en-v1.5") assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 256 + assert embedder.parallel is None assert embedder.progress_bar is True assert embedder.normalize_embeddings is False @@ -23,11 +24,13 @@ def test_init_with_parameters(self): embedder = FastembedTextEmbedder( model="BAAI/bge-small-en-v1.5", batch_size=64, + parallel=0, progress_bar=False, normalize_embeddings=True, ) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 64 + assert embedder.parallel == 0 assert embedder.progress_bar is False assert embedder.normalize_embeddings is True @@ -42,6 +45,7 @@ def test_to_dict(self): "init_parameters": { "model": "BAAI/bge-small-en-v1.5", "batch_size": 256, + "parallel": None, "progress_bar": True, "normalize_embeddings": False, }, @@ -54,6 +58,7 @@ def test_to_dict_with_custom_init_parameters(self): embedder = FastembedTextEmbedder( model="BAAI/bge-small-en-v1.5", batch_size=64, + parallel=1, 
progress_bar=False, normalize_embeddings=True, ) @@ -63,6 +68,7 @@ def test_to_dict_with_custom_init_parameters(self): "init_parameters": { "model": "BAAI/bge-small-en-v1.5", "batch_size": 64, + "parallel":1, "progress_bar": False, "normalize_embeddings": True, }, @@ -77,6 +83,7 @@ def test_from_dict(self): "init_parameters": { "model": "BAAI/bge-small-en-v1.5", "batch_size": 256, + "parallel": None, "progress_bar": True, "normalize_embeddings": False, }, @@ -84,6 +91,7 @@ def test_from_dict(self): embedder = FastembedTextEmbedder.from_dict(embedder_dict) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 256 + assert embedder.parallel is None assert embedder.progress_bar is True assert embedder.normalize_embeddings is False @@ -96,6 +104,7 @@ def test_from_dict_with_custom_init_parameters(self): "init_parameters": { "model": "BAAI/bge-small-en-v1.5", "batch_size": 64, + "parallel": 1, "progress_bar": False, "normalize_embeddings": True, }, @@ -103,6 +112,7 @@ def test_from_dict_with_custom_init_parameters(self): embedder = FastembedTextEmbedder.from_dict(embedder_dict) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 64 + assert embedder.parallel == 1 assert embedder.progress_bar is False assert embedder.normalize_embeddings is True From 1c6d4e843ced8dc432d7fcbe7ac477bcbf5955d2 Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 16:40:42 +0100 Subject: [PATCH 04/49] version 0.0.1 --- .../embedders/fastembed/__init__.py | 5 +- .../embedding_backend/fastembed_backend.py | 15 +- .../fastembed/fastembed_document_embedder.py | 159 +++++++++++ .../fastembed/fastembed_text_embedder.py | 35 ++- .../tests/test_fastembed_backend.py | 6 +- .../tests/test_fastembed_document_embedder.py | 252 ++++++++++++++++++ .../tests/test_fastembed_text_embedder.py | 24 +- 7 files changed, 453 insertions(+), 43 deletions(-) create mode 100644 
integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py create mode 100644 integrations/fastembed-haystack/tests/test_fastembed_document_embedder.py diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__init__.py b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__init__.py index 35ad74603..fdf4dd8de 100644 --- a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__init__.py +++ b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__init__.py @@ -1,8 +1,7 @@ # SPDX-FileCopyrightText: 2024-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -#from .document_embedder import FastembedDocumentEmbedder +from .fastembed_document_embedder import FastembedDocumentEmbedder from .fastembed_text_embedder import FastembedTextEmbedder -#__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder"] -__all__ = ["FastembedTextEmbedder"] \ No newline at end of file +__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder"] diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index 2f94d8338..bf7313103 100644 --- a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -12,28 +12,33 @@ class _FastembedEmbeddingBackendFactory: @staticmethod def get_embedding_backend( - model_name: str): + model_name: str, + ): embedding_backend_id = f"{model_name}" if embedding_backend_id in 
_FastembedEmbeddingBackendFactory._instances: return _FastembedEmbeddingBackendFactory._instances[embedding_backend_id] embedding_backend = _FastembedEmbeddingBackend( - model_name=model_name + model_name=model_name, + ) + _FastembedEmbeddingBackendFactory._instances[embedding_backend_id] = ( + embedding_backend ) - _FastembedEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend return embedding_backend + class _FastembedEmbeddingBackend: """ Class to manage fastembed embeddings. """ def __init__( - self, model_name: str + self, + model_name: str, ): self.model = TextEmbedding(model_name=model_name) def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]: embeddings = list(self.model.embed(data, **kwargs)) - return embeddings \ No newline at end of file + return embeddings diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py new file mode 100644 index 000000000..0bf658b6f --- /dev/null +++ b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py @@ -0,0 +1,159 @@ +from typing import Any, Dict, List, Optional + +from haystack import Document, component, default_from_dict, default_to_dict + +from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackendFactory + + +@component +class FastembedDocumentEmbedder: + """ + A component for computing Document embeddings using Fastembed embedding models. + The embedding of each Document is stored in the `embedding` field of the Document. + + Usage example: + ```python + # To use this component, install the "fastembed-haystack" package. 
+ # pip install fastembed-haystack + + from fastembed_haystack.fastembed__document_embedder import FastembedDocumentEmbedder + from haystack.dataclasses import Document + + doc_embedder = FastembedDocumentEmbedder( + model="BAAI/bge-small-en-v1.5", + batch_size=256, + ) + + doc_embedder.warm_up() + + # Text taken from PubMed QA Dataset (https://huggingface.co/datasets/pubmed_qa) + document_list = [ + Document( + content="Oxidative stress generated within inflammatory joints can produce autoimmune phenomena and joint destruction. Radical species with oxidative activity, including reactive nitrogen species, represent mediators of inflammation and cartilage damage.", + meta={ + "pubid": "25,445,628", + "long_answer": "yes", + }, + ), + Document( + content="Plasma levels of pancreatic polypeptide (PP) rise upon food intake. Although other pancreatic islet hormones, such as insulin and glucagon, have been extensively investigated, PP secretion and actions are still poorly understood.", + meta={ + "pubid": "25,445,712", + "long_answer": "yes", + }, + ), + ] + + result = doc_embedder.run(document_list) + print(f"Document Text: {result['documents'][0].content}") + print(f"Document Embedding: {result['documents'][0].embedding}") + print(f"Embedding Dimension: {len(result['documents'][0].embedding)}") + """ # noqa: E501 + + def __init__( + self, + model: str = "BAAI/bge-small-en-v1.5", + batch_size: int = 256, + progress_bar: bool = True, + normalize_embeddings: bool = False, + meta_fields_to_embed: Optional[List[str]] = None, + embedding_separator: str = "\n", + ): + """ + Create an FastembedDocumentEmbedder component. + + :param model: Local path or name of the model in Hugging Face's model hub, + such as ``'BAAI/bge-small-en-v1.5'``. + :param batch_size: Number of strings to encode at once. + :param progress_bar: If true, displays progress bar during embedding. + :param normalize_embeddings: If set to true, returned vectors will have the length of 1. 
+ :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document content. + :param embedding_separator: Separator used to concatenate the meta fields to the Document content. + """ + + self.model_name = model + self.batch_size = batch_size + self.progress_bar = progress_bar + self.normalize_embeddings = normalize_embeddings + self.meta_fields_to_embed = meta_fields_to_embed or [] + self.embedding_separator = embedding_separator + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict( + self, + model=self.model_name, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + meta_fields_to_embed=self.meta_fields_to_embed, + embedding_separator=self.embedding_separator, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "FastembedDocumentEmbedder": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + def warm_up(self): + """ + Load the embedding backend. + """ + if not hasattr(self, "embedding_backend"): + self.embedding_backend = ( + _FastembedEmbeddingBackendFactory.get_embedding_backend( + model_name=self.model_name + ) + ) + + @component.output_types(documents=List[Document]) + def run(self, documents: List[Document]): + """ + Embed a list of Documents. + The embedding of each Document is stored in the `embedding` field of the Document. + """ + if ( + not isinstance(documents, list) + or documents + and not isinstance(documents[0], Document) + ): + msg = ( + "FastembedDocumentEmbedder expects a list of Documents as input. " + "In case you want to embed a list of strings, please use the FastembedTextEmbedder." + ) + raise TypeError(msg) + if not hasattr(self, "embedding_backend"): + msg = "The embedding model has not been loaded. Please call warm_up() before running." 
+ raise RuntimeError(msg) + + # TODO: once non textual Documents are properly supported, we should also prepare them for embedding here + + texts_to_embed = [] + for doc in documents: + meta_values_to_embed = [ + str(doc.meta[key]) + for key in self.meta_fields_to_embed + if key in doc.meta and doc.meta[key] is not None + ] + text_to_embed = [ + self.embedding_separator.join( + [*meta_values_to_embed, doc.content or ""] + ), + ] + + texts_to_embed.append(text_to_embed[0]) + embeddings = self.embedding_backend.embed( + texts_to_embed, + batch_size=self.batch_size, + show_progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + ) + + for doc, emb in zip(documents, embeddings): + doc.embedding = list(emb) + + return {"documents": documents} diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py index eae85161c..063e8348a 100644 --- a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py +++ b/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py @@ -15,7 +15,7 @@ class FastembedTextEmbedder: # To use this component, install the "fastembed" package. # pip install fastembed - from fastembed_haystack.text_embedder import FastembedTextEmbedder + from fastembed_haystack.fastembed_text_embedder import FastembedTextEmbedder text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!" 
@@ -31,7 +31,6 @@ def __init__( self, model: str = "BAAI/bge-small-en-v1.5", batch_size: int = 256, - parallel: int = None, progress_bar: bool = True, normalize_embeddings: bool = False, ): @@ -41,17 +40,13 @@ def __init__( :param model: Local path or name of the model in Fastembed's model hub, such as ``'BAAI/bge-small-en-v1.5'``. :param batch_size: Number of strings to encode at once. - :param parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - :param progress_bar: If true, displays progress bar during embedding. :param normalize_embeddings: If set to true, returned vectors will have the length of 1. """ + # TOD0 add parallel + self.model_name = model self.batch_size = batch_size - self.parallel = parallel self.progress_bar = progress_bar self.normalize_embeddings = normalize_embeddings @@ -63,7 +58,6 @@ def to_dict(self) -> Dict[str, Any]: self, model=self.model_name, batch_size=self.batch_size, - parallel=self.parallel, progress_bar=self.progress_bar, normalize_embeddings=self.normalize_embeddings, ) @@ -80,8 +74,10 @@ def warm_up(self): Load the embedding backend. 
""" if not hasattr(self, "embedding_backend"): - self.embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( - model_name=self.model_name + self.embedding_backend = ( + _FastembedEmbeddingBackendFactory.get_embedding_backend( + model_name=self.model_name + ) ) @component.output_types(embedding=List[float]) @@ -98,11 +94,12 @@ def run(self, text: str): raise RuntimeError(msg) text_to_embed = [text] - embedding = list(self.embedding_backend.embed( - text_to_embed, - batch_size=self.batch_size, - parallel=self.parallel, - show_progress_bar=self.progress_bar, - normalize_embeddings=self.normalize_embeddings, - )[0]) - return {"embedding": embedding} \ No newline at end of file + embedding = list( + self.embedding_backend.embed( + text_to_embed, + batch_size=self.batch_size, + show_progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + )[0] + ) + return {"embedding": embedding} diff --git a/integrations/fastembed-haystack/tests/test_fastembed_backend.py b/integrations/fastembed-haystack/tests/test_fastembed_backend.py index b5a5199fe..735301eb4 100644 --- a/integrations/fastembed-haystack/tests/test_fastembed_backend.py +++ b/integrations/fastembed-haystack/tests/test_fastembed_backend.py @@ -12,7 +12,9 @@ def test_factory_behavior(mock_instructor): # noqa: ARG001 embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( model_name="BAAI/bge-small-en-v1.5" ) - same_embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend("BAAI/bge-small-en-v1.5") + same_embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( + "BAAI/bge-small-en-v1.5" + ) another_embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( model_name="BAAI/bge-base-en-v1.5" ) @@ -51,4 +53,4 @@ def test_embedding_function_with_kwargs(mock_instructor): # noqa: ARG001 embedding_backend.model.embed.assert_called_once_with(data) # restore the factory stateTrue - 
_FastembedEmbeddingBackendFactory._instances = {} \ No newline at end of file + _FastembedEmbeddingBackendFactory._instances = {} diff --git a/integrations/fastembed-haystack/tests/test_fastembed_document_embedder.py b/integrations/fastembed-haystack/tests/test_fastembed_document_embedder.py new file mode 100644 index 000000000..590f30c21 --- /dev/null +++ b/integrations/fastembed-haystack/tests/test_fastembed_document_embedder.py @@ -0,0 +1,252 @@ +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest +from haystack import Document +from haystack_integrations.components.embedders.fastembed.fastembed_document_embedder import ( + FastembedDocumentEmbedder, +) + + +class TestFastembedDocumentEmbedder: + def test_init_default(self): + """ + Test default initialization parameters for FastembedDocumentEmbedder. + """ + embedder = FastembedDocumentEmbedder(model="BAAI/bge-small-en-v1.5") + assert embedder.model_name == "BAAI/bge-small-en-v1.5" + assert embedder.batch_size == 256 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + assert embedder.meta_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + def test_init_with_parameters(self): + """ + Test custom initialization parameters for FastembedDocumentEmbedder. + """ + embedder = FastembedDocumentEmbedder( + model="BAAI/bge-small-en-v1.5", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + assert embedder.model_name == "BAAI/bge-small-en-v1.5" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + assert embedder.meta_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + def test_to_dict(self): + """ + Test serialization of FastembedDocumentEmbedder to a dictionary, using default initialization parameters. 
+ """ + embedder = FastembedDocumentEmbedder(model="BAAI/bge-small-en-v1.5") + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_embedder.FastembedDocumentEmbedder", # noqa + "init_parameters": { + "model": "BAAI/bge-small-en-v1.5", + "batch_size": 256, + "progress_bar": True, + "normalize_embeddings": False, + "embedding_separator": "\n", + "meta_fields_to_embed": [], + }, + } + + def test_to_dict_with_custom_init_parameters(self): + """ + Test serialization of FastembedDocumentEmbedder to a dictionary, using custom initialization parameters. + """ + embedder = FastembedDocumentEmbedder( + model="BAAI/bge-small-en-v1.5", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_embedder.FastembedDocumentEmbedder", # noqa + "init_parameters": { + "model": "BAAI/bge-small-en-v1.5", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + "meta_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + + def test_from_dict(self): + """ + Test deserialization of FastembedDocumentEmbedder from a dictionary, using default initialization parameters. 
+ """ + embedder_dict = { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_embedder.FastembedDocumentEmbedder", # noqa + "init_parameters": { + "model": "BAAI/bge-small-en-v1.5", + "batch_size": 256, + "progress_bar": True, + "normalize_embeddings": False, + "meta_fields_to_embed": [], + "embedding_separator": "\n", + }, + } + embedder = FastembedDocumentEmbedder.from_dict(embedder_dict) + assert embedder.model_name == "BAAI/bge-small-en-v1.5" + assert embedder.batch_size == 256 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + assert embedder.meta_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + def test_from_dict_with_custom_init_parameters(self): + """ + Test deserialization of FastembedDocumentEmbedder from a dictionary, using custom initialization parameters. + """ + embedder_dict = { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_embedder.FastembedDocumentEmbedder", # noqa + "init_parameters": { + "model": "BAAI/bge-small-en-v1.5", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + "meta_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + embedder = FastembedDocumentEmbedder.from_dict(embedder_dict) + assert embedder.model_name == "BAAI/bge-small-en-v1.5" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + assert embedder.meta_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + @patch( + "haystack_integrations.components.embedders.fastembed.fastembed_document_embedder._FastembedEmbeddingBackendFactory" + ) + def test_warmup(self, mocked_factory): + """ + Test for checking embedder instances after warm-up. 
+ """ + embedder = FastembedDocumentEmbedder(model="BAAI/bge-small-en-v1.5") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once_with( + model_name="BAAI/bge-small-en-v1.5", + ) + + @patch( + "haystack_integrations.components.embedders.fastembed.fastembed_document_embedder._FastembedEmbeddingBackendFactory" + ) + def test_warmup_does_not_reload(self, mocked_factory): + """ + Test for checking backend instances after multiple warm-ups. + """ + embedder = FastembedDocumentEmbedder(model="BAAI/bge-small-en-v1.5") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once() + + def test_embed(self): + """ + Test for checking output dimensions and embedding dimensions. + """ + embedder = FastembedDocumentEmbedder(model="BAAI/bge-base-en-v1.5") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand( + len(x), 16 + ).tolist() # noqa: ARG005 + + documents = [Document(content=f"Sample-document text {i}") for i in range(5)] + + result = embedder.run(documents=documents) + + assert isinstance(result["documents"], list) + assert len(result["documents"]) == len(documents) + for doc in result["documents"]: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert isinstance(doc.embedding[0], float) + + def test_embed_incorrect_input_format(self): + """ + Test for checking incorrect input format when creating embedding. 
+ """ + embedder = FastembedDocumentEmbedder(model="BAAI/bge-small-en-v1.5") + + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises( + TypeError, + match="FastembedDocumentEmbedder expects a list of Documents as input.", + ): + embedder.run(documents=string_input) + + with pytest.raises( + TypeError, + match="FastembedDocumentEmbedder expects a list of Documents as input.", + ): + embedder.run(documents=list_integers_input) + + def test_embed_metadata(self): + """ + Test for checking output dimensions and embedding dimensions for documents + with a custom instruction and metadata. + """ + embedder = FastembedDocumentEmbedder( + model="model", + meta_fields_to_embed=["meta_field"], + embedding_separator="\n", + ) + embedder.embedding_backend = MagicMock() + + documents = [ + Document( + content=f"document-number {i}", meta={"meta_field": f"meta_value {i}"} + ) + for i in range(5) + ] + + embedder.run(documents=documents) + + embedder.embedding_backend.embed.assert_called_once_with( + [ + "meta_value 0\ndocument-number 0", + "meta_value 1\ndocument-number 1", + "meta_value 2\ndocument-number 2", + "meta_value 3\ndocument-number 3", + "meta_value 4\ndocument-number 4", + ], + batch_size=256, + show_progress_bar=True, + normalize_embeddings=False, + ) + + @pytest.mark.integration + def test_run(self): + embedder = FastembedDocumentEmbedder( + model="BAAI/bge-small-en-v1.5", + ) + embedder.warm_up() + + doc = Document(content="Parton energy loss in QCD matter") + + result = embedder.run(documents=[doc]) + print(result["documents"]) + embedding = result["documents"][0].embedding + print(type(embedding)) + print(type(embedding[0])) + + assert isinstance(embedding, list) + assert len(embedding) == 384 + assert all(isinstance(emb.item(), float) for emb in embedding) diff --git a/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py b/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py index 245a5e033..fc259f929 
100644 --- a/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py +++ b/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py @@ -2,7 +2,9 @@ import numpy as np import pytest -from haystack_integrations.components.embedders.fastembed.fastembed_text_embedder import FastembedTextEmbedder +from haystack_integrations.components.embedders.fastembed.fastembed_text_embedder import ( + FastembedTextEmbedder, +) class TestFastembedTextEmbedder: @@ -13,7 +15,6 @@ def test_init_default(self): embedder = FastembedTextEmbedder(model="BAAI/bge-small-en-v1.5") assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 256 - assert embedder.parallel is None assert embedder.progress_bar is True assert embedder.normalize_embeddings is False @@ -24,13 +25,11 @@ def test_init_with_parameters(self): embedder = FastembedTextEmbedder( model="BAAI/bge-small-en-v1.5", batch_size=64, - parallel=0, progress_bar=False, normalize_embeddings=True, ) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 64 - assert embedder.parallel == 0 assert embedder.progress_bar is False assert embedder.normalize_embeddings is True @@ -45,7 +44,6 @@ def test_to_dict(self): "init_parameters": { "model": "BAAI/bge-small-en-v1.5", "batch_size": 256, - "parallel": None, "progress_bar": True, "normalize_embeddings": False, }, @@ -58,7 +56,6 @@ def test_to_dict_with_custom_init_parameters(self): embedder = FastembedTextEmbedder( model="BAAI/bge-small-en-v1.5", batch_size=64, - parallel=1, progress_bar=False, normalize_embeddings=True, ) @@ -68,7 +65,6 @@ def test_to_dict_with_custom_init_parameters(self): "init_parameters": { "model": "BAAI/bge-small-en-v1.5", "batch_size": 64, - "parallel":1, "progress_bar": False, "normalize_embeddings": True, }, @@ -83,7 +79,6 @@ def test_from_dict(self): "init_parameters": { "model": "BAAI/bge-small-en-v1.5", "batch_size": 256, - "parallel": None, "progress_bar": True, 
"normalize_embeddings": False, }, @@ -91,7 +86,6 @@ def test_from_dict(self): embedder = FastembedTextEmbedder.from_dict(embedder_dict) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 256 - assert embedder.parallel is None assert embedder.progress_bar is True assert embedder.normalize_embeddings is False @@ -104,7 +98,6 @@ def test_from_dict_with_custom_init_parameters(self): "init_parameters": { "model": "BAAI/bge-small-en-v1.5", "batch_size": 64, - "parallel": 1, "progress_bar": False, "normalize_embeddings": True, }, @@ -112,7 +105,6 @@ def test_from_dict_with_custom_init_parameters(self): embedder = FastembedTextEmbedder.from_dict(embedder_dict) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 64 - assert embedder.parallel == 1 assert embedder.progress_bar is False assert embedder.normalize_embeddings is True @@ -149,7 +141,9 @@ def test_embed(self): """ embedder = FastembedTextEmbedder(model="BAAI/bge-base-en-v1.5") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand( + len(x), 16 + ).tolist() # noqa: ARG005 text = "Good text to embed" @@ -168,7 +162,9 @@ def test_run_wrong_incorrect_format(self): list_integers_input = [1, 2, 3] - with pytest.raises(TypeError, match="FastembedTextEmbedder expects a string as input"): + with pytest.raises( + TypeError, match="FastembedTextEmbedder expects a string as input" + ): embedder.run(text=list_integers_input) @pytest.mark.integration @@ -186,4 +182,4 @@ def test_run(self): assert isinstance(embedding, list) assert len(embedding) == 384 - assert all(isinstance(emb.item(), float) for emb in embedding) \ No newline at end of file + assert all(isinstance(emb.item(), float) for emb in embedding) From 551db0c1baacae8041d44c033eea109f06c43ef0 Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 
8 Feb 2024 16:52:07 +0100 Subject: [PATCH 05/49] renamed folder --- integrations/{fastembed-haystack => fastembed}/LICENSE.txt | 0 integrations/{fastembed-haystack => fastembed}/README.md | 0 integrations/{fastembed-haystack => fastembed}/pyproject.toml | 0 .../components/embedders/fastembed/__about__.py | 0 .../components/embedders/fastembed/__init__.py | 0 .../components/embedders/fastembed/embedding_backend/__init__.py | 0 .../embedders/fastembed/embedding_backend/fastembed_backend.py | 0 .../components/embedders/fastembed/fastembed_document_embedder.py | 0 .../components/embedders/fastembed/fastembed_text_embedder.py | 0 integrations/{fastembed-haystack => fastembed}/tests/__init__.py | 0 .../tests/test_fastembed_backend.py | 0 .../tests/test_fastembed_document_embedder.py | 0 .../tests/test_fastembed_text_embedder.py | 0 13 files changed, 0 insertions(+), 0 deletions(-) rename integrations/{fastembed-haystack => fastembed}/LICENSE.txt (100%) rename integrations/{fastembed-haystack => fastembed}/README.md (100%) rename integrations/{fastembed-haystack => fastembed}/pyproject.toml (100%) rename integrations/{fastembed-haystack => fastembed}/src/haystack_integrations/components/embedders/fastembed/__about__.py (100%) rename integrations/{fastembed-haystack => fastembed}/src/haystack_integrations/components/embedders/fastembed/__init__.py (100%) rename integrations/{fastembed-haystack => fastembed}/src/haystack_integrations/components/embedders/fastembed/embedding_backend/__init__.py (100%) rename integrations/{fastembed-haystack => fastembed}/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py (100%) rename integrations/{fastembed-haystack => fastembed}/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py (100%) rename integrations/{fastembed-haystack => fastembed}/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py (100%) rename 
integrations/{fastembed-haystack => fastembed}/tests/__init__.py (100%) rename integrations/{fastembed-haystack => fastembed}/tests/test_fastembed_backend.py (100%) rename integrations/{fastembed-haystack => fastembed}/tests/test_fastembed_document_embedder.py (100%) rename integrations/{fastembed-haystack => fastembed}/tests/test_fastembed_text_embedder.py (100%) diff --git a/integrations/fastembed-haystack/LICENSE.txt b/integrations/fastembed/LICENSE.txt similarity index 100% rename from integrations/fastembed-haystack/LICENSE.txt rename to integrations/fastembed/LICENSE.txt diff --git a/integrations/fastembed-haystack/README.md b/integrations/fastembed/README.md similarity index 100% rename from integrations/fastembed-haystack/README.md rename to integrations/fastembed/README.md diff --git a/integrations/fastembed-haystack/pyproject.toml b/integrations/fastembed/pyproject.toml similarity index 100% rename from integrations/fastembed-haystack/pyproject.toml rename to integrations/fastembed/pyproject.toml diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__about__.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__about__.py similarity index 100% rename from integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__about__.py rename to integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__about__.py diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__init__.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py similarity index 100% rename from integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/__init__.py rename to integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py diff --git 
a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/__init__.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/__init__.py similarity index 100% rename from integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/__init__.py rename to integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/__init__.py diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py similarity index 100% rename from integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py rename to integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py similarity index 100% rename from integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py rename to integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py diff --git a/integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py similarity index 100% rename from integrations/fastembed-haystack/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py rename to 
integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py diff --git a/integrations/fastembed-haystack/tests/__init__.py b/integrations/fastembed/tests/__init__.py similarity index 100% rename from integrations/fastembed-haystack/tests/__init__.py rename to integrations/fastembed/tests/__init__.py diff --git a/integrations/fastembed-haystack/tests/test_fastembed_backend.py b/integrations/fastembed/tests/test_fastembed_backend.py similarity index 100% rename from integrations/fastembed-haystack/tests/test_fastembed_backend.py rename to integrations/fastembed/tests/test_fastembed_backend.py diff --git a/integrations/fastembed-haystack/tests/test_fastembed_document_embedder.py b/integrations/fastembed/tests/test_fastembed_document_embedder.py similarity index 100% rename from integrations/fastembed-haystack/tests/test_fastembed_document_embedder.py rename to integrations/fastembed/tests/test_fastembed_document_embedder.py diff --git a/integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py b/integrations/fastembed/tests/test_fastembed_text_embedder.py similarity index 100% rename from integrations/fastembed-haystack/tests/test_fastembed_text_embedder.py rename to integrations/fastembed/tests/test_fastembed_text_embedder.py From a0c7a6ef1aa34a86b3c4da3239d7ac3d148ec132 Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 17:13:46 +0100 Subject: [PATCH 06/49] removed print --- .../fastembed/tests/test_fastembed_document_embedder.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/integrations/fastembed/tests/test_fastembed_document_embedder.py b/integrations/fastembed/tests/test_fastembed_document_embedder.py index 590f30c21..b0583c74b 100644 --- a/integrations/fastembed/tests/test_fastembed_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_document_embedder.py @@ -242,10 +242,7 @@ def test_run(self): doc = Document(content="Parton energy loss in QCD matter") result = 
embedder.run(documents=[doc]) - print(result["documents"]) embedding = result["documents"][0].embedding - print(type(embedding)) - print(type(embedding[0])) assert isinstance(embedding, list) assert len(embedding) == 384 From 88a92822874eb02b7a4899c6baed32feba0cd253 Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 17:13:54 +0100 Subject: [PATCH 07/49] updated readme --- integrations/fastembed/README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/integrations/fastembed/README.md b/integrations/fastembed/README.md index 90701af0f..fc442ece4 100644 --- a/integrations/fastembed/README.md +++ b/integrations/fastembed/README.md @@ -16,6 +16,32 @@ pip install fastembed-haystack ``` +## Usage + +You can use `FastembedTextEmbedder` and `FastembedDocumentEmbedder` by importing as: + +```python +from haystack_integrations.components.embedders.fastembed.fastembed_text_embedder import FastembedTextEmbedder + +text = "fastembed is supported by and maintained by Qdrant." +text_embedder = FastembedTextEmbedder( + model="BAAI/bge-small-en-v1.5" +) +embedding = text_embedder.run(text) +``` + +```python +from haystack_integrations.components.embedders.fastembed.fastembed_document_embedder import FastembedDocumentEmbedder +from haystack.dataclasses import Document + +embedder = FastembedDocumentEmbedder( + model="BAAI/bge-small-en-v1.5", +) +embedder.warm_up() +doc = Document(content="fastembed is supported by and maintained by Qdrant.", meta={"long_answer": "no",}) +result = embedder.run(documents=[doc]) +``` + ## License `fastembed-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
From a73146fe31757b5755067d004710cd9220f559fb Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 17:48:01 +0100 Subject: [PATCH 08/49] added fastembed.yml --- .github/workflows/fastembed.yml | 38 +++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/workflows/fastembed.yml diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml new file mode 100644 index 000000000..6396fbeb3 --- /dev/null +++ b/.github/workflows/fastembed.yml @@ -0,0 +1,38 @@ +name: Test / instructor-embedders + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/fastembed/**" + - ".github/workflows/fastembed.yml" + +defaults: + run: + working-directory: integrations/fastembed + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install Hatch + run: pip install --upgrade hatch + + - name: Lint + run: hatch run lint:all + + - name: Generate docs + if: runner.os == 'Linux' + run: hatch run docs + + - name: Run tests + run: hatch run cov \ No newline at end of file From 54565b75593f0f0bf4e2debcd1f5077708ceca73 Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 17:49:33 +0100 Subject: [PATCH 09/49] fix typos --- .github/workflows/fastembed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index 6396fbeb3..e82e40856 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -1,4 +1,4 @@ -name: Test / instructor-embedders +name: Test / fastembed on: schedule: From f68a3105712fb42563a82361d6e1ea1b944f823b Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 18:00:33 +0100 Subject: [PATCH 10/49] python version to 3.9 for lint --- .github/workflows/fastembed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index e82e40856..811f8d008 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -22,7 +22,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: "3.10" + python-version: '3.9' - name: Install Hatch run: pip install --upgrade hatch From cbba970a1f4afbecfe6702fc7f008fe4cf5c9ca8 Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 18:03:12 +0100 Subject: [PATCH 11/49] updated file --- .github/workflows/fastembed.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index 811f8d008..a3d9339b4 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -22,13 +22,15 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.10' - name: Install Hatch run: pip install --upgrade hatch - name: Lint run: hatch run lint:all + env: + NODE_ENV: lint - name: Generate docs if: runner.os == 'Linux' From 2a08cbef10e5e9e2116707749e93b3c93708174e Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 18:06:12 +0100 Subject: [PATCH 12/49] force install black --- .github/workflows/fastembed.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index a3d9339b4..4f853c00f 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -27,10 +27,11 @@ jobs: - name: Install Hatch run: pip install --upgrade hatch + - name: Install Black + run: pip install --upgrade black + - name: Lint run: hatch run lint:all - env: - NODE_ENV: lint - name: Generate docs if: runner.os == 'Linux' From 2fccaf69d29f9bd0b6456cdb3e4cc8e826ee7fab Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 18:07:43 +0100 Subject: [PATCH 13/49] return to original file --- .github/workflows/fastembed.yml | 3 --- 
1 file changed, 3 deletions(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index 4f853c00f..2648f6fce 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -27,9 +27,6 @@ jobs: - name: Install Hatch run: pip install --upgrade hatch - - name: Install Black - run: pip install --upgrade black - - name: Lint run: hatch run lint:all From afd464578f5511859263d9d01f2820fdf94f8c16 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Thu, 8 Feb 2024 18:15:58 +0100 Subject: [PATCH 14/49] try to fix workflow --- integrations/fastembed/pyproject.toml | 117 ++++++++++++++++++++++---- 1 file changed, 102 insertions(+), 15 deletions(-) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 995d25a57..a14c46634 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "fastembed-haystack" dynamic = ["version"] -description = 'support for fastembed ' +description = 'An integration of fastembed with Haystack ' readme = "README.md" requires-python = ">=3.8" license = "Apache-2.0" @@ -30,17 +30,16 @@ dependencies = [ ] [project.urls] -Documentation = "https://github.com/unknown/fastembed-haystack#readme" -Issues = "https://github.com/unknown/fastembed-haystack/issues" -Source = "https://github.com/unknown/fastembed-haystack" - -[tool.hatch.version] -path = "src/fastembed_haystack/__about__.py" +Source = "https://github.com/deepset-ai/haystack-core-integrations" +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/fastembed/README.md" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" [tool.hatch.envs.default] dependencies = [ "coverage[toml]>=6.5", "pytest", + "ipython", + "haystack-pydoc-tools", ] [tool.hatch.envs.default.scripts] test = "pytest {args:tests}" @@ -53,27 +52,106 @@ cov = [ "test-cov", 
"cov-report", ] +docs = [ + "pydoc-markdown pydoc/config.yml" +] [[tool.hatch.envs.all.matrix]] python = ["3.8", "3.9", "3.10", "3.11", "3.12"] -[tool.hatch.envs.types] +[tool.hatch.envs.lint] +detached = true dependencies = [ + "black>=23.1.0", "mypy>=1.0.0", + "ruff>=0.0.243", +] +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" +style = [ + "ruff {args:.}", + "black --check --diff {args:.}", +] +fmt = [ + "black {args:.}", + "ruff --fix {args:.}", + "style", +] +all = [ + "style", + "typing", ] -[tool.hatch.envs.types.scripts] -check = "mypy --install-types --non-interactive {args:src/fastembed_haystack tests}" + +[tool.black] +target-version = ["py37"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py37" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... 
True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["src"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] +# examples can contain "print" commands +"examples/**/*" = ["T201"] [tool.coverage.run] -source_pkgs = ["fastembed_haystack", "tests"] +source_pkgs = ["src", "tests"] branch = true parallel = true -omit = [ - "src/fastembed_haystack/__about__.py", -] + [tool.coverage.paths] -fastembed_haystack = ["src/fastembed_haystack", "*/fastembed-haystack/src/fastembed_haystack"] +fastembed_haystack = ["src/haystack_integrations", "*/fastembed-haystack/src"] tests = ["tests", "*/fastembed-haystack/tests"] [tool.coverage.report] @@ -82,3 +160,12 @@ exclude_lines = [ "if __name__ == .__main__.:", "if TYPE_CHECKING:", ] + +[[tool.mypy.overrides]] +module = [ + "haystack.*", + "haystack_integrations.*", + "fastembed.*", + "pytest.*" +] +ignore_missing_imports = true From 2e01f32d3cda6d70a5513ebafe29b4ec5e12a2ea Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Thu, 8 Feb 2024 18:19:27 +0100 Subject: [PATCH 15/49] retry --- .github/workflows/fastembed.yml | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index 2648f6fce..fe736029a 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -13,26 +13,33 @@ defaults: working-directory: integrations/fastembed jobs: - test: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Python + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') 
&& 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.9","3.10","3.11"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: ${{ matrix.python-version }} - name: Install Hatch run: pip install --upgrade hatch - name: Lint + if: matrix.python-version == '3.9' run: hatch run lint:all - name: Generate docs - if: runner.os == 'Linux' - run: hatch run docs + if: matrix.python-version == '3.9' && runner.os == 'Linux' + run: hatch run docs - name: Run tests - run: hatch run cov \ No newline at end of file + run: hatch run cov From d357330ac57349b92497ba37212cc8be38e293da Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Thu, 8 Feb 2024 18:22:21 +0100 Subject: [PATCH 16/49] add missing info to pyproject --- integrations/fastembed/pyproject.toml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index a14c46634..b8416584c 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -34,6 +34,17 @@ Source = "https://github.com/deepset-ai/haystack-core-integrations" Documentation = "https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/fastembed/README.md" Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/fastembed-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.."
+git_describe_command = 'git describe --tags --match="integrations/fastembed-v[0-9]*"' + [tool.hatch.envs.default] dependencies = [ "coverage[toml]>=6.5", From 174c51b0cba8ef57b45487a9b94c12344ab554c6 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Thu, 8 Feb 2024 18:24:05 +0100 Subject: [PATCH 17/49] add hatch-vcs to check version --- integrations/fastembed/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index b8416584c..8b87dc380 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["hatchling"] +requires = ["hatchling", "hatch-vcs"] build-backend = "hatchling.build" [project] From c9eadfa08965481e30417621d1c88a292062b635 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Thu, 8 Feb 2024 18:26:23 +0100 Subject: [PATCH 18/49] Update pyproject.toml --- integrations/fastembed/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 8b87dc380..1352e0d82 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "fastembed-haystack" dynamic = ["version"] -description = 'An integration of fastembed with Haystack ' +description = "An integration of fastembed with Haystack" readme = "README.md" requires-python = ">=3.8" license = "Apache-2.0" From bba17ae0d7bcfcfe49bc562fe60cca64b9ad5a6c Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 18:36:15 +0100 Subject: [PATCH 19/49] fixed typos --- .../embedders/fastembed/fastembed_document_embedder.py | 2 +- .../components/embedders/fastembed/fastembed_text_embedder.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py index 0bf658b6f..e140e01a5 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py @@ -48,7 +48,7 @@ class FastembedDocumentEmbedder: print(f"Document Text: {result['documents'][0].content}") print(f"Document Embedding: {result['documents'][0].embedding}") print(f"Embedding Dimension: {len(result['documents'][0].embedding)}") - """ # noqa: E501 + """ def __init__( self, diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py index 063e8348a..b9b1f69c0 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py @@ -25,7 +25,7 @@ class FastembedTextEmbedder: embedding = text_embedder.run(text) ``` - """ # noqa: E501 + """ def __init__( self, @@ -43,7 +43,7 @@ def __init__( :param normalize_embeddings: If set to true, returned vectors will have the length of 1. 
""" - # TOD0 add parallel + # TODO add parallel self.model_name = model self.batch_size = batch_size From d91f4492701aa559eeb7215648e66efd136b6f72 Mon Sep 17 00:00:00 2001 From: Nico Date: Thu, 8 Feb 2024 18:42:14 +0100 Subject: [PATCH 20/49] removed python 3.9 --- .github/workflows/fastembed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index fe736029a..ad87ccd63 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -20,7 +20,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.9","3.10","3.11"] + python-version: ["3.10","3.11"] steps: - uses: actions/checkout@v4 From eee09c3ef0ae8e460482b1d4ea93062880a7ac9c Mon Sep 17 00:00:00 2001 From: Nicola Procopio Date: Fri, 9 Feb 2024 09:14:43 +0100 Subject: [PATCH 21/49] Update fastembed.yml --- .github/workflows/fastembed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index ad87ccd63..fe736029a 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -20,7 +20,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.10","3.11"] + python-version: ["3.9","3.10","3.11"] steps: - uses: actions/checkout@v4 From 2355aa30e02149cd6794c3ed4dcb97a763031f30 Mon Sep 17 00:00:00 2001 From: Nicola Procopio Date: Fri, 9 Feb 2024 09:25:58 +0100 Subject: [PATCH 22/49] Update fastembed_document_embedder.py --- .../embedders/fastembed/fastembed_document_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py index e140e01a5..0bf658b6f 100644 --- 
a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py @@ -48,7 +48,7 @@ class FastembedDocumentEmbedder: print(f"Document Text: {result['documents'][0].content}") print(f"Document Embedding: {result['documents'][0].embedding}") print(f"Embedding Dimension: {len(result['documents'][0].embedding)}") - """ + """ # noqa: E501 def __init__( self, From c31bb3b5f0289fb412f25a535b4d494e5a6408a9 Mon Sep 17 00:00:00 2001 From: Nicola Procopio Date: Fri, 9 Feb 2024 09:26:35 +0100 Subject: [PATCH 23/49] Update fastembed_text_embedder.py --- .../components/embedders/fastembed/fastembed_text_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py index b9b1f69c0..6c2323df4 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py @@ -25,7 +25,7 @@ class FastembedTextEmbedder: embedding = text_embedder.run(text) ``` - """ + """ # noqa: E501 def __init__( self, From ba5cb2803919e034788617ffe8ee509a03d8d4e9 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Fri, 9 Feb 2024 10:26:41 +0100 Subject: [PATCH 24/49] ignore errors for bool arguments --- integrations/fastembed/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 1352e0d82..2caae3a69 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -133,6 +133,7 @@ ignore = [ "B027", # Allow boolean positional values 
in function calls, like `dict.get(... True)` "FBT003", + "FBT001", "FBT002" # Ignore checks for possible passwords "S105", "S106", "S107", # Ignore complexity From 3e0c1fea831db63a1dc9b289baa82775291ef3a9 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Fri, 9 Feb 2024 10:28:14 +0100 Subject: [PATCH 25/49] fix --- integrations/fastembed/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 2caae3a69..1408190f6 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -133,7 +133,7 @@ ignore = [ "B027", # Allow boolean positional values in function calls, like `dict.get(... True)` "FBT003", - "FBT001", "FBT002" + "FBT001", "FBT002", # Ignore checks for possible passwords "S105", "S106", "S107", # Ignore complexity From abe8a9757d2d54cece7892ea6039e2e57d4afaca Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Fri, 9 Feb 2024 10:56:47 +0100 Subject: [PATCH 26/49] try moving noqa --- .../fastembed/tests/test_fastembed_document_embedder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/fastembed/tests/test_fastembed_document_embedder.py b/integrations/fastembed/tests/test_fastembed_document_embedder.py index b0583c74b..80c889b4f 100644 --- a/integrations/fastembed/tests/test_fastembed_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_document_embedder.py @@ -162,9 +162,9 @@ def test_embed(self): """ embedder = FastembedDocumentEmbedder(model="BAAI/bge-base-en-v1.5") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand( + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand( # noqa: ARG005 len(x), 16 - ).tolist() # noqa: ARG005 + ).tolist() documents = [Document(content=f"Sample-document text {i}") for i in range(5)] From 27a339cf1f34aca7fbe40772f62eb53821a31123 Mon Sep 17 00:00:00 
2001 From: Stefano Fiorucci Date: Fri, 9 Feb 2024 10:58:12 +0100 Subject: [PATCH 27/49] move noqa --- integrations/fastembed/tests/test_fastembed_text_embedder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/fastembed/tests/test_fastembed_text_embedder.py b/integrations/fastembed/tests/test_fastembed_text_embedder.py index fc259f929..c30954f87 100644 --- a/integrations/fastembed/tests/test_fastembed_text_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_text_embedder.py @@ -141,9 +141,9 @@ def test_embed(self): """ embedder = FastembedTextEmbedder(model="BAAI/bge-base-en-v1.5") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand( + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand( # noqa: ARG005 len(x), 16 - ).tolist() # noqa: ARG005 + ).tolist() text = "Good text to embed" From 2d5ad0a8537b2b8ab0cfa3951f6b78b503ecefc9 Mon Sep 17 00:00:00 2001 From: Nico Date: Fri, 9 Feb 2024 19:33:13 +0100 Subject: [PATCH 28/49] formatted with black --- .../embedding_backend/fastembed_backend.py | 4 +--- .../fastembed/fastembed_document_embedder.py | 20 ++++------------ .../fastembed/fastembed_text_embedder.py | 6 +---- .../fastembed/tests/test_fastembed_backend.py | 24 +++++-------------- .../tests/test_fastembed_document_embedder.py | 11 ++------- .../tests/test_fastembed_text_embedder.py | 12 +++------- 6 files changed, 17 insertions(+), 60 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index bf7313103..2b6fc3f38 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ 
b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -22,9 +22,7 @@ def get_embedding_backend( embedding_backend = _FastembedEmbeddingBackend( model_name=model_name, ) - _FastembedEmbeddingBackendFactory._instances[embedding_backend_id] = ( - embedding_backend - ) + _FastembedEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend return embedding_backend diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py index 0bf658b6f..63776f48c 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py @@ -104,11 +104,7 @@ def warm_up(self): Load the embedding backend. """ if not hasattr(self, "embedding_backend"): - self.embedding_backend = ( - _FastembedEmbeddingBackendFactory.get_embedding_backend( - model_name=self.model_name - ) - ) + self.embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend(model_name=self.model_name) @component.output_types(documents=List[Document]) def run(self, documents: List[Document]): @@ -116,11 +112,7 @@ def run(self, documents: List[Document]): Embed a list of Documents. The embedding of each Document is stored in the `embedding` field of the Document. """ - if ( - not isinstance(documents, list) - or documents - and not isinstance(documents[0], Document) - ): + if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): msg = ( "FastembedDocumentEmbedder expects a list of Documents as input. " "In case you want to embed a list of strings, please use the FastembedTextEmbedder." 
@@ -135,14 +127,10 @@ def run(self, documents: List[Document]): texts_to_embed = [] for doc in documents: meta_values_to_embed = [ - str(doc.meta[key]) - for key in self.meta_fields_to_embed - if key in doc.meta and doc.meta[key] is not None + str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None ] text_to_embed = [ - self.embedding_separator.join( - [*meta_values_to_embed, doc.content or ""] - ), + self.embedding_separator.join([*meta_values_to_embed, doc.content or ""]), ] texts_to_embed.append(text_to_embed[0]) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py index 6c2323df4..986ab1cd1 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py @@ -74,11 +74,7 @@ def warm_up(self): Load the embedding backend. 
""" if not hasattr(self, "embedding_backend"): - self.embedding_backend = ( - _FastembedEmbeddingBackendFactory.get_embedding_backend( - model_name=self.model_name - ) - ) + self.embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend(model_name=self.model_name) @component.output_types(embedding=List[float]) def run(self, text: str): diff --git a/integrations/fastembed/tests/test_fastembed_backend.py b/integrations/fastembed/tests/test_fastembed_backend.py index 735301eb4..c564c72bf 100644 --- a/integrations/fastembed/tests/test_fastembed_backend.py +++ b/integrations/fastembed/tests/test_fastembed_backend.py @@ -5,16 +5,10 @@ ) -@patch( - "haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend.TextEmbedding" -) +@patch("haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend.TextEmbedding") def test_factory_behavior(mock_instructor): # noqa: ARG001 - embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( - model_name="BAAI/bge-small-en-v1.5" - ) - same_embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( - "BAAI/bge-small-en-v1.5" - ) + embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend(model_name="BAAI/bge-small-en-v1.5") + same_embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend("BAAI/bge-small-en-v1.5") another_embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( model_name="BAAI/bge-base-en-v1.5" ) @@ -26,9 +20,7 @@ def test_factory_behavior(mock_instructor): # noqa: ARG001 _FastembedEmbeddingBackendFactory._instances = {} -@patch( - "haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend.TextEmbedding" -) +@patch("haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend.TextEmbedding") def test_model_initialization(mock_instructor): _FastembedEmbeddingBackendFactory.get_embedding_backend( 
model_name="BAAI/bge-small-en-v1.5", @@ -40,13 +32,9 @@ def test_model_initialization(mock_instructor): _FastembedEmbeddingBackendFactory._instances = {} -@patch( - "haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend.TextEmbedding" -) +@patch("haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend.TextEmbedding") def test_embedding_function_with_kwargs(mock_instructor): # noqa: ARG001 - embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( - model_name="BAAI/bge-small-en-v1.5" - ) + embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend(model_name="BAAI/bge-small-en-v1.5") data = ["sentence1", "sentence2"] embedding_backend.embed(data=data) diff --git a/integrations/fastembed/tests/test_fastembed_document_embedder.py b/integrations/fastembed/tests/test_fastembed_document_embedder.py index 80c889b4f..a387db797 100644 --- a/integrations/fastembed/tests/test_fastembed_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_document_embedder.py @@ -162,9 +162,7 @@ def test_embed(self): """ embedder = FastembedDocumentEmbedder(model="BAAI/bge-base-en-v1.5") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand( # noqa: ARG005 - len(x), 16 - ).tolist() + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 documents = [Document(content=f"Sample-document text {i}") for i in range(5)] @@ -210,12 +208,7 @@ def test_embed_metadata(self): ) embedder.embedding_backend = MagicMock() - documents = [ - Document( - content=f"document-number {i}", meta={"meta_field": f"meta_value {i}"} - ) - for i in range(5) - ] + documents = [Document(content=f"document-number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)] embedder.run(documents=documents) diff --git a/integrations/fastembed/tests/test_fastembed_text_embedder.py 
b/integrations/fastembed/tests/test_fastembed_text_embedder.py index c30954f87..ee9bfd3da 100644 --- a/integrations/fastembed/tests/test_fastembed_text_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_text_embedder.py @@ -118,9 +118,7 @@ def test_warmup(self, mocked_factory): embedder = FastembedTextEmbedder(model="BAAI/bge-small-en-v1.5") mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() - mocked_factory.get_embedding_backend.assert_called_once_with( - model_name="BAAI/bge-small-en-v1.5" - ) + mocked_factory.get_embedding_backend.assert_called_once_with(model_name="BAAI/bge-small-en-v1.5") @patch( "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder._FastembedEmbeddingBackendFactory" @@ -141,9 +139,7 @@ def test_embed(self): """ embedder = FastembedTextEmbedder(model="BAAI/bge-base-en-v1.5") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand( # noqa: ARG005 - len(x), 16 - ).tolist() + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 text = "Good text to embed" @@ -162,9 +158,7 @@ def test_run_wrong_incorrect_format(self): list_integers_input = [1, 2, 3] - with pytest.raises( - TypeError, match="FastembedTextEmbedder expects a string as input" - ): + with pytest.raises(TypeError, match="FastembedTextEmbedder expects a string as input"): embedder.run(text=list_integers_input) @pytest.mark.integration From 79a5f9fb9c65b7b1fa7966159843db24d6376d6a Mon Sep 17 00:00:00 2001 From: Nico Date: Sat, 10 Feb 2024 10:18:42 +0100 Subject: [PATCH 29/49] added numpy dependency --- integrations/fastembed/pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 1408190f6..462bcb826 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -5,7 +5,7 @@ 
build-backend = "hatchling.build" [project] name = "fastembed-haystack" dynamic = ["version"] -description = "An integration of fastembed with Haystack" +description = "Haystack 2.x component to embed strings and Documents using fastembed embedding model" readme = "README.md" requires-python = ">=3.8" license = "Apache-2.0" @@ -50,7 +50,7 @@ dependencies = [ "coverage[toml]>=6.5", "pytest", "ipython", - "haystack-pydoc-tools", + "haystack-pydoc-tools", ] [tool.hatch.envs.default.scripts] test = "pytest {args:tests}" @@ -76,6 +76,7 @@ dependencies = [ "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243", + "numpy", ] [tool.hatch.envs.lint.scripts] typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" From fb2bd0551a9ad024bab783f7e309af376f81e760 Mon Sep 17 00:00:00 2001 From: Nico Date: Sat, 10 Feb 2024 10:21:34 +0100 Subject: [PATCH 30/49] removed numpy --- integrations/fastembed/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 1408190f6..0c0f1b5b3 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "fastembed-haystack" dynamic = ["version"] -description = "An integration of fastembed with Haystack" +description = "Haystack 2.x component to embed strings and Documents using fastembed embedding model" readme = "README.md" requires-python = ">=3.8" license = "Apache-2.0" @@ -50,7 +50,7 @@ dependencies = [ "coverage[toml]>=6.5", "pytest", "ipython", - "haystack-pydoc-tools", + "haystack-pydoc-tools", ] [tool.hatch.envs.default.scripts] test = "pytest {args:tests}" From 690659c3224d7cc9b8cb920644143b07ad4a09fc Mon Sep 17 00:00:00 2001 From: Nico Date: Sat, 10 Feb 2024 10:26:22 +0100 Subject: [PATCH 31/49] removed numpy --- integrations/fastembed/pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git 
a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 462bcb826..0c0f1b5b3 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -76,7 +76,6 @@ dependencies = [ "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243", - "numpy", ] [tool.hatch.envs.lint.scripts] typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" From 4bbb169f10bcb98591d4e06d4f9996598b736a02 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Sat, 10 Feb 2024 10:26:47 +0100 Subject: [PATCH 32/49] make mypy happy --- .../embedders/fastembed/embedding_backend/fastembed_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index 2b6fc3f38..3f6ae760e 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -8,7 +8,7 @@ class _FastembedEmbeddingBackendFactory: Factory class to create instances of fastembed embedding backends. 
""" - _instances: ClassVar[Dict[str, "_FastembedEmbeddingBackendFactory"]] = {} + _instances: Dict[str, "_FastembedEmbeddingBackend"] = {} @staticmethod def get_embedding_backend( From d9ae567db8ed418d6c1acc44d3fd7358f9fcfde7 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Sat, 10 Feb 2024 10:28:17 +0100 Subject: [PATCH 33/49] Update fastembed_backend.py --- .../embedders/fastembed/embedding_backend/fastembed_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index 3f6ae760e..392f9d32d 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -8,7 +8,7 @@ class _FastembedEmbeddingBackendFactory: Factory class to create instances of fastembed embedding backends. 
""" - _instances: Dict[str, "_FastembedEmbeddingBackend"] = {} + _instances: ClassVar[Dict[str, "_FastembedEmbeddingBackend"]] = {} @staticmethod def get_embedding_backend( From 87e3fe71b45d04542c7a234ccf6623fe44667bd2 Mon Sep 17 00:00:00 2001 From: Nico Date: Sat, 10 Feb 2024 10:29:05 +0100 Subject: [PATCH 34/49] removed classvar --- .../embedders/fastembed/embedding_backend/fastembed_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index 392f9d32d..4a76a0a9e 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -1,4 +1,4 @@ -from typing import ClassVar, Dict, List +from typing import Dict, List from fastembed import TextEmbedding From 668572c25709a1caf0e7981ce07c798da020313d Mon Sep 17 00:00:00 2001 From: Nico Date: Sat, 10 Feb 2024 10:32:00 +0100 Subject: [PATCH 35/49] fix --- .../embedders/fastembed/embedding_backend/fastembed_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index 4a76a0a9e..392f9d32d 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -1,4 +1,4 @@ -from typing import Dict, List +from typing import ClassVar, 
Dict, List from fastembed import TextEmbedding From 206b842de7ed612a079f51b10fbd98eea1d61ac3 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Sat, 10 Feb 2024 10:33:32 +0100 Subject: [PATCH 36/49] Update pyproject.toml --- integrations/fastembed/pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 0c0f1b5b3..d971fce09 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -178,6 +178,7 @@ module = [ "haystack.*", "haystack_integrations.*", "fastembed.*", - "pytest.*" + "pytest.*", + "numpy.*" ] ignore_missing_imports = true From 5c144d6d2dd8949773a93f8c7165509fc4393787 Mon Sep 17 00:00:00 2001 From: Nico Date: Sat, 10 Feb 2024 10:33:46 +0100 Subject: [PATCH 37/49] added import numpy lint --- integrations/fastembed/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 0c0f1b5b3..7f17b65f1 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -76,6 +76,7 @@ dependencies = [ "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243", + "numpy" ] [tool.hatch.envs.lint.scripts] typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" From 070f04d02614afc29a97a3d1006645e0406ff8a7 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Sat, 10 Feb 2024 10:36:55 +0100 Subject: [PATCH 38/49] skip docs generation for the time being --- .github/workflows/fastembed.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index fe736029a..b3a97d4a2 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -37,9 +37,10 @@ jobs: if: matrix.python-version == '3.9' run: hatch run lint:all - - name: Generate docs - if: matrix.python-version == '3.9' && 
runner.os == 'Linux' - run: hatch run docs +# TODO: Add docs config and uncomment the following section +# - name: Generate docs +# if: matrix.python-version == '3.9' && runner.os == 'Linux' +# run: hatch run docs - name: Run tests run: hatch run cov From 2842ff9ed51823abce0d2ee940440eb921528827 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Sat, 10 Feb 2024 10:38:08 +0100 Subject: [PATCH 39/49] Update README.md --- integrations/fastembed/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/README.md b/integrations/fastembed/README.md index fc442ece4..94a86d85e 100644 --- a/integrations/fastembed/README.md +++ b/integrations/fastembed/README.md @@ -18,7 +18,7 @@ pip install fastembed-haystack ## Usage -You can use `FastembedTextEmbedder` and `JinaDocumentEmbedder` by importing as: +You can use `FastembedTextEmbedder` and `FastembedDocumentEmbedder` by importing as: ```python from fastembed_haystack.fastembed_text_embedder import FastembedTextEmbedder From a7bf3085946d5a261f41898f1b2f14ca84924fa2 Mon Sep 17 00:00:00 2001 From: Nico Date: Sat, 10 Feb 2024 10:38:45 +0100 Subject: [PATCH 40/49] added config.yml --- integrations/fastembed/pydoc/config.yml | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 integrations/fastembed/pydoc/config.yml diff --git a/integrations/fastembed/pydoc/config.yml b/integrations/fastembed/pydoc/config.yml new file mode 100644 index 000000000..902ed5b6f --- /dev/null +++ b/integrations/fastembed/pydoc/config.yml @@ -0,0 +1,30 @@ +loaders: + - type: haystack_pydoc_tools.loaders.CustomPythonLoader + search_path: [../src] + modules: + [ + "haystack_integrations.components.embedders.fastembed", + "haystack_integrations.components.embedders.fastembed.embedding_backend", + ] + ignore_when_discovered: ["__init__"] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - 
type: crossref +renderer: + type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer + excerpt: Embedders integration for Haystack + category_slug: haystack-integrations + title: Embedders + slug: fastembed-embedders + order: 90 + markdown: + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: _readme_fastembed.md From 673e3e7862b5867f39f2e6112ceb94f05e35173a Mon Sep 17 00:00:00 2001 From: Nico Date: Sat, 10 Feb 2024 10:57:54 +0100 Subject: [PATCH 41/49] generate docs --- .github/workflows/fastembed.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index b3a97d4a2..802d2f754 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -38,9 +38,9 @@ jobs: run: hatch run lint:all # TODO: Add docs config and uncomment the following section -# - name: Generate docs -# if: matrix.python-version == '3.9' && runner.os == 'Linux' -# run: hatch run docs + - name: Generate docs + if: matrix.python-version == '3.9' && runner.os == 'Linux' + run: hatch run docs - name: Run tests run: hatch run cov From e73d719b21748d4d56ff2a03039eefb39b300a49 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Sat, 10 Feb 2024 11:03:37 +0100 Subject: [PATCH 42/49] Update fastembed.yml --- .github/workflows/fastembed.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index 802d2f754..fe736029a 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -37,10 +37,9 @@ jobs: if: matrix.python-version == '3.9' run: hatch run lint:all -# TODO: Add docs config and uncomment the following section - - name: Generate docs - if: matrix.python-version == '3.9' && runner.os == 'Linux' - run: hatch run docs + - name: Generate docs + if: matrix.python-version == '3.9' && runner.os == 
'Linux' + run: hatch run docs - name: Run tests run: hatch run cov From 9d79e1c4bdd19ef183ee21a65c1bfad4b9aff20e Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Sat, 10 Feb 2024 11:05:40 +0100 Subject: [PATCH 43/49] Update config.yml --- integrations/fastembed/pydoc/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/pydoc/config.yml b/integrations/fastembed/pydoc/config.yml index 902ed5b6f..3e491eac2 100644 --- a/integrations/fastembed/pydoc/config.yml +++ b/integrations/fastembed/pydoc/config.yml @@ -21,7 +21,7 @@ renderer: category_slug: haystack-integrations title: Embedders slug: fastembed-embedders - order: 90 + order: 300 markdown: descriptive_class_title: false descriptive_module_title: true From 4d461619f140dc4d5069e28dbe5c8070f874fe84 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Sat, 10 Feb 2024 11:07:56 +0100 Subject: [PATCH 44/49] rm unnecessary from_dict --- .../embedders/fastembed/fastembed_document_embedder.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py index 63776f48c..8c236d8dc 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py @@ -92,13 +92,6 @@ def to_dict(self) -> Dict[str, Any]: embedding_separator=self.embedding_separator, ) - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "FastembedDocumentEmbedder": - """ - Deserialize this component from a dictionary. - """ - return default_from_dict(cls, data) - def warm_up(self): """ Load the embedding backend. 
From c5980d395ac4ed4159784b78c8d571353519572b Mon Sep 17 00:00:00 2001 From: anakin87 Date: Sat, 10 Feb 2024 11:25:31 +0100 Subject: [PATCH 45/49] final touch --- .../components/embedders/fastembed/__about__.py | 4 ---- .../fastembed/fastembed_document_embedder.py | 7 +------ .../fastembed/fastembed_text_embedder.py | 14 +------------- .../tests/test_fastembed_document_embedder.py | 17 +++-------------- .../tests/test_fastembed_text_embedder.py | 16 +++------------- 5 files changed, 8 insertions(+), 50 deletions(-) delete mode 100644 integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__about__.py diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__about__.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__about__.py deleted file mode 100644 index dc11cd317..000000000 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__about__.py +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-FileCopyrightText: 2024-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 -__version__ = "0.0.1" diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py index 8c236d8dc..24da783fd 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py @@ -1,6 +1,6 @@ from typing import Any, Dict, List, Optional -from haystack import Document, component, default_from_dict, default_to_dict +from haystack import Document, component, default_to_dict from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackendFactory @@ -55,7 +55,6 @@ def __init__( model: str = "BAAI/bge-small-en-v1.5", 
batch_size: int = 256, progress_bar: bool = True, - normalize_embeddings: bool = False, meta_fields_to_embed: Optional[List[str]] = None, embedding_separator: str = "\n", ): @@ -66,7 +65,6 @@ def __init__( such as ``'BAAI/bge-small-en-v1.5'``. :param batch_size: Number of strings to encode at once. :param progress_bar: If true, displays progress bar during embedding. - :param normalize_embeddings: If set to true, returned vectors will have the length of 1. :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document content. :param embedding_separator: Separator used to concatenate the meta fields to the Document content. """ @@ -74,7 +72,6 @@ def __init__( self.model_name = model self.batch_size = batch_size self.progress_bar = progress_bar - self.normalize_embeddings = normalize_embeddings self.meta_fields_to_embed = meta_fields_to_embed or [] self.embedding_separator = embedding_separator @@ -87,7 +84,6 @@ def to_dict(self) -> Dict[str, Any]: model=self.model_name, batch_size=self.batch_size, progress_bar=self.progress_bar, - normalize_embeddings=self.normalize_embeddings, meta_fields_to_embed=self.meta_fields_to_embed, embedding_separator=self.embedding_separator, ) @@ -131,7 +127,6 @@ def run(self, documents: List[Document]): texts_to_embed, batch_size=self.batch_size, show_progress_bar=self.progress_bar, - normalize_embeddings=self.normalize_embeddings, ) for doc, emb in zip(documents, embeddings): diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py index 986ab1cd1..832d1240f 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py @@ -1,6 +1,6 @@ from typing import Any, Dict, 
List -from haystack import component, default_from_dict, default_to_dict +from haystack import component, default_to_dict from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackendFactory @@ -32,7 +32,6 @@ def __init__( model: str = "BAAI/bge-small-en-v1.5", batch_size: int = 256, progress_bar: bool = True, - normalize_embeddings: bool = False, ): """ Create a FastembedTextEmbedder component. @@ -40,7 +39,6 @@ def __init__( :param model: Local path or name of the model in Fastembed's model hub, such as ``'BAAI/bge-small-en-v1.5'``. :param batch_size: Number of strings to encode at once. - :param normalize_embeddings: If set to true, returned vectors will have the length of 1. """ # TODO add parallel @@ -48,7 +46,6 @@ def __init__( self.model_name = model self.batch_size = batch_size self.progress_bar = progress_bar - self.normalize_embeddings = normalize_embeddings def to_dict(self) -> Dict[str, Any]: """ @@ -59,16 +56,8 @@ def to_dict(self) -> Dict[str, Any]: model=self.model_name, batch_size=self.batch_size, progress_bar=self.progress_bar, - normalize_embeddings=self.normalize_embeddings, ) - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "FastembedTextEmbedder": - """ - Deserialize this component from a dictionary. - """ - return default_from_dict(cls, data) - def warm_up(self): """ Load the embedding backend. 
@@ -95,7 +84,6 @@ def run(self, text: str): text_to_embed, batch_size=self.batch_size, show_progress_bar=self.progress_bar, - normalize_embeddings=self.normalize_embeddings, )[0] ) return {"embedding": embedding} diff --git a/integrations/fastembed/tests/test_fastembed_document_embedder.py b/integrations/fastembed/tests/test_fastembed_document_embedder.py index a387db797..be182183c 100644 --- a/integrations/fastembed/tests/test_fastembed_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_document_embedder.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from haystack import Document +from haystack import Document, default_from_dict from haystack_integrations.components.embedders.fastembed.fastembed_document_embedder import ( FastembedDocumentEmbedder, ) @@ -17,7 +17,6 @@ def test_init_default(self): assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 256 assert embedder.progress_bar is True - assert embedder.normalize_embeddings is False assert embedder.meta_fields_to_embed == [] assert embedder.embedding_separator == "\n" @@ -29,14 +28,12 @@ def test_init_with_parameters(self): model="BAAI/bge-small-en-v1.5", batch_size=64, progress_bar=False, - normalize_embeddings=True, meta_fields_to_embed=["test_field"], embedding_separator=" | ", ) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 64 assert embedder.progress_bar is False - assert embedder.normalize_embeddings is True assert embedder.meta_fields_to_embed == ["test_field"] assert embedder.embedding_separator == " | " @@ -52,7 +49,6 @@ def test_to_dict(self): "model": "BAAI/bge-small-en-v1.5", "batch_size": 256, "progress_bar": True, - "normalize_embeddings": False, "embedding_separator": "\n", "meta_fields_to_embed": [], }, @@ -66,7 +62,6 @@ def test_to_dict_with_custom_init_parameters(self): model="BAAI/bge-small-en-v1.5", batch_size=64, progress_bar=False, - normalize_embeddings=True, 
meta_fields_to_embed=["test_field"], embedding_separator=" | ", ) @@ -77,7 +72,6 @@ def test_to_dict_with_custom_init_parameters(self): "model": "BAAI/bge-small-en-v1.5", "batch_size": 64, "progress_bar": False, - "normalize_embeddings": True, "meta_fields_to_embed": ["test_field"], "embedding_separator": " | ", }, @@ -93,16 +87,14 @@ def test_from_dict(self): "model": "BAAI/bge-small-en-v1.5", "batch_size": 256, "progress_bar": True, - "normalize_embeddings": False, "meta_fields_to_embed": [], "embedding_separator": "\n", }, } - embedder = FastembedDocumentEmbedder.from_dict(embedder_dict) + embedder = default_from_dict(FastembedDocumentEmbedder, embedder_dict) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 256 assert embedder.progress_bar is True - assert embedder.normalize_embeddings is False assert embedder.meta_fields_to_embed == [] assert embedder.embedding_separator == "\n" @@ -116,16 +108,14 @@ def test_from_dict_with_custom_init_parameters(self): "model": "BAAI/bge-small-en-v1.5", "batch_size": 64, "progress_bar": False, - "normalize_embeddings": True, "meta_fields_to_embed": ["test_field"], "embedding_separator": " | ", }, } - embedder = FastembedDocumentEmbedder.from_dict(embedder_dict) + embedder = default_from_dict(FastembedDocumentEmbedder, embedder_dict) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 64 assert embedder.progress_bar is False - assert embedder.normalize_embeddings is True assert embedder.meta_fields_to_embed == ["test_field"] assert embedder.embedding_separator == " | " @@ -222,7 +212,6 @@ def test_embed_metadata(self): ], batch_size=256, show_progress_bar=True, - normalize_embeddings=False, ) @pytest.mark.integration diff --git a/integrations/fastembed/tests/test_fastembed_text_embedder.py b/integrations/fastembed/tests/test_fastembed_text_embedder.py index ee9bfd3da..6327532e1 100644 --- a/integrations/fastembed/tests/test_fastembed_text_embedder.py +++ 
b/integrations/fastembed/tests/test_fastembed_text_embedder.py @@ -2,6 +2,7 @@ import numpy as np import pytest +from haystack import default_from_dict from haystack_integrations.components.embedders.fastembed.fastembed_text_embedder import ( FastembedTextEmbedder, ) @@ -16,7 +17,6 @@ def test_init_default(self): assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 256 assert embedder.progress_bar is True - assert embedder.normalize_embeddings is False def test_init_with_parameters(self): """ @@ -26,12 +26,10 @@ def test_init_with_parameters(self): model="BAAI/bge-small-en-v1.5", batch_size=64, progress_bar=False, - normalize_embeddings=True, ) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 64 assert embedder.progress_bar is False - assert embedder.normalize_embeddings is True def test_to_dict(self): """ @@ -45,7 +43,6 @@ def test_to_dict(self): "model": "BAAI/bge-small-en-v1.5", "batch_size": 256, "progress_bar": True, - "normalize_embeddings": False, }, } @@ -57,7 +54,6 @@ def test_to_dict_with_custom_init_parameters(self): model="BAAI/bge-small-en-v1.5", batch_size=64, progress_bar=False, - normalize_embeddings=True, ) embedder_dict = embedder.to_dict() assert embedder_dict == { @@ -66,7 +62,6 @@ def test_to_dict_with_custom_init_parameters(self): "model": "BAAI/bge-small-en-v1.5", "batch_size": 64, "progress_bar": False, - "normalize_embeddings": True, }, } @@ -80,14 +75,12 @@ def test_from_dict(self): "model": "BAAI/bge-small-en-v1.5", "batch_size": 256, "progress_bar": True, - "normalize_embeddings": False, }, } - embedder = FastembedTextEmbedder.from_dict(embedder_dict) + embedder = default_from_dict(FastembedTextEmbedder, embedder_dict) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 256 assert embedder.progress_bar is True - assert embedder.normalize_embeddings is False def test_from_dict_with_custom_init_parameters(self): """ @@ -99,14 +92,12 @@ def 
test_from_dict_with_custom_init_parameters(self): "model": "BAAI/bge-small-en-v1.5", "batch_size": 64, "progress_bar": False, - "normalize_embeddings": True, }, } - embedder = FastembedTextEmbedder.from_dict(embedder_dict) + embedder = default_from_dict(FastembedTextEmbedder, embedder_dict) assert embedder.model_name == "BAAI/bge-small-en-v1.5" assert embedder.batch_size == 64 assert embedder.progress_bar is False - assert embedder.normalize_embeddings is True @patch( "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder._FastembedEmbeddingBackendFactory" @@ -167,7 +158,6 @@ def test_run(self): model="BAAI/bge-small-en-v1.5", ) embedder.warm_up() - # embedder.embedding_backend = MagicMock() text = "Parton energy loss in QCD matter" From 8dd145e20f7fa0d85126a3ff836a0a078ec78d7c Mon Sep 17 00:00:00 2001 From: Nico Date: Sun, 11 Feb 2024 08:54:41 +0100 Subject: [PATCH 46/49] updated labeler.yml --- .github/labeler.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/labeler.yml b/.github/labeler.yml index 4d060772c..6b24ebe40 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -29,6 +29,11 @@ integration:elasticsearch: - any-glob-to-any-file: "integrations/elasticsearch/**/*" - any-glob-to-any-file: ".github/workflows/elasticsearch.yml" +integration:gastembed: + - changed-files: + - any-glob-to-any-file: "integrations/fastembed/**/*" + - any-glob-to-any-file: ".github/workflows/fastembed.yml" + integration:google-ai: - changed-files: - any-glob-to-any-file: "integrations/google_ai/**/*" From f0bb7d96e1c65a811cbaca6d3e232b93ab195585 Mon Sep 17 00:00:00 2001 From: Nico Date: Sun, 11 Feb 2024 08:55:01 +0100 Subject: [PATCH 47/49] updated library readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 39d669322..8e216792e 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ deepset-haystack | [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - 
Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | | [cohere-haystack](integrations/cohere/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) | | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | +| [fastembed-haystack](integrations/fastembed/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/fastembed.svg)](https://pypi.org/project/fastembed-haystack/) | [![Test / fastembed](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml) | | [google-ai-haystack](integrations/google_ai/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-ai-haystack.svg)](https://pypi.org/project/google-ai-haystack) | [![Test / google-ai](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml) | | [google-vertex-haystack](integrations/google_vertex/) | Generator | [![PyPI - 
Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) | | [gradient-haystack](integrations/gradient/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) | [![Test / gradient](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml) | From 0847181dd9779510d2dffb0ffc4a13a159685f55 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Sun, 11 Feb 2024 10:19:25 +0100 Subject: [PATCH 48/49] fix typos --- .github/labeler.yml | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index d54cac5ca..dac3bf015 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -29,7 +29,7 @@ integration:elasticsearch: - any-glob-to-any-file: "integrations/elasticsearch/**/*" - any-glob-to-any-file: ".github/workflows/elasticsearch.yml" -integration:gastembed: +integration:fastembed: - changed-files: - any-glob-to-any-file: "integrations/fastembed/**/*" - any-glob-to-any-file: ".github/workflows/fastembed.yml" diff --git a/README.md b/README.md index 4f91517ab..21c1e7d9d 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ deepset-haystack | [cohere-haystack](integrations/cohere/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / 
cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) | | [deepeval-haystack](integrations/deepeval/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/deepeval-haystack.svg)](https://pypi.org/project/deepeval-haystack) | [![Test / deepeval](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml) | | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | -| [fastembed-haystack](integrations/fastembed/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/fastembed.svg)](https://pypi.org/project/fastembed-haystack/) | [![Test / fastembed](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml) | +| [fastembed-haystack](integrations/fastembed/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/fastembed-haystack.svg)](https://pypi.org/project/fastembed-haystack/) | [![Test / fastembed](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml) | | [google-ai-haystack](integrations/google_ai/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-ai-haystack.svg)](https://pypi.org/project/google-ai-haystack) | [![Test / 
google-ai](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml) | | [google-vertex-haystack](integrations/google_vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) | | [gradient-haystack](integrations/gradient/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) | [![Test / gradient](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml) | From 37a6aaf8869702b4d22b37ef096c0b6af6475370 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Sun, 11 Feb 2024 10:22:56 +0100 Subject: [PATCH 49/49] fix docstrings/README --- integrations/fastembed/README.md | 7 ++++--- .../embedders/fastembed/fastembed_document_embedder.py | 2 +- .../embedders/fastembed/fastembed_text_embedder.py | 9 +++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/integrations/fastembed/README.md b/integrations/fastembed/README.md index 94a86d85e..5ad056af3 100644 --- a/integrations/fastembed/README.md +++ b/integrations/fastembed/README.md @@ -21,17 +21,18 @@ pip install fastembed-haystack You can use `FastembedTextEmbedder` and `FastembedDocumentEmbedder` by importing as: ```python -from fastembed_haystack.fastembed_text_embedder import FastembedTextEmbedder +from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder text = "fastembed is supported by and maintained by Qdrant." 
text_embedder = FastembedTextEmbedder( model="BAAI/bge-small-en-v1.5" ) -embedding = text_embedder.run(text) +text_embedder.warm_up() +embedding = text_embedder.run(text)["embedding"] ``` ```python -from fastembed_haystack.fastembed__document_embedder import FastembedDocumentEmbedder +from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder from haystack.dataclasses import Document embedder = FastembedDocumentEmbedder( diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py index 24da783fd..b1e9309c2 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py @@ -16,7 +16,7 @@ class FastembedDocumentEmbedder: # To use this component, install the "fastembed-haystack" package. 
# pip install fastembed-haystack - from fastembed_haystack.fastembed__document_embedder import FastembedDocumentEmbedder + from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder from haystack.dataclasses import Document doc_embedder = FastembedDocumentEmbedder( diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py index 832d1240f..3446f80d7 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py @@ -12,18 +12,19 @@ class FastembedTextEmbedder: Usage example: ```python - # To use this component, install the "fastembed" package. - # pip install fastembed + # To use this component, install the "fastembed-haystack" package. + # pip install fastembed-haystack - from fastembed_haystack.fastembed_text_embedder import FastembedTextEmbedder + from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!" text_embedder = FastembedTextEmbedder( model="BAAI/bge-small-en-v1.5" ) + text_embedder.warm_up() - embedding = text_embedder.run(text) + embedding = text_embedder.run(text)["embedding"] ``` """ # noqa: E501