diff --git a/.github/workflows/components_instructor_embedders.yml b/.github/workflows/components_instructor_embedders.yml
index 710776d12..0aa0bb30d 100644
--- a/.github/workflows/components_instructor_embedders.yml
+++ b/.github/workflows/components_instructor_embedders.yml
@@ -34,6 +34,10 @@ jobs:
         run: |
           pip install -e .[dev]
 
-      - name: Run tests
+      - name: Run unit tests
         run: |
-          pytest
+          pytest -v -m unit
+
+      - name: Run integration tests
+        run: |
+          pytest -v -m integration
diff --git a/components/instructor-embedders/instructor_embedders/embedding_backend/__init__.py b/components/instructor-embedders/instructor_embedders/embedding_backend/__init__.py
new file mode 100644
index 000000000..e873bc332
--- /dev/null
+++ b/components/instructor-embedders/instructor_embedders/embedding_backend/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py b/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py
new file mode 100644
index 000000000..c3ff3a79b
--- /dev/null
+++ b/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py
@@ -0,0 +1,45 @@
+from typing import ClassVar, Dict, List, Optional, Union
+
+from haystack.preview.lazy_imports import LazyImport
+
+with LazyImport(message="Run 'pip install InstructorEmbedding'") as instructor_embeddings_import:
+    from InstructorEmbedding import INSTRUCTOR
+
+
+class _InstructorEmbeddingBackendFactory:
+    """
+    Factory class to create instances of INSTRUCTOR embedding backends.
+    """
+
+    _instances: ClassVar[Dict[str, "_InstructorEmbeddingBackend"]] = {}
+
+    @staticmethod
+    def get_embedding_backend(
+        model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
+    ):
+        embedding_backend_id = f"{model_name_or_path}{device}{use_auth_token}"
+
+        if embedding_backend_id in _InstructorEmbeddingBackendFactory._instances:
+            return _InstructorEmbeddingBackendFactory._instances[embedding_backend_id]
+
+        embedding_backend = _InstructorEmbeddingBackend(
+            model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token
+        )
+        _InstructorEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
+        return embedding_backend
+
+
+class _InstructorEmbeddingBackend:
+    """
+    Class to manage INSTRUCTOR embeddings.
+    """
+
+    def __init__(
+        self, model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
+    ):
+        instructor_embeddings_import.check()
+        self.model = INSTRUCTOR(model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token)
+
+    def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]:
+        embeddings = self.model.encode(data, **kwargs).tolist()
+        return embeddings
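The factory above memoizes backends under a key built from the model name, device, and auth token, so identically configured components share one loaded model instead of loading it twice. A minimal sketch of the intended behavior, assuming the `InstructorEmbedding` package is installed and the model weights can be downloaded:

```python
from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

# Identical arguments map to the same cache key, so the same backend instance is returned.
backend = _InstructorEmbeddingBackendFactory.get_embedding_backend("hkunlp/instructor-base", device="cpu")
again = _InstructorEmbeddingBackendFactory.get_embedding_backend("hkunlp/instructor-base", device="cpu")
assert backend is again

# INSTRUCTOR consumes [instruction, text] pairs, one pair per input to embed.
vectors = backend.embed([["Represent the sentence", "Embeddings are useful"]])
assert isinstance(vectors[0][0], float)
```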
+ """ + + def __init__( + self, model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None + ): + instructor_embeddings_import.check() + self.model = INSTRUCTOR(model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token) + + def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]: + embeddings = self.model.encode(data, **kwargs).tolist() + return embeddings diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py new file mode 100644 index 000000000..7fd369cd2 --- /dev/null +++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -0,0 +1,136 @@ +from typing import Any, Dict, List, Optional, Union + +from haystack.preview import Document, component, default_from_dict, default_to_dict + +from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory + + +@component +class InstructorDocumentEmbedder: + """ + A component for computing Document embeddings using INSTRUCTOR embedding models. + The embedding of each Document is stored in the `embedding` field of the Document. + """ + + def __init__( + self, + model_name_or_path: str = "hkunlp/instructor-base", + device: Optional[str] = None, + use_auth_token: Union[bool, str, None] = None, + instruction: str = "Represent the document", + batch_size: int = 32, + progress_bar: bool = True, + normalize_embeddings: bool = False, + metadata_fields_to_embed: Optional[List[str]] = None, + embedding_separator: str = "\n", + ): + """ + Create an InstructorDocumentEmbedder component. + + :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, + such as ``'hkunlp/instructor-base'``. + :param device: Device (like 'cuda' / 'cpu') that should be used for computation. + If None, checks if a GPU can be used. + :param use_auth_token: An API token used to download private models from Hugging Face. + If this parameter is set to `True`, then the token generated when running + `transformers-cli login` (stored in ~/.huggingface) will be used. + :param instruction: The instruction string to be used while computing domain-specific embeddings. + The instruction follows the unified template of the form: + "Represent the 'domain' 'text_type' for 'task_objective'", where: + - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. + - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. + - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, + classify the sentence, etc. + Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases. + :param batch_size: Number of strings to encode at once. + :param progress_bar: If true, displays progress bar during embedding. + :param normalize_embeddings: If set to true, returned vectors will have the length of 1. + :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document content. + :param embedding_separator: Separator used to concatenate the meta fields to the Document content. 
+ """ + + self.model_name_or_path = model_name_or_path + # TODO: remove device parameter and use Haystack's device management once migrated + self.device = device or "cpu" + self.use_auth_token = use_auth_token + self.instruction = instruction + self.batch_size = batch_size + self.progress_bar = progress_bar + self.normalize_embeddings = normalize_embeddings + self.metadata_fields_to_embed = metadata_fields_to_embed or [] + self.embedding_separator = embedding_separator + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict( + self, + model_name_or_path=self.model_name_or_path, + device=self.device, + use_auth_token=self.use_auth_token, + instruction=self.instruction, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + metadata_fields_to_embed=self.metadata_fields_to_embed, + embedding_separator=self.embedding_separator, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + def warm_up(self): + """ + Load the embedding backend. + """ + if not hasattr(self, "embedding_backend"): + self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( + model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token + ) + + @component.output_types(documents=List[Document]) + def run(self, documents: List[Document]): + """ + Embed a list of Documents. + The embedding of each Document is stored in the `embedding` field of the Document. + """ + if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): + msg = ("InstructorDocumentEmbedder expects a list of Documents as input. " + "In case you want to embed a list of strings, please use the InstructorTextEmbedder.") + raise TypeError(msg) + if not hasattr(self, "embedding_backend"): + msg = "The embedding model has not been loaded. Please call warm_up() before running." 
diff --git a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py
new file mode 100644
index 000000000..dd0ec48c3
--- /dev/null
+++ b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py
@@ -0,0 +1,105 @@
+from typing import Any, Dict, List, Optional, Union
+
+from haystack.preview import component, default_from_dict, default_to_dict
+
+from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory
+
+
+@component
+class InstructorTextEmbedder:
+    """
+    A component for embedding strings using INSTRUCTOR embedding models.
+    """
+
+    def __init__(
+        self,
+        model_name_or_path: str = "hkunlp/instructor-base",
+        device: Optional[str] = None,
+        use_auth_token: Union[bool, str, None] = None,
+        instruction: str = "Represent the sentence",
+        batch_size: int = 32,
+        progress_bar: bool = True,
+        normalize_embeddings: bool = False,
+    ):
+        """
+        Create an InstructorTextEmbedder component.
+
+        :param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
+            such as ``'hkunlp/instructor-base'``.
+        :param device: Device (like 'cuda' / 'cpu') that should be used for computation.
+            If None, checks if a GPU can be used.
+        :param use_auth_token: The API token used to download private models from Hugging Face.
+            If this parameter is set to `True`, then the token generated when running
+            `transformers-cli login` (stored in ~/.huggingface) will be used.
+        :param instruction: The instruction string to be used while computing domain-specific embeddings.
+            The instruction follows the unified template of the form:
+            "Represent the 'domain' 'text_type' for 'task_objective'", where:
+            - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.
+            - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.
+            - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document,
+              classify the sentence, etc.
+            Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases.
+        :param batch_size: Number of strings to encode at once.
+        :param progress_bar: If true, displays progress bar during embedding.
+        :param normalize_embeddings: If set to true, returned vectors will have the length of 1.
+        """
+
+        self.model_name_or_path = model_name_or_path
+        # TODO: remove device parameter and use Haystack's device management once migrated
+        self.device = device or "cpu"
+        self.use_auth_token = use_auth_token
+        self.instruction = instruction
+        self.batch_size = batch_size
+        self.progress_bar = progress_bar
+        self.normalize_embeddings = normalize_embeddings
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        return default_to_dict(
+            self,
+            model_name_or_path=self.model_name_or_path,
+            device=self.device,
+            use_auth_token=self.use_auth_token,
+            instruction=self.instruction,
+            batch_size=self.batch_size,
+            progress_bar=self.progress_bar,
+            normalize_embeddings=self.normalize_embeddings,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder":
+        """
+        Deserialize this component from a dictionary.
+        """
+        return default_from_dict(cls, data)
+
+    def warm_up(self):
+        """
+        Load the embedding backend.
+        """
+        if not hasattr(self, "embedding_backend"):
+            self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
+                model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
+            )
+
+    @component.output_types(embedding=List[float])
+    def run(self, text: str):
+        """Embed a string."""
+        if not isinstance(text, str):
+            msg = (
+                "InstructorTextEmbedder expects a string as input. "
+                "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder."
+            )
+            raise TypeError(msg)
+        if not hasattr(self, "embedding_backend"):
+            msg = "The embedding model has not been loaded. Please call warm_up() before running."
+            raise RuntimeError(msg)
+
+        text_to_embed = [self.instruction, text]
+        embedding = self.embedding_backend.embed(
+            [text_to_embed],
+            batch_size=self.batch_size,
+            show_progress_bar=self.progress_bar,
+            normalize_embeddings=self.normalize_embeddings,
+        )[0]
+        return {"embedding": embedding}
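The text embedder mirrors the document embedder but takes a single string and returns the vector directly. Because both components serialize through the `default_to_dict`/`default_from_dict` helpers, a configuration round-trip should be lossless (only the configuration is serialized, not the loaded model). A sketch of both behaviors, assuming model weights are available:

```python
from instructor_embedders.instructor_text_embedder import InstructorTextEmbedder

embedder = InstructorTextEmbedder(instruction="Represent the Science sentence for retrieval")
embedder.warm_up()

result = embedder.run(text="Parton energy loss in QCD matter")
vector = result["embedding"]  # a plain List[float]

# Round-trip the configuration; warm_up() on the copy reuses the cached backend
# because the factory key (model, device, auth token) is identical.
restored = InstructorTextEmbedder.from_dict(embedder.to_dict())
assert restored.instruction == embedder.instruction
restored.warm_up()
```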
+ """ + + self.model_name_or_path = model_name_or_path + # TODO: remove device parameter and use Haystack's device management once migrated + self.device = device or "cpu" + self.use_auth_token = use_auth_token + self.instruction = instruction + self.batch_size = batch_size + self.progress_bar = progress_bar + self.normalize_embeddings = normalize_embeddings + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict( + self, + model_name_or_path=self.model_name_or_path, + device=self.device, + use_auth_token=self.use_auth_token, + instruction=self.instruction, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + def warm_up(self): + """ + Load the embedding backend. + """ + if not hasattr(self, "embedding_backend"): + self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( + model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token + ) + + @component.output_types(embedding=List[float]) + def run(self, text: str): + """Embed a string.""" + if not isinstance(text, str): + msg = ("InstructorTextEmbedder expects a string as input. " + "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder.") + raise TypeError(msg) + if not hasattr(self, "embedding_backend"): + msg = "The embedding model has not been loaded. Please call warm_up() before running." + raise RuntimeError(msg) + + text_to_embed = [self.instruction, text] + embedding = self.embedding_backend.embed( + [text_to_embed], + batch_size=self.batch_size, + show_progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + )[0] + return {"embedding": embedding} diff --git a/components/instructor-embedders/pyproject.toml b/components/instructor-embedders/pyproject.toml index bea7cc53a..efe6cca9e 100644 --- a/components/instructor-embedders/pyproject.toml +++ b/components/instructor-embedders/pyproject.toml @@ -87,7 +87,6 @@ select = [ "E", "EM", "F", - "FBT", "I", "ICN", "ISC", diff --git a/components/instructor-embedders/tests/test_instructor_backend.py b/components/instructor-embedders/tests/test_instructor_backend.py new file mode 100644 index 000000000..334e02f6f --- /dev/null +++ b/components/instructor-embedders/tests/test_instructor_backend.py @@ -0,0 +1,44 @@ +from unittest.mock import patch + +import pytest + +from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory + + +@pytest.mark.unit +@patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR") +def test_factory_behavior(mock_instructor): # noqa: ARG001 + embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( + model_name_or_path="hkunlp/instructor-large", device="cpu" + ) + same_embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend("hkunlp/instructor-large", "cpu") + another_embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( + model_name_or_path="hkunlp/instructor-base", device="cpu" + ) + + assert same_embedding_backend is embedding_backend + assert another_embedding_backend is not embedding_backend + + +@pytest.mark.unit +@patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR") +def 
+
+
+@pytest.mark.unit
+@patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR")
+def test_model_initialization(mock_instructor):
+    _InstructorEmbeddingBackendFactory.get_embedding_backend(
+        model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token"
+    )
+    mock_instructor.assert_called_once_with(
+        model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token"
+    )
+
+
+@pytest.mark.unit
+@patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR")
+def test_embedding_function_with_kwargs(mock_instructor):  # noqa: ARG001
+    embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
+        model_name_or_path="hkunlp/instructor-base"
+    )
+
+    data = [["instruction", "sentence1"], ["instruction", "sentence2"]]
+    embedding_backend.embed(data=data, normalize_embeddings=True)
+
+    embedding_backend.model.encode.assert_called_once_with(data, normalize_embeddings=True)
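Two details these unit tests lean on: `INSTRUCTOR` is patched where it is looked up (in `instructor_backend`), not in the `InstructorEmbedding` package, and the factory's `_instances` cache persists for the whole test process, so each test here deliberately uses a distinct argument combination to get a fresh backend. A hypothetical fixture like the sketch below could isolate tests that need a clean cache; it is not part of this PR:

```python
import pytest

from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory


@pytest.fixture()
def clean_backend_cache():
    # Snapshot and restore the factory cache so each test sees a fresh state.
    saved = dict(_InstructorEmbeddingBackendFactory._instances)
    _InstructorEmbeddingBackendFactory._instances.clear()
    yield
    _InstructorEmbeddingBackendFactory._instances.clear()
    _InstructorEmbeddingBackendFactory._instances.update(saved)
```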
+ """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "InstructorDocumentEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cpu", + "use_auth_token": None, + "instruction": "Represent the document", + "batch_size": 32, + "progress_bar": True, + "normalize_embeddings": False, + "embedding_separator": "\n", + "metadata_fields_to_embed": [], + }, + } + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + """ + Test serialization of InstructorDocumentEmbedder to a dictionary, using custom initialization parameters. + """ + embedder = InstructorDocumentEmbedder( + model_name_or_path="hkunlp/instructor-base", + device="cuda", + use_auth_token=True, + instruction="Represent the financial document for retrieval", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + metadata_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "InstructorDocumentEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cuda", + "use_auth_token": True, + "instruction": "Represent the financial document for retrieval", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + + @pytest.mark.unit + def test_from_dict(self): + """ + Test deserialization of InstructorDocumentEmbedder from a dictionary, using default initialization parameters. + """ + embedder_dict = { + "type": "InstructorDocumentEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cpu", + "use_auth_token": None, + "instruction": "Represent the 'domain' 'text_type' for 'task_objective'", + "batch_size": 32, + "progress_bar": True, + "normalize_embeddings": False, + "metadata_fields_to_embed": [], + "embedding_separator": "\n", + }, + } + embedder = InstructorDocumentEmbedder.from_dict(embedder_dict) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cpu" + assert embedder.use_auth_token is None + assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + assert embedder.metadata_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + @pytest.mark.unit + def test_from_dict_with_custom_init_parameters(self): + """ + Test deserialization of InstructorDocumentEmbedder from a dictionary, using custom initialization parameters. 
+ """ + embedder_dict = { + "type": "InstructorDocumentEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cuda", + "use_auth_token": True, + "instruction": "Represent the financial document for retrieval", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + embedder = InstructorDocumentEmbedder.from_dict(embedder_dict) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cuda" + assert embedder.use_auth_token is True + assert embedder.instruction == "Represent the financial document for retrieval" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + assert embedder.metadata_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + @pytest.mark.unit + @patch("instructor_embedders.instructor_document_embedder._InstructorEmbeddingBackendFactory") + def test_warmup(self, mocked_factory): + """ + Test for checking embedder instances after warm-up. + """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once_with( + model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token=None + ) + + @pytest.mark.unit + @patch("instructor_embedders.instructor_document_embedder._InstructorEmbeddingBackendFactory") + def test_warmup_does_not_reload(self, mocked_factory): + """ + Test for checking backend instances after multiple warm-ups. + """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once() + + @pytest.mark.unit + def test_embed(self): + """ + Test for checking output dimensions and embedding dimensions. + """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-large") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 + + documents = [Document(text=f"Sample-document text {i}") for i in range(5)] + + result = embedder.run(documents=documents) + + assert isinstance(result["documents"], list) + assert len(result["documents"]) == len(documents) + for doc in result["documents"]: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert isinstance(doc.embedding[0], float) + + @pytest.mark.unit + def test_embed_incorrect_input_format(self): + """ + Test for checking incorrect input format when creating embedding. + """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="InstructorDocumentEmbedder expects a list of Documents as input."): + embedder.run(documents=string_input) + + with pytest.raises(TypeError, match="InstructorDocumentEmbedder expects a list of Documents as input."): + embedder.run(documents=list_integers_input) + + @pytest.mark.unit + def test_embed_metadata(self): + """ + Test for checking output dimensions and embedding dimensions for documents + with a custom instruction and metadata. 
+ """ + embedder = InstructorDocumentEmbedder( + model_name_or_path="model", + instruction="Represent the financial document for retrieval", + metadata_fields_to_embed=["meta_field"], + embedding_separator="\n", + ) + embedder.embedding_backend = MagicMock() + + documents = [ + Document(text=f"document-number {i}", metadata={"meta_field": f"meta_value {i}"}) for i in range(5) + ] + + embedder.run(documents=documents) + + embedder.embedding_backend.embed.assert_called_once_with( + [ + ["Represent the financial document for retrieval", "meta_value 0\ndocument-number 0"], + ["Represent the financial document for retrieval", "meta_value 1\ndocument-number 1"], + ["Represent the financial document for retrieval", "meta_value 2\ndocument-number 2"], + ["Represent the financial document for retrieval", "meta_value 3\ndocument-number 3"], + ["Represent the financial document for retrieval", "meta_value 4\ndocument-number 4"], + ], + batch_size=32, + show_progress_bar=True, + normalize_embeddings=False, + ) + + @pytest.mark.integration + def test_run(self): + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base", + device="cpu", + instruction="Represent the Science document for retrieval") + embedder.warm_up() + + doc = Document(text="Parton energy loss in QCD matter") + + result = embedder.run(documents=[doc]) + embedding = result["documents"][0].embedding + + assert isinstance(embedding, list) + assert len(embedding) == 768 + assert all(isinstance(emb, float) for emb in embedding) diff --git a/components/instructor-embedders/tests/test_instructor_embedders.py b/components/instructor-embedders/tests/test_instructor_embedders.py index 129c33f3c..1abbe3b32 100644 --- a/components/instructor-embedders/tests/test_instructor_embedders.py +++ b/components/instructor-embedders/tests/test_instructor_embedders.py @@ -2,5 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 + def test_instructor_embedders(): assert True diff --git a/components/instructor-embedders/tests/test_instructor_text_embedder.py b/components/instructor-embedders/tests/test_instructor_text_embedder.py new file mode 100644 index 000000000..e3afe91e8 --- /dev/null +++ b/components/instructor-embedders/tests/test_instructor_text_embedder.py @@ -0,0 +1,215 @@ +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest + +from instructor_embedders.instructor_text_embedder import InstructorTextEmbedder + + +class TestInstructorTextEmbedder: + @pytest.mark.unit + def test_init_default(self): + """ + Test default initialization parameters for InstructorTextEmbedder. + """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cpu" + assert embedder.use_auth_token is None + assert embedder.instruction == "Represent the sentence" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + + @pytest.mark.unit + def test_init_with_parameters(self): + """ + Test custom initialization parameters for InstructorTextEmbedder. 
+ """ + embedder = InstructorTextEmbedder( + model_name_or_path="hkunlp/instructor-base", + device="cuda", + use_auth_token=True, + instruction="Represent the 'domain' 'text_type' for 'task_objective'", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + ) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cuda" + assert embedder.use_auth_token is True + assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + + @pytest.mark.unit + def test_to_dict(self): + """ + Test serialization of InstructorTextEmbedder to a dictionary, using default initialization parameters. + """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "InstructorTextEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cpu", + "use_auth_token": None, + "instruction": "Represent the sentence", + "batch_size": 32, + "progress_bar": True, + "normalize_embeddings": False, + }, + } + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + """ + Test serialization of InstructorTextEmbedder to a dictionary, using custom initialization parameters. + """ + embedder = InstructorTextEmbedder( + model_name_or_path="hkunlp/instructor-base", + device="cuda", + use_auth_token=True, + instruction="Represent the financial document for retrieval", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + ) + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "InstructorTextEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cuda", + "use_auth_token": True, + "instruction": "Represent the financial document for retrieval", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + }, + } + + @pytest.mark.unit + def test_from_dict(self): + """ + Test deserialization of InstructorTextEmbedder from a dictionary, using default initialization parameters. + """ + embedder_dict = { + "type": "InstructorTextEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cpu", + "use_auth_token": None, + "instruction": "Represent the 'domain' 'text_type' for 'task_objective'", + "batch_size": 32, + "progress_bar": True, + "normalize_embeddings": False, + }, + } + embedder = InstructorTextEmbedder.from_dict(embedder_dict) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cpu" + assert embedder.use_auth_token is None + assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + + @pytest.mark.unit + def test_from_dict_with_custom_init_parameters(self): + """ + Test deserialization of InstructorTextEmbedder from a dictionary, using custom initialization parameters. 
+ """ + embedder_dict = { + "type": "InstructorTextEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cuda", + "use_auth_token": True, + "instruction": "Represent the financial document for retrieval", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + }, + } + embedder = InstructorTextEmbedder.from_dict(embedder_dict) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cuda" + assert embedder.use_auth_token is True + assert embedder.instruction == "Represent the financial document for retrieval" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + + @pytest.mark.unit + @patch("instructor_embedders.instructor_text_embedder._InstructorEmbeddingBackendFactory") + def test_warmup(self, mocked_factory): + """ + Test for checking embedder instances after warm-up. + """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once_with( + model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token=None + ) + + @pytest.mark.unit + @patch("instructor_embedders.instructor_text_embedder._InstructorEmbeddingBackendFactory") + def test_warmup_does_not_reload(self, mocked_factory): + """ + Test for checking backend instances after multiple warm-ups. + """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once() + + @pytest.mark.unit + def test_embed(self): + """ + Test for checking output dimensions and embedding dimensions. + """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-large") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 + + text = "Good text to embed" + + result = embedder.run(text=text) + embedding = result["embedding"] + + assert isinstance(embedding, list) + assert all(isinstance(emb, float) for emb in embedding) + + @pytest.mark.unit + def test_run_wrong_incorrect_format(self): + """ + Test for checking incorrect input format when creating embedding. + """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-large") + embedder.embedding_backend = MagicMock() + + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="InstructorTextEmbedder expects a string as input"): + embedder.run(text=list_integers_input) + + @pytest.mark.integration + def test_run(self): + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base", + device="cpu", + instruction="Represent the Science sentence for retrieval") + embedder.warm_up() + + text = "Parton energy loss in QCD matter" + + result = embedder.run(text=text) + embedding = result["embedding"] + + assert isinstance(embedding, list) + assert len(embedding) == 768 + assert all(isinstance(emb, float) for emb in embedding)