From cc32ee61fbc62c8aa431e4347fb499f389ae153e Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Wed, 21 Feb 2024 08:45:50 +0100 Subject: [PATCH] instructor - new devices mgmt (#441) --- .../instructor_document_embedder.py | 23 ++++++++------- .../instructor_text_embedder.py | 29 +++++++++++-------- .../test_instructor_document_embedder.py | 28 +++++++++--------- .../tests/test_instructor_text_embedder.py | 28 +++++++++--------- 4 files changed, 58 insertions(+), 50 deletions(-) diff --git a/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_document_embedder.py b/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_document_embedder.py index 4246516a7..fb9a34fe0 100644 --- a/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_document_embedder.py +++ b/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_document_embedder.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional from haystack import Document, component, default_from_dict, default_to_dict -from haystack.utils import Secret, deserialize_secrets_inplace +from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory @@ -20,8 +20,9 @@ class InstructorDocumentEmbedder: # To use this component, install the "instructor-embedders-haystack" package. 
# pip install instructor-embedders-haystack - from instructor_embedders_haystack.instructor_document_embedder import InstructorDocumentEmbedder + from haystack_integrations.components.embedders.instructor_embedders import InstructorDocumentEmbedder from haystack.dataclasses import Document + from haystack.utils import ComponentDevice doc_embedding_instruction = "Represent the Medical Document for retrieval:" @@ -30,7 +31,7 @@ class InstructorDocumentEmbedder: model="hkunlp/instructor-base", instruction=doc_embedding_instruction, batch_size=32, - device="cpu", + device=ComponentDevice.from_str("cpu"), ) doc_embedder.warm_up() @@ -62,7 +63,7 @@ class InstructorDocumentEmbedder: def __init__( self, model: str = "hkunlp/instructor-base", - device: Optional[str] = None, + device: Optional[ComponentDevice] = None, token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008 instruction: str = "Represent the document", batch_size: int = 32, @@ -76,8 +77,8 @@ def __init__( :param model: Local path or name of the model in Hugging Face's model hub, such as ``'hkunlp/instructor-base'``. - :param device: Device (like 'cuda' / 'cpu') that should be used for computation. - If None, checks if a GPU can be used. + :param device: The device on which the model is loaded. If `None`, the default device is automatically + selected. :param use_auth_token: An API token used to download private models from Hugging Face. If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. 
@@ -97,8 +98,7 @@ def __init__( """ self.model = model - # TODO: remove device parameter and use Haystack's device management once migrated - self.device = device or "cpu" + self.device = ComponentDevice.resolve_device(device) self.token = token self.instruction = instruction self.batch_size = batch_size @@ -114,7 +114,7 @@ def to_dict(self) -> Dict[str, Any]: return default_to_dict( self, model=self.model, - device=self.device, + device=self.device.to_dict(), token=self.token.to_dict() if self.token else None, instruction=self.instruction, batch_size=self.batch_size, @@ -129,6 +129,9 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder": """ Deserialize this component from a dictionary. """ + serialized_device = data["init_parameters"]["device"] + data["init_parameters"]["device"] = ComponentDevice.from_dict(serialized_device) + deserialize_secrets_inplace(data["init_parameters"], keys=["token"]) return default_from_dict(cls, data) @@ -138,7 +141,7 @@ def warm_up(self): """ if not hasattr(self, "embedding_backend"): self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( - model=self.model, device=self.device, token=self.token + model=self.model, device=self.device.to_torch_str(), token=self.token ) @component.output_types(documents=List[Document]) diff --git a/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_text_embedder.py b/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_text_embedder.py index 0299d076a..b56f363c5 100644 --- a/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_text_embedder.py +++ b/integrations/instructor_embedders/src/haystack_integrations/components/embedders/instructor_embedders/instructor_text_embedder.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional from haystack import component, 
default_from_dict, default_to_dict -from haystack.utils import Secret, deserialize_secrets_inplace +from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory @@ -19,26 +19,29 @@ class InstructorTextEmbedder: # To use this component, install the "instructor-embedders-haystack" package. # pip install instructor-embedders-haystack - from instructor_embedders_haystack.instructor_text_embedder import InstructorTextEmbedder + from haystack.utils.device import ComponentDevice + from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder - text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!" + text = ("It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. " + "Do Not order this if you have a Mac!!") instruction = ( "Represent the Amazon comment for classifying the sentence as positive or negative" ) text_embedder = InstructorTextEmbedder( model="hkunlp/instructor-base", instruction=instruction, - device="cpu" + device=ComponentDevice.from_str("cpu") ) + text_embedder.warm_up() embedding = text_embedder.run(text) ``` - """ # noqa: E501 + """ def __init__( self, model: str = "hkunlp/instructor-base", - device: Optional[str] = None, + device: Optional[ComponentDevice] = None, token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008 instruction: str = "Represent the sentence", batch_size: int = 32, @@ -50,8 +53,8 @@ def __init__( :param model: Local path or name of the model in Hugging Face's model hub, such as ``'hkunlp/instructor-base'``. - :param device: Device (like 'cuda' / 'cpu') that should be used for computation. - If None, checks if a GPU can be used. + :param device: The device on which the model is loaded. 
If `None`, the default device is automatically + selected. :param token: The API token used to download private models from Hugging Face. :param instruction: The instruction string to be used while computing domain-specific embeddings. The instruction follows the unified template of the form: @@ -67,8 +70,7 @@ def __init__( """ self.model = model - # TODO: remove device parameter and use Haystack's device management once migrated - self.device = device or "cpu" + self.device = ComponentDevice.resolve_device(device) self.token = token self.instruction = instruction self.batch_size = batch_size @@ -82,7 +84,7 @@ def to_dict(self) -> Dict[str, Any]: return default_to_dict( self, model=self.model, - device=self.device, + device=self.device.to_dict(), token=self.token.to_dict() if self.token else None, instruction=self.instruction, batch_size=self.batch_size, @@ -95,6 +97,9 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder": """ Deserialize this component from a dictionary. 
""" + serialized_device = data["init_parameters"]["device"] + data["init_parameters"]["device"] = ComponentDevice.from_dict(serialized_device) + deserialize_secrets_inplace(data["init_parameters"], keys=["token"]) return default_from_dict(cls, data) @@ -104,7 +109,7 @@ def warm_up(self): """ if not hasattr(self, "embedding_backend"): self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( - model=self.model, device=self.device, token=self.token + model=self.model, device=self.device.to_torch_str(), token=self.token ) @component.output_types(embedding=List[float]) diff --git a/integrations/instructor_embedders/tests/test_instructor_document_embedder.py b/integrations/instructor_embedders/tests/test_instructor_document_embedder.py index ba1444900..44f740679 100644 --- a/integrations/instructor_embedders/tests/test_instructor_document_embedder.py +++ b/integrations/instructor_embedders/tests/test_instructor_document_embedder.py @@ -3,7 +3,7 @@ import numpy as np import pytest from haystack import Document -from haystack.utils import Secret +from haystack.utils import ComponentDevice, Secret from haystack_integrations.components.embedders.instructor_embedders import InstructorDocumentEmbedder @@ -14,7 +14,7 @@ def test_init_default(self): """ embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base") assert embedder.model == "hkunlp/instructor-base" - assert embedder.device == "cpu" + assert embedder.device == ComponentDevice.resolve_device(None) assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the document" assert embedder.batch_size == 32 @@ -29,7 +29,7 @@ def test_init_with_parameters(self): """ embedder = InstructorDocumentEmbedder( model="hkunlp/instructor-base", - device="cuda", + device=ComponentDevice.from_str("cuda:0"), token=Secret.from_token("fake-api-token"), instruction="Represent the 'domain' 'text_type' for 'task_objective'", batch_size=64, @@ -39,7 
+39,7 @@ def test_init_with_parameters(self): embedding_separator=" | ", ) assert embedder.model == "hkunlp/instructor-base" - assert embedder.device == "cuda" + assert embedder.device == ComponentDevice.from_str("cuda:0") assert embedder.token == Secret.from_token("fake-api-token") assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" assert embedder.batch_size == 64 @@ -52,13 +52,13 @@ def test_to_dict(self): """ Test serialization of InstructorDocumentEmbedder to a dictionary, using default initialization parameters. """ - embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base") + embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base", device=ComponentDevice.from_str("cpu")) embedder_dict = embedder.to_dict() assert embedder_dict == { "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", # noqa "init_parameters": { "model": "hkunlp/instructor-base", - "device": "cpu", + "device": ComponentDevice.from_str("cpu").to_dict(), "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "instruction": "Represent the document", "batch_size": 32, @@ -75,7 +75,7 @@ def test_to_dict_with_custom_init_parameters(self): """ embedder = InstructorDocumentEmbedder( model="hkunlp/instructor-base", - device="cuda", + device=ComponentDevice.from_str("cuda:0"), instruction="Represent the financial document for retrieval", batch_size=64, progress_bar=False, @@ -88,7 +88,7 @@ def test_to_dict_with_custom_init_parameters(self): "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", # noqa "init_parameters": { "model": "hkunlp/instructor-base", - "device": "cuda", + "device": ComponentDevice.from_str("cuda:0").to_dict(), "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "instruction": "Represent the financial document for 
retrieval", "batch_size": 64, @@ -107,7 +107,7 @@ def test_from_dict(self): "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", # noqa "init_parameters": { "model": "hkunlp/instructor-base", - "device": "cpu", + "device": ComponentDevice.from_str("cpu").to_dict(), "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "instruction": "Represent the 'domain' 'text_type' for 'task_objective'", "batch_size": 32, @@ -119,7 +119,7 @@ def test_from_dict(self): } embedder = InstructorDocumentEmbedder.from_dict(embedder_dict) assert embedder.model == "hkunlp/instructor-base" - assert embedder.device == "cpu" + assert embedder.device == ComponentDevice.from_str("cpu") assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" assert embedder.batch_size == 32 @@ -136,7 +136,7 @@ def test_from_dict_with_custom_init_parameters(self): "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", # noqa "init_parameters": { "model": "hkunlp/instructor-base", - "device": "cuda", + "device": ComponentDevice.from_str("cuda:0").to_dict(), "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "instruction": "Represent the financial document for retrieval", "batch_size": 64, @@ -148,7 +148,7 @@ def test_from_dict_with_custom_init_parameters(self): } embedder = InstructorDocumentEmbedder.from_dict(embedder_dict) assert embedder.model == "hkunlp/instructor-base" - assert embedder.device == "cuda" + assert embedder.device == ComponentDevice.from_str("cuda:0") assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the financial document for retrieval" assert embedder.batch_size == 64 @@ -164,7 +164,7 @@ def test_warmup(self, 
mocked_factory): """ Test for checking embedder instances after warm-up. """ - embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base") + embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base", device=ComponentDevice.from_str("cpu")) mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( @@ -254,7 +254,7 @@ def test_embed_metadata(self): def test_run(self): embedder = InstructorDocumentEmbedder( model="hkunlp/instructor-base", - device="cpu", + device=ComponentDevice.from_str("cpu"), instruction="Represent the Science document for retrieval", ) embedder.warm_up() diff --git a/integrations/instructor_embedders/tests/test_instructor_text_embedder.py b/integrations/instructor_embedders/tests/test_instructor_text_embedder.py index d888fef65..55022f1ec 100644 --- a/integrations/instructor_embedders/tests/test_instructor_text_embedder.py +++ b/integrations/instructor_embedders/tests/test_instructor_text_embedder.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from haystack.utils import Secret +from haystack.utils import ComponentDevice, Secret from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder @@ -13,7 +13,7 @@ def test_init_default(self): """ embedder = InstructorTextEmbedder(model="hkunlp/instructor-base") assert embedder.model == "hkunlp/instructor-base" - assert embedder.device == "cpu" + assert embedder.device == ComponentDevice.resolve_device(None) assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the sentence" assert embedder.batch_size == 32 @@ -26,7 +26,7 @@ def test_init_with_parameters(self): """ embedder = InstructorTextEmbedder( model="hkunlp/instructor-base", - device="cuda", + device=ComponentDevice.from_str("cuda:0"), token=Secret.from_token("fake-api-token"), instruction="Represent the 'domain' 'text_type' for 'task_objective'", 
batch_size=64, @@ -34,7 +34,7 @@ def test_init_with_parameters(self): normalize_embeddings=True, ) assert embedder.model == "hkunlp/instructor-base" - assert embedder.device == "cuda" + assert embedder.device == ComponentDevice.from_str("cuda:0") assert embedder.token == Secret.from_token("fake-api-token") assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" assert embedder.batch_size == 64 @@ -45,13 +45,13 @@ def test_to_dict(self): """ Test serialization of InstructorTextEmbedder to a dictionary, using default initialization parameters. """ - embedder = InstructorTextEmbedder(model="hkunlp/instructor-base") + embedder = InstructorTextEmbedder(model="hkunlp/instructor-base", device=ComponentDevice.from_str("cpu")) embedder_dict = embedder.to_dict() assert embedder_dict == { "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_text_embedder.InstructorTextEmbedder", # noqa "init_parameters": { "model": "hkunlp/instructor-base", - "device": "cpu", + "device": ComponentDevice.from_str("cpu").to_dict(), "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "instruction": "Represent the sentence", "batch_size": 32, @@ -66,7 +66,7 @@ def test_to_dict_with_custom_init_parameters(self): """ embedder = InstructorTextEmbedder( model="hkunlp/instructor-base", - device="cuda", + device=ComponentDevice.from_str("cuda:0"), instruction="Represent the financial document for retrieval", batch_size=64, progress_bar=False, @@ -77,7 +77,7 @@ def test_to_dict_with_custom_init_parameters(self): "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_text_embedder.InstructorTextEmbedder", # noqa "init_parameters": { "model": "hkunlp/instructor-base", - "device": "cuda", + "device": ComponentDevice.from_str("cuda:0").to_dict(), "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "instruction": "Represent the financial document for retrieval", 
"batch_size": 64, @@ -94,7 +94,7 @@ def test_from_dict(self): "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_text_embedder.InstructorTextEmbedder", # noqa "init_parameters": { "model": "hkunlp/instructor-base", - "device": "cpu", + "device": ComponentDevice.from_str("cpu").to_dict(), "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "instruction": "Represent the 'domain' 'text_type' for 'task_objective'", "batch_size": 32, @@ -104,7 +104,7 @@ def test_from_dict(self): } embedder = InstructorTextEmbedder.from_dict(embedder_dict) assert embedder.model == "hkunlp/instructor-base" - assert embedder.device == "cpu" + assert embedder.device == ComponentDevice.from_str("cpu") assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" assert embedder.batch_size == 32 @@ -119,7 +119,7 @@ def test_from_dict_with_custom_init_parameters(self): "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_text_embedder.InstructorTextEmbedder", # noqa "init_parameters": { "model": "hkunlp/instructor-base", - "device": "cuda", + "device": ComponentDevice.from_str("cuda:0").to_dict(), "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "instruction": "Represent the financial document for retrieval", "batch_size": 64, @@ -129,7 +129,7 @@ def test_from_dict_with_custom_init_parameters(self): } embedder = InstructorTextEmbedder.from_dict(embedder_dict) assert embedder.model == "hkunlp/instructor-base" - assert embedder.device == "cuda" + assert embedder.device == ComponentDevice.from_str("cuda:0") assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.instruction == "Represent the financial document for retrieval" assert embedder.batch_size == 64 @@ -143,7 +143,7 @@ def test_warmup(self, mocked_factory): """ Test for checking embedder 
instances after warm-up. """ - embedder = InstructorTextEmbedder(model="hkunlp/instructor-base") + embedder = InstructorTextEmbedder(model="hkunlp/instructor-base", device=ComponentDevice.from_str("cpu")) mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( @@ -197,7 +197,7 @@ def test_run_wrong_incorrect_format(self): def test_run(self): embedder = InstructorTextEmbedder( model="hkunlp/instructor-base", - device="cpu", + device=ComponentDevice.from_str("cpu"), instruction="Represent the Science sentence for retrieval", ) embedder.warm_up()