From e747dc9ca51e544e0147671232ebc234017c2275 Mon Sep 17 00:00:00 2001 From: shadeMe Date: Tue, 5 Mar 2024 17:46:39 +0100 Subject: [PATCH] feat: Add `NvidiaTextEmbedder`, `NvidiaDocumentEmbedder` and co. --- integrations/nvidia/pydoc/config.yml | 7 +- integrations/nvidia/pyproject.toml | 1 + .../components/embedders/nvidia/__init__.py | 9 + .../components/embedders/nvidia/_schema.py | 91 ++++++ .../embedders/nvidia/document_embedder.py | 208 +++++++++++++ .../components/embedders/nvidia/models.py | 31 ++ .../embedders/nvidia/text_embedder.py | 144 +++++++++ .../utils/nvidia/__init__.py | 3 + .../utils/nvidia/client.py | 61 ++++ .../nvidia/tests/test_document_embedder.py | 287 ++++++++++++++++++ integrations/nvidia/tests/test_placeholder.py | 2 - .../nvidia/tests/test_text_embedder.py | 119 ++++++++ 12 files changed, 960 insertions(+), 3 deletions(-) create mode 100644 integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/__init__.py create mode 100644 integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/_schema.py create mode 100644 integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/document_embedder.py create mode 100644 integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/models.py create mode 100644 integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/text_embedder.py create mode 100644 integrations/nvidia/src/haystack_integrations/utils/nvidia/__init__.py create mode 100644 integrations/nvidia/src/haystack_integrations/utils/nvidia/client.py create mode 100644 integrations/nvidia/tests/test_document_embedder.py delete mode 100644 integrations/nvidia/tests/test_placeholder.py create mode 100644 integrations/nvidia/tests/test_text_embedder.py diff --git a/integrations/nvidia/pydoc/config.yml b/integrations/nvidia/pydoc/config.yml index 65bedf8b4..675db0335 100644 --- a/integrations/nvidia/pydoc/config.yml +++ b/integrations/nvidia/pydoc/config.yml @@ -1,7 
+1,12 @@ loaders: - type: haystack_pydoc_tools.loaders.CustomPythonLoader search_path: [../src] - modules: [] + modules: + [ + "haystack_integrations.components.embedders.nvidia.document_embedder", + "haystack_integrations.components.embedders.nvidia.text_embedder", + "haystack_integrations.components.embedders.nvidia.models", + ] ignore_when_discovered: ["__init__"] processors: - type: filter diff --git a/integrations/nvidia/pyproject.toml b/integrations/nvidia/pyproject.toml index cded4787b..ba25812a8 100644 --- a/integrations/nvidia/pyproject.toml +++ b/integrations/nvidia/pyproject.toml @@ -116,6 +116,7 @@ unfixable = [ # Don't touch unused imports "F401", ] +extend-exclude = ["tests", "example"] [tool.ruff.isort] known-first-party = ["src"] diff --git a/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/__init__.py b/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/__init__.py new file mode 100644 index 000000000..6ad2f9f6b --- /dev/null +++ b/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/__init__.py @@ -0,0 +1,9 @@ +from .document_embedder import NvidiaDocumentEmbedder +from .models import NvidiaEmbeddingModel +from .text_embedder import NvidiaTextEmbedder + +__all__ = [ + "NvidiaDocumentEmbedder", + "NvidiaEmbeddingModel", + "NvidiaTextEmbedder", +] diff --git a/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/_schema.py b/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/_schema.py new file mode 100644 index 000000000..a0598be86 --- /dev/null +++ b/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/_schema.py @@ -0,0 +1,91 @@ +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Literal, Union + +from haystack_integrations.utils.nvidia import NvidiaCloudFunctionsClient + +from .models import NvidiaEmbeddingModel + +MAX_INPUT_STRING_LENGTH = 2048 +MAX_INPUTS = 50 + + +def 
get_model_nvcf_id(model: NvidiaEmbeddingModel, client: NvidiaCloudFunctionsClient) -> str: + """ + Returns the Nvidia Cloud Functions UUID for the given model. + """ + + available_functions = client.available_functions() + func = available_functions.get(str(model)) + if func is None: + msg = f"Model '{model}' was not found on the Nvidia Cloud Functions backend" + raise ValueError(msg) + elif func.status != "ACTIVE": + msg = f"Model '{model}' is not currently active/usable on the Nvidia Cloud Functions backend" + raise ValueError(msg) + + return func.id + + +@dataclass +class EmbeddingsRequest: + input: Union[str, List[str]] + model: Literal["query", "passage"] + encoding_format: Literal["float", "base64"] = "float" + + def __post_init__(self): + if isinstance(self.input, list): + if len(self.input) > MAX_INPUTS: + msg = f"The number of inputs should not exceed {MAX_INPUTS}" + raise ValueError(msg) + else: + self.input = [self.input] + + if len(self.input) == 0: + msg = "The number of inputs should not be 0" + raise ValueError(msg) + + if any(len(x) > MAX_INPUT_STRING_LENGTH for x in self.input): + msg = f"The length of each input should not exceed {MAX_INPUT_STRING_LENGTH} characters" + raise ValueError(msg) + + if self.encoding_format not in ["float", "base64"]: + msg = "encoding_format should be either 'float' or 'base64'" + raise ValueError(msg) + + if self.model not in ["query", "passage"]: + msg = "model should be either 'query' or 'passage'" + raise ValueError(msg) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +@dataclass +class Usage: + prompt_tokens: int + total_tokens: int + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +@dataclass +class Embeddings: + index: int + embedding: Union[List[float], str] + + +@dataclass +class EmbeddingsResponse: + data: List[Embeddings] + usage: Usage + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "EmbeddingsResponse": + try: + embeddings = [Embeddings(**x) for x in 
data["data"]] + usage = Usage(**data["usage"]) + return cls(data=embeddings, usage=usage) + except (KeyError, TypeError) as e: + msg = f"Failed to parse EmbeddingsResponse from data: {data}" + raise ValueError(msg) from e diff --git a/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/document_embedder.py b/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/document_embedder.py new file mode 100644 index 000000000..139a184b7 --- /dev/null +++ b/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/document_embedder.py @@ -0,0 +1,208 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +from haystack import Document, component, default_from_dict, default_to_dict +from haystack.utils import Secret, deserialize_secrets_inplace +from haystack_integrations.utils.nvidia import NvidiaCloudFunctionsClient +from tqdm import tqdm + +from ._schema import MAX_INPUTS, EmbeddingsRequest, EmbeddingsResponse, Usage, get_model_nvcf_id +from .models import NvidiaEmbeddingModel + + +@component +class NvidiaDocumentEmbedder: + """ + A component for embedding documents using embedding models provided by + [NVIDIA AI Foundation Endpoints](https://www.nvidia.com/en-us/ai-data-science/foundation-models/). 
+
+    Usage example:
+    ```python
+    from haystack_integrations.components.embedders.nvidia import NvidiaDocumentEmbedder, NvidiaEmbeddingModel
+
+    doc = Document(content="I love pizza!")
+
+    document_embedder = NvidiaDocumentEmbedder(model=NvidiaEmbeddingModel.NVOLVE_40K)
+    document_embedder.warm_up()
+
+    result = document_embedder.run([doc])
+    print(result["documents"][0].embedding)
+    ```
+    """
+
+    def __init__(
+        self,
+        model: Union[str, NvidiaEmbeddingModel],
+        api_key: Secret = Secret.from_env_var("NVIDIA_API_KEY"),
+        prefix: str = "",
+        suffix: str = "",
+        batch_size: int = 32,
+        progress_bar: bool = True,
+        meta_fields_to_embed: Optional[List[str]] = None,
+        embedding_separator: str = "\n",
+    ):
+        """
+        Create a NvidiaDocumentEmbedder component.
+
+        :param model:
+            Embedding model to use.
+        :param api_key:
+            API key for the NVIDIA AI Foundation Endpoints.
+        :param prefix:
+            A string to add to the beginning of each text.
+        :param suffix:
+            A string to add to the end of each text.
+        :param batch_size:
+            Number of Documents to encode at once.
+            Cannot be greater than 50.
+        :param progress_bar:
+            Whether to show a progress bar or not.
+        :param meta_fields_to_embed:
+            List of meta fields that should be embedded along with the Document text.
+        :param embedding_separator:
+            Separator used to concatenate the meta fields to the Document text.
+        """
+
+        if isinstance(model, str):
+            model = NvidiaEmbeddingModel.from_str(model)
+
+        resolved_api_key = api_key.resolve_value()
+        assert resolved_api_key is not None
+
+        # Upper-limit for the endpoint.
+        if batch_size > MAX_INPUTS:
+            msg = f"NVIDIA Cloud Functions currently support a maximum batch size of {MAX_INPUTS}."
+ raise ValueError(msg) + + self.api_key = api_key + self.model = model + self.prefix = prefix + self.suffix = suffix + self.batch_size = batch_size + self.progress_bar = progress_bar + self.meta_fields_to_embed = meta_fields_to_embed or [] + self.embedding_separator = embedding_separator + + self.client = NvidiaCloudFunctionsClient( + api_key=resolved_api_key, + headers={ + "Content-Type": "application/json", + "Accept": "application/json", + }, + ) + self.nvcf_id = None + self._initialized = False + + def warm_up(self): + """ + Initializes the component. + """ + if self._initialized: + return + + self.nvcf_id = get_model_nvcf_id(self.model, self.client) + self._initialized = True + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + api_key=self.api_key.to_dict(), + model=str(self.model), + prefix=self.prefix, + suffix=self.suffix, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + meta_fields_to_embed=self.meta_fields_to_embed, + embedding_separator=self.embedding_separator, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "NvidiaDocumentEmbedder": + """ + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. 
+ """ + data["init_parameters"]["model"] = NvidiaEmbeddingModel.from_str(data["init_parameters"]["model"]) + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) + return default_from_dict(cls, data) + + def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: + texts_to_embed = [] + for doc in documents: + meta_values_to_embed = [ + str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None + ] + text_to_embed = ( + self.prefix + self.embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self.suffix + ) + texts_to_embed.append(text_to_embed) + + return texts_to_embed + + def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List[List[float]], Dict[str, Any]]: + all_embeddings: List[List[float]] = [] + usage = Usage(prompt_tokens=0, total_tokens=0) + assert self.nvcf_id is not None + + for i in tqdm( + range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" + ): + batch = texts_to_embed[i : i + batch_size] + + request = EmbeddingsRequest(input=batch, model="passage").to_dict() + json_response = self.client.query_function(self.nvcf_id, request) + response = EmbeddingsResponse.from_dict(json_response) + + # Sort resulting embeddings by index + assert all(isinstance(r.embedding, list) for r in response.data) + sorted_embeddings: List[List[float]] = [r.embedding for r in sorted(response.data, key=lambda e: e.index)] # type: ignore + all_embeddings.extend(sorted_embeddings) + + usage.prompt_tokens += response.usage.prompt_tokens + usage.total_tokens += response.usage.total_tokens + + return all_embeddings, {"usage": usage.to_dict()} + + @component.output_types(documents=List[Document], meta=Dict[str, Any]) + def run(self, documents: List[Document]): + """ + Embed a list of Documents. + + The embedding of each Document is stored in the `embedding` field of the Document. 
+
+        :param documents:
+            A list of Documents to embed.
+        :returns:
+            A dictionary with the following keys and values:
+            - `documents` - List of processed Documents with embeddings.
+            - `meta` - Metadata on usage statistics, etc.
+        :raises RuntimeError:
+            If the component was not initialized.
+        :raises TypeError:
+            If the input is not a list of Documents.
+        """
+        if not self._initialized:
+            msg = "The embedding model has not been loaded. Please call warm_up() before running."
+            raise RuntimeError(msg)
+        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
+            msg = (
+                "NvidiaDocumentEmbedder expects a list of Documents as input."
+                "In case you want to embed a string, please use the NvidiaTextEmbedder."
+            )
+            raise TypeError(msg)
+
+        texts_to_embed = self._prepare_texts_to_embed(documents)
+        embeddings, metadata = self._embed_batch(texts_to_embed, self.batch_size)
+        for doc, emb in zip(documents, embeddings):
+            doc.embedding = emb
+
+        return {"documents": documents, "meta": metadata}
diff --git a/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/models.py b/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/models.py
new file mode 100644
index 000000000..dd11ac727
--- /dev/null
+++ b/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/models.py
@@ -0,0 +1,31 @@
+from enum import Enum
+
+
+class NvidiaEmbeddingModel(Enum):
+    """
+    [NVIDIA AI Foundation models](https://catalog.ngc.nvidia.com/ai-foundation-models)
+    used for generating embeddings.
+    """
+
+    #: [Retrieval QA Embedding Model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/nvolve-40k).
+    NVOLVE_40K = "playground_nvolveqa_40k"
+
+    def __str__(self):
+        return self.value
+
+    @classmethod
+    def from_str(cls, string: str) -> "NvidiaEmbeddingModel":
+        """
+        Create an embedding model from a string.
+
+        :param string:
+            String to convert.
+        :returns:
+            Embedding model.
+ """ + enum_map = {e.value: e for e in NvidiaEmbeddingModel} + emb_model = enum_map.get(string) + if emb_model is None: + msg = f"Unknown embedding model '{string}'. Supported modes are: {list(enum_map.keys())}" + raise ValueError(msg) + return emb_model diff --git a/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/text_embedder.py b/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/text_embedder.py new file mode 100644 index 000000000..43d62ed92 --- /dev/null +++ b/integrations/nvidia/src/haystack_integrations/components/embedders/nvidia/text_embedder.py @@ -0,0 +1,144 @@ +from typing import Any, Dict, List, Union + +from haystack import component, default_from_dict, default_to_dict +from haystack.utils import Secret, deserialize_secrets_inplace +from haystack_integrations.utils.nvidia import NvidiaCloudFunctionsClient + +from ._schema import EmbeddingsRequest, EmbeddingsResponse, get_model_nvcf_id +from .models import NvidiaEmbeddingModel + + +@component +class NvidiaTextEmbedder: + """ + A component for embedding strings using embedding models provided by + [NVIDIA AI Foundation Endpoints](https://www.nvidia.com/en-us/ai-data-science/foundation-models/). + + For models that differentiate between query and document inputs, + this component embeds the input string as a query. + + Usage example: + ```python + from haystack_integrations.components.embedders.nvidia import NvidiaTextEmbedder, NvidiaEmbeddingModel + + text_to_embed = "I love pizza!" + + text_embedder = NvidiaTextEmbedder(model=NvidiaEmbeddingModel.NVOLVE_40K) + text_embedder.warm_up() + + print(text_embedder.run(text_to_embed)) + ``` + """ + + def __init__( + self, + model: Union[str, NvidiaEmbeddingModel], + api_key: Secret = Secret.from_env_var("NVIDIA_API_KEY"), + prefix: str = "", + suffix: str = "", + ): + """ + Create a NvidiaTextEmbedder component. + + :param model: + Embedding model to use. 
+        :param api_key:
+            API key for the NVIDIA AI Foundation Endpoints.
+        :param prefix:
+            A string to add to the beginning of each text.
+        :param suffix:
+            A string to add to the end of each text.
+        """
+
+        if isinstance(model, str):
+            model = NvidiaEmbeddingModel.from_str(model)
+
+        resolved_api_key = api_key.resolve_value()
+        assert resolved_api_key is not None
+
+        self.api_key = api_key
+        self.model = model
+        self.prefix = prefix
+        self.suffix = suffix
+        self.client = NvidiaCloudFunctionsClient(
+            api_key=resolved_api_key,
+            headers={
+                "Content-Type": "application/json",
+                "Accept": "application/json",
+            },
+        )
+        self.nvcf_id = None
+        self._initialized = False
+
+    def warm_up(self):
+        """
+        Initializes the component.
+        """
+        if self._initialized:
+            return
+
+        self.nvcf_id = get_model_nvcf_id(self.model, self.client)
+        self._initialized = True
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            api_key=self.api_key.to_dict(),
+            model=str(self.model),
+            prefix=self.prefix,
+            suffix=self.suffix,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "NvidiaTextEmbedder":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize from.
+        :returns:
+            The deserialized component.
+        """
+        data["init_parameters"]["model"] = NvidiaEmbeddingModel.from_str(data["init_parameters"]["model"])
+        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
+        return default_from_dict(cls, data)
+
+    @component.output_types(embedding=List[float], meta=Dict[str, Any])
+    def run(self, text: str):
+        """
+        Embed a string.
+
+        :param text:
+            The text to embed.
+        :returns:
+            A dictionary with the following keys and values:
+            - `embedding` - Embedding of the text.
+            - `meta` - Metadata on usage statistics, etc.
+        :raises RuntimeError:
+            If the component was not initialized.
+ :raises TypeError: + If the input is not a string. + """ + if not self._initialized: + msg = "The embedding model has not been loaded. Please call warm_up() before running." + raise RuntimeError(msg) + if not isinstance(text, str): + msg = ( + "NvidiaTextEmbedder expects a string as an input." + "In case you want to embed a list of Documents, please use the NvidiaDocumentEmbedder." + ) + raise TypeError(msg) + + assert self.nvcf_id is not None + text_to_embed = self.prefix + text + self.suffix + request = EmbeddingsRequest(input=text_to_embed, model="query").to_dict() + json_response = self.client.query_function(self.nvcf_id, request) + response = EmbeddingsResponse.from_dict(json_response) + + return {"embedding": response.data[0].embedding, "meta": {"usage": response.usage.to_dict()}} diff --git a/integrations/nvidia/src/haystack_integrations/utils/nvidia/__init__.py b/integrations/nvidia/src/haystack_integrations/utils/nvidia/__init__.py new file mode 100644 index 000000000..b8015cfda --- /dev/null +++ b/integrations/nvidia/src/haystack_integrations/utils/nvidia/__init__.py @@ -0,0 +1,3 @@ +from .client import NvidiaCloudFunctionsClient + +__all__ = ["NvidiaCloudFunctionsClient"] diff --git a/integrations/nvidia/src/haystack_integrations/utils/nvidia/client.py b/integrations/nvidia/src/haystack_integrations/utils/nvidia/client.py new file mode 100644 index 000000000..5227e8c45 --- /dev/null +++ b/integrations/nvidia/src/haystack_integrations/utils/nvidia/client.py @@ -0,0 +1,61 @@ +import copy +from dataclasses import dataclass +from typing import Dict, Optional + +import requests + +FUNCTIONS_ENDPOINT = "https://api.nvcf.nvidia.com/v2/nvcf/functions" +INVOKE_ENDPOINT = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions" +STATUS_ENDPOINT = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/status" + +ACCEPTED_STATUS_CODE = 202 + + +@dataclass +class AvailableNvidiaCloudFunctions: + name: str + id: str + status: Optional[str] = None + + +class 
NvidiaCloudFunctionsClient: + def __init__(self, *, api_key: str, headers: Dict[str, str], timeout: int = 60): + self.api_key = api_key + self.fetch_url_format = STATUS_ENDPOINT + self.headers = copy.deepcopy(headers) + self.headers.update( + { + "Authorization": f"Bearer {api_key}", + } + ) + self.timeout = timeout + self.session = requests.Session() + + def query_function(self, func_id: str, payload: Dict[str, str]) -> Dict[str, str]: + invoke_url = f"{INVOKE_ENDPOINT}/{func_id}" + + response = self.session.post(invoke_url, headers=self.headers, json=payload, timeout=self.timeout) + request_id = response.headers.get("NVCF-REQID") + if request_id is None: + msg = "NVCF-REQID header not found in response" + raise ValueError(msg) + + while response.status_code == ACCEPTED_STATUS_CODE: + fetch_url = f"{self.fetch_url_format}/{request_id}" + response = self.session.get(fetch_url, headers=self.headers, timeout=self.timeout) + + response.raise_for_status() + return response.json() + + def available_functions(self) -> Dict[str, AvailableNvidiaCloudFunctions]: + response = self.session.get(FUNCTIONS_ENDPOINT, headers=self.headers, timeout=self.timeout) + response.raise_for_status() + + return { + f["name"]: AvailableNvidiaCloudFunctions( + name=f["name"], + id=f["id"], + status=f.get("status"), + ) + for f in response.json()["functions"] + } diff --git a/integrations/nvidia/tests/test_document_embedder.py b/integrations/nvidia/tests/test_document_embedder.py new file mode 100644 index 000000000..4f19633e8 --- /dev/null +++ b/integrations/nvidia/tests/test_document_embedder.py @@ -0,0 +1,287 @@ +import os + +import pytest +from haystack import Document +from haystack.utils import Secret +from haystack_integrations.components.embedders.nvidia import NvidiaDocumentEmbedder, NvidiaEmbeddingModel +from haystack_integrations.utils.nvidia.client import AvailableNvidiaCloudFunctions + + +class MockClient: + def query_function(self, func_id, payload): + inputs = payload["input"] + 
data = [{"index": i, "embedding": [0.1, 0.2, 0.3]} for i in range(len(inputs))] + return {"data": data, "usage": {"total_tokens": 4, "prompt_tokens": 4}} + + def available_functions(self): + return { + NvidiaEmbeddingModel.NVOLVE_40K.value: AvailableNvidiaCloudFunctions( + name=NvidiaEmbeddingModel.NVOLVE_40K.value, id="fake-id", status="ACTIVE" + ) + } + + +class TestNvidiaDocumentEmbedder: + def test_init_default(self, monkeypatch): + monkeypatch.setenv("NVIDIA_API_KEY", "fake-api-key") + embedder = NvidiaDocumentEmbedder(NvidiaEmbeddingModel.NVOLVE_40K) + + assert embedder.api_key == Secret.from_env_var("NVIDIA_API_KEY") + assert embedder.model == NvidiaEmbeddingModel.NVOLVE_40K + assert embedder.prefix == "" + assert embedder.suffix == "" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.meta_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + def test_init_with_parameters(self): + embedder = NvidiaDocumentEmbedder( + api_key=Secret.from_token("fake-api-key"), + model="playground_nvolveqa_40k", + prefix="prefix", + suffix="suffix", + batch_size=30, + progress_bar=False, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + + assert embedder.api_key == Secret.from_token("fake-api-key") + assert embedder.model == NvidiaEmbeddingModel.NVOLVE_40K + assert embedder.prefix == "prefix" + assert embedder.suffix == "suffix" + assert embedder.batch_size == 30 + assert embedder.progress_bar is False + assert embedder.meta_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + def test_init_fail_wo_api_key(self, monkeypatch): + monkeypatch.delenv("NVIDIA_API_KEY", raising=False) + with pytest.raises(ValueError): + NvidiaDocumentEmbedder(NvidiaEmbeddingModel.NVOLVE_40K) + + def test_init_fail_batch_size(self, monkeypatch): + with pytest.raises(ValueError): + NvidiaDocumentEmbedder(model="playground_nvolveqa_40k", batch_size=55) + + def test_to_dict(self, 
monkeypatch): + monkeypatch.setenv("NVIDIA_API_KEY", "fake-api-key") + component = NvidiaDocumentEmbedder("playground_nvolveqa_40k") + data = component.to_dict() + assert data == { + "type": "haystack_integrations.components.embedders.nvidia.document_embedder.NvidiaDocumentEmbedder", + "init_parameters": { + "api_key": {"env_vars": ["NVIDIA_API_KEY"], "strict": True, "type": "env_var"}, + "model": "playground_nvolveqa_40k", + "prefix": "", + "suffix": "", + "batch_size": 32, + "progress_bar": True, + "meta_fields_to_embed": [], + "embedding_separator": "\n", + }, + } + + def test_to_dict_with_custom_init_parameters(self, monkeypatch): + monkeypatch.setenv("NVIDIA_API_KEY", "fake-api-key") + component = NvidiaDocumentEmbedder( + model="playground_nvolveqa_40k", + prefix="prefix", + suffix="suffix", + batch_size=10, + progress_bar=False, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + data = component.to_dict() + assert data == { + "type": "haystack_integrations.components.embedders.nvidia.document_embedder.NvidiaDocumentEmbedder", + "init_parameters": { + "api_key": {"env_vars": ["NVIDIA_API_KEY"], "strict": True, "type": "env_var"}, + "model": "playground_nvolveqa_40k", + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 10, + "progress_bar": False, + "meta_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + + def test_prepare_texts_to_embed_w_metadata(self): + documents = [ + Document(content=f"document number {i}:\ncontent", meta={"meta_field": f"meta_value {i}"}) for i in range(5) + ] + + embedder = NvidiaDocumentEmbedder( + "playground_nvolveqa_40k", + api_key=Secret.from_token("fake-api-key"), + meta_fields_to_embed=["meta_field"], + embedding_separator=" | ", + ) + + prepared_texts = embedder._prepare_texts_to_embed(documents) + + # note that newline is replaced by space + assert prepared_texts == [ + "meta_value 0 | document number 0:\ncontent", + "meta_value 1 | document number 1:\ncontent", + 
"meta_value 2 | document number 2:\ncontent", + "meta_value 3 | document number 3:\ncontent", + "meta_value 4 | document number 4:\ncontent", + ] + + def test_prepare_texts_to_embed_w_suffix(self): + documents = [Document(content=f"document number {i}") for i in range(5)] + + embedder = NvidiaDocumentEmbedder( + "playground_nvolveqa_40k", + api_key=Secret.from_token("fake-api-key"), + prefix="my_prefix ", + suffix=" my_suffix", + ) + + prepared_texts = embedder._prepare_texts_to_embed(documents) + + assert prepared_texts == [ + "my_prefix document number 0 my_suffix", + "my_prefix document number 1 my_suffix", + "my_prefix document number 2 my_suffix", + "my_prefix document number 3 my_suffix", + "my_prefix document number 4 my_suffix", + ] + + def test_embed_batch(self): + texts = ["text 1", "text 2", "text 3", "text 4", "text 5"] + + embedder = NvidiaDocumentEmbedder( + "playground_nvolveqa_40k", + api_key=Secret.from_token("fake-api-key"), + ) + embedder.client = MockClient() + embedder.warm_up() + + embeddings, metadata = embedder._embed_batch(texts_to_embed=texts, batch_size=2) + + assert isinstance(embeddings, list) + assert len(embeddings) == len(texts) + for embedding in embeddings: + assert isinstance(embedding, list) + assert len(embedding) == 3 + assert all(isinstance(x, float) for x in embedding) + + assert metadata == {"usage": {"prompt_tokens": 3 * 4, "total_tokens": 3 * 4}} + + def test_run(self): + docs = [ + Document(content="I love cheese", meta={"topic": "Cuisine"}), + Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), + ] + + model = "playground_nvolveqa_40k" + embedder = NvidiaDocumentEmbedder( + api_key=Secret.from_token("fake-api-key"), + model=model, + prefix="prefix ", + suffix=" suffix", + meta_fields_to_embed=["topic"], + embedding_separator=" | ", + ) + embedder.client = MockClient() + embedder.warm_up() + + result = embedder.run(documents=docs) + + documents_with_embeddings = result["documents"] + 
metadata = result["meta"] + + assert isinstance(documents_with_embeddings, list) + assert len(documents_with_embeddings) == len(docs) + for doc in documents_with_embeddings: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert len(doc.embedding) == 3 + assert all(isinstance(x, float) for x in doc.embedding) + assert metadata == {"usage": {"prompt_tokens": 4, "total_tokens": 4}} + + def test_run_custom_batch_size(self): + docs = [ + Document(content="I love cheese", meta={"topic": "Cuisine"}), + Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), + ] + model = "playground_nvolveqa_40k" + embedder = NvidiaDocumentEmbedder( + api_key=Secret.from_token("fake-api-key"), + model=model, + prefix="prefix ", + suffix=" suffix", + meta_fields_to_embed=["topic"], + embedding_separator=" | ", + batch_size=1, + ) + embedder.client = MockClient() + embedder.warm_up() + + result = embedder.run(documents=docs) + + documents_with_embeddings = result["documents"] + metadata = result["meta"] + + assert isinstance(documents_with_embeddings, list) + assert len(documents_with_embeddings) == len(docs) + for doc in documents_with_embeddings: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert len(doc.embedding) == 3 + assert all(isinstance(x, float) for x in doc.embedding) + + assert metadata == {"usage": {"prompt_tokens": 2 * 4, "total_tokens": 2 * 4}} + + def test_run_wrong_input_format(self): + embedder = NvidiaDocumentEmbedder("playground_nvolveqa_40k", api_key=Secret.from_token("fake-api-key")) + embedder.client = MockClient() + embedder.warm_up() + + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="NvidiaDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=string_input) + + with pytest.raises(TypeError, match="NvidiaDocumentEmbedder expects a list of Documents as input"): + 
embedder.run(documents=list_integers_input) + + def test_run_on_empty_list(self): + embedder = NvidiaDocumentEmbedder("playground_nvolveqa_40k", api_key=Secret.from_token("fake-api-key")) + embedder.client = MockClient() + embedder.warm_up() + + empty_list_input = [] + result = embedder.run(documents=empty_list_input) + + assert result["documents"] is not None + assert not result["documents"] # empty list + + @pytest.mark.skipif( + not os.environ.get("NVIDIA_API_KEY", None), + reason="Export an env var called NVIDIA_API_KEY containing the Nvidia API key to run this test.", + ) + @pytest.mark.integration + def test_run_integration(self): + embedder = NvidiaDocumentEmbedder("playground_nvolveqa_40k") + embedder.warm_up() + + docs = [ + Document(content="I love cheese", meta={"topic": "Cuisine"}), + Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), + ] + + result = embedder.run(docs) + docs_with_embeddings = result["documents"] + + assert isinstance(docs_with_embeddings, list) + assert len(docs_with_embeddings) == len(docs) + for doc in docs_with_embeddings: + assert isinstance(doc.embedding, list) + assert isinstance(doc.embedding[0], float) diff --git a/integrations/nvidia/tests/test_placeholder.py b/integrations/nvidia/tests/test_placeholder.py deleted file mode 100644 index 3ada1ee4e..000000000 --- a/integrations/nvidia/tests/test_placeholder.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_placeholder(): - assert True diff --git a/integrations/nvidia/tests/test_text_embedder.py b/integrations/nvidia/tests/test_text_embedder.py new file mode 100644 index 000000000..b4239308b --- /dev/null +++ b/integrations/nvidia/tests/test_text_embedder.py @@ -0,0 +1,119 @@ +import os + +import pytest +from haystack.utils import Secret +from haystack_integrations.components.embedders.nvidia import NvidiaEmbeddingModel, NvidiaTextEmbedder +from haystack_integrations.utils.nvidia.client import AvailableNvidiaCloudFunctions + + +class MockClient: + 
def query_function(self, func_id, payload): + data = [{"index": 0, "embedding": [0.1, 0.2, 0.3]}] + return {"data": data, "usage": {"total_tokens": 4, "prompt_tokens": 4}} + + def available_functions(self): + return { + NvidiaEmbeddingModel.NVOLVE_40K.value: AvailableNvidiaCloudFunctions( + name=NvidiaEmbeddingModel.NVOLVE_40K.value, id="fake-id", status="ACTIVE" + ) + } + + +class TestNvidiaTextEmbedder: + def test_init_default(self, monkeypatch): + monkeypatch.setenv("NVIDIA_API_KEY", "fake-api-key") + embedder = NvidiaTextEmbedder(NvidiaEmbeddingModel.NVOLVE_40K) + + assert embedder.api_key == Secret.from_env_var("NVIDIA_API_KEY") + assert embedder.model == NvidiaEmbeddingModel.NVOLVE_40K + assert embedder.prefix == "" + assert embedder.suffix == "" + + def test_init_with_parameters(self): + embedder = NvidiaTextEmbedder( + api_key=Secret.from_token("fake-api-key"), + model="playground_nvolveqa_40k", + prefix="prefix", + suffix="suffix", + ) + assert embedder.api_key == Secret.from_token("fake-api-key") + assert embedder.model == NvidiaEmbeddingModel.NVOLVE_40K + assert embedder.prefix == "prefix" + assert embedder.suffix == "suffix" + + def test_init_fail_wo_api_key(self, monkeypatch): + monkeypatch.delenv("NVIDIA_API_KEY", raising=False) + with pytest.raises(ValueError): + NvidiaTextEmbedder(NvidiaEmbeddingModel.NVOLVE_40K) + + def test_to_dict(self, monkeypatch): + monkeypatch.setenv("NVIDIA_API_KEY", "fake-api-key") + component = NvidiaTextEmbedder(NvidiaEmbeddingModel.NVOLVE_40K) + data = component.to_dict() + assert data == { + "type": "haystack_integrations.components.embedders.nvidia.text_embedder.NvidiaTextEmbedder", + "init_parameters": { + "api_key": {"env_vars": ["NVIDIA_API_KEY"], "strict": True, "type": "env_var"}, + "model": "playground_nvolveqa_40k", + "prefix": "", + "suffix": "", + }, + } + + def test_to_dict_with_custom_init_parameters(self, monkeypatch): + monkeypatch.setenv("NVIDIA_API_KEY", "fake-api-key") + component = NvidiaTextEmbedder( 
+ model=NvidiaEmbeddingModel.NVOLVE_40K, + prefix="prefix", + suffix="suffix", + ) + data = component.to_dict() + assert data == { + "type": "haystack_integrations.components.embedders.nvidia.text_embedder.NvidiaTextEmbedder", + "init_parameters": { + "api_key": {"env_vars": ["NVIDIA_API_KEY"], "strict": True, "type": "env_var"}, + "model": "playground_nvolveqa_40k", + "prefix": "prefix", + "suffix": "suffix", + }, + } + + def test_run(self): + embedder = NvidiaTextEmbedder( + "playground_nvolveqa_40k", api_key=Secret.from_token("fake-api-key"), prefix="prefix ", suffix=" suffix" + ) + embedder.client = MockClient() + embedder.warm_up() + result = embedder.run(text="The food was delicious") + + assert len(result["embedding"]) == 3 + assert all(isinstance(x, float) for x in result["embedding"]) + assert result["meta"] == { + "usage": {"prompt_tokens": 4, "total_tokens": 4}, + } + + def test_run_wrong_input_format(self): + embedder = NvidiaTextEmbedder("playground_nvolveqa_40k", api_key=Secret.from_token("fake-api-key")) + embedder.client = MockClient() + embedder.warm_up() + + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="NvidiaTextEmbedder expects a string as an input"): + embedder.run(text=list_integers_input) + + @pytest.mark.skipif( + not os.environ.get("NVIDIA_API_KEY", None), + reason="Export an env var called NVIDIA_API_KEY containing the Nvidia API key to run this test.", + ) + @pytest.mark.integration + def test_run_integration(self): + embedder = NvidiaTextEmbedder("playground_nvolveqa_40k") + embedder.warm_up() + + result = embedder.run("A transformer is a deep learning architecture") + embedding = result["embedding"] + meta = result["meta"] + + assert all(isinstance(x, float) for x in embedding) + assert "usage" in meta