From 17359778a941cf055b0d8eac3b3aa39e489ffb12 Mon Sep 17 00:00:00 2001 From: Julian De Vita <53025729+jmdevita@users.noreply.github.com> Date: Wed, 28 Feb 2024 03:20:44 -0500 Subject: [PATCH 1/4] Changed Default Ollama Embedding models to supported model: nomic-embed-text (#490) * Changed Embedding model to supported model: nomic-embed-text * Updated workflow yml with support for llm and embedding model --- .github/workflows/ollama.yml | 6 +++++- .../components/embedders/ollama/document_embedder.py | 4 ++-- .../components/embedders/ollama/text_embedder.py | 4 ++-- integrations/ollama/tests/test_document_embedder.py | 12 ++++++------ integrations/ollama/tests/test_text_embedder.py | 6 +++--- 5 files changed, 18 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ollama.yml b/.github/workflows/ollama.yml index 962537390..055f63dd0 100644 --- a/.github/workflows/ollama.yml +++ b/.github/workflows/ollama.yml @@ -22,6 +22,7 @@ env: PYTHONUNBUFFERED: "1" FORCE_COLOR: "1" LLM_FOR_TESTS: "orca-mini" + EMBEDDER_FOR_TESTS: "nomic-embed-text" jobs: run: @@ -55,7 +56,10 @@ jobs: run: hatch run lint:all - name: Pull the LLM in the Ollama service - run: docker exec ollama ollama pull ${{ env.LLM_FOR_TESTS }} + run: docker exec ollama ollama pull ${{ env.LLM_FOR_TESTS }} + + - name: Pull the Embedding Model in the Ollama service + run: docker exec ollama ollama pull ${{ env.EMBEDDER_FOR_TESTS }} - name: Generate docs if: matrix.python-version == '3.9' && runner.os == 'Linux' diff --git a/integrations/ollama/src/haystack_integrations/components/embedders/ollama/document_embedder.py b/integrations/ollama/src/haystack_integrations/components/embedders/ollama/document_embedder.py index 17b32f065..6e3273e1c 100644 --- a/integrations/ollama/src/haystack_integrations/components/embedders/ollama/document_embedder.py +++ b/integrations/ollama/src/haystack_integrations/components/embedders/ollama/document_embedder.py @@ -9,7 +9,7 @@ class OllamaDocumentEmbedder: def __init__( self, - model: str = "orca-mini", + model: str = "nomic-embed-text", url: str = "http://localhost:11434/api/embeddings", generation_kwargs: Optional[Dict[str, Any]] = None, timeout: int = 120, @@ -21,7 +21,7 @@ def __init__( ): """ :param model: The name of the model to use. The model should be available in the running Ollama instance. - Default is "orca-mini". + Default is "nomic-embed-text". "https://ollama.com/library/nomic-embed-text" :param url: The URL of the chat endpoint of a running Ollama instance. Default is "http://localhost:11434/api/embeddings". :param generation_kwargs: Optional arguments to pass to the Ollama generation endpoint, such as temperature, diff --git a/integrations/ollama/src/haystack_integrations/components/embedders/ollama/text_embedder.py b/integrations/ollama/src/haystack_integrations/components/embedders/ollama/text_embedder.py index e27fd9ff4..e2ef136b4 100644 --- a/integrations/ollama/src/haystack_integrations/components/embedders/ollama/text_embedder.py +++ b/integrations/ollama/src/haystack_integrations/components/embedders/ollama/text_embedder.py @@ -8,14 +8,14 @@ class OllamaTextEmbedder: def __init__( self, - model: str = "orca-mini", + model: str = "nomic-embed-text", url: str = "http://localhost:11434/api/embeddings", generation_kwargs: Optional[Dict[str, Any]] = None, timeout: int = 120, ): """ :param model: The name of the model to use. The model should be available in the running Ollama instance. - Default is "orca-mini". + Default is "nomic-embed-text". 
"https://ollama.com/library/nomic-embed-text" :param url: The URL of the chat endpoint of a running Ollama instance. Default is "http://localhost:11434/api/embeddings". :param generation_kwargs: Optional arguments to pass to the Ollama generation endpoint, such as temperature, diff --git a/integrations/ollama/tests/test_document_embedder.py b/integrations/ollama/tests/test_document_embedder.py index a5694db33..012ad9eae 100644 --- a/integrations/ollama/tests/test_document_embedder.py +++ b/integrations/ollama/tests/test_document_embedder.py @@ -11,11 +11,11 @@ def test_init_defaults(self): assert embedder.timeout == 120 assert embedder.generation_kwargs == {} assert embedder.url == "http://localhost:11434/api/embeddings" - assert embedder.model == "orca-mini" + assert embedder.model == "nomic-embed-text" def test_init(self): embedder = OllamaDocumentEmbedder( - model="orca-mini", + model="nomic-embed-text", url="http://my-custom-endpoint:11434/api/embeddings", generation_kwargs={"temperature": 0.5}, timeout=3000, @@ -24,7 +24,7 @@ def test_init(self): assert embedder.timeout == 3000 assert embedder.generation_kwargs == {"temperature": 0.5} assert embedder.url == "http://my-custom-endpoint:11434/api/embeddings" - assert embedder.model == "orca-mini" + assert embedder.model == "nomic-embed-text" @pytest.mark.integration def test_model_not_found(self): @@ -35,17 +35,17 @@ def test_model_not_found(self): @pytest.mark.integration def import_text_in_embedder(self): - embedder = OllamaDocumentEmbedder(model="orca-mini") + embedder = OllamaDocumentEmbedder(model="nomic-embed-text") with pytest.raises(TypeError): embedder.run("This is a text string. This should not work.") @pytest.mark.integration def test_run(self): - embedder = OllamaDocumentEmbedder(model="orca-mini") + embedder = OllamaDocumentEmbedder(model="nomic-embed-text") list_of_docs = [Document(content="This is a document containing some text.")] reply = embedder.run(list_of_docs) assert isinstance(reply, dict) assert all(isinstance(element, float) for element in reply["documents"][0].embedding) - assert reply["meta"]["model"] == "orca-mini" + assert reply["meta"]["model"] == "nomic-embed-text" diff --git a/integrations/ollama/tests/test_text_embedder.py b/integrations/ollama/tests/test_text_embedder.py index 9d3321e64..f4d45afec 100644 --- a/integrations/ollama/tests/test_text_embedder.py +++ b/integrations/ollama/tests/test_text_embedder.py @@ -10,7 +10,7 @@ def test_init_defaults(self): assert embedder.timeout == 120 assert embedder.generation_kwargs == {} assert embedder.url == "http://localhost:11434/api/embeddings" - assert embedder.model == "orca-mini" + assert embedder.model == "nomic-embed-text" def test_init(self): embedder = OllamaTextEmbedder( @@ -34,10 +34,10 @@ def test_model_not_found(self): @pytest.mark.integration def test_run(self): - embedder = OllamaTextEmbedder(model="orca-mini") + embedder = OllamaTextEmbedder(model="nomic-embed-text") reply = embedder.run("hello") assert isinstance(reply, dict) assert all(isinstance(element, float) for element in reply["embedding"]) - assert reply["meta"]["model"] == "orca-mini" + assert reply["meta"]["model"] == "nomic-embed-text" From 869b2597df56783f154d75713986ff97f13b9b63 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 28 Feb 2024 12:15:09 +0100 Subject: [PATCH 2/4] Update API docs (#494) --- integrations/ollama/pydoc/config.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integrations/ollama/pydoc/config.yml 
b/integrations/ollama/pydoc/config.yml index 38de37211..45c336b85 100644 --- a/integrations/ollama/pydoc/config.yml +++ b/integrations/ollama/pydoc/config.yml @@ -3,7 +3,9 @@ loaders: search_path: [../src] modules: [ "haystack_integrations.components.generators.ollama.generator", - "haystack_integrations.components.generators.ollama.chat.chat_generator" + "haystack_integrations.components.generators.ollama.chat.chat_generator", + "haystack_integrations.components.embedders.ollama.document_embedder", + "haystack_integrations.components.embedders.ollama.text_embedder", ] ignore_when_discovered: ["__init__"] processors: @@ -26,4 +28,4 @@ renderer: descriptive_module_title: true add_method_class_prefix: true add_member_class_prefix: false - filename: _readme_ollama.md \ No newline at end of file + filename: _readme_ollama.md From f758de9361272d8b65ed03a78d59e95c5d464bde Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 28 Feb 2024 15:56:50 +0100 Subject: [PATCH 3/4] feat!: Add support for Optimum optimizers and quantizers (#496) * feat!: Add support for Optimum optimizers and quantizers * Fix docstring typo * Apply suggestions from code review Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- .github/workflows/optimum.yml | 6 +- integrations/optimum/pydoc/config.yml | 2 + .../components/embedders/optimum/__init__.py | 12 +- .../components/embedders/optimum/_backend.py | 97 +++++++++++-- .../embedders/optimum/optimization.py | 105 ++++++++++++++ .../optimum/optimum_document_embedder.py | 100 +++++++++----- .../optimum/optimum_text_embedder.py | 129 +++++++++++------- .../components/embedders/optimum/pooling.py | 7 +- .../embedders/optimum/quantization.py | 105 ++++++++++++++ .../tests/test_optimum_document_embedder.py | 85 ++++++++++-- .../tests/test_optimum_text_embedder.py | 39 +++++- 11 files changed, 561 insertions(+), 126 deletions(-) create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py diff --git a/.github/workflows/optimum.yml b/.github/workflows/optimum.yml index 3b0d137da..f5f59ec89 100644 --- a/.github/workflows/optimum.yml +++ b/.github/workflows/optimum.yml @@ -52,9 +52,9 @@ jobs: if: matrix.python-version == '3.9' && runner.os == 'Linux' run: hatch run lint:all - - name: Generate docs - if: matrix.python-version == '3.9' && runner.os == 'Linux' - run: hatch run docs + # - name: Generate docs + # if: matrix.python-version == '3.9' && runner.os == 'Linux' + # run: hatch run docs - name: Run tests run: hatch run cov diff --git a/integrations/optimum/pydoc/config.yml b/integrations/optimum/pydoc/config.yml index 617eb4aed..996678c55 100644 --- a/integrations/optimum/pydoc/config.yml +++ b/integrations/optimum/pydoc/config.yml @@ -6,6 +6,8 @@ loaders: "haystack_integrations.components.embedders.optimum.optimum_document_embedder", "haystack_integrations.components.embedders.optimum.optimum_text_embedder", "haystack_integrations.components.embedders.optimum.pooling", + "haystack_integrations.components.embedders.optimum.optimization", + "haystack_integrations.components.embedders.optimum.quantization", ] ignore_when_discovered: ["__init__"] processors: diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/__init__.py 
b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/__init__.py index e2ab2d6b7..02e56b34c 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/__init__.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/__init__.py @@ -2,8 +2,18 @@ # # SPDX-License-Identifier: Apache-2.0 +from .optimization import OptimumEmbedderOptimizationConfig, OptimumEmbedderOptimizationMode from .optimum_document_embedder import OptimumDocumentEmbedder from .optimum_text_embedder import OptimumTextEmbedder from .pooling import OptimumEmbedderPooling +from .quantization import OptimumEmbedderQuantizationConfig, OptimumEmbedderQuantizationMode -__all__ = ["OptimumDocumentEmbedder", "OptimumEmbedderPooling", "OptimumTextEmbedder"] +__all__ = [ + "OptimumDocumentEmbedder", + "OptimumEmbedderOptimizationMode", + "OptimumEmbedderOptimizationConfig", + "OptimumEmbedderPooling", + "OptimumEmbedderQuantizationMode", + "OptimumEmbedderQuantizationConfig", + "OptimumTextEmbedder", +] diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py index fc4f0b1ae..a6d226ecc 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py @@ -1,7 +1,8 @@ import copy import json from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -11,10 +12,17 @@ from sentence_transformers.models import Pooling as SentenceTransformerPoolingLayer from tqdm import tqdm from transformers import AutoTokenizer +from transformers.modeling_outputs import BaseModelOutput -from optimum.onnxruntime import ORTModelForFeatureExtraction +from optimum.onnxruntime import ( + ORTModelForFeatureExtraction, + ORTOptimizer, + ORTQuantizer, +) +from .optimization import OptimumEmbedderOptimizationConfig from .pooling import OptimumEmbedderPooling +from .quantization import OptimumEmbedderQuantizationConfig @dataclass @@ -29,16 +37,29 @@ class _EmbedderParams: progress_bar: bool pooling_mode: Optional[Union[str, OptimumEmbedderPooling]] model_kwargs: Optional[Dict[str, Any]] + working_dir: Optional[str] + optimizer_settings: Optional[OptimumEmbedderOptimizationConfig] + quantizer_settings: Optional[OptimumEmbedderQuantizationConfig] def serialize(self) -> Dict[str, Any]: out = {} for field in self.__dataclass_fields__.keys(): + if field in [ + "pooling_mode", + "token", + "optimizer_settings", + "quantizer_settings", + ]: + continue out[field] = copy.deepcopy(getattr(self, field)) # Fixups. 
assert isinstance(self.pooling_mode, OptimumEmbedderPooling) - out["pooling_mode"] = self.pooling_mode.value + out["pooling_mode"] = str(self.pooling_mode) out["token"] = self.token.to_dict() if self.token else None + out["optimizer_settings"] = self.optimizer_settings.to_dict() if self.optimizer_settings else None + out["quantizer_settings"] = self.quantizer_settings.to_dict() if self.quantizer_settings else None + out["model_kwargs"].pop("use_auth_token", None) serialize_hf_model_kwargs(out["model_kwargs"]) return out @@ -46,6 +67,11 @@ def serialize(self) -> Dict[str, Any]: @classmethod def deserialize_inplace(cls, data: Dict[str, Any]) -> Dict[str, Any]: data["pooling_mode"] = OptimumEmbedderPooling.from_str(data["pooling_mode"]) + if data["optimizer_settings"] is not None: + data["optimizer_settings"] = OptimumEmbedderOptimizationConfig.from_dict(data["optimizer_settings"]) + if data["quantizer_settings"] is not None: + data["quantizer_settings"] = OptimumEmbedderQuantizationConfig.from_dict(data["quantizer_settings"]) + deserialize_secrets_inplace(data, keys=["token"]) deserialize_hf_model_kwargs(data["model_kwargs"]) return data @@ -71,6 +97,11 @@ def __init__(self, params: _EmbedderParams): params.model_kwargs = params.model_kwargs or {} + if params.optimizer_settings or params.quantizer_settings: + if not params.working_dir: + msg = "Working directory is required for optimization and quantization" + raise ValueError(msg) + # Check if the model_kwargs contain the parameters, otherwise, populate them with values from init parameters params.model_kwargs.setdefault("model_id", params.model) params.model_kwargs.setdefault("provider", params.onnx_execution_provider) @@ -82,18 +113,48 @@ def __init__(self, params: _EmbedderParams): self.pooling_layer = None def warm_up(self): - self.model = ORTModelForFeatureExtraction.from_pretrained(**self.params.model_kwargs, export=True) + assert self.params.model_kwargs + model_kwargs = copy.deepcopy(self.params.model_kwargs) + model = ORTModelForFeatureExtraction.from_pretrained(**model_kwargs, export=True) + + # Model ID will be passed explicitly if optimization/quantization is enabled. + model_kwargs.pop("model_id", None) + + optimized_model = False + if self.params.optimizer_settings: + assert self.params.working_dir + optimizer = ORTOptimizer.from_pretrained(model) + save_dir = optimizer.optimize( + save_dir=self.params.working_dir, optimization_config=self.params.optimizer_settings.to_optimum_config() + ) + model = ORTModelForFeatureExtraction.from_pretrained(model_id=save_dir, **model_kwargs) + optimized_model = True + + if self.params.quantizer_settings: + assert self.params.working_dir + + # We need to create a subfolder for models that were optimized before quantization + # since Optimum expects no more than one ONNX model in the working directory. There's + # a file name parameter, but the optimizer only returns the working directory.
+ working_dir = ( + Path(self.params.working_dir) if not optimized_model else Path(self.params.working_dir) / "quantized" + ) + quantizer = ORTQuantizer.from_pretrained(model) + save_dir = quantizer.quantize( + save_dir=working_dir, quantization_config=self.params.quantizer_settings.to_optimum_config() + ) + model = ORTModelForFeatureExtraction.from_pretrained(model_id=save_dir, **model_kwargs) + + self.model = model self.tokenizer = AutoTokenizer.from_pretrained( self.params.model, token=self.params.token.resolve_value() if self.params.token else None ) # We need the width of the embeddings to initialize the pooling layer # so we do a dummy forward pass with the model. - dummy_input = self.tokenizer(["dummy input"], padding=True, truncation=True, return_tensors="pt").to( - self.model.device - ) - dummy_output = self.model(input_ids=dummy_input["input_ids"], attention_mask=dummy_input["attention_mask"]) - width = dummy_output[0].size(dim=2) # BaseModelOutput.last_hidden_state + width = self._tokenize_and_generate_outputs(["dummy input"])[1][0].size( + dim=2 + ) # BaseModelOutput.last_hidden_state self.pooling_layer = SentenceTransformerPoolingLayer( width, @@ -105,6 +166,17 @@ def warm_up(self): pooling_mode_lasttoken=self.params.pooling_mode == OptimumEmbedderPooling.LAST_TOKEN, ) + def _tokenize_and_generate_outputs(self, texts: List[str]) -> Tuple[Dict[str, Any], BaseModelOutput]: + assert self.model is not None + assert self.tokenizer is not None + + tokenizer_outputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to( + self.model.device + ) + model_inputs = {k: v for k, v in tokenizer_outputs.items() if k in self.model.inputs_names} + model_outputs = self.model(**model_inputs) + return tokenizer_outputs, model_outputs + @property def parameters(self) -> _EmbedderParams: return self.params @@ -140,11 +212,8 @@ def embed_texts( desc="Calculating embeddings", ): batch = sentences_sorted[i : i + self.params.batch_size] - encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device) - model_output = self.model( - input_ids=encoded_input["input_ids"], attention_mask=encoded_input["attention_mask"] - ) - sentence_embeddings = self.pool_embeddings(model_output[0], encoded_input["attention_mask"].to(device)) + tokenizer_output, model_output = self._tokenize_and_generate_outputs(batch) + sentence_embeddings = self.pool_embeddings(model_output[0], tokenizer_output["attention_mask"].to(device)) all_embeddings.append(sentence_embeddings) embeddings = torch.cat(all_embeddings, dim=0) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py new file mode 100644 index 000000000..5a4447570 --- /dev/null +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py @@ -0,0 +1,105 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict + +from optimum.onnxruntime.configuration import AutoOptimizationConfig, OptimizationConfig + + +class OptimumEmbedderOptimizationMode(Enum): + """ + [ONNX Optimization Modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization.html) + supported by the Optimum Embedders. + """ + + #: Basic general optimizations. + O1 = "o1" + + #: Basic and extended general optimizations, transformers-specific fusions. + O2 = "o2" + + #: Same as O2 with Gelu approximation.
+ O3 = "o3" + + #: Same as O3 with mixed precision. + O4 = "o4" + + def __str__(self): + return self.value + + @classmethod + def from_str(cls, string: str) -> "OptimumEmbedderOptimizationMode": + """ + Create an optimization mode from a string. + + :param string: + String to convert. + :returns: + Optimization mode. + """ + enum_map = {e.value: e for e in OptimumEmbedderOptimizationMode} + opt_mode = enum_map.get(string) + if opt_mode is None: + msg = f"Unknown optimization mode '{string}'. Supported modes are: {list(enum_map.keys())}" + raise ValueError(msg) + return opt_mode + + +@dataclass(frozen=True) +class OptimumEmbedderOptimizationConfig: + """ + Configuration for Optimum Embedder Optimization. + + :param mode: + Optimization mode. + :param for_gpu: + Whether to optimize for GPUs. + """ + + mode: OptimumEmbedderOptimizationMode + for_gpu: bool = True + + def to_optimum_config(self) -> OptimizationConfig: + """ + Convert the configuration to a Optimum configuration. + + :returns: + Optimum configuration. + """ + if self.mode == OptimumEmbedderOptimizationMode.O1: + return AutoOptimizationConfig.O1(for_gpu=self.for_gpu) + elif self.mode == OptimumEmbedderOptimizationMode.O2: + return AutoOptimizationConfig.O2(for_gpu=self.for_gpu) + elif self.mode == OptimumEmbedderOptimizationMode.O3: + return AutoOptimizationConfig.O3(for_gpu=self.for_gpu) + elif self.mode == OptimumEmbedderOptimizationMode.O4: + return AutoOptimizationConfig.O4(for_gpu=self.for_gpu) + else: + msg = f"Unknown optimization mode '{self.mode}'" + raise ValueError(msg) + + def to_dict(self) -> Dict[str, Any]: + """ + Convert the configuration to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return { + "mode": str(self.mode), + "for_gpu": self.for_gpu, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "OptimumEmbedderOptimizationConfig": + """ + Create an optimization configuration from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Optimization configuration. + """ + return OptimumEmbedderOptimizationConfig( + mode=OptimumEmbedderOptimizationMode.from_str(data["mode"]), + for_gpu=data["for_gpu"], + ) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py index 2f49bd0b3..a6db47090 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py @@ -4,14 +4,17 @@ from haystack.utils import Secret from ._backend import _EmbedderBackend, _EmbedderParams +from .optimization import OptimumEmbedderOptimizationConfig from .pooling import OptimumEmbedderPooling +from .quantization import OptimumEmbedderQuantizationConfig @component class OptimumDocumentEmbedder: """ - A component for computing Document embeddings using models loaded with the HuggingFace Optimum library. - This component is designed to seamlessly inference models using the high speed ONNX runtime. + A component for computing `Document` embeddings using models loaded with the + [HuggingFace Optimum](https://huggingface.co/docs/optimum/index) library, + leveraging the ONNX runtime for high-speed inference. The embedding of each Document is stored in the `embedding` field of the Document. 
@@ -30,18 +33,6 @@ class OptimumDocumentEmbedder: # [0.017020374536514282, -0.023255806416273117, ...] ``` - - Key Features and Compatibility: - - **Primary Compatibility**: Designed to work seamlessly with any embedding model present on the Hugging Face - Hub. - - **Conversion to ONNX**: The models are converted to ONNX using the HuggingFace Optimum library. This is - performed in real-time, during the warm-up step. - - **Accelerated Inference on GPU**: Supports using different execution providers such as CUDA and TensorRT, to - accelerate ONNX Runtime inference on GPUs. - Simply pass the execution provider as the onnx_execution_provider parameter. Additonal parameters can be passed - to the model using the model_kwargs parameter. - For more details refer to the HuggingFace documentation: - https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu. """ def __init__( @@ -54,6 +45,9 @@ def __init__( onnx_execution_provider: str = "CPUExecutionProvider", pooling_mode: Optional[Union[str, OptimumEmbedderPooling]] = None, model_kwargs: Optional[Dict[str, Any]] = None, + working_dir: Optional[str] = None, + optimizer_settings: Optional[OptimumEmbedderOptimizationConfig] = None, + quantizer_settings: Optional[OptimumEmbedderQuantizationConfig] = None, batch_size: int = 32, progress_bar: bool = True, meta_fields_to_embed: Optional[List[str]] = None, @@ -62,13 +56,19 @@ def __init__( """ Create a OptimumDocumentEmbedder component. - :param model: A string representing the model id on HF Hub. - :param token: The HuggingFace token to use as HTTP bearer authorization. - :param prefix: A string to add to the beginning of each text. - :param suffix: A string to add to the end of each text. - :param normalize_embeddings: Whether to normalize the embeddings to unit length. - :param onnx_execution_provider: The execution provider to use for ONNX models. See - https://onnxruntime.ai/docs/execution-providers/ for possible providers. + :param model: + A string representing the model id on HF Hub. + :param token: + The HuggingFace token to use as HTTP bearer authorization. + :param prefix: + A string to add to the beginning of each text. + :param suffix: + A string to add to the end of each text. + :param normalize_embeddings: + Whether to normalize the embeddings to unit length. + :param onnx_execution_provider: + The [execution provider](https://onnxruntime.ai/docs/execution-providers/) + to use for ONNX models. Note: Using the TensorRT execution provider TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model @@ -88,16 +88,31 @@ def __init__( }, ) ``` - :param pooling_mode: The pooling mode to use. When None, pooling mode will be inferred from the model config. - Refer to the OptimumEmbedderPooling enum for supported pooling modes. - :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model. - In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token` initialization - parameters. - :param batch_size: Number of Documents to encode at once. - :param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments - to keep the logs clean. - :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document text. - :param embedding_separator: Separator used to concatenate the meta fields to the Document text. + :param pooling_mode: + The pooling mode to use. 
When `None`, pooling mode will be inferred from the model config. + :param model_kwargs: + Dictionary containing additional keyword arguments to pass to the model. + In case of duplication, these kwargs override `model`, `onnx_execution_provider` + and `token` initialization parameters. + :param working_dir: + The directory to use for storing intermediate files + generated during model optimization/quantization. + + Required for optimization and quantization. + :param optimizer_settings: + Configuration for Optimum Embedder Optimization. + If `None`, no additional optimization is applied. + :param quantizer_settings: + Configuration for Optimum Embedder Quantization. + If `None`, no quantization is applied. + :param batch_size: + Number of Documents to encode at once. + :param progress_bar: + Whether to show a progress bar or not. + :param meta_fields_to_embed: + List of meta fields that should be embedded along with the Document text. + :param embedding_separator: + Separator used to concatenate the meta fields to the Document text. """ params = _EmbedderParams( model=model, @@ -110,6 +125,9 @@ def __init__( progress_bar=progress_bar, pooling_mode=pooling_mode, model_kwargs=model_kwargs, + working_dir=working_dir, + optimizer_settings=optimizer_settings, + quantizer_settings=quantizer_settings, ) self.meta_fields_to_embed = meta_fields_to_embed or [] self.embedding_separator = embedding_separator @@ -119,7 +137,7 @@ def __init__( def warm_up(self): """ - Load the embedding backend. + Initializes the component. """ if self._initialized: return @@ -129,7 +147,10 @@ def warm_up(self): def to_dict(self) -> Dict[str, Any]: """ - Serialize this component to a dictionary. + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. """ init_params = self._backend.parameters.serialize() init_params["meta_fields_to_embed"] = self.meta_fields_to_embed @@ -139,7 +160,12 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "OptimumDocumentEmbedder": """ - Deserialize this component from a dictionary. + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. """ _EmbedderParams.deserialize_inplace(data["init_parameters"]) return default_from_dict(cls, data) @@ -169,8 +195,10 @@ def run(self, documents: List[Document]): Embed a list of Documents. The embedding of each Document is stored in the `embedding` field of the Document. - :param documents: A list of Documents to embed. - :return: A dictionary containing the updated Documents with their embeddings. + :param documents: + A list of Documents to embed. + :returns: + The updated Documents with their embeddings. """ if not self._initialized: msg = "The embedding model has not been loaded. Please call warm_up() before running."
diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py index 64454bf9f..394ea04ad 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py @@ -4,14 +4,17 @@ from haystack.utils import Secret from ._backend import _EmbedderBackend, _EmbedderParams +from .optimization import OptimumEmbedderOptimizationConfig from .pooling import OptimumEmbedderPooling +from .quantization import OptimumEmbedderQuantizationConfig @component class OptimumTextEmbedder: """ - A component to embed text using models loaded with the HuggingFace Optimum library. - This component is designed to seamlessly inference models using the high speed ONNX runtime. + A component to embed text using models loaded with the + [HuggingFace Optimum](https://huggingface.co/docs/optimum/index) library, + leveraging the ONNX runtime for high-speed inference. Usage example: ```python @@ -26,18 +29,6 @@ class OptimumTextEmbedder: # {'embedding': [-0.07804739475250244, 0.1498992145061493,, ...]} ``` - - Key Features and Compatibility: - - **Primary Compatibility**: Designed to work seamlessly with any embedding model present on the Hugging Face - Hub. - - **Conversion to ONNX**: The models are converted to ONNX using the HuggingFace Optimum library. This is - performed in real-time, during the warm-up step. - - **Accelerated Inference on GPU**: Supports using different execution providers such as CUDA and TensorRT, to - accelerate ONNX Runtime inference on GPUs. - Simply pass the execution provider as the onnx_execution_provider parameter. Additonal parameters can be passed - to the model using the model_kwargs parameter. - For more details refer to the HuggingFace documentation: - https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu. """ def __init__( @@ -50,41 +41,62 @@ def __init__( onnx_execution_provider: str = "CPUExecutionProvider", pooling_mode: Optional[Union[str, OptimumEmbedderPooling]] = None, model_kwargs: Optional[Dict[str, Any]] = None, + working_dir: Optional[str] = None, + optimizer_settings: Optional[OptimumEmbedderOptimizationConfig] = None, + quantizer_settings: Optional[OptimumEmbedderQuantizationConfig] = None, ): """ - Create a OptimumTextEmbedder component. - - :param model: A string representing the model id on HF Hub. - :param token: The HuggingFace token to use as HTTP bearer authorization. - :param prefix: A string to add to the beginning of each text. - :param suffix: A string to add to the end of each text. - :param normalize_embeddings: Whether to normalize the embeddings to unit length. - :param onnx_execution_provider: The execution provider to use for ONNX models. See - https://onnxruntime.ai/docs/execution-providers/ for possible providers. - - Note: Using the TensorRT execution provider - TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model - optimization and nodes fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime - provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We - recommend setting these two provider options using the model_kwargs parameter, when using the TensorRT - execution provider. 
The usage is as follows: - ```python - embedder = OptimumTextEmbedder( - model="sentence-transformers/all-mpnet-base-v2", - onnx_execution_provider="TensorrtExecutionProvider", - model_kwargs={ - "provider_options": { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": "tmp/trt_cache", - } - }, - ) - ``` - :param pooling_mode: The pooling mode to use. When None, pooling mode will be inferred from the model config. - Refer to the OptimumEmbedderPooling enum for supported pooling modes. - :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model. - In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token` initialization - parameters. + Create a OptimumTextEmbedder component. + + :param model: + A string representing the model id on HF Hub. + :param token: + The HuggingFace token to use as HTTP bearer authorization. + :param prefix: + A string to add to the beginning of each text. + :param suffix: + A string to add to the end of each text. + :param normalize_embeddings: + Whether to normalize the embeddings to unit length. + :param onnx_execution_provider: + The [execution provider](https://onnxruntime.ai/docs/execution-providers/) + to use for ONNX models. + + Note: Using the TensorRT execution provider + TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model + optimization and nodes fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime + provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We + recommend setting these two provider options using the model_kwargs parameter, when using the TensorRT + execution provider. The usage is as follows: + ```python + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + onnx_execution_provider="TensorrtExecutionProvider", + model_kwargs={ + "provider_options": { + "trt_engine_cache_enable": True, + "trt_engine_cache_path": "tmp/trt_cache", + } + }, + ) + ``` + :param pooling_mode: + The pooling mode to use. When `None`, pooling mode will be inferred from the model config. + :param model_kwargs: + Dictionary containing additional keyword arguments to pass to the model. + In case of duplication, these kwargs override `model`, `onnx_execution_provider` + and `token` initialization parameters. + :param working_dir: + The directory to use for storing intermediate files + generated during model optimization/quantization. + + Required for optimization and quantization. + :param optimizer_settings: + Configuration for Optimum Embedder Optimization. + If `None`, no additional optimization is applied. + :param quantizer_settings: + Configuration for Optimum Embedder Quantization. + If `None`, no quantization is applied. """ params = _EmbedderParams( model=model, @@ -97,13 +109,16 @@ def __init__( progress_bar=False, pooling_mode=pooling_mode, model_kwargs=model_kwargs, + working_dir=working_dir, + optimizer_settings=optimizer_settings, + quantizer_settings=quantizer_settings, ) self._backend = _EmbedderBackend(params) self._initialized = False def warm_up(self): """ - Load the embedding backend. + Initializes the component. """ if self._initialized: return @@ -113,7 +128,10 @@ def warm_up(self): def to_dict(self) -> Dict[str, Any]: """ - Serialize this component to a dictionary. + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. 
""" init_params = self._backend.parameters.serialize() # Remove init params that are not provided to the text embedder. @@ -124,7 +142,12 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "OptimumTextEmbedder": """ - Deserialize this component from a dictionary. + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. """ _EmbedderParams.deserialize_inplace(data["init_parameters"]) return default_from_dict(cls, data) @@ -134,8 +157,10 @@ def run(self, text: str): """ Embed a string. - :param text: The text to embed. - :return: The embeddings of the text. + :param text: + The text to embed. + :returns: + The embeddings of the text. """ if not self._initialized: msg = "The embedding model has not been loaded. Please call warm_up() before running." diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py index c4d195b8e..41aa24d64 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py @@ -22,11 +22,10 @@ class OptimumEmbedderPooling(Enum): MEAN_SQRT_LEN = "mean_sqrt_len" #: Perform weighted (position) mean pooling on the output of the - #: embedding model. See https://arxiv.org/abs/2202.08904. + #: embedding model. WEIGHTED_MEAN = "weighted_mean" #: Perform Last Token Pooling on the output of the embedding model. - #: See https://arxiv.org/abs/2202.08904 & https://arxiv.org/abs/2201.10005. LAST_TOKEN = "last_token" def __str__(self): @@ -38,9 +37,9 @@ def from_str(cls, string: str) -> "OptimumEmbedderPooling": Create a pooling mode from a string. :param string: - The string to convert. + String to convert. :returns: - The pooling mode. + Pooling mode. """ enum_map = {e.value: e for e in OptimumEmbedderPooling} pooling_mode = enum_map.get(string) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py new file mode 100644 index 000000000..2e68081b5 --- /dev/null +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py @@ -0,0 +1,105 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict + +from optimum.onnxruntime.configuration import AutoQuantizationConfig, QuantizationConfig + + +class OptimumEmbedderQuantizationMode(Enum): + """ + [Dynamic Quantization Modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization) + support by the Optimum Embedders. + """ + + #: Quantization for the ARM64 architecture. + ARM64 = "arm64" + + #: Quantization with AVX-2 instructions. + AVX2 = "avx2" + + #: Quantization with AVX-512 instructions. + AVX512 = "avx512" + + #: Quantization with AVX-512 and VNNI instructions. + AVX512_VNNI = "avx512_vnni" + + def __str__(self): + return self.value + + @classmethod + def from_str(cls, string: str) -> "OptimumEmbedderQuantizationMode": + """ + Create an quantization mode from a string. + + :param string: + String to convert. + :returns: + Quantization mode. + """ + enum_map = {e.value: e for e in OptimumEmbedderQuantizationMode} + q_mode = enum_map.get(string) + if q_mode is None: + msg = f"Unknown quantization mode '{string}'. 
Supported modes are: {list(enum_map.keys())}" + raise ValueError(msg) + return q_mode + + +@dataclass(frozen=True) +class OptimumEmbedderQuantizationConfig: + """ + Configuration for Optimum Embedder Quantization. + + :param mode: + Quantization mode. + :param per_channel: + Whether to apply per-channel quantization. + """ + + mode: OptimumEmbedderQuantizationMode + per_channel: bool = False + + def to_optimum_config(self) -> QuantizationConfig: + """ + Convert the configuration to a Optimum configuration. + + :returns: + Optimum configuration. + """ + if self.mode == OptimumEmbedderQuantizationMode.ARM64: + return AutoQuantizationConfig.arm64(is_static=False, per_channel=self.per_channel) + elif self.mode == OptimumEmbedderQuantizationMode.AVX2: + return AutoQuantizationConfig.avx2(is_static=False, per_channel=self.per_channel) + elif self.mode == OptimumEmbedderQuantizationMode.AVX512: + return AutoQuantizationConfig.avx512(is_static=False, per_channel=self.per_channel) + elif self.mode == OptimumEmbedderQuantizationMode.AVX512_VNNI: + return AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=self.per_channel) + else: + msg = f"Unknown quantization mode '{self.mode}'" + raise ValueError(msg) + + def to_dict(self) -> Dict[str, Any]: + """ + Convert the configuration to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return { + "mode": str(self.mode), + "per_channel": self.per_channel, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "OptimumEmbedderQuantizationConfig": + """ + Create a configuration from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Quantization configuration. + """ + return OptimumEmbedderQuantizationConfig( + mode=OptimumEmbedderQuantizationMode.from_str(data["mode"]), + per_channel=data["per_channel"], + ) diff --git a/integrations/optimum/tests/test_optimum_document_embedder.py b/integrations/optimum/tests/test_optimum_document_embedder.py index bcbccd533..9288bb688 100644 --- a/integrations/optimum/tests/test_optimum_document_embedder.py +++ b/integrations/optimum/tests/test_optimum_document_embedder.py @@ -1,12 +1,21 @@ from unittest.mock import MagicMock, patch +import tempfile +import copy import pytest from haystack.dataclasses import Document from haystack.utils.auth import Secret from haystack_integrations.components.embedders.optimum import OptimumDocumentEmbedder from haystack_integrations.components.embedders.optimum.pooling import OptimumEmbedderPooling +from haystack_integrations.components.embedders.optimum.optimization import ( + OptimumEmbedderOptimizationConfig, + OptimumEmbedderOptimizationMode, +) +from haystack_integrations.components.embedders.optimum.quantization import ( + OptimumEmbedderQuantizationConfig, + OptimumEmbedderQuantizationMode, +) from huggingface_hub.utils import RepositoryNotFoundError -import copy @pytest.fixture @@ -63,6 +72,9 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 pooling_mode="max", onnx_execution_provider="CUDAExecutionProvider", model_kwargs={"trust_remote_code": True}, + working_dir="working_dir", + optimizer_settings=None, + quantizer_settings=None, ) assert embedder._backend.parameters.model == "sentence-transformers/all-minilm-l6-v2" @@ -82,6 +94,9 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 "provider": "CUDAExecutionProvider", "use_auth_token": "fake-api-token", } + assert embedder._backend.parameters.working_dir == "working_dir" + assert 
embedder._backend.parameters.optimizer_settings is None + assert embedder._backend.parameters.quantizer_settings is None def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): # noqa: ARG002 component = OptimumDocumentEmbedder() @@ -105,6 +120,9 @@ def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", }, + "working_dir": None, + "optimizer_settings": None, + "quantizer_settings": None, }, } @@ -125,6 +143,9 @@ def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): "provider": "CPUExecutionProvider", "use_auth_token": None, } + assert embedder._backend.parameters.working_dir is None + assert embedder._backend.parameters.optimizer_settings is None + assert embedder._backend.parameters.quantizer_settings is None def test_to_and_from_dict_with_custom_init_parameters( self, mock_check_valid_model, mock_get_pooling_mode @@ -142,6 +163,11 @@ def test_to_and_from_dict_with_custom_init_parameters( onnx_execution_provider="CUDAExecutionProvider", pooling_mode="max", model_kwargs={"trust_remote_code": True}, + working_dir="working_dir", + optimizer_settings=OptimumEmbedderOptimizationConfig(OptimumEmbedderOptimizationMode.O1, for_gpu=True), + quantizer_settings=OptimumEmbedderQuantizationConfig( + OptimumEmbedderQuantizationMode.ARM64, per_channel=True + ), ) data = component.to_dict() @@ -164,6 +190,9 @@ def test_to_and_from_dict_with_custom_init_parameters( "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", }, + "working_dir": "working_dir", + "optimizer_settings": {"mode": "o1", "for_gpu": True}, + "quantizer_settings": {"mode": "arm64", "per_channel": True}, }, } @@ -185,6 +214,13 @@ def test_to_and_from_dict_with_custom_init_parameters( "provider": "CUDAExecutionProvider", "use_auth_token": None, } + assert embedder._backend.parameters.working_dir == "working_dir" + assert embedder._backend.parameters.optimizer_settings == OptimumEmbedderOptimizationConfig( + OptimumEmbedderOptimizationMode.O1, for_gpu=True + ) + assert embedder._backend.parameters.quantizer_settings == OptimumEmbedderQuantizationConfig( + OptimumEmbedderQuantizationMode.ARM64, per_channel=True + ) def test_initialize_with_invalid_model(self, mock_check_valid_model): mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") @@ -287,7 +323,7 @@ def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 def test_run_on_empty_list(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/paraphrase-albert-small-v2", ) embedder.warm_up() empty_list_input = [] @@ -297,7 +333,24 @@ def test_run_on_empty_list(self, mock_check_valid_model): # noqa: ARG002 assert not result["documents"] # empty list @pytest.mark.integration - def test_run(self): + @pytest.mark.parametrize( + "opt_config, quant_config", + [ + (None, None), + ( + OptimumEmbedderOptimizationConfig(OptimumEmbedderOptimizationMode.O1, for_gpu=False), + None, + ), + (None, OptimumEmbedderQuantizationConfig(OptimumEmbedderQuantizationMode.AVX2)), + # onxxruntime 1.17.x breaks support for quantizing optimized models. 
+ # c.f https://discuss.huggingface.co/t/optimize-and-quantize-with-optimum/23675/12 + # ( + # OptimumEmbedderOptimizationConfig(OptimumEmbedderOptimizationMode.O2, for_gpu=False), + # OptimumEmbedderQuantizationConfig(OptimumEmbedderQuantizationMode.AVX2), + # ), + ], + ) + def test_run(self, opt_config, quant_config): docs = [ Document(content="I love cheese", meta={"topic": "Cuisine"}), Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), @@ -305,18 +358,22 @@ def test_run(self): ] docs_copy = copy.deepcopy(docs) - embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", - prefix="prefix ", - suffix=" suffix", - meta_fields_to_embed=["topic"], - embedding_separator=" | ", - batch_size=1, - ) - embedder.warm_up() + with tempfile.TemporaryDirectory() as tmpdirname: + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/paraphrase-albert-small-v2", + prefix="prefix ", + suffix=" suffix", + meta_fields_to_embed=["topic"], + embedding_separator=" | ", + batch_size=1, + working_dir=tmpdirname, + optimizer_settings=opt_config, + quantizer_settings=quant_config, + ) + embedder.warm_up() - result = embedder.run(documents=docs) - expected = [embedder.run([d]) for d in docs_copy] + result = embedder.run(documents=docs) + expected = [embedder.run([d]) for d in docs_copy] documents_with_embeddings = result["documents"] diff --git a/integrations/optimum/tests/test_optimum_text_embedder.py b/integrations/optimum/tests/test_optimum_text_embedder.py index ce5bc2ffb..ad0e7d800 100644 --- a/integrations/optimum/tests/test_optimum_text_embedder.py +++ b/integrations/optimum/tests/test_optimum_text_embedder.py @@ -4,6 +4,14 @@ from haystack.utils.auth import Secret from haystack_integrations.components.embedders.optimum import OptimumTextEmbedder from haystack_integrations.components.embedders.optimum.pooling import OptimumEmbedderPooling +from haystack_integrations.components.embedders.optimum.optimization import ( + OptimumEmbedderOptimizationConfig, + OptimumEmbedderOptimizationMode, +) +from haystack_integrations.components.embedders.optimum.quantization import ( + OptimumEmbedderQuantizationConfig, + OptimumEmbedderQuantizationMode, +) from huggingface_hub.utils import RepositoryNotFoundError @@ -53,6 +61,9 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 pooling_mode="max", onnx_execution_provider="CUDAExecutionProvider", model_kwargs={"trust_remote_code": True}, + working_dir="working_dir", + optimizer_settings=None, + quantizer_settings=None, ) assert embedder._backend.parameters.model == "sentence-transformers/all-minilm-l6-v2" @@ -68,6 +79,9 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 "provider": "CUDAExecutionProvider", "use_auth_token": "fake-api-token", } + assert embedder._backend.parameters.working_dir == "working_dir" + assert embedder._backend.parameters.optimizer_settings is None + assert embedder._backend.parameters.quantizer_settings is None def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): # noqa: ARG002 component = OptimumTextEmbedder() @@ -83,10 +97,13 @@ def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): "normalize_embeddings": True, "onnx_execution_provider": "CPUExecutionProvider", "pooling_mode": "mean", + "working_dir": None, "model_kwargs": { "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", }, + "optimizer_settings": None, + 
"quantizer_settings": None, }, } @@ -103,6 +120,9 @@ def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): "provider": "CPUExecutionProvider", "use_auth_token": None, } + assert embedder._backend.parameters.working_dir is None + assert embedder._backend.parameters.optimizer_settings is None + assert embedder._backend.parameters.quantizer_settings is None def test_to_and_from_dict_with_custom_init_parameters(self, mock_check_valid_model): # noqa: ARG002 component = OptimumTextEmbedder( @@ -114,6 +134,11 @@ def test_to_and_from_dict_with_custom_init_parameters(self, mock_check_valid_mod onnx_execution_provider="CUDAExecutionProvider", pooling_mode="max", model_kwargs={"trust_remote_code": True}, + working_dir="working_dir", + optimizer_settings=OptimumEmbedderOptimizationConfig(OptimumEmbedderOptimizationMode.O1, for_gpu=True), + quantizer_settings=OptimumEmbedderQuantizationConfig( + OptimumEmbedderQuantizationMode.ARM64, per_channel=True + ), ) data = component.to_dict() @@ -132,6 +157,9 @@ def test_to_and_from_dict_with_custom_init_parameters(self, mock_check_valid_mod "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", }, + "working_dir": "working_dir", + "optimizer_settings": {"mode": "o1", "for_gpu": True}, + "quantizer_settings": {"mode": "arm64", "per_channel": True}, }, } @@ -149,6 +177,13 @@ def test_to_and_from_dict_with_custom_init_parameters(self, mock_check_valid_mod "provider": "CUDAExecutionProvider", "use_auth_token": None, } + assert embedder._backend.parameters.working_dir == "working_dir" + assert embedder._backend.parameters.optimizer_settings == OptimumEmbedderOptimizationConfig( + OptimumEmbedderOptimizationMode.O1, for_gpu=True + ) + assert embedder._backend.parameters.quantizer_settings == OptimumEmbedderQuantizationConfig( + OptimumEmbedderQuantizationMode.ARM64, per_channel=True + ) def test_initialize_with_invalid_model(self, mock_check_valid_model): mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") @@ -194,7 +229,7 @@ def test_infer_pooling_mode_from_hf(self): def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumTextEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/paraphrase-albert-small-v2", token=Secret.from_token("fake-api-token"), pooling_mode="mean", ) @@ -209,7 +244,7 @@ def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 def test_run(self): for pooling_mode in OptimumEmbedderPooling: embedder = OptimumTextEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/paraphrase-albert-small-v2", prefix="prefix ", suffix=" suffix", pooling_mode=pooling_mode, From d31442d6b7945ccd379da43165c98a48d98a9918 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 28 Feb 2024 19:10:07 +0100 Subject: [PATCH 4/4] chore: update docstrings (#497) * update docstrings * Apply suggestions from code review Co-authored-by: Stefano Fiorucci * Apply suggestions from code review * Final touch to chat_generator --------- Co-authored-by: Stefano Fiorucci --- .../embedders/mistral/document_embedder.py | 32 +++++++---- .../embedders/mistral/text_embedder.py | 42 ++++++++------- .../generators/mistral/chat/chat_generator.py | 53 ++++++++++--------- 3 files changed, 71 insertions(+), 56 deletions(-) diff --git a/integrations/mistral/src/haystack_integrations/components/embedders/mistral/document_embedder.py 
b/integrations/mistral/src/haystack_integrations/components/embedders/mistral/document_embedder.py index 29161fd95..780eab268 100644 --- a/integrations/mistral/src/haystack_integrations/components/embedders/mistral/document_embedder.py +++ b/integrations/mistral/src/haystack_integrations/components/embedders/mistral/document_embedder.py @@ -43,17 +43,27 @@ def __init__( embedding_separator: str = "\n", ): """ - Create a MistralDocumentEmbedder component. - :param api_key: The Mistral API key. - :param model: The name of the model to use. - :param api_base_url: The Mistral API Base url, defaults to None. For more details, see Mistral [docs](https://docs.mistral.ai/api/). - :param prefix: A string to add to the beginning of each text. - :param suffix: A string to add to the end of each text. - :param batch_size: Number of Documents to encode at once. - :param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments - to keep the logs clean. - :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document text. - :param embedding_separator: Separator used to concatenate the meta fields to the Document text. + Creates a MistralDocumentEmbedder component. + + :param api_key: + The Mistral API key. + :param model: + The name of the model to use. + :param api_base_url: + The Mistral API Base url. For more details, see Mistral [docs](https://docs.mistral.ai/api/). + :param prefix: + A string to add to the beginning of each text. + :param suffix: + A string to add to the end of each text. + :param batch_size: + Number of Documents to encode at once. + :param progress_bar: + Whether to show a progress bar or not. Can be helpful to disable in production deployments to keep + the logs clean. + :param meta_fields_to_embed: + List of meta fields that should be embedded along with the Document text. + :param embedding_separator: + Separator used to concatenate the meta fields to the Document text. """ super(MistralDocumentEmbedder, self).__init__( # noqa: UP008 api_key=api_key, diff --git a/integrations/mistral/src/haystack_integrations/components/embedders/mistral/text_embedder.py b/integrations/mistral/src/haystack_integrations/components/embedders/mistral/text_embedder.py index d65828ef6..b299e91c7 100644 --- a/integrations/mistral/src/haystack_integrations/components/embedders/mistral/text_embedder.py +++ b/integrations/mistral/src/haystack_integrations/components/embedders/mistral/text_embedder.py @@ -11,22 +11,21 @@ @component class MistralTextEmbedder(OpenAITextEmbedder): """ - A component for embedding strings using Mistral models. + A component for embedding strings using Mistral models. - Usage example: + Usage example: ```python from haystack_integrations.components.embedders.mistral.text_embedder import MistralTextEmbedder - text_to_embed = "I love pizza!" + text_to_embed = "I love pizza!" 
+ text_embedder = MistralTextEmbedder() + print(text_embedder.run(text_to_embed)) - text_embedder = MistralTextEmbedder() - - print(text_embedder.run(text_to_embed)) - - # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], - # 'meta': {'model': 'text-embedding-ada-002-v2', - # 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}} - ``` + # output: + # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], + # 'meta': {'model': 'mistral-embed', + # 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}} + ``` """ def __init__( @@ -38,14 +37,19 @@ def __init__( suffix: str = "", ): """ - Create an MistralTextEmbedder component. - - :param api_key: The Misttal API key. - :param model: The name of the Mistral embedding models to be used. - :param api_base_url: The Mistral API Base url, defaults to `https://api.mistral.ai/v1`. - For more details, see Mistral [docs](https://docs.mistral.ai/api/). - :param prefix: A string to add to the beginning of each text. - :param suffix: A string to add to the end of each text. + Creates an MistralTextEmbedder component. + + :param api_key: + The Mistral API key. + :param model: + The name of the Mistral embedding model to be used. + :param api_base_url: + The Mistral API Base url. + For more details, see Mistral [docs](https://docs.mistral.ai/api/). + :param prefix: + A string to add to the beginning of each text. + :param suffix: + A string to add to the end of each text. """ super(MistralTextEmbedder, self).__init__( # noqa: UP008 api_key=api_key, diff --git a/integrations/mistral/src/haystack_integrations/components/generators/mistral/chat/chat_generator.py b/integrations/mistral/src/haystack_integrations/components/generators/mistral/chat/chat_generator.py index e1399c203..6bf703bc1 100644 --- a/integrations/mistral/src/haystack_integrations/components/generators/mistral/chat/chat_generator.py +++ b/integrations/mistral/src/haystack_integrations/components/generators/mistral/chat/chat_generator.py @@ -12,17 +12,27 @@ @component class MistralChatGenerator(OpenAIChatGenerator): """ - Enables text generation using Mistral's large language models (LLMs). - Currently supports `mistral-tiny`, `mistral-small` and `mistral-medium` - models accessed through the chat completions API endpoint. + Enables text generation using Mistral AI generative models. + For supported models, see [Mistral AI docs](https://docs.mistral.ai/platform/endpoints/#operation/listModels). - Users can pass any text generation parameters valid for the `openai.ChatCompletion.create` method - directly to this component via the `**generation_kwargs` parameter in __init__ or the `**generation_kwargs` + Users can pass any text generation parameters valid for the Mistral Chat Completion API + directly to this component via the `generation_kwargs` parameter in `__init__` or the `generation_kwargs` parameter in `run` method. + Key Features and Compatibility: + - **Primary Compatibility**: Designed to work seamlessly with the Mistral API Chat Completion endpoint. + - **Streaming Support**: Supports streaming responses from the Mistral API Chat Completion endpoint. + - **Customizability**: Supports all parameters supported by the Mistral API Chat Completion endpoint. + + This component uses the ChatMessage format for structuring both input and output, + ensuring coherent and contextually relevant responses in chat-based text generation scenarios. 
+ Details on the ChatMessage format can be found in the + [Haystack docs](https://docs.haystack.deepset.ai/v2.0/docs/data-classes#chatmessage) + For more details on the parameters supported by the Mistral API, refer to the [Mistral API Docs](https://docs.mistral.ai/api/). + Usage example: ```python from haystack_integrations.components.generators.mistral import MistralChatGenerator from haystack.dataclasses import ChatMessage @@ -38,19 +48,7 @@ class MistralChatGenerator(OpenAIChatGenerator): >>meaningful and useful.', role=, name=None, >>meta={'model': 'mistral-tiny', 'index': 0, 'finish_reason': 'stop', >>'usage': {'prompt_tokens': 15, 'completion_tokens': 36, 'total_tokens': 51}})]} - ``` - - Key Features and Compatibility: - - **Primary Compatibility**: Designed to work seamlessly with the Mistral API Chat Completion endpoint. - - **Streaming Support**: Supports streaming responses from the Mistral API Chat Completion endpoint. - - **Customizability**: Supports all parameters supported by the Mistral API Chat Completion endpoint. - - Input and Output Format: - - **ChatMessage Format**: This component uses the ChatMessage format for structuring both input and output, - ensuring coherent and contextually relevant responses in chat-based text generation scenarios. - Details on the ChatMessage format can be found at: https://github.com/openai/openai-python/blob/main/chatml.md. - Note that the Mistral API does not accept `system` messages yet. You can use `user` and `assistant` messages. """ def __init__( @@ -65,15 +63,19 @@ def __init__( Creates an instance of MistralChatGenerator. Unless specified otherwise in the `model`, this is for Mistral's `mistral-tiny` model. - :param api_key: The Mistral API key. - :param model: The name of the Mistral chat completion model to use. - :param streaming_callback: A callback function that is called when a new token is received from the stream. + :param api_key: + The Mistral API key. + :param model: + The name of the Mistral chat completion model to use. + :param streaming_callback: + A callback function that is called when a new token is received from the stream. The callback function accepts StreamingChunk as an argument. - :param api_base_url: The Mistral API Base url, defaults to `https://api.mistral.ai/v1`. - For more details, see Mistral [docs](https://docs.mistral.ai/api/). - :param generation_kwargs: Other parameters to use for the model. These parameters are all sent directly to - the Mistrak endpoint. See [Mistral API docs](https://docs.mistral.ai/api/t) for - more details. + :param api_base_url: + The Mistral API Base url. + For more details, see Mistral [docs](https://docs.mistral.ai/api/). + :param generation_kwargs: + Other parameters to use for the model. These parameters are all sent directly to + the Mistral endpoint. See [Mistral API docs](https://docs.mistral.ai/api/) for more details. Some of the supported parameters: - `max_tokens`: The maximum number of tokens the output text can have. - `temperature`: What sampling temperature to use. Higher values mean the model will take more risks. @@ -83,7 +85,6 @@ def __init__( comprising the top 10% probability mass are considered. - `stream`: Whether to stream back partial progress. If set, tokens will be sent as data-only server-sent events as they become available, with the stream terminated by a data: [DONE] message. - - `stop`: One or more sequences after which the LLM should stop generating tokens. - `safe_prompt`: Whether to inject a safety prompt before all conversations. 
- `random_seed`: The seed to use for random sampling. """
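
As a quick reference for the `generation_kwargs` documented above, a minimal usage sketch follows. It assumes the Mistral integration package is installed and the `MISTRAL_API_KEY` environment variable is set; the model name and parameter values are purely illustrative, not the integration's defaults.

```python
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.mistral import MistralChatGenerator

# Assumes MISTRAL_API_KEY is set in the environment; values below are illustrative.
generator = MistralChatGenerator(
    model="mistral-tiny",
    generation_kwargs={
        "max_tokens": 256,    # cap on the length of the generated reply
        "temperature": 0.7,   # higher values make the output more varied
        "safe_prompt": True,  # inject a safety prompt before the conversation
        "random_seed": 42,    # seed for reproducible sampling
    },
)

messages = [ChatMessage.from_user("Summarize what Mistral AI offers in one sentence.")]
result = generator.run(messages=messages)
print(result["replies"][0].content)
```

The same `generation_kwargs` dictionary can also be passed to `run` to override the constructor settings for a single call.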