From 17359778a941cf055b0d8eac3b3aa39e489ffb12 Mon Sep 17 00:00:00 2001 From: Julian De Vita <53025729+jmdevita@users.noreply.github.com> Date: Wed, 28 Feb 2024 03:20:44 -0500 Subject: [PATCH 1/4] Changed Default Ollama Embedding models to supported model: nomic-embed-text (#490) * Changed Embedding model to supported model: nomic-embed-text * Updated workflow yml with support for llm and embedding model --- .github/workflows/ollama.yml | 6 +++++- .../components/embedders/ollama/document_embedder.py | 4 ++-- .../components/embedders/ollama/text_embedder.py | 4 ++-- integrations/ollama/tests/test_document_embedder.py | 12 ++++++------ integrations/ollama/tests/test_text_embedder.py | 6 +++--- 5 files changed, 18 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ollama.yml b/.github/workflows/ollama.yml index 962537390..055f63dd0 100644 --- a/.github/workflows/ollama.yml +++ b/.github/workflows/ollama.yml @@ -22,6 +22,7 @@ env: PYTHONUNBUFFERED: "1" FORCE_COLOR: "1" LLM_FOR_TESTS: "orca-mini" + EMBEDDER_FOR_TESTS: "nomic-embed-text" jobs: run: @@ -55,7 +56,10 @@ jobs: run: hatch run lint:all - name: Pull the LLM in the Ollama service - run: docker exec ollama ollama pull ${{ env.LLM_FOR_TESTS }} + run: docker exec ollama ollama pull ${{ env.LLM_FOR_TESTS }} + + - name: Pull the Embedding Model in the Ollama service + run: docker exec ollama ollama pull ${{ env.EMBEDDER_FOR_TESTS }} - name: Generate docs if: matrix.python-version == '3.9' && runner.os == 'Linux' diff --git a/integrations/ollama/src/haystack_integrations/components/embedders/ollama/document_embedder.py b/integrations/ollama/src/haystack_integrations/components/embedders/ollama/document_embedder.py index 17b32f065..6e3273e1c 100644 --- a/integrations/ollama/src/haystack_integrations/components/embedders/ollama/document_embedder.py +++ b/integrations/ollama/src/haystack_integrations/components/embedders/ollama/document_embedder.py @@ -9,7 +9,7 @@ class OllamaDocumentEmbedder: def __init__( self, - model: str = "orca-mini", + model: str = "nomic-embed-text", url: str = "http://localhost:11434/api/embeddings", generation_kwargs: Optional[Dict[str, Any]] = None, timeout: int = 120, @@ -21,7 +21,7 @@ def __init__( ): """ :param model: The name of the model to use. The model should be available in the running Ollama instance. - Default is "orca-mini". + Default is "nomic-embed-text". "https://ollama.com/library/nomic-embed-text" :param url: The URL of the chat endpoint of a running Ollama instance. Default is "http://localhost:11434/api/embeddings". :param generation_kwargs: Optional arguments to pass to the Ollama generation endpoint, such as temperature, diff --git a/integrations/ollama/src/haystack_integrations/components/embedders/ollama/text_embedder.py b/integrations/ollama/src/haystack_integrations/components/embedders/ollama/text_embedder.py index e27fd9ff4..e2ef136b4 100644 --- a/integrations/ollama/src/haystack_integrations/components/embedders/ollama/text_embedder.py +++ b/integrations/ollama/src/haystack_integrations/components/embedders/ollama/text_embedder.py @@ -8,14 +8,14 @@ class OllamaTextEmbedder: def __init__( self, - model: str = "orca-mini", + model: str = "nomic-embed-text", url: str = "http://localhost:11434/api/embeddings", generation_kwargs: Optional[Dict[str, Any]] = None, timeout: int = 120, ): """ :param model: The name of the model to use. The model should be available in the running Ollama instance. - Default is "orca-mini". + Default is "nomic-embed-text". 
"https://ollama.com/library/nomic-embed-text" :param url: The URL of the chat endpoint of a running Ollama instance. Default is "http://localhost:11434/api/embeddings". :param generation_kwargs: Optional arguments to pass to the Ollama generation endpoint, such as temperature, diff --git a/integrations/ollama/tests/test_document_embedder.py b/integrations/ollama/tests/test_document_embedder.py index a5694db33..012ad9eae 100644 --- a/integrations/ollama/tests/test_document_embedder.py +++ b/integrations/ollama/tests/test_document_embedder.py @@ -11,11 +11,11 @@ def test_init_defaults(self): assert embedder.timeout == 120 assert embedder.generation_kwargs == {} assert embedder.url == "http://localhost:11434/api/embeddings" - assert embedder.model == "orca-mini" + assert embedder.model == "nomic-embed-text" def test_init(self): embedder = OllamaDocumentEmbedder( - model="orca-mini", + model="nomic-embed-text", url="http://my-custom-endpoint:11434/api/embeddings", generation_kwargs={"temperature": 0.5}, timeout=3000, @@ -24,7 +24,7 @@ def test_init(self): assert embedder.timeout == 3000 assert embedder.generation_kwargs == {"temperature": 0.5} assert embedder.url == "http://my-custom-endpoint:11434/api/embeddings" - assert embedder.model == "orca-mini" + assert embedder.model == "nomic-embed-text" @pytest.mark.integration def test_model_not_found(self): @@ -35,17 +35,17 @@ def test_model_not_found(self): @pytest.mark.integration def import_text_in_embedder(self): - embedder = OllamaDocumentEmbedder(model="orca-mini") + embedder = OllamaDocumentEmbedder(model="nomic-embed-text") with pytest.raises(TypeError): embedder.run("This is a text string. This should not work.") @pytest.mark.integration def test_run(self): - embedder = OllamaDocumentEmbedder(model="orca-mini") + embedder = OllamaDocumentEmbedder(model="nomic-embed-text") list_of_docs = [Document(content="This is a document containing some text.")] reply = embedder.run(list_of_docs) assert isinstance(reply, dict) assert all(isinstance(element, float) for element in reply["documents"][0].embedding) - assert reply["meta"]["model"] == "orca-mini" + assert reply["meta"]["model"] == "nomic-embed-text" diff --git a/integrations/ollama/tests/test_text_embedder.py b/integrations/ollama/tests/test_text_embedder.py index 9d3321e64..f4d45afec 100644 --- a/integrations/ollama/tests/test_text_embedder.py +++ b/integrations/ollama/tests/test_text_embedder.py @@ -10,7 +10,7 @@ def test_init_defaults(self): assert embedder.timeout == 120 assert embedder.generation_kwargs == {} assert embedder.url == "http://localhost:11434/api/embeddings" - assert embedder.model == "orca-mini" + assert embedder.model == "nomic-embed-text" def test_init(self): embedder = OllamaTextEmbedder( @@ -34,10 +34,10 @@ def test_model_not_found(self): @pytest.mark.integration def test_run(self): - embedder = OllamaTextEmbedder(model="orca-mini") + embedder = OllamaTextEmbedder(model="nomic-embed-text") reply = embedder.run("hello") assert isinstance(reply, dict) assert all(isinstance(element, float) for element in reply["embedding"]) - assert reply["meta"]["model"] == "orca-mini" + assert reply["meta"]["model"] == "nomic-embed-text" From 869b2597df56783f154d75713986ff97f13b9b63 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 28 Feb 2024 12:15:09 +0100 Subject: [PATCH 2/4] Update API docs (#494) --- integrations/ollama/pydoc/config.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integrations/ollama/pydoc/config.yml 
b/integrations/ollama/pydoc/config.yml index 38de37211..45c336b85 100644 --- a/integrations/ollama/pydoc/config.yml +++ b/integrations/ollama/pydoc/config.yml @@ -3,7 +3,9 @@ loaders: search_path: [../src] modules: [ "haystack_integrations.components.generators.ollama.generator", - "haystack_integrations.components.generators.ollama.chat.chat_generator" + "haystack_integrations.components.generators.ollama.chat.chat_generator", + "haystack_integrations.components.embedders.ollama.document_embedder", + "haystack_integrations.components.embedders.ollama.text_embedder", ] ignore_when_discovered: ["__init__"] processors: @@ -26,4 +28,4 @@ renderer: descriptive_module_title: true add_method_class_prefix: true add_member_class_prefix: false - filename: _readme_ollama.md \ No newline at end of file + filename: _readme_ollama.md From f758de9361272d8b65ed03a78d59e95c5d464bde Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 28 Feb 2024 15:56:50 +0100 Subject: [PATCH 3/4] feat!: Add support for Optimum optimizers and quantizers (#496) * feat!: Add support for Optimum optimizers and quantizers * Fix docstring typo * Apply suggestions from code review Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- .github/workflows/optimum.yml | 6 +- integrations/optimum/pydoc/config.yml | 2 + .../components/embedders/optimum/__init__.py | 12 +- .../components/embedders/optimum/_backend.py | 97 +++++++++++-- .../embedders/optimum/optimization.py | 105 ++++++++++++++ .../optimum/optimum_document_embedder.py | 100 +++++++++----- .../optimum/optimum_text_embedder.py | 129 +++++++++++------- .../components/embedders/optimum/pooling.py | 7 +- .../embedders/optimum/quantization.py | 105 ++++++++++++++ .../tests/test_optimum_document_embedder.py | 85 ++++++++++-- .../tests/test_optimum_text_embedder.py | 39 +++++- 11 files changed, 561 insertions(+), 126 deletions(-) create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py diff --git a/.github/workflows/optimum.yml b/.github/workflows/optimum.yml index 3b0d137da..f5f59ec89 100644 --- a/.github/workflows/optimum.yml +++ b/.github/workflows/optimum.yml @@ -52,9 +52,9 @@ jobs: if: matrix.python-version == '3.9' && runner.os == 'Linux' run: hatch run lint:all - - name: Generate docs - if: matrix.python-version == '3.9' && runner.os == 'Linux' - run: hatch run docs + # - name: Generate docs + # if: matrix.python-version == '3.9' && runner.os == 'Linux' + # run: hatch run docs - name: Run tests run: hatch run cov diff --git a/integrations/optimum/pydoc/config.yml b/integrations/optimum/pydoc/config.yml index 617eb4aed..996678c55 100644 --- a/integrations/optimum/pydoc/config.yml +++ b/integrations/optimum/pydoc/config.yml @@ -6,6 +6,8 @@ loaders: "haystack_integrations.components.embedders.optimum.optimum_document_embedder", "haystack_integrations.components.embedders.optimum.optimum_text_embedder", "haystack_integrations.components.embedders.optimum.pooling", + "haystack_integrations.components.embedders.optimum.optimization", + "haystack_integrations.components.embedders.optimum.quantization", ] ignore_when_discovered: ["__init__"] processors: diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/__init__.py 
b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/__init__.py index e2ab2d6b7..02e56b34c 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/__init__.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/__init__.py @@ -2,8 +2,18 @@ # # SPDX-License-Identifier: Apache-2.0 +from .optimization import OptimumEmbedderOptimizationConfig, OptimumEmbedderOptimizationMode from .optimum_document_embedder import OptimumDocumentEmbedder from .optimum_text_embedder import OptimumTextEmbedder from .pooling import OptimumEmbedderPooling +from .quantization import OptimumEmbedderQuantizationConfig, OptimumEmbedderQuantizationMode -__all__ = ["OptimumDocumentEmbedder", "OptimumEmbedderPooling", "OptimumTextEmbedder"] +__all__ = [ + "OptimumDocumentEmbedder", + "OptimumEmbedderOptimizationMode", + "OptimumEmbedderOptimizationConfig", + "OptimumEmbedderPooling", + "OptimumEmbedderQuantizationMode", + "OptimumEmbedderQuantizationConfig", + "OptimumTextEmbedder", +] diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py index fc4f0b1ae..a6d226ecc 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py @@ -1,7 +1,8 @@ import copy import json from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -11,10 +12,17 @@ from sentence_transformers.models import Pooling as SentenceTransformerPoolingLayer from tqdm import tqdm from transformers import AutoTokenizer +from transformers.modeling_outputs import BaseModelOutput -from optimum.onnxruntime import ORTModelForFeatureExtraction +from optimum.onnxruntime import ( + ORTModelForFeatureExtraction, + ORTOptimizer, + ORTQuantizer, +) +from .optimization import OptimumEmbedderOptimizationConfig from .pooling import OptimumEmbedderPooling +from .quantization import OptimumEmbedderQuantizationConfig @dataclass @@ -29,16 +37,29 @@ class _EmbedderParams: progress_bar: bool pooling_mode: Optional[Union[str, OptimumEmbedderPooling]] model_kwargs: Optional[Dict[str, Any]] + working_dir: Optional[str] + optimizer_settings: Optional[OptimumEmbedderOptimizationConfig] + quantizer_settings: Optional[OptimumEmbedderQuantizationConfig] def serialize(self) -> Dict[str, Any]: out = {} for field in self.__dataclass_fields__.keys(): + if field in [ + "pooling_mode", + "token", + "optimizer_settings", + "quantizer_settings", + ]: + continue out[field] = copy.deepcopy(getattr(self, field)) # Fixups. 
assert isinstance(self.pooling_mode, OptimumEmbedderPooling) - out["pooling_mode"] = self.pooling_mode.value + out["pooling_mode"] = str(self.pooling_mode) out["token"] = self.token.to_dict() if self.token else None + out["optimizer_settings"] = self.optimizer_settings.to_dict() if self.optimizer_settings else None + out["quantizer_settings"] = self.quantizer_settings.to_dict() if self.quantizer_settings else None + out["model_kwargs"].pop("use_auth_token", None) serialize_hf_model_kwargs(out["model_kwargs"]) return out @@ -46,6 +67,11 @@ def serialize(self) -> Dict[str, Any]: @classmethod def deserialize_inplace(cls, data: Dict[str, Any]) -> Dict[str, Any]: data["pooling_mode"] = OptimumEmbedderPooling.from_str(data["pooling_mode"]) + if data["optimizer_settings"] is not None: + data["optimizer_settings"] = OptimumEmbedderOptimizationConfig.from_dict(data["optimizer_settings"]) + if data["quantizer_settings"] is not None: + data["quantizer_settings"] = OptimumEmbedderQuantizationConfig.from_dict(data["quantizer_settings"]) + deserialize_secrets_inplace(data, keys=["token"]) deserialize_hf_model_kwargs(data["model_kwargs"]) return data @@ -71,6 +97,11 @@ def __init__(self, params: _EmbedderParams): params.model_kwargs = params.model_kwargs or {} + if params.optimizer_settings or params.quantizer_settings: + if not params.working_dir: + msg = "Working directory is required for optimization and quantization" + raise ValueError(msg) + # Check if the model_kwargs contain the parameters, otherwise, populate them with values from init parameters params.model_kwargs.setdefault("model_id", params.model) params.model_kwargs.setdefault("provider", params.onnx_execution_provider) @@ -82,18 +113,48 @@ def __init__(self, params: _EmbedderParams): self.pooling_layer = None def warm_up(self): - self.model = ORTModelForFeatureExtraction.from_pretrained(**self.params.model_kwargs, export=True) + assert self.params.model_kwargs + model_kwargs = copy.deepcopy(self.params.model_kwargs) + model = ORTModelForFeatureExtraction.from_pretrained(**model_kwargs, export=True) + + # Model ID will be passed explicitly if optimization/quantization is enabled. + model_kwargs.pop("model_id", None) + + optimized_model = False + if self.params.optimizer_settings: + assert self.params.working_dir + optimizer = ORTOptimizer.from_pretrained(model) + save_dir = optimizer.optimize( + save_dir=self.params.working_dir, optimization_config=self.params.optimizer_settings.to_optimum_config() + ) + model = ORTModelForFeatureExtraction.from_pretrained(model_id=save_dir, **model_kwargs) + optimized_model = True + + if self.params.quantizer_settings: + assert self.params.working_dir + + # We need to create a subfolder for models that were optimized before quantization + # since Optimum expects no more than one ONNX model in the working directory. There's + # a file name parameter, but the optimizer only returns the working directory.
+ working_dir = ( + Path(self.params.working_dir) if not optimized_model else Path(self.params.working_dir) / "quantized" + ) + quantizer = ORTQuantizer.from_pretrained(model) + save_dir = quantizer.quantize( + save_dir=working_dir, quantization_config=self.params.quantizer_settings.to_optimum_config() + ) + model = ORTModelForFeatureExtraction.from_pretrained(model_id=save_dir, **model_kwargs) + + self.model = model self.tokenizer = AutoTokenizer.from_pretrained( self.params.model, token=self.params.token.resolve_value() if self.params.token else None ) # We need the width of the embeddings to initialize the pooling layer # so we do a dummy forward pass with the model. - dummy_input = self.tokenizer(["dummy input"], padding=True, truncation=True, return_tensors="pt").to( - self.model.device - ) - dummy_output = self.model(input_ids=dummy_input["input_ids"], attention_mask=dummy_input["attention_mask"]) - width = dummy_output[0].size(dim=2) # BaseModelOutput.last_hidden_state + width = self._tokenize_and_generate_outputs(["dummy input"])[1][0].size( + dim=2 + ) # BaseModelOutput.last_hidden_state self.pooling_layer = SentenceTransformerPoolingLayer( width, @@ -105,6 +166,17 @@ def warm_up(self): pooling_mode_lasttoken=self.params.pooling_mode == OptimumEmbedderPooling.LAST_TOKEN, ) + def _tokenize_and_generate_outputs(self, texts: List[str]) -> Tuple[Dict[str, Any], BaseModelOutput]: + assert self.model is not None + assert self.tokenizer is not None + + tokenizer_outputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to( + self.model.device + ) + model_inputs = {k: v for k, v in tokenizer_outputs.items() if k in self.model.inputs_names} + model_outputs = self.model(**model_inputs) + return tokenizer_outputs, model_outputs + @property def parameters(self) -> _EmbedderParams: return self.params @@ -140,11 +212,8 @@ def embed_texts( desc="Calculating embeddings", ): batch = sentences_sorted[i : i + self.params.batch_size] - encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device) - model_output = self.model( - input_ids=encoded_input["input_ids"], attention_mask=encoded_input["attention_mask"] - ) - sentence_embeddings = self.pool_embeddings(model_output[0], encoded_input["attention_mask"].to(device)) + tokenizer_output, model_output = self._tokenize_and_generate_outputs(batch) + sentence_embeddings = self.pool_embeddings(model_output[0], tokenizer_output["attention_mask"].to(device)) all_embeddings.append(sentence_embeddings) embeddings = torch.cat(all_embeddings, dim=0) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py new file mode 100644 index 000000000..5a4447570 --- /dev/null +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py @@ -0,0 +1,105 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict + +from optimum.onnxruntime.configuration import AutoOptimizationConfig, OptimizationConfig + + +class OptimumEmbedderOptimizationMode(Enum): + """ + [ONNX Optimization Modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization.html) + supported by the Optimum Embedders. + """ + + #: Basic general optimizations. + O1 = "o1" + + #: Basic and extended general optimizations, transformers-specific fusions. + O2 = "o2" + + #: Same as O2 with Gelu approximation.
+ O3 = "o3" + + #: Same as O3 with mixed precision. + O4 = "o4" + + def __str__(self): + return self.value + + @classmethod + def from_str(cls, string: str) -> "OptimumEmbedderOptimizationMode": + """ + Create an optimization mode from a string. + + :param string: + String to convert. + :returns: + Optimization mode. + """ + enum_map = {e.value: e for e in OptimumEmbedderOptimizationMode} + opt_mode = enum_map.get(string) + if opt_mode is None: + msg = f"Unknown optimization mode '{string}'. Supported modes are: {list(enum_map.keys())}" + raise ValueError(msg) + return opt_mode + + +@dataclass(frozen=True) +class OptimumEmbedderOptimizationConfig: + """ + Configuration for Optimum Embedder Optimization. + + :param mode: + Optimization mode. + :param for_gpu: + Whether to optimize for GPUs. + """ + + mode: OptimumEmbedderOptimizationMode + for_gpu: bool = True + + def to_optimum_config(self) -> OptimizationConfig: + """ + Convert the configuration to a Optimum configuration. + + :returns: + Optimum configuration. + """ + if self.mode == OptimumEmbedderOptimizationMode.O1: + return AutoOptimizationConfig.O1(for_gpu=self.for_gpu) + elif self.mode == OptimumEmbedderOptimizationMode.O2: + return AutoOptimizationConfig.O2(for_gpu=self.for_gpu) + elif self.mode == OptimumEmbedderOptimizationMode.O3: + return AutoOptimizationConfig.O3(for_gpu=self.for_gpu) + elif self.mode == OptimumEmbedderOptimizationMode.O4: + return AutoOptimizationConfig.O4(for_gpu=self.for_gpu) + else: + msg = f"Unknown optimization mode '{self.mode}'" + raise ValueError(msg) + + def to_dict(self) -> Dict[str, Any]: + """ + Convert the configuration to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return { + "mode": str(self.mode), + "for_gpu": self.for_gpu, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "OptimumEmbedderOptimizationConfig": + """ + Create an optimization configuration from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Optimization configuration. + """ + return OptimumEmbedderOptimizationConfig( + mode=OptimumEmbedderOptimizationMode.from_str(data["mode"]), + for_gpu=data["for_gpu"], + ) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py index 2f49bd0b3..a6db47090 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py @@ -4,14 +4,17 @@ from haystack.utils import Secret from ._backend import _EmbedderBackend, _EmbedderParams +from .optimization import OptimumEmbedderOptimizationConfig from .pooling import OptimumEmbedderPooling +from .quantization import OptimumEmbedderQuantizationConfig @component class OptimumDocumentEmbedder: """ - A component for computing Document embeddings using models loaded with the HuggingFace Optimum library. - This component is designed to seamlessly inference models using the high speed ONNX runtime. + A component for computing `Document` embeddings using models loaded with the + [HuggingFace Optimum](https://huggingface.co/docs/optimum/index) library, + leveraging the ONNX runtime for high-speed inference. The embedding of each Document is stored in the `embedding` field of the Document. 
@@ -30,18 +33,6 @@ class OptimumDocumentEmbedder: # [0.017020374536514282, -0.023255806416273117, ...] ``` - - Key Features and Compatibility: - - **Primary Compatibility**: Designed to work seamlessly with any embedding model present on the Hugging Face - Hub. - - **Conversion to ONNX**: The models are converted to ONNX using the HuggingFace Optimum library. This is - performed in real-time, during the warm-up step. - - **Accelerated Inference on GPU**: Supports using different execution providers such as CUDA and TensorRT, to - accelerate ONNX Runtime inference on GPUs. - Simply pass the execution provider as the onnx_execution_provider parameter. Additonal parameters can be passed - to the model using the model_kwargs parameter. - For more details refer to the HuggingFace documentation: - https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu. """ def __init__( @@ -54,6 +45,9 @@ def __init__( onnx_execution_provider: str = "CPUExecutionProvider", pooling_mode: Optional[Union[str, OptimumEmbedderPooling]] = None, model_kwargs: Optional[Dict[str, Any]] = None, + working_dir: Optional[str] = None, + optimizer_settings: Optional[OptimumEmbedderOptimizationConfig] = None, + quantizer_settings: Optional[OptimumEmbedderQuantizationConfig] = None, batch_size: int = 32, progress_bar: bool = True, meta_fields_to_embed: Optional[List[str]] = None, @@ -62,13 +56,19 @@ def __init__( """ Create a OptimumDocumentEmbedder component. - :param model: A string representing the model id on HF Hub. - :param token: The HuggingFace token to use as HTTP bearer authorization. - :param prefix: A string to add to the beginning of each text. - :param suffix: A string to add to the end of each text. - :param normalize_embeddings: Whether to normalize the embeddings to unit length. - :param onnx_execution_provider: The execution provider to use for ONNX models. See - https://onnxruntime.ai/docs/execution-providers/ for possible providers. + :param model: + A string representing the model id on HF Hub. + :param token: + The HuggingFace token to use as HTTP bearer authorization. + :param prefix: + A string to add to the beginning of each text. + :param suffix: + A string to add to the end of each text. + :param normalize_embeddings: + Whether to normalize the embeddings to unit length. + :param onnx_execution_provider: + The [execution provider](https://onnxruntime.ai/docs/execution-providers/) + to use for ONNX models. Note: Using the TensorRT execution provider TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model @@ -88,16 +88,31 @@ def __init__( }, ) ``` - :param pooling_mode: The pooling mode to use. When None, pooling mode will be inferred from the model config. - Refer to the OptimumEmbedderPooling enum for supported pooling modes. - :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model. - In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token` initialization - parameters. - :param batch_size: Number of Documents to encode at once. - :param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments - to keep the logs clean. - :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document text. - :param embedding_separator: Separator used to concatenate the meta fields to the Document text. + :param pooling_mode: + The pooling mode to use. 
When `None`, pooling mode will be inferred from the model config. + :param model_kwargs: + Dictionary containing additional keyword arguments to pass to the model. + In case of duplication, these kwargs override `model`, `onnx_execution_provider` + and `token` initialization parameters. + :param working_dir: + The directory to use for storing intermediate files + generated during model optimization/quantization. + + Required for optimization and quantization. + :param optimizer_settings: + Configuration for Optimum Embedder Optimization. + If `None`, no additional optimization is applied. + :param quantizer_settings: + Configuration for Optimum Embedder Quantization. + If `None`, no quantization is applied. + :param batch_size: + Number of Documents to encode at once. + :param progress_bar: + Whether to show a progress bar or not. + :param meta_fields_to_embed: + List of meta fields that should be embedded along with the Document text. + :param embedding_separator: + Separator used to concatenate the meta fields to the Document text. """ params = _EmbedderParams( model=model, @@ -110,6 +125,9 @@ def __init__( progress_bar=progress_bar, pooling_mode=pooling_mode, model_kwargs=model_kwargs, + working_dir=working_dir, + optimizer_settings=optimizer_settings, + quantizer_settings=quantizer_settings, ) self.meta_fields_to_embed = meta_fields_to_embed or [] self.embedding_separator = embedding_separator @@ -119,7 +137,7 @@ def __init__( def warm_up(self): """ - Load the embedding backend. + Initializes the component. """ if self._initialized: return @@ -129,7 +147,10 @@ def warm_up(self): def to_dict(self) -> Dict[str, Any]: """ - Serialize this component to a dictionary. + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. """ init_params = self._backend.parameters.serialize() init_params["meta_fields_to_embed"] = self.meta_fields_to_embed @@ -139,7 +160,12 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "OptimumDocumentEmbedder": """ - Deserialize this component from a dictionary. + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. """ _EmbedderParams.deserialize_inplace(data["init_parameters"]) return default_from_dict(cls, data) @@ -169,8 +195,10 @@ def run(self, documents: List[Document]): Embed a list of Documents. The embedding of each Document is stored in the `embedding` field of the Document. - :param documents: A list of Documents to embed. - :return: A dictionary containing the updated Documents with their embeddings. + :param documents: + A list of Documents to embed. + :returns: + The updated Documents with their embeddings. """ if not self._initialized: msg = "The embedding model has not been loaded. Please call warm_up() before running."
diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py index 64454bf9f..394ea04ad 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py @@ -4,14 +4,17 @@ from haystack.utils import Secret from ._backend import _EmbedderBackend, _EmbedderParams +from .optimization import OptimumEmbedderOptimizationConfig from .pooling import OptimumEmbedderPooling +from .quantization import OptimumEmbedderQuantizationConfig @component class OptimumTextEmbedder: """ - A component to embed text using models loaded with the HuggingFace Optimum library. - This component is designed to seamlessly inference models using the high speed ONNX runtime. + A component to embed text using models loaded with the + [HuggingFace Optimum](https://huggingface.co/docs/optimum/index) library, + leveraging the ONNX runtime for high-speed inference. Usage example: ```python @@ -26,18 +29,6 @@ class OptimumTextEmbedder: # {'embedding': [-0.07804739475250244, 0.1498992145061493,, ...]} ``` - - Key Features and Compatibility: - - **Primary Compatibility**: Designed to work seamlessly with any embedding model present on the Hugging Face - Hub. - - **Conversion to ONNX**: The models are converted to ONNX using the HuggingFace Optimum library. This is - performed in real-time, during the warm-up step. - - **Accelerated Inference on GPU**: Supports using different execution providers such as CUDA and TensorRT, to - accelerate ONNX Runtime inference on GPUs. - Simply pass the execution provider as the onnx_execution_provider parameter. Additonal parameters can be passed - to the model using the model_kwargs parameter. - For more details refer to the HuggingFace documentation: - https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu. """ def __init__( @@ -50,41 +41,62 @@ def __init__( onnx_execution_provider: str = "CPUExecutionProvider", pooling_mode: Optional[Union[str, OptimumEmbedderPooling]] = None, model_kwargs: Optional[Dict[str, Any]] = None, + working_dir: Optional[str] = None, + optimizer_settings: Optional[OptimumEmbedderOptimizationConfig] = None, + quantizer_settings: Optional[OptimumEmbedderQuantizationConfig] = None, ): """ - Create a OptimumTextEmbedder component. - - :param model: A string representing the model id on HF Hub. - :param token: The HuggingFace token to use as HTTP bearer authorization. - :param prefix: A string to add to the beginning of each text. - :param suffix: A string to add to the end of each text. - :param normalize_embeddings: Whether to normalize the embeddings to unit length. - :param onnx_execution_provider: The execution provider to use for ONNX models. See - https://onnxruntime.ai/docs/execution-providers/ for possible providers. - - Note: Using the TensorRT execution provider - TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model - optimization and nodes fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime - provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We - recommend setting these two provider options using the model_kwargs parameter, when using the TensorRT - execution provider. 
The usage is as follows: - ```python - embedder = OptimumTextEmbedder( - model="sentence-transformers/all-mpnet-base-v2", - onnx_execution_provider="TensorrtExecutionProvider", - model_kwargs={ - "provider_options": { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": "tmp/trt_cache", - } - }, - ) - ``` - :param pooling_mode: The pooling mode to use. When None, pooling mode will be inferred from the model config. - Refer to the OptimumEmbedderPooling enum for supported pooling modes. - :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model. - In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token` initialization - parameters. + Create a OptimumTextEmbedder component. + + :param model: + A string representing the model id on HF Hub. + :param token: + The HuggingFace token to use as HTTP bearer authorization. + :param prefix: + A string to add to the beginning of each text. + :param suffix: + A string to add to the end of each text. + :param normalize_embeddings: + Whether to normalize the embeddings to unit length. + :param onnx_execution_provider: + The [execution provider](https://onnxruntime.ai/docs/execution-providers/) + to use for ONNX models. + + Note: Using the TensorRT execution provider + TensorRT requires to build its inference engine ahead of inference, which takes some time due to the model + optimization and nodes fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime + provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We + recommend setting these two provider options using the model_kwargs parameter, when using the TensorRT + execution provider. The usage is as follows: + ```python + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/all-mpnet-base-v2", + onnx_execution_provider="TensorrtExecutionProvider", + model_kwargs={ + "provider_options": { + "trt_engine_cache_enable": True, + "trt_engine_cache_path": "tmp/trt_cache", + } + }, + ) + ``` + :param pooling_mode: + The pooling mode to use. When `None`, pooling mode will be inferred from the model config. + :param model_kwargs: + Dictionary containing additional keyword arguments to pass to the model. + In case of duplication, these kwargs override `model`, `onnx_execution_provider` + and `token` initialization parameters. + :param working_dir: + The directory to use for storing intermediate files + generated during model optimization/quantization. + + Required for optimization and quantization. + :param optimizer_settings: + Configuration for Optimum Embedder Optimization. + If `None`, no additional optimization is applied. + :param quantizer_settings: + Configuration for Optimum Embedder Quantization. + If `None`, no quantization is applied. """ params = _EmbedderParams( model=model, @@ -97,13 +109,16 @@ def __init__( progress_bar=False, pooling_mode=pooling_mode, model_kwargs=model_kwargs, + working_dir=working_dir, + optimizer_settings=optimizer_settings, + quantizer_settings=quantizer_settings, ) self._backend = _EmbedderBackend(params) self._initialized = False def warm_up(self): """ - Load the embedding backend. + Initializes the component. """ if self._initialized: return @@ -113,7 +128,10 @@ def warm_up(self): def to_dict(self) -> Dict[str, Any]: """ - Serialize this component to a dictionary. + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. 
""" init_params = self._backend.parameters.serialize() # Remove init params that are not provided to the text embedder. @@ -124,7 +142,12 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "OptimumTextEmbedder": """ - Deserialize this component from a dictionary. + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. """ _EmbedderParams.deserialize_inplace(data["init_parameters"]) return default_from_dict(cls, data) @@ -134,8 +157,10 @@ def run(self, text: str): """ Embed a string. - :param text: The text to embed. - :return: The embeddings of the text. + :param text: + The text to embed. + :returns: + The embeddings of the text. """ if not self._initialized: msg = "The embedding model has not been loaded. Please call warm_up() before running." diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py index c4d195b8e..41aa24d64 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py @@ -22,11 +22,10 @@ class OptimumEmbedderPooling(Enum): MEAN_SQRT_LEN = "mean_sqrt_len" #: Perform weighted (position) mean pooling on the output of the - #: embedding model. See https://arxiv.org/abs/2202.08904. + #: embedding model. WEIGHTED_MEAN = "weighted_mean" #: Perform Last Token Pooling on the output of the embedding model. - #: See https://arxiv.org/abs/2202.08904 & https://arxiv.org/abs/2201.10005. LAST_TOKEN = "last_token" def __str__(self): @@ -38,9 +37,9 @@ def from_str(cls, string: str) -> "OptimumEmbedderPooling": Create a pooling mode from a string. :param string: - The string to convert. + String to convert. :returns: - The pooling mode. + Pooling mode. """ enum_map = {e.value: e for e in OptimumEmbedderPooling} pooling_mode = enum_map.get(string) diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py new file mode 100644 index 000000000..2e68081b5 --- /dev/null +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py @@ -0,0 +1,105 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict + +from optimum.onnxruntime.configuration import AutoQuantizationConfig, QuantizationConfig + + +class OptimumEmbedderQuantizationMode(Enum): + """ + [Dynamic Quantization Modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization) + support by the Optimum Embedders. + """ + + #: Quantization for the ARM64 architecture. + ARM64 = "arm64" + + #: Quantization with AVX-2 instructions. + AVX2 = "avx2" + + #: Quantization with AVX-512 instructions. + AVX512 = "avx512" + + #: Quantization with AVX-512 and VNNI instructions. + AVX512_VNNI = "avx512_vnni" + + def __str__(self): + return self.value + + @classmethod + def from_str(cls, string: str) -> "OptimumEmbedderQuantizationMode": + """ + Create an quantization mode from a string. + + :param string: + String to convert. + :returns: + Quantization mode. + """ + enum_map = {e.value: e for e in OptimumEmbedderQuantizationMode} + q_mode = enum_map.get(string) + if q_mode is None: + msg = f"Unknown quantization mode '{string}'. 
Supported modes are: {list(enum_map.keys())}" + raise ValueError(msg) + return q_mode + + +@dataclass(frozen=True) +class OptimumEmbedderQuantizationConfig: + """ + Configuration for Optimum Embedder Quantization. + + :param mode: + Quantization mode. + :param per_channel: + Whether to apply per-channel quantization. + """ + + mode: OptimumEmbedderQuantizationMode + per_channel: bool = False + + def to_optimum_config(self) -> QuantizationConfig: + """ + Convert the configuration to a Optimum configuration. + + :returns: + Optimum configuration. + """ + if self.mode == OptimumEmbedderQuantizationMode.ARM64: + return AutoQuantizationConfig.arm64(is_static=False, per_channel=self.per_channel) + elif self.mode == OptimumEmbedderQuantizationMode.AVX2: + return AutoQuantizationConfig.avx2(is_static=False, per_channel=self.per_channel) + elif self.mode == OptimumEmbedderQuantizationMode.AVX512: + return AutoQuantizationConfig.avx512(is_static=False, per_channel=self.per_channel) + elif self.mode == OptimumEmbedderQuantizationMode.AVX512_VNNI: + return AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=self.per_channel) + else: + msg = f"Unknown quantization mode '{self.mode}'" + raise ValueError(msg) + + def to_dict(self) -> Dict[str, Any]: + """ + Convert the configuration to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return { + "mode": str(self.mode), + "per_channel": self.per_channel, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "OptimumEmbedderQuantizationConfig": + """ + Create a configuration from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Quantization configuration. + """ + return OptimumEmbedderQuantizationConfig( + mode=OptimumEmbedderQuantizationMode.from_str(data["mode"]), + per_channel=data["per_channel"], + ) diff --git a/integrations/optimum/tests/test_optimum_document_embedder.py b/integrations/optimum/tests/test_optimum_document_embedder.py index bcbccd533..9288bb688 100644 --- a/integrations/optimum/tests/test_optimum_document_embedder.py +++ b/integrations/optimum/tests/test_optimum_document_embedder.py @@ -1,12 +1,21 @@ from unittest.mock import MagicMock, patch +import tempfile +import copy import pytest from haystack.dataclasses import Document from haystack.utils.auth import Secret from haystack_integrations.components.embedders.optimum import OptimumDocumentEmbedder from haystack_integrations.components.embedders.optimum.pooling import OptimumEmbedderPooling +from haystack_integrations.components.embedders.optimum.optimization import ( + OptimumEmbedderOptimizationConfig, + OptimumEmbedderOptimizationMode, +) +from haystack_integrations.components.embedders.optimum.quantization import ( + OptimumEmbedderQuantizationConfig, + OptimumEmbedderQuantizationMode, +) from huggingface_hub.utils import RepositoryNotFoundError -import copy @pytest.fixture @@ -63,6 +72,9 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 pooling_mode="max", onnx_execution_provider="CUDAExecutionProvider", model_kwargs={"trust_remote_code": True}, + working_dir="working_dir", + optimizer_settings=None, + quantizer_settings=None, ) assert embedder._backend.parameters.model == "sentence-transformers/all-minilm-l6-v2" @@ -82,6 +94,9 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 "provider": "CUDAExecutionProvider", "use_auth_token": "fake-api-token", } + assert embedder._backend.parameters.working_dir == "working_dir" + assert 
embedder._backend.parameters.optimizer_settings is None + assert embedder._backend.parameters.quantizer_settings is None def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): # noqa: ARG002 component = OptimumDocumentEmbedder() @@ -105,6 +120,9 @@ def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", }, + "working_dir": None, + "optimizer_settings": None, + "quantizer_settings": None, }, } @@ -125,6 +143,9 @@ def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): "provider": "CPUExecutionProvider", "use_auth_token": None, } + assert embedder._backend.parameters.working_dir is None + assert embedder._backend.parameters.optimizer_settings is None + assert embedder._backend.parameters.quantizer_settings is None def test_to_and_from_dict_with_custom_init_parameters( self, mock_check_valid_model, mock_get_pooling_mode @@ -142,6 +163,11 @@ def test_to_and_from_dict_with_custom_init_parameters( onnx_execution_provider="CUDAExecutionProvider", pooling_mode="max", model_kwargs={"trust_remote_code": True}, + working_dir="working_dir", + optimizer_settings=OptimumEmbedderOptimizationConfig(OptimumEmbedderOptimizationMode.O1, for_gpu=True), + quantizer_settings=OptimumEmbedderQuantizationConfig( + OptimumEmbedderQuantizationMode.ARM64, per_channel=True + ), ) data = component.to_dict() @@ -164,6 +190,9 @@ def test_to_and_from_dict_with_custom_init_parameters( "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", }, + "working_dir": "working_dir", + "optimizer_settings": {"mode": "o1", "for_gpu": True}, + "quantizer_settings": {"mode": "arm64", "per_channel": True}, }, } @@ -185,6 +214,13 @@ def test_to_and_from_dict_with_custom_init_parameters( "provider": "CUDAExecutionProvider", "use_auth_token": None, } + assert embedder._backend.parameters.working_dir == "working_dir" + assert embedder._backend.parameters.optimizer_settings == OptimumEmbedderOptimizationConfig( + OptimumEmbedderOptimizationMode.O1, for_gpu=True + ) + assert embedder._backend.parameters.quantizer_settings == OptimumEmbedderQuantizationConfig( + OptimumEmbedderQuantizationMode.ARM64, per_channel=True + ) def test_initialize_with_invalid_model(self, mock_check_valid_model): mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") @@ -287,7 +323,7 @@ def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 def test_run_on_empty_list(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/paraphrase-albert-small-v2", ) embedder.warm_up() empty_list_input = [] @@ -297,7 +333,24 @@ def test_run_on_empty_list(self, mock_check_valid_model): # noqa: ARG002 assert not result["documents"] # empty list @pytest.mark.integration - def test_run(self): + @pytest.mark.parametrize( + "opt_config, quant_config", + [ + (None, None), + ( + OptimumEmbedderOptimizationConfig(OptimumEmbedderOptimizationMode.O1, for_gpu=False), + None, + ), + (None, OptimumEmbedderQuantizationConfig(OptimumEmbedderQuantizationMode.AVX2)), + # onxxruntime 1.17.x breaks support for quantizing optimized models. 
+ # c.f https://discuss.huggingface.co/t/optimize-and-quantize-with-optimum/23675/12 + # ( + # OptimumEmbedderOptimizationConfig(OptimumEmbedderOptimizationMode.O2, for_gpu=False), + # OptimumEmbedderQuantizationConfig(OptimumEmbedderQuantizationMode.AVX2), + # ), + ], + ) + def test_run(self, opt_config, quant_config): docs = [ Document(content="I love cheese", meta={"topic": "Cuisine"}), Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), @@ -305,18 +358,22 @@ def test_run(self): ] docs_copy = copy.deepcopy(docs) - embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", - prefix="prefix ", - suffix=" suffix", - meta_fields_to_embed=["topic"], - embedding_separator=" | ", - batch_size=1, - ) - embedder.warm_up() + with tempfile.TemporaryDirectory() as tmpdirname: + embedder = OptimumDocumentEmbedder( + model="sentence-transformers/paraphrase-albert-small-v2", + prefix="prefix ", + suffix=" suffix", + meta_fields_to_embed=["topic"], + embedding_separator=" | ", + batch_size=1, + working_dir=tmpdirname, + optimizer_settings=opt_config, + quantizer_settings=quant_config, + ) + embedder.warm_up() - result = embedder.run(documents=docs) - expected = [embedder.run([d]) for d in docs_copy] + result = embedder.run(documents=docs) + expected = [embedder.run([d]) for d in docs_copy] documents_with_embeddings = result["documents"] diff --git a/integrations/optimum/tests/test_optimum_text_embedder.py b/integrations/optimum/tests/test_optimum_text_embedder.py index ce5bc2ffb..ad0e7d800 100644 --- a/integrations/optimum/tests/test_optimum_text_embedder.py +++ b/integrations/optimum/tests/test_optimum_text_embedder.py @@ -4,6 +4,14 @@ from haystack.utils.auth import Secret from haystack_integrations.components.embedders.optimum import OptimumTextEmbedder from haystack_integrations.components.embedders.optimum.pooling import OptimumEmbedderPooling +from haystack_integrations.components.embedders.optimum.optimization import ( + OptimumEmbedderOptimizationConfig, + OptimumEmbedderOptimizationMode, +) +from haystack_integrations.components.embedders.optimum.quantization import ( + OptimumEmbedderQuantizationConfig, + OptimumEmbedderQuantizationMode, +) from huggingface_hub.utils import RepositoryNotFoundError @@ -53,6 +61,9 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 pooling_mode="max", onnx_execution_provider="CUDAExecutionProvider", model_kwargs={"trust_remote_code": True}, + working_dir="working_dir", + optimizer_settings=None, + quantizer_settings=None, ) assert embedder._backend.parameters.model == "sentence-transformers/all-minilm-l6-v2" @@ -68,6 +79,9 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 "provider": "CUDAExecutionProvider", "use_auth_token": "fake-api-token", } + assert embedder._backend.parameters.working_dir == "working_dir" + assert embedder._backend.parameters.optimizer_settings is None + assert embedder._backend.parameters.quantizer_settings is None def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): # noqa: ARG002 component = OptimumTextEmbedder() @@ -83,10 +97,13 @@ def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): "normalize_embeddings": True, "onnx_execution_provider": "CPUExecutionProvider", "pooling_mode": "mean", + "working_dir": None, "model_kwargs": { "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", }, + "optimizer_settings": None, + 
"quantizer_settings": None, }, } @@ -103,6 +120,9 @@ def test_to_and_from_dict(self, mock_check_valid_model, mock_get_pooling_mode): "provider": "CPUExecutionProvider", "use_auth_token": None, } + assert embedder._backend.parameters.working_dir is None + assert embedder._backend.parameters.optimizer_settings is None + assert embedder._backend.parameters.quantizer_settings is None def test_to_and_from_dict_with_custom_init_parameters(self, mock_check_valid_model): # noqa: ARG002 component = OptimumTextEmbedder( @@ -114,6 +134,11 @@ def test_to_and_from_dict_with_custom_init_parameters(self, mock_check_valid_mod onnx_execution_provider="CUDAExecutionProvider", pooling_mode="max", model_kwargs={"trust_remote_code": True}, + working_dir="working_dir", + optimizer_settings=OptimumEmbedderOptimizationConfig(OptimumEmbedderOptimizationMode.O1, for_gpu=True), + quantizer_settings=OptimumEmbedderQuantizationConfig( + OptimumEmbedderQuantizationMode.ARM64, per_channel=True + ), ) data = component.to_dict() @@ -132,6 +157,9 @@ def test_to_and_from_dict_with_custom_init_parameters(self, mock_check_valid_mod "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", }, + "working_dir": "working_dir", + "optimizer_settings": {"mode": "o1", "for_gpu": True}, + "quantizer_settings": {"mode": "arm64", "per_channel": True}, }, } @@ -149,6 +177,13 @@ def test_to_and_from_dict_with_custom_init_parameters(self, mock_check_valid_mod "provider": "CUDAExecutionProvider", "use_auth_token": None, } + assert embedder._backend.parameters.working_dir == "working_dir" + assert embedder._backend.parameters.optimizer_settings == OptimumEmbedderOptimizationConfig( + OptimumEmbedderOptimizationMode.O1, for_gpu=True + ) + assert embedder._backend.parameters.quantizer_settings == OptimumEmbedderQuantizationConfig( + OptimumEmbedderQuantizationMode.ARM64, per_channel=True + ) def test_initialize_with_invalid_model(self, mock_check_valid_model): mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id") @@ -194,7 +229,7 @@ def test_infer_pooling_mode_from_hf(self): def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumTextEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/paraphrase-albert-small-v2", token=Secret.from_token("fake-api-token"), pooling_mode="mean", ) @@ -209,7 +244,7 @@ def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 def test_run(self): for pooling_mode in OptimumEmbedderPooling: embedder = OptimumTextEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/paraphrase-albert-small-v2", prefix="prefix ", suffix=" suffix", pooling_mode=pooling_mode, From d31442d6b7945ccd379da43165c98a48d98a9918 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 28 Feb 2024 19:10:07 +0100 Subject: [PATCH 4/4] chore: update docstrings (#497) * update docstrings * Apply suggestions from code review Co-authored-by: Stefano Fiorucci * Apply suggestions from code review * Final touch to chat_generator --------- Co-authored-by: Stefano Fiorucci --- .../embedders/mistral/document_embedder.py | 32 +++++++---- .../embedders/mistral/text_embedder.py | 42 ++++++++------- .../generators/mistral/chat/chat_generator.py | 53 ++++++++++--------- 3 files changed, 71 insertions(+), 56 deletions(-) diff --git a/integrations/mistral/src/haystack_integrations/components/embedders/mistral/document_embedder.py 
b/integrations/mistral/src/haystack_integrations/components/embedders/mistral/document_embedder.py index 29161fd95..780eab268 100644 --- a/integrations/mistral/src/haystack_integrations/components/embedders/mistral/document_embedder.py +++ b/integrations/mistral/src/haystack_integrations/components/embedders/mistral/document_embedder.py @@ -43,17 +43,27 @@ def __init__( embedding_separator: str = "\n", ): """ - Create a MistralDocumentEmbedder component. - :param api_key: The Mistral API key. - :param model: The name of the model to use. - :param api_base_url: The Mistral API Base url, defaults to None. For more details, see Mistral [docs](https://docs.mistral.ai/api/). - :param prefix: A string to add to the beginning of each text. - :param suffix: A string to add to the end of each text. - :param batch_size: Number of Documents to encode at once. - :param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments - to keep the logs clean. - :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document text. - :param embedding_separator: Separator used to concatenate the meta fields to the Document text. + Creates a MistralDocumentEmbedder component. + + :param api_key: + The Mistral API key. + :param model: + The name of the model to use. + :param api_base_url: + The Mistral API Base url. For more details, see Mistral [docs](https://docs.mistral.ai/api/). + :param prefix: + A string to add to the beginning of each text. + :param suffix: + A string to add to the end of each text. + :param batch_size: + Number of Documents to encode at once. + :param progress_bar: + Whether to show a progress bar or not. Can be helpful to disable in production deployments to keep + the logs clean. + :param meta_fields_to_embed: + List of meta fields that should be embedded along with the Document text. + :param embedding_separator: + Separator used to concatenate the meta fields to the Document text. """ super(MistralDocumentEmbedder, self).__init__( # noqa: UP008 api_key=api_key, diff --git a/integrations/mistral/src/haystack_integrations/components/embedders/mistral/text_embedder.py b/integrations/mistral/src/haystack_integrations/components/embedders/mistral/text_embedder.py index d65828ef6..b299e91c7 100644 --- a/integrations/mistral/src/haystack_integrations/components/embedders/mistral/text_embedder.py +++ b/integrations/mistral/src/haystack_integrations/components/embedders/mistral/text_embedder.py @@ -11,22 +11,21 @@ @component class MistralTextEmbedder(OpenAITextEmbedder): """ - A component for embedding strings using Mistral models. + A component for embedding strings using Mistral models. - Usage example: + Usage example: ```python from haystack_integrations.components.embedders.mistral.text_embedder import MistralTextEmbedder - text_to_embed = "I love pizza!" + text_to_embed = "I love pizza!" 
+ text_embedder = MistralTextEmbedder() + print(text_embedder.run(text_to_embed)) - text_embedder = MistralTextEmbedder() - - print(text_embedder.run(text_to_embed)) - - # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], - # 'meta': {'model': 'text-embedding-ada-002-v2', - # 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}} - ``` + # output: + # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], + # 'meta': {'model': 'mistral-embed', + # 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}} + ``` """ def __init__( @@ -38,14 +37,19 @@ def __init__( suffix: str = "", ): """ - Create an MistralTextEmbedder component. - - :param api_key: The Misttal API key. - :param model: The name of the Mistral embedding models to be used. - :param api_base_url: The Mistral API Base url, defaults to `https://api.mistral.ai/v1`. - For more details, see Mistral [docs](https://docs.mistral.ai/api/). - :param prefix: A string to add to the beginning of each text. - :param suffix: A string to add to the end of each text. + Creates an MistralTextEmbedder component. + + :param api_key: + The Mistral API key. + :param model: + The name of the Mistral embedding model to be used. + :param api_base_url: + The Mistral API Base url. + For more details, see Mistral [docs](https://docs.mistral.ai/api/). + :param prefix: + A string to add to the beginning of each text. + :param suffix: + A string to add to the end of each text. """ super(MistralTextEmbedder, self).__init__( # noqa: UP008 api_key=api_key, diff --git a/integrations/mistral/src/haystack_integrations/components/generators/mistral/chat/chat_generator.py b/integrations/mistral/src/haystack_integrations/components/generators/mistral/chat/chat_generator.py index e1399c203..6bf703bc1 100644 --- a/integrations/mistral/src/haystack_integrations/components/generators/mistral/chat/chat_generator.py +++ b/integrations/mistral/src/haystack_integrations/components/generators/mistral/chat/chat_generator.py @@ -12,17 +12,27 @@ @component class MistralChatGenerator(OpenAIChatGenerator): """ - Enables text generation using Mistral's large language models (LLMs). - Currently supports `mistral-tiny`, `mistral-small` and `mistral-medium` - models accessed through the chat completions API endpoint. + Enables text generation using Mistral AI generative models. + For supported models, see [Mistral AI docs](https://docs.mistral.ai/platform/endpoints/#operation/listModels). - Users can pass any text generation parameters valid for the `openai.ChatCompletion.create` method - directly to this component via the `**generation_kwargs` parameter in __init__ or the `**generation_kwargs` + Users can pass any text generation parameters valid for the Mistral Chat Completion API + directly to this component via the `generation_kwargs` parameter in `__init__` or the `generation_kwargs` parameter in `run` method. + Key Features and Compatibility: + - **Primary Compatibility**: Designed to work seamlessly with the Mistral API Chat Completion endpoint. + - **Streaming Support**: Supports streaming responses from the Mistral API Chat Completion endpoint. + - **Customizability**: Supports all parameters supported by the Mistral API Chat Completion endpoint. + + This component uses the ChatMessage format for structuring both input and output, + ensuring coherent and contextually relevant responses in chat-based text generation scenarios. 
+ Details on the ChatMessage format can be found in the + [Haystack docs](https://docs.haystack.deepset.ai/v2.0/docs/data-classes#chatmessage) + For more details on the parameters supported by the Mistral API, refer to the [Mistral API Docs](https://docs.mistral.ai/api/). + Usage example: ```python from haystack_integrations.components.generators.mistral import MistralChatGenerator from haystack.dataclasses import ChatMessage @@ -38,19 +48,7 @@ class MistralChatGenerator(OpenAIChatGenerator): >>meaningful and useful.', role=, name=None, >>meta={'model': 'mistral-tiny', 'index': 0, 'finish_reason': 'stop', >>'usage': {'prompt_tokens': 15, 'completion_tokens': 36, 'total_tokens': 51}})]} - ``` - - Key Features and Compatibility: - - **Primary Compatibility**: Designed to work seamlessly with the Mistral API Chat Completion endpoint. - - **Streaming Support**: Supports streaming responses from the Mistral API Chat Completion endpoint. - - **Customizability**: Supports all parameters supported by the Mistral API Chat Completion endpoint. - - Input and Output Format: - - **ChatMessage Format**: This component uses the ChatMessage format for structuring both input and output, - ensuring coherent and contextually relevant responses in chat-based text generation scenarios. - Details on the ChatMessage format can be found at: https://github.com/openai/openai-python/blob/main/chatml.md. - Note that the Mistral API does not accept `system` messages yet. You can use `user` and `assistant` messages. """ def __init__( @@ -65,15 +63,19 @@ def __init__( Creates an instance of MistralChatGenerator. Unless specified otherwise in the `model`, this is for Mistral's `mistral-tiny` model. - :param api_key: The Mistral API key. - :param model: The name of the Mistral chat completion model to use. - :param streaming_callback: A callback function that is called when a new token is received from the stream. + :param api_key: + The Mistral API key. + :param model: + The name of the Mistral chat completion model to use. + :param streaming_callback: + A callback function that is called when a new token is received from the stream. The callback function accepts StreamingChunk as an argument. - :param api_base_url: The Mistral API Base url, defaults to `https://api.mistral.ai/v1`. - For more details, see Mistral [docs](https://docs.mistral.ai/api/). - :param generation_kwargs: Other parameters to use for the model. These parameters are all sent directly to - the Mistrak endpoint. See [Mistral API docs](https://docs.mistral.ai/api/t) for - more details. + :param api_base_url: + The Mistral API Base url. + For more details, see Mistral [docs](https://docs.mistral.ai/api/). + :param generation_kwargs: + Other parameters to use for the model. These parameters are all sent directly to + the Mistral endpoint. See [Mistral API docs](https://docs.mistral.ai/api/) for more details. Some of the supported parameters: - `max_tokens`: The maximum number of tokens the output text can have. - `temperature`: What sampling temperature to use. Higher values mean the model will take more risks. @@ -83,7 +85,6 @@ def __init__( comprising the top 10% probability mass are considered. - `stream`: Whether to stream back partial progress. If set, tokens will be sent as data-only server-sent events as they become available, with the stream terminated by a data: [DONE] message. - - `stop`: One or more sequences after which the LLM should stop generating tokens. - `safe_prompt`: Whether to inject a safety prompt before all conversations. 
- `random_seed`: The seed to use for random sampling. """
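
As a quick reference for the `generation_kwargs` documented above, a minimal usage sketch follows. It assumes the Mistral integration package is installed and the `MISTRAL_API_KEY` environment variable is set; the model name and parameter values are purely illustrative, not the integration's defaults.

```python
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.mistral import MistralChatGenerator

# Assumes MISTRAL_API_KEY is set in the environment; values below are illustrative.
generator = MistralChatGenerator(
    model="mistral-tiny",
    generation_kwargs={
        "max_tokens": 256,    # cap on the length of the generated reply
        "temperature": 0.7,   # higher values make the output more varied
        "safe_prompt": True,  # inject a safety prompt before the conversation
        "random_seed": 42,    # seed for reproducible sampling
    },
)

messages = [ChatMessage.from_user("Summarize what Mistral AI offers in one sentence.")]
result = generator.run(messages=messages)
print(result["replies"][0].content)
```

The same `generation_kwargs` dictionary can also be passed to `run` to override the constructor settings for a single call.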