refactor!: Simplify Optimum backend impl (#477)
Showing 11 changed files with 401 additions and 526 deletions.
198 changes: 198 additions & 0 deletions
integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py
@@ -0,0 +1,198 @@
import copy
import json
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union

import numpy as np
import torch
from haystack.utils import Secret, deserialize_secrets_inplace
from haystack.utils.hf import HFModelType, check_valid_model, deserialize_hf_model_kwargs, serialize_hf_model_kwargs
from huggingface_hub import hf_hub_download
from sentence_transformers.models import Pooling as SentenceTransformerPoolingLayer
from tqdm import tqdm
from transformers import AutoTokenizer

from optimum.onnxruntime import ORTModelForFeatureExtraction

from .pooling import OptimumEmbedderPooling


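# Holds every init-time parameter of the Optimum embedders and centralizes
# their (de)serialization for Haystack's to_dict/from_dict round-trip.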
@dataclass
class _EmbedderParams:
    model: str
    token: Optional[Secret]
    prefix: str
    suffix: str
    normalize_embeddings: bool
    onnx_execution_provider: str
    batch_size: int
    progress_bar: bool
    pooling_mode: Optional[Union[str, OptimumEmbedderPooling]]
    model_kwargs: Optional[Dict[str, Any]]

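    # Convert fields that are not directly JSON-serializable (the pooling enum,
    # the Secret token, the Hugging Face model kwargs) into plain values.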
    def serialize(self) -> Dict[str, Any]:
        out = {}
        for field in self.__dataclass_fields__.keys():
            out[field] = copy.deepcopy(getattr(self, field))

        # Fixups.
        assert isinstance(self.pooling_mode, OptimumEmbedderPooling)
        out["pooling_mode"] = self.pooling_mode.value
        out["token"] = self.token.to_dict() if self.token else None
        out["model_kwargs"].pop("use_auth_token", None)
        serialize_hf_model_kwargs(out["model_kwargs"])
        return out

    @classmethod
    def deserialize_inplace(cls, data: Dict[str, Any]) -> Dict[str, Any]:
        data["pooling_mode"] = OptimumEmbedderPooling.from_str(data["pooling_mode"])
        deserialize_secrets_inplace(data, keys=["token"])
        deserialize_hf_model_kwargs(data["model_kwargs"])
        return data


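# Wraps the ONNX Runtime model, its tokenizer, and a sentence-transformers
# pooling layer behind a single object shared by the Optimum embedder components.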
class _EmbedderBackend:
    def __init__(self, params: _EmbedderParams):
        check_valid_model(params.model, HFModelType.EMBEDDING, params.token)
        resolved_token = params.token.resolve_value() if params.token else None

        if isinstance(params.pooling_mode, str):
            params.pooling_mode = OptimumEmbedderPooling.from_str(params.pooling_mode)
        elif params.pooling_mode is None:
            params.pooling_mode = _pooling_from_model_config(params.model, resolved_token)

        if params.pooling_mode is None:
            modes = {e.value: e for e in OptimumEmbedderPooling}
            msg = (
                "Pooling mode not found in model config and not specified by user."
                f" Supported modes are: {list(modes.keys())}"
            )
            raise ValueError(msg)

        params.model_kwargs = params.model_kwargs or {}

        # Populate model_kwargs from the init parameters unless the caller
        # has already set them explicitly.
        params.model_kwargs.setdefault("model_id", params.model)
        params.model_kwargs.setdefault("provider", params.onnx_execution_provider)
        params.model_kwargs.setdefault("use_auth_token", resolved_token)

        self.params = params
        self.model = None
        self.tokenizer = None
        self.pooling_layer = None

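    # Model loading is deferred to warm_up() so constructing the component stays
    # cheap; export=True converts the checkpoint to ONNX through Optimum.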
    def warm_up(self):
        self.model = ORTModelForFeatureExtraction.from_pretrained(**self.params.model_kwargs, export=True)
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.params.model, token=self.params.token.resolve_value() if self.params.token else None
        )

        # We need the width of the embeddings to initialize the pooling layer,
        # so we do a dummy forward pass with the model.
        dummy_input = self.tokenizer(["dummy input"], padding=True, truncation=True, return_tensors="pt").to(
            self.model.device
        )
        dummy_output = self.model(input_ids=dummy_input["input_ids"], attention_mask=dummy_input["attention_mask"])
        width = dummy_output[0].size(dim=2)  # BaseModelOutput.last_hidden_state

        self.pooling_layer = SentenceTransformerPoolingLayer(
            width,
            pooling_mode_cls_token=self.params.pooling_mode == OptimumEmbedderPooling.CLS,
            pooling_mode_max_tokens=self.params.pooling_mode == OptimumEmbedderPooling.MAX,
            pooling_mode_mean_tokens=self.params.pooling_mode == OptimumEmbedderPooling.MEAN,
            pooling_mode_mean_sqrt_len_tokens=self.params.pooling_mode == OptimumEmbedderPooling.MEAN_SQRT_LEN,
            pooling_mode_weightedmean_tokens=self.params.pooling_mode == OptimumEmbedderPooling.WEIGHTED_MEAN,
            pooling_mode_lasttoken=self.params.pooling_mode == OptimumEmbedderPooling.LAST_TOKEN,
        )

    @property
    def parameters(self) -> _EmbedderParams:
        return self.params

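    # The sentence-transformers Pooling module consumes a features dict holding
    # "token_embeddings" and "attention_mask" and returns it with a pooled
    # "sentence_embedding" entry added.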
    def pool_embeddings(self, model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        assert self.pooling_layer is not None
        features = {"token_embeddings": model_output, "attention_mask": attention_mask}
        pooled_outputs = self.pooling_layer.forward(features)
        return pooled_outputs["sentence_embedding"]

    def embed_texts(
        self,
        texts_to_embed: Union[str, List[str]],
    ) -> Union[List[List[float]], List[float]]:
        assert self.model is not None
        assert self.tokenizer is not None

        if isinstance(texts_to_embed, str):
            texts = [texts_to_embed]
        else:
            texts = texts_to_embed

        device = self.model.device

        # Sort by descending length so each batch pads its inputs to a similar
        # length, minimizing wasted compute on padding tokens.
        length_sorted_idx = np.argsort([-len(sen) for sen in texts])
        sentences_sorted = [texts[idx] for idx in length_sorted_idx]

        all_embeddings = []
        for i in tqdm(
            range(0, len(sentences_sorted), self.params.batch_size),
            disable=not self.params.progress_bar,
            desc="Calculating embeddings",
        ):
            batch = sentences_sorted[i : i + self.params.batch_size]
            encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
            model_output = self.model(
                input_ids=encoded_input["input_ids"], attention_mask=encoded_input["attention_mask"]
            )
            sentence_embeddings = self.pool_embeddings(model_output[0], encoded_input["attention_mask"].to(device))
            all_embeddings.append(sentence_embeddings)

        embeddings = torch.cat(all_embeddings, dim=0)

        if self.params.normalize_embeddings:
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        embeddings = embeddings.tolist()

        # Restore the original input order that the length sort disturbed.
        reordered_embeddings: List[List[float]] = [None] * len(texts)  # type: ignore
        for embedding, idx in zip(embeddings, length_sorted_idx):
            reordered_embeddings[idx] = embedding

        if isinstance(texts_to_embed, str):
            return reordered_embeddings[0]
        else:
            return reordered_embeddings


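# Read the pooling strategy from the sentence-transformers pooling config
# ("1_Pooling/config.json") that many embedding model repos ship.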
def _pooling_from_model_config(model: str, token: Optional[str] = None) -> Optional[OptimumEmbedderPooling]:
    try:
        pooling_config_path = hf_hub_download(repo_id=model, token=token, filename="1_Pooling/config.json")
    except Exception as e:
        msg = f"An error occurred while downloading the model config: {e}"
        raise ValueError(msg) from e

    with open(pooling_config_path) as f:
        pooling_config = json.load(f)

    # Keep only the "pooling_mode*" keys that are set to True.
    true_pooling_modes = [key for key, value in pooling_config.items() if key.startswith("pooling_mode") and value]

    # If exactly one mode is enabled, map it to the matching enum value;
    # otherwise (none or several enabled), return None and let the caller
    # raise the error.
    if len(true_pooling_modes) == 1:
        pooling_mode_from_config = true_pooling_modes[0]
        pooling_mode = _POOLING_MODES_MAP.get(pooling_mode_from_config)
    else:
        pooling_mode = None
    return pooling_mode


_POOLING_MODES_MAP = {
    "pooling_mode_cls_token": OptimumEmbedderPooling.CLS,
    "pooling_mode_mean_tokens": OptimumEmbedderPooling.MEAN,
    "pooling_mode_max_tokens": OptimumEmbedderPooling.MAX,
    "pooling_mode_mean_sqrt_len_tokens": OptimumEmbedderPooling.MEAN_SQRT_LEN,
    "pooling_mode_weightedmean_tokens": OptimumEmbedderPooling.WEIGHTED_MEAN,
    "pooling_mode_lasttoken": OptimumEmbedderPooling.LAST_TOKEN,
}
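
A minimal sketch of how the new backend is driven, assuming the embedder
components pass their init parameters through _EmbedderParams; the model name
and pooling mode below are illustrative, not part of this diff:

    from haystack.utils import Secret

    params = _EmbedderParams(
        model="sentence-transformers/all-MiniLM-L6-v2",
        token=Secret.from_env_var("HF_API_TOKEN", strict=False),
        prefix="",
        suffix="",
        normalize_embeddings=True,
        onnx_execution_provider="CPUExecutionProvider",
        batch_size=32,
        progress_bar=False,
        pooling_mode="mean",
        model_kwargs=None,
    )
    backend = _EmbedderBackend(params)
    backend.warm_up()  # exports the model to ONNX and builds the pooling layer
    embedding = backend.embed_texts("An example query.")  # -> List[float]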
101 changes: 0 additions & 101 deletions
...rations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_backend.py
This file was deleted.