Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: HuggingFaceAPITextEmbedder #7484

Merged
merged 9 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/pydoc/config/embedders_api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ loaders:
"azure_text_embedder",
"hugging_face_tei_document_embedder",
"hugging_face_tei_text_embedder",
"hugging_face_api_text_embedder",
"openai_document_embedder",
"openai_text_embedder",
"sentence_transformers_document_embedder",
Expand Down
2 changes: 2 additions & 0 deletions haystack/components/embedders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from haystack.components.embedders.azure_document_embedder import AzureOpenAIDocumentEmbedder
from haystack.components.embedders.azure_text_embedder import AzureOpenAITextEmbedder
from haystack.components.embedders.hugging_face_api_text_embedder import HuggingFaceAPITextEmbedder
from haystack.components.embedders.hugging_face_tei_document_embedder import HuggingFaceTEIDocumentEmbedder
from haystack.components.embedders.hugging_face_tei_text_embedder import HuggingFaceTEITextEmbedder
from haystack.components.embedders.openai_document_embedder import OpenAIDocumentEmbedder
Expand All @@ -10,6 +11,7 @@
__all__ = [
"HuggingFaceTEITextEmbedder",
"HuggingFaceTEIDocumentEmbedder",
"HuggingFaceAPITextEmbedder",
"SentenceTransformersTextEmbedder",
"SentenceTransformersDocumentEmbedder",
"OpenAITextEmbedder",
Expand Down
193 changes: 193 additions & 0 deletions haystack/components/embedders/hugging_face_api_text_embedder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import json
from typing import Any, Dict, List, Optional, Union

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.lazy_imports import LazyImport
from haystack.utils import Secret, deserialize_secrets_inplace
from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
from haystack.utils.url_validation import is_valid_http_url

with LazyImport(message="Run 'pip install \"huggingface_hub>=0.22.0\"'") as huggingface_hub_import:
from huggingface_hub import InferenceClient

logger = logging.getLogger(__name__)


@component
class HuggingFaceAPITextEmbedder:
    """
    A component to embed strings using different Hugging Face APIs:
    - [free Serverless Inference API](https://huggingface.co/inference-api)
    - [paid Inference Endpoints](https://huggingface.co/inference-endpoints)
    - [self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference)


    Example usage with the free Serverless Inference API:
    ```python
    from haystack.components.embedders import HuggingFaceAPITextEmbedder
    from haystack.utils import Secret

    text_embedder = HuggingFaceAPITextEmbedder(api_type="serverless_inference_api",
                                               api_params={"model": "BAAI/bge-small-en-v1.5"},
                                               token=Secret.from_token("<your-api-key>"))

    print(text_embedder.run("I love pizza!"))

    # {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
    ```

    Example usage with paid Inference Endpoints:
    ```python
    from haystack.components.embedders import HuggingFaceAPITextEmbedder
    from haystack.utils import Secret
    text_embedder = HuggingFaceAPITextEmbedder(api_type="inference_endpoints",
                                               api_params={"model": "BAAI/bge-small-en-v1.5"},
                                               token=Secret.from_token("<your-api-key>"))

    print(text_embedder.run("I love pizza!"))

    # {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
    ```

    Example usage with self-hosted Text Embeddings Inference:
    ```python
    from haystack.components.embedders import HuggingFaceAPITextEmbedder
    from haystack.utils import Secret

    text_embedder = HuggingFaceAPITextEmbedder(api_type="text_embeddings_inference",
                                               api_params={"url": "http://localhost:8080"})

    print(text_embedder.run("I love pizza!"))

    # {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
    ```
    """

    def __init__(
        self,
        api_type: Union[HFEmbeddingAPIType, str] = HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
        api_params: Optional[Dict[str, str]] = None,
        token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False),
        prefix: str = "",
        suffix: str = "",
        truncate: bool = True,
        normalize: bool = False,
    ):
        """
        Create an HuggingFaceAPITextEmbedder component.

        :param api_type:
            The type of Hugging Face API to use. Accepts either an `HFEmbeddingAPIType`
            member or its string value (e.g. `"serverless_inference_api"`).
        :param api_params:
            A dictionary containing the following keys:
            - `model`: model ID on the Hugging Face Hub. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or `TEXT_EMBEDDINGS_INFERENCE`.
        :param token: The HuggingFace token to use as HTTP bearer authorization.
            You can find your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param prefix:
            A string to add at the beginning of each text.
        :param suffix:
            A string to add at the end of each text.
        :param truncate:
            Truncate input text from the end to the maximum length supported by the model.
            This parameter takes effect when the `api_type` is `TEXT_EMBEDDINGS_INFERENCE`.
            It also takes effect when the `api_type` is `INFERENCE_ENDPOINTS` and the backend is based on Text Embeddings Inference.
            This parameter is ignored when the `api_type` is `SERVERLESS_INFERENCE_API` (it is always set to `True` and cannot be changed).
        :param normalize:
            Normalize the embeddings to unit length.
            This parameter takes effect when the `api_type` is `TEXT_EMBEDDINGS_INFERENCE`.
            It also takes effect when the `api_type` is `INFERENCE_ENDPOINTS` and the backend is based on Text Embeddings Inference.
            This parameter is ignored when the `api_type` is `SERVERLESS_INFERENCE_API` (it is always set to `False` and cannot be changed).
        :raises ValueError:
            If a required `api_params` key is missing for the chosen `api_type`,
            or if the provided `url` is not a valid HTTP(S) URL.
        """
        huggingface_hub_import.check()

        if isinstance(api_type, str):
            # Accept the plain-string form for convenience and for from_dict round-trips.
            api_type = HFEmbeddingAPIType.from_str(api_type)

        api_params = api_params or {}

        if api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
            model = api_params.get("model")
            if model is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(model, HFModelType.EMBEDDING, token)
            model_or_url = model
        elif api_type in [HFEmbeddingAPIType.INFERENCE_ENDPOINTS, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE]:
            url = api_params.get("url")
            if url is None:
                raise ValueError(
                    "To use Text Embeddings Inference or Inference Endpoints, you need to specify the `url` parameter in `api_params`."
                )
            if not is_valid_http_url(url):
                raise ValueError(f"Invalid URL: {url}")
            model_or_url = url
        else:
            # Defensive guard: keeps `model_or_url` from being unbound if a new
            # HFEmbeddingAPIType member is added without updating this branch.
            raise ValueError(f"Unknown api_type: {api_type}")

        self.api_type = api_type
        self.api_params = api_params
        self.token = token
        self.prefix = prefix
        self.suffix = suffix
        self.truncate = truncate
        self.normalize = normalize
        self._client = InferenceClient(model_or_url, token=token.resolve_value() if token else None)

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            # Serialize the enum as its string value so the resulting dict is
            # JSON-serializable; __init__ accepts the string form on reload.
            api_type=str(self.api_type),
            api_params=self.api_params,
            prefix=self.prefix,
            suffix=self.suffix,
            token=self.token.to_dict() if self.token else None,
            truncate=self.truncate,
            normalize=self.normalize,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceAPITextEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        return default_from_dict(cls, data)

    @component.output_types(embedding=List[float])
    def run(self, text: str):
        """
        Embed a single string.

        :param text:
            Text to embed.

        :returns:
            A dictionary with the following keys:
            - `embedding`: The embedding of the input text.
        :raises TypeError:
            If `text` is not a string (e.g. a list of Documents was passed).
        """
        if not isinstance(text, str):
            raise TypeError(
                "HuggingFaceAPITextEmbedder expects a string as an input. "
                "In case you want to embed a list of Documents, please use the HuggingFaceAPIDocumentEmbedder."
            )

        text_to_embed = self.prefix + text + self.suffix

        response = self._client.post(
            json={"inputs": [text_to_embed], "truncate": self.truncate, "normalize": self.normalize},
            task="feature-extraction",
        )
        # The API returns one embedding per input; a single input was sent.
        embedding = json.loads(response.decode())[0]

        return {"embedding": embedding}
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import warnings
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

Expand Down Expand Up @@ -74,6 +75,12 @@ def __init__(
Inference (TEI) endpoints and paid HF Inference Endpoints deployed with TEI. It will be ignored when used
with free HF Inference endpoints or paid HF Inference endpoints deployed without TEI.
"""
warnings.warn(
"`HuggingFaceTEITextEmbedder` is deprecated and will be removed in Haystack 2.3.0."
"Use `HuggingFaceAPITextEmbedder` instead.",
DeprecationWarning,
)

huggingface_hub_import.check()

if url:
Expand Down
22 changes: 22 additions & 0 deletions haystack/utils/hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,28 @@
logger = logging.getLogger(__name__)


class HFEmbeddingAPIType(Enum):
    """
    API type to use for Hugging Face API Embedders.
    """

    # Self-hosted Text Embeddings Inference (TEI) container.
    TEXT_EMBEDDINGS_INFERENCE = "text_embeddings_inference"
    # Paid Hugging Face Inference Endpoints.
    INFERENCE_ENDPOINTS = "inference_endpoints"
    # Free Hugging Face Serverless Inference API.
    SERVERLESS_INFERENCE_API = "serverless_inference_api"

    def __str__(self):
        # Render as the raw value so the enum serializes cleanly to strings.
        return self.value

    @staticmethod
    def from_str(string: str) -> "HFEmbeddingAPIType":
        """
        Convert a string to an `HFEmbeddingAPIType` member.

        :param string:
            The string value of a member, e.g. `"serverless_inference_api"`.
        :returns:
            The matching enum member.
        :raises ValueError:
            If `string` does not match any member's value.
        """
        enum_map = {e.value: e for e in HFEmbeddingAPIType}
        mode = enum_map.get(string)
        if mode is None:
            msg = f"Unknown Hugging Face API type '{string}'. Supported types are: {list(enum_map.keys())}"
            raise ValueError(msg)
        return mode


class HFModelType(Enum):
EMBEDDING = 1
GENERATION = 2
Expand Down
6 changes: 6 additions & 0 deletions haystack/utils/url_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from urllib.parse import urlparse


def is_valid_http_url(url) -> bool:
    """Return True if `url` parses as an absolute HTTP or HTTPS URL."""
    parsed = urlparse(url)
    has_http_scheme = parsed.scheme in ("http", "https")
    has_host = bool(parsed.netloc)
    return has_http_scheme and has_host
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be ignored.
I introduced and tested it in #7464, but I also added it in this PR to make the 2 PRs independent.

13 changes: 13 additions & 0 deletions releasenotes/notes/hfapitextembedder-97bf5f739f413f3e.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
features:
- |
Introduce `HuggingFaceAPITextEmbedder`.
This component can be used to embed strings using different Hugging Face APIs:
- free Serverless Inference API
- paid Inference Endpoints
- self-hosted Text Embeddings Inference.
This embedder will replace the `HuggingFaceTEITextEmbedder` in the future.
deprecations:
- |
Deprecate `HuggingFaceTEITextEmbedder`. This component will be removed in Haystack 2.3.0.
Use `HuggingFaceAPITextEmbedder` instead.
Loading