-
Notifications
You must be signed in to change notification settings - Fork 128
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
chore: update Jina Embedder usage for V3 release (#1077)
* chore: update Jina Embedder usage for V3 release
* fix: resolve lint issue
* fix: resolve test error
* fix: resolve test error
* fix: resolve lint issues
* fix: resolve lint issues
* fix: resolve lint issues
* fix: resolve lint issues
* fix: resolve lint issues
* chore: update JinaEmbedding for v3 release
* fix: resolve test errors
* fix: resolve test errors
* chore: added test case
* fix: resolve lint issues
* fix: update function call
* fix: resolve lint issues
* fix: lint error
* fix: lint error
* chore: remove unnecessary test cases
* chore: use 'task' instead of 'task_type'
* chore: add 'late_chunking' for Jina embedders
* Update integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py Co-authored-by: Silvano Cerza <[email protected]>
* Update integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py Co-authored-by: Silvano Cerza <[email protected]>
* Update integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py Co-authored-by: Silvano Cerza <[email protected]>
* Update integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py Co-authored-by: Silvano Cerza <[email protected]>
---------
Co-authored-by: Silvano Cerza <[email protected]>
- Loading branch information
Showing 4 changed files with 166 additions and 29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]> | ||
# | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from typing import Any, Dict, List | ||
from typing import Any, Dict, List, Optional | ||
|
||
import requests | ||
from haystack import component, default_from_dict, default_to_dict | ||
|
@@ -21,24 +21,27 @@ class JinaTextEmbedder: | |
# Make sure that the environment variable JINA_API_KEY is set | ||
text_embedder = JinaTextEmbedder() | ||
text_embedder = JinaTextEmbedder(task="retrieval.query") | ||
text_to_embed = "I love pizza!" | ||
print(text_embedder.run(text_to_embed)) | ||
# {'embedding': [0.017020374536514282, -0.023255806416273117, ...], | ||
# 'meta': {'model': 'jina-embeddings-v2-base-en', | ||
# 'meta': {'model': 'jina-embeddings-v3', | ||
# 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}} | ||
``` | ||
""" | ||
|
||
def __init__( | ||
self, | ||
api_key: Secret = Secret.from_env_var("JINA_API_KEY"), # noqa: B008 | ||
model: str = "jina-embeddings-v2-base-en", | ||
model: str = "jina-embeddings-v3", | ||
prefix: str = "", | ||
suffix: str = "", | ||
task: Optional[str] = None, | ||
dimensions: Optional[int] = None, | ||
late_chunking: Optional[bool] = None, | ||
): | ||
""" | ||
Create a JinaTextEmbedder component. | ||
|
@@ -65,6 +68,9 @@ def __init__( | |
"Content-type": "application/json", | ||
} | ||
) | ||
self.task = task | ||
self.dimensions = dimensions | ||
self.late_chunking = late_chunking | ||
|
||
def _get_telemetry_data(self) -> Dict[str, Any]: | ||
""" | ||
|
@@ -78,9 +84,20 @@ def to_dict(self) -> Dict[str, Any]: | |
:returns: | ||
Dictionary with serialized data. | ||
""" | ||
return default_to_dict( | ||
self, api_key=self.api_key.to_dict(), model=self.model_name, prefix=self.prefix, suffix=self.suffix | ||
) | ||
kwargs = { | ||
"api_key": self.api_key.to_dict(), | ||
"model": self.model_name, | ||
"prefix": self.prefix, | ||
"suffix": self.suffix, | ||
} | ||
# Optional parameters are serialized only when set; `task` and `dimensions` are only supported by embeddings-v3 for now | ||
if self.task is not None: | ||
kwargs["task"] = self.task | ||
if self.dimensions is not None: | ||
kwargs["dimensions"] = self.dimensions | ||
if self.late_chunking is not None: | ||
kwargs["late_chunking"] = self.late_chunking | ||
return default_to_dict(self, **kwargs) | ||
|
||
@classmethod | ||
def from_dict(cls, data: Dict[str, Any]) -> "JinaTextEmbedder": | ||
|
@@ -114,7 +131,19 @@ def run(self, text: str): | |
|
||
text_to_embed = self.prefix + text + self.suffix | ||
|
||
resp = self._session.post(JINA_API_URL, json={"input": [text_to_embed], "model": self.model_name}).json() | ||
parameters: Dict[str, Any] = {} | ||
if self.task is not None: | ||
parameters["task"] = self.task | ||
if self.dimensions is not None: | ||
parameters["dimensions"] = self.dimensions | ||
if self.late_chunking is not None: | ||
parameters["late_chunking"] = self.late_chunking | ||
|
||
resp = self._session.post( | ||
JINA_API_URL, | ||
json={"input": [text_to_embed], "model": self.model_name, **parameters}, | ||
).json() | ||
|
||
if "data" not in resp: | ||
raise RuntimeError(resp["detail"]) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters