From 5f87bddc7f434f9faedfb9c0eae984b2d5e72afa Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 11 Sep 2024 18:38:29 +0800 Subject: [PATCH 01/25] chore: update Jina Embedder usage for V3 release --- .../embedders/jina/document_embedder.py | 8 ++--- .../embedders/jina/text_embedder.py | 6 ++-- .../jina/tests/test_document_embedder.py | 32 +++++++++++++++++++ integrations/jina/tests/test_text_embedder.py | 29 +++++++++++++++++ 4 files changed, 68 insertions(+), 7 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 6bcd94220..4caf0ebc5 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -131,7 +131,7 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: texts_to_embed.append(text_to_embed) return texts_to_embed - def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List[List[float]], Dict[str, Any]]: + def _embed_batch(self, texts_to_embed: List[str], batch_size: int, parameters: Optional[Dict]) -> Tuple[List[List[float]], Dict[str, Any]]: """ Embed a list of texts in batches. """ @@ -142,7 +142,7 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" ): batch = texts_to_embed[i : i + batch_size] - response = self._session.post(JINA_API_URL, json={"input": batch, "model": self.model_name}).json() + response = self._session.post(JINA_API_URL, json={"input": batch, "model": self.model_name, **parameters}).json() if "data" not in response: raise RuntimeError(response["detail"]) @@ -161,7 +161,7 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List return all_embeddings, metadata @component.output_types(documents=List[Document], meta=Dict[str, Any]) - def run(self, documents: List[Document]): + def run(self, documents: List[Document], parameters: Optional[Dict]): """ Compute the embeddings for a list of Documents. @@ -180,7 +180,7 @@ def run(self, documents: List[Document]): texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings, metadata = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size) + embeddings, metadata = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size, parameters=parameters) for doc, emb in zip(documents, embeddings): doc.embedding = emb diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 6398122a4..237ec927d 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import requests from haystack import component, default_from_dict, default_to_dict @@ -95,7 +95,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaTextEmbedder": return default_from_dict(cls, data) @component.output_types(embedding=List[float], meta=Dict[str, Any]) - def run(self, text: str): + def run(self, text: str, parameters: Optional[Dict]): """ Embed a string. @@ -114,7 +114,7 @@ def run(self, text: str): text_to_embed = self.prefix + text + self.suffix - resp = self._session.post(JINA_API_URL, json={"input": [text_to_embed], "model": self.model_name}).json() + resp = self._session.post(JINA_API_URL, json={"input": [text_to_embed], "model": self.model_name, **parameters}).json() if "data" not in resp: raise RuntimeError(resp["detail"]) diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index 9d63f8302..cd448696d 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -246,3 +246,35 @@ def test_run_on_empty_list(self): assert result["documents"] is not None assert not result["documents"] # empty list + + def test_run_with_v3(self): + docs = [ + Document(content="I love cheese", meta={"topic": "Cuisine"}), + Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), + ] + + model = "jina-embeddings-v3-base-en" + with patch("requests.sessions.Session.post", side_effect=mock_session_post_response): + embedder = JinaDocumentEmbedder( + api_key=Secret.from_token("fake-api-key"), + model=model, + prefix="prefix ", + suffix=" suffix", + meta_fields_to_embed=["topic"], + embedding_separator=" | ", + batch_size=1, + ) + result = embedder.run(documents=docs, parameters={"task_type":"retrieval.passage"}) + + documents_with_embeddings = result["documents"] + metadata = result["meta"] + + assert isinstance(documents_with_embeddings, list) + assert len(documents_with_embeddings) == len(docs) + for doc in documents_with_embeddings: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert len(doc.embedding) == 3 + assert all(isinstance(x, float) for x in doc.embedding) + assert metadata == {"model": model, "usage": {"prompt_tokens": 4, "total_tokens": 4}} + diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index 5c0f80d02..4b9ae550e 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -106,3 +106,32 @@ def test_run_wrong_input_format(self): with pytest.raises(TypeError, match="JinaTextEmbedder expects a string as an input"): embedder.run(text=list_integers_input) + + def test_with_v3(self): + model = "jina-embeddings-v3" + with patch("requests.sessions.Session.post") as mock_post: + # Configure the mock to return a specific response + mock_response = requests.Response() + mock_response.status_code = 200 + mock_response._content = json.dumps( + { + "model": "jina-embeddings-v3", + "object": "list", + "usage": {"total_tokens": 6, "prompt_tokens": 6}, + "data": [{"object": "embedding", "index": 0, "embedding": [0.1, 0.2, 0.3]}], + } + ).encode() + + mock_post.return_value = mock_response + + embedder = JinaTextEmbedder( + api_key=Secret.from_token("fake-api-key"), model=model, prefix="prefix ", suffix=" suffix" + ) + result = embedder.run(text="The food was delicious", parameters={"task_type":"retrieval.passage"}) + + assert len(result["embedding"]) == 3 + assert all(isinstance(x, float) for x in result["embedding"]) + assert result["meta"] == { + "model": "jina-embeddings-v3", + "usage": {"prompt_tokens": 6, "total_tokens": 6}, + } From e0f48cd3a10fc8273242c1c070aa1d77cc0e3b34 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 11 Sep 2024 18:48:31 +0800 Subject: [PATCH 02/25] fix: resolve lint issue --- .../embedders/jina/document_embedder.py | 18 +++++++++++++++--- .../components/embedders/jina/text_embedder.py | 6 +++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 4caf0ebc5..ef790dd95 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -131,7 +131,12 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: texts_to_embed.append(text_to_embed) return texts_to_embed - def _embed_batch(self, texts_to_embed: List[str], batch_size: int, parameters: Optional[Dict]) -> Tuple[List[List[float]], Dict[str, Any]]: + def _embed_batch( + self, + texts_to_embed: List[str], + batch_size: int, + parameters: Optional[Dict] + ) -> Tuple[List[List[float]],Dict[str, Any]]: """ Embed a list of texts in batches. """ @@ -142,7 +147,10 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int, parameters: O range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" ): batch = texts_to_embed[i : i + batch_size] - response = self._session.post(JINA_API_URL, json={"input": batch, "model": self.model_name, **parameters}).json() + response = self._session.post( + JINA_API_URL, + json={"input": batch, "model": self.model_name, **parameters} + ).json() if "data" not in response: raise RuntimeError(response["detail"]) @@ -180,7 +188,11 @@ def run(self, documents: List[Document], parameters: Optional[Dict]): texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings, metadata = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size, parameters=parameters) + embeddings, metadata = self._embed_batch( + texts_to_embed=texts_to_embed, + batch_size=self.batch_size, + parameters=parameters + ) for doc, emb in zip(documents, embeddings): doc.embedding = emb diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 237ec927d..7510b8628 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -114,7 +114,11 @@ def run(self, text: str, parameters: Optional[Dict]): text_to_embed = self.prefix + text + self.suffix - resp = self._session.post(JINA_API_URL, json={"input": [text_to_embed], "model": self.model_name, **parameters}).json() + resp = self._session.post( + JINA_API_URL, + json={"input": [text_to_embed], "model": self.model_name, **parameters} + ).json() + if "data" not in resp: raise RuntimeError(resp["detail"]) From dff73c2260971c331e5d461910dd4c35a48cf8ac Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 11 Sep 2024 18:53:40 +0800 Subject: [PATCH 03/25] fix: resolve test error --- .../components/embedders/jina/document_embedder.py | 4 ++-- .../components/embedders/jina/text_embedder.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index ef790dd95..1f88266cb 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -135,7 +135,7 @@ def _embed_batch( self, texts_to_embed: List[str], batch_size: int, - parameters: Optional[Dict] + parameters: Optional[Dict] = None ) -> Tuple[List[List[float]],Dict[str, Any]]: """ Embed a list of texts in batches. @@ -169,7 +169,7 @@ def _embed_batch( return all_embeddings, metadata @component.output_types(documents=List[Document], meta=Dict[str, Any]) - def run(self, documents: List[Document], parameters: Optional[Dict]): + def run(self, documents: List[Document], parameters: Optional[Dict]=None): """ Compute the embeddings for a list of Documents. diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 7510b8628..47a229f7d 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -95,7 +95,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaTextEmbedder": return default_from_dict(cls, data) @component.output_types(embedding=List[float], meta=Dict[str, Any]) - def run(self, text: str, parameters: Optional[Dict]): + def run(self, text: str, parameters: Optional[Dict]=None): """ Embed a string. From 88f0eba9fa80fe4d75192a0590453661d76b586e Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 11 Sep 2024 19:00:36 +0800 Subject: [PATCH 04/25] fix: resolve test error --- .../components/embedders/jina/document_embedder.py | 6 +++++- .../components/embedders/jina/text_embedder.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 1f88266cb..ea0ead972 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -149,7 +149,11 @@ def _embed_batch( batch = texts_to_embed[i : i + batch_size] response = self._session.post( JINA_API_URL, - json={"input": batch, "model": self.model_name, **parameters} + json={ + "input": batch, + "model": self.model_name, + **(parameters if parameters is not None else {}) + }, ).json() if "data" not in response: raise RuntimeError(response["detail"]) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 47a229f7d..c7220b404 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -116,7 +116,11 @@ def run(self, text: str, parameters: Optional[Dict]=None): resp = self._session.post( JINA_API_URL, - json={"input": [text_to_embed], "model": self.model_name, **parameters} + json={ + "input": [text_to_embed], + "model": self.model_name, + **(parameters if parameters is not None else {}), + } ).json() if "data" not in resp: From 0ad3ffe560dbc6a150b30a00b54f1f4a8c0bd78a Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 11 Sep 2024 19:06:20 +0800 Subject: [PATCH 05/25] fix: resolve lint issues --- .../components/embedders/jina/document_embedder.py | 7 ++----- .../components/embedders/jina/text_embedder.py | 2 +- integrations/jina/tests/test_document_embedder.py | 7 +++---- integrations/jina/tests/test_text_embedder.py | 2 +- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index ea0ead972..2a9345339 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -132,10 +132,7 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: return texts_to_embed def _embed_batch( - self, - texts_to_embed: List[str], - batch_size: int, - parameters: Optional[Dict] = None + self, texts_to_embed: List[str], batch_size: int, parameters: Optional[Dict] = None ) -> Tuple[List[List[float]],Dict[str, Any]]: """ Embed a list of texts in batches. @@ -173,7 +170,7 @@ def _embed_batch( return all_embeddings, metadata @component.output_types(documents=List[Document], meta=Dict[str, Any]) - def run(self, documents: List[Document], parameters: Optional[Dict]=None): + def run(self, documents: List[Document], parameters: Optional[Dict] = None): """ Compute the embeddings for a list of Documents. diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index c7220b404..fcffa5094 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -95,7 +95,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaTextEmbedder": return default_from_dict(cls, data) @component.output_types(embedding=List[float], meta=Dict[str, Any]) - def run(self, text: str, parameters: Optional[Dict]=None): + def run(self, text: str, parameters: Optional[Dict] = None): """ Embed a string. diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index cd448696d..e5921c52e 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -246,7 +246,7 @@ def test_run_on_empty_list(self): assert result["documents"] is not None assert not result["documents"] # empty list - + def test_run_with_v3(self): docs = [ Document(content="I love cheese", meta={"topic": "Cuisine"}), @@ -264,7 +264,7 @@ def test_run_with_v3(self): embedding_separator=" | ", batch_size=1, ) - result = embedder.run(documents=docs, parameters={"task_type":"retrieval.passage"}) + result = embedder.run(documents=docs, parameters={"task_type": "retrieval.passage"}) documents_with_embeddings = result["documents"] metadata = result["meta"] @@ -276,5 +276,4 @@ def test_run_with_v3(self): assert isinstance(doc.embedding, list) assert len(doc.embedding) == 3 assert all(isinstance(x, float) for x in doc.embedding) - assert metadata == {"model": model, "usage": {"prompt_tokens": 4, "total_tokens": 4}} - + assert metadata == {"model": model, "usage": {"prompt_tokens": 4, "total_tokens": 4}} \ No newline at end of file diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index 4b9ae550e..e2f55464b 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -127,7 +127,7 @@ def test_with_v3(self): embedder = JinaTextEmbedder( api_key=Secret.from_token("fake-api-key"), model=model, prefix="prefix ", suffix=" suffix" ) - result = embedder.run(text="The food was delicious", parameters={"task_type":"retrieval.passage"}) + result = embedder.run(text="The food was delicious", parameters={"task_type": "retrieval.passage"}) assert len(result["embedding"]) == 3 assert all(isinstance(x, float) for x in result["embedding"]) From 3819a18ab630c619ae71e839c785c4a398733b0e Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 11 Sep 2024 19:11:53 +0800 Subject: [PATCH 06/25] fix: resolve lint issues --- .../embedders/jina/document_embedder.py | 16 ++++------------ .../components/embedders/jina/text_embedder.py | 6 +----- .../jina/tests/test_document_embedder.py | 2 +- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 2a9345339..8916116c4 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -132,8 +132,8 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: return texts_to_embed def _embed_batch( - self, texts_to_embed: List[str], batch_size: int, parameters: Optional[Dict] = None - ) -> Tuple[List[List[float]],Dict[str, Any]]: + self, texts_to_embed: List[str], batch_size: int, parameters: Optional[Dict] = None + ) -> Tuple[List[List[float]],Dict[str, Any]]: """ Embed a list of texts in batches. """ @@ -146,11 +146,7 @@ def _embed_batch( batch = texts_to_embed[i : i + batch_size] response = self._session.post( JINA_API_URL, - json={ - "input": batch, - "model": self.model_name, - **(parameters if parameters is not None else {}) - }, + json={"input": batch, "model": self.model_name, **(parameters if parameters is not None else {})}, ).json() if "data" not in response: raise RuntimeError(response["detail"]) @@ -189,11 +185,7 @@ def run(self, documents: List[Document], parameters: Optional[Dict] = None): texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings, metadata = self._embed_batch( - texts_to_embed=texts_to_embed, - batch_size=self.batch_size, - parameters=parameters - ) + embeddings, metadata = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size, parameters=parameters) for doc, emb in zip(documents, embeddings): doc.embedding = emb diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index fcffa5094..8bdfb40e4 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -116,11 +116,7 @@ def run(self, text: str, parameters: Optional[Dict] = None): resp = self._session.post( JINA_API_URL, - json={ - "input": [text_to_embed], - "model": self.model_name, - **(parameters if parameters is not None else {}), - } + json={"input": [text_to_embed], "model": self.model_name, **(parameters if parameters is not None else {})}, ).json() if "data" not in resp: diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index e5921c52e..17bb6b7de 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -276,4 +276,4 @@ def test_run_with_v3(self): assert isinstance(doc.embedding, list) assert len(doc.embedding) == 3 assert all(isinstance(x, float) for x in doc.embedding) - assert metadata == {"model": model, "usage": {"prompt_tokens": 4, "total_tokens": 4}} \ No newline at end of file + assert metadata == {"model": model, "usage": {"prompt_tokens": 4, "total_tokens": 4}} From e9fa8534a0c318bd74b6eba167394be25679efd3 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 11 Sep 2024 19:13:21 +0800 Subject: [PATCH 07/25] fix: resolve lint issues --- .../components/embedders/jina/document_embedder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 8916116c4..0b2794e47 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -185,7 +185,9 @@ def run(self, documents: List[Document], parameters: Optional[Dict] = None): texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings, metadata = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size, parameters=parameters) + embeddings, metadata = self._embed_batch( + texts_to_embed=texts_to_embed, batch_size=self.batch_size, parameters=parameters + ) for doc, emb in zip(documents, embeddings): doc.embedding = emb From 4c02045f3484ad0ae8db740c1f04eed45d0bbf44 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 11 Sep 2024 19:14:54 +0800 Subject: [PATCH 08/25] fix: resolve lint issues --- .../components/embedders/jina/document_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 0b2794e47..38539c7ea 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -133,7 +133,7 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: def _embed_batch( self, texts_to_embed: List[str], batch_size: int, parameters: Optional[Dict] = None - ) -> Tuple[List[List[float]],Dict[str, Any]]: + ) -> Tuple[List[List[float]], Dict[str, Any]]: """ Embed a list of texts in batches. """ From da2238aa12e5adc9b17cc26f9eeb61d4975e5347 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 11 Sep 2024 19:26:38 +0800 Subject: [PATCH 09/25] fix: resolve lint issues --- integrations/jina/tests/test_document_embedder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index 17bb6b7de..477c83586 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -253,7 +253,7 @@ def test_run_with_v3(self): Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), ] - model = "jina-embeddings-v3-base-en" + model = "jina-embeddings-v3" with patch("requests.sessions.Session.post", side_effect=mock_session_post_response): embedder = JinaDocumentEmbedder( api_key=Secret.from_token("fake-api-key"), @@ -276,4 +276,4 @@ def test_run_with_v3(self): assert isinstance(doc.embedding, list) assert len(doc.embedding) == 3 assert all(isinstance(x, float) for x in doc.embedding) - assert metadata == {"model": model, "usage": {"prompt_tokens": 4, "total_tokens": 4}} + assert metadata == {"model": model, "usage": {"prompt_tokens": 2 * 4, "total_tokens": 2 * 4}} From 4fe1ca43bcdc2b8841b7ea2d4ab1fdaf17743597 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Thu, 12 Sep 2024 15:17:09 +0800 Subject: [PATCH 10/25] chore: update JinaEmbedding for v3 release --- .../components/embedders/jina/document_embedder.py | 4 ++-- .../components/embedders/jina/text_embedder.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 38539c7ea..62e746759 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -28,7 +28,7 @@ class JinaDocumentEmbedder: doc = Document(content="I love pizza!") - result = document_embedder.run([doc]) + result = document_embedder.run([doc], parameters={"task_type": "retrieval.query"}) print(result['documents'][0].embedding) # [0.017020374536514282, -0.023255806416273117, ...] @@ -38,7 +38,7 @@ class JinaDocumentEmbedder: def __init__( self, api_key: Secret = Secret.from_env_var("JINA_API_KEY"), # noqa: B008 - model: str = "jina-embeddings-v2-base-en", + model: str = "jina-embeddings-v3", prefix: str = "", suffix: str = "", batch_size: int = 32, diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 8bdfb40e4..617086d2f 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -25,10 +25,10 @@ class JinaTextEmbedder: text_to_embed = "I love pizza!" - print(text_embedder.run(text_to_embed)) + print(text_embedder.run(text_to_embed), parameters={"task_type": "retrieval.query"}) # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], - # 'meta': {'model': 'jina-embeddings-v2-base-en', + # 'meta': {'model': 'jina-embeddings-v3', # 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}} ``` """ @@ -36,7 +36,7 @@ class JinaTextEmbedder: def __init__( self, api_key: Secret = Secret.from_env_var("JINA_API_KEY"), # noqa: B008 - model: str = "jina-embeddings-v2-base-en", + model: str = "jina-embeddings-v3", prefix: str = "", suffix: str = "", ): From d517aa10f0a461c4eba7a85448dfbd50180b173a Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Thu, 12 Sep 2024 16:28:41 +0800 Subject: [PATCH 11/25] fix: resolve test errors --- integrations/jina/tests/test_document_embedder.py | 2 +- integrations/jina/tests/test_text_embedder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index 477c83586..27316c7c1 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -31,7 +31,7 @@ def test_init_default(self, monkeypatch): embedder = JinaDocumentEmbedder() assert embedder.api_key == Secret.from_env_var("JINA_API_KEY") - assert embedder.model_name == "jina-embeddings-v2-base-en" + assert embedder.model_name == "jina-embeddings-v3" assert embedder.prefix == "" assert embedder.suffix == "" assert embedder.batch_size == 32 diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index e2f55464b..82fe0139d 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -17,7 +17,7 @@ def test_init_default(self, monkeypatch): embedder = JinaTextEmbedder() assert embedder.api_key == Secret.from_env_var("JINA_API_KEY") - assert embedder.model_name == "jina-embeddings-v2-base-en" + assert embedder.model_name == "jina-embeddings-v3" assert embedder.prefix == "" assert embedder.suffix == "" From 9ef324ef864c2547e44a0069ac04de31cd9bccb2 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Thu, 12 Sep 2024 16:36:08 +0800 Subject: [PATCH 12/25] fix: resolve test errors --- integrations/jina/tests/test_document_embedder.py | 2 +- integrations/jina/tests/test_text_embedder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index 27316c7c1..06af3a5a2 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -73,7 +73,7 @@ def test_to_dict(self, monkeypatch): "type": "haystack_integrations.components.embedders.jina.document_embedder.JinaDocumentEmbedder", "init_parameters": { "api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"}, - "model": "jina-embeddings-v2-base-en", + "model": "jina-embeddings-v3", "prefix": "", "suffix": "", "batch_size": 32, diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index 82fe0139d..6ccf77937 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -46,7 +46,7 @@ def test_to_dict(self, monkeypatch): "type": "haystack_integrations.components.embedders.jina.text_embedder.JinaTextEmbedder", "init_parameters": { "api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"}, - "model": "jina-embeddings-v2-base-en", + "model": "jina-embeddings-v3", "prefix": "", "suffix": "", }, From f79fa73e91a28a4e9b9ae198b23b1c443496b8c0 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Thu, 12 Sep 2024 17:18:03 +0800 Subject: [PATCH 13/25] chore: added test case --- .../jina/tests/test_document_embedder.py | 24 +++++++++++++++++++ integrations/jina/tests/test_text_embedder.py | 15 ++++++++++++ 2 files changed, 39 insertions(+) diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index 06af3a5a2..0ef992d1f 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -277,3 +277,27 @@ def test_run_with_v3(self): assert len(doc.embedding) == 3 assert all(isinstance(x, float) for x in doc.embedding) assert metadata == {"model": model, "usage": {"prompt_tokens": 2 * 4, "total_tokens": 2 * 4}} + + @patch('requests.sessions.Session.post') + def test_run_without_tasktype(self, mock_post): + docs = [ + Document(content="I love cheese", meta={"topic": "Cuisine"}), + Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), + ] + + # Configure the mock to return a response with an error status code + mock_post.return_value.status_code = 400 + mock_post.return_value.json.return_value = {"detail":"Task type parameter is required for jina-embeddings-v3."} + + with pytest.raises(Exception) as excinfo: + embedder = JinaDocumentEmbedder( + api_key=Secret.from_token("fake-api-key"), + prefix="prefix ", + suffix=" suffix", + meta_fields_to_embed=["topic"], + embedding_separator=" | ", + batch_size=1, + ) + embedder.run(documents=docs) + + assert "Task type parameter is required for jina-embeddings-v3." in str(excinfo.value) diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index 6ccf77937..f59db454f 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -135,3 +135,18 @@ def test_with_v3(self): "model": "jina-embeddings-v3", "usage": {"prompt_tokens": 6, "total_tokens": 6}, } + + @patch('requests.sessions.Session.post') + def test_run_without_tasktype(self, mock_post): + + # Configure the mock to return a response with an error status code + mock_post.return_value.status_code = 400 + mock_post.return_value.json.return_value = {"detail":"Task type parameter is required for jina-embeddings-v3."} + + with pytest.raises(Exception) as excinfo: + embedder = JinaTextEmbedder( + api_key=Secret.from_token("fake-api-key"), prefix="prefix ", suffix=" suffix", + ) + embedder.run(text="The food was delicious") + + assert "Task type parameter is required for jina-embeddings-v3." in str(excinfo.value) From cfb18eeea92c2942cdbf26f5f8e0bee82f801b0f Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Thu, 12 Sep 2024 17:29:22 +0800 Subject: [PATCH 14/25] fix: resolve lint issues --- integrations/jina/tests/test_document_embedder.py | 4 ++-- integrations/jina/tests/test_text_embedder.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index 0ef992d1f..b46a6f4d5 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -277,7 +277,7 @@ def test_run_with_v3(self): assert len(doc.embedding) == 3 assert all(isinstance(x, float) for x in doc.embedding) assert metadata == {"model": model, "usage": {"prompt_tokens": 2 * 4, "total_tokens": 2 * 4}} - + @patch('requests.sessions.Session.post') def test_run_without_tasktype(self, mock_post): docs = [ @@ -287,7 +287,7 @@ def test_run_without_tasktype(self, mock_post): # Configure the mock to return a response with an error status code mock_post.return_value.status_code = 400 - mock_post.return_value.json.return_value = {"detail":"Task type parameter is required for jina-embeddings-v3."} + mock_post.return_value.json.return_value = {"detail": "Task type parameter is required for jina-embeddings-v3."} with pytest.raises(Exception) as excinfo: embedder = JinaDocumentEmbedder( diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index f59db454f..5ab3d6b50 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -135,17 +135,19 @@ def test_with_v3(self): "model": "jina-embeddings-v3", "usage": {"prompt_tokens": 6, "total_tokens": 6}, } - + @patch('requests.sessions.Session.post') def test_run_without_tasktype(self, mock_post): # Configure the mock to return a response with an error status code mock_post.return_value.status_code = 400 - mock_post.return_value.json.return_value = {"detail":"Task type parameter is required for jina-embeddings-v3."} + mock_post.return_value.json.return_value = {"detail": "Task type parameter is required for jina-embeddings-v3."} with pytest.raises(Exception) as excinfo: embedder = JinaTextEmbedder( - api_key=Secret.from_token("fake-api-key"), prefix="prefix ", suffix=" suffix", + api_key=Secret.from_token("fake-api-key"), + prefix="prefix ", + suffix=" suffix", ) embedder.run(text="The food was delicious") From 531433f624dab1f02027d18fc7686ce0c2bd7ecf Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Fri, 13 Sep 2024 13:26:40 +0800 Subject: [PATCH 15/25] fix: update function call --- .../embedders/jina/document_embedder.py | 40 +++++++++++++------ .../embedders/jina/text_embedder.py | 29 +++++++++++--- .../jina/tests/test_document_embedder.py | 11 ++++- integrations/jina/tests/test_text_embedder.py | 13 ++++-- 4 files changed, 71 insertions(+), 22 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 62e746759..f6e922b64 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -45,6 +45,8 @@ def __init__( progress_bar: bool = True, meta_fields_to_embed: Optional[List[str]] = None, embedding_separator: str = "\n", + task_type: Optional[str] = None, + dimensions: Optional[int] = None, ): """ Create a JinaDocumentEmbedder component. @@ -78,6 +80,8 @@ def __init__( "Content-type": "application/json", } ) + self.task_type = task_type + self.dimensions = dimensions def _get_telemetry_data(self) -> Dict[str, Any]: """ @@ -91,17 +95,23 @@ def to_dict(self) -> Dict[str, Any]: :returns: Dictionary with serialized data. """ - return default_to_dict( - self, - api_key=self.api_key.to_dict(), - model=self.model_name, - prefix=self.prefix, - suffix=self.suffix, - batch_size=self.batch_size, - progress_bar=self.progress_bar, - meta_fields_to_embed=self.meta_fields_to_embed, - embedding_separator=self.embedding_separator, - ) + kwargs = { + "api_key": self.api_key.to_dict(), + "model": self.model_name, + "prefix": self.prefix, + "suffix": self.suffix, + "batch_size": self.batch_size, + "progress_bar": self.progress_bar, + "meta_fields_to_embed": self.meta_fields_to_embed, + "embedding_separator": self.embedding_separator, + } + # Optional parameters, the following two are only supported by embeddings-v3 for now + if self.task_type: + kwargs["task_type"] = self.task_type + if self.dimensions: + kwargs["dimensions"] = self.dimensions + + return default_to_dict(self, **kwargs) @classmethod def from_dict(cls, data: Dict[str, Any]) -> "JinaDocumentEmbedder": @@ -166,7 +176,7 @@ def _embed_batch( return all_embeddings, metadata @component.output_types(documents=List[Document], meta=Dict[str, Any]) - def run(self, documents: List[Document], parameters: Optional[Dict] = None): + def run(self, documents: List[Document]): """ Compute the embeddings for a list of Documents. @@ -184,7 +194,11 @@ def run(self, documents: List[Document], parameters: Optional[Dict] = None): raise TypeError(msg) texts_to_embed = self._prepare_texts_to_embed(documents=documents) - + parameters = {} + if self.task_type: + parameters["task_type"] = self.task_type + if self.dimensions: + parameters["dimensions"] = self.dimensions embeddings, metadata = self._embed_batch( texts_to_embed=texts_to_embed, batch_size=self.batch_size, parameters=parameters ) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 617086d2f..71ad72a2b 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -39,6 +39,8 @@ def __init__( model: str = "jina-embeddings-v3", prefix: str = "", suffix: str = "", + task_type: Optional[str] = None, + dimensions: Optional[int] = None, ): """ Create a JinaTextEmbedder component. @@ -65,6 +67,8 @@ def __init__( "Content-type": "application/json", } ) + self.task_type = task_type + self.dimensions = dimensions def _get_telemetry_data(self) -> Dict[str, Any]: """ @@ -78,9 +82,18 @@ def to_dict(self) -> Dict[str, Any]: :returns: Dictionary with serialized data. """ - return default_to_dict( - self, api_key=self.api_key.to_dict(), model=self.model_name, prefix=self.prefix, suffix=self.suffix - ) + kwargs = { + "api_key": self.api_key.to_dict(), + "model": self.model_name, + "prefix": self.prefix, + "suffix": self.suffix, + } + # Optional parameters, the following two are only supported by embeddings-v3 for now + if self.task_type: + kwargs["task_type"] = self.task_type + if self.dimensions: + kwargs["dimensions"] = self.dimensions + return default_to_dict(self, **kwargs) @classmethod def from_dict(cls, data: Dict[str, Any]) -> "JinaTextEmbedder": @@ -95,7 +108,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaTextEmbedder": return default_from_dict(cls, data) @component.output_types(embedding=List[float], meta=Dict[str, Any]) - def run(self, text: str, parameters: Optional[Dict] = None): + def run(self, text: str): """ Embed a string. @@ -114,9 +127,15 @@ def run(self, text: str, parameters: Optional[Dict] = None): text_to_embed = self.prefix + text + self.suffix + parameters = {} + if self.task_type is not None: + parameters["task_type"] = self.task_type + if self.dimensions is not None: + parameters["dimensions"] = self.dimensions + resp = self._session.post( JINA_API_URL, - json={"input": [text_to_embed], "model": self.model_name, **(parameters if parameters is not None else {})}, + json={"input": [text_to_embed], "model": self.model_name, **parameters}, ).json() if "data" not in resp: diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index b46a6f4d5..ff4ad6042 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -49,6 +49,8 @@ def test_init_with_parameters(self): progress_bar=False, meta_fields_to_embed=["test_field"], embedding_separator=" | ", + task_type="retrieval.query", + dimensions=1024, ) assert embedder.api_key == Secret.from_token("fake-api-key") @@ -59,6 +61,8 @@ def test_init_with_parameters(self): assert embedder.progress_bar is False assert embedder.meta_fields_to_embed == ["test_field"] assert embedder.embedding_separator == " | " + assert embedder.task_type == "retrieval.query" + assert embedder.dimensions == 1024 def test_init_fail_wo_api_key(self, monkeypatch): monkeypatch.delenv("JINA_API_KEY", raising=False) @@ -93,6 +97,8 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): progress_bar=False, meta_fields_to_embed=["test_field"], embedding_separator=" | ", + task_type="retrieval.query", + dimensions=1024, ) data = component.to_dict() assert data == { @@ -106,6 +112,8 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): "progress_bar": False, "meta_fields_to_embed": ["test_field"], "embedding_separator": " | ", + "task_type": "retrieval.query", + "dimensions": 1024, }, } @@ -263,8 +271,9 @@ def test_run_with_v3(self): meta_fields_to_embed=["topic"], embedding_separator=" | ", batch_size=1, + task_type="retrieval.query" ) - result = embedder.run(documents=docs, parameters={"task_type": "retrieval.passage"}) + result = embedder.run(documents=docs) documents_with_embeddings = result["documents"] metadata = result["meta"] diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index 5ab3d6b50..10c4185ac 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -10,7 +10,6 @@ from haystack_integrations.components.embedders.jina import JinaTextEmbedder - class TestJinaTextEmbedder: def test_init_default(self, monkeypatch): monkeypatch.setenv("JINA_API_KEY", "fake-api-key") @@ -58,6 +57,8 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): model="model", prefix="prefix", suffix="suffix", + task_type="retrieval.query", + dimensions=1024, ) data = component.to_dict() assert data == { @@ -67,6 +68,8 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): "model": "model", "prefix": "prefix", "suffix": "suffix", + "task_type": "retrieval.query", + "dimensions": 1024, }, } @@ -125,9 +128,13 @@ def test_with_v3(self): mock_post.return_value = mock_response embedder = JinaTextEmbedder( - api_key=Secret.from_token("fake-api-key"), model=model, prefix="prefix ", suffix=" suffix" + api_key=Secret.from_token("fake-api-key"), + model=model, + prefix="prefix ", + suffix=" suffix", + task_type="retrieval.query" ) - result = embedder.run(text="The food was delicious", parameters={"task_type": "retrieval.passage"}) + result = embedder.run(text="The food was delicious") assert len(result["embedding"]) == 3 assert all(isinstance(x, float) for x in result["embedding"]) From c039af7b696dc49b1112813bb41004d18010da99 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Fri, 13 Sep 2024 13:33:52 +0800 Subject: [PATCH 16/25] fix: resolve lint issues --- integrations/jina/tests/test_document_embedder.py | 2 +- integrations/jina/tests/test_text_embedder.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index ff4ad6042..7537d4e3a 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -271,7 +271,7 @@ def test_run_with_v3(self): meta_fields_to_embed=["topic"], embedding_separator=" | ", batch_size=1, - task_type="retrieval.query" + task_type="retrieval.query", ) result = embedder.run(documents=docs) diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index 10c4185ac..fc89f78ba 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -10,6 +10,7 @@ from haystack_integrations.components.embedders.jina import JinaTextEmbedder + class TestJinaTextEmbedder: def test_init_default(self, monkeypatch): monkeypatch.setenv("JINA_API_KEY", "fake-api-key") @@ -132,7 +133,7 @@ def test_with_v3(self): model=model, prefix="prefix ", suffix=" suffix", - task_type="retrieval.query" + task_type="retrieval.query", ) result = embedder.run(text="The food was delicious") From 38781cbd37ada48438ceab41fc61c33221761752 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Fri, 13 Sep 2024 14:17:46 +0800 Subject: [PATCH 17/25] fix: lint error --- .../components/embedders/jina/document_embedder.py | 2 +- .../components/embedders/jina/text_embedder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index f6e922b64..2dd7765d2 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -194,7 +194,7 @@ def run(self, documents: List[Document]): raise TypeError(msg) texts_to_embed = self._prepare_texts_to_embed(documents=documents) - parameters = {} + parameters: Dict[str, any] = {} if self.task_type: parameters["task_type"] = self.task_type if self.dimensions: diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 71ad72a2b..d74f44a4b 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -127,7 +127,7 @@ def run(self, text: str): text_to_embed = self.prefix + text + self.suffix - parameters = {} + parameters: Dict[str, any] = {} if self.task_type is not None: parameters["task_type"] = self.task_type if self.dimensions is not None: From c2d80f01717a5a1bd5f6770221cf29fda58762b9 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Fri, 13 Sep 2024 14:20:46 +0800 Subject: [PATCH 18/25] fix: lint error --- .../components/embedders/jina/document_embedder.py | 2 +- .../components/embedders/jina/text_embedder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 2dd7765d2..0e5c06459 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -194,7 +194,7 @@ def run(self, documents: List[Document]): raise TypeError(msg) texts_to_embed = self._prepare_texts_to_embed(documents=documents) - parameters: Dict[str, any] = {} + parameters: Dict[str, Any] = {} if self.task_type: parameters["task_type"] = self.task_type if self.dimensions: diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index d74f44a4b..e51652ecb 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -127,7 +127,7 @@ def run(self, text: str): text_to_embed = self.prefix + text + self.suffix - parameters: Dict[str, any] = {} + parameters: Dict[str, Any] = {} if self.task_type is not None: parameters["task_type"] = self.task_type if self.dimensions is not None: From 86cc89251c1bd72ed4aa61063ab40897e7755cf8 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Fri, 13 Sep 2024 17:09:16 +0800 Subject: [PATCH 19/25] chore: remove unnecessary test cases --- .../jina/tests/test_document_embedder.py | 24 ------------------- integrations/jina/tests/test_text_embedder.py | 17 ------------- 2 files changed, 41 deletions(-) diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index 7537d4e3a..19311a1a6 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -286,27 +286,3 @@ def test_run_with_v3(self): assert len(doc.embedding) == 3 assert all(isinstance(x, float) for x in doc.embedding) assert metadata == {"model": model, "usage": {"prompt_tokens": 2 * 4, "total_tokens": 2 * 4}} - - @patch('requests.sessions.Session.post') - def test_run_without_tasktype(self, mock_post): - docs = [ - Document(content="I love cheese", meta={"topic": "Cuisine"}), - Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), - ] - - # Configure the mock to return a response with an error status code - mock_post.return_value.status_code = 400 - mock_post.return_value.json.return_value = {"detail": "Task type parameter is required for jina-embeddings-v3."} - - with pytest.raises(Exception) as excinfo: - embedder = JinaDocumentEmbedder( - api_key=Secret.from_token("fake-api-key"), - prefix="prefix ", - suffix=" suffix", - meta_fields_to_embed=["topic"], - embedding_separator=" | ", - batch_size=1, - ) - embedder.run(documents=docs) - - assert "Task type parameter is required for jina-embeddings-v3." in str(excinfo.value) diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index fc89f78ba..26280689e 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -143,20 +143,3 @@ def test_with_v3(self): "model": "jina-embeddings-v3", "usage": {"prompt_tokens": 6, "total_tokens": 6}, } - - @patch('requests.sessions.Session.post') - def test_run_without_tasktype(self, mock_post): - - # Configure the mock to return a response with an error status code - mock_post.return_value.status_code = 400 - mock_post.return_value.json.return_value = {"detail": "Task type parameter is required for jina-embeddings-v3."} - - with pytest.raises(Exception) as excinfo: - embedder = JinaTextEmbedder( - api_key=Secret.from_token("fake-api-key"), - prefix="prefix ", - suffix=" suffix", - ) - embedder.run(text="The food was delicious") - - assert "Task type parameter is required for jina-embeddings-v3." in str(excinfo.value) From 5847efb4508884123e7d86d8027117de43eeded4 Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 18 Sep 2024 09:14:29 +0800 Subject: [PATCH 20/25] chore: use 'task' instead of 'task_type' --- .../embedders/jina/document_embedder.py | 16 ++++++++-------- .../components/embedders/jina/text_embedder.py | 16 ++++++++-------- .../jina/tests/test_document_embedder.py | 10 +++++----- integrations/jina/tests/test_text_embedder.py | 6 +++--- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 0e5c06459..dc4240f9a 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -24,11 +24,11 @@ class JinaDocumentEmbedder: # Make sure that the environment variable JINA_API_KEY is set - document_embedder = JinaDocumentEmbedder() + document_embedder = JinaDocumentEmbedder(task="retrieval.query") doc = Document(content="I love pizza!") - result = document_embedder.run([doc], parameters={"task_type": "retrieval.query"}) + result = document_embedder.run([doc]) print(result['documents'][0].embedding) # [0.017020374536514282, -0.023255806416273117, ...] @@ -45,7 +45,7 @@ def __init__( progress_bar: bool = True, meta_fields_to_embed: Optional[List[str]] = None, embedding_separator: str = "\n", - task_type: Optional[str] = None, + task: Optional[str] = None, dimensions: Optional[int] = None, ): """ @@ -80,7 +80,7 @@ def __init__( "Content-type": "application/json", } ) - self.task_type = task_type + self.task = task self.dimensions = dimensions def _get_telemetry_data(self) -> Dict[str, Any]: @@ -106,8 +106,8 @@ def to_dict(self) -> Dict[str, Any]: "embedding_separator": self.embedding_separator, } # Optional parameters, the following two are only supported by embeddings-v3 for now - if self.task_type: - kwargs["task_type"] = self.task_type + if self.task: + kwargs["task"] = self.task if self.dimensions: kwargs["dimensions"] = self.dimensions @@ -195,8 +195,8 @@ def run(self, documents: List[Document]): texts_to_embed = self._prepare_texts_to_embed(documents=documents) parameters: Dict[str, Any] = {} - if self.task_type: - parameters["task_type"] = self.task_type + if self.task: + parameters["task"] = self.task if self.dimensions: parameters["dimensions"] = self.dimensions embeddings, metadata = self._embed_batch( diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index e51652ecb..05840220f 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -21,11 +21,11 @@ class JinaTextEmbedder: # Make sure that the environment variable JINA_API_KEY is set - text_embedder = JinaTextEmbedder() + text_embedder = JinaTextEmbedder(task="retrieval.query") text_to_embed = "I love pizza!" - print(text_embedder.run(text_to_embed), parameters={"task_type": "retrieval.query"}) + print(text_embedder.run(text_to_embed)) # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], # 'meta': {'model': 'jina-embeddings-v3', @@ -39,7 +39,7 @@ def __init__( model: str = "jina-embeddings-v3", prefix: str = "", suffix: str = "", - task_type: Optional[str] = None, + task: Optional[str] = None, dimensions: Optional[int] = None, ): """ @@ -67,7 +67,7 @@ def __init__( "Content-type": "application/json", } ) - self.task_type = task_type + self.task = task self.dimensions = dimensions def _get_telemetry_data(self) -> Dict[str, Any]: @@ -89,8 +89,8 @@ def to_dict(self) -> Dict[str, Any]: "suffix": self.suffix, } # Optional parameters, the following two are only supported by embeddings-v3 for now - if self.task_type: - kwargs["task_type"] = self.task_type + if self.task: + kwargs["task"] = self.task if self.dimensions: kwargs["dimensions"] = self.dimensions return default_to_dict(self, **kwargs) @@ -128,8 +128,8 @@ def run(self, text: str): text_to_embed = self.prefix + text + self.suffix parameters: Dict[str, Any] = {} - if self.task_type is not None: - parameters["task_type"] = self.task_type + if self.task is not None: + parameters["task"] = self.task if self.dimensions is not None: parameters["dimensions"] = self.dimensions diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index 19311a1a6..78339a1c8 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -49,7 +49,7 @@ def test_init_with_parameters(self): progress_bar=False, meta_fields_to_embed=["test_field"], embedding_separator=" | ", - task_type="retrieval.query", + task="retrieval.query", dimensions=1024, ) @@ -61,7 +61,7 @@ def test_init_with_parameters(self): assert embedder.progress_bar is False assert embedder.meta_fields_to_embed == ["test_field"] assert embedder.embedding_separator == " | " - assert embedder.task_type == "retrieval.query" + assert embedder.task == "retrieval.query" assert embedder.dimensions == 1024 def test_init_fail_wo_api_key(self, monkeypatch): @@ -97,7 +97,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): progress_bar=False, meta_fields_to_embed=["test_field"], embedding_separator=" | ", - task_type="retrieval.query", + task="retrieval.query", dimensions=1024, ) data = component.to_dict() @@ -112,7 +112,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): "progress_bar": False, "meta_fields_to_embed": ["test_field"], "embedding_separator": " | ", - "task_type": "retrieval.query", + "task": "retrieval.query", "dimensions": 1024, }, } @@ -271,7 +271,7 @@ def test_run_with_v3(self): meta_fields_to_embed=["topic"], embedding_separator=" | ", batch_size=1, - task_type="retrieval.query", + task="retrieval.query", ) result = embedder.run(documents=docs) diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index 26280689e..a4623c698 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -58,7 +58,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): model="model", prefix="prefix", suffix="suffix", - task_type="retrieval.query", + task="retrieval.query", dimensions=1024, ) data = component.to_dict() @@ -69,7 +69,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): "model": "model", "prefix": "prefix", "suffix": "suffix", - "task_type": "retrieval.query", + "task": "retrieval.query", "dimensions": 1024, }, } @@ -133,7 +133,7 @@ def test_with_v3(self): model=model, prefix="prefix ", suffix=" suffix", - task_type="retrieval.query", + task="retrieval.query", ) result = embedder.run(text="The food was delicious") From 9e01dd53496948e31c009ad1cb447f0158387d1d Mon Sep 17 00:00:00 2001 From: Aaron Ji Date: Wed, 18 Sep 2024 10:07:02 +0800 Subject: [PATCH 21/25] chore: add 'late_chunking' for Jina embedders --- .../components/embedders/jina/document_embedder.py | 6 ++++++ .../components/embedders/jina/text_embedder.py | 6 ++++++ integrations/jina/tests/test_document_embedder.py | 2 ++ integrations/jina/tests/test_text_embedder.py | 2 ++ 4 files changed, 16 insertions(+) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index dc4240f9a..be7b98d07 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -47,6 +47,7 @@ def __init__( embedding_separator: str = "\n", task: Optional[str] = None, dimensions: Optional[int] = None, + late_chunking: Optional[bool] = None, ): """ Create a JinaDocumentEmbedder component. @@ -82,6 +83,7 @@ def __init__( ) self.task = task self.dimensions = dimensions + self.late_chunking = late_chunking def _get_telemetry_data(self) -> Dict[str, Any]: """ @@ -110,6 +112,8 @@ def to_dict(self) -> Dict[str, Any]: kwargs["task"] = self.task if self.dimensions: kwargs["dimensions"] = self.dimensions + if self.late_chunking is not None: + kwargs["late_chunking"] = self.late_chunking return default_to_dict(self, **kwargs) @@ -199,6 +203,8 @@ def run(self, documents: List[Document]): parameters["task"] = self.task if self.dimensions: parameters["dimensions"] = self.dimensions + if self.late_chunking is not None: + parameters["late_chunking"] = self.late_chunking embeddings, metadata = self._embed_batch( texts_to_embed=texts_to_embed, batch_size=self.batch_size, parameters=parameters ) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 05840220f..4bcdbc71c 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -41,6 +41,7 @@ def __init__( suffix: str = "", task: Optional[str] = None, dimensions: Optional[int] = None, + late_chunking: Optional[bool] = None, ): """ Create a JinaTextEmbedder component. @@ -69,6 +70,7 @@ def __init__( ) self.task = task self.dimensions = dimensions + self.late_chunking = late_chunking def _get_telemetry_data(self) -> Dict[str, Any]: """ @@ -93,6 +95,8 @@ def to_dict(self) -> Dict[str, Any]: kwargs["task"] = self.task if self.dimensions: kwargs["dimensions"] = self.dimensions + if self.late_chunking is not None: + kwargs["late_chunking"] = self.late_chunking return default_to_dict(self, **kwargs) @classmethod @@ -132,6 +136,8 @@ def run(self, text: str): parameters["task"] = self.task if self.dimensions is not None: parameters["dimensions"] = self.dimensions + if self.late_chunking is not None: + parameters["late_chunking"] = self.late_chunking resp = self._session.post( JINA_API_URL, diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index 78339a1c8..247b95eff 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -51,6 +51,7 @@ def test_init_with_parameters(self): embedding_separator=" | ", task="retrieval.query", dimensions=1024, + late_chunking=True, ) assert embedder.api_key == Secret.from_token("fake-api-key") @@ -63,6 +64,7 @@ def test_init_with_parameters(self): assert embedder.embedding_separator == " | " assert embedder.task == "retrieval.query" assert embedder.dimensions == 1024 + assert embedder.late_chunking is True def test_init_fail_wo_api_key(self, monkeypatch): monkeypatch.delenv("JINA_API_KEY", raising=False) diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index a4623c698..058712a18 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -27,11 +27,13 @@ def test_init_with_parameters(self): model="model", prefix="prefix", suffix="suffix", + late_chunking=True, ) assert embedder.api_key == Secret.from_token("fake-api-key") assert embedder.model_name == "model" assert embedder.prefix == "prefix" assert embedder.suffix == "suffix" + assert embedder.late_chunking is True def test_init_fail_wo_api_key(self, monkeypatch): monkeypatch.delenv("JINA_API_KEY", raising=False) From 689586024df58a9aecb700f4bc287e16dfb62664 Mon Sep 17 00:00:00 2001 From: Aaron Ji <127167174+DresAaron@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:47:49 +0800 Subject: [PATCH 22/25] Update integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- .../components/embedders/jina/document_embedder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index be7b98d07..bcfe88a6a 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -199,9 +199,9 @@ def run(self, documents: List[Document]): texts_to_embed = self._prepare_texts_to_embed(documents=documents) parameters: Dict[str, Any] = {} - if self.task: + if self.task is not None: parameters["task"] = self.task - if self.dimensions: + if self.dimensions is not None: parameters["dimensions"] = self.dimensions if self.late_chunking is not None: parameters["late_chunking"] = self.late_chunking From 048f7a66d5eca473ac4d6b1186ac3fa2972f8906 Mon Sep 17 00:00:00 2001 From: Aaron Ji <127167174+DresAaron@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:47:57 +0800 Subject: [PATCH 23/25] Update integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- .../components/embedders/jina/document_embedder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index bcfe88a6a..50ac77fe8 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -108,9 +108,9 @@ def to_dict(self) -> Dict[str, Any]: "embedding_separator": self.embedding_separator, } # Optional parameters, the following two are only supported by embeddings-v3 for now - if self.task: + if self.task is not None: kwargs["task"] = self.task - if self.dimensions: + if self.dimensions is not None: kwargs["dimensions"] = self.dimensions if self.late_chunking is not None: kwargs["late_chunking"] = self.late_chunking From c0bccaa8b76d1667304f7f4ffec7527a201d824b Mon Sep 17 00:00:00 2001 From: Aaron Ji <127167174+DresAaron@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:51:37 +0800 Subject: [PATCH 24/25] Update integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- .../components/embedders/jina/document_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 50ac77fe8..bbac547c3 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -160,7 +160,7 @@ def _embed_batch( batch = texts_to_embed[i : i + batch_size] response = self._session.post( JINA_API_URL, - json={"input": batch, "model": self.model_name, **(parameters if parameters is not None else {})}, + json={"input": batch, "model": self.model_name, **(parameters or {})}, ).json() if "data" not in response: raise RuntimeError(response["detail"]) From 3d45a2cb78e2785d53528cf7ac7a41d449326a8f Mon Sep 17 00:00:00 2001 From: Aaron Ji <127167174+DresAaron@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:51:50 +0800 Subject: [PATCH 25/25] Update integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- .../components/embedders/jina/text_embedder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 4bcdbc71c..c22f9ea2c 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -91,9 +91,9 @@ def to_dict(self) -> Dict[str, Any]: "suffix": self.suffix, } # Optional parameters, the following two are only supported by embeddings-v3 for now - if self.task: + if self.task is not None: kwargs["task"] = self.task - if self.dimensions: + if self.dimensions is not None: kwargs["dimensions"] = self.dimensions if self.late_chunking is not None: kwargs["late_chunking"] = self.late_chunking