From b78e4a05b359c701d776d20c7553a0f484d79914 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 20 Aug 2024 13:04:56 +0200 Subject: [PATCH 1/6] initial import --- .../pinecone/document_store.py | 28 +++++++++++++++ .../pinecone/tests/test_document_store.py | 36 +++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py index 1fd3adf40..383ab05fb 100644 --- a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py +++ b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import io import logging +import warnings from copy import copy from typing import Any, Dict, List, Literal, Optional @@ -26,6 +27,7 @@ DEFAULT_STARTER_PLAN_SPEC = {"serverless": {"region": "us-east-1", "cloud": "aws"}} +METADATA_SUPPORTED_PRIMITIVE_TYPES = str, int, bool # List[str] is supported and checked separately class PineconeDocumentStore: @@ -295,6 +297,28 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li return documents + @staticmethod + def check_metadata(document: Document): + def valid_type(value: Any): + return isinstance(value, METADATA_SUPPORTED_PRIMITIVE_TYPES) or ( + isinstance(value, list) and all(isinstance(i, str) for i in value) + ) + + if document.meta: + discarded_keys = [] + for key, value in document.meta.items(): + if not valid_type(value): + discarded_keys.append(key) + document.meta[key] = "IGNORED" + + if discarded_keys: + msg = (f"Document {document.id} has metadata fields with unsupported types: {discarded_keys}. " + f"Only str, int, bool, and List[str] are supported. The values of these fields will be ignored.") + logger.warning(msg) + warnings.warn(msg, UserWarning) + + return document + def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]: documents_for_pinecone = [] for document in documents: @@ -305,6 +329,10 @@ def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> Li "A dummy embedding will be used, but this can affect the search results. " ) embedding = self._dummy_vector + + if document.meta: + document = self.check_metadata(document) + doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": dict(document.meta)} # we save content/dataframe as metadata diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index 90ce2ccff..b942d813d 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -142,6 +142,42 @@ def test_convert_dict_spec_to_pinecone_object_fail(): PineconeDocumentStore._convert_dict_spec_to_pinecone_object(dict_spec) +def test_check_metadata_invalid(): + invalid_metadata_doc = Document( + content="The moonlight shimmered ", + meta={ + "source_id": "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0", + "page_number": 1, + "split_id": 0, + "split_idx_start": 0, + "_split_overlap": [ + {"doc_id": "68ed48ba830048c5d7815874ed2de794722e6d10866b6c55349a914fd9a0df65", "range": (0, 20)} + ], + }, + ) + pinecone_doc = PineconeDocumentStore.check_metadata(invalid_metadata_doc) + + assert pinecone_doc.meta["source_id"] == "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0" + assert pinecone_doc.meta["page_number"] == 1 + assert pinecone_doc.meta["split_id"] == 0 + assert pinecone_doc.meta["split_idx_start"] == 0 + assert pinecone_doc.meta["_split_overlap"] == "IGNORED" + + +def test_check_metadata_valid(): + valid_metadata_doc = Document( + content="The moonlight shimmered ", + meta={ + "source_id": "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0", + "page_number": 1, + }, + ) + pinecone_doc = PineconeDocumentStore.check_metadata(valid_metadata_doc) + + assert pinecone_doc.meta["source_id"] == "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0" + assert pinecone_doc.meta["page_number"] == 1 + + @pytest.mark.integration @pytest.mark.skipif("PINECONE_API_KEY" not in os.environ, reason="PINECONE_API_KEY not set") def test_serverless_index_creation_from_scratch(sleep_time): From eb4c13d92540cfbedf8d04468ad012882c8ddead Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 20 Aug 2024 13:09:05 +0200 Subject: [PATCH 2/6] nit --- .../document_stores/pinecone/document_store.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py index 383ab05fb..167f29e25 100644 --- a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py +++ b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 import io import logging -import warnings from copy import copy from typing import Any, Dict, List, Literal, Optional @@ -27,7 +26,7 @@ DEFAULT_STARTER_PLAN_SPEC = {"serverless": {"region": "us-east-1", "cloud": "aws"}} -METADATA_SUPPORTED_PRIMITIVE_TYPES = str, int, bool # List[str] is supported and checked separately +METADATA_SUPPORTED_TYPES = str, int, bool # List[str] is supported and checked separately class PineconeDocumentStore: @@ -300,7 +299,7 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li @staticmethod def check_metadata(document: Document): def valid_type(value: Any): - return isinstance(value, METADATA_SUPPORTED_PRIMITIVE_TYPES) or ( + return isinstance(value, METADATA_SUPPORTED_TYPES) or ( isinstance(value, list) and all(isinstance(i, str) for i in value) ) @@ -315,7 +314,6 @@ def valid_type(value: Any): msg = (f"Document {document.id} has metadata fields with unsupported types: {discarded_keys}. " f"Only str, int, bool, and List[str] are supported. The values of these fields will be ignored.") logger.warning(msg) - warnings.warn(msg, UserWarning) return document From eab791a76cf5345d199b16d809a745aa1ec27900 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 20 Aug 2024 13:16:35 +0200 Subject: [PATCH 3/6] linting --- .../document_stores/pinecone/document_store.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py index 167f29e25..c0cad8b63 100644 --- a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py +++ b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py @@ -311,8 +311,10 @@ def valid_type(value: Any): document.meta[key] = "IGNORED" if discarded_keys: - msg = (f"Document {document.id} has metadata fields with unsupported types: {discarded_keys}. " - f"Only str, int, bool, and List[str] are supported. The values of these fields will be ignored.") + msg = ( + f"Document {document.id} has metadata fields with unsupported types: {discarded_keys}. " + f"Only str, int, bool, and List[str] are supported. The values of these fields will be ignored." + ) logger.warning(msg) return document @@ -329,7 +331,7 @@ def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> Li embedding = self._dummy_vector if document.meta: - document = self.check_metadata(document) + self.check_metadata(document) doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": dict(document.meta)} From 1ed584303beddafb37f7f5ac8f8a7f48f7ef5075 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 21 Aug 2024 10:33:37 +0200 Subject: [PATCH 4/6] PR comments --- .../document_stores/pinecone/document_store.py | 16 ++++++++++++---- .../pinecone/tests/test_document_store.py | 10 +++++----- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py index c0cad8b63..d83205ee2 100644 --- a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py +++ b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py @@ -26,7 +26,7 @@ DEFAULT_STARTER_PLAN_SPEC = {"serverless": {"region": "us-east-1", "cloud": "aws"}} -METADATA_SUPPORTED_TYPES = str, int, bool # List[str] is supported and checked separately +METADATA_SUPPORTED_TYPES = str, int, bool, float # List[str] is supported and checked separately class PineconeDocumentStore: @@ -297,7 +297,11 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li return documents @staticmethod - def check_metadata(document: Document): + def _discard_invalid_meta(document: Document): + """ + Remove metadata fields with unsupported types from the document. + """ + def valid_type(value: Any): return isinstance(value, METADATA_SUPPORTED_TYPES) or ( isinstance(value, list) and all(isinstance(i, str) for i in value) @@ -305,10 +309,12 @@ def valid_type(value: Any): if document.meta: discarded_keys = [] + new_meta = {} for key, value in document.meta.items(): if not valid_type(value): discarded_keys.append(key) - document.meta[key] = "IGNORED" + else: + new_meta[key] = value if discarded_keys: msg = ( @@ -317,6 +323,8 @@ def valid_type(value: Any): ) logger.warning(msg) + document.meta = new_meta + return document def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]: @@ -331,7 +339,7 @@ def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> Li embedding = self._dummy_vector if document.meta: - self.check_metadata(document) + self._discard_invalid_meta(document) doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": dict(document.meta)} diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index b942d813d..125fdee92 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -142,7 +142,7 @@ def test_convert_dict_spec_to_pinecone_object_fail(): PineconeDocumentStore._convert_dict_spec_to_pinecone_object(dict_spec) -def test_check_metadata_invalid(): +def test_validate_metadata_invalid(): invalid_metadata_doc = Document( content="The moonlight shimmered ", meta={ @@ -155,16 +155,16 @@ def test_check_metadata_invalid(): ], }, ) - pinecone_doc = PineconeDocumentStore.check_metadata(invalid_metadata_doc) + pinecone_doc = PineconeDocumentStore._discard_invalid_meta(invalid_metadata_doc) assert pinecone_doc.meta["source_id"] == "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0" assert pinecone_doc.meta["page_number"] == 1 assert pinecone_doc.meta["split_id"] == 0 assert pinecone_doc.meta["split_idx_start"] == 0 - assert pinecone_doc.meta["_split_overlap"] == "IGNORED" + assert "_split_overlap" not in pinecone_doc.meta -def test_check_metadata_valid(): +def test_validate_metadata_valid(): valid_metadata_doc = Document( content="The moonlight shimmered ", meta={ @@ -172,7 +172,7 @@ def test_check_metadata_valid(): "page_number": 1, }, ) - pinecone_doc = PineconeDocumentStore.check_metadata(valid_metadata_doc) + pinecone_doc = PineconeDocumentStore._discard_invalid_meta(valid_metadata_doc) assert pinecone_doc.meta["source_id"] == "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0" assert pinecone_doc.meta["page_number"] == 1 From e454a47c2619431c5b9e9ab019ab10a8b86f5cae Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 21 Aug 2024 11:29:26 +0200 Subject: [PATCH 5/6] Update integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py Co-authored-by: Stefano Fiorucci --- .../document_stores/pinecone/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py index d83205ee2..27eba6ecf 100644 --- a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py +++ b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py @@ -319,7 +319,7 @@ def valid_type(value: Any): if discarded_keys: msg = ( f"Document {document.id} has metadata fields with unsupported types: {discarded_keys}. " - f"Only str, int, bool, and List[str] are supported. The values of these fields will be ignored." + f"Only str, int, bool, and List[str] are supported. The values of these fields will be discarded." ) logger.warning(msg) From 73e19bf5d9f040886c962c9cb71d67bf980f56c9 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 21 Aug 2024 11:32:18 +0200 Subject: [PATCH 6/6] nit --- integrations/pinecone/tests/test_document_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index 125fdee92..bd443b4a8 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -142,7 +142,7 @@ def test_convert_dict_spec_to_pinecone_object_fail(): PineconeDocumentStore._convert_dict_spec_to_pinecone_object(dict_spec) -def test_validate_metadata_invalid(): +def test_discard_invalid_meta_invalid(): invalid_metadata_doc = Document( content="The moonlight shimmered ", meta={ @@ -164,7 +164,7 @@ def test_validate_metadata_invalid(): assert "_split_overlap" not in pinecone_doc.meta -def test_validate_metadata_valid(): +def test_discard_invalid_meta_valid(): valid_metadata_doc = Document( content="The moonlight shimmered ", meta={