From 4d90b8cef1cb1792f91edcec036c614b4959c600 Mon Sep 17 00:00:00 2001
From: anakin87
Date: Fri, 22 Dec 2023 13:57:09 +0100
Subject: [PATCH] improve document conversion

Move the Document -> Pinecone conversion out of `write_documents` into
the new `_convert_documents_to_pinecone_format` helper.

The helper returns the converted list and builds each `metadata` dict
from a copy of `document.meta`, so the Documents passed in by the caller
are not modified now that the `deepcopy` is gone.
---
 .../src/pinecone_haystack/document_store.py   | 65 ++++++++++++-------
 1 file changed, 41 insertions(+), 24 deletions(-)

diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
index 2e4fe5f14..f841b23d9 100644
--- a/integrations/pinecone/src/pinecone_haystack/document_store.py
+++ b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -129,30 +129,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
                 f"but got {policy}. Overwriting duplicates is enabled by default."
             )
 
-        documents_for_pinecone = []
-        for document in deepcopy(documents):
-            if document.embedding is None:
-                logger.warning(
-                    f"Document {document.id} has no embedding. Pinecone is a purely vector database. "
-                    "A dummy embedding will be used, but this can affect the search results. "
-                )
-                document.embedding = self._dummy_vector
-            doc_for_pinecone = {"id": document.id, "values": document.embedding, "metadata": document.meta}
-
-            # we save content/dataframe as metadata
-            # currently, storing blob in Pinecone is not supported
-            if document.content is not None:
-                doc_for_pinecone["metadata"]["content"] = document.content
-            if document.dataframe is not None:
-                doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json()
-            if document.blob is not None:
-                logger.warning(
-                    f"Document {document.id} has the `blob` field set, but storing `ByteStream` "
-                    "objects in Pinecone is not supported. "
-                    "The content of the `blob` field will be ignored."
-                )
-
-            documents_for_pinecone.append(doc_for_pinecone)
+        documents_for_pinecone = self._convert_documents_to_pinecone_format(documents)
 
         result = self._index.upsert(
             vectors=documents_for_pinecone, namespace=self.namespace, batch_size=self.batch_size
@@ -261,3 +238,43 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
             documents.append(doc)
 
         return documents
+
+    def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]:
+        """
+        Convert Documents into the dictionaries passed to the Pinecone index `upsert` call.
+
+        Each dictionary has the keys `id`, `values` (the embedding) and `metadata`.
+        The input Documents are not modified.
+
+        :param documents: The Documents to convert.
+        :returns: One dictionary per Document, in the same order.
+        """
+        documents_for_pinecone = []
+        for document in documents:
+            embedding = document.embedding
+            if embedding is None:
+                logger.warning(
+                    f"Document {document.id} has no embedding. Pinecone is a purely vector database. "
+                    "A dummy embedding will be used, but this can affect the search results. "
+                )
+                embedding = self._dummy_vector
+
+            # Shallow-copy the meta: the keys added below must not leak into the caller's Document
+            metadata = dict(document.meta)
+
+            # we save content/dataframe as metadata
+            # currently, storing blob in Pinecone is not supported
+            if document.content is not None:
+                metadata["content"] = document.content
+            if document.dataframe is not None:
+                metadata["dataframe"] = document.dataframe.to_json()
+            if document.blob is not None:
+                logger.warning(
+                    f"Document {document.id} has the `blob` field set, but storing `ByteStream` "
+                    "objects in Pinecone is not supported. "
+                    "The content of the `blob` field will be ignored."
+                )
+
+            documents_for_pinecone.append({"id": document.id, "values": embedding, "metadata": metadata})
+
+        return documents_for_pinecone