improve document conversion

deepset-ai · Dec 22, 2023 · 4d90b8c
1 parent d918414
commit 4d90b8c
Showing 1 changed file with 28 additions and 24 deletions.
diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -129,30 +129,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
                 f"but got {policy}. Overwriting duplicates is enabled by default."
             )
 
-        documents_for_pinecone = []
-        for document in deepcopy(documents):
-            if document.embedding is None:
-                logger.warning(
-                    f"Document {document.id} has no embedding. Pinecone is a purely vector database. "
-                    "A dummy embedding will be used, but this can affect the search results. "
-                )
-                document.embedding = self._dummy_vector
-            doc_for_pinecone = {"id": document.id, "values": document.embedding, "metadata": document.meta}
-
-            # we save content/dataframe as metadata
-            # currently, storing blob in Pinecone is not supported
-            if document.content is not None:
-                doc_for_pinecone["metadata"]["content"] = document.content
-            if document.dataframe is not None:
-                doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json()
-            if document.blob is not None:
-                logger.warning(
-                    f"Document {document.id} has the `blob` field set, but storing `ByteStream` "
-                    "objects in Pinecone is not supported. "
-                    "The content of the `blob` field will be ignored."
-                )
-
-            documents_for_pinecone.append(doc_for_pinecone)
+        documents_for_pinecone = self._convert_documents_to_pinecone_format(documents)
 
         result = self._index.upsert(
             vectors=documents_for_pinecone, namespace=self.namespace, batch_size=self.batch_size
@@ -261,3 +238,30 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
             documents.append(doc)
 
         return documents
+
+    def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]:
+        documents_for_pinecone = []
+        for document in documents:
+            embedding = document.embedding
+            if embedding is None:
+                logger.warning(
+                    f"Document {document.id} has no embedding. Pinecone is a purely vector database. "
+                    "A dummy embedding will be used, but this can affect the search results. "
+                )
+                embedding = self._dummy_vector
+            doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": document.meta}
+
+            # we save content/dataframe as metadata
+            # currently, storing blob in Pinecone is not supported
+            if document.content is not None:
+                doc_for_pinecone["metadata"]["content"] = document.content
+            if document.dataframe is not None:
+                doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json()
+            if document.blob is not None:
+                logger.warning(
+                    f"Document {document.id} has the `blob` field set, but storing `ByteStream` "
+                    "objects in Pinecone is not supported. "
+                    "The content of the `blob` field will be ignored."
+                )
+
+            documents_for_pinecone.append(doc_for_pinecone)