Skip to content

Commit

Permalink
improve document conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
anakin87 committed Dec 22, 2023
1 parent d918414 commit 4d90b8c
Showing 1 changed file with 28 additions and 24 deletions.
52 changes: 28 additions & 24 deletions integrations/pinecone/src/pinecone_haystack/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,30 +129,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
f"but got {policy}. Overwriting duplicates is enabled by default."
)

documents_for_pinecone = []
for document in deepcopy(documents):
if document.embedding is None:
logger.warning(
f"Document {document.id} has no embedding. Pinecone is a purely vector database. "
"A dummy embedding will be used, but this can affect the search results. "
)
document.embedding = self._dummy_vector
doc_for_pinecone = {"id": document.id, "values": document.embedding, "metadata": document.meta}

# we save content/dataframe as metadata
# currently, storing blob in Pinecone is not supported
if document.content is not None:
doc_for_pinecone["metadata"]["content"] = document.content
if document.dataframe is not None:
doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json()
if document.blob is not None:
logger.warning(
f"Document {document.id} has the `blob` field set, but storing `ByteStream` "
"objects in Pinecone is not supported. "
"The content of the `blob` field will be ignored."
)

documents_for_pinecone.append(doc_for_pinecone)
documents_for_pinecone = self._convert_documents_to_pinecone_format(documents)

result = self._index.upsert(
vectors=documents_for_pinecone, namespace=self.namespace, batch_size=self.batch_size
Expand Down Expand Up @@ -261,3 +238,30 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
documents.append(doc)

return documents

def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]:
    """
    Convert documents into the list of dictionaries that is upserted into the Pinecone index.

    Each resulting dictionary has the keys `id`, `values` (the embedding) and `metadata`.
    The document `content` and the JSON-serialized `dataframe` are stored inside `metadata`.
    The input documents are not modified: the embedding is only read and the metadata
    dictionary is copied before extra keys are added to it.

    :param documents: The documents to convert.
    :returns: One dictionary per input document, in the same order.
    """
    documents_for_pinecone = []
    for document in documents:
        embedding = document.embedding
        if embedding is None:
            logger.warning(
                f"Document {document.id} has no embedding. Pinecone is a purely vector database. "
                "A dummy embedding will be used, but this can affect the search results. "
            )
            embedding = self._dummy_vector

        # Work on a shallow copy: writing "content"/"dataframe" straight into `document.meta`
        # would leak those keys into the caller's Document objects.
        metadata = dict(document.meta) if document.meta else {}

        # we save content/dataframe as metadata
        # currently, storing blob in Pinecone is not supported
        if document.content is not None:
            metadata["content"] = document.content
        if document.dataframe is not None:
            metadata["dataframe"] = document.dataframe.to_json()
        if document.blob is not None:
            logger.warning(
                f"Document {document.id} has the `blob` field set, but storing `ByteStream` "
                "objects in Pinecone is not supported. "
                "The content of the `blob` field will be ignored."
            )

        documents_for_pinecone.append({"id": document.id, "values": embedding, "metadata": metadata})

    # Without this return the caller would receive None instead of the converted documents.
    return documents_for_pinecone

0 comments on commit 4d90b8c

Please sign in to comment.