def _to_data_object(self, document: Document) -> Dict[str, Any]:
    """
    Convert a Document to a Weaviate data object ready to be saved.

    The dictionary produced by ``document.to_dict(flatten=False)`` is reshaped as follows:

    - ``id`` is moved to ``_original_id``.
    - ``blob``, when not ``None``, is replaced by ``blob_data`` (base64 encoded string)
      and ``blob_mime_type``.
    - ``embedding`` is removed.
    - ``meta`` is removed when it is an empty dictionary.

    :param document: The Document to convert.
    :returns: The dictionary to store as a Weaviate data object.
    :raises KeyError: If the serialized Document lacks the ``id``, ``blob``, ``embedding``
        or ``meta`` key.
    """
    data = document.to_dict(flatten=False)
    # Weaviate forces a UUID as an id.
    # We don't know if the id of our Document is a UUID or not, so we save it on a different field
    # and let Weaviate generate a UUID that we're going to ignore completely.
    data["_original_id"] = data.pop("id")
    if (blob := data.pop("blob")) is not None:
        # Weaviate wants the blob data as a base64 encoded string
        # See the official docs for more information:
        # https://weaviate.io/developers/weaviate/config-refs/datatypes#datatype-blob
        # The values are read by index, not popped: `blob` is discarded afterwards,
        # so there is no reason to mutate the nested mapping.
        data["blob_data"] = base64.b64encode(bytes(blob["data"])).decode()
        data["blob_mime_type"] = blob["mime_type"]
    # The embedding vector is stored separately from the rest of the data
    del data["embedding"]

    # Weaviate doesn't like empty objects, let's delete meta if it's empty
    if data["meta"] == {}:
        del data["meta"]

    # Hand the converted object back to the caller; without this the method returned None.
    return data