Add methods to convert from Document to Weaviate data object and vice…

…versa (#269) * Add methods to convert from Document to Weaviate data object and viceversa * Add tests
deepset-ai · Jan 25, 2024 · 8db28ee · 8db28ee
1 parent fa72811
commit 8db28ee
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 0 deletions.
diff --git a/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py b/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
 #
 # SPDX-License-Identifier: Apache-2.0
+import base64
 from dataclasses import asdict
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -192,6 +193,52 @@ def count_documents(self) -> int:
         res = self._client.query.aggregate(collection_name).with_meta_count().do()
         return res.get("data", {}).get("Aggregate", {}).get(collection_name, [{}])[0].get("meta", {}).get("count", 0)
 
+    def _to_data_object(self, document: Document) -> Dict[str, Any]:
+        """
+        Build the Weaviate data object used to save the given Document, the embedding is not included.
+        """
+        data_object = document.to_dict(flatten=False)
+        # The embedding vector is stored separately from the rest of the data
+        data_object.pop("embedding")
+        # Weaviate forces a UUID as an id and we don't know if the id of our Document is one, so we keep
+        # it in a different field and let Weaviate generate a UUID that we're going to ignore completely.
+        data_object["_original_id"] = data_object.pop("id")
+
+        blob = data_object.pop("blob")
+        if blob is not None:
+            # Weaviate wants the blob data as a base64 encoded string, see the official docs:
+            # https://weaviate.io/developers/weaviate/config-refs/datatypes#datatype-blob
+            raw_bytes = bytes(blob.pop("data"))
+            data_object["blob_data"] = base64.b64encode(raw_bytes).decode()
+            data_object["blob_mime_type"] = blob.pop("mime_type")
+
+        # Weaviate doesn't like empty objects, so meta is dropped when there's nothing in it
+        if data_object["meta"] == {}:
+            data_object.pop("meta")
+        return data_object
+
+    def _to_document(self, data: Dict[str, Any]) -> Document:
+        """
+        Convert a data object read from Weaviate into a Document. Note that `data` is modified in place.
+        """
+        data["id"] = data.pop("_original_id")
+        # `_additional` usually only contains the uuid used by Weaviate to identify the object and the
+        # embedding vector. We keep the vector, None if it's missing or empty, and drop everything else.
+        additional = data.pop("_additional")
+        data["embedding"] = additional.get("vector") or None
+
+        if (blob_data := data.get("blob_data")) is not None:
+            data["blob"] = {
+                "data": base64.b64decode(blob_data),
+                "mime_type": data.get("blob_mime_type"),
+            }
+        # We always delete these fields as they're not part of the Document dataclass.
+        # They can be missing, `_to_data_object` only sets them if there's a blob, so we must not fail.
+        data.pop("blob_data", None)
+        data.pop("blob_mime_type", None)
+
+        return Document.from_dict(data)
+
     def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:  # noqa: ARG002
         return []
 

diff --git a/integrations/weaviate/tests/conftest.py b/integrations/weaviate/tests/conftest.py
@@ -0,0 +1,8 @@
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture()
+def test_files_path():
+    return Path(__file__).parent.joinpath("test_files")  # "test_files" directory next to this conftest
diff --git a/integrations/weaviate/tests/test_document_store.py b/integrations/weaviate/tests/test_document_store.py
@@ -1,6 +1,9 @@
+import base64
 from unittest.mock import MagicMock, patch
 
 import pytest
+from haystack.dataclasses.byte_stream import ByteStream
+from haystack.dataclasses.document import Document
 from haystack.testing.document_store import CountDocumentsTest
 from haystack_integrations.document_stores.weaviate.document_store import (
     DOCUMENT_COLLECTION_PROPERTIES,
@@ -202,3 +205,54 @@ def test_from_dict(self, _mock_weaviate):
     def test_count_not_empty(self, document_store):
         # Skipped for the time being as we don't support writing documents
         pass
+
+    def test_to_data_object(self, document_store, test_files_path):
+        # Document with only content: the empty meta is dropped and no embedding or blob field shows up
+        plain_doc = Document(content="test doc")
+        assert document_store._to_data_object(plain_doc) == {
+            "_original_id": plain_doc.id,
+            "content": plain_doc.content,
+            "dataframe": None,
+            "score": None,
+        }
+
+        robot = ByteStream.from_file_path(test_files_path / "robot1.jpg", mime_type="image/jpeg")
+        rich_doc = Document(
+            content="test doc",
+            blob=robot,
+            embedding=[1, 2, 3],
+            meta={"key": "value"},
+        )
+        expected = {
+            "_original_id": rich_doc.id,
+            "content": rich_doc.content,
+            "blob_data": base64.b64encode(robot.data).decode(),
+            "blob_mime_type": "image/jpeg",
+            "dataframe": None,
+            "score": None,
+            "meta": {"key": "value"},
+        }
+        assert document_store._to_data_object(rich_doc) == expected
+
+    def test_to_document(self, document_store, test_files_path):
+        robot = ByteStream.from_file_path(test_files_path / "robot1.jpg", mime_type="image/jpeg")
+        # Data object in the format `_to_document` expects: properties plus the `_additional` section
+        weaviate_object = {
+            "_original_id": "123",
+            "content": "some content",
+            "blob_data": base64.b64encode(robot.data).decode(),
+            "blob_mime_type": "image/jpeg",
+            "dataframe": None,
+            "score": None,
+            "meta": {"key": "value"},
+            "_additional": {"vector": [1, 2, 3]},
+        }
+
+        # The original id, the decoded blob and the vector must all end up in the Document
+        document = document_store._to_document(weaviate_object)
+        assert document.id == "123"
+        assert document.content == "some content"
+        assert document.blob == robot
+        assert document.embedding == [1, 2, 3]
+        assert document.score is None
+        assert document.meta == {"key": "value"}
diff --git a/integrations/weaviate/tests/test_files/robot1.jpg b/integrations/weaviate/tests/test_files/robot1.jpg