refactor _to_document

deepset-ai · Mar 4, 2024 · 6be11af · 6be11af
1 parent 3463245
commit 6be11af
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 24 deletions.
diff --git a/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py b/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py
@@ -12,6 +12,7 @@
 from haystack.document_stores.types.policy import DuplicatePolicy
 
 import weaviate
+from weaviate.collections.classes.data import DataObject
 from weaviate.collections.classes.internal import Object
 from weaviate.config import AdditionalConfig
 from weaviate.embedded import EmbeddedOptions
@@ -211,27 +212,25 @@ def _convert_weaviate_v4_object_to_v3_object(self, data: Object) -> Dict[str, An
             v3_object[date_prop] = v3_object[date_prop].strftime("%Y-%m-%dT%H:%M:%SZ")
         return v3_object
 
-    def _to_document(self, data: Dict[str, Any]) -> Document:
+    def _to_document(self, data: DataObject) -> Document:
         """
         Convert a data object read from Weaviate into a Document.
         """
-        data["id"] = data.pop("_original_id")
-        data["embedding"] = data["_additional"].pop("vector") if data["_additional"].get("vector") else None
+        document_data = data.properties
+        document_data["id"] = document_data.pop("_original_id")
+        document_data["embedding"] = data.vector if data.vector else None
 
-        if (blob_data := data.get("blob_data")) is not None:
-            data["blob"] = {
+        if (blob_data := document_data.get("blob_data")) is not None:
+            document_data["blob"] = {
                 "data": base64.b64decode(blob_data),
-                "mime_type": data.get("blob_mime_type"),
+                "mime_type": document_data.get("blob_mime_type"),
             }
-        # We always delete these fields as they're not part of the Document dataclass
-        data.pop("blob_data")
-        data.pop("blob_mime_type")
 
-        # We don't need these fields anymore, this usually only contains the uuid
-        # used by Weaviate to identify the object and the embedding vector that we already extracted.
-        del data["_additional"]
+        # We always delete these fields as they're not part of the Document dataclass
+        document_data.pop("blob_data", None)
+        document_data.pop("blob_mime_type", None)
 
-        return Document.from_dict(data)
+        return Document.from_dict(document_data)
 
     def _query_paginated(self, properties: List[str]):
 

diff --git a/integrations/weaviate/tests/test_document_store.py b/integrations/weaviate/tests/test_document_store.py
@@ -24,6 +24,7 @@
 from numpy import array_equal as np_array_equal
 from numpy import float32 as np_float32
 from pandas import DataFrame
+from weaviate.collections.classes.data import DataObject
 
 # from weaviate.auth import AuthApiKey as WeaviateAuthApiKey
 from weaviate.config import AdditionalConfig, ConnectionConfig
@@ -357,18 +358,18 @@ def test_to_data_object(self, document_store, test_files_path):
 
     def test_to_document(self, document_store, test_files_path):
         image = ByteStream.from_file_path(test_files_path / "robot1.jpg", mime_type="image/jpeg")
-        data = {
-            "_additional": {
-                "vector": [1, 2, 3],
+        data = DataObject(
+            properties={
+                "_original_id": "123",
+                "content": "some content",
+                "blob_data": base64.b64encode(image.data).decode(),
+                "blob_mime_type": "image/jpeg",
+                "dataframe": None,
+                "score": None,
+                "key": "value",
             },
-            "_original_id": "123",
-            "content": "some content",
-            "blob_data": base64.b64encode(image.data).decode(),
-            "blob_mime_type": "image/jpeg",
-            "dataframe": None,
-            "score": None,
-            "meta": {"key": "value"},
-        }
+            vector=[1, 2, 3],
+        )
 
         doc = document_store._to_document(data)
         assert doc.id == "123"