diff --git a/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py b/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py index 6e40cbb0d..85da8c486 100644 --- a/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py +++ b/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py @@ -12,6 +12,7 @@ from haystack.document_stores.types.policy import DuplicatePolicy import weaviate +from weaviate.collections.classes.data import DataObject from weaviate.collections.classes.internal import Object from weaviate.config import AdditionalConfig from weaviate.embedded import EmbeddedOptions @@ -211,27 +212,25 @@ def _convert_weaviate_v4_object_to_v3_object(self, data: Object) -> Dict[str, An v3_object[date_prop] = v3_object[date_prop].strftime("%Y-%m-%dT%H:%M:%SZ") return v3_object - def _to_document(self, data: Dict[str, Any]) -> Document: + def _to_document(self, data: DataObject) -> Document: """ Convert a data object read from Weaviate into a Document. """ - data["id"] = data.pop("_original_id") - data["embedding"] = data["_additional"].pop("vector") if data["_additional"].get("vector") else None + document_data = data.properties + document_data["id"] = document_data.pop("_original_id") + document_data["embedding"] = data.vector if data.vector else None - if (blob_data := data.get("blob_data")) is not None: - data["blob"] = { + if (blob_data := document_data.get("blob_data")) is not None: + document_data["blob"] = { "data": base64.b64decode(blob_data), - "mime_type": data.get("blob_mime_type"), + "mime_type": document_data.get("blob_mime_type"), } - # We always delete these fields as they're not part of the Document dataclass - data.pop("blob_data") - data.pop("blob_mime_type") - # We don't need these fields anymore, this usually only contains the uuid - # used by Weaviate to identify the object and the embedding vector that we already extracted. - del data["_additional"] + # We always delete these fields as they're not part of the Document dataclass + document_data.pop("blob_data", None) + document_data.pop("blob_mime_type", None) - return Document.from_dict(data) + return Document.from_dict(document_data) def _query_paginated(self, properties: List[str]): diff --git a/integrations/weaviate/tests/test_document_store.py b/integrations/weaviate/tests/test_document_store.py index f492ab806..e497c24e7 100644 --- a/integrations/weaviate/tests/test_document_store.py +++ b/integrations/weaviate/tests/test_document_store.py @@ -24,6 +24,7 @@ from numpy import array_equal as np_array_equal from numpy import float32 as np_float32 from pandas import DataFrame +from weaviate.collections.classes.data import DataObject # from weaviate.auth import AuthApiKey as WeaviateAuthApiKey from weaviate.config import AdditionalConfig, ConnectionConfig @@ -357,18 +358,18 @@ def test_to_data_object(self, document_store, test_files_path): def test_to_document(self, document_store, test_files_path): image = ByteStream.from_file_path(test_files_path / "robot1.jpg", mime_type="image/jpeg") - data = { - "_additional": { - "vector": [1, 2, 3], + data = DataObject( + properties={ + "_original_id": "123", + "content": "some content", + "blob_data": base64.b64encode(image.data).decode(), + "blob_mime_type": "image/jpeg", + "dataframe": None, + "score": None, + "key": "value", }, - "_original_id": "123", - "content": "some content", - "blob_data": base64.b64encode(image.data).decode(), - "blob_mime_type": "image/jpeg", - "dataframe": None, - "score": None, - "meta": {"key": "value"}, - } + vector=[1, 2, 3], + ) doc = document_store._to_document(data) assert doc.id == "123"