Skip to content

Commit

Permalink
Add methods to convert from Document to Weaviate data object and vice…
Browse files Browse the repository at this point in the history
…versa (#269)

* Add methods to convert from Document to Weaviate data object and vice versa

* Add tests
  • Loading branch information
silvanocerza authored Jan 25, 2024
1 parent fa72811 commit 8db28ee
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
import base64
from dataclasses import asdict
from typing import Any, Dict, List, Optional, Tuple, Union

Expand Down Expand Up @@ -192,6 +193,52 @@ def count_documents(self) -> int:
res = self._client.query.aggregate(collection_name).with_meta_count().do()
return res.get("data", {}).get("Aggregate", {}).get(collection_name, [{}])[0].get("meta", {}).get("count", 0)

def _to_data_object(self, document: Document) -> Dict[str, Any]:
    """
    Convert a Document to a Weaviate data object ready to be saved.

    The returned dictionary holds the Document fields with these adjustments:
    `id` is moved to `_original_id`, `blob` is replaced by `blob_data` (base64 string)
    and `blob_mime_type` when present, `embedding` is removed and an empty `meta` is dropped.

    :param document: The Document to convert.
    :returns: A dictionary with the properties of the Weaviate data object.
    """
    payload = document.to_dict(flatten=False)

    # Weaviate forces a UUID as an id.
    # We can't know whether the id of our Document is a UUID, so we keep it in a dedicated
    # field and let Weaviate generate its own UUID, which we ignore completely.
    original_id = payload.pop("id")
    payload["_original_id"] = original_id

    blob = payload.pop("blob")
    if blob is not None:
        # Weaviate wants the blob data as a base64 encoded string.
        # See the official docs for more information:
        # https://weaviate.io/developers/weaviate/config-refs/datatypes#datatype-blob
        raw_bytes = bytes(blob.pop("data"))
        payload["blob_data"] = base64.b64encode(raw_bytes).decode()
        payload["blob_mime_type"] = blob.pop("mime_type")

    # The embedding vector is stored separately from the rest of the data
    del payload["embedding"]

    # Weaviate doesn't like empty objects, so an empty meta is removed altogether
    if payload["meta"] == {}:
        del payload["meta"]

    return payload

def _to_document(self, data: Dict[str, Any]) -> Document:
    """
    Convert a data object read from Weaviate into a Document.

    :param data: Properties of an object read from Weaviate. It must contain `_original_id`;
        `_additional` (with an optional `vector`), `blob_data` and `blob_mime_type` are optional.
        The dictionary passed in is not modified.
    :returns: The Document built from `data`.
    :raises KeyError: If `_original_id` is missing from `data`.
    """
    # Work on a shallow copy so the caller's dictionary is left untouched
    data = dict(data)
    data["id"] = data.pop("_original_id")

    # `_additional` usually only contains the uuid used by Weaviate to identify the object
    # and the embedding vector. Only the vector is of interest, everything else is discarded.
    # A missing `_additional` or a missing/empty vector results in no embedding.
    additional = data.pop("_additional", None) or {}
    data["embedding"] = additional.get("vector") or None

    # These fields are always removed as they're not part of the Document dataclass.
    # They're absent when the stored Document had no blob, hence the defaults.
    blob_data = data.pop("blob_data", None)
    blob_mime_type = data.pop("blob_mime_type", None)
    if blob_data is not None:
        # Weaviate stores blobs as base64 encoded strings
        data["blob"] = {
            "data": base64.b64decode(blob_data),
            "mime_type": blob_mime_type,
        }

    return Document.from_dict(data)

def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:  # noqa: ARG002
    """
    Return the Documents matching `filters`.

    `filters` is currently ignored and an empty list is always returned.

    :param filters: Filters to apply, unused for the time being.
    :returns: An empty list.
    """
    no_documents: List[Document] = []
    return no_documents

Expand Down
8 changes: 8 additions & 0 deletions integrations/weaviate/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from pathlib import Path

import pytest


@pytest.fixture()
def test_files_path():
    """
    Path of the `test_files` directory located next to this module.
    """
    tests_dir = Path(__file__).parent
    return tests_dir / "test_files"
54 changes: 54 additions & 0 deletions integrations/weaviate/tests/test_document_store.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import base64
from unittest.mock import MagicMock, patch

import pytest
from haystack.dataclasses.byte_stream import ByteStream
from haystack.dataclasses.document import Document
from haystack.testing.document_store import CountDocumentsTest
from haystack_integrations.document_stores.weaviate.document_store import (
DOCUMENT_COLLECTION_PROPERTIES,
Expand Down Expand Up @@ -202,3 +205,54 @@ def test_from_dict(self, _mock_weaviate):
def test_count_not_empty(self, document_store):
    """
    Intentionally empty: skipped for the time being as we don't support writing documents.
    """

def test_to_data_object(self, document_store, test_files_path):
    """
    Check the data object built from a content-only Document and from one
    that also carries a blob, an embedding and meta.
    """
    # Content-only Document: no blob fields, no embedding and no (empty) meta in the result
    plain_doc = Document(content="test doc")
    plain_object = document_store._to_data_object(plain_doc)
    expected_plain = {
        "_original_id": plain_doc.id,
        "content": plain_doc.content,
        "dataframe": None,
        "score": None,
    }
    assert plain_object == expected_plain

    # Document with blob, embedding and meta: the blob is base64 encoded, the embedding is left out
    robot_image = ByteStream.from_file_path(test_files_path / "robot1.jpg", mime_type="image/jpeg")
    rich_doc = Document(
        content="test doc",
        blob=robot_image,
        embedding=[1, 2, 3],
        meta={"key": "value"},
    )
    rich_object = document_store._to_data_object(rich_doc)
    expected_rich = {
        "_original_id": rich_doc.id,
        "content": rich_doc.content,
        "blob_data": base64.b64encode(robot_image.data).decode(),
        "blob_mime_type": "image/jpeg",
        "dataframe": None,
        "score": None,
        "meta": {"key": "value"},
    }
    assert rich_object == expected_rich

def test_to_document(self, document_store, test_files_path):
    """
    Check that a Weaviate data object with vector, blob and meta is converted into the expected Document.
    """
    robot_image = ByteStream.from_file_path(test_files_path / "robot1.jpg", mime_type="image/jpeg")
    encoded_image = base64.b64encode(robot_image.data).decode()
    weaviate_object = {
        "_additional": {"vector": [1, 2, 3]},
        "_original_id": "123",
        "content": "some content",
        "blob_data": encoded_image,
        "blob_mime_type": "image/jpeg",
        "dataframe": None,
        "score": None,
        "meta": {"key": "value"},
    }

    converted = document_store._to_document(weaviate_object)

    assert converted.id == "123"
    assert converted.content == "some content"
    assert converted.blob == robot_image
    assert converted.embedding == [1, 2, 3]
    assert converted.score is None
    assert converted.meta == {"key": "value"}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 8db28ee

Please sign in to comment.