Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add methods to convert from Document to Weaviate data object and vice versa #269

Merged
merged 2 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
import base64
from dataclasses import asdict
from typing import Any, Dict, List, Optional, Tuple, Union

Expand Down Expand Up @@ -192,6 +193,52 @@ def count_documents(self) -> int:
res = self._client.query.aggregate(collection_name).with_meta_count().do()
return res.get("data", {}).get("Aggregate", {}).get(collection_name, [{}])[0].get("meta", {}).get("count", 0)

def _to_data_object(self, document: Document) -> Dict[str, Any]:
    """
    Convert a Document to a Weaviate data object ready to be saved.

    The returned dictionary is the non-flattened `to_dict` output of the Document with
    these adjustments: `id` is renamed to `_original_id`, the blob (when present) is split
    into `blob_data` / `blob_mime_type`, `embedding` is removed and an empty `meta` is removed.
    """
    payload = document.to_dict(flatten=False)

    # Weaviate forces a UUID as an id. The id of our Document may or may not be a UUID,
    # so it is kept in a dedicated field and Weaviate is left to generate its own UUID,
    # which we're going to ignore completely.
    payload["_original_id"] = payload.pop("id")

    blob = payload.pop("blob")
    if blob is not None:
        # Weaviate wants the blob data as a base64 encoded string.
        # See the official docs for more information:
        # https://weaviate.io/developers/weaviate/config-refs/datatypes#datatype-blob
        raw_bytes = bytes(blob.pop("data"))
        payload["blob_data"] = base64.b64encode(raw_bytes).decode()
        payload["blob_mime_type"] = blob.pop("mime_type")

    # The embedding vector is stored separately from the rest of the data.
    del payload["embedding"]

    # Weaviate doesn't like empty objects, so an empty meta is left out entirely.
    if payload["meta"] == {}:
        payload.pop("meta")

    return payload

def _to_document(self, data: Dict[str, Any]) -> Document:
    """
    Convert a data object read from Weaviate into a Document.

    The input dictionary is not modified; the conversion works on a shallow copy.

    :param data: The data object. It must contain `_original_id`. The `_additional`,
        `blob_data` and `blob_mime_type` keys are optional.
    :returns: The Document created by `Document.from_dict` from the converted data.
    :raises KeyError: If `_original_id` is missing from `data`.
    """
    # Shallow copy so the caller's dictionary is left untouched.
    data = dict(data)

    data["id"] = data.pop("_original_id")

    # `_additional` usually only contains the uuid used by Weaviate to identify the object
    # and the embedding vector. It is not part of the Document dataclass, so it is removed
    # here; a missing or null `_additional` simply means there is no embedding.
    additional = data.pop("_additional", None) or {}
    # A missing or empty vector is normalised to None.
    data["embedding"] = additional.get("vector") or None

    # These fields are always removed as they're not part of the Document dataclass.
    # Popping with a default keeps this safe for objects that were stored without a blob.
    blob_data = data.pop("blob_data", None)
    blob_mime_type = data.pop("blob_mime_type", None)
    if blob_data is not None:
        # The blob is stored as a base64 encoded string, decode it back to bytes.
        data["blob"] = {
            "data": base64.b64decode(blob_data),
            "mime_type": blob_mime_type,
        }

    return Document.from_dict(data)

def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:  # noqa: ARG002
    """
    Placeholder implementation: `filters` is ignored and an empty list is always returned.
    """
    no_documents: List[Document] = []
    return no_documents

Expand Down
8 changes: 8 additions & 0 deletions integrations/weaviate/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from pathlib import Path

import pytest


@pytest.fixture()
def test_files_path():
    """Return the path of the `test_files` directory located next to this file."""
    tests_dir = Path(__file__).parent
    return tests_dir / "test_files"
54 changes: 54 additions & 0 deletions integrations/weaviate/tests/test_document_store.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import base64
from unittest.mock import MagicMock, patch

import pytest
from haystack.dataclasses.byte_stream import ByteStream
from haystack.dataclasses.document import Document
from haystack.testing.document_store import CountDocumentsTest
from haystack_integrations.document_stores.weaviate.document_store import (
DOCUMENT_COLLECTION_PROPERTIES,
Expand Down Expand Up @@ -202,3 +205,54 @@ def test_from_dict(self, _mock_weaviate):
def test_count_not_empty(self, document_store):
# Skipped for the time being as we don't support writing documents
pass

def test_to_data_object(self, document_store, test_files_path):
    """
    Check the data object produced for a content-only Document and for a Document
    that also carries a blob, an embedding and meta.
    """
    # Content-only document: no blob fields, no embedding and no meta in the result.
    plain_doc = Document(content="test doc")
    expected_plain = {
        "_original_id": plain_doc.id,
        "content": plain_doc.content,
        "dataframe": None,
        "score": None,
    }
    assert document_store._to_data_object(plain_doc) == expected_plain

    # Full document: the blob is base64 encoded, meta is kept, the embedding is left out.
    image = ByteStream.from_file_path(test_files_path / "robot1.jpg", mime_type="image/jpeg")
    rich_doc = Document(
        content="test doc",
        blob=image,
        embedding=[1, 2, 3],
        meta={"key": "value"},
    )
    expected_rich = {
        "_original_id": rich_doc.id,
        "content": rich_doc.content,
        "blob_data": base64.b64encode(image.data).decode(),
        "blob_mime_type": "image/jpeg",
        "dataframe": None,
        "score": None,
        "meta": {"key": "value"},
    }
    assert document_store._to_data_object(rich_doc) == expected_rich

def test_to_document(self, document_store, test_files_path):
    """
    Check that a data object with a vector, a base64 blob and meta is turned back
    into the matching Document.
    """
    image = ByteStream.from_file_path(test_files_path / "robot1.jpg", mime_type="image/jpeg")
    encoded_image = base64.b64encode(image.data).decode()
    weaviate_object = {
        "_additional": {
            "vector": [1, 2, 3],
        },
        "_original_id": "123",
        "content": "some content",
        "blob_data": encoded_image,
        "blob_mime_type": "image/jpeg",
        "dataframe": None,
        "score": None,
        "meta": {"key": "value"},
    }

    restored = document_store._to_document(weaviate_object)

    assert restored.id == "123"
    assert restored.content == "some content"
    assert restored.blob == image
    assert restored.embedding == [1, 2, 3]
    assert restored.score is None
    assert restored.meta == {"key": "value"}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.