Skip to content

Commit

Permalink
refactor _to_document
Browse files: browse the repository at this point in the history
  • Loading branch information
hsm207 committed Mar 4, 2024
1 parent 3463245 commit 6be11af
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from haystack.document_stores.types.policy import DuplicatePolicy

import weaviate
from weaviate.collections.classes.data import DataObject
from weaviate.collections.classes.internal import Object
from weaviate.config import AdditionalConfig
from weaviate.embedded import EmbeddedOptions
Expand Down Expand Up @@ -211,27 +212,25 @@ def _convert_weaviate_v4_object_to_v3_object(self, data: Object) -> Dict[str, An
v3_object[date_prop] = v3_object[date_prop].strftime("%Y-%m-%dT%H:%M:%SZ")
return v3_object

def _to_document(self, data: Dict[str, Any]) -> Document:
def _to_document(self, data: DataObject) -> Document:
"""
Convert a data object read from Weaviate into a Document.
"""
data["id"] = data.pop("_original_id")
data["embedding"] = data["_additional"].pop("vector") if data["_additional"].get("vector") else None
document_data = data.properties
document_data["id"] = document_data.pop("_original_id")
document_data["embedding"] = data.vector if data.vector else None

if (blob_data := data.get("blob_data")) is not None:
data["blob"] = {
if (blob_data := document_data.get("blob_data")) is not None:
document_data["blob"] = {
"data": base64.b64decode(blob_data),
"mime_type": data.get("blob_mime_type"),
"mime_type": document_data.get("blob_mime_type"),
}
# We always delete these fields as they're not part of the Document dataclass
data.pop("blob_data")
data.pop("blob_mime_type")

# We don't need these fields anymore, this usually only contains the uuid
# used by Weaviate to identify the object and the embedding vector that we already extracted.
del data["_additional"]
# We always delete these fields as they're not part of the Document dataclass
document_data.pop("blob_data", None)
document_data.pop("blob_mime_type", None)

return Document.from_dict(data)
return Document.from_dict(document_data)

def _query_paginated(self, properties: List[str]):

Expand Down
23 changes: 12 additions & 11 deletions integrations/weaviate/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from numpy import array_equal as np_array_equal
from numpy import float32 as np_float32
from pandas import DataFrame
from weaviate.collections.classes.data import DataObject

# from weaviate.auth import AuthApiKey as WeaviateAuthApiKey
from weaviate.config import AdditionalConfig, ConnectionConfig
Expand Down Expand Up @@ -357,18 +358,18 @@ def test_to_data_object(self, document_store, test_files_path):

def test_to_document(self, document_store, test_files_path):
image = ByteStream.from_file_path(test_files_path / "robot1.jpg", mime_type="image/jpeg")
data = {
"_additional": {
"vector": [1, 2, 3],
data = DataObject(
properties={
"_original_id": "123",
"content": "some content",
"blob_data": base64.b64encode(image.data).decode(),
"blob_mime_type": "image/jpeg",
"dataframe": None,
"score": None,
"key": "value",
},
"_original_id": "123",
"content": "some content",
"blob_data": base64.b64encode(image.data).decode(),
"blob_mime_type": "image/jpeg",
"dataframe": None,
"score": None,
"meta": {"key": "value"},
}
vector=[1, 2, 3],
)

doc = document_store._to_document(data)
assert doc.id == "123"
Expand Down

0 comments on commit 6be11af

Please sign in to comment.