From 1bcb9a8e6a3d1ab5955dcb6fb18ec90c184ea258 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci
Date: Mon, 11 Nov 2024 16:40:01 +0100
Subject: [PATCH] Weaviate - skip writing _split_overlap meta field (#1173)

---
 .../weaviate/document_store.py | 8 +++++++
 .../weaviate/tests/test_document_store.py | 24 +++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py b/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py
index e312b1473..6acf0156e 100644
--- a/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py
+++ b/integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py
@@ -286,6 +286,14 @@ def _to_data_object(self, document: Document) -> Dict[str, Any]:
         # The embedding vector is stored separately from the rest of the data
         del data["embedding"]
 
+        # _split_overlap meta field is unsupported because of a bug
+        # https://github.com/deepset-ai/haystack-core-integrations/issues/1172
+        if "_split_overlap" in data:
+            data.pop("_split_overlap")
+            logger.warning(
+                "Document %s has the unsupported `_split_overlap` meta field. It will be ignored.", data["_original_id"]
+            )
+
         if "sparse_embedding" in data:
             sparse_embedding = data.pop("sparse_embedding", None)
             if sparse_embedding:
diff --git a/integrations/weaviate/tests/test_document_store.py b/integrations/weaviate/tests/test_document_store.py
index 70f1e1eb2..00af322e4 100644
--- a/integrations/weaviate/tests/test_document_store.py
+++ b/integrations/weaviate/tests/test_document_store.py
@@ -508,6 +508,30 @@ def test_comparison_less_than_equal_with_iso_date(self, document_store, filterab
     def test_comparison_not_equal_with_dataframe(self, document_store, filterable_docs):
         return super().test_comparison_not_equal_with_dataframe(document_store, filterable_docs)
 
+    def test_meta_split_overlap_is_skipped(self, document_store):
+        doc = Document(
+            content="The moonlight shimmered ",
+            meta={
+                "source_id": "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0",
+                "page_number": 1,
+                "split_id": 0,
+                "split_idx_start": 0,
+                "_split_overlap": [
+                    {"doc_id": "68ed48ba830048c5d7815874ed2de794722e6d10866b6c55349a914fd9a0df65", "range": (0, 20)}
+                ],
+            },
+        )
+        document_store.write_documents([doc])
+
+        written_doc = document_store.filter_documents()[0]
+
+        assert written_doc.content == "The moonlight shimmered "
+        assert written_doc.meta["source_id"] == "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0"
+        assert written_doc.meta["page_number"] == 1.0
+        assert written_doc.meta["split_id"] == 0.0
+        assert written_doc.meta["split_idx_start"] == 0.0
+        assert "_split_overlap" not in written_doc.meta
+
     def test_bm25_retrieval(self, document_store):
         document_store.write_documents(
             [