chore: Azure AI search - clarify that dataframe is not supported (#1407)
* Azure AI search - clarify dataframe is not supported

* add Azure AI Search to README

* fix

* README fix
anakin87 authored Feb 17, 2025
1 parent 5d8a4d5 commit 95b3542
Showing 3 changed files with 29 additions and 35 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -30,6 +30,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the details
| [amazon-sagemaker-haystack](integrations/amazon_sagemaker/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-sagemaker-haystack.svg)](https://pypi.org/project/amazon-sagemaker-haystack) | [![Test / amazon_sagemaker](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml) |
| [anthropic-haystack](integrations/anthropic/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/anthropic-haystack.svg)](https://pypi.org/project/anthropic-haystack) | [![Test / anthropic](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml) |
| [astra-haystack](integrations/astra/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/astra-haystack.svg)](https://pypi.org/project/astra-haystack) | [![Test / astra](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml) |
| [azure-ai-search-haystack](integrations/azure_ai_search/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/azure-ai-search-haystack.svg)](https://pypi.org/project/azure-ai-search-haystack) | [![Test / azure-ai-search](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml) |
| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) |
| [cohere-haystack](integrations/cohere/) | Embedder, Generator, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) |
| [deepeval-haystack](integrations/deepeval/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/deepeval-haystack.svg)](https://pypi.org/project/deepeval-haystack) | [![Test / deepeval](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml) |
30 changes: 16 additions & 14 deletions integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py
@@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0
import logging
import os
from dataclasses import asdict
from datetime import datetime
from typing import Any, Dict, List, Optional

@@ -246,15 +245,6 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
:return: the number of documents added to index.
"""

def _convert_input_document(documents: Document):
document_dict = asdict(documents)
if not isinstance(document_dict["id"], str):
msg = f"Document id {document_dict['id']} is not a string, "
raise TypeError(msg)
index_document = self._convert_haystack_documents_to_azure(document_dict)

return index_document

if len(documents) > 0:
if not isinstance(documents[0], Document):
msg = "param 'documents' must contain a list of objects of type Document"
@@ -266,7 +256,7 @@ def _convert_input_document(documents: Document):
f"but got {policy}. Overwriting duplicates is enabled by default."
)
client = self.client
documents_to_write = [(_convert_input_document(doc)) for doc in documents]
documents_to_write = [self._convert_haystack_document_to_azure(doc) for doc in documents]

if documents_to_write != []:
client.upload_documents(documents_to_write)
@@ -370,11 +360,23 @@ def _get_raw_documents_by_id(self, document_ids: List[str]):
logger.warning(f"Document with ID {doc_id} not found.")
return azure_documents

def _convert_haystack_documents_to_azure(self, document: Dict[str, Any]) -> Dict[str, Any]:
"""Map the document keys to fields of search index"""
def _convert_haystack_document_to_azure(self, document: Document) -> Dict[str, Any]:
"""Convert a Haystack Document to an Azure Search document"""

doc_dict = document.to_dict(flatten=False)

if "dataframe" in doc_dict:
dataframe = doc_dict.pop("dataframe")
if dataframe:
logger.warning(
"Document %s has the `dataframe` field set. "
"AzureAISearchDocumentStore does not support dataframes and this field will be ignored. "
"The `dataframe` field will soon be removed from Haystack Document.",
doc_dict["id"],
)

# Because Azure Search does not allow dynamic fields, we only include fields that are part of the schema
index_document = {k: v for k, v in {**document, **document.get("meta", {})}.items() if k in self._index_fields}
index_document = {k: v for k, v in {**doc_dict, **doc_dict.get("meta", {})}.items() if k in self._index_fields}
if index_document["embedding"] is None:
index_document["embedding"] = self._dummy_vector

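For context on what this change means in practice, here is a minimal usage sketch (not part of the commit). It assumes `azure-ai-search-haystack` is installed, an Azure AI Search service is reachable, and the store picks up its endpoint and API key from the environment; the index name is hypothetical. Writing a legacy Document that still carries a `dataframe` now logs a warning, and the document is indexed without that field.

```python
from haystack import Document
from pandas import DataFrame

from haystack_integrations.document_stores.azure_ai_search import AzureAISearchDocumentStore

# Hypothetical index name; the endpoint and API key are assumed to come from environment
# variables (e.g. AZURE_AI_SEARCH_ENDPOINT / AZURE_AI_SEARCH_API_KEY).
document_store = AzureAISearchDocumentStore(index_name="haystack-docs")

doc = Document(id="1", content="test")
doc.dataframe = DataFrame({"a": [1, 2, 3]})  # legacy field, slated for removal from haystack.Document

# Logs a warning that `dataframe` is unsupported and will be ignored; the document is still written.
document_store.write_documents([doc])

retrieved = document_store.get_documents_by_id(["1"])
print(retrieved[0].content)  # "test"; the dataframe is not stored in the index
```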
33 changes: 12 additions & 21 deletions integrations/azure_ai_search/tests/test_document_store.py
@@ -14,10 +14,10 @@
CountDocumentsTest,
DeleteDocumentsTest,
FilterDocumentsTest,
FilterDocumentsTestWithDataframe,
WriteDocumentsTest,
)
from haystack.utils.auth import EnvVarSecret, Secret
from pandas import DataFrame

from haystack_integrations.document_stores.azure_ai_search import DEFAULT_VECTOR_SEARCH, AzureAISearchDocumentStore

@@ -130,6 +130,16 @@ def test_write_documents_with_meta(self, document_store: AzureAISearchDocumentStore):
doc = document_store.get_documents_by_id(["1"])
assert doc[0] == docs[0]

def test_write_documents_skips_dataframe(self, document_store: AzureAISearchDocumentStore):
doc = Document(id="1", content="test")
doc.dataframe = DataFrame({"a": [1, 2, 3]})

assert document_store.write_documents([doc]) == 1
retrieved_docs = document_store.get_documents_by_id(["1"])
assert retrieved_docs[0].id == "1"
assert retrieved_docs[0].content == "test"
assert not hasattr(retrieved_docs[0], "dataframe") or retrieved_docs[0].dataframe is None

@pytest.mark.skip(reason="Azure AI search index overwrites duplicate documents by default")
def test_write_documents_duplicate_fail(self, document_store: AzureAISearchDocumentStore): ...

@@ -156,10 +166,9 @@ def _random_embeddings(n):
],
indirect=True,
)
class TestFilters(FilterDocumentsTest, FilterDocumentsTestWithDataframe):
class TestFilters(FilterDocumentsTest):

# Overriding to change "date" to compatible ISO 8601 format
# and remove incompatible fields (dataframes) for Azure search index
@pytest.fixture
def filterable_docs(self) -> List[Document]:
"""Fixture that returns a list of Documents that can be used to test filtering."""
@@ -227,24 +236,6 @@ def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
sorted_expected = sorted(expected, key=lambda doc: doc.id)
assert sorted_recieved == sorted_expected

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_equal_with_dataframe(self, document_store, filterable_docs): ...

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_not_equal_with_dataframe(self, document_store, filterable_docs): ...

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_greater_than_with_dataframe(self, document_store, filterable_docs): ...

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_less_than_with_dataframe(self, document_store, filterable_docs): ...

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_greater_than_equal_with_dataframe(self, document_store, filterable_docs): ...

@pytest.mark.skip(reason="Azure AI search index does not support dataframes")
def test_comparison_less_than_equal_with_dataframe(self, document_store, filterable_docs): ...

# Azure search index supports UTC datetime in ISO 8601 format
def test_comparison_greater_than_with_iso_date(self, document_store, filterable_docs):
"""Test filter_documents() with > comparator and datetime"""
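On the date handling mentioned in the last hunk: the overridden comparison tests use UTC datetimes in ISO 8601 format because that is what the Azure search index accepts. A short filtering sketch under that assumption, reusing the `document_store` from the example above (the meta field name is hypothetical):

```python
# Assumes documents were written with e.g. meta={"date": "2025-02-17T00:00:00Z"} (ISO 8601, UTC).
filters = {"field": "meta.date", "operator": ">", "value": "2025-01-01T00:00:00Z"}
matching_docs = document_store.filter_documents(filters=filters)
```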
